aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Georgios Pinitas <georgios.pinitas@arm.com> 2021-01-31 05:18:43 +0000
committer Georgios Pinitas <georgios.pinitas@arm.com> 2021-02-17 13:21:55 +0000
commit e28cf395b57a091d0850cd28cecc81046153b843 (patch)
tree 13fbd43f586d08642f7cf0d9d6cfc2044f5fcd72
parent eca54a0cdbcdde64adee8f9be316a2eec6aa47e0 (diff)
download ComputeLibrary-e28cf395b57a091d0850cd28cecc81046153b843.tar.gz
Regenerate kernels and update A55 versions
Signed-off-by: Georgios Pinitas <georgios.pinitas@arm.com>
Change-Id: I9eae76c77db03b8806af65729da34ab2d77f95f2
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4965
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
-rw-r--r--Android.bp8
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/a32_interleave6_block1_fp32_fp32.hpp3
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8.hpp14
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8_summing.hpp24
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_u8_u8_summing.hpp24
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_bf16_fp32.hpp156
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp16.hpp170
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp32.hpp176
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp32_fp32.hpp144
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16.hpp170
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16_summing.hpp200
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16.hpp258
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16_summing.hpp240
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u16_u16_summing.hpp200
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16.hpp258
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16_summing.hpp240
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_bf16_bf16.hpp202
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_fp32_fp32.hpp74
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_bf16_bf16.hpp92
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8.hpp302
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8_summing.hpp322
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_u8_u8_summing.hpp322
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8.hpp128
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8_summing.hpp68
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_u8_u8_summing.hpp68
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24.hpp7
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16/generic.cpp4364
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32.hpp17
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/a55.cpp5757
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/generic.cpp6689
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16.hpp18
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/a55.cpp3595
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/generic.cpp4192
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4.hpp13
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/a55.cpp2215
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/generic.cpp3175
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16.hpp15
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/a55.cpp2148
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/generic.cpp2336
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16.hpp17
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/a55.cpp3772
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/generic.cpp3289
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16.hpp17
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/a55.cpp3499
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/generic.cpp4440
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16.hpp15
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/a55.cpp2148
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/generic.cpp2336
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16.hpp17
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/a55.cpp3499
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/generic.cpp4440
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL/generic.cpp1976
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/generic.cpp2516
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/generic.cpp1976
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/generic.cpp2308
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL/generic.cpp1338
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL/generic.cpp1903
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/generic.cpp1835
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL/generic.cpp1340
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/generic.cpp1835
-rw-r--r--src/core/NEON/kernels/arm_gemm/merges/a32_merge_float_8x6.hpp3
71 files changed, 54231 insertions, 28712 deletions
diff --git a/Android.bp b/Android.bp
index d3a5b0b924..dc6c702011 100644
--- a/Android.bp
+++ b/Android.bp
@@ -889,13 +889,21 @@ cc_library_static {
"src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/generic.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/x1.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/a55.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/a55.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/a55.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/a55.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/a55.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/a55.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/a55.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/a55.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/generic.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12/generic.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/generic.cpp",
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a32_interleave6_block1_fp32_fp32.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a32_interleave6_block1_fp32_fp32.hpp
index 074299997d..807511f0d2 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a32_interleave6_block1_fp32_fp32.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a32_interleave6_block1_fp32_fp32.hpp
@@ -22,9 +22,6 @@
* SOFTWARE.
*/
#pragma once
-#if (defined(__GNUC__) && (__GNUC__ >= 7))
-#pragma GCC diagnostic ignored "-Wimplicit-fallthrough"
-#endif
#ifdef __arm__
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8.hpp
index 8054c2b96b..6a8caf6ce6 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -58,19 +58,19 @@ void interleave_block<4, 16, VLType::None, false>(
"blt 3f\n"
"2:" // Main loop head
"ldr q19, [x22], #0x10\n"
- "prfm pldl1keep, [x22, #0x70]\n"
+ "subs %x[width], %x[width], #0x10\n"
"ldr q18, [x21], #0x10\n"
+ "cmp %x[width], #0x10\n"
"ldr q17, [x20], #0x10\n"
- "prfm pldl1keep, [x21, #0x70]\n"
"ldr q16, [x19], #0x10\n"
+ "prfm pldl1keep, [x22, #0x70]\n"
+ "prfm pldl1keep, [x21, #0x70]\n"
"prfm pldl1keep, [x20, #0x70]\n"
+ "prfm pldl1keep, [x19, #0x70]\n"
"str q19, [%x[out_ptr], #0x0]\n"
"str q18, [%x[out_ptr], #0x10]\n"
- "prfm pldl1keep, [x19, #0x70]\n"
"str q17, [%x[out_ptr], #0x20]\n"
"str q16, [%x[out_ptr], #0x30]\n"
- "subs %x[width], %x[width], #0x10\n"
- "cmp %x[width], #0x10\n"
"add %x[out_ptr], %x[out_ptr], #0x40\n"
"bge 2b\n"
"3:" // Main loop skip
@@ -171,7 +171,7 @@ void interleave_block<4, 16, VLType::None, false>(
"add %x[out_ptr], %x[out_ptr], #0x40\n"
"12:" // Odds skip
- : [out_ptr] "+r" (out_ptr), [width] "+r" (width)
+ : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width)
: [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
: "cc", "memory", "v16", "v17", "v18", "v19", "x19", "x20", "x21", "x22"
);
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8_summing.hpp
index 1650916f9f..954a86656e 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8_summing.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8_summing.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -84,24 +84,24 @@ void interleave_block<4, 16, VLType::None, true>(
"mov x22, #0x0\n"
"4:" // no_accumulate_16
"ldr q19, [x23], #0x10\n"
- "prfm pldl1keep, [x23, #0x70]\n"
+ "add x22, x22, #0x1\n"
"ldr q18, [x21], #0x10\n"
+ "subs %x[width], %x[width], #0x10\n"
"ldr q17, [x20], #0x10\n"
- "prfm pldl1keep, [x21, #0x70]\n"
+ "cmp %x[width], #0x10\n"
"ldr q16, [x19], #0x10\n"
- "prfm pldl1keep, [x20, #0x70]\n"
- "str q19, [%x[out_ptr], #0x0]\n"
"sadalp v28.8h, v19.16b\n"
+ "prfm pldl1keep, [x23, #0x70]\n"
+ "prfm pldl1keep, [x21, #0x70]\n"
+ "sadalp v27.8h, v18.16b\n"
+ "prfm pldl1keep, [x20, #0x70]\n"
+ "sadalp v26.8h, v17.16b\n"
"prfm pldl1keep, [x19, #0x70]\n"
+ "sadalp v25.8h, v16.16b\n"
+ "str q19, [%x[out_ptr], #0x0]\n"
"str q18, [%x[out_ptr], #0x10]\n"
- "sadalp v27.8h, v18.16b\n"
"str q17, [%x[out_ptr], #0x20]\n"
- "sadalp v26.8h, v17.16b\n"
"str q16, [%x[out_ptr], #0x30]\n"
- "sadalp v25.8h, v16.16b\n"
- "add x22, x22, #0x1\n"
- "subs %x[width], %x[width], #0x10\n"
- "cmp %x[width], #0x10\n"
"add %x[out_ptr], %x[out_ptr], #0x40\n"
"bge 3b\n"
"5:" // Main loop skip
@@ -215,7 +215,7 @@ void interleave_block<4, 16, VLType::None, true>(
"add v24.4s, v24.4s, v20.4s\n"
"str q24, [%x[out_ptr], #0x0]\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
- : [out_ptr] "+r" (out_ptr), [width] "+r" (width)
+ : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width)
: [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
: "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x19", "x20", "x21", "x22", "x23"
);
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_u8_u8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_u8_u8_summing.hpp
index af3efb25b2..c81146212c 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_u8_u8_summing.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_u8_u8_summing.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -84,24 +84,24 @@ void interleave_block<4, 16, VLType::None, true>(
"mov x22, #0x0\n"
"4:" // no_accumulate_16
"ldr q19, [x23], #0x10\n"
- "prfm pldl1keep, [x23, #0x70]\n"
+ "add x22, x22, #0x1\n"
"ldr q18, [x21], #0x10\n"
+ "subs %x[width], %x[width], #0x10\n"
"ldr q17, [x20], #0x10\n"
- "prfm pldl1keep, [x21, #0x70]\n"
+ "cmp %x[width], #0x10\n"
"ldr q16, [x19], #0x10\n"
- "prfm pldl1keep, [x20, #0x70]\n"
- "str q19, [%x[out_ptr], #0x0]\n"
"uadalp v28.8h, v19.16b\n"
+ "prfm pldl1keep, [x23, #0x70]\n"
+ "prfm pldl1keep, [x21, #0x70]\n"
+ "uadalp v27.8h, v18.16b\n"
+ "prfm pldl1keep, [x20, #0x70]\n"
+ "uadalp v26.8h, v17.16b\n"
"prfm pldl1keep, [x19, #0x70]\n"
+ "uadalp v25.8h, v16.16b\n"
+ "str q19, [%x[out_ptr], #0x0]\n"
"str q18, [%x[out_ptr], #0x10]\n"
- "uadalp v27.8h, v18.16b\n"
"str q17, [%x[out_ptr], #0x20]\n"
- "uadalp v26.8h, v17.16b\n"
"str q16, [%x[out_ptr], #0x30]\n"
- "uadalp v25.8h, v16.16b\n"
- "add x22, x22, #0x1\n"
- "subs %x[width], %x[width], #0x10\n"
- "cmp %x[width], #0x10\n"
"add %x[out_ptr], %x[out_ptr], #0x40\n"
"bge 3b\n"
"5:" // Main loop skip
@@ -215,7 +215,7 @@ void interleave_block<4, 16, VLType::None, true>(
"add v24.4s, v24.4s, v20.4s\n"
"str q24, [%x[out_ptr], #0x0]\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
- : [out_ptr] "+r" (out_ptr), [width] "+r" (width)
+ : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width)
: [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
: "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x19", "x20", "x21", "x22", "x23"
);
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_bf16_fp32.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_bf16_fp32.hpp
index 34d25f27b8..42574295f1 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_bf16_fp32.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_bf16_fp32.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -31,7 +31,7 @@ void interleave_block<8, 1, VLType::None, false>(
)
{
__asm__ __volatile__(
- "movi v29.8h, #0x0\n"
+ "movi v30.8h, #0x0\n"
"ldr x27, [%x[in], #0x0]\n"
"cmp %x[height], #0x8\n"
"ldr x26, [%x[in], #0x8]\n"
@@ -80,53 +80,53 @@ void interleave_block<8, 1, VLType::None, false>(
"prfm pldl1keep, [x20, #0x40]\n"
"blt 3f\n"
"2:" // Main loop head
- "ldr d28, [x27], #0x8\n"
- "zip1 v28.8h, v29.8h, v28.8h\n"
+ "ldr d29, [x27], #0x8\n"
+ "zip1 v29.8h, v30.8h, v29.8h\n"
+ "ldr d28, [x26], #0x8\n"
+ "subs %x[width], %x[width], #0x4\n"
+ "zip1 v28.8h, v30.8h, v28.8h\n"
+ "ldr d24, [x25], #0x8\n"
+ "cmp %x[width], #0x4\n"
+ "zip1 v24.8h, v30.8h, v24.8h\n"
+ "ldr d27, [x24], #0x8\n"
+ "ldr d26, [x23], #0x8\n"
+ "zip1 v25.4s, v29.4s, v24.4s\n"
+ "zip2 v24.4s, v29.4s, v24.4s\n"
+ "ldr d23, [x22], #0x8\n"
+ "ldr d22, [x21], #0x8\n"
+ "zip1 v27.8h, v30.8h, v27.8h\n"
+ "ldr d21, [x20], #0x8\n"
+ "zip1 v26.8h, v30.8h, v26.8h\n"
"prfm pldl1keep, [x27, #0x70]\n"
- "ldr d27, [x26], #0x8\n"
- "zip1 v27.8h, v29.8h, v27.8h\n"
+ "zip1 v20.4s, v28.4s, v27.4s\n"
"prfm pldl1keep, [x26, #0x70]\n"
- "ldr d26, [x25], #0x8\n"
- "zip1 v26.8h, v29.8h, v26.8h\n"
+ "zip1 v23.8h, v30.8h, v23.8h\n"
"prfm pldl1keep, [x25, #0x70]\n"
- "ldr d25, [x24], #0x8\n"
- "zip1 v20.4s, v28.4s, v26.4s\n"
+ "zip1 v22.8h, v30.8h, v22.8h\n"
"prfm pldl1keep, [x24, #0x70]\n"
- "zip1 v25.8h, v29.8h, v25.8h\n"
- "ldr d24, [x23], #0x8\n"
- "zip1 v19.4s, v27.4s, v25.4s\n"
+ "zip1 v21.8h, v30.8h, v21.8h\n"
"prfm pldl1keep, [x23, #0x70]\n"
- "zip1 v24.8h, v29.8h, v24.8h\n"
- "ldr d23, [x22], #0x8\n"
- "zip1 v16.4s, v20.4s, v19.4s\n"
+ "zip1 v17.4s, v25.4s, v20.4s\n"
"prfm pldl1keep, [x22, #0x70]\n"
- "zip1 v23.8h, v29.8h, v23.8h\n"
- "ldr d22, [x21], #0x8\n"
- "zip2 v19.4s, v20.4s, v19.4s\n"
+ "zip1 v19.4s, v26.4s, v22.4s\n"
"prfm pldl1keep, [x21, #0x70]\n"
- "zip1 v22.8h, v29.8h, v22.8h\n"
- "ldr d21, [x20], #0x8\n"
- "zip1 v18.4s, v24.4s, v22.4s\n"
+ "zip1 v18.4s, v23.4s, v21.4s\n"
"prfm pldl1keep, [x20, #0x70]\n"
- "zip1 v21.8h, v29.8h, v21.8h\n"
- "str q16, [%x[out_ptr], #0x0]\n"
- "zip1 v17.4s, v23.4s, v21.4s\n"
- "subs %x[width], %x[width], #0x4\n"
- "zip2 v20.4s, v28.4s, v26.4s\n"
- "cmp %x[width], #0x4\n"
- "zip1 v16.4s, v18.4s, v17.4s\n"
+ "zip1 v16.4s, v19.4s, v18.4s\n"
+ "str q17, [%x[out_ptr], #0x0]\n"
+ "zip2 v17.4s, v25.4s, v20.4s\n"
"str q16, [%x[out_ptr], #0x10]\n"
- "zip2 v16.4s, v18.4s, v17.4s\n"
- "str q19, [%x[out_ptr], #0x20]\n"
- "zip2 v19.4s, v27.4s, v25.4s\n"
+ "zip2 v16.4s, v19.4s, v18.4s\n"
+ "str q17, [%x[out_ptr], #0x20]\n"
+ "zip2 v19.4s, v28.4s, v27.4s\n"
"str q16, [%x[out_ptr], #0x30]\n"
- "zip1 v16.4s, v20.4s, v19.4s\n"
+ "zip1 v16.4s, v24.4s, v19.4s\n"
"str q16, [%x[out_ptr], #0x40]\n"
- "zip2 v18.4s, v24.4s, v22.4s\n"
+ "zip2 v18.4s, v26.4s, v22.4s\n"
"zip2 v17.4s, v23.4s, v21.4s\n"
"zip1 v16.4s, v18.4s, v17.4s\n"
"str q16, [%x[out_ptr], #0x50]\n"
- "zip2 v16.4s, v20.4s, v19.4s\n"
+ "zip2 v16.4s, v24.4s, v19.4s\n"
"str q16, [%x[out_ptr], #0x60]\n"
"zip2 v16.4s, v18.4s, v17.4s\n"
"str q16, [%x[out_ptr], #0x70]\n"
@@ -135,77 +135,77 @@ void interleave_block<8, 1, VLType::None, false>(
"3:" // Main loop skip
"cbz %x[width], 6f\n"
"tbz %x[width], #1, 4f\n"
- "ldr s28, [x27], #0x4\n"
- "ldr s27, [x26], #0x4\n"
- "ldr s26, [x25], #0x4\n"
- "ldr s25, [x24], #0x4\n"
- "ldr s24, [x23], #0x4\n"
+ "ldr s29, [x27], #0x4\n"
+ "ldr s28, [x26], #0x4\n"
+ "mov x19, #0x2\n"
+ "ldr s24, [x25], #0x4\n"
+ "ldr s27, [x24], #0x4\n"
+ "ldr s26, [x23], #0x4\n"
"ldr s23, [x22], #0x4\n"
"ldr s22, [x21], #0x4\n"
"ldr s21, [x20], #0x4\n"
- "mov x19, #0x2\n"
"tbz %x[width], #0, 5f\n"
- "ld1 { v28.h }[2], [x27]\n"
- "ld1 { v27.h }[2], [x26]\n"
- "ld1 { v26.h }[2], [x25]\n"
- "ld1 { v25.h }[2], [x24]\n"
- "ld1 { v24.h }[2], [x23]\n"
+ "ld1 { v29.h }[2], [x27]\n"
+ "mov x19, #0x3\n"
+ "ld1 { v28.h }[2], [x26]\n"
+ "ld1 { v24.h }[2], [x25]\n"
+ "ld1 { v27.h }[2], [x24]\n"
+ "ld1 { v26.h }[2], [x23]\n"
"ld1 { v23.h }[2], [x22]\n"
"ld1 { v22.h }[2], [x21]\n"
"ld1 { v21.h }[2], [x20]\n"
- "mov x19, #0x3\n"
"b 5f\n"
"4:" // odd_loads_1_0
- "ldr h28, [x27, #0x0]\n"
- "ldr h27, [x26, #0x0]\n"
- "ldr h26, [x25, #0x0]\n"
- "ldr h25, [x24, #0x0]\n"
- "ldr h24, [x23, #0x0]\n"
+ "ldr h29, [x27, #0x0]\n"
+ "mov x19, #0x1\n"
+ "ldr h28, [x26, #0x0]\n"
+ "ldr h24, [x25, #0x0]\n"
+ "ldr h27, [x24, #0x0]\n"
+ "ldr h26, [x23, #0x0]\n"
"ldr h23, [x22, #0x0]\n"
"ldr h22, [x21, #0x0]\n"
"ldr h21, [x20, #0x0]\n"
- "mov x19, #0x1\n"
"5:" // Odd load end
- "zip1 v28.8h, v29.8h, v28.8h\n"
+ "zip1 v29.8h, v30.8h, v29.8h\n"
"subs x19, x19, #0x1\n"
- "zip1 v27.8h, v29.8h, v27.8h\n"
- "zip1 v26.8h, v29.8h, v26.8h\n"
- "zip1 v25.8h, v29.8h, v25.8h\n"
- "zip1 v24.8h, v29.8h, v24.8h\n"
- "zip1 v23.8h, v29.8h, v23.8h\n"
- "zip1 v22.8h, v29.8h, v22.8h\n"
- "zip1 v21.8h, v29.8h, v21.8h\n"
- "zip1 v20.4s, v28.4s, v26.4s\n"
- "zip1 v19.4s, v27.4s, v25.4s\n"
- "zip1 v16.4s, v20.4s, v19.4s\n"
- "str q16, [%x[out_ptr], #0x0]\n"
- "zip1 v18.4s, v24.4s, v22.4s\n"
- "zip1 v17.4s, v23.4s, v21.4s\n"
- "zip1 v16.4s, v18.4s, v17.4s\n"
+ "zip1 v28.8h, v30.8h, v28.8h\n"
+ "zip1 v24.8h, v30.8h, v24.8h\n"
+ "zip1 v27.8h, v30.8h, v27.8h\n"
+ "zip1 v26.8h, v30.8h, v26.8h\n"
+ "zip1 v23.8h, v30.8h, v23.8h\n"
+ "zip1 v22.8h, v30.8h, v22.8h\n"
+ "zip1 v21.8h, v30.8h, v21.8h\n"
+ "zip1 v25.4s, v29.4s, v24.4s\n"
+ "zip1 v20.4s, v28.4s, v27.4s\n"
+ "zip1 v17.4s, v25.4s, v20.4s\n"
+ "str q17, [%x[out_ptr], #0x0]\n"
+ "zip1 v19.4s, v26.4s, v22.4s\n"
+ "zip1 v18.4s, v23.4s, v21.4s\n"
+ "zip1 v16.4s, v19.4s, v18.4s\n"
"str q16, [%x[out_ptr], #0x10]\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
"beq 6f\n"
- "zip2 v19.4s, v20.4s, v19.4s\n"
- "zip2 v16.4s, v18.4s, v17.4s\n"
- "str q19, [%x[out_ptr], #0x0]\n"
- "str q16, [%x[out_ptr], #0x10]\n"
+ "zip2 v17.4s, v25.4s, v20.4s\n"
+ "str q17, [%x[out_ptr], #0x0]\n"
+ "zip2 v16.4s, v19.4s, v18.4s\n"
"subs x19, x19, #0x1\n"
+ "str q16, [%x[out_ptr], #0x10]\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
"beq 6f\n"
- "zip2 v20.4s, v28.4s, v26.4s\n"
- "zip2 v19.4s, v27.4s, v25.4s\n"
- "zip1 v16.4s, v20.4s, v19.4s\n"
+ "zip2 v24.4s, v29.4s, v24.4s\n"
+ "zip2 v19.4s, v28.4s, v27.4s\n"
+ "zip1 v16.4s, v24.4s, v19.4s\n"
"str q16, [%x[out_ptr], #0x0]\n"
- "zip2 v18.4s, v24.4s, v22.4s\n"
+ "zip2 v18.4s, v26.4s, v22.4s\n"
"zip2 v17.4s, v23.4s, v21.4s\n"
"zip1 v16.4s, v18.4s, v17.4s\n"
"str q16, [%x[out_ptr], #0x10]\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
"6:" // Odds skip
- : [out_ptr] "+r" (out_ptr), [width] "+r" (width)
+ : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width)
: [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
- : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
+ : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp16.hpp
index d547957129..62d1657a9a 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp16.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -80,51 +80,51 @@ void interleave_block<8, 1, VLType::None, false>(
"blt 3f\n"
"2:" // Main loop head
"ldr q30, [x27], #0x10\n"
- "prfm pldl1keep, [x27, #0x70]\n"
+ "subs %x[width], %x[width], #0x8\n"
"ldr q29, [x26], #0x10\n"
+ "cmp %x[width], #0x8\n"
"ldr q28, [x25], #0x10\n"
- "prfm pldl1keep, [x26, #0x70]\n"
"ldr q27, [x24], #0x10\n"
+ "ldr q25, [x23], #0x10\n"
+ "zip1 v26.8h, v30.8h, v25.8h\n"
+ "ldr q21, [x22], #0x10\n"
+ "zip2 v25.8h, v30.8h, v25.8h\n"
+ "ldr q24, [x21], #0x10\n"
+ "ldr q23, [x20], #0x10\n"
+ "zip1 v22.8h, v29.8h, v21.8h\n"
+ "prfm pldl1keep, [x27, #0x70]\n"
+ "zip2 v21.8h, v29.8h, v21.8h\n"
+ "prfm pldl1keep, [x26, #0x70]\n"
+ "zip1 v20.8h, v28.8h, v24.8h\n"
"prfm pldl1keep, [x25, #0x70]\n"
- "ldr q24, [x23], #0x10\n"
- "zip1 v26.8h, v30.8h, v24.8h\n"
+ "zip1 v18.8h, v26.8h, v20.8h\n"
"prfm pldl1keep, [x24, #0x70]\n"
- "ldr q25, [x22], #0x10\n"
- "zip2 v24.8h, v30.8h, v24.8h\n"
+ "zip1 v19.8h, v27.8h, v23.8h\n"
"prfm pldl1keep, [x23, #0x70]\n"
- "ldr q23, [x21], #0x10\n"
- "zip1 v21.8h, v29.8h, v25.8h\n"
+ "zip1 v16.8h, v22.8h, v19.8h\n"
"prfm pldl1keep, [x22, #0x70]\n"
- "ldr q22, [x20], #0x10\n"
- "zip1 v18.8h, v28.8h, v23.8h\n"
+ "zip1 v17.8h, v18.8h, v16.8h\n"
"prfm pldl1keep, [x21, #0x70]\n"
- "subs %x[width], %x[width], #0x8\n"
- "zip1 v20.8h, v26.8h, v18.8h\n"
+ "zip2 v16.8h, v18.8h, v16.8h\n"
"prfm pldl1keep, [x20, #0x70]\n"
- "zip1 v19.8h, v27.8h, v22.8h\n"
- "cmp %x[width], #0x8\n"
- "zip1 v17.8h, v21.8h, v19.8h\n"
- "zip2 v18.8h, v26.8h, v18.8h\n"
- "zip1 v16.8h, v20.8h, v17.8h\n"
- "str q16, [%x[out_ptr], #0x0]\n"
- "zip2 v16.8h, v20.8h, v17.8h\n"
+ "zip2 v18.8h, v26.8h, v20.8h\n"
+ "str q17, [%x[out_ptr], #0x0]\n"
+ "zip2 v17.8h, v22.8h, v19.8h\n"
"str q16, [%x[out_ptr], #0x10]\n"
- "zip2 v17.8h, v21.8h, v19.8h\n"
"zip1 v16.8h, v18.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x20]\n"
"zip2 v16.8h, v18.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x30]\n"
- "zip2 v21.8h, v28.8h, v23.8h\n"
- "zip1 v18.8h, v24.8h, v21.8h\n"
- "zip2 v20.8h, v29.8h, v25.8h\n"
- "zip2 v19.8h, v27.8h, v22.8h\n"
- "zip1 v17.8h, v20.8h, v19.8h\n"
+ "zip2 v20.8h, v28.8h, v24.8h\n"
+ "zip1 v18.8h, v25.8h, v20.8h\n"
+ "zip2 v19.8h, v27.8h, v23.8h\n"
+ "zip1 v17.8h, v21.8h, v19.8h\n"
"zip1 v16.8h, v18.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x40]\n"
"zip2 v16.8h, v18.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x50]\n"
- "zip2 v18.8h, v24.8h, v21.8h\n"
- "zip2 v17.8h, v20.8h, v19.8h\n"
+ "zip2 v18.8h, v25.8h, v20.8h\n"
+ "zip2 v17.8h, v21.8h, v19.8h\n"
"zip1 v16.8h, v18.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x60]\n"
"zip2 v16.8h, v18.8h, v17.8h\n"
@@ -138,129 +138,129 @@ void interleave_block<8, 1, VLType::None, false>(
"ldr d29, [x26], #0x8\n"
"ldr d28, [x25], #0x8\n"
"ldr d27, [x24], #0x8\n"
- "ldr d24, [x23], #0x8\n"
- "ldr d25, [x22], #0x8\n"
- "ldr d23, [x21], #0x8\n"
- "ldr d22, [x20], #0x8\n"
+ "ldr d25, [x23], #0x8\n"
+ "ldr d21, [x22], #0x8\n"
+ "ldr d24, [x21], #0x8\n"
+ "ldr d23, [x20], #0x8\n"
"tbz %x[width], #1, 4f\n"
"ld1 { v30.s }[2], [x27], #0x4\n"
+ "mov x19, #0x6\n"
"ld1 { v29.s }[2], [x26], #0x4\n"
"ld1 { v28.s }[2], [x25], #0x4\n"
"ld1 { v27.s }[2], [x24], #0x4\n"
- "ld1 { v24.s }[2], [x23], #0x4\n"
- "ld1 { v25.s }[2], [x22], #0x4\n"
- "ld1 { v23.s }[2], [x21], #0x4\n"
- "ld1 { v22.s }[2], [x20], #0x4\n"
- "mov x19, #0x6\n"
+ "ld1 { v25.s }[2], [x23], #0x4\n"
+ "ld1 { v21.s }[2], [x22], #0x4\n"
+ "ld1 { v24.s }[2], [x21], #0x4\n"
+ "ld1 { v23.s }[2], [x20], #0x4\n"
"tbz %x[width], #0, 7f\n"
"ld1 { v30.h }[6], [x27]\n"
+ "mov x19, #0x7\n"
"ld1 { v29.h }[6], [x26]\n"
"ld1 { v28.h }[6], [x25]\n"
"ld1 { v27.h }[6], [x24]\n"
- "ld1 { v24.h }[6], [x23]\n"
- "ld1 { v25.h }[6], [x22]\n"
- "ld1 { v23.h }[6], [x21]\n"
- "ld1 { v22.h }[6], [x20]\n"
- "mov x19, #0x7\n"
+ "ld1 { v25.h }[6], [x23]\n"
+ "ld1 { v21.h }[6], [x22]\n"
+ "ld1 { v24.h }[6], [x21]\n"
+ "ld1 { v23.h }[6], [x20]\n"
"b 7f\n"
"4:" // odd_loads_1_4
"mov x19, #0x4\n"
"tbz %x[width], #0, 7f\n"
"ld1 { v30.h }[4], [x27]\n"
"ld1 { v29.h }[4], [x26]\n"
+ "mov x19, #0x5\n"
"ld1 { v28.h }[4], [x25]\n"
"ld1 { v27.h }[4], [x24]\n"
- "ld1 { v24.h }[4], [x23]\n"
- "ld1 { v25.h }[4], [x22]\n"
- "ld1 { v23.h }[4], [x21]\n"
- "ld1 { v22.h }[4], [x20]\n"
- "mov x19, #0x5\n"
+ "ld1 { v25.h }[4], [x23]\n"
+ "ld1 { v21.h }[4], [x22]\n"
+ "ld1 { v24.h }[4], [x21]\n"
+ "ld1 { v23.h }[4], [x20]\n"
"b 7f\n"
"5:" // odd_loads_2_0
"tbz %x[width], #1, 6f\n"
"ldr s30, [x27], #0x4\n"
"ldr s29, [x26], #0x4\n"
+ "mov x19, #0x2\n"
"ldr s28, [x25], #0x4\n"
"ldr s27, [x24], #0x4\n"
- "ldr s24, [x23], #0x4\n"
- "ldr s25, [x22], #0x4\n"
- "ldr s23, [x21], #0x4\n"
- "ldr s22, [x20], #0x4\n"
- "mov x19, #0x2\n"
+ "ldr s25, [x23], #0x4\n"
+ "ldr s21, [x22], #0x4\n"
+ "ldr s24, [x21], #0x4\n"
+ "ldr s23, [x20], #0x4\n"
"tbz %x[width], #0, 7f\n"
"ld1 { v30.h }[2], [x27]\n"
+ "mov x19, #0x3\n"
"ld1 { v29.h }[2], [x26]\n"
"ld1 { v28.h }[2], [x25]\n"
"ld1 { v27.h }[2], [x24]\n"
- "ld1 { v24.h }[2], [x23]\n"
- "ld1 { v25.h }[2], [x22]\n"
- "ld1 { v23.h }[2], [x21]\n"
- "ld1 { v22.h }[2], [x20]\n"
- "mov x19, #0x3\n"
+ "ld1 { v25.h }[2], [x23]\n"
+ "ld1 { v21.h }[2], [x22]\n"
+ "ld1 { v24.h }[2], [x21]\n"
+ "ld1 { v23.h }[2], [x20]\n"
"b 7f\n"
"6:" // odd_loads_1_0
"ldr h30, [x27, #0x0]\n"
+ "mov x19, #0x1\n"
"ldr h29, [x26, #0x0]\n"
"ldr h28, [x25, #0x0]\n"
"ldr h27, [x24, #0x0]\n"
- "ldr h24, [x23, #0x0]\n"
- "ldr h25, [x22, #0x0]\n"
- "ldr h23, [x21, #0x0]\n"
- "ldr h22, [x20, #0x0]\n"
- "mov x19, #0x1\n"
+ "ldr h25, [x23, #0x0]\n"
+ "ldr h21, [x22, #0x0]\n"
+ "ldr h24, [x21, #0x0]\n"
+ "ldr h23, [x20, #0x0]\n"
"7:" // Odd load end
- "zip1 v26.8h, v30.8h, v24.8h\n"
+ "zip1 v26.8h, v30.8h, v25.8h\n"
"subs x19, x19, #0x1\n"
- "zip1 v18.8h, v28.8h, v23.8h\n"
- "zip1 v20.8h, v26.8h, v18.8h\n"
- "zip1 v21.8h, v29.8h, v25.8h\n"
- "zip1 v19.8h, v27.8h, v22.8h\n"
- "zip1 v17.8h, v21.8h, v19.8h\n"
- "zip1 v16.8h, v20.8h, v17.8h\n"
- "str q16, [%x[out_ptr], #0x0]\n"
+ "zip1 v20.8h, v28.8h, v24.8h\n"
+ "zip1 v18.8h, v26.8h, v20.8h\n"
+ "zip1 v22.8h, v29.8h, v21.8h\n"
+ "zip1 v19.8h, v27.8h, v23.8h\n"
+ "zip1 v16.8h, v22.8h, v19.8h\n"
+ "zip1 v17.8h, v18.8h, v16.8h\n"
+ "str q17, [%x[out_ptr], #0x0]\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 8f\n"
- "zip2 v16.8h, v20.8h, v17.8h\n"
- "subs x19, x19, #0x1\n"
+ "zip2 v16.8h, v18.8h, v16.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
+ "subs x19, x19, #0x1\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 8f\n"
- "zip2 v18.8h, v26.8h, v18.8h\n"
- "zip2 v17.8h, v21.8h, v19.8h\n"
+ "zip2 v18.8h, v26.8h, v20.8h\n"
+ "zip2 v17.8h, v22.8h, v19.8h\n"
"subs x19, x19, #0x1\n"
"zip1 v16.8h, v18.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 8f\n"
"zip2 v16.8h, v18.8h, v17.8h\n"
- "subs x19, x19, #0x1\n"
"str q16, [%x[out_ptr], #0x0]\n"
+ "subs x19, x19, #0x1\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 8f\n"
- "zip2 v24.8h, v30.8h, v24.8h\n"
- "zip2 v21.8h, v28.8h, v23.8h\n"
+ "zip2 v25.8h, v30.8h, v25.8h\n"
+ "zip2 v20.8h, v28.8h, v24.8h\n"
"subs x19, x19, #0x1\n"
- "zip1 v18.8h, v24.8h, v21.8h\n"
- "zip2 v20.8h, v29.8h, v25.8h\n"
- "zip2 v19.8h, v27.8h, v22.8h\n"
- "zip1 v17.8h, v20.8h, v19.8h\n"
+ "zip1 v18.8h, v25.8h, v20.8h\n"
+ "zip2 v21.8h, v29.8h, v21.8h\n"
+ "zip2 v19.8h, v27.8h, v23.8h\n"
+ "zip1 v17.8h, v21.8h, v19.8h\n"
"zip1 v16.8h, v18.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 8f\n"
"zip2 v16.8h, v18.8h, v17.8h\n"
- "subs x19, x19, #0x1\n"
"str q16, [%x[out_ptr], #0x0]\n"
+ "subs x19, x19, #0x1\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 8f\n"
- "zip2 v18.8h, v24.8h, v21.8h\n"
- "zip2 v17.8h, v20.8h, v19.8h\n"
+ "zip2 v18.8h, v25.8h, v20.8h\n"
+ "zip2 v17.8h, v21.8h, v19.8h\n"
"zip1 v16.8h, v18.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"8:" // Odds skip
- : [out_ptr] "+r" (out_ptr), [width] "+r" (width)
+ : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width)
: [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
: "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
);
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp32.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp32.hpp
index b45e622a47..b67840b280 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp32.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp32.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -79,132 +79,132 @@ void interleave_block<8, 1, VLType::None, false>(
"prfm pldl1keep, [x20, #0x40]\n"
"blt 3f\n"
"2:" // Main loop head
- "ldr d29, [x27], #0x8\n"
+ "ldr d30, [x27], #0x8\n"
+ "subs %x[width], %x[width], #0x4\n"
+ "ldr d29, [x26], #0x8\n"
+ "cmp %x[width], #0x4\n"
+ "ldr d28, [x25], #0x8\n"
+ "fcvtl v30.4s, v30.4h\n"
+ "ldr d21, [x24], #0x8\n"
+ "ldr d27, [x23], #0x8\n"
+ "fcvtl v29.4s, v29.4h\n"
+ "ldr d26, [x22], #0x8\n"
+ "fcvtl v28.4s, v28.4h\n"
+ "zip1 v20.4s, v30.4s, v28.4s\n"
+ "ldr d25, [x21], #0x8\n"
+ "fcvtl v21.4s, v21.4h\n"
+ "zip2 v17.4s, v30.4s, v28.4s\n"
+ "ldr d24, [x20], #0x8\n"
+ "fcvtl v27.4s, v27.4h\n"
+ "zip1 v18.4s, v29.4s, v21.4s\n"
"prfm pldl1keep, [x27, #0x70]\n"
- "ldr d28, [x26], #0x8\n"
- "ldr d27, [x25], #0x8\n"
+ "fcvtl v26.4s, v26.4h\n"
+ "zip1 v23.4s, v20.4s, v18.4s\n"
"prfm pldl1keep, [x26, #0x70]\n"
- "ldr d26, [x24], #0x8\n"
+ "fcvtl v25.4s, v25.4h\n"
+ "zip2 v22.4s, v20.4s, v18.4s\n"
"prfm pldl1keep, [x25, #0x70]\n"
- "ldr d25, [x23], #0x8\n"
- "ldr d24, [x22], #0x8\n"
+ "fcvtl v24.4s, v24.4h\n"
+ "zip2 v16.4s, v29.4s, v21.4s\n"
"prfm pldl1keep, [x24, #0x70]\n"
- "ldr d23, [x21], #0x8\n"
- "ldr d22, [x20], #0x8\n"
"prfm pldl1keep, [x23, #0x70]\n"
+ "zip1 v21.4s, v17.4s, v16.4s\n"
+ "zip2 v20.4s, v17.4s, v16.4s\n"
"prfm pldl1keep, [x22, #0x70]\n"
- "fcvtl v29.4s, v29.4h\n"
- "fcvtl v28.4s, v28.4h\n"
"prfm pldl1keep, [x21, #0x70]\n"
- "fcvtl v27.4s, v27.4h\n"
- "zip1 v20.4s, v29.4s, v27.4s\n"
+ "zip1 v19.4s, v27.4s, v25.4s\n"
+ "zip2 v18.4s, v27.4s, v25.4s\n"
"prfm pldl1keep, [x20, #0x70]\n"
- "fcvtl v26.4s, v26.4h\n"
- "zip2 v18.4s, v29.4s, v27.4s\n"
- "fcvtl v25.4s, v25.4h\n"
- "fcvtl v24.4s, v24.4h\n"
- "zip1 v19.4s, v28.4s, v26.4s\n"
- "fcvtl v23.4s, v23.4h\n"
- "zip2 v17.4s, v28.4s, v26.4s\n"
- "fcvtl v22.4s, v22.4h\n"
- "zip1 v16.4s, v20.4s, v19.4s\n"
- "str q16, [%x[out_ptr], #0x0]\n"
- "zip2 v21.4s, v20.4s, v19.4s\n"
- "subs %x[width], %x[width], #0x4\n"
- "zip1 v20.4s, v18.4s, v17.4s\n"
- "cmp %x[width], #0x4\n"
- "zip2 v19.4s, v18.4s, v17.4s\n"
- "zip1 v18.4s, v25.4s, v23.4s\n"
- "zip1 v17.4s, v24.4s, v22.4s\n"
- "zip1 v16.4s, v18.4s, v17.4s\n"
+ "zip1 v17.4s, v26.4s, v24.4s\n"
+ "str q23, [%x[out_ptr], #0x0]\n"
+ "zip1 v16.4s, v19.4s, v17.4s\n"
"str q16, [%x[out_ptr], #0x10]\n"
- "zip2 v16.4s, v18.4s, v17.4s\n"
- "str q21, [%x[out_ptr], #0x20]\n"
- "zip2 v18.4s, v25.4s, v23.4s\n"
- "str q16, [%x[out_ptr], #0x30]\n"
- "zip2 v17.4s, v24.4s, v22.4s\n"
- "str q20, [%x[out_ptr], #0x40]\n"
- "zip1 v16.4s, v18.4s, v17.4s\n"
- "str q16, [%x[out_ptr], #0x50]\n"
- "zip2 v16.4s, v18.4s, v17.4s\n"
- "str q19, [%x[out_ptr], #0x60]\n"
+ "zip2 v17.4s, v19.4s, v17.4s\n"
+ "str q22, [%x[out_ptr], #0x20]\n"
+ "zip2 v16.4s, v26.4s, v24.4s\n"
+ "str q17, [%x[out_ptr], #0x30]\n"
+ "zip1 v17.4s, v18.4s, v16.4s\n"
+ "str q21, [%x[out_ptr], #0x40]\n"
+ "zip2 v16.4s, v18.4s, v16.4s\n"
+ "str q17, [%x[out_ptr], #0x50]\n"
+ "str q20, [%x[out_ptr], #0x60]\n"
"str q16, [%x[out_ptr], #0x70]\n"
"add %x[out_ptr], %x[out_ptr], #0x80\n"
"bge 2b\n"
"3:" // Main loop skip
"cbz %x[width], 6f\n"
"tbz %x[width], #1, 4f\n"
- "ldr s29, [x27], #0x4\n"
- "ldr s28, [x26], #0x4\n"
- "ldr s27, [x25], #0x4\n"
- "ldr s26, [x24], #0x4\n"
- "ldr s25, [x23], #0x4\n"
- "ldr s24, [x22], #0x4\n"
- "ldr s23, [x21], #0x4\n"
- "ldr s22, [x20], #0x4\n"
+ "ldr s30, [x27], #0x4\n"
+ "ldr s29, [x26], #0x4\n"
"mov x19, #0x2\n"
+ "ldr s28, [x25], #0x4\n"
+ "ldr s21, [x24], #0x4\n"
+ "ldr s27, [x23], #0x4\n"
+ "ldr s26, [x22], #0x4\n"
+ "ldr s25, [x21], #0x4\n"
+ "ldr s24, [x20], #0x4\n"
"tbz %x[width], #0, 5f\n"
- "ld1 { v29.h }[2], [x27]\n"
- "ld1 { v28.h }[2], [x26]\n"
- "ld1 { v27.h }[2], [x25]\n"
- "ld1 { v26.h }[2], [x24]\n"
- "ld1 { v25.h }[2], [x23]\n"
- "ld1 { v24.h }[2], [x22]\n"
- "ld1 { v23.h }[2], [x21]\n"
- "ld1 { v22.h }[2], [x20]\n"
+ "ld1 { v30.h }[2], [x27]\n"
"mov x19, #0x3\n"
+ "ld1 { v29.h }[2], [x26]\n"
+ "ld1 { v28.h }[2], [x25]\n"
+ "ld1 { v21.h }[2], [x24]\n"
+ "ld1 { v27.h }[2], [x23]\n"
+ "ld1 { v26.h }[2], [x22]\n"
+ "ld1 { v25.h }[2], [x21]\n"
+ "ld1 { v24.h }[2], [x20]\n"
"b 5f\n"
"4:" // odd_loads_1_0
- "ldr h29, [x27, #0x0]\n"
- "ldr h28, [x26, #0x0]\n"
- "ldr h27, [x25, #0x0]\n"
- "ldr h26, [x24, #0x0]\n"
- "ldr h25, [x23, #0x0]\n"
- "ldr h24, [x22, #0x0]\n"
- "ldr h23, [x21, #0x0]\n"
- "ldr h22, [x20, #0x0]\n"
+ "ldr h30, [x27, #0x0]\n"
"mov x19, #0x1\n"
+ "ldr h29, [x26, #0x0]\n"
+ "ldr h28, [x25, #0x0]\n"
+ "ldr h21, [x24, #0x0]\n"
+ "ldr h27, [x23, #0x0]\n"
+ "ldr h26, [x22, #0x0]\n"
+ "ldr h25, [x21, #0x0]\n"
+ "ldr h24, [x20, #0x0]\n"
"5:" // Odd load end
+ "fcvtl v30.4s, v30.4h\n"
"fcvtl v29.4s, v29.4h\n"
"fcvtl v28.4s, v28.4h\n"
+ "zip1 v20.4s, v30.4s, v28.4s\n"
+ "fcvtl v21.4s, v21.4h\n"
"fcvtl v27.4s, v27.4h\n"
- "zip1 v20.4s, v29.4s, v27.4s\n"
+ "zip1 v18.4s, v29.4s, v21.4s\n"
"fcvtl v26.4s, v26.4h\n"
"fcvtl v25.4s, v25.4h\n"
- "zip1 v19.4s, v28.4s, v26.4s\n"
+ "zip1 v23.4s, v20.4s, v18.4s\n"
+ "str q23, [%x[out_ptr], #0x0]\n"
+ "zip1 v19.4s, v27.4s, v25.4s\n"
"fcvtl v24.4s, v24.4h\n"
- "fcvtl v23.4s, v23.4h\n"
- "zip1 v16.4s, v20.4s, v19.4s\n"
- "fcvtl v22.4s, v22.4h\n"
- "zip1 v18.4s, v25.4s, v23.4s\n"
- "str q16, [%x[out_ptr], #0x0]\n"
"subs x19, x19, #0x1\n"
- "zip1 v17.4s, v24.4s, v22.4s\n"
- "zip1 v16.4s, v18.4s, v17.4s\n"
+ "zip1 v17.4s, v26.4s, v24.4s\n"
+ "zip1 v16.4s, v19.4s, v17.4s\n"
"str q16, [%x[out_ptr], #0x10]\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
"beq 6f\n"
- "zip2 v21.4s, v20.4s, v19.4s\n"
- "zip2 v16.4s, v18.4s, v17.4s\n"
- "str q21, [%x[out_ptr], #0x0]\n"
- "str q16, [%x[out_ptr], #0x10]\n"
+ "zip2 v22.4s, v20.4s, v18.4s\n"
+ "str q22, [%x[out_ptr], #0x0]\n"
+ "zip2 v17.4s, v19.4s, v17.4s\n"
"subs x19, x19, #0x1\n"
+ "str q17, [%x[out_ptr], #0x10]\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
"beq 6f\n"
- "zip2 v18.4s, v29.4s, v27.4s\n"
- "zip2 v17.4s, v28.4s, v26.4s\n"
- "zip1 v20.4s, v18.4s, v17.4s\n"
- "str q20, [%x[out_ptr], #0x0]\n"
- "zip2 v18.4s, v25.4s, v23.4s\n"
- "zip2 v17.4s, v24.4s, v22.4s\n"
- "zip1 v16.4s, v18.4s, v17.4s\n"
- "str q16, [%x[out_ptr], #0x10]\n"
+ "zip2 v17.4s, v30.4s, v28.4s\n"
+ "zip2 v16.4s, v29.4s, v21.4s\n"
+ "zip1 v21.4s, v17.4s, v16.4s\n"
+ "str q21, [%x[out_ptr], #0x0]\n"
+ "zip2 v18.4s, v27.4s, v25.4s\n"
+ "zip2 v16.4s, v26.4s, v24.4s\n"
+ "zip1 v17.4s, v18.4s, v16.4s\n"
+ "str q17, [%x[out_ptr], #0x10]\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
"6:" // Odds skip
- : [out_ptr] "+r" (out_ptr), [width] "+r" (width)
+ : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width)
: [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
- : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
+ : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp32_fp32.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp32_fp32.hpp
index 3f38859c1c..eefb8549ea 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp32_fp32.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp32_fp32.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -80,45 +80,45 @@ void interleave_block<8, 1, VLType::None, false>(
"blt 3f\n"
"2:" // Main loop head
"ldr q28, [x27], #0x10\n"
+ "subs %x[width], %x[width], #0x4\n"
+ "ldr q29, [x26], #0x10\n"
+ "cmp %x[width], #0x4\n"
+ "ldr q25, [x25], #0x10\n"
+ "zip1 v22.4s, v28.4s, v25.4s\n"
+ "ldr q21, [x24], #0x10\n"
+ "zip2 v28.4s, v28.4s, v25.4s\n"
+ "ldr q27, [x23], #0x10\n"
+ "ldr q26, [x22], #0x10\n"
+ "zip1 v20.4s, v29.4s, v21.4s\n"
+ "ldr q19, [x21], #0x10\n"
+ "zip2 v25.4s, v29.4s, v21.4s\n"
+ "ldr q24, [x20], #0x10\n"
+ "zip1 v23.4s, v22.4s, v20.4s\n"
"prfm pldl1keep, [x27, #0x70]\n"
- "ldr q27, [x26], #0x10\n"
- "ldr q26, [x25], #0x10\n"
- "zip1 v23.4s, v28.4s, v26.4s\n"
+ "zip2 v22.4s, v22.4s, v20.4s\n"
"prfm pldl1keep, [x26, #0x70]\n"
- "ldr q22, [x24], #0x10\n"
- "zip2 v26.4s, v28.4s, v26.4s\n"
+ "zip1 v21.4s, v28.4s, v25.4s\n"
"prfm pldl1keep, [x25, #0x70]\n"
- "ldr q25, [x23], #0x10\n"
- "zip1 v20.4s, v27.4s, v22.4s\n"
+ "zip1 v18.4s, v27.4s, v19.4s\n"
"prfm pldl1keep, [x24, #0x70]\n"
- "ldr q24, [x22], #0x10\n"
- "zip1 v16.4s, v23.4s, v20.4s\n"
+ "zip1 v16.4s, v26.4s, v24.4s\n"
"prfm pldl1keep, [x23, #0x70]\n"
- "ldr q19, [x21], #0x10\n"
- "zip2 v23.4s, v23.4s, v20.4s\n"
+ "zip1 v17.4s, v18.4s, v16.4s\n"
"prfm pldl1keep, [x22, #0x70]\n"
- "zip2 v22.4s, v27.4s, v22.4s\n"
- "ldr q21, [x20], #0x10\n"
- "zip1 v18.4s, v25.4s, v19.4s\n"
+ "zip2 v20.4s, v18.4s, v16.4s\n"
"prfm pldl1keep, [x21, #0x70]\n"
- "str q16, [%x[out_ptr], #0x0]\n"
- "zip1 v20.4s, v26.4s, v22.4s\n"
+ "zip2 v19.4s, v27.4s, v19.4s\n"
"prfm pldl1keep, [x20, #0x70]\n"
- "zip1 v16.4s, v24.4s, v21.4s\n"
- "subs %x[width], %x[width], #0x4\n"
- "zip1 v17.4s, v18.4s, v16.4s\n"
- "cmp %x[width], #0x4\n"
- "zip2 v16.4s, v18.4s, v16.4s\n"
+ "zip2 v16.4s, v26.4s, v24.4s\n"
+ "str q23, [%x[out_ptr], #0x0]\n"
+ "zip1 v18.4s, v19.4s, v16.4s\n"
"str q17, [%x[out_ptr], #0x10]\n"
- "zip2 v19.4s, v25.4s, v19.4s\n"
- "str q23, [%x[out_ptr], #0x20]\n"
- "zip2 v18.4s, v24.4s, v21.4s\n"
- "str q16, [%x[out_ptr], #0x30]\n"
- "zip1 v16.4s, v19.4s, v18.4s\n"
- "str q20, [%x[out_ptr], #0x40]\n"
- "zip2 v17.4s, v26.4s, v22.4s\n"
- "str q16, [%x[out_ptr], #0x50]\n"
- "zip2 v16.4s, v19.4s, v18.4s\n"
+ "zip2 v17.4s, v28.4s, v25.4s\n"
+ "str q22, [%x[out_ptr], #0x20]\n"
+ "zip2 v16.4s, v19.4s, v16.4s\n"
+ "str q20, [%x[out_ptr], #0x30]\n"
+ "str q21, [%x[out_ptr], #0x40]\n"
+ "str q18, [%x[out_ptr], #0x50]\n"
"str q17, [%x[out_ptr], #0x60]\n"
"str q16, [%x[out_ptr], #0x70]\n"
"add %x[out_ptr], %x[out_ptr], #0x80\n"
@@ -127,68 +127,68 @@ void interleave_block<8, 1, VLType::None, false>(
"cbz %x[width], 6f\n"
"tbz %x[width], #1, 4f\n"
"ldr d28, [x27], #0x8\n"
- "ldr d27, [x26], #0x8\n"
- "ldr d26, [x25], #0x8\n"
- "ldr d22, [x24], #0x8\n"
- "ldr d25, [x23], #0x8\n"
- "ldr d24, [x22], #0x8\n"
- "ldr d19, [x21], #0x8\n"
- "ldr d21, [x20], #0x8\n"
+ "ldr d29, [x26], #0x8\n"
"mov x19, #0x2\n"
+ "ldr d25, [x25], #0x8\n"
+ "ldr d21, [x24], #0x8\n"
+ "ldr d27, [x23], #0x8\n"
+ "ldr d26, [x22], #0x8\n"
+ "ldr d19, [x21], #0x8\n"
+ "ldr d24, [x20], #0x8\n"
"tbz %x[width], #0, 5f\n"
"ld1 { v28.s }[2], [x27]\n"
- "ld1 { v27.s }[2], [x26]\n"
- "ld1 { v26.s }[2], [x25]\n"
- "ld1 { v22.s }[2], [x24]\n"
- "ld1 { v25.s }[2], [x23]\n"
- "ld1 { v24.s }[2], [x22]\n"
- "ld1 { v19.s }[2], [x21]\n"
- "ld1 { v21.s }[2], [x20]\n"
"mov x19, #0x3\n"
+ "ld1 { v29.s }[2], [x26]\n"
+ "ld1 { v25.s }[2], [x25]\n"
+ "ld1 { v21.s }[2], [x24]\n"
+ "ld1 { v27.s }[2], [x23]\n"
+ "ld1 { v26.s }[2], [x22]\n"
+ "ld1 { v19.s }[2], [x21]\n"
+ "ld1 { v24.s }[2], [x20]\n"
"b 5f\n"
"4:" // odd_loads_1_0
"ldr s28, [x27, #0x0]\n"
- "ldr s27, [x26, #0x0]\n"
- "ldr s26, [x25, #0x0]\n"
- "ldr s22, [x24, #0x0]\n"
- "ldr s25, [x23, #0x0]\n"
- "ldr s24, [x22, #0x0]\n"
- "ldr s19, [x21, #0x0]\n"
- "ldr s21, [x20, #0x0]\n"
"mov x19, #0x1\n"
+ "ldr s29, [x26, #0x0]\n"
+ "ldr s25, [x25, #0x0]\n"
+ "ldr s21, [x24, #0x0]\n"
+ "ldr s27, [x23, #0x0]\n"
+ "ldr s26, [x22, #0x0]\n"
+ "ldr s19, [x21, #0x0]\n"
+ "ldr s24, [x20, #0x0]\n"
"5:" // Odd load end
- "zip1 v23.4s, v28.4s, v26.4s\n"
+ "zip1 v22.4s, v28.4s, v25.4s\n"
"subs x19, x19, #0x1\n"
- "zip1 v20.4s, v27.4s, v22.4s\n"
- "zip1 v16.4s, v23.4s, v20.4s\n"
- "str q16, [%x[out_ptr], #0x0]\n"
- "zip1 v18.4s, v25.4s, v19.4s\n"
- "zip1 v16.4s, v24.4s, v21.4s\n"
+ "zip1 v20.4s, v29.4s, v21.4s\n"
+ "zip1 v23.4s, v22.4s, v20.4s\n"
+ "str q23, [%x[out_ptr], #0x0]\n"
+ "zip1 v18.4s, v27.4s, v19.4s\n"
+ "zip1 v16.4s, v26.4s, v24.4s\n"
"zip1 v17.4s, v18.4s, v16.4s\n"
"str q17, [%x[out_ptr], #0x10]\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
"beq 6f\n"
- "zip2 v23.4s, v23.4s, v20.4s\n"
- "zip2 v16.4s, v18.4s, v16.4s\n"
- "str q23, [%x[out_ptr], #0x0]\n"
- "str q16, [%x[out_ptr], #0x10]\n"
+ "zip2 v22.4s, v22.4s, v20.4s\n"
+ "str q22, [%x[out_ptr], #0x0]\n"
+ "zip2 v20.4s, v18.4s, v16.4s\n"
"subs x19, x19, #0x1\n"
+ "str q20, [%x[out_ptr], #0x10]\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
"beq 6f\n"
- "zip2 v26.4s, v28.4s, v26.4s\n"
- "zip2 v22.4s, v27.4s, v22.4s\n"
- "zip1 v20.4s, v26.4s, v22.4s\n"
- "str q20, [%x[out_ptr], #0x0]\n"
- "zip2 v19.4s, v25.4s, v19.4s\n"
- "zip2 v18.4s, v24.4s, v21.4s\n"
- "zip1 v16.4s, v19.4s, v18.4s\n"
- "str q16, [%x[out_ptr], #0x10]\n"
+ "zip2 v28.4s, v28.4s, v25.4s\n"
+ "zip2 v25.4s, v29.4s, v21.4s\n"
+ "zip1 v21.4s, v28.4s, v25.4s\n"
+ "str q21, [%x[out_ptr], #0x0]\n"
+ "zip2 v19.4s, v27.4s, v19.4s\n"
+ "zip2 v16.4s, v26.4s, v24.4s\n"
+ "zip1 v18.4s, v19.4s, v16.4s\n"
+ "str q18, [%x[out_ptr], #0x10]\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
"6:" // Odds skip
- : [out_ptr] "+r" (out_ptr), [width] "+r" (width)
+ : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width)
: [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
- : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
+ : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16.hpp
index 03f552a575..b0523b96ce 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -80,51 +80,51 @@ void interleave_block<8, 1, VLType::None, false>(
"blt 3f\n"
"2:" // Main loop head
"ldr q30, [x27], #0x10\n"
- "prfm pldl1keep, [x27, #0x70]\n"
+ "subs %x[width], %x[width], #0x8\n"
"ldr q29, [x26], #0x10\n"
+ "cmp %x[width], #0x8\n"
"ldr q28, [x25], #0x10\n"
- "prfm pldl1keep, [x26, #0x70]\n"
"ldr q27, [x24], #0x10\n"
+ "ldr q25, [x23], #0x10\n"
+ "zip1 v26.8h, v30.8h, v25.8h\n"
+ "ldr q21, [x22], #0x10\n"
+ "zip2 v25.8h, v30.8h, v25.8h\n"
+ "ldr q24, [x21], #0x10\n"
+ "ldr q23, [x20], #0x10\n"
+ "zip1 v22.8h, v29.8h, v21.8h\n"
+ "prfm pldl1keep, [x27, #0x70]\n"
+ "zip2 v21.8h, v29.8h, v21.8h\n"
+ "prfm pldl1keep, [x26, #0x70]\n"
+ "zip1 v20.8h, v28.8h, v24.8h\n"
"prfm pldl1keep, [x25, #0x70]\n"
- "ldr q24, [x23], #0x10\n"
- "zip1 v26.8h, v30.8h, v24.8h\n"
+ "zip1 v18.8h, v26.8h, v20.8h\n"
"prfm pldl1keep, [x24, #0x70]\n"
- "ldr q25, [x22], #0x10\n"
- "zip2 v24.8h, v30.8h, v24.8h\n"
+ "zip1 v19.8h, v27.8h, v23.8h\n"
"prfm pldl1keep, [x23, #0x70]\n"
- "ldr q23, [x21], #0x10\n"
- "zip1 v21.8h, v29.8h, v25.8h\n"
+ "zip1 v16.8h, v22.8h, v19.8h\n"
"prfm pldl1keep, [x22, #0x70]\n"
- "ldr q22, [x20], #0x10\n"
- "zip1 v18.8h, v28.8h, v23.8h\n"
+ "zip1 v17.8h, v18.8h, v16.8h\n"
"prfm pldl1keep, [x21, #0x70]\n"
- "subs %x[width], %x[width], #0x8\n"
- "zip1 v20.8h, v26.8h, v18.8h\n"
+ "zip2 v16.8h, v18.8h, v16.8h\n"
"prfm pldl1keep, [x20, #0x70]\n"
- "zip1 v19.8h, v27.8h, v22.8h\n"
- "cmp %x[width], #0x8\n"
- "zip1 v17.8h, v21.8h, v19.8h\n"
- "zip2 v18.8h, v26.8h, v18.8h\n"
- "zip1 v16.8h, v20.8h, v17.8h\n"
- "str q16, [%x[out_ptr], #0x0]\n"
- "zip2 v16.8h, v20.8h, v17.8h\n"
+ "zip2 v18.8h, v26.8h, v20.8h\n"
+ "str q17, [%x[out_ptr], #0x0]\n"
+ "zip2 v17.8h, v22.8h, v19.8h\n"
"str q16, [%x[out_ptr], #0x10]\n"
- "zip2 v17.8h, v21.8h, v19.8h\n"
"zip1 v16.8h, v18.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x20]\n"
"zip2 v16.8h, v18.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x30]\n"
- "zip2 v21.8h, v28.8h, v23.8h\n"
- "zip1 v18.8h, v24.8h, v21.8h\n"
- "zip2 v20.8h, v29.8h, v25.8h\n"
- "zip2 v19.8h, v27.8h, v22.8h\n"
- "zip1 v17.8h, v20.8h, v19.8h\n"
+ "zip2 v20.8h, v28.8h, v24.8h\n"
+ "zip1 v18.8h, v25.8h, v20.8h\n"
+ "zip2 v19.8h, v27.8h, v23.8h\n"
+ "zip1 v17.8h, v21.8h, v19.8h\n"
"zip1 v16.8h, v18.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x40]\n"
"zip2 v16.8h, v18.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x50]\n"
- "zip2 v18.8h, v24.8h, v21.8h\n"
- "zip2 v17.8h, v20.8h, v19.8h\n"
+ "zip2 v18.8h, v25.8h, v20.8h\n"
+ "zip2 v17.8h, v21.8h, v19.8h\n"
"zip1 v16.8h, v18.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x60]\n"
"zip2 v16.8h, v18.8h, v17.8h\n"
@@ -138,129 +138,129 @@ void interleave_block<8, 1, VLType::None, false>(
"ldr d29, [x26], #0x8\n"
"ldr d28, [x25], #0x8\n"
"ldr d27, [x24], #0x8\n"
- "ldr d24, [x23], #0x8\n"
- "ldr d25, [x22], #0x8\n"
- "ldr d23, [x21], #0x8\n"
- "ldr d22, [x20], #0x8\n"
+ "ldr d25, [x23], #0x8\n"
+ "ldr d21, [x22], #0x8\n"
+ "ldr d24, [x21], #0x8\n"
+ "ldr d23, [x20], #0x8\n"
"tbz %x[width], #1, 4f\n"
"ld1 { v30.s }[2], [x27], #0x4\n"
+ "mov x19, #0x6\n"
"ld1 { v29.s }[2], [x26], #0x4\n"
"ld1 { v28.s }[2], [x25], #0x4\n"
"ld1 { v27.s }[2], [x24], #0x4\n"
- "ld1 { v24.s }[2], [x23], #0x4\n"
- "ld1 { v25.s }[2], [x22], #0x4\n"
- "ld1 { v23.s }[2], [x21], #0x4\n"
- "ld1 { v22.s }[2], [x20], #0x4\n"
- "mov x19, #0x6\n"
+ "ld1 { v25.s }[2], [x23], #0x4\n"
+ "ld1 { v21.s }[2], [x22], #0x4\n"
+ "ld1 { v24.s }[2], [x21], #0x4\n"
+ "ld1 { v23.s }[2], [x20], #0x4\n"
"tbz %x[width], #0, 7f\n"
"ld1 { v30.h }[6], [x27]\n"
+ "mov x19, #0x7\n"
"ld1 { v29.h }[6], [x26]\n"
"ld1 { v28.h }[6], [x25]\n"
"ld1 { v27.h }[6], [x24]\n"
- "ld1 { v24.h }[6], [x23]\n"
- "ld1 { v25.h }[6], [x22]\n"
- "ld1 { v23.h }[6], [x21]\n"
- "ld1 { v22.h }[6], [x20]\n"
- "mov x19, #0x7\n"
+ "ld1 { v25.h }[6], [x23]\n"
+ "ld1 { v21.h }[6], [x22]\n"
+ "ld1 { v24.h }[6], [x21]\n"
+ "ld1 { v23.h }[6], [x20]\n"
"b 7f\n"
"4:" // odd_loads_1_4
"mov x19, #0x4\n"
"tbz %x[width], #0, 7f\n"
"ld1 { v30.h }[4], [x27]\n"
"ld1 { v29.h }[4], [x26]\n"
+ "mov x19, #0x5\n"
"ld1 { v28.h }[4], [x25]\n"
"ld1 { v27.h }[4], [x24]\n"
- "ld1 { v24.h }[4], [x23]\n"
- "ld1 { v25.h }[4], [x22]\n"
- "ld1 { v23.h }[4], [x21]\n"
- "ld1 { v22.h }[4], [x20]\n"
- "mov x19, #0x5\n"
+ "ld1 { v25.h }[4], [x23]\n"
+ "ld1 { v21.h }[4], [x22]\n"
+ "ld1 { v24.h }[4], [x21]\n"
+ "ld1 { v23.h }[4], [x20]\n"
"b 7f\n"
"5:" // odd_loads_2_0
"tbz %x[width], #1, 6f\n"
"ldr s30, [x27], #0x4\n"
"ldr s29, [x26], #0x4\n"
+ "mov x19, #0x2\n"
"ldr s28, [x25], #0x4\n"
"ldr s27, [x24], #0x4\n"
- "ldr s24, [x23], #0x4\n"
- "ldr s25, [x22], #0x4\n"
- "ldr s23, [x21], #0x4\n"
- "ldr s22, [x20], #0x4\n"
- "mov x19, #0x2\n"
+ "ldr s25, [x23], #0x4\n"
+ "ldr s21, [x22], #0x4\n"
+ "ldr s24, [x21], #0x4\n"
+ "ldr s23, [x20], #0x4\n"
"tbz %x[width], #0, 7f\n"
"ld1 { v30.h }[2], [x27]\n"
+ "mov x19, #0x3\n"
"ld1 { v29.h }[2], [x26]\n"
"ld1 { v28.h }[2], [x25]\n"
"ld1 { v27.h }[2], [x24]\n"
- "ld1 { v24.h }[2], [x23]\n"
- "ld1 { v25.h }[2], [x22]\n"
- "ld1 { v23.h }[2], [x21]\n"
- "ld1 { v22.h }[2], [x20]\n"
- "mov x19, #0x3\n"
+ "ld1 { v25.h }[2], [x23]\n"
+ "ld1 { v21.h }[2], [x22]\n"
+ "ld1 { v24.h }[2], [x21]\n"
+ "ld1 { v23.h }[2], [x20]\n"
"b 7f\n"
"6:" // odd_loads_1_0
"ldr h30, [x27, #0x0]\n"
+ "mov x19, #0x1\n"
"ldr h29, [x26, #0x0]\n"
"ldr h28, [x25, #0x0]\n"
"ldr h27, [x24, #0x0]\n"
- "ldr h24, [x23, #0x0]\n"
- "ldr h25, [x22, #0x0]\n"
- "ldr h23, [x21, #0x0]\n"
- "ldr h22, [x20, #0x0]\n"
- "mov x19, #0x1\n"
+ "ldr h25, [x23, #0x0]\n"
+ "ldr h21, [x22, #0x0]\n"
+ "ldr h24, [x21, #0x0]\n"
+ "ldr h23, [x20, #0x0]\n"
"7:" // Odd load end
- "zip1 v26.8h, v30.8h, v24.8h\n"
+ "zip1 v26.8h, v30.8h, v25.8h\n"
"subs x19, x19, #0x1\n"
- "zip1 v18.8h, v28.8h, v23.8h\n"
- "zip1 v20.8h, v26.8h, v18.8h\n"
- "zip1 v21.8h, v29.8h, v25.8h\n"
- "zip1 v19.8h, v27.8h, v22.8h\n"
- "zip1 v17.8h, v21.8h, v19.8h\n"
- "zip1 v16.8h, v20.8h, v17.8h\n"
- "str q16, [%x[out_ptr], #0x0]\n"
+ "zip1 v20.8h, v28.8h, v24.8h\n"
+ "zip1 v18.8h, v26.8h, v20.8h\n"
+ "zip1 v22.8h, v29.8h, v21.8h\n"
+ "zip1 v19.8h, v27.8h, v23.8h\n"
+ "zip1 v16.8h, v22.8h, v19.8h\n"
+ "zip1 v17.8h, v18.8h, v16.8h\n"
+ "str q17, [%x[out_ptr], #0x0]\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 8f\n"
- "zip2 v16.8h, v20.8h, v17.8h\n"
- "subs x19, x19, #0x1\n"
+ "zip2 v16.8h, v18.8h, v16.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
+ "subs x19, x19, #0x1\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 8f\n"
- "zip2 v18.8h, v26.8h, v18.8h\n"
- "zip2 v17.8h, v21.8h, v19.8h\n"
+ "zip2 v18.8h, v26.8h, v20.8h\n"
+ "zip2 v17.8h, v22.8h, v19.8h\n"
"subs x19, x19, #0x1\n"
"zip1 v16.8h, v18.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 8f\n"
"zip2 v16.8h, v18.8h, v17.8h\n"
- "subs x19, x19, #0x1\n"
"str q16, [%x[out_ptr], #0x0]\n"
+ "subs x19, x19, #0x1\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 8f\n"
- "zip2 v24.8h, v30.8h, v24.8h\n"
- "zip2 v21.8h, v28.8h, v23.8h\n"
+ "zip2 v25.8h, v30.8h, v25.8h\n"
+ "zip2 v20.8h, v28.8h, v24.8h\n"
"subs x19, x19, #0x1\n"
- "zip1 v18.8h, v24.8h, v21.8h\n"
- "zip2 v20.8h, v29.8h, v25.8h\n"
- "zip2 v19.8h, v27.8h, v22.8h\n"
- "zip1 v17.8h, v20.8h, v19.8h\n"
+ "zip1 v18.8h, v25.8h, v20.8h\n"
+ "zip2 v21.8h, v29.8h, v21.8h\n"
+ "zip2 v19.8h, v27.8h, v23.8h\n"
+ "zip1 v17.8h, v21.8h, v19.8h\n"
"zip1 v16.8h, v18.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 8f\n"
"zip2 v16.8h, v18.8h, v17.8h\n"
- "subs x19, x19, #0x1\n"
"str q16, [%x[out_ptr], #0x0]\n"
+ "subs x19, x19, #0x1\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 8f\n"
- "zip2 v18.8h, v24.8h, v21.8h\n"
- "zip2 v17.8h, v20.8h, v19.8h\n"
+ "zip2 v18.8h, v25.8h, v20.8h\n"
+ "zip2 v17.8h, v21.8h, v19.8h\n"
"zip1 v16.8h, v18.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"8:" // Odds skip
- : [out_ptr] "+r" (out_ptr), [width] "+r" (width)
+ : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width)
: [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
: "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
);
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16_summing.hpp
index 35c7719de7..292a38f401 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16_summing.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16_summing.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -96,58 +96,58 @@ void interleave_block<8, 1, VLType::None, true>(
"movi v1.8h, #0x0\n"
"4:" // no_accumulate_16
"ldr q30, [x27], #0x10\n"
- "prfm pldl1keep, [x27, #0x70]\n"
+ "add x19, x19, #0x1\n"
"ldr q29, [x26], #0x10\n"
+ "subs %x[width], %x[width], #0x8\n"
"ldr q28, [x25], #0x10\n"
- "prfm pldl1keep, [x26, #0x70]\n"
+ "cmp %x[width], #0x8\n"
"ldr q27, [x24], #0x10\n"
+ "ldr q25, [x23], #0x10\n"
+ "zip1 v26.8h, v30.8h, v25.8h\n"
+ "ldr q21, [x22], #0x10\n"
+ "zip2 v25.8h, v30.8h, v25.8h\n"
+ "ldr q24, [x21], #0x10\n"
+ "ldr q23, [x20], #0x10\n"
+ "zip1 v22.8h, v29.8h, v21.8h\n"
+ "prfm pldl1keep, [x27, #0x70]\n"
+ "zip2 v21.8h, v29.8h, v21.8h\n"
+ "prfm pldl1keep, [x26, #0x70]\n"
+ "zip1 v20.8h, v28.8h, v24.8h\n"
"prfm pldl1keep, [x25, #0x70]\n"
- "ldr q24, [x23], #0x10\n"
- "zip1 v26.8h, v30.8h, v24.8h\n"
+ "zip1 v18.8h, v26.8h, v20.8h\n"
"prfm pldl1keep, [x24, #0x70]\n"
- "ldr q25, [x22], #0x10\n"
- "zip2 v24.8h, v30.8h, v24.8h\n"
+ "zip1 v19.8h, v27.8h, v23.8h\n"
"prfm pldl1keep, [x23, #0x70]\n"
- "ldr q23, [x21], #0x10\n"
- "zip1 v21.8h, v29.8h, v25.8h\n"
+ "zip1 v16.8h, v22.8h, v19.8h\n"
"prfm pldl1keep, [x22, #0x70]\n"
- "ldr q22, [x20], #0x10\n"
- "zip1 v18.8h, v28.8h, v23.8h\n"
+ "zip1 v17.8h, v18.8h, v16.8h\n"
"prfm pldl1keep, [x21, #0x70]\n"
- "add x19, x19, #0x1\n"
- "zip1 v20.8h, v26.8h, v18.8h\n"
- "prfm pldl1keep, [x20, #0x70]\n"
- "zip1 v19.8h, v27.8h, v22.8h\n"
- "subs %x[width], %x[width], #0x8\n"
- "zip1 v17.8h, v21.8h, v19.8h\n"
- "cmp %x[width], #0x8\n"
- "zip2 v18.8h, v26.8h, v18.8h\n"
- "zip1 v16.8h, v20.8h, v17.8h\n"
- "str q16, [%x[out_ptr], #0x0]\n"
- "add v1.8h, v1.8h, v16.8h\n"
- "zip2 v17.8h, v20.8h, v17.8h\n"
- "str q17, [%x[out_ptr], #0x10]\n"
- "zip2 v16.8h, v21.8h, v19.8h\n"
"add v1.8h, v1.8h, v17.8h\n"
- "zip1 v17.8h, v18.8h, v16.8h\n"
- "str q17, [%x[out_ptr], #0x20]\n"
+ "prfm pldl1keep, [x20, #0x70]\n"
"zip2 v16.8h, v18.8h, v16.8h\n"
+ "str q17, [%x[out_ptr], #0x0]\n"
+ "zip2 v18.8h, v26.8h, v20.8h\n"
+ "str q16, [%x[out_ptr], #0x10]\n"
+ "add v1.8h, v1.8h, v16.8h\n"
+ "zip2 v17.8h, v22.8h, v19.8h\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [%x[out_ptr], #0x20]\n"
+ "add v1.8h, v1.8h, v16.8h\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x30]\n"
- "add v1.8h, v1.8h, v17.8h\n"
- "zip2 v21.8h, v28.8h, v23.8h\n"
- "zip1 v18.8h, v24.8h, v21.8h\n"
+ "zip2 v20.8h, v28.8h, v24.8h\n"
"add v1.8h, v1.8h, v16.8h\n"
- "zip2 v20.8h, v29.8h, v25.8h\n"
- "zip2 v19.8h, v27.8h, v22.8h\n"
- "zip1 v17.8h, v20.8h, v19.8h\n"
+ "zip1 v18.8h, v25.8h, v20.8h\n"
+ "zip2 v19.8h, v27.8h, v23.8h\n"
+ "zip1 v17.8h, v21.8h, v19.8h\n"
"zip1 v16.8h, v18.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x40]\n"
"add v1.8h, v1.8h, v16.8h\n"
"zip2 v16.8h, v18.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x50]\n"
- "zip2 v18.8h, v24.8h, v21.8h\n"
+ "zip2 v18.8h, v25.8h, v20.8h\n"
"add v1.8h, v1.8h, v16.8h\n"
- "zip2 v17.8h, v20.8h, v19.8h\n"
+ "zip2 v17.8h, v21.8h, v19.8h\n"
"zip1 v16.8h, v18.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x60]\n"
"add v1.8h, v1.8h, v16.8h\n"
@@ -163,140 +163,140 @@ void interleave_block<8, 1, VLType::None, true>(
"ldr d29, [x26], #0x8\n"
"ldr d28, [x25], #0x8\n"
"ldr d27, [x24], #0x8\n"
- "ldr d24, [x23], #0x8\n"
- "ldr d25, [x22], #0x8\n"
- "ldr d23, [x21], #0x8\n"
- "ldr d22, [x20], #0x8\n"
+ "ldr d25, [x23], #0x8\n"
+ "ldr d21, [x22], #0x8\n"
+ "ldr d24, [x21], #0x8\n"
+ "ldr d23, [x20], #0x8\n"
"tbz %x[width], #1, 6f\n"
"ld1 { v30.s }[2], [x27], #0x4\n"
+ "mov x19, #0x6\n"
"ld1 { v29.s }[2], [x26], #0x4\n"
"ld1 { v28.s }[2], [x25], #0x4\n"
"ld1 { v27.s }[2], [x24], #0x4\n"
- "ld1 { v24.s }[2], [x23], #0x4\n"
- "ld1 { v25.s }[2], [x22], #0x4\n"
- "ld1 { v23.s }[2], [x21], #0x4\n"
- "ld1 { v22.s }[2], [x20], #0x4\n"
- "mov x19, #0x6\n"
+ "ld1 { v25.s }[2], [x23], #0x4\n"
+ "ld1 { v21.s }[2], [x22], #0x4\n"
+ "ld1 { v24.s }[2], [x21], #0x4\n"
+ "ld1 { v23.s }[2], [x20], #0x4\n"
"tbz %x[width], #0, 9f\n"
"ld1 { v30.h }[6], [x27]\n"
+ "mov x19, #0x7\n"
"ld1 { v29.h }[6], [x26]\n"
"ld1 { v28.h }[6], [x25]\n"
"ld1 { v27.h }[6], [x24]\n"
- "ld1 { v24.h }[6], [x23]\n"
- "ld1 { v25.h }[6], [x22]\n"
- "ld1 { v23.h }[6], [x21]\n"
- "ld1 { v22.h }[6], [x20]\n"
- "mov x19, #0x7\n"
+ "ld1 { v25.h }[6], [x23]\n"
+ "ld1 { v21.h }[6], [x22]\n"
+ "ld1 { v24.h }[6], [x21]\n"
+ "ld1 { v23.h }[6], [x20]\n"
"b 9f\n"
"6:" // odd_loads_1_4
"mov x19, #0x4\n"
"tbz %x[width], #0, 9f\n"
"ld1 { v30.h }[4], [x27]\n"
"ld1 { v29.h }[4], [x26]\n"
+ "mov x19, #0x5\n"
"ld1 { v28.h }[4], [x25]\n"
"ld1 { v27.h }[4], [x24]\n"
- "ld1 { v24.h }[4], [x23]\n"
- "ld1 { v25.h }[4], [x22]\n"
- "ld1 { v23.h }[4], [x21]\n"
- "ld1 { v22.h }[4], [x20]\n"
- "mov x19, #0x5\n"
+ "ld1 { v25.h }[4], [x23]\n"
+ "ld1 { v21.h }[4], [x22]\n"
+ "ld1 { v24.h }[4], [x21]\n"
+ "ld1 { v23.h }[4], [x20]\n"
"b 9f\n"
"7:" // odd_loads_2_0
"tbz %x[width], #1, 8f\n"
"ldr s30, [x27], #0x4\n"
"ldr s29, [x26], #0x4\n"
+ "mov x19, #0x2\n"
"ldr s28, [x25], #0x4\n"
"ldr s27, [x24], #0x4\n"
- "ldr s24, [x23], #0x4\n"
- "ldr s25, [x22], #0x4\n"
- "ldr s23, [x21], #0x4\n"
- "ldr s22, [x20], #0x4\n"
- "mov x19, #0x2\n"
+ "ldr s25, [x23], #0x4\n"
+ "ldr s21, [x22], #0x4\n"
+ "ldr s24, [x21], #0x4\n"
+ "ldr s23, [x20], #0x4\n"
"tbz %x[width], #0, 9f\n"
"ld1 { v30.h }[2], [x27]\n"
+ "mov x19, #0x3\n"
"ld1 { v29.h }[2], [x26]\n"
"ld1 { v28.h }[2], [x25]\n"
"ld1 { v27.h }[2], [x24]\n"
- "ld1 { v24.h }[2], [x23]\n"
- "ld1 { v25.h }[2], [x22]\n"
- "ld1 { v23.h }[2], [x21]\n"
- "ld1 { v22.h }[2], [x20]\n"
- "mov x19, #0x3\n"
+ "ld1 { v25.h }[2], [x23]\n"
+ "ld1 { v21.h }[2], [x22]\n"
+ "ld1 { v24.h }[2], [x21]\n"
+ "ld1 { v23.h }[2], [x20]\n"
"b 9f\n"
"8:" // odd_loads_1_0
"ldr h30, [x27, #0x0]\n"
+ "mov x19, #0x1\n"
"ldr h29, [x26, #0x0]\n"
"ldr h28, [x25, #0x0]\n"
"ldr h27, [x24, #0x0]\n"
- "ldr h24, [x23, #0x0]\n"
- "ldr h25, [x22, #0x0]\n"
- "ldr h23, [x21, #0x0]\n"
- "ldr h22, [x20, #0x0]\n"
- "mov x19, #0x1\n"
+ "ldr h25, [x23, #0x0]\n"
+ "ldr h21, [x22, #0x0]\n"
+ "ldr h24, [x21, #0x0]\n"
+ "ldr h23, [x20, #0x0]\n"
"9:" // Odd load end
- "zip1 v26.8h, v30.8h, v24.8h\n"
+ "zip1 v26.8h, v30.8h, v25.8h\n"
"subs x19, x19, #0x1\n"
- "zip1 v18.8h, v28.8h, v23.8h\n"
- "zip1 v20.8h, v26.8h, v18.8h\n"
- "zip1 v21.8h, v29.8h, v25.8h\n"
- "zip1 v19.8h, v27.8h, v22.8h\n"
- "zip1 v17.8h, v21.8h, v19.8h\n"
- "zip1 v16.8h, v20.8h, v17.8h\n"
- "str q16, [%x[out_ptr], #0x0]\n"
- "add v1.8h, v1.8h, v16.8h\n"
+ "zip1 v20.8h, v28.8h, v24.8h\n"
+ "zip1 v18.8h, v26.8h, v20.8h\n"
+ "zip1 v22.8h, v29.8h, v21.8h\n"
+ "zip1 v19.8h, v27.8h, v23.8h\n"
+ "zip1 v16.8h, v22.8h, v19.8h\n"
+ "zip1 v17.8h, v18.8h, v16.8h\n"
+ "str q17, [%x[out_ptr], #0x0]\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
+ "add v1.8h, v1.8h, v17.8h\n"
"beq 10f\n"
- "zip2 v17.8h, v20.8h, v17.8h\n"
+ "zip2 v16.8h, v18.8h, v16.8h\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
"subs x19, x19, #0x1\n"
- "add v1.8h, v1.8h, v17.8h\n"
- "str q17, [%x[out_ptr], #0x0]\n"
+ "add v1.8h, v1.8h, v16.8h\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 10f\n"
- "zip2 v18.8h, v26.8h, v18.8h\n"
- "zip2 v16.8h, v21.8h, v19.8h\n"
+ "zip2 v18.8h, v26.8h, v20.8h\n"
+ "zip2 v17.8h, v22.8h, v19.8h\n"
"subs x19, x19, #0x1\n"
- "zip1 v17.8h, v18.8h, v16.8h\n"
- "str q17, [%x[out_ptr], #0x0]\n"
- "add v1.8h, v1.8h, v17.8h\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
+ "add v1.8h, v1.8h, v16.8h\n"
"beq 10f\n"
- "zip2 v16.8h, v18.8h, v16.8h\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
"subs x19, x19, #0x1\n"
"add v1.8h, v1.8h, v16.8h\n"
- "str q16, [%x[out_ptr], #0x0]\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 10f\n"
- "zip2 v24.8h, v30.8h, v24.8h\n"
- "zip2 v21.8h, v28.8h, v23.8h\n"
+ "zip2 v25.8h, v30.8h, v25.8h\n"
+ "zip2 v20.8h, v28.8h, v24.8h\n"
"subs x19, x19, #0x1\n"
- "zip1 v18.8h, v24.8h, v21.8h\n"
- "zip2 v20.8h, v29.8h, v25.8h\n"
- "zip2 v19.8h, v27.8h, v22.8h\n"
- "zip1 v17.8h, v20.8h, v19.8h\n"
+ "zip1 v18.8h, v25.8h, v20.8h\n"
+ "zip2 v21.8h, v29.8h, v21.8h\n"
+ "zip2 v19.8h, v27.8h, v23.8h\n"
+ "zip1 v17.8h, v21.8h, v19.8h\n"
"zip1 v16.8h, v18.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
- "add v1.8h, v1.8h, v16.8h\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
+ "add v1.8h, v1.8h, v16.8h\n"
"beq 10f\n"
"zip2 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
"subs x19, x19, #0x1\n"
"add v1.8h, v1.8h, v16.8h\n"
- "str q16, [%x[out_ptr], #0x0]\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 10f\n"
- "zip2 v18.8h, v24.8h, v21.8h\n"
- "zip2 v17.8h, v20.8h, v19.8h\n"
+ "zip2 v18.8h, v25.8h, v20.8h\n"
+ "zip2 v17.8h, v21.8h, v19.8h\n"
"zip1 v16.8h, v18.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
- "add v1.8h, v1.8h, v16.8h\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
+ "add v1.8h, v1.8h, v16.8h\n"
"10:" // Odds skip
"saddw v0.4s, v0.4s, v1.4h\n"
"str q0, [%x[out_ptr], #0x0]\n"
"saddw2 v31.4s, v31.4s, v1.8h\n"
"str q31, [%x[out_ptr], #0x10]\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
- : [out_ptr] "+r" (out_ptr), [width] "+r" (width)
+ : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width)
: [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
: "cc", "memory", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
);
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16.hpp
index 582836fe67..6cfed8f3a4 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -79,206 +79,206 @@ void interleave_block<8, 1, VLType::None, false>(
"prfm pldl1keep, [x20, #0x40]\n"
"blt 3f\n"
"2:" // Main loop head
- "ldr d30, [x27], #0x8\n"
+ "ldr d31, [x27], #0x8\n"
+ "sshll v31.8h, v31.8b, #0x0\n"
+ "ldr d30, [x26], #0x8\n"
+ "subs %x[width], %x[width], #0x8\n"
+ "sshll v30.8h, v30.8b, #0x0\n"
+ "ldr d29, [x25], #0x8\n"
+ "cmp %x[width], #0x8\n"
+ "sshll v29.8h, v29.8b, #0x0\n"
+ "ldr d28, [x24], #0x8\n"
+ "ldr d25, [x23], #0x8\n"
+ "sshll v28.8h, v28.8b, #0x0\n"
+ "ldr d23, [x22], #0x8\n"
+ "sshll v25.8h, v25.8b, #0x0\n"
+ "ldr d27, [x21], #0x8\n"
+ "zip1 v20.8h, v31.8h, v25.8h\n"
+ "ldr d26, [x20], #0x8\n"
+ "zip2 v25.8h, v31.8h, v25.8h\n"
"prfm pldl1keep, [x27, #0x70]\n"
- "ldr d29, [x26], #0x8\n"
- "ldr d28, [x25], #0x8\n"
"prfm pldl1keep, [x26, #0x70]\n"
- "ldr d27, [x24], #0x8\n"
+ "sshll v23.8h, v23.8b, #0x0\n"
"prfm pldl1keep, [x25, #0x70]\n"
- "ldr d23, [x23], #0x8\n"
- "ldr d21, [x22], #0x8\n"
+ "zip1 v24.8h, v30.8h, v23.8h\n"
+ "zip2 v23.8h, v30.8h, v23.8h\n"
"prfm pldl1keep, [x24, #0x70]\n"
- "ldr d26, [x21], #0x8\n"
- "ldr d25, [x20], #0x8\n"
+ "sshll v27.8h, v27.8b, #0x0\n"
"prfm pldl1keep, [x23, #0x70]\n"
+ "zip1 v19.8h, v29.8h, v27.8h\n"
"prfm pldl1keep, [x22, #0x70]\n"
- "sshll v30.8h, v30.8b, #0x0\n"
- "sshll v29.8h, v29.8b, #0x0\n"
+ "zip1 v22.8h, v20.8h, v19.8h\n"
"prfm pldl1keep, [x21, #0x70]\n"
- "sshll v28.8h, v28.8b, #0x0\n"
+ "zip2 v21.8h, v20.8h, v19.8h\n"
"prfm pldl1keep, [x20, #0x70]\n"
- "sshll v27.8h, v27.8b, #0x0\n"
- "sshll v23.8h, v23.8b, #0x0\n"
- "zip1 v24.8h, v30.8h, v23.8h\n"
- "sshll v21.8h, v21.8b, #0x0\n"
- "zip2 v23.8h, v30.8h, v23.8h\n"
+ "zip2 v19.8h, v29.8h, v27.8h\n"
+ "zip1 v20.8h, v25.8h, v19.8h\n"
+ "zip2 v19.8h, v25.8h, v19.8h\n"
"sshll v26.8h, v26.8b, #0x0\n"
- "sshll v25.8h, v25.8b, #0x0\n"
- "zip1 v22.8h, v29.8h, v21.8h\n"
- "subs %x[width], %x[width], #0x8\n"
- "zip2 v21.8h, v29.8h, v21.8h\n"
- "cmp %x[width], #0x8\n"
- "zip1 v20.8h, v28.8h, v26.8h\n"
- "zip1 v18.8h, v24.8h, v20.8h\n"
- "zip1 v19.8h, v27.8h, v25.8h\n"
- "zip1 v17.8h, v22.8h, v19.8h\n"
- "zip1 v16.8h, v18.8h, v17.8h\n"
+ "zip1 v18.8h, v28.8h, v26.8h\n"
+ "zip1 v17.8h, v24.8h, v18.8h\n"
+ "zip1 v16.8h, v22.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
- "zip2 v16.8h, v18.8h, v17.8h\n"
+ "zip2 v16.8h, v22.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x10]\n"
- "zip2 v18.8h, v24.8h, v20.8h\n"
- "zip2 v17.8h, v22.8h, v19.8h\n"
- "zip1 v16.8h, v18.8h, v17.8h\n"
+ "zip2 v17.8h, v24.8h, v18.8h\n"
+ "zip1 v16.8h, v21.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x20]\n"
- "zip2 v16.8h, v18.8h, v17.8h\n"
+ "zip2 v16.8h, v21.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x30]\n"
- "zip2 v20.8h, v28.8h, v26.8h\n"
- "zip1 v18.8h, v23.8h, v20.8h\n"
- "zip2 v19.8h, v27.8h, v25.8h\n"
- "zip1 v17.8h, v21.8h, v19.8h\n"
- "zip1 v16.8h, v18.8h, v17.8h\n"
+ "zip2 v18.8h, v28.8h, v26.8h\n"
+ "zip1 v17.8h, v23.8h, v18.8h\n"
+ "zip1 v16.8h, v20.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x40]\n"
- "zip2 v16.8h, v18.8h, v17.8h\n"
+ "zip2 v16.8h, v20.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x50]\n"
- "zip2 v18.8h, v23.8h, v20.8h\n"
- "zip2 v17.8h, v21.8h, v19.8h\n"
- "zip1 v16.8h, v18.8h, v17.8h\n"
+ "zip2 v17.8h, v23.8h, v18.8h\n"
+ "zip1 v16.8h, v19.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x60]\n"
- "zip2 v16.8h, v18.8h, v17.8h\n"
+ "zip2 v16.8h, v19.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x70]\n"
"add %x[out_ptr], %x[out_ptr], #0x80\n"
"bge 2b\n"
"3:" // Main loop skip
"cbz %x[width], 8f\n"
"tbz %x[width], #2, 5f\n"
- "ldr s30, [x27], #0x4\n"
- "ldr s29, [x26], #0x4\n"
- "ldr s28, [x25], #0x4\n"
- "ldr s27, [x24], #0x4\n"
- "ldr s23, [x23], #0x4\n"
- "ldr s21, [x22], #0x4\n"
- "ldr s26, [x21], #0x4\n"
- "ldr s25, [x20], #0x4\n"
+ "ldr s31, [x27], #0x4\n"
+ "ldr s30, [x26], #0x4\n"
+ "ldr s29, [x25], #0x4\n"
+ "ldr s28, [x24], #0x4\n"
+ "ldr s25, [x23], #0x4\n"
+ "ldr s23, [x22], #0x4\n"
+ "ldr s27, [x21], #0x4\n"
+ "ldr s26, [x20], #0x4\n"
"tbz %x[width], #1, 4f\n"
- "ld1 { v30.h }[2], [x27], #0x2\n"
- "ld1 { v29.h }[2], [x26], #0x2\n"
- "ld1 { v28.h }[2], [x25], #0x2\n"
- "ld1 { v27.h }[2], [x24], #0x2\n"
- "ld1 { v23.h }[2], [x23], #0x2\n"
- "ld1 { v21.h }[2], [x22], #0x2\n"
- "ld1 { v26.h }[2], [x21], #0x2\n"
- "ld1 { v25.h }[2], [x20], #0x2\n"
+ "ld1 { v31.h }[2], [x27], #0x2\n"
"mov x19, #0x6\n"
+ "ld1 { v30.h }[2], [x26], #0x2\n"
+ "ld1 { v29.h }[2], [x25], #0x2\n"
+ "ld1 { v28.h }[2], [x24], #0x2\n"
+ "ld1 { v25.h }[2], [x23], #0x2\n"
+ "ld1 { v23.h }[2], [x22], #0x2\n"
+ "ld1 { v27.h }[2], [x21], #0x2\n"
+ "ld1 { v26.h }[2], [x20], #0x2\n"
"tbz %x[width], #0, 7f\n"
- "ld1 { v30.b }[6], [x27]\n"
- "ld1 { v29.b }[6], [x26]\n"
- "ld1 { v28.b }[6], [x25]\n"
- "ld1 { v27.b }[6], [x24]\n"
- "ld1 { v23.b }[6], [x23]\n"
- "ld1 { v21.b }[6], [x22]\n"
- "ld1 { v26.b }[6], [x21]\n"
- "ld1 { v25.b }[6], [x20]\n"
+ "ld1 { v31.b }[6], [x27]\n"
"mov x19, #0x7\n"
+ "ld1 { v30.b }[6], [x26]\n"
+ "ld1 { v29.b }[6], [x25]\n"
+ "ld1 { v28.b }[6], [x24]\n"
+ "ld1 { v25.b }[6], [x23]\n"
+ "ld1 { v23.b }[6], [x22]\n"
+ "ld1 { v27.b }[6], [x21]\n"
+ "ld1 { v26.b }[6], [x20]\n"
"b 7f\n"
"4:" // odd_loads_1_4
"mov x19, #0x4\n"
"tbz %x[width], #0, 7f\n"
- "ld1 { v30.b }[4], [x27]\n"
- "ld1 { v29.b }[4], [x26]\n"
- "ld1 { v28.b }[4], [x25]\n"
- "ld1 { v27.b }[4], [x24]\n"
- "ld1 { v23.b }[4], [x23]\n"
- "ld1 { v21.b }[4], [x22]\n"
- "ld1 { v26.b }[4], [x21]\n"
- "ld1 { v25.b }[4], [x20]\n"
+ "ld1 { v31.b }[4], [x27]\n"
+ "ld1 { v30.b }[4], [x26]\n"
"mov x19, #0x5\n"
+ "ld1 { v29.b }[4], [x25]\n"
+ "ld1 { v28.b }[4], [x24]\n"
+ "ld1 { v25.b }[4], [x23]\n"
+ "ld1 { v23.b }[4], [x22]\n"
+ "ld1 { v27.b }[4], [x21]\n"
+ "ld1 { v26.b }[4], [x20]\n"
"b 7f\n"
"5:" // odd_loads_2_0
"tbz %x[width], #1, 6f\n"
- "ldr h30, [x27], #0x2\n"
- "ldr h29, [x26], #0x2\n"
- "ldr h28, [x25], #0x2\n"
- "ldr h27, [x24], #0x2\n"
- "ldr h23, [x23], #0x2\n"
- "ldr h21, [x22], #0x2\n"
- "ldr h26, [x21], #0x2\n"
- "ldr h25, [x20], #0x2\n"
+ "ldr h31, [x27], #0x2\n"
+ "ldr h30, [x26], #0x2\n"
"mov x19, #0x2\n"
+ "ldr h29, [x25], #0x2\n"
+ "ldr h28, [x24], #0x2\n"
+ "ldr h25, [x23], #0x2\n"
+ "ldr h23, [x22], #0x2\n"
+ "ldr h27, [x21], #0x2\n"
+ "ldr h26, [x20], #0x2\n"
"tbz %x[width], #0, 7f\n"
- "ld1 { v30.b }[2], [x27]\n"
- "ld1 { v29.b }[2], [x26]\n"
- "ld1 { v28.b }[2], [x25]\n"
- "ld1 { v27.b }[2], [x24]\n"
- "ld1 { v23.b }[2], [x23]\n"
- "ld1 { v21.b }[2], [x22]\n"
- "ld1 { v26.b }[2], [x21]\n"
- "ld1 { v25.b }[2], [x20]\n"
+ "ld1 { v31.b }[2], [x27]\n"
"mov x19, #0x3\n"
+ "ld1 { v30.b }[2], [x26]\n"
+ "ld1 { v29.b }[2], [x25]\n"
+ "ld1 { v28.b }[2], [x24]\n"
+ "ld1 { v25.b }[2], [x23]\n"
+ "ld1 { v23.b }[2], [x22]\n"
+ "ld1 { v27.b }[2], [x21]\n"
+ "ld1 { v26.b }[2], [x20]\n"
"b 7f\n"
"6:" // odd_loads_1_0
- "ldr b30, [x27, #0x0]\n"
- "ldr b29, [x26, #0x0]\n"
- "ldr b28, [x25, #0x0]\n"
- "ldr b27, [x24, #0x0]\n"
- "ldr b23, [x23, #0x0]\n"
- "ldr b21, [x22, #0x0]\n"
- "ldr b26, [x21, #0x0]\n"
- "ldr b25, [x20, #0x0]\n"
+ "ldr b31, [x27, #0x0]\n"
"mov x19, #0x1\n"
+ "ldr b30, [x26, #0x0]\n"
+ "ldr b29, [x25, #0x0]\n"
+ "ldr b28, [x24, #0x0]\n"
+ "ldr b25, [x23, #0x0]\n"
+ "ldr b23, [x22, #0x0]\n"
+ "ldr b27, [x21, #0x0]\n"
+ "ldr b26, [x20, #0x0]\n"
"7:" // Odd load end
+ "sshll v31.8h, v31.8b, #0x0\n"
+ "subs x19, x19, #0x1\n"
"sshll v30.8h, v30.8b, #0x0\n"
"sshll v29.8h, v29.8b, #0x0\n"
"sshll v28.8h, v28.8b, #0x0\n"
- "sshll v27.8h, v27.8b, #0x0\n"
+ "sshll v25.8h, v25.8b, #0x0\n"
+ "zip1 v20.8h, v31.8h, v25.8h\n"
"sshll v23.8h, v23.8b, #0x0\n"
"zip1 v24.8h, v30.8h, v23.8h\n"
- "sshll v21.8h, v21.8b, #0x0\n"
+ "sshll v27.8h, v27.8b, #0x0\n"
+ "zip1 v19.8h, v29.8h, v27.8h\n"
+ "zip1 v22.8h, v20.8h, v19.8h\n"
"sshll v26.8h, v26.8b, #0x0\n"
- "zip1 v20.8h, v28.8h, v26.8h\n"
- "sshll v25.8h, v25.8b, #0x0\n"
- "zip1 v22.8h, v29.8h, v21.8h\n"
- "subs x19, x19, #0x1\n"
- "zip1 v18.8h, v24.8h, v20.8h\n"
- "zip1 v19.8h, v27.8h, v25.8h\n"
- "zip1 v17.8h, v22.8h, v19.8h\n"
- "zip1 v16.8h, v18.8h, v17.8h\n"
+ "zip1 v18.8h, v28.8h, v26.8h\n"
+ "zip1 v17.8h, v24.8h, v18.8h\n"
+ "zip1 v16.8h, v22.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 8f\n"
- "zip2 v16.8h, v18.8h, v17.8h\n"
- "subs x19, x19, #0x1\n"
+ "zip2 v16.8h, v22.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
+ "subs x19, x19, #0x1\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 8f\n"
- "zip2 v18.8h, v24.8h, v20.8h\n"
- "zip2 v17.8h, v22.8h, v19.8h\n"
+ "zip2 v21.8h, v20.8h, v19.8h\n"
+ "zip2 v17.8h, v24.8h, v18.8h\n"
"subs x19, x19, #0x1\n"
- "zip1 v16.8h, v18.8h, v17.8h\n"
+ "zip1 v16.8h, v21.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 8f\n"
- "zip2 v16.8h, v18.8h, v17.8h\n"
- "subs x19, x19, #0x1\n"
+ "zip2 v16.8h, v21.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
+ "subs x19, x19, #0x1\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 8f\n"
- "zip2 v23.8h, v30.8h, v23.8h\n"
- "zip2 v20.8h, v28.8h, v26.8h\n"
+ "zip2 v25.8h, v31.8h, v25.8h\n"
+ "zip2 v19.8h, v29.8h, v27.8h\n"
"subs x19, x19, #0x1\n"
- "zip1 v18.8h, v23.8h, v20.8h\n"
- "zip2 v21.8h, v29.8h, v21.8h\n"
- "zip2 v19.8h, v27.8h, v25.8h\n"
- "zip1 v17.8h, v21.8h, v19.8h\n"
- "zip1 v16.8h, v18.8h, v17.8h\n"
+ "zip1 v20.8h, v25.8h, v19.8h\n"
+ "zip2 v23.8h, v30.8h, v23.8h\n"
+ "zip2 v18.8h, v28.8h, v26.8h\n"
+ "zip1 v17.8h, v23.8h, v18.8h\n"
+ "zip1 v16.8h, v20.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 8f\n"
- "zip2 v16.8h, v18.8h, v17.8h\n"
- "subs x19, x19, #0x1\n"
+ "zip2 v16.8h, v20.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
+ "subs x19, x19, #0x1\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 8f\n"
- "zip2 v18.8h, v23.8h, v20.8h\n"
- "zip2 v17.8h, v21.8h, v19.8h\n"
- "zip1 v16.8h, v18.8h, v17.8h\n"
+ "zip2 v19.8h, v25.8h, v19.8h\n"
+ "zip2 v17.8h, v23.8h, v18.8h\n"
+ "zip1 v16.8h, v19.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"8:" // Odds skip
- : [out_ptr] "+r" (out_ptr), [width] "+r" (width)
+ : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width)
: [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
- : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
+ : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16_summing.hpp
index 35dc3dc0d4..b710861417 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16_summing.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16_summing.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -96,72 +96,72 @@ void interleave_block<8, 1, VLType::None, true>(
"movi v1.8h, #0x0\n"
"4:" // no_accumulate_16
"ldr d30, [x27], #0x8\n"
- "prfm pldl1keep, [x27, #0x70]\n"
+ "sshll v30.8h, v30.8b, #0x0\n"
"ldr d29, [x26], #0x8\n"
+ "add x19, x19, #0x1\n"
+ "sshll v29.8h, v29.8b, #0x0\n"
"ldr d28, [x25], #0x8\n"
- "prfm pldl1keep, [x26, #0x70]\n"
+ "subs %x[width], %x[width], #0x8\n"
+ "sshll v28.8h, v28.8b, #0x0\n"
"ldr d27, [x24], #0x8\n"
+ "cmp %x[width], #0x8\n"
+ "sshll v27.8h, v27.8b, #0x0\n"
+ "ldr d24, [x23], #0x8\n"
+ "ldr d23, [x22], #0x8\n"
+ "sshll v24.8h, v24.8b, #0x0\n"
+ "ldr d21, [x21], #0x8\n"
+ "sshll v23.8h, v23.8b, #0x0\n"
+ "ldr d26, [x20], #0x8\n"
+ "zip1 v20.8h, v30.8h, v24.8h\n"
+ "prfm pldl1keep, [x27, #0x70]\n"
+ "zip1 v25.8h, v29.8h, v23.8h\n"
+ "prfm pldl1keep, [x26, #0x70]\n"
+ "zip2 v24.8h, v30.8h, v24.8h\n"
"prfm pldl1keep, [x25, #0x70]\n"
- "ldr d23, [x23], #0x8\n"
- "ldr d21, [x22], #0x8\n"
+ "zip2 v23.8h, v29.8h, v23.8h\n"
"prfm pldl1keep, [x24, #0x70]\n"
- "ldr d26, [x21], #0x8\n"
- "ldr d25, [x20], #0x8\n"
+ "sshll v21.8h, v21.8b, #0x0\n"
"prfm pldl1keep, [x23, #0x70]\n"
+ "zip1 v19.8h, v28.8h, v21.8h\n"
"prfm pldl1keep, [x22, #0x70]\n"
- "sshll v30.8h, v30.8b, #0x0\n"
- "sshll v29.8h, v29.8b, #0x0\n"
+ "zip1 v22.8h, v20.8h, v19.8h\n"
"prfm pldl1keep, [x21, #0x70]\n"
- "sshll v28.8h, v28.8b, #0x0\n"
+ "zip2 v19.8h, v20.8h, v19.8h\n"
"prfm pldl1keep, [x20, #0x70]\n"
- "sshll v27.8h, v27.8b, #0x0\n"
- "sshll v23.8h, v23.8b, #0x0\n"
- "zip1 v24.8h, v30.8h, v23.8h\n"
- "sshll v21.8h, v21.8b, #0x0\n"
- "zip2 v23.8h, v30.8h, v23.8h\n"
+ "zip2 v20.8h, v28.8h, v21.8h\n"
+ "zip1 v21.8h, v24.8h, v20.8h\n"
+ "zip2 v20.8h, v24.8h, v20.8h\n"
"sshll v26.8h, v26.8b, #0x0\n"
- "sshll v25.8h, v25.8b, #0x0\n"
- "zip1 v22.8h, v29.8h, v21.8h\n"
- "add x19, x19, #0x1\n"
- "zip2 v21.8h, v29.8h, v21.8h\n"
- "subs %x[width], %x[width], #0x8\n"
- "zip1 v20.8h, v28.8h, v26.8h\n"
- "cmp %x[width], #0x8\n"
- "zip1 v18.8h, v24.8h, v20.8h\n"
- "zip1 v19.8h, v27.8h, v25.8h\n"
- "zip1 v17.8h, v22.8h, v19.8h\n"
- "zip1 v16.8h, v18.8h, v17.8h\n"
+ "zip1 v18.8h, v27.8h, v26.8h\n"
+ "zip1 v17.8h, v25.8h, v18.8h\n"
+ "zip1 v16.8h, v22.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
"add v1.8h, v1.8h, v16.8h\n"
- "zip2 v16.8h, v18.8h, v17.8h\n"
- "str q16, [%x[out_ptr], #0x10]\n"
- "zip2 v18.8h, v24.8h, v20.8h\n"
- "add v1.8h, v1.8h, v16.8h\n"
- "zip2 v17.8h, v22.8h, v19.8h\n"
- "zip1 v16.8h, v18.8h, v17.8h\n"
- "str q16, [%x[out_ptr], #0x20]\n"
- "add v1.8h, v1.8h, v16.8h\n"
- "zip2 v16.8h, v18.8h, v17.8h\n"
+ "zip2 v17.8h, v22.8h, v17.8h\n"
+ "str q17, [%x[out_ptr], #0x10]\n"
+ "zip2 v16.8h, v25.8h, v18.8h\n"
+ "add v1.8h, v1.8h, v17.8h\n"
+ "zip1 v17.8h, v19.8h, v16.8h\n"
+ "str q17, [%x[out_ptr], #0x20]\n"
+ "zip2 v16.8h, v19.8h, v16.8h\n"
"str q16, [%x[out_ptr], #0x30]\n"
- "zip2 v20.8h, v28.8h, v26.8h\n"
+ "add v1.8h, v1.8h, v17.8h\n"
+ "zip2 v19.8h, v27.8h, v26.8h\n"
+ "zip1 v17.8h, v23.8h, v19.8h\n"
"add v1.8h, v1.8h, v16.8h\n"
- "zip1 v18.8h, v23.8h, v20.8h\n"
- "zip2 v19.8h, v27.8h, v25.8h\n"
- "zip1 v17.8h, v21.8h, v19.8h\n"
- "zip1 v16.8h, v18.8h, v17.8h\n"
+ "zip1 v16.8h, v21.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x40]\n"
+ "zip2 v18.8h, v21.8h, v17.8h\n"
+ "str q18, [%x[out_ptr], #0x50]\n"
"add v1.8h, v1.8h, v16.8h\n"
- "zip2 v16.8h, v18.8h, v17.8h\n"
- "str q16, [%x[out_ptr], #0x50]\n"
- "zip2 v18.8h, v23.8h, v20.8h\n"
- "add v1.8h, v1.8h, v16.8h\n"
- "zip2 v17.8h, v21.8h, v19.8h\n"
- "zip1 v16.8h, v18.8h, v17.8h\n"
- "str q16, [%x[out_ptr], #0x60]\n"
- "add v1.8h, v1.8h, v16.8h\n"
- "zip2 v16.8h, v18.8h, v17.8h\n"
+ "zip2 v16.8h, v23.8h, v19.8h\n"
+ "zip1 v17.8h, v20.8h, v16.8h\n"
+ "str q17, [%x[out_ptr], #0x60]\n"
+ "add v1.8h, v1.8h, v18.8h\n"
+ "zip2 v16.8h, v20.8h, v16.8h\n"
"str q16, [%x[out_ptr], #0x70]\n"
"add %x[out_ptr], %x[out_ptr], #0x80\n"
+ "add v1.8h, v1.8h, v17.8h\n"
"add v1.8h, v1.8h, v16.8h\n"
"bge 3b\n"
"5:" // Main loop skip
@@ -171,148 +171,148 @@ void interleave_block<8, 1, VLType::None, true>(
"ldr s29, [x26], #0x4\n"
"ldr s28, [x25], #0x4\n"
"ldr s27, [x24], #0x4\n"
- "ldr s23, [x23], #0x4\n"
- "ldr s21, [x22], #0x4\n"
- "ldr s26, [x21], #0x4\n"
- "ldr s25, [x20], #0x4\n"
+ "ldr s24, [x23], #0x4\n"
+ "ldr s23, [x22], #0x4\n"
+ "ldr s21, [x21], #0x4\n"
+ "ldr s26, [x20], #0x4\n"
"tbz %x[width], #1, 6f\n"
"ld1 { v30.h }[2], [x27], #0x2\n"
+ "mov x19, #0x6\n"
"ld1 { v29.h }[2], [x26], #0x2\n"
"ld1 { v28.h }[2], [x25], #0x2\n"
"ld1 { v27.h }[2], [x24], #0x2\n"
- "ld1 { v23.h }[2], [x23], #0x2\n"
- "ld1 { v21.h }[2], [x22], #0x2\n"
- "ld1 { v26.h }[2], [x21], #0x2\n"
- "ld1 { v25.h }[2], [x20], #0x2\n"
- "mov x19, #0x6\n"
+ "ld1 { v24.h }[2], [x23], #0x2\n"
+ "ld1 { v23.h }[2], [x22], #0x2\n"
+ "ld1 { v21.h }[2], [x21], #0x2\n"
+ "ld1 { v26.h }[2], [x20], #0x2\n"
"tbz %x[width], #0, 9f\n"
"ld1 { v30.b }[6], [x27]\n"
+ "mov x19, #0x7\n"
"ld1 { v29.b }[6], [x26]\n"
"ld1 { v28.b }[6], [x25]\n"
"ld1 { v27.b }[6], [x24]\n"
- "ld1 { v23.b }[6], [x23]\n"
- "ld1 { v21.b }[6], [x22]\n"
- "ld1 { v26.b }[6], [x21]\n"
- "ld1 { v25.b }[6], [x20]\n"
- "mov x19, #0x7\n"
+ "ld1 { v24.b }[6], [x23]\n"
+ "ld1 { v23.b }[6], [x22]\n"
+ "ld1 { v21.b }[6], [x21]\n"
+ "ld1 { v26.b }[6], [x20]\n"
"b 9f\n"
"6:" // odd_loads_1_4
"mov x19, #0x4\n"
"tbz %x[width], #0, 9f\n"
"ld1 { v30.b }[4], [x27]\n"
"ld1 { v29.b }[4], [x26]\n"
+ "mov x19, #0x5\n"
"ld1 { v28.b }[4], [x25]\n"
"ld1 { v27.b }[4], [x24]\n"
- "ld1 { v23.b }[4], [x23]\n"
- "ld1 { v21.b }[4], [x22]\n"
- "ld1 { v26.b }[4], [x21]\n"
- "ld1 { v25.b }[4], [x20]\n"
- "mov x19, #0x5\n"
+ "ld1 { v24.b }[4], [x23]\n"
+ "ld1 { v23.b }[4], [x22]\n"
+ "ld1 { v21.b }[4], [x21]\n"
+ "ld1 { v26.b }[4], [x20]\n"
"b 9f\n"
"7:" // odd_loads_2_0
"tbz %x[width], #1, 8f\n"
"ldr h30, [x27], #0x2\n"
"ldr h29, [x26], #0x2\n"
+ "mov x19, #0x2\n"
"ldr h28, [x25], #0x2\n"
"ldr h27, [x24], #0x2\n"
- "ldr h23, [x23], #0x2\n"
- "ldr h21, [x22], #0x2\n"
- "ldr h26, [x21], #0x2\n"
- "ldr h25, [x20], #0x2\n"
- "mov x19, #0x2\n"
+ "ldr h24, [x23], #0x2\n"
+ "ldr h23, [x22], #0x2\n"
+ "ldr h21, [x21], #0x2\n"
+ "ldr h26, [x20], #0x2\n"
"tbz %x[width], #0, 9f\n"
"ld1 { v30.b }[2], [x27]\n"
+ "mov x19, #0x3\n"
"ld1 { v29.b }[2], [x26]\n"
"ld1 { v28.b }[2], [x25]\n"
"ld1 { v27.b }[2], [x24]\n"
- "ld1 { v23.b }[2], [x23]\n"
- "ld1 { v21.b }[2], [x22]\n"
- "ld1 { v26.b }[2], [x21]\n"
- "ld1 { v25.b }[2], [x20]\n"
- "mov x19, #0x3\n"
+ "ld1 { v24.b }[2], [x23]\n"
+ "ld1 { v23.b }[2], [x22]\n"
+ "ld1 { v21.b }[2], [x21]\n"
+ "ld1 { v26.b }[2], [x20]\n"
"b 9f\n"
"8:" // odd_loads_1_0
"ldr b30, [x27, #0x0]\n"
+ "mov x19, #0x1\n"
"ldr b29, [x26, #0x0]\n"
"ldr b28, [x25, #0x0]\n"
"ldr b27, [x24, #0x0]\n"
- "ldr b23, [x23, #0x0]\n"
- "ldr b21, [x22, #0x0]\n"
- "ldr b26, [x21, #0x0]\n"
- "ldr b25, [x20, #0x0]\n"
- "mov x19, #0x1\n"
+ "ldr b24, [x23, #0x0]\n"
+ "ldr b23, [x22, #0x0]\n"
+ "ldr b21, [x21, #0x0]\n"
+ "ldr b26, [x20, #0x0]\n"
"9:" // Odd load end
"sshll v30.8h, v30.8b, #0x0\n"
+ "subs x19, x19, #0x1\n"
"sshll v29.8h, v29.8b, #0x0\n"
"sshll v28.8h, v28.8b, #0x0\n"
"sshll v27.8h, v27.8b, #0x0\n"
+ "sshll v24.8h, v24.8b, #0x0\n"
+ "zip1 v20.8h, v30.8h, v24.8h\n"
"sshll v23.8h, v23.8b, #0x0\n"
- "zip1 v24.8h, v30.8h, v23.8h\n"
+ "zip1 v25.8h, v29.8h, v23.8h\n"
"sshll v21.8h, v21.8b, #0x0\n"
+ "zip1 v19.8h, v28.8h, v21.8h\n"
+ "zip1 v22.8h, v20.8h, v19.8h\n"
"sshll v26.8h, v26.8b, #0x0\n"
- "zip1 v20.8h, v28.8h, v26.8h\n"
- "sshll v25.8h, v25.8b, #0x0\n"
- "zip1 v22.8h, v29.8h, v21.8h\n"
- "subs x19, x19, #0x1\n"
- "zip1 v18.8h, v24.8h, v20.8h\n"
- "zip1 v19.8h, v27.8h, v25.8h\n"
- "zip1 v17.8h, v22.8h, v19.8h\n"
- "zip1 v16.8h, v18.8h, v17.8h\n"
+ "zip1 v18.8h, v27.8h, v26.8h\n"
+ "zip1 v17.8h, v25.8h, v18.8h\n"
+ "zip1 v16.8h, v22.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
- "add v1.8h, v1.8h, v16.8h\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
+ "add v1.8h, v1.8h, v16.8h\n"
"beq 10f\n"
- "zip2 v16.8h, v18.8h, v17.8h\n"
+ "zip2 v17.8h, v22.8h, v17.8h\n"
+ "str q17, [%x[out_ptr], #0x0]\n"
"subs x19, x19, #0x1\n"
- "add v1.8h, v1.8h, v16.8h\n"
- "str q16, [%x[out_ptr], #0x0]\n"
+ "add v1.8h, v1.8h, v17.8h\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 10f\n"
- "zip2 v18.8h, v24.8h, v20.8h\n"
- "zip2 v17.8h, v22.8h, v19.8h\n"
+ "zip2 v19.8h, v20.8h, v19.8h\n"
+ "zip2 v16.8h, v25.8h, v18.8h\n"
"subs x19, x19, #0x1\n"
- "zip1 v16.8h, v18.8h, v17.8h\n"
- "str q16, [%x[out_ptr], #0x0]\n"
- "add v1.8h, v1.8h, v16.8h\n"
+ "zip1 v17.8h, v19.8h, v16.8h\n"
+ "str q17, [%x[out_ptr], #0x0]\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
+ "add v1.8h, v1.8h, v17.8h\n"
"beq 10f\n"
- "zip2 v16.8h, v18.8h, v17.8h\n"
+ "zip2 v16.8h, v19.8h, v16.8h\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
"subs x19, x19, #0x1\n"
"add v1.8h, v1.8h, v16.8h\n"
- "str q16, [%x[out_ptr], #0x0]\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 10f\n"
- "zip2 v23.8h, v30.8h, v23.8h\n"
- "zip2 v20.8h, v28.8h, v26.8h\n"
+ "zip2 v24.8h, v30.8h, v24.8h\n"
+ "zip2 v20.8h, v28.8h, v21.8h\n"
"subs x19, x19, #0x1\n"
- "zip1 v18.8h, v23.8h, v20.8h\n"
- "zip2 v21.8h, v29.8h, v21.8h\n"
- "zip2 v19.8h, v27.8h, v25.8h\n"
- "zip1 v17.8h, v21.8h, v19.8h\n"
- "zip1 v16.8h, v18.8h, v17.8h\n"
+ "zip1 v21.8h, v24.8h, v20.8h\n"
+ "zip2 v23.8h, v29.8h, v23.8h\n"
+ "zip2 v19.8h, v27.8h, v26.8h\n"
+ "zip1 v17.8h, v23.8h, v19.8h\n"
+ "zip1 v16.8h, v21.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
- "add v1.8h, v1.8h, v16.8h\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
+ "add v1.8h, v1.8h, v16.8h\n"
"beq 10f\n"
- "zip2 v16.8h, v18.8h, v17.8h\n"
+ "zip2 v18.8h, v21.8h, v17.8h\n"
+ "str q18, [%x[out_ptr], #0x0]\n"
"subs x19, x19, #0x1\n"
- "add v1.8h, v1.8h, v16.8h\n"
- "str q16, [%x[out_ptr], #0x0]\n"
+ "add v1.8h, v1.8h, v18.8h\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 10f\n"
- "zip2 v18.8h, v23.8h, v20.8h\n"
- "zip2 v17.8h, v21.8h, v19.8h\n"
- "zip1 v16.8h, v18.8h, v17.8h\n"
- "str q16, [%x[out_ptr], #0x0]\n"
- "add v1.8h, v1.8h, v16.8h\n"
+ "zip2 v20.8h, v24.8h, v20.8h\n"
+ "zip2 v16.8h, v23.8h, v19.8h\n"
+ "zip1 v17.8h, v20.8h, v16.8h\n"
+ "str q17, [%x[out_ptr], #0x0]\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
+ "add v1.8h, v1.8h, v17.8h\n"
"10:" // Odds skip
"saddw v0.4s, v0.4s, v1.4h\n"
"str q0, [%x[out_ptr], #0x0]\n"
"saddw2 v31.4s, v31.4s, v1.8h\n"
"str q31, [%x[out_ptr], #0x10]\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
- : [out_ptr] "+r" (out_ptr), [width] "+r" (width)
+ : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width)
: [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
: "cc", "memory", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
);
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u16_u16_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u16_u16_summing.hpp
index bfa8989a4d..24ece9a68e 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u16_u16_summing.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u16_u16_summing.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -96,58 +96,58 @@ void interleave_block<8, 1, VLType::None, true>(
"movi v1.8h, #0x0\n"
"4:" // no_accumulate_16
"ldr q30, [x27], #0x10\n"
- "prfm pldl1keep, [x27, #0x70]\n"
+ "add x19, x19, #0x1\n"
"ldr q29, [x26], #0x10\n"
+ "subs %x[width], %x[width], #0x8\n"
"ldr q28, [x25], #0x10\n"
- "prfm pldl1keep, [x26, #0x70]\n"
+ "cmp %x[width], #0x8\n"
"ldr q27, [x24], #0x10\n"
+ "ldr q25, [x23], #0x10\n"
+ "zip1 v26.8h, v30.8h, v25.8h\n"
+ "ldr q21, [x22], #0x10\n"
+ "zip2 v25.8h, v30.8h, v25.8h\n"
+ "ldr q24, [x21], #0x10\n"
+ "ldr q23, [x20], #0x10\n"
+ "zip1 v22.8h, v29.8h, v21.8h\n"
+ "prfm pldl1keep, [x27, #0x70]\n"
+ "zip2 v21.8h, v29.8h, v21.8h\n"
+ "prfm pldl1keep, [x26, #0x70]\n"
+ "zip1 v20.8h, v28.8h, v24.8h\n"
"prfm pldl1keep, [x25, #0x70]\n"
- "ldr q24, [x23], #0x10\n"
- "zip1 v26.8h, v30.8h, v24.8h\n"
+ "zip1 v18.8h, v26.8h, v20.8h\n"
"prfm pldl1keep, [x24, #0x70]\n"
- "ldr q25, [x22], #0x10\n"
- "zip2 v24.8h, v30.8h, v24.8h\n"
+ "zip1 v19.8h, v27.8h, v23.8h\n"
"prfm pldl1keep, [x23, #0x70]\n"
- "ldr q23, [x21], #0x10\n"
- "zip1 v21.8h, v29.8h, v25.8h\n"
+ "zip1 v16.8h, v22.8h, v19.8h\n"
"prfm pldl1keep, [x22, #0x70]\n"
- "ldr q22, [x20], #0x10\n"
- "zip1 v18.8h, v28.8h, v23.8h\n"
+ "zip1 v17.8h, v18.8h, v16.8h\n"
"prfm pldl1keep, [x21, #0x70]\n"
- "add x19, x19, #0x1\n"
- "zip1 v20.8h, v26.8h, v18.8h\n"
- "prfm pldl1keep, [x20, #0x70]\n"
- "zip1 v19.8h, v27.8h, v22.8h\n"
- "subs %x[width], %x[width], #0x8\n"
- "zip1 v17.8h, v21.8h, v19.8h\n"
- "cmp %x[width], #0x8\n"
- "zip2 v18.8h, v26.8h, v18.8h\n"
- "zip1 v16.8h, v20.8h, v17.8h\n"
- "str q16, [%x[out_ptr], #0x0]\n"
- "add v1.8h, v1.8h, v16.8h\n"
- "zip2 v17.8h, v20.8h, v17.8h\n"
- "str q17, [%x[out_ptr], #0x10]\n"
- "zip2 v16.8h, v21.8h, v19.8h\n"
"add v1.8h, v1.8h, v17.8h\n"
- "zip1 v17.8h, v18.8h, v16.8h\n"
- "str q17, [%x[out_ptr], #0x20]\n"
+ "prfm pldl1keep, [x20, #0x70]\n"
"zip2 v16.8h, v18.8h, v16.8h\n"
+ "str q17, [%x[out_ptr], #0x0]\n"
+ "zip2 v18.8h, v26.8h, v20.8h\n"
+ "str q16, [%x[out_ptr], #0x10]\n"
+ "add v1.8h, v1.8h, v16.8h\n"
+ "zip2 v17.8h, v22.8h, v19.8h\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [%x[out_ptr], #0x20]\n"
+ "add v1.8h, v1.8h, v16.8h\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x30]\n"
- "add v1.8h, v1.8h, v17.8h\n"
- "zip2 v21.8h, v28.8h, v23.8h\n"
- "zip1 v18.8h, v24.8h, v21.8h\n"
+ "zip2 v20.8h, v28.8h, v24.8h\n"
"add v1.8h, v1.8h, v16.8h\n"
- "zip2 v20.8h, v29.8h, v25.8h\n"
- "zip2 v19.8h, v27.8h, v22.8h\n"
- "zip1 v17.8h, v20.8h, v19.8h\n"
+ "zip1 v18.8h, v25.8h, v20.8h\n"
+ "zip2 v19.8h, v27.8h, v23.8h\n"
+ "zip1 v17.8h, v21.8h, v19.8h\n"
"zip1 v16.8h, v18.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x40]\n"
"add v1.8h, v1.8h, v16.8h\n"
"zip2 v16.8h, v18.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x50]\n"
- "zip2 v18.8h, v24.8h, v21.8h\n"
+ "zip2 v18.8h, v25.8h, v20.8h\n"
"add v1.8h, v1.8h, v16.8h\n"
- "zip2 v17.8h, v20.8h, v19.8h\n"
+ "zip2 v17.8h, v21.8h, v19.8h\n"
"zip1 v16.8h, v18.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x60]\n"
"add v1.8h, v1.8h, v16.8h\n"
@@ -163,140 +163,140 @@ void interleave_block<8, 1, VLType::None, true>(
"ldr d29, [x26], #0x8\n"
"ldr d28, [x25], #0x8\n"
"ldr d27, [x24], #0x8\n"
- "ldr d24, [x23], #0x8\n"
- "ldr d25, [x22], #0x8\n"
- "ldr d23, [x21], #0x8\n"
- "ldr d22, [x20], #0x8\n"
+ "ldr d25, [x23], #0x8\n"
+ "ldr d21, [x22], #0x8\n"
+ "ldr d24, [x21], #0x8\n"
+ "ldr d23, [x20], #0x8\n"
"tbz %x[width], #1, 6f\n"
"ld1 { v30.s }[2], [x27], #0x4\n"
+ "mov x19, #0x6\n"
"ld1 { v29.s }[2], [x26], #0x4\n"
"ld1 { v28.s }[2], [x25], #0x4\n"
"ld1 { v27.s }[2], [x24], #0x4\n"
- "ld1 { v24.s }[2], [x23], #0x4\n"
- "ld1 { v25.s }[2], [x22], #0x4\n"
- "ld1 { v23.s }[2], [x21], #0x4\n"
- "ld1 { v22.s }[2], [x20], #0x4\n"
- "mov x19, #0x6\n"
+ "ld1 { v25.s }[2], [x23], #0x4\n"
+ "ld1 { v21.s }[2], [x22], #0x4\n"
+ "ld1 { v24.s }[2], [x21], #0x4\n"
+ "ld1 { v23.s }[2], [x20], #0x4\n"
"tbz %x[width], #0, 9f\n"
"ld1 { v30.h }[6], [x27]\n"
+ "mov x19, #0x7\n"
"ld1 { v29.h }[6], [x26]\n"
"ld1 { v28.h }[6], [x25]\n"
"ld1 { v27.h }[6], [x24]\n"
- "ld1 { v24.h }[6], [x23]\n"
- "ld1 { v25.h }[6], [x22]\n"
- "ld1 { v23.h }[6], [x21]\n"
- "ld1 { v22.h }[6], [x20]\n"
- "mov x19, #0x7\n"
+ "ld1 { v25.h }[6], [x23]\n"
+ "ld1 { v21.h }[6], [x22]\n"
+ "ld1 { v24.h }[6], [x21]\n"
+ "ld1 { v23.h }[6], [x20]\n"
"b 9f\n"
"6:" // odd_loads_1_4
"mov x19, #0x4\n"
"tbz %x[width], #0, 9f\n"
"ld1 { v30.h }[4], [x27]\n"
"ld1 { v29.h }[4], [x26]\n"
+ "mov x19, #0x5\n"
"ld1 { v28.h }[4], [x25]\n"
"ld1 { v27.h }[4], [x24]\n"
- "ld1 { v24.h }[4], [x23]\n"
- "ld1 { v25.h }[4], [x22]\n"
- "ld1 { v23.h }[4], [x21]\n"
- "ld1 { v22.h }[4], [x20]\n"
- "mov x19, #0x5\n"
+ "ld1 { v25.h }[4], [x23]\n"
+ "ld1 { v21.h }[4], [x22]\n"
+ "ld1 { v24.h }[4], [x21]\n"
+ "ld1 { v23.h }[4], [x20]\n"
"b 9f\n"
"7:" // odd_loads_2_0
"tbz %x[width], #1, 8f\n"
"ldr s30, [x27], #0x4\n"
"ldr s29, [x26], #0x4\n"
+ "mov x19, #0x2\n"
"ldr s28, [x25], #0x4\n"
"ldr s27, [x24], #0x4\n"
- "ldr s24, [x23], #0x4\n"
- "ldr s25, [x22], #0x4\n"
- "ldr s23, [x21], #0x4\n"
- "ldr s22, [x20], #0x4\n"
- "mov x19, #0x2\n"
+ "ldr s25, [x23], #0x4\n"
+ "ldr s21, [x22], #0x4\n"
+ "ldr s24, [x21], #0x4\n"
+ "ldr s23, [x20], #0x4\n"
"tbz %x[width], #0, 9f\n"
"ld1 { v30.h }[2], [x27]\n"
+ "mov x19, #0x3\n"
"ld1 { v29.h }[2], [x26]\n"
"ld1 { v28.h }[2], [x25]\n"
"ld1 { v27.h }[2], [x24]\n"
- "ld1 { v24.h }[2], [x23]\n"
- "ld1 { v25.h }[2], [x22]\n"
- "ld1 { v23.h }[2], [x21]\n"
- "ld1 { v22.h }[2], [x20]\n"
- "mov x19, #0x3\n"
+ "ld1 { v25.h }[2], [x23]\n"
+ "ld1 { v21.h }[2], [x22]\n"
+ "ld1 { v24.h }[2], [x21]\n"
+ "ld1 { v23.h }[2], [x20]\n"
"b 9f\n"
"8:" // odd_loads_1_0
"ldr h30, [x27, #0x0]\n"
+ "mov x19, #0x1\n"
"ldr h29, [x26, #0x0]\n"
"ldr h28, [x25, #0x0]\n"
"ldr h27, [x24, #0x0]\n"
- "ldr h24, [x23, #0x0]\n"
- "ldr h25, [x22, #0x0]\n"
- "ldr h23, [x21, #0x0]\n"
- "ldr h22, [x20, #0x0]\n"
- "mov x19, #0x1\n"
+ "ldr h25, [x23, #0x0]\n"
+ "ldr h21, [x22, #0x0]\n"
+ "ldr h24, [x21, #0x0]\n"
+ "ldr h23, [x20, #0x0]\n"
"9:" // Odd load end
- "zip1 v26.8h, v30.8h, v24.8h\n"
+ "zip1 v26.8h, v30.8h, v25.8h\n"
"subs x19, x19, #0x1\n"
- "zip1 v18.8h, v28.8h, v23.8h\n"
- "zip1 v20.8h, v26.8h, v18.8h\n"
- "zip1 v21.8h, v29.8h, v25.8h\n"
- "zip1 v19.8h, v27.8h, v22.8h\n"
- "zip1 v17.8h, v21.8h, v19.8h\n"
- "zip1 v16.8h, v20.8h, v17.8h\n"
- "str q16, [%x[out_ptr], #0x0]\n"
- "add v1.8h, v1.8h, v16.8h\n"
+ "zip1 v20.8h, v28.8h, v24.8h\n"
+ "zip1 v18.8h, v26.8h, v20.8h\n"
+ "zip1 v22.8h, v29.8h, v21.8h\n"
+ "zip1 v19.8h, v27.8h, v23.8h\n"
+ "zip1 v16.8h, v22.8h, v19.8h\n"
+ "zip1 v17.8h, v18.8h, v16.8h\n"
+ "str q17, [%x[out_ptr], #0x0]\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
+ "add v1.8h, v1.8h, v17.8h\n"
"beq 10f\n"
- "zip2 v17.8h, v20.8h, v17.8h\n"
+ "zip2 v16.8h, v18.8h, v16.8h\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
"subs x19, x19, #0x1\n"
- "add v1.8h, v1.8h, v17.8h\n"
- "str q17, [%x[out_ptr], #0x0]\n"
+ "add v1.8h, v1.8h, v16.8h\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 10f\n"
- "zip2 v18.8h, v26.8h, v18.8h\n"
- "zip2 v16.8h, v21.8h, v19.8h\n"
+ "zip2 v18.8h, v26.8h, v20.8h\n"
+ "zip2 v17.8h, v22.8h, v19.8h\n"
"subs x19, x19, #0x1\n"
- "zip1 v17.8h, v18.8h, v16.8h\n"
- "str q17, [%x[out_ptr], #0x0]\n"
- "add v1.8h, v1.8h, v17.8h\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
+ "add v1.8h, v1.8h, v16.8h\n"
"beq 10f\n"
- "zip2 v16.8h, v18.8h, v16.8h\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
"subs x19, x19, #0x1\n"
"add v1.8h, v1.8h, v16.8h\n"
- "str q16, [%x[out_ptr], #0x0]\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 10f\n"
- "zip2 v24.8h, v30.8h, v24.8h\n"
- "zip2 v21.8h, v28.8h, v23.8h\n"
+ "zip2 v25.8h, v30.8h, v25.8h\n"
+ "zip2 v20.8h, v28.8h, v24.8h\n"
"subs x19, x19, #0x1\n"
- "zip1 v18.8h, v24.8h, v21.8h\n"
- "zip2 v20.8h, v29.8h, v25.8h\n"
- "zip2 v19.8h, v27.8h, v22.8h\n"
- "zip1 v17.8h, v20.8h, v19.8h\n"
+ "zip1 v18.8h, v25.8h, v20.8h\n"
+ "zip2 v21.8h, v29.8h, v21.8h\n"
+ "zip2 v19.8h, v27.8h, v23.8h\n"
+ "zip1 v17.8h, v21.8h, v19.8h\n"
"zip1 v16.8h, v18.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
- "add v1.8h, v1.8h, v16.8h\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
+ "add v1.8h, v1.8h, v16.8h\n"
"beq 10f\n"
"zip2 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
"subs x19, x19, #0x1\n"
"add v1.8h, v1.8h, v16.8h\n"
- "str q16, [%x[out_ptr], #0x0]\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 10f\n"
- "zip2 v18.8h, v24.8h, v21.8h\n"
- "zip2 v17.8h, v20.8h, v19.8h\n"
+ "zip2 v18.8h, v25.8h, v20.8h\n"
+ "zip2 v17.8h, v21.8h, v19.8h\n"
"zip1 v16.8h, v18.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
- "add v1.8h, v1.8h, v16.8h\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
+ "add v1.8h, v1.8h, v16.8h\n"
"10:" // Odds skip
"uaddw v0.4s, v0.4s, v1.4h\n"
"str q0, [%x[out_ptr], #0x0]\n"
"uaddw2 v31.4s, v31.4s, v1.8h\n"
"str q31, [%x[out_ptr], #0x10]\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
- : [out_ptr] "+r" (out_ptr), [width] "+r" (width)
+ : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width)
: [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
: "cc", "memory", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
);
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16.hpp
index 86b90f1898..0db2f7fd51 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -79,206 +79,206 @@ void interleave_block<8, 1, VLType::None, false>(
"prfm pldl1keep, [x20, #0x40]\n"
"blt 3f\n"
"2:" // Main loop head
- "ldr d30, [x27], #0x8\n"
+ "ldr d31, [x27], #0x8\n"
+ "ushll v31.8h, v31.8b, #0x0\n"
+ "ldr d30, [x26], #0x8\n"
+ "subs %x[width], %x[width], #0x8\n"
+ "ushll v30.8h, v30.8b, #0x0\n"
+ "ldr d29, [x25], #0x8\n"
+ "cmp %x[width], #0x8\n"
+ "ushll v29.8h, v29.8b, #0x0\n"
+ "ldr d28, [x24], #0x8\n"
+ "ldr d25, [x23], #0x8\n"
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "ldr d23, [x22], #0x8\n"
+ "ushll v25.8h, v25.8b, #0x0\n"
+ "ldr d27, [x21], #0x8\n"
+ "zip1 v20.8h, v31.8h, v25.8h\n"
+ "ldr d26, [x20], #0x8\n"
+ "zip2 v25.8h, v31.8h, v25.8h\n"
"prfm pldl1keep, [x27, #0x70]\n"
- "ldr d29, [x26], #0x8\n"
- "ldr d28, [x25], #0x8\n"
"prfm pldl1keep, [x26, #0x70]\n"
- "ldr d27, [x24], #0x8\n"
+ "ushll v23.8h, v23.8b, #0x0\n"
"prfm pldl1keep, [x25, #0x70]\n"
- "ldr d23, [x23], #0x8\n"
- "ldr d21, [x22], #0x8\n"
+ "zip1 v24.8h, v30.8h, v23.8h\n"
+ "zip2 v23.8h, v30.8h, v23.8h\n"
"prfm pldl1keep, [x24, #0x70]\n"
- "ldr d26, [x21], #0x8\n"
- "ldr d25, [x20], #0x8\n"
+ "ushll v27.8h, v27.8b, #0x0\n"
"prfm pldl1keep, [x23, #0x70]\n"
+ "zip1 v19.8h, v29.8h, v27.8h\n"
"prfm pldl1keep, [x22, #0x70]\n"
- "ushll v30.8h, v30.8b, #0x0\n"
- "ushll v29.8h, v29.8b, #0x0\n"
+ "zip1 v22.8h, v20.8h, v19.8h\n"
"prfm pldl1keep, [x21, #0x70]\n"
- "ushll v28.8h, v28.8b, #0x0\n"
+ "zip2 v21.8h, v20.8h, v19.8h\n"
"prfm pldl1keep, [x20, #0x70]\n"
- "ushll v27.8h, v27.8b, #0x0\n"
- "ushll v23.8h, v23.8b, #0x0\n"
- "zip1 v24.8h, v30.8h, v23.8h\n"
- "ushll v21.8h, v21.8b, #0x0\n"
- "zip2 v23.8h, v30.8h, v23.8h\n"
+ "zip2 v19.8h, v29.8h, v27.8h\n"
+ "zip1 v20.8h, v25.8h, v19.8h\n"
+ "zip2 v19.8h, v25.8h, v19.8h\n"
"ushll v26.8h, v26.8b, #0x0\n"
- "ushll v25.8h, v25.8b, #0x0\n"
- "zip1 v22.8h, v29.8h, v21.8h\n"
- "subs %x[width], %x[width], #0x8\n"
- "zip2 v21.8h, v29.8h, v21.8h\n"
- "cmp %x[width], #0x8\n"
- "zip1 v20.8h, v28.8h, v26.8h\n"
- "zip1 v18.8h, v24.8h, v20.8h\n"
- "zip1 v19.8h, v27.8h, v25.8h\n"
- "zip1 v17.8h, v22.8h, v19.8h\n"
- "zip1 v16.8h, v18.8h, v17.8h\n"
+ "zip1 v18.8h, v28.8h, v26.8h\n"
+ "zip1 v17.8h, v24.8h, v18.8h\n"
+ "zip1 v16.8h, v22.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
- "zip2 v16.8h, v18.8h, v17.8h\n"
+ "zip2 v16.8h, v22.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x10]\n"
- "zip2 v18.8h, v24.8h, v20.8h\n"
- "zip2 v17.8h, v22.8h, v19.8h\n"
- "zip1 v16.8h, v18.8h, v17.8h\n"
+ "zip2 v17.8h, v24.8h, v18.8h\n"
+ "zip1 v16.8h, v21.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x20]\n"
- "zip2 v16.8h, v18.8h, v17.8h\n"
+ "zip2 v16.8h, v21.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x30]\n"
- "zip2 v20.8h, v28.8h, v26.8h\n"
- "zip1 v18.8h, v23.8h, v20.8h\n"
- "zip2 v19.8h, v27.8h, v25.8h\n"
- "zip1 v17.8h, v21.8h, v19.8h\n"
- "zip1 v16.8h, v18.8h, v17.8h\n"
+ "zip2 v18.8h, v28.8h, v26.8h\n"
+ "zip1 v17.8h, v23.8h, v18.8h\n"
+ "zip1 v16.8h, v20.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x40]\n"
- "zip2 v16.8h, v18.8h, v17.8h\n"
+ "zip2 v16.8h, v20.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x50]\n"
- "zip2 v18.8h, v23.8h, v20.8h\n"
- "zip2 v17.8h, v21.8h, v19.8h\n"
- "zip1 v16.8h, v18.8h, v17.8h\n"
+ "zip2 v17.8h, v23.8h, v18.8h\n"
+ "zip1 v16.8h, v19.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x60]\n"
- "zip2 v16.8h, v18.8h, v17.8h\n"
+ "zip2 v16.8h, v19.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x70]\n"
"add %x[out_ptr], %x[out_ptr], #0x80\n"
"bge 2b\n"
"3:" // Main loop skip
"cbz %x[width], 8f\n"
"tbz %x[width], #2, 5f\n"
- "ldr s30, [x27], #0x4\n"
- "ldr s29, [x26], #0x4\n"
- "ldr s28, [x25], #0x4\n"
- "ldr s27, [x24], #0x4\n"
- "ldr s23, [x23], #0x4\n"
- "ldr s21, [x22], #0x4\n"
- "ldr s26, [x21], #0x4\n"
- "ldr s25, [x20], #0x4\n"
+ "ldr s31, [x27], #0x4\n"
+ "ldr s30, [x26], #0x4\n"
+ "ldr s29, [x25], #0x4\n"
+ "ldr s28, [x24], #0x4\n"
+ "ldr s25, [x23], #0x4\n"
+ "ldr s23, [x22], #0x4\n"
+ "ldr s27, [x21], #0x4\n"
+ "ldr s26, [x20], #0x4\n"
"tbz %x[width], #1, 4f\n"
- "ld1 { v30.h }[2], [x27], #0x2\n"
- "ld1 { v29.h }[2], [x26], #0x2\n"
- "ld1 { v28.h }[2], [x25], #0x2\n"
- "ld1 { v27.h }[2], [x24], #0x2\n"
- "ld1 { v23.h }[2], [x23], #0x2\n"
- "ld1 { v21.h }[2], [x22], #0x2\n"
- "ld1 { v26.h }[2], [x21], #0x2\n"
- "ld1 { v25.h }[2], [x20], #0x2\n"
+ "ld1 { v31.h }[2], [x27], #0x2\n"
"mov x19, #0x6\n"
+ "ld1 { v30.h }[2], [x26], #0x2\n"
+ "ld1 { v29.h }[2], [x25], #0x2\n"
+ "ld1 { v28.h }[2], [x24], #0x2\n"
+ "ld1 { v25.h }[2], [x23], #0x2\n"
+ "ld1 { v23.h }[2], [x22], #0x2\n"
+ "ld1 { v27.h }[2], [x21], #0x2\n"
+ "ld1 { v26.h }[2], [x20], #0x2\n"
"tbz %x[width], #0, 7f\n"
- "ld1 { v30.b }[6], [x27]\n"
- "ld1 { v29.b }[6], [x26]\n"
- "ld1 { v28.b }[6], [x25]\n"
- "ld1 { v27.b }[6], [x24]\n"
- "ld1 { v23.b }[6], [x23]\n"
- "ld1 { v21.b }[6], [x22]\n"
- "ld1 { v26.b }[6], [x21]\n"
- "ld1 { v25.b }[6], [x20]\n"
+ "ld1 { v31.b }[6], [x27]\n"
"mov x19, #0x7\n"
+ "ld1 { v30.b }[6], [x26]\n"
+ "ld1 { v29.b }[6], [x25]\n"
+ "ld1 { v28.b }[6], [x24]\n"
+ "ld1 { v25.b }[6], [x23]\n"
+ "ld1 { v23.b }[6], [x22]\n"
+ "ld1 { v27.b }[6], [x21]\n"
+ "ld1 { v26.b }[6], [x20]\n"
"b 7f\n"
"4:" // odd_loads_1_4
"mov x19, #0x4\n"
"tbz %x[width], #0, 7f\n"
- "ld1 { v30.b }[4], [x27]\n"
- "ld1 { v29.b }[4], [x26]\n"
- "ld1 { v28.b }[4], [x25]\n"
- "ld1 { v27.b }[4], [x24]\n"
- "ld1 { v23.b }[4], [x23]\n"
- "ld1 { v21.b }[4], [x22]\n"
- "ld1 { v26.b }[4], [x21]\n"
- "ld1 { v25.b }[4], [x20]\n"
+ "ld1 { v31.b }[4], [x27]\n"
+ "ld1 { v30.b }[4], [x26]\n"
"mov x19, #0x5\n"
+ "ld1 { v29.b }[4], [x25]\n"
+ "ld1 { v28.b }[4], [x24]\n"
+ "ld1 { v25.b }[4], [x23]\n"
+ "ld1 { v23.b }[4], [x22]\n"
+ "ld1 { v27.b }[4], [x21]\n"
+ "ld1 { v26.b }[4], [x20]\n"
"b 7f\n"
"5:" // odd_loads_2_0
"tbz %x[width], #1, 6f\n"
- "ldr h30, [x27], #0x2\n"
- "ldr h29, [x26], #0x2\n"
- "ldr h28, [x25], #0x2\n"
- "ldr h27, [x24], #0x2\n"
- "ldr h23, [x23], #0x2\n"
- "ldr h21, [x22], #0x2\n"
- "ldr h26, [x21], #0x2\n"
- "ldr h25, [x20], #0x2\n"
+ "ldr h31, [x27], #0x2\n"
+ "ldr h30, [x26], #0x2\n"
"mov x19, #0x2\n"
+ "ldr h29, [x25], #0x2\n"
+ "ldr h28, [x24], #0x2\n"
+ "ldr h25, [x23], #0x2\n"
+ "ldr h23, [x22], #0x2\n"
+ "ldr h27, [x21], #0x2\n"
+ "ldr h26, [x20], #0x2\n"
"tbz %x[width], #0, 7f\n"
- "ld1 { v30.b }[2], [x27]\n"
- "ld1 { v29.b }[2], [x26]\n"
- "ld1 { v28.b }[2], [x25]\n"
- "ld1 { v27.b }[2], [x24]\n"
- "ld1 { v23.b }[2], [x23]\n"
- "ld1 { v21.b }[2], [x22]\n"
- "ld1 { v26.b }[2], [x21]\n"
- "ld1 { v25.b }[2], [x20]\n"
+ "ld1 { v31.b }[2], [x27]\n"
"mov x19, #0x3\n"
+ "ld1 { v30.b }[2], [x26]\n"
+ "ld1 { v29.b }[2], [x25]\n"
+ "ld1 { v28.b }[2], [x24]\n"
+ "ld1 { v25.b }[2], [x23]\n"
+ "ld1 { v23.b }[2], [x22]\n"
+ "ld1 { v27.b }[2], [x21]\n"
+ "ld1 { v26.b }[2], [x20]\n"
"b 7f\n"
"6:" // odd_loads_1_0
- "ldr b30, [x27, #0x0]\n"
- "ldr b29, [x26, #0x0]\n"
- "ldr b28, [x25, #0x0]\n"
- "ldr b27, [x24, #0x0]\n"
- "ldr b23, [x23, #0x0]\n"
- "ldr b21, [x22, #0x0]\n"
- "ldr b26, [x21, #0x0]\n"
- "ldr b25, [x20, #0x0]\n"
+ "ldr b31, [x27, #0x0]\n"
"mov x19, #0x1\n"
+ "ldr b30, [x26, #0x0]\n"
+ "ldr b29, [x25, #0x0]\n"
+ "ldr b28, [x24, #0x0]\n"
+ "ldr b25, [x23, #0x0]\n"
+ "ldr b23, [x22, #0x0]\n"
+ "ldr b27, [x21, #0x0]\n"
+ "ldr b26, [x20, #0x0]\n"
"7:" // Odd load end
+ "ushll v31.8h, v31.8b, #0x0\n"
+ "subs x19, x19, #0x1\n"
"ushll v30.8h, v30.8b, #0x0\n"
"ushll v29.8h, v29.8b, #0x0\n"
"ushll v28.8h, v28.8b, #0x0\n"
- "ushll v27.8h, v27.8b, #0x0\n"
+ "ushll v25.8h, v25.8b, #0x0\n"
+ "zip1 v20.8h, v31.8h, v25.8h\n"
"ushll v23.8h, v23.8b, #0x0\n"
"zip1 v24.8h, v30.8h, v23.8h\n"
- "ushll v21.8h, v21.8b, #0x0\n"
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "zip1 v19.8h, v29.8h, v27.8h\n"
+ "zip1 v22.8h, v20.8h, v19.8h\n"
"ushll v26.8h, v26.8b, #0x0\n"
- "zip1 v20.8h, v28.8h, v26.8h\n"
- "ushll v25.8h, v25.8b, #0x0\n"
- "zip1 v22.8h, v29.8h, v21.8h\n"
- "subs x19, x19, #0x1\n"
- "zip1 v18.8h, v24.8h, v20.8h\n"
- "zip1 v19.8h, v27.8h, v25.8h\n"
- "zip1 v17.8h, v22.8h, v19.8h\n"
- "zip1 v16.8h, v18.8h, v17.8h\n"
+ "zip1 v18.8h, v28.8h, v26.8h\n"
+ "zip1 v17.8h, v24.8h, v18.8h\n"
+ "zip1 v16.8h, v22.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 8f\n"
- "zip2 v16.8h, v18.8h, v17.8h\n"
- "subs x19, x19, #0x1\n"
+ "zip2 v16.8h, v22.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
+ "subs x19, x19, #0x1\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 8f\n"
- "zip2 v18.8h, v24.8h, v20.8h\n"
- "zip2 v17.8h, v22.8h, v19.8h\n"
+ "zip2 v21.8h, v20.8h, v19.8h\n"
+ "zip2 v17.8h, v24.8h, v18.8h\n"
"subs x19, x19, #0x1\n"
- "zip1 v16.8h, v18.8h, v17.8h\n"
+ "zip1 v16.8h, v21.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 8f\n"
- "zip2 v16.8h, v18.8h, v17.8h\n"
- "subs x19, x19, #0x1\n"
+ "zip2 v16.8h, v21.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
+ "subs x19, x19, #0x1\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 8f\n"
- "zip2 v23.8h, v30.8h, v23.8h\n"
- "zip2 v20.8h, v28.8h, v26.8h\n"
+ "zip2 v25.8h, v31.8h, v25.8h\n"
+ "zip2 v19.8h, v29.8h, v27.8h\n"
"subs x19, x19, #0x1\n"
- "zip1 v18.8h, v23.8h, v20.8h\n"
- "zip2 v21.8h, v29.8h, v21.8h\n"
- "zip2 v19.8h, v27.8h, v25.8h\n"
- "zip1 v17.8h, v21.8h, v19.8h\n"
- "zip1 v16.8h, v18.8h, v17.8h\n"
+ "zip1 v20.8h, v25.8h, v19.8h\n"
+ "zip2 v23.8h, v30.8h, v23.8h\n"
+ "zip2 v18.8h, v28.8h, v26.8h\n"
+ "zip1 v17.8h, v23.8h, v18.8h\n"
+ "zip1 v16.8h, v20.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 8f\n"
- "zip2 v16.8h, v18.8h, v17.8h\n"
- "subs x19, x19, #0x1\n"
+ "zip2 v16.8h, v20.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
+ "subs x19, x19, #0x1\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 8f\n"
- "zip2 v18.8h, v23.8h, v20.8h\n"
- "zip2 v17.8h, v21.8h, v19.8h\n"
- "zip1 v16.8h, v18.8h, v17.8h\n"
+ "zip2 v19.8h, v25.8h, v19.8h\n"
+ "zip2 v17.8h, v23.8h, v18.8h\n"
+ "zip1 v16.8h, v19.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"8:" // Odds skip
- : [out_ptr] "+r" (out_ptr), [width] "+r" (width)
+ : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width)
: [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
- : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
+ : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16_summing.hpp
index cefb70c57b..7c7d774a6b 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16_summing.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16_summing.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -96,72 +96,72 @@ void interleave_block<8, 1, VLType::None, true>(
"movi v1.8h, #0x0\n"
"4:" // no_accumulate_16
"ldr d30, [x27], #0x8\n"
- "prfm pldl1keep, [x27, #0x70]\n"
+ "ushll v30.8h, v30.8b, #0x0\n"
"ldr d29, [x26], #0x8\n"
+ "add x19, x19, #0x1\n"
+ "ushll v29.8h, v29.8b, #0x0\n"
"ldr d28, [x25], #0x8\n"
- "prfm pldl1keep, [x26, #0x70]\n"
+ "subs %x[width], %x[width], #0x8\n"
+ "ushll v28.8h, v28.8b, #0x0\n"
"ldr d27, [x24], #0x8\n"
+ "cmp %x[width], #0x8\n"
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "ldr d24, [x23], #0x8\n"
+ "ldr d23, [x22], #0x8\n"
+ "ushll v24.8h, v24.8b, #0x0\n"
+ "ldr d21, [x21], #0x8\n"
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "ldr d26, [x20], #0x8\n"
+ "zip1 v20.8h, v30.8h, v24.8h\n"
+ "prfm pldl1keep, [x27, #0x70]\n"
+ "zip1 v25.8h, v29.8h, v23.8h\n"
+ "prfm pldl1keep, [x26, #0x70]\n"
+ "zip2 v24.8h, v30.8h, v24.8h\n"
"prfm pldl1keep, [x25, #0x70]\n"
- "ldr d23, [x23], #0x8\n"
- "ldr d21, [x22], #0x8\n"
+ "zip2 v23.8h, v29.8h, v23.8h\n"
"prfm pldl1keep, [x24, #0x70]\n"
- "ldr d26, [x21], #0x8\n"
- "ldr d25, [x20], #0x8\n"
+ "ushll v21.8h, v21.8b, #0x0\n"
"prfm pldl1keep, [x23, #0x70]\n"
+ "zip1 v19.8h, v28.8h, v21.8h\n"
"prfm pldl1keep, [x22, #0x70]\n"
- "ushll v30.8h, v30.8b, #0x0\n"
- "ushll v29.8h, v29.8b, #0x0\n"
+ "zip1 v22.8h, v20.8h, v19.8h\n"
"prfm pldl1keep, [x21, #0x70]\n"
- "ushll v28.8h, v28.8b, #0x0\n"
+ "zip2 v19.8h, v20.8h, v19.8h\n"
"prfm pldl1keep, [x20, #0x70]\n"
- "ushll v27.8h, v27.8b, #0x0\n"
- "ushll v23.8h, v23.8b, #0x0\n"
- "zip1 v24.8h, v30.8h, v23.8h\n"
- "ushll v21.8h, v21.8b, #0x0\n"
- "zip2 v23.8h, v30.8h, v23.8h\n"
+ "zip2 v20.8h, v28.8h, v21.8h\n"
+ "zip1 v21.8h, v24.8h, v20.8h\n"
+ "zip2 v20.8h, v24.8h, v20.8h\n"
"ushll v26.8h, v26.8b, #0x0\n"
- "ushll v25.8h, v25.8b, #0x0\n"
- "zip1 v22.8h, v29.8h, v21.8h\n"
- "add x19, x19, #0x1\n"
- "zip2 v21.8h, v29.8h, v21.8h\n"
- "subs %x[width], %x[width], #0x8\n"
- "zip1 v20.8h, v28.8h, v26.8h\n"
- "cmp %x[width], #0x8\n"
- "zip1 v18.8h, v24.8h, v20.8h\n"
- "zip1 v19.8h, v27.8h, v25.8h\n"
- "zip1 v17.8h, v22.8h, v19.8h\n"
- "zip1 v16.8h, v18.8h, v17.8h\n"
+ "zip1 v18.8h, v27.8h, v26.8h\n"
+ "zip1 v17.8h, v25.8h, v18.8h\n"
+ "zip1 v16.8h, v22.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
"add v1.8h, v1.8h, v16.8h\n"
- "zip2 v16.8h, v18.8h, v17.8h\n"
- "str q16, [%x[out_ptr], #0x10]\n"
- "zip2 v18.8h, v24.8h, v20.8h\n"
- "add v1.8h, v1.8h, v16.8h\n"
- "zip2 v17.8h, v22.8h, v19.8h\n"
- "zip1 v16.8h, v18.8h, v17.8h\n"
- "str q16, [%x[out_ptr], #0x20]\n"
- "add v1.8h, v1.8h, v16.8h\n"
- "zip2 v16.8h, v18.8h, v17.8h\n"
+ "zip2 v17.8h, v22.8h, v17.8h\n"
+ "str q17, [%x[out_ptr], #0x10]\n"
+ "zip2 v16.8h, v25.8h, v18.8h\n"
+ "add v1.8h, v1.8h, v17.8h\n"
+ "zip1 v17.8h, v19.8h, v16.8h\n"
+ "str q17, [%x[out_ptr], #0x20]\n"
+ "zip2 v16.8h, v19.8h, v16.8h\n"
"str q16, [%x[out_ptr], #0x30]\n"
- "zip2 v20.8h, v28.8h, v26.8h\n"
+ "add v1.8h, v1.8h, v17.8h\n"
+ "zip2 v19.8h, v27.8h, v26.8h\n"
+ "zip1 v17.8h, v23.8h, v19.8h\n"
"add v1.8h, v1.8h, v16.8h\n"
- "zip1 v18.8h, v23.8h, v20.8h\n"
- "zip2 v19.8h, v27.8h, v25.8h\n"
- "zip1 v17.8h, v21.8h, v19.8h\n"
- "zip1 v16.8h, v18.8h, v17.8h\n"
+ "zip1 v16.8h, v21.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x40]\n"
+ "zip2 v18.8h, v21.8h, v17.8h\n"
+ "str q18, [%x[out_ptr], #0x50]\n"
"add v1.8h, v1.8h, v16.8h\n"
- "zip2 v16.8h, v18.8h, v17.8h\n"
- "str q16, [%x[out_ptr], #0x50]\n"
- "zip2 v18.8h, v23.8h, v20.8h\n"
- "add v1.8h, v1.8h, v16.8h\n"
- "zip2 v17.8h, v21.8h, v19.8h\n"
- "zip1 v16.8h, v18.8h, v17.8h\n"
- "str q16, [%x[out_ptr], #0x60]\n"
- "add v1.8h, v1.8h, v16.8h\n"
- "zip2 v16.8h, v18.8h, v17.8h\n"
+ "zip2 v16.8h, v23.8h, v19.8h\n"
+ "zip1 v17.8h, v20.8h, v16.8h\n"
+ "str q17, [%x[out_ptr], #0x60]\n"
+ "add v1.8h, v1.8h, v18.8h\n"
+ "zip2 v16.8h, v20.8h, v16.8h\n"
"str q16, [%x[out_ptr], #0x70]\n"
"add %x[out_ptr], %x[out_ptr], #0x80\n"
+ "add v1.8h, v1.8h, v17.8h\n"
"add v1.8h, v1.8h, v16.8h\n"
"bge 3b\n"
"5:" // Main loop skip
@@ -171,148 +171,148 @@ void interleave_block<8, 1, VLType::None, true>(
"ldr s29, [x26], #0x4\n"
"ldr s28, [x25], #0x4\n"
"ldr s27, [x24], #0x4\n"
- "ldr s23, [x23], #0x4\n"
- "ldr s21, [x22], #0x4\n"
- "ldr s26, [x21], #0x4\n"
- "ldr s25, [x20], #0x4\n"
+ "ldr s24, [x23], #0x4\n"
+ "ldr s23, [x22], #0x4\n"
+ "ldr s21, [x21], #0x4\n"
+ "ldr s26, [x20], #0x4\n"
"tbz %x[width], #1, 6f\n"
"ld1 { v30.h }[2], [x27], #0x2\n"
+ "mov x19, #0x6\n"
"ld1 { v29.h }[2], [x26], #0x2\n"
"ld1 { v28.h }[2], [x25], #0x2\n"
"ld1 { v27.h }[2], [x24], #0x2\n"
- "ld1 { v23.h }[2], [x23], #0x2\n"
- "ld1 { v21.h }[2], [x22], #0x2\n"
- "ld1 { v26.h }[2], [x21], #0x2\n"
- "ld1 { v25.h }[2], [x20], #0x2\n"
- "mov x19, #0x6\n"
+ "ld1 { v24.h }[2], [x23], #0x2\n"
+ "ld1 { v23.h }[2], [x22], #0x2\n"
+ "ld1 { v21.h }[2], [x21], #0x2\n"
+ "ld1 { v26.h }[2], [x20], #0x2\n"
"tbz %x[width], #0, 9f\n"
"ld1 { v30.b }[6], [x27]\n"
+ "mov x19, #0x7\n"
"ld1 { v29.b }[6], [x26]\n"
"ld1 { v28.b }[6], [x25]\n"
"ld1 { v27.b }[6], [x24]\n"
- "ld1 { v23.b }[6], [x23]\n"
- "ld1 { v21.b }[6], [x22]\n"
- "ld1 { v26.b }[6], [x21]\n"
- "ld1 { v25.b }[6], [x20]\n"
- "mov x19, #0x7\n"
+ "ld1 { v24.b }[6], [x23]\n"
+ "ld1 { v23.b }[6], [x22]\n"
+ "ld1 { v21.b }[6], [x21]\n"
+ "ld1 { v26.b }[6], [x20]\n"
"b 9f\n"
"6:" // odd_loads_1_4
"mov x19, #0x4\n"
"tbz %x[width], #0, 9f\n"
"ld1 { v30.b }[4], [x27]\n"
"ld1 { v29.b }[4], [x26]\n"
+ "mov x19, #0x5\n"
"ld1 { v28.b }[4], [x25]\n"
"ld1 { v27.b }[4], [x24]\n"
- "ld1 { v23.b }[4], [x23]\n"
- "ld1 { v21.b }[4], [x22]\n"
- "ld1 { v26.b }[4], [x21]\n"
- "ld1 { v25.b }[4], [x20]\n"
- "mov x19, #0x5\n"
+ "ld1 { v24.b }[4], [x23]\n"
+ "ld1 { v23.b }[4], [x22]\n"
+ "ld1 { v21.b }[4], [x21]\n"
+ "ld1 { v26.b }[4], [x20]\n"
"b 9f\n"
"7:" // odd_loads_2_0
"tbz %x[width], #1, 8f\n"
"ldr h30, [x27], #0x2\n"
"ldr h29, [x26], #0x2\n"
+ "mov x19, #0x2\n"
"ldr h28, [x25], #0x2\n"
"ldr h27, [x24], #0x2\n"
- "ldr h23, [x23], #0x2\n"
- "ldr h21, [x22], #0x2\n"
- "ldr h26, [x21], #0x2\n"
- "ldr h25, [x20], #0x2\n"
- "mov x19, #0x2\n"
+ "ldr h24, [x23], #0x2\n"
+ "ldr h23, [x22], #0x2\n"
+ "ldr h21, [x21], #0x2\n"
+ "ldr h26, [x20], #0x2\n"
"tbz %x[width], #0, 9f\n"
"ld1 { v30.b }[2], [x27]\n"
+ "mov x19, #0x3\n"
"ld1 { v29.b }[2], [x26]\n"
"ld1 { v28.b }[2], [x25]\n"
"ld1 { v27.b }[2], [x24]\n"
- "ld1 { v23.b }[2], [x23]\n"
- "ld1 { v21.b }[2], [x22]\n"
- "ld1 { v26.b }[2], [x21]\n"
- "ld1 { v25.b }[2], [x20]\n"
- "mov x19, #0x3\n"
+ "ld1 { v24.b }[2], [x23]\n"
+ "ld1 { v23.b }[2], [x22]\n"
+ "ld1 { v21.b }[2], [x21]\n"
+ "ld1 { v26.b }[2], [x20]\n"
"b 9f\n"
"8:" // odd_loads_1_0
"ldr b30, [x27, #0x0]\n"
+ "mov x19, #0x1\n"
"ldr b29, [x26, #0x0]\n"
"ldr b28, [x25, #0x0]\n"
"ldr b27, [x24, #0x0]\n"
- "ldr b23, [x23, #0x0]\n"
- "ldr b21, [x22, #0x0]\n"
- "ldr b26, [x21, #0x0]\n"
- "ldr b25, [x20, #0x0]\n"
- "mov x19, #0x1\n"
+ "ldr b24, [x23, #0x0]\n"
+ "ldr b23, [x22, #0x0]\n"
+ "ldr b21, [x21, #0x0]\n"
+ "ldr b26, [x20, #0x0]\n"
"9:" // Odd load end
"ushll v30.8h, v30.8b, #0x0\n"
+ "subs x19, x19, #0x1\n"
"ushll v29.8h, v29.8b, #0x0\n"
"ushll v28.8h, v28.8b, #0x0\n"
"ushll v27.8h, v27.8b, #0x0\n"
+ "ushll v24.8h, v24.8b, #0x0\n"
+ "zip1 v20.8h, v30.8h, v24.8h\n"
"ushll v23.8h, v23.8b, #0x0\n"
- "zip1 v24.8h, v30.8h, v23.8h\n"
+ "zip1 v25.8h, v29.8h, v23.8h\n"
"ushll v21.8h, v21.8b, #0x0\n"
+ "zip1 v19.8h, v28.8h, v21.8h\n"
+ "zip1 v22.8h, v20.8h, v19.8h\n"
"ushll v26.8h, v26.8b, #0x0\n"
- "zip1 v20.8h, v28.8h, v26.8h\n"
- "ushll v25.8h, v25.8b, #0x0\n"
- "zip1 v22.8h, v29.8h, v21.8h\n"
- "subs x19, x19, #0x1\n"
- "zip1 v18.8h, v24.8h, v20.8h\n"
- "zip1 v19.8h, v27.8h, v25.8h\n"
- "zip1 v17.8h, v22.8h, v19.8h\n"
- "zip1 v16.8h, v18.8h, v17.8h\n"
+ "zip1 v18.8h, v27.8h, v26.8h\n"
+ "zip1 v17.8h, v25.8h, v18.8h\n"
+ "zip1 v16.8h, v22.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
- "add v1.8h, v1.8h, v16.8h\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
+ "add v1.8h, v1.8h, v16.8h\n"
"beq 10f\n"
- "zip2 v16.8h, v18.8h, v17.8h\n"
+ "zip2 v17.8h, v22.8h, v17.8h\n"
+ "str q17, [%x[out_ptr], #0x0]\n"
"subs x19, x19, #0x1\n"
- "add v1.8h, v1.8h, v16.8h\n"
- "str q16, [%x[out_ptr], #0x0]\n"
+ "add v1.8h, v1.8h, v17.8h\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 10f\n"
- "zip2 v18.8h, v24.8h, v20.8h\n"
- "zip2 v17.8h, v22.8h, v19.8h\n"
+ "zip2 v19.8h, v20.8h, v19.8h\n"
+ "zip2 v16.8h, v25.8h, v18.8h\n"
"subs x19, x19, #0x1\n"
- "zip1 v16.8h, v18.8h, v17.8h\n"
- "str q16, [%x[out_ptr], #0x0]\n"
- "add v1.8h, v1.8h, v16.8h\n"
+ "zip1 v17.8h, v19.8h, v16.8h\n"
+ "str q17, [%x[out_ptr], #0x0]\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
+ "add v1.8h, v1.8h, v17.8h\n"
"beq 10f\n"
- "zip2 v16.8h, v18.8h, v17.8h\n"
+ "zip2 v16.8h, v19.8h, v16.8h\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
"subs x19, x19, #0x1\n"
"add v1.8h, v1.8h, v16.8h\n"
- "str q16, [%x[out_ptr], #0x0]\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 10f\n"
- "zip2 v23.8h, v30.8h, v23.8h\n"
- "zip2 v20.8h, v28.8h, v26.8h\n"
+ "zip2 v24.8h, v30.8h, v24.8h\n"
+ "zip2 v20.8h, v28.8h, v21.8h\n"
"subs x19, x19, #0x1\n"
- "zip1 v18.8h, v23.8h, v20.8h\n"
- "zip2 v21.8h, v29.8h, v21.8h\n"
- "zip2 v19.8h, v27.8h, v25.8h\n"
- "zip1 v17.8h, v21.8h, v19.8h\n"
- "zip1 v16.8h, v18.8h, v17.8h\n"
+ "zip1 v21.8h, v24.8h, v20.8h\n"
+ "zip2 v23.8h, v29.8h, v23.8h\n"
+ "zip2 v19.8h, v27.8h, v26.8h\n"
+ "zip1 v17.8h, v23.8h, v19.8h\n"
+ "zip1 v16.8h, v21.8h, v17.8h\n"
"str q16, [%x[out_ptr], #0x0]\n"
- "add v1.8h, v1.8h, v16.8h\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
+ "add v1.8h, v1.8h, v16.8h\n"
"beq 10f\n"
- "zip2 v16.8h, v18.8h, v17.8h\n"
+ "zip2 v18.8h, v21.8h, v17.8h\n"
+ "str q18, [%x[out_ptr], #0x0]\n"
"subs x19, x19, #0x1\n"
- "add v1.8h, v1.8h, v16.8h\n"
- "str q16, [%x[out_ptr], #0x0]\n"
+ "add v1.8h, v1.8h, v18.8h\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
"beq 10f\n"
- "zip2 v18.8h, v23.8h, v20.8h\n"
- "zip2 v17.8h, v21.8h, v19.8h\n"
- "zip1 v16.8h, v18.8h, v17.8h\n"
- "str q16, [%x[out_ptr], #0x0]\n"
- "add v1.8h, v1.8h, v16.8h\n"
+ "zip2 v20.8h, v24.8h, v20.8h\n"
+ "zip2 v16.8h, v23.8h, v19.8h\n"
+ "zip1 v17.8h, v20.8h, v16.8h\n"
+ "str q17, [%x[out_ptr], #0x0]\n"
"add %x[out_ptr], %x[out_ptr], #0x10\n"
+ "add v1.8h, v1.8h, v17.8h\n"
"10:" // Odds skip
"uaddw v0.4s, v0.4s, v1.4h\n"
"str q0, [%x[out_ptr], #0x0]\n"
"uaddw2 v31.4s, v31.4s, v1.8h\n"
"str q31, [%x[out_ptr], #0x10]\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
- : [out_ptr] "+r" (out_ptr), [width] "+r" (width)
+ : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width)
: [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
: "cc", "memory", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
);
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_bf16_bf16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_bf16_bf16.hpp
index 5377edc1e1..1e5d395667 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_bf16_bf16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_bf16_bf16.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -80,45 +80,45 @@ void interleave_block<8, 2, VLType::None, false>(
"blt 3f\n"
"2:" // Main loop head
"ldr q28, [x27], #0x10\n"
+ "subs %x[width], %x[width], #0x8\n"
+ "ldr q29, [x26], #0x10\n"
+ "cmp %x[width], #0x8\n"
+ "ldr q25, [x25], #0x10\n"
+ "zip1 v22.4s, v28.4s, v25.4s\n"
+ "ldr q21, [x24], #0x10\n"
+ "zip2 v28.4s, v28.4s, v25.4s\n"
+ "ldr q27, [x23], #0x10\n"
+ "ldr q26, [x22], #0x10\n"
+ "zip1 v20.4s, v29.4s, v21.4s\n"
+ "ldr q19, [x21], #0x10\n"
+ "zip2 v25.4s, v29.4s, v21.4s\n"
+ "ldr q24, [x20], #0x10\n"
+ "zip1 v23.4s, v22.4s, v20.4s\n"
"prfm pldl1keep, [x27, #0x70]\n"
- "ldr q27, [x26], #0x10\n"
- "ldr q26, [x25], #0x10\n"
- "zip1 v23.4s, v28.4s, v26.4s\n"
+ "zip2 v22.4s, v22.4s, v20.4s\n"
"prfm pldl1keep, [x26, #0x70]\n"
- "ldr q22, [x24], #0x10\n"
- "zip2 v26.4s, v28.4s, v26.4s\n"
+ "zip1 v21.4s, v28.4s, v25.4s\n"
"prfm pldl1keep, [x25, #0x70]\n"
- "ldr q25, [x23], #0x10\n"
- "zip1 v20.4s, v27.4s, v22.4s\n"
+ "zip1 v18.4s, v27.4s, v19.4s\n"
"prfm pldl1keep, [x24, #0x70]\n"
- "ldr q24, [x22], #0x10\n"
- "zip1 v16.4s, v23.4s, v20.4s\n"
+ "zip1 v16.4s, v26.4s, v24.4s\n"
"prfm pldl1keep, [x23, #0x70]\n"
- "ldr q19, [x21], #0x10\n"
- "zip2 v23.4s, v23.4s, v20.4s\n"
+ "zip1 v17.4s, v18.4s, v16.4s\n"
"prfm pldl1keep, [x22, #0x70]\n"
- "zip2 v22.4s, v27.4s, v22.4s\n"
- "ldr q21, [x20], #0x10\n"
- "zip1 v18.4s, v25.4s, v19.4s\n"
+ "zip2 v20.4s, v18.4s, v16.4s\n"
"prfm pldl1keep, [x21, #0x70]\n"
- "str q16, [%x[out_ptr], #0x0]\n"
- "zip1 v20.4s, v26.4s, v22.4s\n"
+ "zip2 v19.4s, v27.4s, v19.4s\n"
"prfm pldl1keep, [x20, #0x70]\n"
- "zip1 v16.4s, v24.4s, v21.4s\n"
- "subs %x[width], %x[width], #0x8\n"
- "zip1 v17.4s, v18.4s, v16.4s\n"
- "cmp %x[width], #0x8\n"
- "zip2 v16.4s, v18.4s, v16.4s\n"
+ "zip2 v16.4s, v26.4s, v24.4s\n"
+ "str q23, [%x[out_ptr], #0x0]\n"
+ "zip1 v18.4s, v19.4s, v16.4s\n"
"str q17, [%x[out_ptr], #0x10]\n"
- "zip2 v19.4s, v25.4s, v19.4s\n"
- "str q23, [%x[out_ptr], #0x20]\n"
- "zip2 v18.4s, v24.4s, v21.4s\n"
- "str q16, [%x[out_ptr], #0x30]\n"
- "zip1 v16.4s, v19.4s, v18.4s\n"
- "str q20, [%x[out_ptr], #0x40]\n"
- "zip2 v17.4s, v26.4s, v22.4s\n"
- "str q16, [%x[out_ptr], #0x50]\n"
- "zip2 v16.4s, v19.4s, v18.4s\n"
+ "zip2 v17.4s, v28.4s, v25.4s\n"
+ "str q22, [%x[out_ptr], #0x20]\n"
+ "zip2 v16.4s, v19.4s, v16.4s\n"
+ "str q20, [%x[out_ptr], #0x30]\n"
+ "str q21, [%x[out_ptr], #0x40]\n"
+ "str q18, [%x[out_ptr], #0x50]\n"
"str q17, [%x[out_ptr], #0x60]\n"
"str q16, [%x[out_ptr], #0x70]\n"
"add %x[out_ptr], %x[out_ptr], #0x80\n"
@@ -127,119 +127,119 @@ void interleave_block<8, 2, VLType::None, false>(
"cbz %x[width], 8f\n"
"tbz %x[width], #2, 5f\n"
"ldr d28, [x27], #0x8\n"
- "ldr d27, [x26], #0x8\n"
- "ldr d26, [x25], #0x8\n"
- "ldr d22, [x24], #0x8\n"
- "ldr d25, [x23], #0x8\n"
- "ldr d24, [x22], #0x8\n"
+ "ldr d29, [x26], #0x8\n"
+ "ldr d25, [x25], #0x8\n"
+ "ldr d21, [x24], #0x8\n"
+ "ldr d27, [x23], #0x8\n"
+ "ldr d26, [x22], #0x8\n"
"ldr d19, [x21], #0x8\n"
- "ldr d21, [x20], #0x8\n"
+ "ldr d24, [x20], #0x8\n"
"tbz %x[width], #1, 4f\n"
"ld1 { v28.s }[2], [x27], #0x4\n"
- "ld1 { v27.s }[2], [x26], #0x4\n"
- "ld1 { v26.s }[2], [x25], #0x4\n"
- "ld1 { v22.s }[2], [x24], #0x4\n"
- "ld1 { v25.s }[2], [x23], #0x4\n"
- "ld1 { v24.s }[2], [x22], #0x4\n"
- "ld1 { v19.s }[2], [x21], #0x4\n"
- "ld1 { v21.s }[2], [x20], #0x4\n"
"mov x19, #0x3\n"
+ "ld1 { v29.s }[2], [x26], #0x4\n"
+ "ld1 { v25.s }[2], [x25], #0x4\n"
+ "ld1 { v21.s }[2], [x24], #0x4\n"
+ "ld1 { v27.s }[2], [x23], #0x4\n"
+ "ld1 { v26.s }[2], [x22], #0x4\n"
+ "ld1 { v19.s }[2], [x21], #0x4\n"
+ "ld1 { v24.s }[2], [x20], #0x4\n"
"tbz %x[width], #0, 7f\n"
"ld1 { v28.h }[6], [x27]\n"
- "ld1 { v27.h }[6], [x26]\n"
- "ld1 { v26.h }[6], [x25]\n"
- "ld1 { v22.h }[6], [x24]\n"
- "ld1 { v25.h }[6], [x23]\n"
- "ld1 { v24.h }[6], [x22]\n"
- "ld1 { v19.h }[6], [x21]\n"
- "ld1 { v21.h }[6], [x20]\n"
"mov x19, #0x4\n"
+ "ld1 { v29.h }[6], [x26]\n"
+ "ld1 { v25.h }[6], [x25]\n"
+ "ld1 { v21.h }[6], [x24]\n"
+ "ld1 { v27.h }[6], [x23]\n"
+ "ld1 { v26.h }[6], [x22]\n"
+ "ld1 { v19.h }[6], [x21]\n"
+ "ld1 { v24.h }[6], [x20]\n"
"b 7f\n"
"4:" // odd_loads_1_4
"mov x19, #0x2\n"
"tbz %x[width], #0, 7f\n"
"ld1 { v28.h }[4], [x27]\n"
- "ld1 { v27.h }[4], [x26]\n"
- "ld1 { v26.h }[4], [x25]\n"
- "ld1 { v22.h }[4], [x24]\n"
- "ld1 { v25.h }[4], [x23]\n"
- "ld1 { v24.h }[4], [x22]\n"
- "ld1 { v19.h }[4], [x21]\n"
- "ld1 { v21.h }[4], [x20]\n"
+ "ld1 { v29.h }[4], [x26]\n"
"mov x19, #0x3\n"
+ "ld1 { v25.h }[4], [x25]\n"
+ "ld1 { v21.h }[4], [x24]\n"
+ "ld1 { v27.h }[4], [x23]\n"
+ "ld1 { v26.h }[4], [x22]\n"
+ "ld1 { v19.h }[4], [x21]\n"
+ "ld1 { v24.h }[4], [x20]\n"
"b 7f\n"
"5:" // odd_loads_2_0
"tbz %x[width], #1, 6f\n"
"ldr s28, [x27], #0x4\n"
- "ldr s27, [x26], #0x4\n"
- "ldr s26, [x25], #0x4\n"
- "ldr s22, [x24], #0x4\n"
- "ldr s25, [x23], #0x4\n"
- "ldr s24, [x22], #0x4\n"
- "ldr s19, [x21], #0x4\n"
- "ldr s21, [x20], #0x4\n"
+ "ldr s29, [x26], #0x4\n"
"mov x19, #0x1\n"
+ "ldr s25, [x25], #0x4\n"
+ "ldr s21, [x24], #0x4\n"
+ "ldr s27, [x23], #0x4\n"
+ "ldr s26, [x22], #0x4\n"
+ "ldr s19, [x21], #0x4\n"
+ "ldr s24, [x20], #0x4\n"
"tbz %x[width], #0, 7f\n"
"ld1 { v28.h }[2], [x27]\n"
- "ld1 { v27.h }[2], [x26]\n"
- "ld1 { v26.h }[2], [x25]\n"
- "ld1 { v22.h }[2], [x24]\n"
- "ld1 { v25.h }[2], [x23]\n"
- "ld1 { v24.h }[2], [x22]\n"
- "ld1 { v19.h }[2], [x21]\n"
- "ld1 { v21.h }[2], [x20]\n"
"mov x19, #0x2\n"
+ "ld1 { v29.h }[2], [x26]\n"
+ "ld1 { v25.h }[2], [x25]\n"
+ "ld1 { v21.h }[2], [x24]\n"
+ "ld1 { v27.h }[2], [x23]\n"
+ "ld1 { v26.h }[2], [x22]\n"
+ "ld1 { v19.h }[2], [x21]\n"
+ "ld1 { v24.h }[2], [x20]\n"
"b 7f\n"
"6:" // odd_loads_1_0
"ldr h28, [x27, #0x0]\n"
- "ldr h27, [x26, #0x0]\n"
- "ldr h26, [x25, #0x0]\n"
- "ldr h22, [x24, #0x0]\n"
- "ldr h25, [x23, #0x0]\n"
- "ldr h24, [x22, #0x0]\n"
- "ldr h19, [x21, #0x0]\n"
- "ldr h21, [x20, #0x0]\n"
"mov x19, #0x1\n"
+ "ldr h29, [x26, #0x0]\n"
+ "ldr h25, [x25, #0x0]\n"
+ "ldr h21, [x24, #0x0]\n"
+ "ldr h27, [x23, #0x0]\n"
+ "ldr h26, [x22, #0x0]\n"
+ "ldr h19, [x21, #0x0]\n"
+ "ldr h24, [x20, #0x0]\n"
"7:" // Odd load end
- "zip1 v23.4s, v28.4s, v26.4s\n"
+ "zip1 v22.4s, v28.4s, v25.4s\n"
"subs x19, x19, #0x1\n"
- "zip1 v20.4s, v27.4s, v22.4s\n"
- "zip1 v16.4s, v23.4s, v20.4s\n"
- "str q16, [%x[out_ptr], #0x0]\n"
- "zip1 v18.4s, v25.4s, v19.4s\n"
- "zip1 v16.4s, v24.4s, v21.4s\n"
+ "zip1 v20.4s, v29.4s, v21.4s\n"
+ "zip1 v23.4s, v22.4s, v20.4s\n"
+ "str q23, [%x[out_ptr], #0x0]\n"
+ "zip1 v18.4s, v27.4s, v19.4s\n"
+ "zip1 v16.4s, v26.4s, v24.4s\n"
"zip1 v17.4s, v18.4s, v16.4s\n"
"str q17, [%x[out_ptr], #0x10]\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
"beq 8f\n"
- "zip2 v23.4s, v23.4s, v20.4s\n"
- "zip2 v16.4s, v18.4s, v16.4s\n"
- "str q23, [%x[out_ptr], #0x0]\n"
- "str q16, [%x[out_ptr], #0x10]\n"
+ "zip2 v22.4s, v22.4s, v20.4s\n"
+ "str q22, [%x[out_ptr], #0x0]\n"
+ "zip2 v20.4s, v18.4s, v16.4s\n"
"subs x19, x19, #0x1\n"
+ "str q20, [%x[out_ptr], #0x10]\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
"beq 8f\n"
- "zip2 v26.4s, v28.4s, v26.4s\n"
- "zip2 v22.4s, v27.4s, v22.4s\n"
+ "zip2 v28.4s, v28.4s, v25.4s\n"
+ "zip2 v25.4s, v29.4s, v21.4s\n"
"subs x19, x19, #0x1\n"
- "zip1 v20.4s, v26.4s, v22.4s\n"
- "str q20, [%x[out_ptr], #0x0]\n"
- "zip2 v19.4s, v25.4s, v19.4s\n"
- "zip2 v18.4s, v24.4s, v21.4s\n"
- "zip1 v16.4s, v19.4s, v18.4s\n"
- "str q16, [%x[out_ptr], #0x10]\n"
+ "zip1 v21.4s, v28.4s, v25.4s\n"
+ "str q21, [%x[out_ptr], #0x0]\n"
+ "zip2 v19.4s, v27.4s, v19.4s\n"
+ "zip2 v16.4s, v26.4s, v24.4s\n"
+ "zip1 v18.4s, v19.4s, v16.4s\n"
+ "str q18, [%x[out_ptr], #0x10]\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
"beq 8f\n"
- "zip2 v17.4s, v26.4s, v22.4s\n"
- "zip2 v16.4s, v19.4s, v18.4s\n"
+ "zip2 v17.4s, v28.4s, v25.4s\n"
"str q17, [%x[out_ptr], #0x0]\n"
+ "zip2 v16.4s, v19.4s, v16.4s\n"
"str q16, [%x[out_ptr], #0x10]\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
"8:" // Odds skip
- : [out_ptr] "+r" (out_ptr), [width] "+r" (width)
+ : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width)
: [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
- : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
+ : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_fp32_fp32.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_fp32_fp32.hpp
index 3aea6a8999..064207c0fa 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_fp32_fp32.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_fp32_fp32.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -80,39 +80,39 @@ void interleave_block<8, 2, VLType::None, false>(
"blt 3f\n"
"2:" // Main loop head
"ldr q27, [x27], #0x10\n"
- "prfm pldl1keep, [x27, #0x70]\n"
+ "subs %x[width], %x[width], #0x4\n"
"ldr q24, [x26], #0x10\n"
"zip1 v26.2d, v27.2d, v24.2d\n"
- "prfm pldl1keep, [x26, #0x70]\n"
"ldr q25, [x25], #0x10\n"
+ "cmp %x[width], #0x4\n"
"zip2 v24.2d, v27.2d, v24.2d\n"
- "prfm pldl1keep, [x25, #0x70]\n"
"ldr q21, [x24], #0x10\n"
- "zip1 v23.2d, v25.2d, v21.2d\n"
- "prfm pldl1keep, [x24, #0x70]\n"
- "ldr q22, [x23], #0x10\n"
+ "ldr q23, [x23], #0x10\n"
+ "zip1 v22.2d, v25.2d, v21.2d\n"
+ "ldr q18, [x22], #0x10\n"
"zip2 v21.2d, v25.2d, v21.2d\n"
+ "ldr q20, [x21], #0x10\n"
+ "ldr q16, [x20], #0x10\n"
+ "zip1 v19.2d, v23.2d, v18.2d\n"
+ "prfm pldl1keep, [x27, #0x70]\n"
+ "zip2 v18.2d, v23.2d, v18.2d\n"
+ "prfm pldl1keep, [x26, #0x70]\n"
+ "zip1 v17.2d, v20.2d, v16.2d\n"
+ "prfm pldl1keep, [x25, #0x70]\n"
+ "zip2 v16.2d, v20.2d, v16.2d\n"
+ "prfm pldl1keep, [x24, #0x70]\n"
"prfm pldl1keep, [x23, #0x70]\n"
- "ldr q18, [x22], #0x10\n"
- "zip1 v20.2d, v22.2d, v18.2d\n"
"prfm pldl1keep, [x22, #0x70]\n"
- "ldr q19, [x21], #0x10\n"
- "zip2 v18.2d, v22.2d, v18.2d\n"
"prfm pldl1keep, [x21, #0x70]\n"
- "ldr q16, [x20], #0x10\n"
- "zip1 v17.2d, v19.2d, v16.2d\n"
"prfm pldl1keep, [x20, #0x70]\n"
"str q26, [%x[out_ptr], #0x0]\n"
- "zip2 v16.2d, v19.2d, v16.2d\n"
- "str q23, [%x[out_ptr], #0x10]\n"
- "str q20, [%x[out_ptr], #0x20]\n"
+ "str q22, [%x[out_ptr], #0x10]\n"
+ "str q19, [%x[out_ptr], #0x20]\n"
"str q17, [%x[out_ptr], #0x30]\n"
"str q24, [%x[out_ptr], #0x40]\n"
"str q21, [%x[out_ptr], #0x50]\n"
"str q18, [%x[out_ptr], #0x60]\n"
"str q16, [%x[out_ptr], #0x70]\n"
- "subs %x[width], %x[width], #0x4\n"
- "cmp %x[width], #0x4\n"
"add %x[out_ptr], %x[out_ptr], #0x80\n"
"bge 2b\n"
"3:" // Main loop skip
@@ -120,58 +120,58 @@ void interleave_block<8, 2, VLType::None, false>(
"tbz %x[width], #1, 4f\n"
"ldr d27, [x27], #0x8\n"
"ldr d24, [x26], #0x8\n"
+ "mov x19, #0x1\n"
"ldr d25, [x25], #0x8\n"
"ldr d21, [x24], #0x8\n"
- "ldr d22, [x23], #0x8\n"
+ "ldr d23, [x23], #0x8\n"
"ldr d18, [x22], #0x8\n"
- "ldr d19, [x21], #0x8\n"
+ "ldr d20, [x21], #0x8\n"
"ldr d16, [x20], #0x8\n"
- "mov x19, #0x1\n"
"tbz %x[width], #0, 5f\n"
"ld1 { v27.s }[2], [x27]\n"
+ "mov x19, #0x2\n"
"ld1 { v24.s }[2], [x26]\n"
"ld1 { v25.s }[2], [x25]\n"
"ld1 { v21.s }[2], [x24]\n"
- "ld1 { v22.s }[2], [x23]\n"
+ "ld1 { v23.s }[2], [x23]\n"
"ld1 { v18.s }[2], [x22]\n"
- "ld1 { v19.s }[2], [x21]\n"
+ "ld1 { v20.s }[2], [x21]\n"
"ld1 { v16.s }[2], [x20]\n"
- "mov x19, #0x2\n"
"b 5f\n"
"4:" // odd_loads_1_0
"ldr s27, [x27, #0x0]\n"
+ "mov x19, #0x1\n"
"ldr s24, [x26, #0x0]\n"
"ldr s25, [x25, #0x0]\n"
"ldr s21, [x24, #0x0]\n"
- "ldr s22, [x23, #0x0]\n"
+ "ldr s23, [x23, #0x0]\n"
"ldr s18, [x22, #0x0]\n"
- "ldr s19, [x21, #0x0]\n"
+ "ldr s20, [x21, #0x0]\n"
"ldr s16, [x20, #0x0]\n"
- "mov x19, #0x1\n"
"5:" // Odd load end
"zip1 v26.2d, v27.2d, v24.2d\n"
- "subs x19, x19, #0x1\n"
- "zip1 v23.2d, v25.2d, v21.2d\n"
"str q26, [%x[out_ptr], #0x0]\n"
- "zip1 v20.2d, v22.2d, v18.2d\n"
- "str q23, [%x[out_ptr], #0x10]\n"
- "zip1 v17.2d, v19.2d, v16.2d\n"
- "str q20, [%x[out_ptr], #0x20]\n"
+ "zip1 v22.2d, v25.2d, v21.2d\n"
+ "subs x19, x19, #0x1\n"
+ "zip1 v19.2d, v23.2d, v18.2d\n"
+ "str q22, [%x[out_ptr], #0x10]\n"
+ "zip1 v17.2d, v20.2d, v16.2d\n"
+ "str q19, [%x[out_ptr], #0x20]\n"
"str q17, [%x[out_ptr], #0x30]\n"
"add %x[out_ptr], %x[out_ptr], #0x40\n"
"beq 6f\n"
"zip2 v24.2d, v27.2d, v24.2d\n"
- "zip2 v21.2d, v25.2d, v21.2d\n"
"str q24, [%x[out_ptr], #0x0]\n"
- "zip2 v18.2d, v22.2d, v18.2d\n"
+ "zip2 v21.2d, v25.2d, v21.2d\n"
+ "zip2 v18.2d, v23.2d, v18.2d\n"
"str q21, [%x[out_ptr], #0x10]\n"
- "zip2 v16.2d, v19.2d, v16.2d\n"
+ "zip2 v16.2d, v20.2d, v16.2d\n"
"str q18, [%x[out_ptr], #0x20]\n"
"str q16, [%x[out_ptr], #0x30]\n"
"add %x[out_ptr], %x[out_ptr], #0x40\n"
"6:" // Odds skip
- : [out_ptr] "+r" (out_ptr), [width] "+r" (width)
+ : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width)
: [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
: "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
);
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_bf16_bf16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_bf16_bf16.hpp
index 4780b77a4a..1f86722bc1 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_bf16_bf16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_bf16_bf16.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -80,39 +80,39 @@ void interleave_block<8, 4, VLType::None, false>(
"blt 3f\n"
"2:" // Main loop head
"ldr q27, [x27], #0x10\n"
- "prfm pldl1keep, [x27, #0x70]\n"
+ "subs %x[width], %x[width], #0x8\n"
"ldr q24, [x26], #0x10\n"
"zip1 v26.2d, v27.2d, v24.2d\n"
- "prfm pldl1keep, [x26, #0x70]\n"
"ldr q25, [x25], #0x10\n"
+ "cmp %x[width], #0x8\n"
"zip2 v24.2d, v27.2d, v24.2d\n"
- "prfm pldl1keep, [x25, #0x70]\n"
"ldr q21, [x24], #0x10\n"
- "zip1 v23.2d, v25.2d, v21.2d\n"
- "prfm pldl1keep, [x24, #0x70]\n"
- "ldr q22, [x23], #0x10\n"
+ "ldr q23, [x23], #0x10\n"
+ "zip1 v22.2d, v25.2d, v21.2d\n"
+ "ldr q18, [x22], #0x10\n"
"zip2 v21.2d, v25.2d, v21.2d\n"
+ "ldr q20, [x21], #0x10\n"
+ "ldr q16, [x20], #0x10\n"
+ "zip1 v19.2d, v23.2d, v18.2d\n"
+ "prfm pldl1keep, [x27, #0x70]\n"
+ "zip2 v18.2d, v23.2d, v18.2d\n"
+ "prfm pldl1keep, [x26, #0x70]\n"
+ "zip1 v17.2d, v20.2d, v16.2d\n"
+ "prfm pldl1keep, [x25, #0x70]\n"
+ "zip2 v16.2d, v20.2d, v16.2d\n"
+ "prfm pldl1keep, [x24, #0x70]\n"
"prfm pldl1keep, [x23, #0x70]\n"
- "ldr q18, [x22], #0x10\n"
- "zip1 v20.2d, v22.2d, v18.2d\n"
"prfm pldl1keep, [x22, #0x70]\n"
- "ldr q19, [x21], #0x10\n"
- "zip2 v18.2d, v22.2d, v18.2d\n"
"prfm pldl1keep, [x21, #0x70]\n"
- "ldr q16, [x20], #0x10\n"
- "zip1 v17.2d, v19.2d, v16.2d\n"
"prfm pldl1keep, [x20, #0x70]\n"
"str q26, [%x[out_ptr], #0x0]\n"
- "zip2 v16.2d, v19.2d, v16.2d\n"
- "str q23, [%x[out_ptr], #0x10]\n"
- "str q20, [%x[out_ptr], #0x20]\n"
+ "str q22, [%x[out_ptr], #0x10]\n"
+ "str q19, [%x[out_ptr], #0x20]\n"
"str q17, [%x[out_ptr], #0x30]\n"
"str q24, [%x[out_ptr], #0x40]\n"
"str q21, [%x[out_ptr], #0x50]\n"
"str q18, [%x[out_ptr], #0x60]\n"
"str q16, [%x[out_ptr], #0x70]\n"
- "subs %x[width], %x[width], #0x8\n"
- "cmp %x[width], #0x8\n"
"add %x[out_ptr], %x[out_ptr], #0x80\n"
"bge 2b\n"
"3:" // Main loop skip
@@ -122,28 +122,28 @@ void interleave_block<8, 4, VLType::None, false>(
"ldr d24, [x26], #0x8\n"
"ldr d25, [x25], #0x8\n"
"ldr d21, [x24], #0x8\n"
- "ldr d22, [x23], #0x8\n"
+ "ldr d23, [x23], #0x8\n"
"ldr d18, [x22], #0x8\n"
- "ldr d19, [x21], #0x8\n"
+ "ldr d20, [x21], #0x8\n"
"ldr d16, [x20], #0x8\n"
"tbz %x[width], #1, 4f\n"
"ld1 { v27.s }[2], [x27], #0x4\n"
+ "mov x19, #0x2\n"
"ld1 { v24.s }[2], [x26], #0x4\n"
"ld1 { v25.s }[2], [x25], #0x4\n"
"ld1 { v21.s }[2], [x24], #0x4\n"
- "ld1 { v22.s }[2], [x23], #0x4\n"
+ "ld1 { v23.s }[2], [x23], #0x4\n"
"ld1 { v18.s }[2], [x22], #0x4\n"
- "ld1 { v19.s }[2], [x21], #0x4\n"
+ "ld1 { v20.s }[2], [x21], #0x4\n"
"ld1 { v16.s }[2], [x20], #0x4\n"
- "mov x19, #0x2\n"
"tbz %x[width], #0, 7f\n"
"ld1 { v27.h }[6], [x27]\n"
"ld1 { v24.h }[6], [x26]\n"
"ld1 { v25.h }[6], [x25]\n"
"ld1 { v21.h }[6], [x24]\n"
- "ld1 { v22.h }[6], [x23]\n"
+ "ld1 { v23.h }[6], [x23]\n"
"ld1 { v18.h }[6], [x22]\n"
- "ld1 { v19.h }[6], [x21]\n"
+ "ld1 { v20.h }[6], [x21]\n"
"ld1 { v16.h }[6], [x20]\n"
"b 7f\n"
"4:" // odd_loads_1_4
@@ -151,69 +151,69 @@ void interleave_block<8, 4, VLType::None, false>(
"tbz %x[width], #0, 7f\n"
"ld1 { v27.h }[4], [x27]\n"
"ld1 { v24.h }[4], [x26]\n"
+ "mov x19, #0x2\n"
"ld1 { v25.h }[4], [x25]\n"
"ld1 { v21.h }[4], [x24]\n"
- "ld1 { v22.h }[4], [x23]\n"
+ "ld1 { v23.h }[4], [x23]\n"
"ld1 { v18.h }[4], [x22]\n"
- "ld1 { v19.h }[4], [x21]\n"
+ "ld1 { v20.h }[4], [x21]\n"
"ld1 { v16.h }[4], [x20]\n"
- "mov x19, #0x2\n"
"b 7f\n"
"5:" // odd_loads_2_0
"tbz %x[width], #1, 6f\n"
"ldr s27, [x27], #0x4\n"
"ldr s24, [x26], #0x4\n"
+ "mov x19, #0x1\n"
"ldr s25, [x25], #0x4\n"
"ldr s21, [x24], #0x4\n"
- "ldr s22, [x23], #0x4\n"
+ "ldr s23, [x23], #0x4\n"
"ldr s18, [x22], #0x4\n"
- "ldr s19, [x21], #0x4\n"
+ "ldr s20, [x21], #0x4\n"
"ldr s16, [x20], #0x4\n"
- "mov x19, #0x1\n"
"tbz %x[width], #0, 7f\n"
"ld1 { v27.h }[2], [x27]\n"
"ld1 { v24.h }[2], [x26]\n"
"ld1 { v25.h }[2], [x25]\n"
"ld1 { v21.h }[2], [x24]\n"
- "ld1 { v22.h }[2], [x23]\n"
+ "ld1 { v23.h }[2], [x23]\n"
"ld1 { v18.h }[2], [x22]\n"
- "ld1 { v19.h }[2], [x21]\n"
+ "ld1 { v20.h }[2], [x21]\n"
"ld1 { v16.h }[2], [x20]\n"
"b 7f\n"
"6:" // odd_loads_1_0
"ldr h27, [x27, #0x0]\n"
+ "mov x19, #0x1\n"
"ldr h24, [x26, #0x0]\n"
"ldr h25, [x25, #0x0]\n"
"ldr h21, [x24, #0x0]\n"
- "ldr h22, [x23, #0x0]\n"
+ "ldr h23, [x23, #0x0]\n"
"ldr h18, [x22, #0x0]\n"
- "ldr h19, [x21, #0x0]\n"
+ "ldr h20, [x21, #0x0]\n"
"ldr h16, [x20, #0x0]\n"
- "mov x19, #0x1\n"
"7:" // Odd load end
"zip1 v26.2d, v27.2d, v24.2d\n"
- "subs x19, x19, #0x1\n"
- "zip1 v23.2d, v25.2d, v21.2d\n"
"str q26, [%x[out_ptr], #0x0]\n"
- "zip1 v20.2d, v22.2d, v18.2d\n"
- "str q23, [%x[out_ptr], #0x10]\n"
- "zip1 v17.2d, v19.2d, v16.2d\n"
- "str q20, [%x[out_ptr], #0x20]\n"
+ "zip1 v22.2d, v25.2d, v21.2d\n"
+ "subs x19, x19, #0x1\n"
+ "zip1 v19.2d, v23.2d, v18.2d\n"
+ "str q22, [%x[out_ptr], #0x10]\n"
+ "zip1 v17.2d, v20.2d, v16.2d\n"
+ "str q19, [%x[out_ptr], #0x20]\n"
"str q17, [%x[out_ptr], #0x30]\n"
"add %x[out_ptr], %x[out_ptr], #0x40\n"
"beq 8f\n"
"zip2 v24.2d, v27.2d, v24.2d\n"
- "zip2 v21.2d, v25.2d, v21.2d\n"
"str q24, [%x[out_ptr], #0x0]\n"
- "zip2 v18.2d, v22.2d, v18.2d\n"
+ "zip2 v21.2d, v25.2d, v21.2d\n"
+ "zip2 v18.2d, v23.2d, v18.2d\n"
"str q21, [%x[out_ptr], #0x10]\n"
- "zip2 v16.2d, v19.2d, v16.2d\n"
+ "zip2 v16.2d, v20.2d, v16.2d\n"
"str q18, [%x[out_ptr], #0x20]\n"
"str q16, [%x[out_ptr], #0x30]\n"
"add %x[out_ptr], %x[out_ptr], #0x40\n"
"8:" // Odds skip
- : [out_ptr] "+r" (out_ptr), [width] "+r" (width)
+ : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width)
: [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
: "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
);
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8.hpp
index a9034f5742..659d9947e2 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -80,45 +80,45 @@ void interleave_block<8, 4, VLType::None, false>(
"blt 3f\n"
"2:" // Main loop head
"ldr q28, [x27], #0x10\n"
+ "subs %x[width], %x[width], #0x10\n"
+ "ldr q29, [x26], #0x10\n"
+ "cmp %x[width], #0x10\n"
+ "ldr q25, [x25], #0x10\n"
+ "zip1 v22.4s, v28.4s, v25.4s\n"
+ "ldr q21, [x24], #0x10\n"
+ "zip2 v28.4s, v28.4s, v25.4s\n"
+ "ldr q27, [x23], #0x10\n"
+ "ldr q26, [x22], #0x10\n"
+ "zip1 v20.4s, v29.4s, v21.4s\n"
+ "ldr q19, [x21], #0x10\n"
+ "zip2 v25.4s, v29.4s, v21.4s\n"
+ "ldr q24, [x20], #0x10\n"
+ "zip1 v23.4s, v22.4s, v20.4s\n"
"prfm pldl1keep, [x27, #0x70]\n"
- "ldr q27, [x26], #0x10\n"
- "ldr q26, [x25], #0x10\n"
- "zip1 v23.4s, v28.4s, v26.4s\n"
+ "zip2 v22.4s, v22.4s, v20.4s\n"
"prfm pldl1keep, [x26, #0x70]\n"
- "ldr q22, [x24], #0x10\n"
- "zip2 v26.4s, v28.4s, v26.4s\n"
+ "zip1 v21.4s, v28.4s, v25.4s\n"
"prfm pldl1keep, [x25, #0x70]\n"
- "ldr q25, [x23], #0x10\n"
- "zip1 v20.4s, v27.4s, v22.4s\n"
+ "zip1 v18.4s, v27.4s, v19.4s\n"
"prfm pldl1keep, [x24, #0x70]\n"
- "ldr q24, [x22], #0x10\n"
- "zip1 v16.4s, v23.4s, v20.4s\n"
+ "zip1 v16.4s, v26.4s, v24.4s\n"
"prfm pldl1keep, [x23, #0x70]\n"
- "ldr q19, [x21], #0x10\n"
- "zip2 v23.4s, v23.4s, v20.4s\n"
+ "zip1 v17.4s, v18.4s, v16.4s\n"
"prfm pldl1keep, [x22, #0x70]\n"
- "zip2 v22.4s, v27.4s, v22.4s\n"
- "ldr q21, [x20], #0x10\n"
- "zip1 v18.4s, v25.4s, v19.4s\n"
+ "zip2 v20.4s, v18.4s, v16.4s\n"
"prfm pldl1keep, [x21, #0x70]\n"
- "str q16, [%x[out_ptr], #0x0]\n"
- "zip1 v20.4s, v26.4s, v22.4s\n"
+ "zip2 v19.4s, v27.4s, v19.4s\n"
"prfm pldl1keep, [x20, #0x70]\n"
- "zip1 v16.4s, v24.4s, v21.4s\n"
- "subs %x[width], %x[width], #0x10\n"
- "zip1 v17.4s, v18.4s, v16.4s\n"
- "cmp %x[width], #0x10\n"
- "zip2 v16.4s, v18.4s, v16.4s\n"
+ "zip2 v16.4s, v26.4s, v24.4s\n"
+ "str q23, [%x[out_ptr], #0x0]\n"
+ "zip1 v18.4s, v19.4s, v16.4s\n"
"str q17, [%x[out_ptr], #0x10]\n"
- "zip2 v19.4s, v25.4s, v19.4s\n"
- "str q23, [%x[out_ptr], #0x20]\n"
- "zip2 v18.4s, v24.4s, v21.4s\n"
- "str q16, [%x[out_ptr], #0x30]\n"
- "zip1 v16.4s, v19.4s, v18.4s\n"
- "str q20, [%x[out_ptr], #0x40]\n"
- "zip2 v17.4s, v26.4s, v22.4s\n"
- "str q16, [%x[out_ptr], #0x50]\n"
- "zip2 v16.4s, v19.4s, v18.4s\n"
+ "zip2 v17.4s, v28.4s, v25.4s\n"
+ "str q22, [%x[out_ptr], #0x20]\n"
+ "zip2 v16.4s, v19.4s, v16.4s\n"
+ "str q20, [%x[out_ptr], #0x30]\n"
+ "str q21, [%x[out_ptr], #0x40]\n"
+ "str q18, [%x[out_ptr], #0x50]\n"
"str q17, [%x[out_ptr], #0x60]\n"
"str q16, [%x[out_ptr], #0x70]\n"
"add %x[out_ptr], %x[out_ptr], #0x80\n"
@@ -127,203 +127,203 @@ void interleave_block<8, 4, VLType::None, false>(
"cbz %x[width], 12f\n"
"tbz %x[width], #3, 7f\n"
"ldr d28, [x27], #0x8\n"
- "ldr d27, [x26], #0x8\n"
- "ldr d26, [x25], #0x8\n"
- "ldr d22, [x24], #0x8\n"
- "ldr d25, [x23], #0x8\n"
- "ldr d24, [x22], #0x8\n"
+ "ldr d29, [x26], #0x8\n"
+ "ldr d25, [x25], #0x8\n"
+ "ldr d21, [x24], #0x8\n"
+ "ldr d27, [x23], #0x8\n"
+ "ldr d26, [x22], #0x8\n"
"ldr d19, [x21], #0x8\n"
- "ldr d21, [x20], #0x8\n"
+ "ldr d24, [x20], #0x8\n"
"tbz %x[width], #2, 5f\n"
"ld1 { v28.s }[2], [x27], #0x4\n"
- "ld1 { v27.s }[2], [x26], #0x4\n"
- "ld1 { v26.s }[2], [x25], #0x4\n"
- "ld1 { v22.s }[2], [x24], #0x4\n"
- "ld1 { v25.s }[2], [x23], #0x4\n"
- "ld1 { v24.s }[2], [x22], #0x4\n"
+ "ld1 { v29.s }[2], [x26], #0x4\n"
+ "ld1 { v25.s }[2], [x25], #0x4\n"
+ "ld1 { v21.s }[2], [x24], #0x4\n"
+ "ld1 { v27.s }[2], [x23], #0x4\n"
+ "ld1 { v26.s }[2], [x22], #0x4\n"
"ld1 { v19.s }[2], [x21], #0x4\n"
- "ld1 { v21.s }[2], [x20], #0x4\n"
+ "ld1 { v24.s }[2], [x20], #0x4\n"
"tbz %x[width], #1, 4f\n"
"ld1 { v28.h }[6], [x27], #0x2\n"
- "ld1 { v27.h }[6], [x26], #0x2\n"
- "ld1 { v26.h }[6], [x25], #0x2\n"
- "ld1 { v22.h }[6], [x24], #0x2\n"
- "ld1 { v25.h }[6], [x23], #0x2\n"
- "ld1 { v24.h }[6], [x22], #0x2\n"
- "ld1 { v19.h }[6], [x21], #0x2\n"
- "ld1 { v21.h }[6], [x20], #0x2\n"
"mov x19, #0x4\n"
+ "ld1 { v29.h }[6], [x26], #0x2\n"
+ "ld1 { v25.h }[6], [x25], #0x2\n"
+ "ld1 { v21.h }[6], [x24], #0x2\n"
+ "ld1 { v27.h }[6], [x23], #0x2\n"
+ "ld1 { v26.h }[6], [x22], #0x2\n"
+ "ld1 { v19.h }[6], [x21], #0x2\n"
+ "ld1 { v24.h }[6], [x20], #0x2\n"
"tbz %x[width], #0, 11f\n"
"ld1 { v28.b }[14], [x27]\n"
- "ld1 { v27.b }[14], [x26]\n"
- "ld1 { v26.b }[14], [x25]\n"
- "ld1 { v22.b }[14], [x24]\n"
- "ld1 { v25.b }[14], [x23]\n"
- "ld1 { v24.b }[14], [x22]\n"
+ "ld1 { v29.b }[14], [x26]\n"
+ "ld1 { v25.b }[14], [x25]\n"
+ "ld1 { v21.b }[14], [x24]\n"
+ "ld1 { v27.b }[14], [x23]\n"
+ "ld1 { v26.b }[14], [x22]\n"
"ld1 { v19.b }[14], [x21]\n"
- "ld1 { v21.b }[14], [x20]\n"
+ "ld1 { v24.b }[14], [x20]\n"
"b 11f\n"
"4:" // odd_loads_1_12
"mov x19, #0x3\n"
"tbz %x[width], #0, 11f\n"
"ld1 { v28.b }[12], [x27]\n"
- "ld1 { v27.b }[12], [x26]\n"
- "ld1 { v26.b }[12], [x25]\n"
- "ld1 { v22.b }[12], [x24]\n"
- "ld1 { v25.b }[12], [x23]\n"
- "ld1 { v24.b }[12], [x22]\n"
- "ld1 { v19.b }[12], [x21]\n"
- "ld1 { v21.b }[12], [x20]\n"
+ "ld1 { v29.b }[12], [x26]\n"
"mov x19, #0x4\n"
+ "ld1 { v25.b }[12], [x25]\n"
+ "ld1 { v21.b }[12], [x24]\n"
+ "ld1 { v27.b }[12], [x23]\n"
+ "ld1 { v26.b }[12], [x22]\n"
+ "ld1 { v19.b }[12], [x21]\n"
+ "ld1 { v24.b }[12], [x20]\n"
"b 11f\n"
"5:" // odd_loads_2_8
"tbz %x[width], #1, 6f\n"
"ld1 { v28.h }[4], [x27], #0x2\n"
- "ld1 { v27.h }[4], [x26], #0x2\n"
- "ld1 { v26.h }[4], [x25], #0x2\n"
- "ld1 { v22.h }[4], [x24], #0x2\n"
- "ld1 { v25.h }[4], [x23], #0x2\n"
- "ld1 { v24.h }[4], [x22], #0x2\n"
- "ld1 { v19.h }[4], [x21], #0x2\n"
- "ld1 { v21.h }[4], [x20], #0x2\n"
+ "ld1 { v29.h }[4], [x26], #0x2\n"
"mov x19, #0x3\n"
+ "ld1 { v25.h }[4], [x25], #0x2\n"
+ "ld1 { v21.h }[4], [x24], #0x2\n"
+ "ld1 { v27.h }[4], [x23], #0x2\n"
+ "ld1 { v26.h }[4], [x22], #0x2\n"
+ "ld1 { v19.h }[4], [x21], #0x2\n"
+ "ld1 { v24.h }[4], [x20], #0x2\n"
"tbz %x[width], #0, 11f\n"
"ld1 { v28.b }[10], [x27]\n"
- "ld1 { v27.b }[10], [x26]\n"
- "ld1 { v26.b }[10], [x25]\n"
- "ld1 { v22.b }[10], [x24]\n"
- "ld1 { v25.b }[10], [x23]\n"
- "ld1 { v24.b }[10], [x22]\n"
+ "ld1 { v29.b }[10], [x26]\n"
+ "ld1 { v25.b }[10], [x25]\n"
+ "ld1 { v21.b }[10], [x24]\n"
+ "ld1 { v27.b }[10], [x23]\n"
+ "ld1 { v26.b }[10], [x22]\n"
"ld1 { v19.b }[10], [x21]\n"
- "ld1 { v21.b }[10], [x20]\n"
+ "ld1 { v24.b }[10], [x20]\n"
"b 11f\n"
"6:" // odd_loads_1_8
"mov x19, #0x2\n"
"tbz %x[width], #0, 11f\n"
"ld1 { v28.b }[8], [x27]\n"
- "ld1 { v27.b }[8], [x26]\n"
- "ld1 { v26.b }[8], [x25]\n"
- "ld1 { v22.b }[8], [x24]\n"
- "ld1 { v25.b }[8], [x23]\n"
- "ld1 { v24.b }[8], [x22]\n"
- "ld1 { v19.b }[8], [x21]\n"
- "ld1 { v21.b }[8], [x20]\n"
+ "ld1 { v29.b }[8], [x26]\n"
"mov x19, #0x3\n"
+ "ld1 { v25.b }[8], [x25]\n"
+ "ld1 { v21.b }[8], [x24]\n"
+ "ld1 { v27.b }[8], [x23]\n"
+ "ld1 { v26.b }[8], [x22]\n"
+ "ld1 { v19.b }[8], [x21]\n"
+ "ld1 { v24.b }[8], [x20]\n"
"b 11f\n"
"7:" // odd_loads_4_0
"tbz %x[width], #2, 9f\n"
"ldr s28, [x27], #0x4\n"
- "ldr s27, [x26], #0x4\n"
- "ldr s26, [x25], #0x4\n"
- "ldr s22, [x24], #0x4\n"
- "ldr s25, [x23], #0x4\n"
- "ldr s24, [x22], #0x4\n"
+ "ldr s29, [x26], #0x4\n"
+ "ldr s25, [x25], #0x4\n"
+ "ldr s21, [x24], #0x4\n"
+ "ldr s27, [x23], #0x4\n"
+ "ldr s26, [x22], #0x4\n"
"ldr s19, [x21], #0x4\n"
- "ldr s21, [x20], #0x4\n"
+ "ldr s24, [x20], #0x4\n"
"tbz %x[width], #1, 8f\n"
"ld1 { v28.h }[2], [x27], #0x2\n"
- "ld1 { v27.h }[2], [x26], #0x2\n"
- "ld1 { v26.h }[2], [x25], #0x2\n"
- "ld1 { v22.h }[2], [x24], #0x2\n"
- "ld1 { v25.h }[2], [x23], #0x2\n"
- "ld1 { v24.h }[2], [x22], #0x2\n"
- "ld1 { v19.h }[2], [x21], #0x2\n"
- "ld1 { v21.h }[2], [x20], #0x2\n"
"mov x19, #0x2\n"
+ "ld1 { v29.h }[2], [x26], #0x2\n"
+ "ld1 { v25.h }[2], [x25], #0x2\n"
+ "ld1 { v21.h }[2], [x24], #0x2\n"
+ "ld1 { v27.h }[2], [x23], #0x2\n"
+ "ld1 { v26.h }[2], [x22], #0x2\n"
+ "ld1 { v19.h }[2], [x21], #0x2\n"
+ "ld1 { v24.h }[2], [x20], #0x2\n"
"tbz %x[width], #0, 11f\n"
"ld1 { v28.b }[6], [x27]\n"
- "ld1 { v27.b }[6], [x26]\n"
- "ld1 { v26.b }[6], [x25]\n"
- "ld1 { v22.b }[6], [x24]\n"
- "ld1 { v25.b }[6], [x23]\n"
- "ld1 { v24.b }[6], [x22]\n"
+ "ld1 { v29.b }[6], [x26]\n"
+ "ld1 { v25.b }[6], [x25]\n"
+ "ld1 { v21.b }[6], [x24]\n"
+ "ld1 { v27.b }[6], [x23]\n"
+ "ld1 { v26.b }[6], [x22]\n"
"ld1 { v19.b }[6], [x21]\n"
- "ld1 { v21.b }[6], [x20]\n"
+ "ld1 { v24.b }[6], [x20]\n"
"b 11f\n"
"8:" // odd_loads_1_4
"mov x19, #0x1\n"
"tbz %x[width], #0, 11f\n"
"ld1 { v28.b }[4], [x27]\n"
- "ld1 { v27.b }[4], [x26]\n"
- "ld1 { v26.b }[4], [x25]\n"
- "ld1 { v22.b }[4], [x24]\n"
- "ld1 { v25.b }[4], [x23]\n"
- "ld1 { v24.b }[4], [x22]\n"
- "ld1 { v19.b }[4], [x21]\n"
- "ld1 { v21.b }[4], [x20]\n"
+ "ld1 { v29.b }[4], [x26]\n"
"mov x19, #0x2\n"
+ "ld1 { v25.b }[4], [x25]\n"
+ "ld1 { v21.b }[4], [x24]\n"
+ "ld1 { v27.b }[4], [x23]\n"
+ "ld1 { v26.b }[4], [x22]\n"
+ "ld1 { v19.b }[4], [x21]\n"
+ "ld1 { v24.b }[4], [x20]\n"
"b 11f\n"
"9:" // odd_loads_2_0
"tbz %x[width], #1, 10f\n"
"ldr h28, [x27], #0x2\n"
- "ldr h27, [x26], #0x2\n"
- "ldr h26, [x25], #0x2\n"
- "ldr h22, [x24], #0x2\n"
- "ldr h25, [x23], #0x2\n"
- "ldr h24, [x22], #0x2\n"
- "ldr h19, [x21], #0x2\n"
- "ldr h21, [x20], #0x2\n"
+ "ldr h29, [x26], #0x2\n"
"mov x19, #0x1\n"
+ "ldr h25, [x25], #0x2\n"
+ "ldr h21, [x24], #0x2\n"
+ "ldr h27, [x23], #0x2\n"
+ "ldr h26, [x22], #0x2\n"
+ "ldr h19, [x21], #0x2\n"
+ "ldr h24, [x20], #0x2\n"
"tbz %x[width], #0, 11f\n"
"ld1 { v28.b }[2], [x27]\n"
- "ld1 { v27.b }[2], [x26]\n"
- "ld1 { v26.b }[2], [x25]\n"
- "ld1 { v22.b }[2], [x24]\n"
- "ld1 { v25.b }[2], [x23]\n"
- "ld1 { v24.b }[2], [x22]\n"
+ "ld1 { v29.b }[2], [x26]\n"
+ "ld1 { v25.b }[2], [x25]\n"
+ "ld1 { v21.b }[2], [x24]\n"
+ "ld1 { v27.b }[2], [x23]\n"
+ "ld1 { v26.b }[2], [x22]\n"
"ld1 { v19.b }[2], [x21]\n"
- "ld1 { v21.b }[2], [x20]\n"
+ "ld1 { v24.b }[2], [x20]\n"
"b 11f\n"
"10:" // odd_loads_1_0
"ldr b28, [x27, #0x0]\n"
- "ldr b27, [x26, #0x0]\n"
- "ldr b26, [x25, #0x0]\n"
- "ldr b22, [x24, #0x0]\n"
- "ldr b25, [x23, #0x0]\n"
- "ldr b24, [x22, #0x0]\n"
- "ldr b19, [x21, #0x0]\n"
- "ldr b21, [x20, #0x0]\n"
"mov x19, #0x1\n"
+ "ldr b29, [x26, #0x0]\n"
+ "ldr b25, [x25, #0x0]\n"
+ "ldr b21, [x24, #0x0]\n"
+ "ldr b27, [x23, #0x0]\n"
+ "ldr b26, [x22, #0x0]\n"
+ "ldr b19, [x21, #0x0]\n"
+ "ldr b24, [x20, #0x0]\n"
"11:" // Odd load end
- "zip1 v23.4s, v28.4s, v26.4s\n"
+ "zip1 v22.4s, v28.4s, v25.4s\n"
"subs x19, x19, #0x1\n"
- "zip1 v20.4s, v27.4s, v22.4s\n"
- "zip1 v16.4s, v23.4s, v20.4s\n"
- "str q16, [%x[out_ptr], #0x0]\n"
- "zip1 v18.4s, v25.4s, v19.4s\n"
- "zip1 v16.4s, v24.4s, v21.4s\n"
+ "zip1 v20.4s, v29.4s, v21.4s\n"
+ "zip1 v23.4s, v22.4s, v20.4s\n"
+ "str q23, [%x[out_ptr], #0x0]\n"
+ "zip1 v18.4s, v27.4s, v19.4s\n"
+ "zip1 v16.4s, v26.4s, v24.4s\n"
"zip1 v17.4s, v18.4s, v16.4s\n"
"str q17, [%x[out_ptr], #0x10]\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
"beq 12f\n"
- "zip2 v23.4s, v23.4s, v20.4s\n"
- "zip2 v16.4s, v18.4s, v16.4s\n"
- "str q23, [%x[out_ptr], #0x0]\n"
- "str q16, [%x[out_ptr], #0x10]\n"
+ "zip2 v22.4s, v22.4s, v20.4s\n"
+ "str q22, [%x[out_ptr], #0x0]\n"
+ "zip2 v20.4s, v18.4s, v16.4s\n"
"subs x19, x19, #0x1\n"
+ "str q20, [%x[out_ptr], #0x10]\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
"beq 12f\n"
- "zip2 v26.4s, v28.4s, v26.4s\n"
- "zip2 v22.4s, v27.4s, v22.4s\n"
+ "zip2 v28.4s, v28.4s, v25.4s\n"
+ "zip2 v25.4s, v29.4s, v21.4s\n"
"subs x19, x19, #0x1\n"
- "zip1 v20.4s, v26.4s, v22.4s\n"
- "str q20, [%x[out_ptr], #0x0]\n"
- "zip2 v19.4s, v25.4s, v19.4s\n"
- "zip2 v18.4s, v24.4s, v21.4s\n"
- "zip1 v16.4s, v19.4s, v18.4s\n"
- "str q16, [%x[out_ptr], #0x10]\n"
+ "zip1 v21.4s, v28.4s, v25.4s\n"
+ "str q21, [%x[out_ptr], #0x0]\n"
+ "zip2 v19.4s, v27.4s, v19.4s\n"
+ "zip2 v16.4s, v26.4s, v24.4s\n"
+ "zip1 v18.4s, v19.4s, v16.4s\n"
+ "str q18, [%x[out_ptr], #0x10]\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
"beq 12f\n"
- "zip2 v17.4s, v26.4s, v22.4s\n"
- "zip2 v16.4s, v19.4s, v18.4s\n"
+ "zip2 v17.4s, v28.4s, v25.4s\n"
"str q17, [%x[out_ptr], #0x0]\n"
+ "zip2 v16.4s, v19.4s, v16.4s\n"
"str q16, [%x[out_ptr], #0x10]\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
"12:" // Odds skip
- : [out_ptr] "+r" (out_ptr), [width] "+r" (width)
+ : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width)
: [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
- : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
+ : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8_summing.hpp
index 2831cb79a6..dfec94c952 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8_summing.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8_summing.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -97,270 +97,270 @@ void interleave_block<8, 4, VLType::None, true>(
"movi v0.8h, #0x0\n"
"mov x19, #0x0\n"
"4:" // no_accumulate_16
- "ldr q29, [x27], #0x10\n"
+ "ldr q28, [x27], #0x10\n"
+ "add x19, x19, #0x1\n"
+ "ldr q29, [x26], #0x10\n"
+ "subs %x[width], %x[width], #0x10\n"
+ "ldr q25, [x25], #0x10\n"
+ "zip1 v22.4s, v28.4s, v25.4s\n"
+ "ldr q21, [x24], #0x10\n"
+ "cmp %x[width], #0x10\n"
+ "zip2 v28.4s, v28.4s, v25.4s\n"
+ "ldr q27, [x23], #0x10\n"
+ "ldr q26, [x22], #0x10\n"
+ "zip1 v20.4s, v29.4s, v21.4s\n"
+ "ldr q19, [x21], #0x10\n"
+ "zip2 v25.4s, v29.4s, v21.4s\n"
+ "ldr q24, [x20], #0x10\n"
+ "zip1 v23.4s, v22.4s, v20.4s\n"
"prfm pldl1keep, [x27, #0x70]\n"
- "ldr q28, [x26], #0x10\n"
- "ldr q27, [x25], #0x10\n"
- "zip1 v23.4s, v29.4s, v27.4s\n"
+ "sadalp v1.8h, v23.16b\n"
+ "zip2 v22.4s, v22.4s, v20.4s\n"
"prfm pldl1keep, [x26, #0x70]\n"
- "ldr q21, [x24], #0x10\n"
- "zip2 v27.4s, v29.4s, v27.4s\n"
+ "sadalp v1.8h, v22.16b\n"
+ "zip1 v18.4s, v27.4s, v19.4s\n"
"prfm pldl1keep, [x25, #0x70]\n"
- "ldr q26, [x23], #0x10\n"
- "zip1 v20.4s, v28.4s, v21.4s\n"
+ "zip1 v16.4s, v26.4s, v24.4s\n"
"prfm pldl1keep, [x24, #0x70]\n"
- "ldr q25, [x22], #0x10\n"
- "zip1 v16.4s, v23.4s, v20.4s\n"
+ "zip1 v21.4s, v28.4s, v25.4s\n"
"prfm pldl1keep, [x23, #0x70]\n"
- "ldr q19, [x21], #0x10\n"
- "zip2 v24.4s, v23.4s, v20.4s\n"
+ "sadalp v1.8h, v21.16b\n"
+ "zip1 v17.4s, v18.4s, v16.4s\n"
"prfm pldl1keep, [x22, #0x70]\n"
- "zip2 v23.4s, v28.4s, v21.4s\n"
- "ldr q22, [x20], #0x10\n"
- "zip1 v18.4s, v26.4s, v19.4s\n"
+ "zip2 v20.4s, v18.4s, v16.4s\n"
"prfm pldl1keep, [x21, #0x70]\n"
- "str q16, [%x[out_ptr], #0x0]\n"
- "zip1 v21.4s, v27.4s, v23.4s\n"
+ "sadalp v0.8h, v17.16b\n"
+ "zip2 v19.4s, v27.4s, v19.4s\n"
"prfm pldl1keep, [x20, #0x70]\n"
- "zip1 v17.4s, v25.4s, v22.4s\n"
- "sadalp v1.8h, v16.16b\n"
- "zip1 v16.4s, v18.4s, v17.4s\n"
- "add x19, x19, #0x1\n"
- "zip2 v20.4s, v18.4s, v17.4s\n"
- "str q16, [%x[out_ptr], #0x10]\n"
- "zip2 v19.4s, v26.4s, v19.4s\n"
- "sadalp v0.8h, v16.16b\n"
- "zip2 v16.4s, v25.4s, v22.4s\n"
- "str q24, [%x[out_ptr], #0x20]\n"
+ "sadalp v0.8h, v20.16b\n"
+ "zip2 v16.4s, v26.4s, v24.4s\n"
+ "str q23, [%x[out_ptr], #0x0]\n"
"zip1 v18.4s, v19.4s, v16.4s\n"
- "sadalp v1.8h, v24.16b\n"
- "zip2 v17.4s, v27.4s, v23.4s\n"
- "str q20, [%x[out_ptr], #0x30]\n"
+ "str q17, [%x[out_ptr], #0x10]\n"
+ "sadalp v0.8h, v18.16b\n"
+ "zip2 v17.4s, v28.4s, v25.4s\n"
+ "str q22, [%x[out_ptr], #0x20]\n"
"zip2 v16.4s, v19.4s, v16.4s\n"
+ "str q20, [%x[out_ptr], #0x30]\n"
+ "sadalp v1.8h, v17.16b\n"
"str q21, [%x[out_ptr], #0x40]\n"
+ "sadalp v0.8h, v16.16b\n"
"str q18, [%x[out_ptr], #0x50]\n"
- "sadalp v0.8h, v20.16b\n"
"str q17, [%x[out_ptr], #0x60]\n"
- "sadalp v1.8h, v21.16b\n"
"str q16, [%x[out_ptr], #0x70]\n"
- "subs %x[width], %x[width], #0x10\n"
- "sadalp v0.8h, v18.16b\n"
- "cmp %x[width], #0x10\n"
- "sadalp v1.8h, v17.16b\n"
"add %x[out_ptr], %x[out_ptr], #0x80\n"
- "sadalp v0.8h, v16.16b\n"
"bge 3b\n"
"5:" // Main loop skip
"cbz %x[width], 14f\n"
"tbz %x[width], #3, 9f\n"
- "ldr d29, [x27], #0x8\n"
- "ldr d28, [x26], #0x8\n"
- "ldr d27, [x25], #0x8\n"
+ "ldr d28, [x27], #0x8\n"
+ "ldr d29, [x26], #0x8\n"
+ "ldr d25, [x25], #0x8\n"
"ldr d21, [x24], #0x8\n"
- "ldr d26, [x23], #0x8\n"
- "ldr d25, [x22], #0x8\n"
+ "ldr d27, [x23], #0x8\n"
+ "ldr d26, [x22], #0x8\n"
"ldr d19, [x21], #0x8\n"
- "ldr d22, [x20], #0x8\n"
+ "ldr d24, [x20], #0x8\n"
"tbz %x[width], #2, 7f\n"
- "ld1 { v29.s }[2], [x27], #0x4\n"
- "ld1 { v28.s }[2], [x26], #0x4\n"
- "ld1 { v27.s }[2], [x25], #0x4\n"
+ "ld1 { v28.s }[2], [x27], #0x4\n"
+ "ld1 { v29.s }[2], [x26], #0x4\n"
+ "ld1 { v25.s }[2], [x25], #0x4\n"
"ld1 { v21.s }[2], [x24], #0x4\n"
- "ld1 { v26.s }[2], [x23], #0x4\n"
- "ld1 { v25.s }[2], [x22], #0x4\n"
+ "ld1 { v27.s }[2], [x23], #0x4\n"
+ "ld1 { v26.s }[2], [x22], #0x4\n"
"ld1 { v19.s }[2], [x21], #0x4\n"
- "ld1 { v22.s }[2], [x20], #0x4\n"
+ "ld1 { v24.s }[2], [x20], #0x4\n"
"tbz %x[width], #1, 6f\n"
- "ld1 { v29.h }[6], [x27], #0x2\n"
- "ld1 { v28.h }[6], [x26], #0x2\n"
- "ld1 { v27.h }[6], [x25], #0x2\n"
+ "ld1 { v28.h }[6], [x27], #0x2\n"
+ "mov x19, #0x4\n"
+ "ld1 { v29.h }[6], [x26], #0x2\n"
+ "ld1 { v25.h }[6], [x25], #0x2\n"
"ld1 { v21.h }[6], [x24], #0x2\n"
- "ld1 { v26.h }[6], [x23], #0x2\n"
- "ld1 { v25.h }[6], [x22], #0x2\n"
+ "ld1 { v27.h }[6], [x23], #0x2\n"
+ "ld1 { v26.h }[6], [x22], #0x2\n"
"ld1 { v19.h }[6], [x21], #0x2\n"
- "ld1 { v22.h }[6], [x20], #0x2\n"
- "mov x19, #0x4\n"
+ "ld1 { v24.h }[6], [x20], #0x2\n"
"tbz %x[width], #0, 13f\n"
- "ld1 { v29.b }[14], [x27]\n"
- "ld1 { v28.b }[14], [x26]\n"
- "ld1 { v27.b }[14], [x25]\n"
+ "ld1 { v28.b }[14], [x27]\n"
+ "ld1 { v29.b }[14], [x26]\n"
+ "ld1 { v25.b }[14], [x25]\n"
"ld1 { v21.b }[14], [x24]\n"
- "ld1 { v26.b }[14], [x23]\n"
- "ld1 { v25.b }[14], [x22]\n"
+ "ld1 { v27.b }[14], [x23]\n"
+ "ld1 { v26.b }[14], [x22]\n"
"ld1 { v19.b }[14], [x21]\n"
- "ld1 { v22.b }[14], [x20]\n"
+ "ld1 { v24.b }[14], [x20]\n"
"b 13f\n"
"6:" // odd_loads_1_12
"mov x19, #0x3\n"
"tbz %x[width], #0, 13f\n"
- "ld1 { v29.b }[12], [x27]\n"
- "ld1 { v28.b }[12], [x26]\n"
- "ld1 { v27.b }[12], [x25]\n"
+ "ld1 { v28.b }[12], [x27]\n"
+ "ld1 { v29.b }[12], [x26]\n"
+ "mov x19, #0x4\n"
+ "ld1 { v25.b }[12], [x25]\n"
"ld1 { v21.b }[12], [x24]\n"
- "ld1 { v26.b }[12], [x23]\n"
- "ld1 { v25.b }[12], [x22]\n"
+ "ld1 { v27.b }[12], [x23]\n"
+ "ld1 { v26.b }[12], [x22]\n"
"ld1 { v19.b }[12], [x21]\n"
- "ld1 { v22.b }[12], [x20]\n"
- "mov x19, #0x4\n"
+ "ld1 { v24.b }[12], [x20]\n"
"b 13f\n"
"7:" // odd_loads_2_8
"tbz %x[width], #1, 8f\n"
- "ld1 { v29.h }[4], [x27], #0x2\n"
- "ld1 { v28.h }[4], [x26], #0x2\n"
- "ld1 { v27.h }[4], [x25], #0x2\n"
+ "ld1 { v28.h }[4], [x27], #0x2\n"
+ "ld1 { v29.h }[4], [x26], #0x2\n"
+ "mov x19, #0x3\n"
+ "ld1 { v25.h }[4], [x25], #0x2\n"
"ld1 { v21.h }[4], [x24], #0x2\n"
- "ld1 { v26.h }[4], [x23], #0x2\n"
- "ld1 { v25.h }[4], [x22], #0x2\n"
+ "ld1 { v27.h }[4], [x23], #0x2\n"
+ "ld1 { v26.h }[4], [x22], #0x2\n"
"ld1 { v19.h }[4], [x21], #0x2\n"
- "ld1 { v22.h }[4], [x20], #0x2\n"
- "mov x19, #0x3\n"
+ "ld1 { v24.h }[4], [x20], #0x2\n"
"tbz %x[width], #0, 13f\n"
- "ld1 { v29.b }[10], [x27]\n"
- "ld1 { v28.b }[10], [x26]\n"
- "ld1 { v27.b }[10], [x25]\n"
+ "ld1 { v28.b }[10], [x27]\n"
+ "ld1 { v29.b }[10], [x26]\n"
+ "ld1 { v25.b }[10], [x25]\n"
"ld1 { v21.b }[10], [x24]\n"
- "ld1 { v26.b }[10], [x23]\n"
- "ld1 { v25.b }[10], [x22]\n"
+ "ld1 { v27.b }[10], [x23]\n"
+ "ld1 { v26.b }[10], [x22]\n"
"ld1 { v19.b }[10], [x21]\n"
- "ld1 { v22.b }[10], [x20]\n"
+ "ld1 { v24.b }[10], [x20]\n"
"b 13f\n"
"8:" // odd_loads_1_8
"mov x19, #0x2\n"
"tbz %x[width], #0, 13f\n"
- "ld1 { v29.b }[8], [x27]\n"
- "ld1 { v28.b }[8], [x26]\n"
- "ld1 { v27.b }[8], [x25]\n"
+ "ld1 { v28.b }[8], [x27]\n"
+ "ld1 { v29.b }[8], [x26]\n"
+ "mov x19, #0x3\n"
+ "ld1 { v25.b }[8], [x25]\n"
"ld1 { v21.b }[8], [x24]\n"
- "ld1 { v26.b }[8], [x23]\n"
- "ld1 { v25.b }[8], [x22]\n"
+ "ld1 { v27.b }[8], [x23]\n"
+ "ld1 { v26.b }[8], [x22]\n"
"ld1 { v19.b }[8], [x21]\n"
- "ld1 { v22.b }[8], [x20]\n"
- "mov x19, #0x3\n"
+ "ld1 { v24.b }[8], [x20]\n"
"b 13f\n"
"9:" // odd_loads_4_0
"tbz %x[width], #2, 11f\n"
- "ldr s29, [x27], #0x4\n"
- "ldr s28, [x26], #0x4\n"
- "ldr s27, [x25], #0x4\n"
+ "ldr s28, [x27], #0x4\n"
+ "ldr s29, [x26], #0x4\n"
+ "ldr s25, [x25], #0x4\n"
"ldr s21, [x24], #0x4\n"
- "ldr s26, [x23], #0x4\n"
- "ldr s25, [x22], #0x4\n"
+ "ldr s27, [x23], #0x4\n"
+ "ldr s26, [x22], #0x4\n"
"ldr s19, [x21], #0x4\n"
- "ldr s22, [x20], #0x4\n"
+ "ldr s24, [x20], #0x4\n"
"tbz %x[width], #1, 10f\n"
- "ld1 { v29.h }[2], [x27], #0x2\n"
- "ld1 { v28.h }[2], [x26], #0x2\n"
- "ld1 { v27.h }[2], [x25], #0x2\n"
+ "ld1 { v28.h }[2], [x27], #0x2\n"
+ "mov x19, #0x2\n"
+ "ld1 { v29.h }[2], [x26], #0x2\n"
+ "ld1 { v25.h }[2], [x25], #0x2\n"
"ld1 { v21.h }[2], [x24], #0x2\n"
- "ld1 { v26.h }[2], [x23], #0x2\n"
- "ld1 { v25.h }[2], [x22], #0x2\n"
+ "ld1 { v27.h }[2], [x23], #0x2\n"
+ "ld1 { v26.h }[2], [x22], #0x2\n"
"ld1 { v19.h }[2], [x21], #0x2\n"
- "ld1 { v22.h }[2], [x20], #0x2\n"
- "mov x19, #0x2\n"
+ "ld1 { v24.h }[2], [x20], #0x2\n"
"tbz %x[width], #0, 13f\n"
- "ld1 { v29.b }[6], [x27]\n"
- "ld1 { v28.b }[6], [x26]\n"
- "ld1 { v27.b }[6], [x25]\n"
+ "ld1 { v28.b }[6], [x27]\n"
+ "ld1 { v29.b }[6], [x26]\n"
+ "ld1 { v25.b }[6], [x25]\n"
"ld1 { v21.b }[6], [x24]\n"
- "ld1 { v26.b }[6], [x23]\n"
- "ld1 { v25.b }[6], [x22]\n"
+ "ld1 { v27.b }[6], [x23]\n"
+ "ld1 { v26.b }[6], [x22]\n"
"ld1 { v19.b }[6], [x21]\n"
- "ld1 { v22.b }[6], [x20]\n"
+ "ld1 { v24.b }[6], [x20]\n"
"b 13f\n"
"10:" // odd_loads_1_4
"mov x19, #0x1\n"
"tbz %x[width], #0, 13f\n"
- "ld1 { v29.b }[4], [x27]\n"
- "ld1 { v28.b }[4], [x26]\n"
- "ld1 { v27.b }[4], [x25]\n"
+ "ld1 { v28.b }[4], [x27]\n"
+ "ld1 { v29.b }[4], [x26]\n"
+ "mov x19, #0x2\n"
+ "ld1 { v25.b }[4], [x25]\n"
"ld1 { v21.b }[4], [x24]\n"
- "ld1 { v26.b }[4], [x23]\n"
- "ld1 { v25.b }[4], [x22]\n"
+ "ld1 { v27.b }[4], [x23]\n"
+ "ld1 { v26.b }[4], [x22]\n"
"ld1 { v19.b }[4], [x21]\n"
- "ld1 { v22.b }[4], [x20]\n"
- "mov x19, #0x2\n"
+ "ld1 { v24.b }[4], [x20]\n"
"b 13f\n"
"11:" // odd_loads_2_0
"tbz %x[width], #1, 12f\n"
- "ldr h29, [x27], #0x2\n"
- "ldr h28, [x26], #0x2\n"
- "ldr h27, [x25], #0x2\n"
+ "ldr h28, [x27], #0x2\n"
+ "ldr h29, [x26], #0x2\n"
+ "mov x19, #0x1\n"
+ "ldr h25, [x25], #0x2\n"
"ldr h21, [x24], #0x2\n"
- "ldr h26, [x23], #0x2\n"
- "ldr h25, [x22], #0x2\n"
+ "ldr h27, [x23], #0x2\n"
+ "ldr h26, [x22], #0x2\n"
"ldr h19, [x21], #0x2\n"
- "ldr h22, [x20], #0x2\n"
- "mov x19, #0x1\n"
+ "ldr h24, [x20], #0x2\n"
"tbz %x[width], #0, 13f\n"
- "ld1 { v29.b }[2], [x27]\n"
- "ld1 { v28.b }[2], [x26]\n"
- "ld1 { v27.b }[2], [x25]\n"
+ "ld1 { v28.b }[2], [x27]\n"
+ "ld1 { v29.b }[2], [x26]\n"
+ "ld1 { v25.b }[2], [x25]\n"
"ld1 { v21.b }[2], [x24]\n"
- "ld1 { v26.b }[2], [x23]\n"
- "ld1 { v25.b }[2], [x22]\n"
+ "ld1 { v27.b }[2], [x23]\n"
+ "ld1 { v26.b }[2], [x22]\n"
"ld1 { v19.b }[2], [x21]\n"
- "ld1 { v22.b }[2], [x20]\n"
+ "ld1 { v24.b }[2], [x20]\n"
"b 13f\n"
"12:" // odd_loads_1_0
- "ldr b29, [x27, #0x0]\n"
- "ldr b28, [x26, #0x0]\n"
- "ldr b27, [x25, #0x0]\n"
+ "ldr b28, [x27, #0x0]\n"
+ "mov x19, #0x1\n"
+ "ldr b29, [x26, #0x0]\n"
+ "ldr b25, [x25, #0x0]\n"
"ldr b21, [x24, #0x0]\n"
- "ldr b26, [x23, #0x0]\n"
- "ldr b25, [x22, #0x0]\n"
+ "ldr b27, [x23, #0x0]\n"
+ "ldr b26, [x22, #0x0]\n"
"ldr b19, [x21, #0x0]\n"
- "ldr b22, [x20, #0x0]\n"
- "mov x19, #0x1\n"
+ "ldr b24, [x20, #0x0]\n"
"13:" // Odd load end
- "zip1 v23.4s, v29.4s, v27.4s\n"
+ "zip1 v22.4s, v28.4s, v25.4s\n"
"subs x19, x19, #0x1\n"
- "zip1 v20.4s, v28.4s, v21.4s\n"
- "zip1 v16.4s, v23.4s, v20.4s\n"
- "str q16, [%x[out_ptr], #0x0]\n"
- "zip1 v18.4s, v26.4s, v19.4s\n"
- "sadalp v1.8h, v16.16b\n"
- "zip1 v17.4s, v25.4s, v22.4s\n"
- "zip1 v16.4s, v18.4s, v17.4s\n"
- "str q16, [%x[out_ptr], #0x10]\n"
- "sadalp v0.8h, v16.16b\n"
+ "zip1 v20.4s, v29.4s, v21.4s\n"
+ "zip1 v23.4s, v22.4s, v20.4s\n"
+ "str q23, [%x[out_ptr], #0x0]\n"
+ "sadalp v1.8h, v23.16b\n"
+ "zip1 v18.4s, v27.4s, v19.4s\n"
+ "zip1 v16.4s, v26.4s, v24.4s\n"
+ "zip1 v17.4s, v18.4s, v16.4s\n"
+ "str q17, [%x[out_ptr], #0x10]\n"
+ "sadalp v0.8h, v17.16b\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
"beq 14f\n"
- "zip2 v24.4s, v23.4s, v20.4s\n"
- "zip2 v20.4s, v18.4s, v17.4s\n"
- "str q24, [%x[out_ptr], #0x0]\n"
- "sadalp v1.8h, v24.16b\n"
+ "zip2 v22.4s, v22.4s, v20.4s\n"
+ "str q22, [%x[out_ptr], #0x0]\n"
+ "zip2 v20.4s, v18.4s, v16.4s\n"
+ "sadalp v1.8h, v22.16b\n"
"str q20, [%x[out_ptr], #0x10]\n"
- "sadalp v0.8h, v20.16b\n"
"subs x19, x19, #0x1\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
+ "sadalp v0.8h, v20.16b\n"
"beq 14f\n"
- "zip2 v27.4s, v29.4s, v27.4s\n"
- "zip2 v23.4s, v28.4s, v21.4s\n"
+ "zip2 v28.4s, v28.4s, v25.4s\n"
+ "zip2 v25.4s, v29.4s, v21.4s\n"
"subs x19, x19, #0x1\n"
- "zip1 v21.4s, v27.4s, v23.4s\n"
+ "zip1 v21.4s, v28.4s, v25.4s\n"
"str q21, [%x[out_ptr], #0x0]\n"
- "zip2 v19.4s, v26.4s, v19.4s\n"
"sadalp v1.8h, v21.16b\n"
- "zip2 v16.4s, v25.4s, v22.4s\n"
+ "zip2 v19.4s, v27.4s, v19.4s\n"
+ "zip2 v16.4s, v26.4s, v24.4s\n"
"zip1 v18.4s, v19.4s, v16.4s\n"
"str q18, [%x[out_ptr], #0x10]\n"
"sadalp v0.8h, v18.16b\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
"beq 14f\n"
- "zip2 v17.4s, v27.4s, v23.4s\n"
- "zip2 v16.4s, v19.4s, v16.4s\n"
+ "zip2 v17.4s, v28.4s, v25.4s\n"
"str q17, [%x[out_ptr], #0x0]\n"
+ "zip2 v16.4s, v19.4s, v16.4s\n"
"sadalp v1.8h, v17.16b\n"
"str q16, [%x[out_ptr], #0x10]\n"
- "sadalp v0.8h, v16.16b\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
+ "sadalp v0.8h, v16.16b\n"
"14:" // Odds skip
"sadalp v31.4s, v1.8h\n"
- "sadalp v30.4s, v0.8h\n"
"str q31, [%x[out_ptr], #0x0]\n"
+ "sadalp v30.4s, v0.8h\n"
"str q30, [%x[out_ptr], #0x10]\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
- : [out_ptr] "+r" (out_ptr), [width] "+r" (width)
+ : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width)
: [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
: "cc", "memory", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
);
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_u8_u8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_u8_u8_summing.hpp
index 7c7857bcd0..1b94c7f1f1 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_u8_u8_summing.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_u8_u8_summing.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -97,270 +97,270 @@ void interleave_block<8, 4, VLType::None, true>(
"movi v0.8h, #0x0\n"
"mov x19, #0x0\n"
"4:" // no_accumulate_16
- "ldr q29, [x27], #0x10\n"
+ "ldr q28, [x27], #0x10\n"
+ "add x19, x19, #0x1\n"
+ "ldr q29, [x26], #0x10\n"
+ "subs %x[width], %x[width], #0x10\n"
+ "ldr q25, [x25], #0x10\n"
+ "zip1 v22.4s, v28.4s, v25.4s\n"
+ "ldr q21, [x24], #0x10\n"
+ "cmp %x[width], #0x10\n"
+ "zip2 v28.4s, v28.4s, v25.4s\n"
+ "ldr q27, [x23], #0x10\n"
+ "ldr q26, [x22], #0x10\n"
+ "zip1 v20.4s, v29.4s, v21.4s\n"
+ "ldr q19, [x21], #0x10\n"
+ "zip2 v25.4s, v29.4s, v21.4s\n"
+ "ldr q24, [x20], #0x10\n"
+ "zip1 v23.4s, v22.4s, v20.4s\n"
"prfm pldl1keep, [x27, #0x70]\n"
- "ldr q28, [x26], #0x10\n"
- "ldr q27, [x25], #0x10\n"
- "zip1 v23.4s, v29.4s, v27.4s\n"
+ "uadalp v1.8h, v23.16b\n"
+ "zip2 v22.4s, v22.4s, v20.4s\n"
"prfm pldl1keep, [x26, #0x70]\n"
- "ldr q21, [x24], #0x10\n"
- "zip2 v27.4s, v29.4s, v27.4s\n"
+ "uadalp v1.8h, v22.16b\n"
+ "zip1 v18.4s, v27.4s, v19.4s\n"
"prfm pldl1keep, [x25, #0x70]\n"
- "ldr q26, [x23], #0x10\n"
- "zip1 v20.4s, v28.4s, v21.4s\n"
+ "zip1 v16.4s, v26.4s, v24.4s\n"
"prfm pldl1keep, [x24, #0x70]\n"
- "ldr q25, [x22], #0x10\n"
- "zip1 v16.4s, v23.4s, v20.4s\n"
+ "zip1 v21.4s, v28.4s, v25.4s\n"
"prfm pldl1keep, [x23, #0x70]\n"
- "ldr q19, [x21], #0x10\n"
- "zip2 v24.4s, v23.4s, v20.4s\n"
+ "uadalp v1.8h, v21.16b\n"
+ "zip1 v17.4s, v18.4s, v16.4s\n"
"prfm pldl1keep, [x22, #0x70]\n"
- "zip2 v23.4s, v28.4s, v21.4s\n"
- "ldr q22, [x20], #0x10\n"
- "zip1 v18.4s, v26.4s, v19.4s\n"
+ "zip2 v20.4s, v18.4s, v16.4s\n"
"prfm pldl1keep, [x21, #0x70]\n"
- "str q16, [%x[out_ptr], #0x0]\n"
- "zip1 v21.4s, v27.4s, v23.4s\n"
+ "uadalp v0.8h, v17.16b\n"
+ "zip2 v19.4s, v27.4s, v19.4s\n"
"prfm pldl1keep, [x20, #0x70]\n"
- "zip1 v17.4s, v25.4s, v22.4s\n"
- "uadalp v1.8h, v16.16b\n"
- "zip1 v16.4s, v18.4s, v17.4s\n"
- "add x19, x19, #0x1\n"
- "zip2 v20.4s, v18.4s, v17.4s\n"
- "str q16, [%x[out_ptr], #0x10]\n"
- "zip2 v19.4s, v26.4s, v19.4s\n"
- "uadalp v0.8h, v16.16b\n"
- "zip2 v16.4s, v25.4s, v22.4s\n"
- "str q24, [%x[out_ptr], #0x20]\n"
+ "uadalp v0.8h, v20.16b\n"
+ "zip2 v16.4s, v26.4s, v24.4s\n"
+ "str q23, [%x[out_ptr], #0x0]\n"
"zip1 v18.4s, v19.4s, v16.4s\n"
- "uadalp v1.8h, v24.16b\n"
- "zip2 v17.4s, v27.4s, v23.4s\n"
- "str q20, [%x[out_ptr], #0x30]\n"
+ "str q17, [%x[out_ptr], #0x10]\n"
+ "uadalp v0.8h, v18.16b\n"
+ "zip2 v17.4s, v28.4s, v25.4s\n"
+ "str q22, [%x[out_ptr], #0x20]\n"
"zip2 v16.4s, v19.4s, v16.4s\n"
+ "str q20, [%x[out_ptr], #0x30]\n"
+ "uadalp v1.8h, v17.16b\n"
"str q21, [%x[out_ptr], #0x40]\n"
+ "uadalp v0.8h, v16.16b\n"
"str q18, [%x[out_ptr], #0x50]\n"
- "uadalp v0.8h, v20.16b\n"
"str q17, [%x[out_ptr], #0x60]\n"
- "uadalp v1.8h, v21.16b\n"
"str q16, [%x[out_ptr], #0x70]\n"
- "subs %x[width], %x[width], #0x10\n"
- "uadalp v0.8h, v18.16b\n"
- "cmp %x[width], #0x10\n"
- "uadalp v1.8h, v17.16b\n"
"add %x[out_ptr], %x[out_ptr], #0x80\n"
- "uadalp v0.8h, v16.16b\n"
"bge 3b\n"
"5:" // Main loop skip
"cbz %x[width], 14f\n"
"tbz %x[width], #3, 9f\n"
- "ldr d29, [x27], #0x8\n"
- "ldr d28, [x26], #0x8\n"
- "ldr d27, [x25], #0x8\n"
+ "ldr d28, [x27], #0x8\n"
+ "ldr d29, [x26], #0x8\n"
+ "ldr d25, [x25], #0x8\n"
"ldr d21, [x24], #0x8\n"
- "ldr d26, [x23], #0x8\n"
- "ldr d25, [x22], #0x8\n"
+ "ldr d27, [x23], #0x8\n"
+ "ldr d26, [x22], #0x8\n"
"ldr d19, [x21], #0x8\n"
- "ldr d22, [x20], #0x8\n"
+ "ldr d24, [x20], #0x8\n"
"tbz %x[width], #2, 7f\n"
- "ld1 { v29.s }[2], [x27], #0x4\n"
- "ld1 { v28.s }[2], [x26], #0x4\n"
- "ld1 { v27.s }[2], [x25], #0x4\n"
+ "ld1 { v28.s }[2], [x27], #0x4\n"
+ "ld1 { v29.s }[2], [x26], #0x4\n"
+ "ld1 { v25.s }[2], [x25], #0x4\n"
"ld1 { v21.s }[2], [x24], #0x4\n"
- "ld1 { v26.s }[2], [x23], #0x4\n"
- "ld1 { v25.s }[2], [x22], #0x4\n"
+ "ld1 { v27.s }[2], [x23], #0x4\n"
+ "ld1 { v26.s }[2], [x22], #0x4\n"
"ld1 { v19.s }[2], [x21], #0x4\n"
- "ld1 { v22.s }[2], [x20], #0x4\n"
+ "ld1 { v24.s }[2], [x20], #0x4\n"
"tbz %x[width], #1, 6f\n"
- "ld1 { v29.h }[6], [x27], #0x2\n"
- "ld1 { v28.h }[6], [x26], #0x2\n"
- "ld1 { v27.h }[6], [x25], #0x2\n"
+ "ld1 { v28.h }[6], [x27], #0x2\n"
+ "mov x19, #0x4\n"
+ "ld1 { v29.h }[6], [x26], #0x2\n"
+ "ld1 { v25.h }[6], [x25], #0x2\n"
"ld1 { v21.h }[6], [x24], #0x2\n"
- "ld1 { v26.h }[6], [x23], #0x2\n"
- "ld1 { v25.h }[6], [x22], #0x2\n"
+ "ld1 { v27.h }[6], [x23], #0x2\n"
+ "ld1 { v26.h }[6], [x22], #0x2\n"
"ld1 { v19.h }[6], [x21], #0x2\n"
- "ld1 { v22.h }[6], [x20], #0x2\n"
- "mov x19, #0x4\n"
+ "ld1 { v24.h }[6], [x20], #0x2\n"
"tbz %x[width], #0, 13f\n"
- "ld1 { v29.b }[14], [x27]\n"
- "ld1 { v28.b }[14], [x26]\n"
- "ld1 { v27.b }[14], [x25]\n"
+ "ld1 { v28.b }[14], [x27]\n"
+ "ld1 { v29.b }[14], [x26]\n"
+ "ld1 { v25.b }[14], [x25]\n"
"ld1 { v21.b }[14], [x24]\n"
- "ld1 { v26.b }[14], [x23]\n"
- "ld1 { v25.b }[14], [x22]\n"
+ "ld1 { v27.b }[14], [x23]\n"
+ "ld1 { v26.b }[14], [x22]\n"
"ld1 { v19.b }[14], [x21]\n"
- "ld1 { v22.b }[14], [x20]\n"
+ "ld1 { v24.b }[14], [x20]\n"
"b 13f\n"
"6:" // odd_loads_1_12
"mov x19, #0x3\n"
"tbz %x[width], #0, 13f\n"
- "ld1 { v29.b }[12], [x27]\n"
- "ld1 { v28.b }[12], [x26]\n"
- "ld1 { v27.b }[12], [x25]\n"
+ "ld1 { v28.b }[12], [x27]\n"
+ "ld1 { v29.b }[12], [x26]\n"
+ "mov x19, #0x4\n"
+ "ld1 { v25.b }[12], [x25]\n"
"ld1 { v21.b }[12], [x24]\n"
- "ld1 { v26.b }[12], [x23]\n"
- "ld1 { v25.b }[12], [x22]\n"
+ "ld1 { v27.b }[12], [x23]\n"
+ "ld1 { v26.b }[12], [x22]\n"
"ld1 { v19.b }[12], [x21]\n"
- "ld1 { v22.b }[12], [x20]\n"
- "mov x19, #0x4\n"
+ "ld1 { v24.b }[12], [x20]\n"
"b 13f\n"
"7:" // odd_loads_2_8
"tbz %x[width], #1, 8f\n"
- "ld1 { v29.h }[4], [x27], #0x2\n"
- "ld1 { v28.h }[4], [x26], #0x2\n"
- "ld1 { v27.h }[4], [x25], #0x2\n"
+ "ld1 { v28.h }[4], [x27], #0x2\n"
+ "ld1 { v29.h }[4], [x26], #0x2\n"
+ "mov x19, #0x3\n"
+ "ld1 { v25.h }[4], [x25], #0x2\n"
"ld1 { v21.h }[4], [x24], #0x2\n"
- "ld1 { v26.h }[4], [x23], #0x2\n"
- "ld1 { v25.h }[4], [x22], #0x2\n"
+ "ld1 { v27.h }[4], [x23], #0x2\n"
+ "ld1 { v26.h }[4], [x22], #0x2\n"
"ld1 { v19.h }[4], [x21], #0x2\n"
- "ld1 { v22.h }[4], [x20], #0x2\n"
- "mov x19, #0x3\n"
+ "ld1 { v24.h }[4], [x20], #0x2\n"
"tbz %x[width], #0, 13f\n"
- "ld1 { v29.b }[10], [x27]\n"
- "ld1 { v28.b }[10], [x26]\n"
- "ld1 { v27.b }[10], [x25]\n"
+ "ld1 { v28.b }[10], [x27]\n"
+ "ld1 { v29.b }[10], [x26]\n"
+ "ld1 { v25.b }[10], [x25]\n"
"ld1 { v21.b }[10], [x24]\n"
- "ld1 { v26.b }[10], [x23]\n"
- "ld1 { v25.b }[10], [x22]\n"
+ "ld1 { v27.b }[10], [x23]\n"
+ "ld1 { v26.b }[10], [x22]\n"
"ld1 { v19.b }[10], [x21]\n"
- "ld1 { v22.b }[10], [x20]\n"
+ "ld1 { v24.b }[10], [x20]\n"
"b 13f\n"
"8:" // odd_loads_1_8
"mov x19, #0x2\n"
"tbz %x[width], #0, 13f\n"
- "ld1 { v29.b }[8], [x27]\n"
- "ld1 { v28.b }[8], [x26]\n"
- "ld1 { v27.b }[8], [x25]\n"
+ "ld1 { v28.b }[8], [x27]\n"
+ "ld1 { v29.b }[8], [x26]\n"
+ "mov x19, #0x3\n"
+ "ld1 { v25.b }[8], [x25]\n"
"ld1 { v21.b }[8], [x24]\n"
- "ld1 { v26.b }[8], [x23]\n"
- "ld1 { v25.b }[8], [x22]\n"
+ "ld1 { v27.b }[8], [x23]\n"
+ "ld1 { v26.b }[8], [x22]\n"
"ld1 { v19.b }[8], [x21]\n"
- "ld1 { v22.b }[8], [x20]\n"
- "mov x19, #0x3\n"
+ "ld1 { v24.b }[8], [x20]\n"
"b 13f\n"
"9:" // odd_loads_4_0
"tbz %x[width], #2, 11f\n"
- "ldr s29, [x27], #0x4\n"
- "ldr s28, [x26], #0x4\n"
- "ldr s27, [x25], #0x4\n"
+ "ldr s28, [x27], #0x4\n"
+ "ldr s29, [x26], #0x4\n"
+ "ldr s25, [x25], #0x4\n"
"ldr s21, [x24], #0x4\n"
- "ldr s26, [x23], #0x4\n"
- "ldr s25, [x22], #0x4\n"
+ "ldr s27, [x23], #0x4\n"
+ "ldr s26, [x22], #0x4\n"
"ldr s19, [x21], #0x4\n"
- "ldr s22, [x20], #0x4\n"
+ "ldr s24, [x20], #0x4\n"
"tbz %x[width], #1, 10f\n"
- "ld1 { v29.h }[2], [x27], #0x2\n"
- "ld1 { v28.h }[2], [x26], #0x2\n"
- "ld1 { v27.h }[2], [x25], #0x2\n"
+ "ld1 { v28.h }[2], [x27], #0x2\n"
+ "mov x19, #0x2\n"
+ "ld1 { v29.h }[2], [x26], #0x2\n"
+ "ld1 { v25.h }[2], [x25], #0x2\n"
"ld1 { v21.h }[2], [x24], #0x2\n"
- "ld1 { v26.h }[2], [x23], #0x2\n"
- "ld1 { v25.h }[2], [x22], #0x2\n"
+ "ld1 { v27.h }[2], [x23], #0x2\n"
+ "ld1 { v26.h }[2], [x22], #0x2\n"
"ld1 { v19.h }[2], [x21], #0x2\n"
- "ld1 { v22.h }[2], [x20], #0x2\n"
- "mov x19, #0x2\n"
+ "ld1 { v24.h }[2], [x20], #0x2\n"
"tbz %x[width], #0, 13f\n"
- "ld1 { v29.b }[6], [x27]\n"
- "ld1 { v28.b }[6], [x26]\n"
- "ld1 { v27.b }[6], [x25]\n"
+ "ld1 { v28.b }[6], [x27]\n"
+ "ld1 { v29.b }[6], [x26]\n"
+ "ld1 { v25.b }[6], [x25]\n"
"ld1 { v21.b }[6], [x24]\n"
- "ld1 { v26.b }[6], [x23]\n"
- "ld1 { v25.b }[6], [x22]\n"
+ "ld1 { v27.b }[6], [x23]\n"
+ "ld1 { v26.b }[6], [x22]\n"
"ld1 { v19.b }[6], [x21]\n"
- "ld1 { v22.b }[6], [x20]\n"
+ "ld1 { v24.b }[6], [x20]\n"
"b 13f\n"
"10:" // odd_loads_1_4
"mov x19, #0x1\n"
"tbz %x[width], #0, 13f\n"
- "ld1 { v29.b }[4], [x27]\n"
- "ld1 { v28.b }[4], [x26]\n"
- "ld1 { v27.b }[4], [x25]\n"
+ "ld1 { v28.b }[4], [x27]\n"
+ "ld1 { v29.b }[4], [x26]\n"
+ "mov x19, #0x2\n"
+ "ld1 { v25.b }[4], [x25]\n"
"ld1 { v21.b }[4], [x24]\n"
- "ld1 { v26.b }[4], [x23]\n"
- "ld1 { v25.b }[4], [x22]\n"
+ "ld1 { v27.b }[4], [x23]\n"
+ "ld1 { v26.b }[4], [x22]\n"
"ld1 { v19.b }[4], [x21]\n"
- "ld1 { v22.b }[4], [x20]\n"
- "mov x19, #0x2\n"
+ "ld1 { v24.b }[4], [x20]\n"
"b 13f\n"
"11:" // odd_loads_2_0
"tbz %x[width], #1, 12f\n"
- "ldr h29, [x27], #0x2\n"
- "ldr h28, [x26], #0x2\n"
- "ldr h27, [x25], #0x2\n"
+ "ldr h28, [x27], #0x2\n"
+ "ldr h29, [x26], #0x2\n"
+ "mov x19, #0x1\n"
+ "ldr h25, [x25], #0x2\n"
"ldr h21, [x24], #0x2\n"
- "ldr h26, [x23], #0x2\n"
- "ldr h25, [x22], #0x2\n"
+ "ldr h27, [x23], #0x2\n"
+ "ldr h26, [x22], #0x2\n"
"ldr h19, [x21], #0x2\n"
- "ldr h22, [x20], #0x2\n"
- "mov x19, #0x1\n"
+ "ldr h24, [x20], #0x2\n"
"tbz %x[width], #0, 13f\n"
- "ld1 { v29.b }[2], [x27]\n"
- "ld1 { v28.b }[2], [x26]\n"
- "ld1 { v27.b }[2], [x25]\n"
+ "ld1 { v28.b }[2], [x27]\n"
+ "ld1 { v29.b }[2], [x26]\n"
+ "ld1 { v25.b }[2], [x25]\n"
"ld1 { v21.b }[2], [x24]\n"
- "ld1 { v26.b }[2], [x23]\n"
- "ld1 { v25.b }[2], [x22]\n"
+ "ld1 { v27.b }[2], [x23]\n"
+ "ld1 { v26.b }[2], [x22]\n"
"ld1 { v19.b }[2], [x21]\n"
- "ld1 { v22.b }[2], [x20]\n"
+ "ld1 { v24.b }[2], [x20]\n"
"b 13f\n"
"12:" // odd_loads_1_0
- "ldr b29, [x27, #0x0]\n"
- "ldr b28, [x26, #0x0]\n"
- "ldr b27, [x25, #0x0]\n"
+ "ldr b28, [x27, #0x0]\n"
+ "mov x19, #0x1\n"
+ "ldr b29, [x26, #0x0]\n"
+ "ldr b25, [x25, #0x0]\n"
"ldr b21, [x24, #0x0]\n"
- "ldr b26, [x23, #0x0]\n"
- "ldr b25, [x22, #0x0]\n"
+ "ldr b27, [x23, #0x0]\n"
+ "ldr b26, [x22, #0x0]\n"
"ldr b19, [x21, #0x0]\n"
- "ldr b22, [x20, #0x0]\n"
- "mov x19, #0x1\n"
+ "ldr b24, [x20, #0x0]\n"
"13:" // Odd load end
- "zip1 v23.4s, v29.4s, v27.4s\n"
+ "zip1 v22.4s, v28.4s, v25.4s\n"
"subs x19, x19, #0x1\n"
- "zip1 v20.4s, v28.4s, v21.4s\n"
- "zip1 v16.4s, v23.4s, v20.4s\n"
- "str q16, [%x[out_ptr], #0x0]\n"
- "zip1 v18.4s, v26.4s, v19.4s\n"
- "uadalp v1.8h, v16.16b\n"
- "zip1 v17.4s, v25.4s, v22.4s\n"
- "zip1 v16.4s, v18.4s, v17.4s\n"
- "str q16, [%x[out_ptr], #0x10]\n"
- "uadalp v0.8h, v16.16b\n"
+ "zip1 v20.4s, v29.4s, v21.4s\n"
+ "zip1 v23.4s, v22.4s, v20.4s\n"
+ "str q23, [%x[out_ptr], #0x0]\n"
+ "uadalp v1.8h, v23.16b\n"
+ "zip1 v18.4s, v27.4s, v19.4s\n"
+ "zip1 v16.4s, v26.4s, v24.4s\n"
+ "zip1 v17.4s, v18.4s, v16.4s\n"
+ "str q17, [%x[out_ptr], #0x10]\n"
+ "uadalp v0.8h, v17.16b\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
"beq 14f\n"
- "zip2 v24.4s, v23.4s, v20.4s\n"
- "zip2 v20.4s, v18.4s, v17.4s\n"
- "str q24, [%x[out_ptr], #0x0]\n"
- "uadalp v1.8h, v24.16b\n"
+ "zip2 v22.4s, v22.4s, v20.4s\n"
+ "str q22, [%x[out_ptr], #0x0]\n"
+ "zip2 v20.4s, v18.4s, v16.4s\n"
+ "uadalp v1.8h, v22.16b\n"
"str q20, [%x[out_ptr], #0x10]\n"
- "uadalp v0.8h, v20.16b\n"
"subs x19, x19, #0x1\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
+ "uadalp v0.8h, v20.16b\n"
"beq 14f\n"
- "zip2 v27.4s, v29.4s, v27.4s\n"
- "zip2 v23.4s, v28.4s, v21.4s\n"
+ "zip2 v28.4s, v28.4s, v25.4s\n"
+ "zip2 v25.4s, v29.4s, v21.4s\n"
"subs x19, x19, #0x1\n"
- "zip1 v21.4s, v27.4s, v23.4s\n"
+ "zip1 v21.4s, v28.4s, v25.4s\n"
"str q21, [%x[out_ptr], #0x0]\n"
- "zip2 v19.4s, v26.4s, v19.4s\n"
"uadalp v1.8h, v21.16b\n"
- "zip2 v16.4s, v25.4s, v22.4s\n"
+ "zip2 v19.4s, v27.4s, v19.4s\n"
+ "zip2 v16.4s, v26.4s, v24.4s\n"
"zip1 v18.4s, v19.4s, v16.4s\n"
"str q18, [%x[out_ptr], #0x10]\n"
"uadalp v0.8h, v18.16b\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
"beq 14f\n"
- "zip2 v17.4s, v27.4s, v23.4s\n"
- "zip2 v16.4s, v19.4s, v16.4s\n"
+ "zip2 v17.4s, v28.4s, v25.4s\n"
"str q17, [%x[out_ptr], #0x0]\n"
+ "zip2 v16.4s, v19.4s, v16.4s\n"
"uadalp v1.8h, v17.16b\n"
"str q16, [%x[out_ptr], #0x10]\n"
- "uadalp v0.8h, v16.16b\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
+ "uadalp v0.8h, v16.16b\n"
"14:" // Odds skip
"uadalp v31.4s, v1.8h\n"
- "uadalp v30.4s, v0.8h\n"
"str q31, [%x[out_ptr], #0x0]\n"
+ "uadalp v30.4s, v0.8h\n"
"str q30, [%x[out_ptr], #0x10]\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
- : [out_ptr] "+r" (out_ptr), [width] "+r" (width)
+ : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width)
: [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
: "cc", "memory", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
);
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8.hpp
index 704a4c9210..1330593cbf 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -80,39 +80,39 @@ void interleave_block<8, 8, VLType::None, false>(
"blt 3f\n"
"2:" // Main loop head
"ldr q27, [x27], #0x10\n"
- "prfm pldl1keep, [x27, #0x70]\n"
+ "subs %x[width], %x[width], #0x10\n"
"ldr q24, [x26], #0x10\n"
"zip1 v26.2d, v27.2d, v24.2d\n"
- "prfm pldl1keep, [x26, #0x70]\n"
"ldr q25, [x25], #0x10\n"
+ "cmp %x[width], #0x10\n"
"zip2 v24.2d, v27.2d, v24.2d\n"
- "prfm pldl1keep, [x25, #0x70]\n"
"ldr q21, [x24], #0x10\n"
- "zip1 v23.2d, v25.2d, v21.2d\n"
- "prfm pldl1keep, [x24, #0x70]\n"
- "ldr q22, [x23], #0x10\n"
+ "ldr q23, [x23], #0x10\n"
+ "zip1 v22.2d, v25.2d, v21.2d\n"
+ "ldr q18, [x22], #0x10\n"
"zip2 v21.2d, v25.2d, v21.2d\n"
+ "ldr q20, [x21], #0x10\n"
+ "ldr q16, [x20], #0x10\n"
+ "zip1 v19.2d, v23.2d, v18.2d\n"
+ "prfm pldl1keep, [x27, #0x70]\n"
+ "zip2 v18.2d, v23.2d, v18.2d\n"
+ "prfm pldl1keep, [x26, #0x70]\n"
+ "zip1 v17.2d, v20.2d, v16.2d\n"
+ "prfm pldl1keep, [x25, #0x70]\n"
+ "zip2 v16.2d, v20.2d, v16.2d\n"
+ "prfm pldl1keep, [x24, #0x70]\n"
"prfm pldl1keep, [x23, #0x70]\n"
- "ldr q18, [x22], #0x10\n"
- "zip1 v20.2d, v22.2d, v18.2d\n"
"prfm pldl1keep, [x22, #0x70]\n"
- "ldr q19, [x21], #0x10\n"
- "zip2 v18.2d, v22.2d, v18.2d\n"
"prfm pldl1keep, [x21, #0x70]\n"
- "ldr q16, [x20], #0x10\n"
- "zip1 v17.2d, v19.2d, v16.2d\n"
"prfm pldl1keep, [x20, #0x70]\n"
"str q26, [%x[out_ptr], #0x0]\n"
- "zip2 v16.2d, v19.2d, v16.2d\n"
- "str q23, [%x[out_ptr], #0x10]\n"
- "str q20, [%x[out_ptr], #0x20]\n"
+ "str q22, [%x[out_ptr], #0x10]\n"
+ "str q19, [%x[out_ptr], #0x20]\n"
"str q17, [%x[out_ptr], #0x30]\n"
"str q24, [%x[out_ptr], #0x40]\n"
"str q21, [%x[out_ptr], #0x50]\n"
"str q18, [%x[out_ptr], #0x60]\n"
"str q16, [%x[out_ptr], #0x70]\n"
- "subs %x[width], %x[width], #0x10\n"
- "cmp %x[width], #0x10\n"
"add %x[out_ptr], %x[out_ptr], #0x80\n"
"bge 2b\n"
"3:" // Main loop skip
@@ -122,37 +122,37 @@ void interleave_block<8, 8, VLType::None, false>(
"ldr d24, [x26], #0x8\n"
"ldr d25, [x25], #0x8\n"
"ldr d21, [x24], #0x8\n"
- "ldr d22, [x23], #0x8\n"
+ "ldr d23, [x23], #0x8\n"
"ldr d18, [x22], #0x8\n"
- "ldr d19, [x21], #0x8\n"
+ "ldr d20, [x21], #0x8\n"
"ldr d16, [x20], #0x8\n"
"tbz %x[width], #2, 5f\n"
"ld1 { v27.s }[2], [x27], #0x4\n"
"ld1 { v24.s }[2], [x26], #0x4\n"
"ld1 { v25.s }[2], [x25], #0x4\n"
"ld1 { v21.s }[2], [x24], #0x4\n"
- "ld1 { v22.s }[2], [x23], #0x4\n"
+ "ld1 { v23.s }[2], [x23], #0x4\n"
"ld1 { v18.s }[2], [x22], #0x4\n"
- "ld1 { v19.s }[2], [x21], #0x4\n"
+ "ld1 { v20.s }[2], [x21], #0x4\n"
"ld1 { v16.s }[2], [x20], #0x4\n"
"tbz %x[width], #1, 4f\n"
"ld1 { v27.h }[6], [x27], #0x2\n"
+ "mov x19, #0x2\n"
"ld1 { v24.h }[6], [x26], #0x2\n"
"ld1 { v25.h }[6], [x25], #0x2\n"
"ld1 { v21.h }[6], [x24], #0x2\n"
- "ld1 { v22.h }[6], [x23], #0x2\n"
+ "ld1 { v23.h }[6], [x23], #0x2\n"
"ld1 { v18.h }[6], [x22], #0x2\n"
- "ld1 { v19.h }[6], [x21], #0x2\n"
+ "ld1 { v20.h }[6], [x21], #0x2\n"
"ld1 { v16.h }[6], [x20], #0x2\n"
- "mov x19, #0x2\n"
"tbz %x[width], #0, 11f\n"
"ld1 { v27.b }[14], [x27]\n"
"ld1 { v24.b }[14], [x26]\n"
"ld1 { v25.b }[14], [x25]\n"
"ld1 { v21.b }[14], [x24]\n"
- "ld1 { v22.b }[14], [x23]\n"
+ "ld1 { v23.b }[14], [x23]\n"
"ld1 { v18.b }[14], [x22]\n"
- "ld1 { v19.b }[14], [x21]\n"
+ "ld1 { v20.b }[14], [x21]\n"
"ld1 { v16.b }[14], [x20]\n"
"b 11f\n"
"4:" // odd_loads_1_12
@@ -162,30 +162,30 @@ void interleave_block<8, 8, VLType::None, false>(
"ld1 { v24.b }[12], [x26]\n"
"ld1 { v25.b }[12], [x25]\n"
"ld1 { v21.b }[12], [x24]\n"
- "ld1 { v22.b }[12], [x23]\n"
+ "ld1 { v23.b }[12], [x23]\n"
"ld1 { v18.b }[12], [x22]\n"
- "ld1 { v19.b }[12], [x21]\n"
+ "ld1 { v20.b }[12], [x21]\n"
"ld1 { v16.b }[12], [x20]\n"
"b 11f\n"
"5:" // odd_loads_2_8
"tbz %x[width], #1, 6f\n"
"ld1 { v27.h }[4], [x27], #0x2\n"
"ld1 { v24.h }[4], [x26], #0x2\n"
+ "mov x19, #0x2\n"
"ld1 { v25.h }[4], [x25], #0x2\n"
"ld1 { v21.h }[4], [x24], #0x2\n"
- "ld1 { v22.h }[4], [x23], #0x2\n"
+ "ld1 { v23.h }[4], [x23], #0x2\n"
"ld1 { v18.h }[4], [x22], #0x2\n"
- "ld1 { v19.h }[4], [x21], #0x2\n"
+ "ld1 { v20.h }[4], [x21], #0x2\n"
"ld1 { v16.h }[4], [x20], #0x2\n"
- "mov x19, #0x2\n"
"tbz %x[width], #0, 11f\n"
"ld1 { v27.b }[10], [x27]\n"
"ld1 { v24.b }[10], [x26]\n"
"ld1 { v25.b }[10], [x25]\n"
"ld1 { v21.b }[10], [x24]\n"
- "ld1 { v22.b }[10], [x23]\n"
+ "ld1 { v23.b }[10], [x23]\n"
"ld1 { v18.b }[10], [x22]\n"
- "ld1 { v19.b }[10], [x21]\n"
+ "ld1 { v20.b }[10], [x21]\n"
"ld1 { v16.b }[10], [x20]\n"
"b 11f\n"
"6:" // odd_loads_1_8
@@ -193,13 +193,13 @@ void interleave_block<8, 8, VLType::None, false>(
"tbz %x[width], #0, 11f\n"
"ld1 { v27.b }[8], [x27]\n"
"ld1 { v24.b }[8], [x26]\n"
+ "mov x19, #0x2\n"
"ld1 { v25.b }[8], [x25]\n"
"ld1 { v21.b }[8], [x24]\n"
- "ld1 { v22.b }[8], [x23]\n"
+ "ld1 { v23.b }[8], [x23]\n"
"ld1 { v18.b }[8], [x22]\n"
- "ld1 { v19.b }[8], [x21]\n"
+ "ld1 { v20.b }[8], [x21]\n"
"ld1 { v16.b }[8], [x20]\n"
- "mov x19, #0x2\n"
"b 11f\n"
"7:" // odd_loads_4_0
"tbz %x[width], #2, 9f\n"
@@ -207,28 +207,28 @@ void interleave_block<8, 8, VLType::None, false>(
"ldr s24, [x26], #0x4\n"
"ldr s25, [x25], #0x4\n"
"ldr s21, [x24], #0x4\n"
- "ldr s22, [x23], #0x4\n"
+ "ldr s23, [x23], #0x4\n"
"ldr s18, [x22], #0x4\n"
- "ldr s19, [x21], #0x4\n"
+ "ldr s20, [x21], #0x4\n"
"ldr s16, [x20], #0x4\n"
"tbz %x[width], #1, 8f\n"
"ld1 { v27.h }[2], [x27], #0x2\n"
+ "mov x19, #0x1\n"
"ld1 { v24.h }[2], [x26], #0x2\n"
"ld1 { v25.h }[2], [x25], #0x2\n"
"ld1 { v21.h }[2], [x24], #0x2\n"
- "ld1 { v22.h }[2], [x23], #0x2\n"
+ "ld1 { v23.h }[2], [x23], #0x2\n"
"ld1 { v18.h }[2], [x22], #0x2\n"
- "ld1 { v19.h }[2], [x21], #0x2\n"
+ "ld1 { v20.h }[2], [x21], #0x2\n"
"ld1 { v16.h }[2], [x20], #0x2\n"
- "mov x19, #0x1\n"
"tbz %x[width], #0, 11f\n"
"ld1 { v27.b }[6], [x27]\n"
"ld1 { v24.b }[6], [x26]\n"
"ld1 { v25.b }[6], [x25]\n"
"ld1 { v21.b }[6], [x24]\n"
- "ld1 { v22.b }[6], [x23]\n"
+ "ld1 { v23.b }[6], [x23]\n"
"ld1 { v18.b }[6], [x22]\n"
- "ld1 { v19.b }[6], [x21]\n"
+ "ld1 { v20.b }[6], [x21]\n"
"ld1 { v16.b }[6], [x20]\n"
"b 11f\n"
"8:" // odd_loads_1_4
@@ -238,66 +238,66 @@ void interleave_block<8, 8, VLType::None, false>(
"ld1 { v24.b }[4], [x26]\n"
"ld1 { v25.b }[4], [x25]\n"
"ld1 { v21.b }[4], [x24]\n"
- "ld1 { v22.b }[4], [x23]\n"
+ "ld1 { v23.b }[4], [x23]\n"
"ld1 { v18.b }[4], [x22]\n"
- "ld1 { v19.b }[4], [x21]\n"
+ "ld1 { v20.b }[4], [x21]\n"
"ld1 { v16.b }[4], [x20]\n"
"b 11f\n"
"9:" // odd_loads_2_0
"tbz %x[width], #1, 10f\n"
"ldr h27, [x27], #0x2\n"
"ldr h24, [x26], #0x2\n"
+ "mov x19, #0x1\n"
"ldr h25, [x25], #0x2\n"
"ldr h21, [x24], #0x2\n"
- "ldr h22, [x23], #0x2\n"
+ "ldr h23, [x23], #0x2\n"
"ldr h18, [x22], #0x2\n"
- "ldr h19, [x21], #0x2\n"
+ "ldr h20, [x21], #0x2\n"
"ldr h16, [x20], #0x2\n"
- "mov x19, #0x1\n"
"tbz %x[width], #0, 11f\n"
"ld1 { v27.b }[2], [x27]\n"
"ld1 { v24.b }[2], [x26]\n"
"ld1 { v25.b }[2], [x25]\n"
"ld1 { v21.b }[2], [x24]\n"
- "ld1 { v22.b }[2], [x23]\n"
+ "ld1 { v23.b }[2], [x23]\n"
"ld1 { v18.b }[2], [x22]\n"
- "ld1 { v19.b }[2], [x21]\n"
+ "ld1 { v20.b }[2], [x21]\n"
"ld1 { v16.b }[2], [x20]\n"
"b 11f\n"
"10:" // odd_loads_1_0
"ldr b27, [x27, #0x0]\n"
+ "mov x19, #0x1\n"
"ldr b24, [x26, #0x0]\n"
"ldr b25, [x25, #0x0]\n"
"ldr b21, [x24, #0x0]\n"
- "ldr b22, [x23, #0x0]\n"
+ "ldr b23, [x23, #0x0]\n"
"ldr b18, [x22, #0x0]\n"
- "ldr b19, [x21, #0x0]\n"
+ "ldr b20, [x21, #0x0]\n"
"ldr b16, [x20, #0x0]\n"
- "mov x19, #0x1\n"
"11:" // Odd load end
"zip1 v26.2d, v27.2d, v24.2d\n"
- "subs x19, x19, #0x1\n"
- "zip1 v23.2d, v25.2d, v21.2d\n"
"str q26, [%x[out_ptr], #0x0]\n"
- "zip1 v20.2d, v22.2d, v18.2d\n"
- "str q23, [%x[out_ptr], #0x10]\n"
- "zip1 v17.2d, v19.2d, v16.2d\n"
- "str q20, [%x[out_ptr], #0x20]\n"
+ "zip1 v22.2d, v25.2d, v21.2d\n"
+ "subs x19, x19, #0x1\n"
+ "zip1 v19.2d, v23.2d, v18.2d\n"
+ "str q22, [%x[out_ptr], #0x10]\n"
+ "zip1 v17.2d, v20.2d, v16.2d\n"
+ "str q19, [%x[out_ptr], #0x20]\n"
"str q17, [%x[out_ptr], #0x30]\n"
"add %x[out_ptr], %x[out_ptr], #0x40\n"
"beq 12f\n"
"zip2 v24.2d, v27.2d, v24.2d\n"
- "zip2 v21.2d, v25.2d, v21.2d\n"
"str q24, [%x[out_ptr], #0x0]\n"
- "zip2 v18.2d, v22.2d, v18.2d\n"
+ "zip2 v21.2d, v25.2d, v21.2d\n"
+ "zip2 v18.2d, v23.2d, v18.2d\n"
"str q21, [%x[out_ptr], #0x10]\n"
- "zip2 v16.2d, v19.2d, v16.2d\n"
+ "zip2 v16.2d, v20.2d, v16.2d\n"
"str q18, [%x[out_ptr], #0x20]\n"
"str q16, [%x[out_ptr], #0x30]\n"
"add %x[out_ptr], %x[out_ptr], #0x40\n"
"12:" // Odds skip
- : [out_ptr] "+r" (out_ptr), [width] "+r" (width)
+ : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width)
: [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
: "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
);
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8_summing.hpp
index 2317ece790..3550830fc3 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8_summing.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8_summing.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -108,48 +108,48 @@ void interleave_block<8, 8, VLType::None, true>(
"mov x19, #0x0\n"
"4:" // no_accumulate_16
"ldr q27, [x27], #0x10\n"
- "prfm pldl1keep, [x27, #0x70]\n"
+ "add x19, x19, #0x1\n"
"ldr q24, [x26], #0x10\n"
"zip1 v26.2d, v27.2d, v24.2d\n"
- "prfm pldl1keep, [x26, #0x70]\n"
"ldr q25, [x25], #0x10\n"
+ "subs %x[width], %x[width], #0x10\n"
"zip2 v24.2d, v27.2d, v24.2d\n"
- "prfm pldl1keep, [x25, #0x70]\n"
"ldr q21, [x24], #0x10\n"
+ "sadalp v5.8h, v26.16b\n"
"zip1 v23.2d, v25.2d, v21.2d\n"
- "prfm pldl1keep, [x24, #0x70]\n"
"ldr q22, [x23], #0x10\n"
+ "cmp %x[width], #0x10\n"
"zip2 v21.2d, v25.2d, v21.2d\n"
- "prfm pldl1keep, [x23, #0x70]\n"
"ldr q18, [x22], #0x10\n"
+ "sadalp v4.8h, v23.16b\n"
"zip1 v20.2d, v22.2d, v18.2d\n"
- "prfm pldl1keep, [x22, #0x70]\n"
"ldr q19, [x21], #0x10\n"
+ "sadalp v5.8h, v24.16b\n"
"zip2 v18.2d, v22.2d, v18.2d\n"
- "prfm pldl1keep, [x21, #0x70]\n"
"ldr q16, [x20], #0x10\n"
+ "sadalp v3.8h, v20.16b\n"
"zip1 v17.2d, v19.2d, v16.2d\n"
+ "prfm pldl1keep, [x27, #0x70]\n"
+ "sadalp v4.8h, v21.16b\n"
+ "zip2 v16.2d, v19.2d, v16.2d\n"
+ "prfm pldl1keep, [x26, #0x70]\n"
+ "sadalp v2.8h, v17.16b\n"
+ "prfm pldl1keep, [x25, #0x70]\n"
+ "sadalp v3.8h, v18.16b\n"
+ "prfm pldl1keep, [x24, #0x70]\n"
+ "sadalp v2.8h, v16.16b\n"
+ "prfm pldl1keep, [x23, #0x70]\n"
+ "prfm pldl1keep, [x22, #0x70]\n"
+ "prfm pldl1keep, [x21, #0x70]\n"
"prfm pldl1keep, [x20, #0x70]\n"
"str q26, [%x[out_ptr], #0x0]\n"
- "zip2 v16.2d, v19.2d, v16.2d\n"
- "sadalp v5.8h, v26.16b\n"
"str q23, [%x[out_ptr], #0x10]\n"
- "sadalp v4.8h, v23.16b\n"
"str q20, [%x[out_ptr], #0x20]\n"
- "sadalp v3.8h, v20.16b\n"
"str q17, [%x[out_ptr], #0x30]\n"
- "sadalp v2.8h, v17.16b\n"
"str q24, [%x[out_ptr], #0x40]\n"
- "sadalp v5.8h, v24.16b\n"
"str q21, [%x[out_ptr], #0x50]\n"
- "sadalp v4.8h, v21.16b\n"
"str q18, [%x[out_ptr], #0x60]\n"
- "sadalp v3.8h, v18.16b\n"
"str q16, [%x[out_ptr], #0x70]\n"
- "sadalp v2.8h, v16.16b\n"
- "add x19, x19, #0x1\n"
- "subs %x[width], %x[width], #0x10\n"
- "cmp %x[width], #0x10\n"
"add %x[out_ptr], %x[out_ptr], #0x80\n"
"bge 3b\n"
"5:" // Main loop skip
@@ -174,6 +174,7 @@ void interleave_block<8, 8, VLType::None, true>(
"ld1 { v16.s }[2], [x20], #0x4\n"
"tbz %x[width], #1, 6f\n"
"ld1 { v27.h }[6], [x27], #0x2\n"
+ "mov x19, #0x2\n"
"ld1 { v24.h }[6], [x26], #0x2\n"
"ld1 { v25.h }[6], [x25], #0x2\n"
"ld1 { v21.h }[6], [x24], #0x2\n"
@@ -181,7 +182,6 @@ void interleave_block<8, 8, VLType::None, true>(
"ld1 { v18.h }[6], [x22], #0x2\n"
"ld1 { v19.h }[6], [x21], #0x2\n"
"ld1 { v16.h }[6], [x20], #0x2\n"
- "mov x19, #0x2\n"
"tbz %x[width], #0, 13f\n"
"ld1 { v27.b }[14], [x27]\n"
"ld1 { v24.b }[14], [x26]\n"
@@ -208,13 +208,13 @@ void interleave_block<8, 8, VLType::None, true>(
"tbz %x[width], #1, 8f\n"
"ld1 { v27.h }[4], [x27], #0x2\n"
"ld1 { v24.h }[4], [x26], #0x2\n"
+ "mov x19, #0x2\n"
"ld1 { v25.h }[4], [x25], #0x2\n"
"ld1 { v21.h }[4], [x24], #0x2\n"
"ld1 { v22.h }[4], [x23], #0x2\n"
"ld1 { v18.h }[4], [x22], #0x2\n"
"ld1 { v19.h }[4], [x21], #0x2\n"
"ld1 { v16.h }[4], [x20], #0x2\n"
- "mov x19, #0x2\n"
"tbz %x[width], #0, 13f\n"
"ld1 { v27.b }[10], [x27]\n"
"ld1 { v24.b }[10], [x26]\n"
@@ -230,13 +230,13 @@ void interleave_block<8, 8, VLType::None, true>(
"tbz %x[width], #0, 13f\n"
"ld1 { v27.b }[8], [x27]\n"
"ld1 { v24.b }[8], [x26]\n"
+ "mov x19, #0x2\n"
"ld1 { v25.b }[8], [x25]\n"
"ld1 { v21.b }[8], [x24]\n"
"ld1 { v22.b }[8], [x23]\n"
"ld1 { v18.b }[8], [x22]\n"
"ld1 { v19.b }[8], [x21]\n"
"ld1 { v16.b }[8], [x20]\n"
- "mov x19, #0x2\n"
"b 13f\n"
"9:" // odd_loads_4_0
"tbz %x[width], #2, 11f\n"
@@ -250,6 +250,7 @@ void interleave_block<8, 8, VLType::None, true>(
"ldr s16, [x20], #0x4\n"
"tbz %x[width], #1, 10f\n"
"ld1 { v27.h }[2], [x27], #0x2\n"
+ "mov x19, #0x1\n"
"ld1 { v24.h }[2], [x26], #0x2\n"
"ld1 { v25.h }[2], [x25], #0x2\n"
"ld1 { v21.h }[2], [x24], #0x2\n"
@@ -257,7 +258,6 @@ void interleave_block<8, 8, VLType::None, true>(
"ld1 { v18.h }[2], [x22], #0x2\n"
"ld1 { v19.h }[2], [x21], #0x2\n"
"ld1 { v16.h }[2], [x20], #0x2\n"
- "mov x19, #0x1\n"
"tbz %x[width], #0, 13f\n"
"ld1 { v27.b }[6], [x27]\n"
"ld1 { v24.b }[6], [x26]\n"
@@ -284,13 +284,13 @@ void interleave_block<8, 8, VLType::None, true>(
"tbz %x[width], #1, 12f\n"
"ldr h27, [x27], #0x2\n"
"ldr h24, [x26], #0x2\n"
+ "mov x19, #0x1\n"
"ldr h25, [x25], #0x2\n"
"ldr h21, [x24], #0x2\n"
"ldr h22, [x23], #0x2\n"
"ldr h18, [x22], #0x2\n"
"ldr h19, [x21], #0x2\n"
"ldr h16, [x20], #0x2\n"
- "mov x19, #0x1\n"
"tbz %x[width], #0, 13f\n"
"ld1 { v27.b }[2], [x27]\n"
"ld1 { v24.b }[2], [x26]\n"
@@ -303,6 +303,7 @@ void interleave_block<8, 8, VLType::None, true>(
"b 13f\n"
"12:" // odd_loads_1_0
"ldr b27, [x27, #0x0]\n"
+ "mov x19, #0x1\n"
"ldr b24, [x26, #0x0]\n"
"ldr b25, [x25, #0x0]\n"
"ldr b21, [x24, #0x0]\n"
@@ -310,31 +311,30 @@ void interleave_block<8, 8, VLType::None, true>(
"ldr b18, [x22, #0x0]\n"
"ldr b19, [x21, #0x0]\n"
"ldr b16, [x20, #0x0]\n"
- "mov x19, #0x1\n"
"13:" // Odd load end
"zip1 v26.2d, v27.2d, v24.2d\n"
- "subs x19, x19, #0x1\n"
- "zip1 v23.2d, v25.2d, v21.2d\n"
"str q26, [%x[out_ptr], #0x0]\n"
- "zip1 v20.2d, v22.2d, v18.2d\n"
+ "zip1 v23.2d, v25.2d, v21.2d\n"
"sadalp v5.8h, v26.16b\n"
- "zip1 v17.2d, v19.2d, v16.2d\n"
+ "zip1 v20.2d, v22.2d, v18.2d\n"
"str q23, [%x[out_ptr], #0x10]\n"
"sadalp v4.8h, v23.16b\n"
+ "zip1 v17.2d, v19.2d, v16.2d\n"
"str q20, [%x[out_ptr], #0x20]\n"
"sadalp v3.8h, v20.16b\n"
"str q17, [%x[out_ptr], #0x30]\n"
"sadalp v2.8h, v17.16b\n"
+ "subs x19, x19, #0x1\n"
"add %x[out_ptr], %x[out_ptr], #0x40\n"
"beq 14f\n"
"zip2 v24.2d, v27.2d, v24.2d\n"
- "zip2 v21.2d, v25.2d, v21.2d\n"
"str q24, [%x[out_ptr], #0x0]\n"
- "zip2 v18.2d, v22.2d, v18.2d\n"
+ "zip2 v21.2d, v25.2d, v21.2d\n"
"sadalp v5.8h, v24.16b\n"
- "zip2 v16.2d, v19.2d, v16.2d\n"
+ "zip2 v18.2d, v22.2d, v18.2d\n"
"str q21, [%x[out_ptr], #0x10]\n"
"sadalp v4.8h, v21.16b\n"
+ "zip2 v16.2d, v19.2d, v16.2d\n"
"str q18, [%x[out_ptr], #0x20]\n"
"sadalp v3.8h, v18.16b\n"
"str q16, [%x[out_ptr], #0x30]\n"
@@ -352,7 +352,7 @@ void interleave_block<8, 8, VLType::None, true>(
"add v0.4s, v0.4s, v28.4s\n"
"str q0, [%x[out_ptr], #0x10]\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
- : [out_ptr] "+r" (out_ptr), [width] "+r" (width)
+ : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width)
: [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
);
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_u8_u8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_u8_u8_summing.hpp
index 07164d6b24..454260ef1a 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_u8_u8_summing.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_u8_u8_summing.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -108,48 +108,48 @@ void interleave_block<8, 8, VLType::None, true>(
"mov x19, #0x0\n"
"4:" // no_accumulate_16
"ldr q27, [x27], #0x10\n"
- "prfm pldl1keep, [x27, #0x70]\n"
+ "add x19, x19, #0x1\n"
"ldr q24, [x26], #0x10\n"
"zip1 v26.2d, v27.2d, v24.2d\n"
- "prfm pldl1keep, [x26, #0x70]\n"
"ldr q25, [x25], #0x10\n"
+ "subs %x[width], %x[width], #0x10\n"
"zip2 v24.2d, v27.2d, v24.2d\n"
- "prfm pldl1keep, [x25, #0x70]\n"
"ldr q21, [x24], #0x10\n"
+ "uadalp v5.8h, v26.16b\n"
"zip1 v23.2d, v25.2d, v21.2d\n"
- "prfm pldl1keep, [x24, #0x70]\n"
"ldr q22, [x23], #0x10\n"
+ "cmp %x[width], #0x10\n"
"zip2 v21.2d, v25.2d, v21.2d\n"
- "prfm pldl1keep, [x23, #0x70]\n"
"ldr q18, [x22], #0x10\n"
+ "uadalp v4.8h, v23.16b\n"
"zip1 v20.2d, v22.2d, v18.2d\n"
- "prfm pldl1keep, [x22, #0x70]\n"
"ldr q19, [x21], #0x10\n"
+ "uadalp v5.8h, v24.16b\n"
"zip2 v18.2d, v22.2d, v18.2d\n"
- "prfm pldl1keep, [x21, #0x70]\n"
"ldr q16, [x20], #0x10\n"
+ "uadalp v3.8h, v20.16b\n"
"zip1 v17.2d, v19.2d, v16.2d\n"
+ "prfm pldl1keep, [x27, #0x70]\n"
+ "uadalp v4.8h, v21.16b\n"
+ "zip2 v16.2d, v19.2d, v16.2d\n"
+ "prfm pldl1keep, [x26, #0x70]\n"
+ "uadalp v2.8h, v17.16b\n"
+ "prfm pldl1keep, [x25, #0x70]\n"
+ "uadalp v3.8h, v18.16b\n"
+ "prfm pldl1keep, [x24, #0x70]\n"
+ "uadalp v2.8h, v16.16b\n"
+ "prfm pldl1keep, [x23, #0x70]\n"
+ "prfm pldl1keep, [x22, #0x70]\n"
+ "prfm pldl1keep, [x21, #0x70]\n"
"prfm pldl1keep, [x20, #0x70]\n"
"str q26, [%x[out_ptr], #0x0]\n"
- "zip2 v16.2d, v19.2d, v16.2d\n"
- "uadalp v5.8h, v26.16b\n"
"str q23, [%x[out_ptr], #0x10]\n"
- "uadalp v4.8h, v23.16b\n"
"str q20, [%x[out_ptr], #0x20]\n"
- "uadalp v3.8h, v20.16b\n"
"str q17, [%x[out_ptr], #0x30]\n"
- "uadalp v2.8h, v17.16b\n"
"str q24, [%x[out_ptr], #0x40]\n"
- "uadalp v5.8h, v24.16b\n"
"str q21, [%x[out_ptr], #0x50]\n"
- "uadalp v4.8h, v21.16b\n"
"str q18, [%x[out_ptr], #0x60]\n"
- "uadalp v3.8h, v18.16b\n"
"str q16, [%x[out_ptr], #0x70]\n"
- "uadalp v2.8h, v16.16b\n"
- "add x19, x19, #0x1\n"
- "subs %x[width], %x[width], #0x10\n"
- "cmp %x[width], #0x10\n"
"add %x[out_ptr], %x[out_ptr], #0x80\n"
"bge 3b\n"
"5:" // Main loop skip
@@ -174,6 +174,7 @@ void interleave_block<8, 8, VLType::None, true>(
"ld1 { v16.s }[2], [x20], #0x4\n"
"tbz %x[width], #1, 6f\n"
"ld1 { v27.h }[6], [x27], #0x2\n"
+ "mov x19, #0x2\n"
"ld1 { v24.h }[6], [x26], #0x2\n"
"ld1 { v25.h }[6], [x25], #0x2\n"
"ld1 { v21.h }[6], [x24], #0x2\n"
@@ -181,7 +182,6 @@ void interleave_block<8, 8, VLType::None, true>(
"ld1 { v18.h }[6], [x22], #0x2\n"
"ld1 { v19.h }[6], [x21], #0x2\n"
"ld1 { v16.h }[6], [x20], #0x2\n"
- "mov x19, #0x2\n"
"tbz %x[width], #0, 13f\n"
"ld1 { v27.b }[14], [x27]\n"
"ld1 { v24.b }[14], [x26]\n"
@@ -208,13 +208,13 @@ void interleave_block<8, 8, VLType::None, true>(
"tbz %x[width], #1, 8f\n"
"ld1 { v27.h }[4], [x27], #0x2\n"
"ld1 { v24.h }[4], [x26], #0x2\n"
+ "mov x19, #0x2\n"
"ld1 { v25.h }[4], [x25], #0x2\n"
"ld1 { v21.h }[4], [x24], #0x2\n"
"ld1 { v22.h }[4], [x23], #0x2\n"
"ld1 { v18.h }[4], [x22], #0x2\n"
"ld1 { v19.h }[4], [x21], #0x2\n"
"ld1 { v16.h }[4], [x20], #0x2\n"
- "mov x19, #0x2\n"
"tbz %x[width], #0, 13f\n"
"ld1 { v27.b }[10], [x27]\n"
"ld1 { v24.b }[10], [x26]\n"
@@ -230,13 +230,13 @@ void interleave_block<8, 8, VLType::None, true>(
"tbz %x[width], #0, 13f\n"
"ld1 { v27.b }[8], [x27]\n"
"ld1 { v24.b }[8], [x26]\n"
+ "mov x19, #0x2\n"
"ld1 { v25.b }[8], [x25]\n"
"ld1 { v21.b }[8], [x24]\n"
"ld1 { v22.b }[8], [x23]\n"
"ld1 { v18.b }[8], [x22]\n"
"ld1 { v19.b }[8], [x21]\n"
"ld1 { v16.b }[8], [x20]\n"
- "mov x19, #0x2\n"
"b 13f\n"
"9:" // odd_loads_4_0
"tbz %x[width], #2, 11f\n"
@@ -250,6 +250,7 @@ void interleave_block<8, 8, VLType::None, true>(
"ldr s16, [x20], #0x4\n"
"tbz %x[width], #1, 10f\n"
"ld1 { v27.h }[2], [x27], #0x2\n"
+ "mov x19, #0x1\n"
"ld1 { v24.h }[2], [x26], #0x2\n"
"ld1 { v25.h }[2], [x25], #0x2\n"
"ld1 { v21.h }[2], [x24], #0x2\n"
@@ -257,7 +258,6 @@ void interleave_block<8, 8, VLType::None, true>(
"ld1 { v18.h }[2], [x22], #0x2\n"
"ld1 { v19.h }[2], [x21], #0x2\n"
"ld1 { v16.h }[2], [x20], #0x2\n"
- "mov x19, #0x1\n"
"tbz %x[width], #0, 13f\n"
"ld1 { v27.b }[6], [x27]\n"
"ld1 { v24.b }[6], [x26]\n"
@@ -284,13 +284,13 @@ void interleave_block<8, 8, VLType::None, true>(
"tbz %x[width], #1, 12f\n"
"ldr h27, [x27], #0x2\n"
"ldr h24, [x26], #0x2\n"
+ "mov x19, #0x1\n"
"ldr h25, [x25], #0x2\n"
"ldr h21, [x24], #0x2\n"
"ldr h22, [x23], #0x2\n"
"ldr h18, [x22], #0x2\n"
"ldr h19, [x21], #0x2\n"
"ldr h16, [x20], #0x2\n"
- "mov x19, #0x1\n"
"tbz %x[width], #0, 13f\n"
"ld1 { v27.b }[2], [x27]\n"
"ld1 { v24.b }[2], [x26]\n"
@@ -303,6 +303,7 @@ void interleave_block<8, 8, VLType::None, true>(
"b 13f\n"
"12:" // odd_loads_1_0
"ldr b27, [x27, #0x0]\n"
+ "mov x19, #0x1\n"
"ldr b24, [x26, #0x0]\n"
"ldr b25, [x25, #0x0]\n"
"ldr b21, [x24, #0x0]\n"
@@ -310,31 +311,30 @@ void interleave_block<8, 8, VLType::None, true>(
"ldr b18, [x22, #0x0]\n"
"ldr b19, [x21, #0x0]\n"
"ldr b16, [x20, #0x0]\n"
- "mov x19, #0x1\n"
"13:" // Odd load end
"zip1 v26.2d, v27.2d, v24.2d\n"
- "subs x19, x19, #0x1\n"
- "zip1 v23.2d, v25.2d, v21.2d\n"
"str q26, [%x[out_ptr], #0x0]\n"
- "zip1 v20.2d, v22.2d, v18.2d\n"
+ "zip1 v23.2d, v25.2d, v21.2d\n"
"uadalp v5.8h, v26.16b\n"
- "zip1 v17.2d, v19.2d, v16.2d\n"
+ "zip1 v20.2d, v22.2d, v18.2d\n"
"str q23, [%x[out_ptr], #0x10]\n"
"uadalp v4.8h, v23.16b\n"
+ "zip1 v17.2d, v19.2d, v16.2d\n"
"str q20, [%x[out_ptr], #0x20]\n"
"uadalp v3.8h, v20.16b\n"
"str q17, [%x[out_ptr], #0x30]\n"
"uadalp v2.8h, v17.16b\n"
+ "subs x19, x19, #0x1\n"
"add %x[out_ptr], %x[out_ptr], #0x40\n"
"beq 14f\n"
"zip2 v24.2d, v27.2d, v24.2d\n"
- "zip2 v21.2d, v25.2d, v21.2d\n"
"str q24, [%x[out_ptr], #0x0]\n"
- "zip2 v18.2d, v22.2d, v18.2d\n"
+ "zip2 v21.2d, v25.2d, v21.2d\n"
"uadalp v5.8h, v24.16b\n"
- "zip2 v16.2d, v19.2d, v16.2d\n"
+ "zip2 v18.2d, v22.2d, v18.2d\n"
"str q21, [%x[out_ptr], #0x10]\n"
"uadalp v4.8h, v21.16b\n"
+ "zip2 v16.2d, v19.2d, v16.2d\n"
"str q18, [%x[out_ptr], #0x20]\n"
"uadalp v3.8h, v18.16b\n"
"str q16, [%x[out_ptr], #0x30]\n"
@@ -352,7 +352,7 @@ void interleave_block<8, 8, VLType::None, true>(
"add v0.4s, v0.4s, v28.4s\n"
"str q0, [%x[out_ptr], #0x10]\n"
"add %x[out_ptr], %x[out_ptr], #0x20\n"
- : [out_ptr] "+r" (out_ptr), [width] "+r" (width)
+ : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width)
: [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
);
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24.hpp
index 24e258e4b8..cccedc6b9c 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24.hpp
@@ -62,14 +62,13 @@ public:
// Use the standard fixed size transforms.
StdTransformsFixed<operand_type, result_type, 8, 24> transforms = {};
- static PerformanceParameters get_performance_parameters(const CPUInfo *ci)
- {
+ static PerformanceParameters get_performance_parameters(const CPUInfo *ci) {
switch (ci->get_cpu_model()) {
case CPUModel::A55r1:
- return { 7.16, 1.14, 0.67 };
+ return { 7.16, 1.14, 0.67 };
default:
- return { 12.67, 3.98, 1.16 };
+ return { 12.67, 3.98, 1.16 };
}
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16.hpp
index d390108b11..fca96f6028 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16.hpp
@@ -37,7 +37,6 @@
namespace arm_gemm
{
-
// Actual kernel implementations
void a64_hybrid_bf16fp32_dot_6x16( ARGLIST );
@@ -74,7 +73,6 @@ public:
// Default to the generic kernel
kern_type kernel=a64_hybrid_bf16fp32_dot_6x16;
-
cls_a64_hybrid_bf16fp32_dot_6x16(const CPUInfo *)
{
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16/generic.cpp
index 85944e9f6a..afb06dedea 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16/generic.cpp
@@ -96,238 +96,232 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"1:" // Row loop
"cmp %x[M], #0x6\n"
- "bge 181f\n"
+ "bge 176f\n"
"cmp %x[M], #0x4\n"
- "bgt 145f\n"
- "beq 109f\n"
+ "bgt 141f\n"
+ "beq 106f\n"
"cmp %x[M], #0x2\n"
- "bgt 73f\n"
- "beq 37f\n"
- "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x14, %x[bias]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "tbz %x[flags], #2, 2f\n"
- "ldr x13, [%x[output_ptr], #0x0]\n"
- "add x13, x13, x19, LSL #2\n"
- "b 3f\n"
- "2:" // Height 1: setup direct output
- "mov x13, %x[output_ptr]\n"
- "3:" // Height 1: Column loop
- "cbz x14, 4f\n"
- "ldr q8, [x14, #0x0]\n"
- "ldr q9, [x14, #0x10]\n"
- "ldr q10, [x14, #0x20]\n"
- "ldr q11, [x14, #0x30]\n"
- "add x14, x14, #0x40\n"
- "b 15f\n"
- "4:" // Height 1: no bias
- "tbz %x[flags], #0, 14f\n"
- "cmp x16, #0x10\n"
- "bge 13f\n"
- "tbz x16, #3, 8f\n"
- "ld1 { v8.4s }, [x13], #0x10\n"
- "ld1 { v9.4s }, [x13], #0x10\n"
- "tbz x16, #2, 6f\n"
- "ld1 { v10.4s }, [x13], #0x10\n"
- "tbz x16, #1, 5f\n"
+ "bgt 71f\n"
+ "beq 36f\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[bias]\n"
+ "mov x28, %x[output_ptr]\n"
+ "2:" // Height 1: Column loop
+ "cbz x9, 3f\n"
+ "ldr q8, [x9, #0x0]\n"
+ "ldr q9, [x9, #0x10]\n"
+ "ldr q10, [x9, #0x20]\n"
+ "ldr q11, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
+ "b 14f\n"
+ "3:" // Height 1: no bias
+ "tbz %x[flags], #0, 13f\n"
+ "cmp x11, #0x10\n"
+ "bge 12f\n"
+ "tbz x11, #3, 7f\n"
+ "ld1 { v8.4s }, [x28], #0x10\n"
+ "ld1 { v9.4s }, [x28], #0x10\n"
+ "tbz x11, #2, 5f\n"
+ "ld1 { v10.4s }, [x28], #0x10\n"
+ "tbz x11, #1, 4f\n"
"mov x19, #0x38\n"
- "ldr d11, [x13], #0x8\n"
- "tbz x16, #0, 12f\n"
- "ld1 { v11.s }[2], [x13]\n"
- "b 12f\n"
- "5:" // Height 1: Partial accumulate: partial_1_12
+ "ldr d11, [x28], #0x8\n"
+ "tbz x11, #0, 11f\n"
+ "ld1 { v11.s }[2], [x28]\n"
+ "b 11f\n"
+ "4:" // Height 1: Partial accumulate: partial_1_12
"mov x19, #0x30\n"
- "tbz x16, #0, 12f\n"
- "ldr s11, [x13, #0x0]\n"
- "b 12f\n"
- "6:" // Height 1: Partial accumulate: partial_2_8
- "tbz x16, #1, 7f\n"
- "ldr d10, [x13], #0x8\n"
+ "tbz x11, #0, 11f\n"
+ "ldr s11, [x28, #0x0]\n"
+ "b 11f\n"
+ "5:" // Height 1: Partial accumulate: partial_2_8
+ "tbz x11, #1, 6f\n"
+ "ldr d10, [x28], #0x8\n"
"mov x19, #0x28\n"
- "tbz x16, #0, 12f\n"
- "ld1 { v10.s }[2], [x13]\n"
- "b 12f\n"
- "7:" // Height 1: Partial accumulate: partial_1_8
+ "tbz x11, #0, 11f\n"
+ "ld1 { v10.s }[2], [x28]\n"
+ "b 11f\n"
+ "6:" // Height 1: Partial accumulate: partial_1_8
"mov x19, #0x20\n"
- "tbz x16, #0, 12f\n"
- "ldr s10, [x13, #0x0]\n"
- "b 12f\n"
- "8:" // Height 1: Partial accumulate: partial_4_0
- "tbz x16, #2, 10f\n"
- "ld1 { v8.4s }, [x13], #0x10\n"
- "tbz x16, #1, 9f\n"
+ "tbz x11, #0, 11f\n"
+ "ldr s10, [x28, #0x0]\n"
+ "b 11f\n"
+ "7:" // Height 1: Partial accumulate: partial_4_0
+ "tbz x11, #2, 9f\n"
+ "ld1 { v8.4s }, [x28], #0x10\n"
+ "tbz x11, #1, 8f\n"
+ "ldr d9, [x28], #0x8\n"
"mov x19, #0x18\n"
- "ldr d9, [x13], #0x8\n"
- "tbz x16, #0, 12f\n"
- "ld1 { v9.s }[2], [x13]\n"
- "b 12f\n"
- "9:" // Height 1: Partial accumulate: partial_1_4
+ "tbz x11, #0, 11f\n"
+ "ld1 { v9.s }[2], [x28]\n"
+ "b 11f\n"
+ "8:" // Height 1: Partial accumulate: partial_1_4
"mov x19, #0x10\n"
- "tbz x16, #0, 12f\n"
- "ldr s9, [x13, #0x0]\n"
- "b 12f\n"
- "10:" // Height 1: Partial accumulate: partial_2_0
- "tbz x16, #1, 11f\n"
- "ldr d8, [x13], #0x8\n"
+ "tbz x11, #0, 11f\n"
+ "ldr s9, [x28, #0x0]\n"
+ "b 11f\n"
+ "9:" // Height 1: Partial accumulate: partial_2_0
+ "tbz x11, #1, 10f\n"
+ "ldr d8, [x28], #0x8\n"
"mov x19, #0x8\n"
- "tbz x16, #0, 12f\n"
- "ld1 { v8.s }[2], [x13]\n"
- "b 12f\n"
- "11:" // Height 1: Partial accumulate: partial_1_0
+ "tbz x11, #0, 11f\n"
+ "ld1 { v8.s }[2], [x28]\n"
+ "b 11f\n"
+ "10:" // Height 1: Partial accumulate: partial_1_0
+ "ldr s8, [x28, #0x0]\n"
"mov x19, #0x0\n"
- "ldr s8, [x13, #0x0]\n"
- "12:" // Height 1: Partial accumulate: Done
- "sub x13, x13, x19\n"
- "b 15f\n"
- "13:" // Height 1: full accumulate
- "ldr q8, [x13, #0x0]\n"
- "ldr q9, [x13, #0x10]\n"
- "ldr q10, [x13, #0x20]\n"
- "ldr q11, [x13, #0x30]\n"
- "b 15f\n"
- "14:" // Height 1: no accumulate
+ "11:" // Height 1: Partial accumulate: Done
+ "sub x28, x28, x19\n"
+ "b 14f\n"
+ "12:" // Height 1: full accumulate
+ "ldr q8, [x28, #0x0]\n"
+ "ldr q9, [x28, #0x10]\n"
+ "ldr q10, [x28, #0x20]\n"
+ "ldr q11, [x28, #0x30]\n"
+ "b 14f\n"
+ "13:" // Height 1: no accumulate
"movi v8.16b, #0x0\n"
"movi v9.16b, #0x0\n"
"movi v10.16b, #0x0\n"
"movi v11.16b, #0x0\n"
- "15:" // Height 1: setup done
- "mov x12, #0x0\n"
- "16:" // Height 1: String loop
+ "14:" // Height 1: setup done
+ "mov x27, #0x0\n"
+ "15:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w11, [x20, x12, LSL #0x2]\n"
- "tbz %x[flags], #3, 17f\n"
- "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 16f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x10, [x20, #0x0]\n"
- "cbnz x12, 18f\n"
+ "ldr x25, [x20, #0x0]\n"
+ "cbnz x27, 17f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x10, x10, x19, LSL #1\n"
- "b 18f\n"
- "17:" // Height 1: setup direct input
- "mov x10, %x[input_ptr]\n"
- "18:" // Height 1: input setup done
- "cmp x11, #0x8\n"
- "blt 21f\n"
- "cmp x11, #0x10\n"
+ "add x25, x25, x19, LSL #1\n"
+ "b 17f\n"
+ "16:" // Height 1: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "17:" // Height 1: input setup done
+ "cmp x26, #0x8\n"
"blt 20f\n"
- "19:" // Height 1: Multiply loop: Main loop head
- "ldr q0, [x10, #0x0]\n"
- "ldr q6, [x15, #0x0]\n"
+ "ldr q0, [x25, #0x0]\n"
+ "ldr q6, [x10, #0x0]\n"
+ "cmp x26, #0x10\n"
+ "blt 19f\n"
+ "18:" // Height 1: Multiply loop: Main loop head
".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
- "ldr q7, [x15, #0x10]\n"
- "ldr q6, [x15, #0x20]\n"
+ "ldr q7, [x10, #0x10]\n"
+ "add x25, x25, #0x10\n"
".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
- "ldr q7, [x15, #0x30]\n"
+ "ldr q6, [x10, #0x20]\n"
+ "sub x26, x26, #0x8\n"
".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
- "ldr q6, [x15, #0x40]\n"
- "add x10, x10, #0x10\n"
+ "ldr q7, [x10, #0x30]\n"
+ "cmp x26, #0x10\n"
".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
- "ldr q7, [x15, #0x50]\n"
+ "ldr q6, [x10, #0x40]\n"
+ "ldr q7, [x10, #0x50]\n"
".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n"
- "ldr q6, [x15, #0x60]\n"
- "sub x11, x11, #0x8\n"
+ "ldr q6, [x10, #0x60]\n"
".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n"
- "ldr q7, [x15, #0x70]\n"
- "cmp x11, #0x10\n"
+ "ldr q7, [x10, #0x70]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n"
- "ldr q6, [x15, #0x80]\n"
+ "ldr q6, [x10, #0x80]\n"
".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n"
- "ldr q7, [x15, #0x90]\n"
+ "ldr q7, [x10, #0x90]\n"
".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n"
- "ldr q6, [x15, #0xa0]\n"
+ "ldr q6, [x10, #0xa0]\n"
".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n"
- "ldr q7, [x15, #0xb0]\n"
+ "ldr q7, [x10, #0xb0]\n"
".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n"
- "ldr q6, [x15, #0xc0]\n"
+ "ldr q6, [x10, #0xc0]\n"
".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n"
- "ldr q7, [x15, #0xd0]\n"
+ "ldr q7, [x10, #0xd0]\n"
".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n"
- "ldr q6, [x15, #0xe0]\n"
+ "ldr q6, [x10, #0xe0]\n"
".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n"
- "ldr q7, [x15, #0xf0]\n"
- "add x15, x15, #0x100\n"
+ "ldr q7, [x10, #0xf0]\n"
+ "add x10, x10, #0x100\n"
".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n"
+ "ldr q6, [x10, #0x0]\n"
".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n"
- "bge 19b\n"
- "20:" // Height 1: Multiply loop: Single iteration only
- "sub x11, x11, #0x8\n"
- "ldr q0, [x10, #0x0]\n"
- "ldr q6, [x15, #0x0]\n"
+ "ldr q0, [x25, #0x0]\n"
+ "bge 18b\n"
+ "19:" // Height 1: Multiply loop: Single iteration only
".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
- "ldr q7, [x15, #0x10]\n"
- "ldr q6, [x15, #0x20]\n"
+ "ldr q7, [x10, #0x10]\n"
+ "sub x26, x26, #0x8\n"
".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
- "ldr q7, [x15, #0x30]\n"
+ "ldr q6, [x10, #0x20]\n"
+ "add x25, x25, #0x10\n"
".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
- "ldr q6, [x15, #0x40]\n"
- "add x10, x10, #0x10\n"
+ "ldr q7, [x10, #0x30]\n"
+ "ldr q6, [x10, #0x40]\n"
".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
- "ldr q7, [x15, #0x50]\n"
+ "ldr q7, [x10, #0x50]\n"
".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n"
- "ldr q6, [x15, #0x60]\n"
+ "ldr q6, [x10, #0x60]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n"
- "ldr q7, [x15, #0x70]\n"
+ "ldr q7, [x10, #0x70]\n"
".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n"
- "ldr q6, [x15, #0x80]\n"
+ "ldr q6, [x10, #0x80]\n"
".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n"
- "ldr q7, [x15, #0x90]\n"
+ "ldr q7, [x10, #0x90]\n"
".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n"
- "ldr q6, [x15, #0xa0]\n"
+ "ldr q6, [x10, #0xa0]\n"
".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n"
- "ldr q7, [x15, #0xb0]\n"
+ "ldr q7, [x10, #0xb0]\n"
".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n"
- "ldr q6, [x15, #0xc0]\n"
+ "ldr q6, [x10, #0xc0]\n"
".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n"
- "ldr q7, [x15, #0xd0]\n"
+ "ldr q7, [x10, #0xd0]\n"
".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n"
- "ldr q6, [x15, #0xe0]\n"
+ "ldr q6, [x10, #0xe0]\n"
".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n"
- "ldr q7, [x15, #0xf0]\n"
- "add x15, x15, #0x100\n"
+ "ldr q7, [x10, #0xf0]\n"
+ "add x10, x10, #0x100\n"
".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n"
".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n"
- "21:" // Height 1: Multiply loop: Main loop skip
- "cbz x11, 25f\n"
- "cmp x11, #0x2\n"
- "blt 23f\n"
- "22:" // Height 1: Multiply loop: Odd block loop
- "ldr s0, [x10], #0x4\n"
- "ldr q6, [x15, #0x0]\n"
+ "20:" // Height 1: Multiply loop: Main loop skip
+ "cbz x26, 24f\n"
+ "cmp x26, #0x2\n"
+ "blt 22f\n"
+ "21:" // Height 1: Multiply loop: Odd block loop
+ "ldr s0, [x25], #0x4\n"
+ "sub x26, x26, #0x2\n"
+ "ldr q6, [x10, #0x0]\n"
".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
- "ldr q7, [x15, #0x10]\n"
- "ldr q6, [x15, #0x20]\n"
+ "ldr q7, [x10, #0x10]\n"
+ "cmp x26, #0x2\n"
".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
- "ldr q7, [x15, #0x30]\n"
+ "ldr q6, [x10, #0x20]\n"
+ "ldr q7, [x10, #0x30]\n"
".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
- "sub x11, x11, #0x2\n"
- "add x15, x15, #0x40\n"
+ "add x10, x10, #0x40\n"
".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
- "cmp x11, #0x2\n"
- "bge 22b\n"
- "cbz x11, 25f\n"
- "23:" // Height 1: Multiply loop: Skip odd blocks
- "ldr h0, [x10, #0x0]\n"
- "24:" // Height 1: Multiply loop: Ragged operand read: Done
- "ldr q6, [x15, #0x0]\n"
+ "bge 21b\n"
+ "cbz x26, 24f\n"
+ "22:" // Height 1: Multiply loop: Skip odd blocks
+ "ldr h0, [x25, #0x0]\n"
+ "23:" // Height 1: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x10, #0x0]\n"
".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
- "ldr q7, [x15, #0x10]\n"
- "ldr q6, [x15, #0x20]\n"
+ "ldr q7, [x10, #0x10]\n"
+ "ldr q6, [x10, #0x20]\n"
".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
- "ldr q7, [x15, #0x30]\n"
+ "ldr q7, [x10, #0x30]\n"
+ "add x10, x10, #0x40\n"
".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
- "add x15, x15, #0x40\n"
".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
- "25:" // Height 1: Multiply loop: No odd multiplies
+ "24:" // Height 1: Multiply loop: No odd multiplies
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x12, x12, #0x1\n"
- "cmp x12, x19\n"
- "bne 16b\n"
- "prfm pstl1keep, [x13, #0x0]\n"
- "tbz %x[flags], #1, 26f\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 15b\n"
+ "prfm pstl1keep, [x28, #0x0]\n"
+ "tbz %x[flags], #1, 25f\n"
"add x19, %x[args_ptr], %[offset_min]\n"
"ld1r { v1.4s }, [x19]\n"
"add x19, %x[args_ptr], %[offset_max]\n"
@@ -340,178 +334,170 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"fmax v9.4s, v9.4s, v1.4s\n"
"fmax v10.4s, v10.4s, v1.4s\n"
"fmax v11.4s, v11.4s, v1.4s\n"
- "26:" // Height 1: No activation
- "cmp x16, #0x10\n"
- "bge 35f\n"
- "tbz x16, #3, 30f\n"
- "st1 { v8.4s }, [x13], #0x10\n"
- "st1 { v9.4s }, [x13], #0x10\n"
- "tbz x16, #2, 28f\n"
- "st1 { v10.4s }, [x13], #0x10\n"
- "tbz x16, #1, 27f\n"
- "str d11, [x13], #0x8\n"
- "tbz x16, #0, 34f\n"
- "st1 { v11.s }[2], [x13]\n"
- "b 34f\n"
- "27:" // Height 1: Partial direct writeback: partial_1_12
- "tbz x16, #0, 34f\n"
- "str s11, [x13, #0x0]\n"
- "b 34f\n"
- "28:" // Height 1: Partial direct writeback: partial_2_8
- "tbz x16, #1, 29f\n"
- "str d10, [x13], #0x8\n"
- "tbz x16, #0, 34f\n"
- "st1 { v10.s }[2], [x13]\n"
- "b 34f\n"
- "29:" // Height 1: Partial direct writeback: partial_1_8
- "tbz x16, #0, 34f\n"
- "str s10, [x13, #0x0]\n"
- "b 34f\n"
- "30:" // Height 1: Partial direct writeback: partial_4_0
- "tbz x16, #2, 32f\n"
- "st1 { v8.4s }, [x13], #0x10\n"
- "tbz x16, #1, 31f\n"
- "str d9, [x13], #0x8\n"
- "tbz x16, #0, 34f\n"
- "st1 { v9.s }[2], [x13]\n"
- "b 34f\n"
- "31:" // Height 1: Partial direct writeback: partial_1_4
- "tbz x16, #0, 34f\n"
- "str s9, [x13, #0x0]\n"
- "b 34f\n"
- "32:" // Height 1: Partial direct writeback: partial_2_0
- "tbz x16, #1, 33f\n"
- "str d8, [x13], #0x8\n"
- "tbz x16, #0, 34f\n"
- "st1 { v8.s }[2], [x13]\n"
- "b 34f\n"
- "33:" // Height 1: Partial direct writeback: partial_1_0
- "str s8, [x13, #0x0]\n"
- "34:" // Height 1: Partial direct writeback: Done
- "b 36f\n"
- "35:" // Height 1: Full writeback
- "str q8, [x13, #0x0]\n"
- "str q9, [x13, #0x10]\n"
- "str q10, [x13, #0x20]\n"
- "str q11, [x13, #0x30]\n"
- "add x13, x13, #0x40\n"
- "36:" // Height 1: Writeback done
- "subs x16, x16, #0x10\n"
- "bgt 3b\n"
- "b 218f\n"
- "37:" // Height 2
- "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x14, %x[bias]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "tbz %x[flags], #2, 38f\n"
- "ldr x13, [%x[output_ptr], #0x0]\n"
- "add x13, x13, x19, LSL #2\n"
- "ldr x9, [%x[output_ptr], #0x8]\n"
- "add x9, x9, x19, LSL #2\n"
- "b 39f\n"
- "38:" // Height 2: setup direct output
- "mov x13, %x[output_ptr]\n"
- "add x9, x13, x19, LSL #2\n"
- "39:" // Height 2: Column loop
- "cbz x14, 40f\n"
- "ldr q8, [x14, #0x0]\n"
+ "25:" // Height 1: No activation
+ "cmp x11, #0x10\n"
+ "bge 34f\n"
+ "tbz x11, #3, 29f\n"
+ "st1 { v8.4s }, [x28], #0x10\n"
+ "st1 { v9.4s }, [x28], #0x10\n"
+ "tbz x11, #2, 27f\n"
+ "st1 { v10.4s }, [x28], #0x10\n"
+ "tbz x11, #1, 26f\n"
+ "str d11, [x28], #0x8\n"
+ "tbz x11, #0, 33f\n"
+ "st1 { v11.s }[2], [x28]\n"
+ "b 33f\n"
+ "26:" // Height 1: Partial direct writeback: partial_1_12
+ "tbz x11, #0, 33f\n"
+ "str s11, [x28, #0x0]\n"
+ "b 33f\n"
+ "27:" // Height 1: Partial direct writeback: partial_2_8
+ "tbz x11, #1, 28f\n"
+ "str d10, [x28], #0x8\n"
+ "tbz x11, #0, 33f\n"
+ "st1 { v10.s }[2], [x28]\n"
+ "b 33f\n"
+ "28:" // Height 1: Partial direct writeback: partial_1_8
+ "tbz x11, #0, 33f\n"
+ "str s10, [x28, #0x0]\n"
+ "b 33f\n"
+ "29:" // Height 1: Partial direct writeback: partial_4_0
+ "tbz x11, #2, 31f\n"
+ "st1 { v8.4s }, [x28], #0x10\n"
+ "tbz x11, #1, 30f\n"
+ "str d9, [x28], #0x8\n"
+ "tbz x11, #0, 33f\n"
+ "st1 { v9.s }[2], [x28]\n"
+ "b 33f\n"
+ "30:" // Height 1: Partial direct writeback: partial_1_4
+ "tbz x11, #0, 33f\n"
+ "str s9, [x28, #0x0]\n"
+ "b 33f\n"
+ "31:" // Height 1: Partial direct writeback: partial_2_0
+ "tbz x11, #1, 32f\n"
+ "str d8, [x28], #0x8\n"
+ "tbz x11, #0, 33f\n"
+ "st1 { v8.s }[2], [x28]\n"
+ "b 33f\n"
+ "32:" // Height 1: Partial direct writeback: partial_1_0
+ "str s8, [x28, #0x0]\n"
+ "33:" // Height 1: Partial direct writeback: Done
+ "b 35f\n"
+ "34:" // Height 1: Full writeback
+ "str q8, [x28, #0x0]\n"
+ "str q9, [x28, #0x10]\n"
+ "str q10, [x28, #0x20]\n"
+ "str q11, [x28, #0x30]\n"
+ "add x28, x28, #0x40\n"
+ "35:" // Height 1: Writeback done
+ "subs x11, x11, #0x10\n"
+ "bgt 2b\n"
+ "b 212f\n"
+ "36:" // Height 2
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x9, %x[bias]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "37:" // Height 2: Column loop
+ "cbz x9, 38f\n"
+ "ldr q8, [x9, #0x0]\n"
"mov v12.16b, v8.16b\n"
- "ldr q9, [x14, #0x10]\n"
- "ldr q10, [x14, #0x20]\n"
+ "ldr q9, [x9, #0x10]\n"
+ "ldr q10, [x9, #0x20]\n"
"mov v13.16b, v9.16b\n"
- "ldr q11, [x14, #0x30]\n"
+ "ldr q11, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
"mov v14.16b, v10.16b\n"
- "add x14, x14, #0x40\n"
"mov v15.16b, v11.16b\n"
- "b 51f\n"
- "40:" // Height 2: no bias
- "tbz %x[flags], #0, 50f\n"
- "cmp x16, #0x10\n"
- "bge 49f\n"
- "tbz x16, #3, 44f\n"
- "ld1 { v8.4s }, [x13], #0x10\n"
- "ld1 { v12.4s }, [x9], #0x10\n"
- "ld1 { v9.4s }, [x13], #0x10\n"
- "ld1 { v13.4s }, [x9], #0x10\n"
- "tbz x16, #2, 42f\n"
- "ld1 { v10.4s }, [x13], #0x10\n"
- "ld1 { v14.4s }, [x9], #0x10\n"
- "tbz x16, #1, 41f\n"
+ "b 49f\n"
+ "38:" // Height 2: no bias
+ "tbz %x[flags], #0, 48f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x11, #0x10\n"
+ "add x24, x28, x19, LSL #2\n"
+ "bge 47f\n"
+ "tbz x11, #3, 42f\n"
+ "ld1 { v8.4s }, [x28], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v9.4s }, [x28], #0x10\n"
+ "ld1 { v13.4s }, [x24], #0x10\n"
+ "tbz x11, #2, 40f\n"
+ "ld1 { v10.4s }, [x28], #0x10\n"
+ "ld1 { v14.4s }, [x24], #0x10\n"
+ "tbz x11, #1, 39f\n"
"mov x19, #0x38\n"
- "ldr d11, [x13], #0x8\n"
- "ldr d15, [x9], #0x8\n"
- "tbz x16, #0, 48f\n"
- "ld1 { v11.s }[2], [x13]\n"
- "ld1 { v15.s }[2], [x9]\n"
- "b 48f\n"
- "41:" // Height 2: Partial accumulate: partial_1_12
+ "ldr d11, [x28], #0x8\n"
+ "ldr d15, [x24], #0x8\n"
+ "tbz x11, #0, 46f\n"
+ "ld1 { v11.s }[2], [x28]\n"
+ "ld1 { v15.s }[2], [x24]\n"
+ "b 46f\n"
+ "39:" // Height 2: Partial accumulate: partial_1_12
"mov x19, #0x30\n"
- "tbz x16, #0, 48f\n"
- "ldr s11, [x13, #0x0]\n"
- "ldr s15, [x9, #0x0]\n"
- "b 48f\n"
- "42:" // Height 2: Partial accumulate: partial_2_8
- "tbz x16, #1, 43f\n"
- "ldr d10, [x13], #0x8\n"
- "ldr d14, [x9], #0x8\n"
+ "tbz x11, #0, 46f\n"
+ "ldr s11, [x28, #0x0]\n"
+ "ldr s15, [x24, #0x0]\n"
+ "b 46f\n"
+ "40:" // Height 2: Partial accumulate: partial_2_8
+ "tbz x11, #1, 41f\n"
+ "ldr d10, [x28], #0x8\n"
+ "ldr d14, [x24], #0x8\n"
"mov x19, #0x28\n"
- "tbz x16, #0, 48f\n"
- "ld1 { v10.s }[2], [x13]\n"
- "ld1 { v14.s }[2], [x9]\n"
- "b 48f\n"
- "43:" // Height 2: Partial accumulate: partial_1_8
+ "tbz x11, #0, 46f\n"
+ "ld1 { v10.s }[2], [x28]\n"
+ "ld1 { v14.s }[2], [x24]\n"
+ "b 46f\n"
+ "41:" // Height 2: Partial accumulate: partial_1_8
"mov x19, #0x20\n"
- "tbz x16, #0, 48f\n"
- "ldr s10, [x13, #0x0]\n"
- "ldr s14, [x9, #0x0]\n"
- "b 48f\n"
- "44:" // Height 2: Partial accumulate: partial_4_0
- "tbz x16, #2, 46f\n"
- "ld1 { v8.4s }, [x13], #0x10\n"
- "ld1 { v12.4s }, [x9], #0x10\n"
- "tbz x16, #1, 45f\n"
+ "tbz x11, #0, 46f\n"
+ "ldr s10, [x28, #0x0]\n"
+ "ldr s14, [x24, #0x0]\n"
+ "b 46f\n"
+ "42:" // Height 2: Partial accumulate: partial_4_0
+ "tbz x11, #2, 44f\n"
+ "ld1 { v8.4s }, [x28], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "tbz x11, #1, 43f\n"
"mov x19, #0x18\n"
- "ldr d9, [x13], #0x8\n"
- "ldr d13, [x9], #0x8\n"
- "tbz x16, #0, 48f\n"
- "ld1 { v9.s }[2], [x13]\n"
- "ld1 { v13.s }[2], [x9]\n"
- "b 48f\n"
- "45:" // Height 2: Partial accumulate: partial_1_4
+ "ldr d9, [x28], #0x8\n"
+ "ldr d13, [x24], #0x8\n"
+ "tbz x11, #0, 46f\n"
+ "ld1 { v9.s }[2], [x28]\n"
+ "ld1 { v13.s }[2], [x24]\n"
+ "b 46f\n"
+ "43:" // Height 2: Partial accumulate: partial_1_4
"mov x19, #0x10\n"
- "tbz x16, #0, 48f\n"
- "ldr s9, [x13, #0x0]\n"
- "ldr s13, [x9, #0x0]\n"
- "b 48f\n"
- "46:" // Height 2: Partial accumulate: partial_2_0
- "tbz x16, #1, 47f\n"
- "ldr d8, [x13], #0x8\n"
- "ldr d12, [x9], #0x8\n"
+ "tbz x11, #0, 46f\n"
+ "ldr s9, [x28, #0x0]\n"
+ "ldr s13, [x24, #0x0]\n"
+ "b 46f\n"
+ "44:" // Height 2: Partial accumulate: partial_2_0
+ "tbz x11, #1, 45f\n"
+ "ldr d8, [x28], #0x8\n"
+ "ldr d12, [x24], #0x8\n"
"mov x19, #0x8\n"
- "tbz x16, #0, 48f\n"
- "ld1 { v8.s }[2], [x13]\n"
- "ld1 { v12.s }[2], [x9]\n"
- "b 48f\n"
- "47:" // Height 2: Partial accumulate: partial_1_0
+ "tbz x11, #0, 46f\n"
+ "ld1 { v8.s }[2], [x28]\n"
+ "ld1 { v12.s }[2], [x24]\n"
+ "b 46f\n"
+ "45:" // Height 2: Partial accumulate: partial_1_0
+ "ldr s8, [x28, #0x0]\n"
"mov x19, #0x0\n"
- "ldr s8, [x13, #0x0]\n"
- "ldr s12, [x9, #0x0]\n"
- "48:" // Height 2: Partial accumulate: Done
- "sub x13, x13, x19\n"
- "sub x9, x9, x19\n"
- "b 51f\n"
- "49:" // Height 2: full accumulate
- "ldr q8, [x13, #0x0]\n"
- "ldr q9, [x13, #0x10]\n"
- "ldr q10, [x13, #0x20]\n"
- "ldr q11, [x13, #0x30]\n"
- "ldr q12, [x9, #0x0]\n"
- "ldr q13, [x9, #0x10]\n"
- "ldr q14, [x9, #0x20]\n"
- "ldr q15, [x9, #0x30]\n"
- "b 51f\n"
- "50:" // Height 2: no accumulate
+ "ldr s12, [x24, #0x0]\n"
+ "46:" // Height 2: Partial accumulate: Done
+ "sub x28, x28, x19\n"
+ "b 49f\n"
+ "47:" // Height 2: full accumulate
+ "ldr q8, [x28, #0x0]\n"
+ "ldr q9, [x28, #0x10]\n"
+ "ldr q10, [x28, #0x20]\n"
+ "ldr q11, [x28, #0x30]\n"
+ "ldr q12, [x24, #0x0]\n"
+ "ldr q13, [x24, #0x10]\n"
+ "ldr q14, [x24, #0x20]\n"
+ "ldr q15, [x24, #0x30]\n"
+ "b 49f\n"
+ "48:" // Height 2: no accumulate
"movi v8.16b, #0x0\n"
"movi v9.16b, #0x0\n"
"movi v10.16b, #0x0\n"
@@ -520,195 +506,197 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"movi v13.16b, #0x0\n"
"movi v14.16b, #0x0\n"
"movi v15.16b, #0x0\n"
- "51:" // Height 2: setup done
- "mov x12, #0x0\n"
- "52:" // Height 2: String loop
+ "49:" // Height 2: setup done
+ "mov x27, #0x0\n"
+ "50:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w11, [x20, x12, LSL #0x2]\n"
- "tbz %x[flags], #3, 53f\n"
- "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 51f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x10, [x20, #0x0]\n"
- "ldr x28, [x20, #0x8]\n"
- "cbnz x12, 54f\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "cbnz x27, 52f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x10, x10, x19, LSL #1\n"
- "add x28, x28, x19, LSL #1\n"
- "b 54f\n"
- "53:" // Height 2: setup direct input
- "mov x10, %x[input_ptr]\n"
- "add x28, x10, x19, LSL #1\n"
- "54:" // Height 2: input setup done
- "cmp x11, #0x8\n"
- "blt 57f\n"
- "cmp x11, #0x10\n"
- "blt 56f\n"
- "55:" // Height 2: Multiply loop: Main loop head
- "ldr q0, [x10, #0x0]\n"
- "ldr q1, [x28, #0x0]\n"
- "ldr q6, [x15, #0x0]\n"
+ "add x25, x25, x19, LSL #1\n"
+ "add x24, x24, x19, LSL #1\n"
+ "b 52f\n"
+ "51:" // Height 2: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19, LSL #1\n"
+ "52:" // Height 2: input setup done
+ "cmp x26, #0x8\n"
+ "blt 55f\n"
+ "ldr q0, [x25, #0x0]\n"
+ "ldr q1, [x24, #0x0]\n"
+ "cmp x26, #0x10\n"
+ "ldr q6, [x10, #0x0]\n"
+ "blt 54f\n"
+ "53:" // Height 2: Multiply loop: Main loop head
".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
- "ldr q7, [x15, #0x10]\n"
+ "ldr q7, [x10, #0x10]\n"
+ "add x25, x25, #0x10\n"
".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
- "ldr q6, [x15, #0x20]\n"
- "add x10, x10, #0x10\n"
+ "ldr q6, [x10, #0x20]\n"
+ "add x24, x24, #0x10\n"
".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
- "add x28, x28, #0x10\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "sub x26, x26, #0x8\n"
".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "ldr q7, [x15, #0x30]\n"
+ "ldr q7, [x10, #0x30]\n"
+ "cmp x26, #0x10\n"
".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
- "sub x11, x11, #0x8\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
- "ldr q6, [x15, #0x40]\n"
- "cmp x11, #0x10\n"
+ "ldr q6, [x10, #0x40]\n"
".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
- "ldr q7, [x15, #0x50]\n"
+ "ldr q7, [x10, #0x50]\n"
".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n"
".inst 0x4f61f0cc // bfdot v12.4s, v6.8h, v1.h[1]\n"
- "ldr q6, [x15, #0x60]\n"
+ "ldr q6, [x10, #0x60]\n"
".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n"
".inst 0x4f61f0ed // bfdot v13.4s, v7.8h, v1.h[1]\n"
- "ldr q7, [x15, #0x70]\n"
+ "ldr q7, [x10, #0x70]\n"
".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n"
".inst 0x4f61f0ce // bfdot v14.4s, v6.8h, v1.h[1]\n"
- "ldr q6, [x15, #0x80]\n"
+ "ldr q6, [x10, #0x80]\n"
".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n"
".inst 0x4f61f0ef // bfdot v15.4s, v7.8h, v1.h[1]\n"
- "ldr q7, [x15, #0x90]\n"
+ "ldr q7, [x10, #0x90]\n"
".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n"
".inst 0x4f41f8cc // bfdot v12.4s, v6.8h, v1.h[2]\n"
- "ldr q6, [x15, #0xa0]\n"
+ "ldr q6, [x10, #0xa0]\n"
".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n"
".inst 0x4f41f8ed // bfdot v13.4s, v7.8h, v1.h[2]\n"
- "ldr q7, [x15, #0xb0]\n"
+ "ldr q7, [x10, #0xb0]\n"
".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n"
".inst 0x4f41f8ce // bfdot v14.4s, v6.8h, v1.h[2]\n"
- "ldr q6, [x15, #0xc0]\n"
+ "ldr q6, [x10, #0xc0]\n"
".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n"
".inst 0x4f41f8ef // bfdot v15.4s, v7.8h, v1.h[2]\n"
- "ldr q7, [x15, #0xd0]\n"
+ "ldr q7, [x10, #0xd0]\n"
".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n"
".inst 0x4f61f8cc // bfdot v12.4s, v6.8h, v1.h[3]\n"
- "ldr q6, [x15, #0xe0]\n"
+ "ldr q6, [x10, #0xe0]\n"
".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n"
".inst 0x4f61f8ed // bfdot v13.4s, v7.8h, v1.h[3]\n"
- "ldr q7, [x15, #0xf0]\n"
- "add x15, x15, #0x100\n"
+ "ldr q7, [x10, #0xf0]\n"
+ "add x10, x10, #0x100\n"
".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n"
".inst 0x4f61f8ce // bfdot v14.4s, v6.8h, v1.h[3]\n"
+ "ldr q6, [x10, #0x0]\n"
".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n"
+ "ldr q0, [x25, #0x0]\n"
".inst 0x4f61f8ef // bfdot v15.4s, v7.8h, v1.h[3]\n"
- "bge 55b\n"
- "56:" // Height 2: Multiply loop: Single iteration only
- "sub x11, x11, #0x8\n"
- "ldr q0, [x10, #0x0]\n"
- "ldr q1, [x28, #0x0]\n"
- "ldr q6, [x15, #0x0]\n"
+ "ldr q1, [x24, #0x0]\n"
+ "bge 53b\n"
+ "54:" // Height 2: Multiply loop: Single iteration only
".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
- "ldr q7, [x15, #0x10]\n"
+ "ldr q7, [x10, #0x10]\n"
+ "sub x26, x26, #0x8\n"
".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
- "ldr q6, [x15, #0x20]\n"
- "add x10, x10, #0x10\n"
+ "ldr q6, [x10, #0x20]\n"
+ "add x25, x25, #0x10\n"
".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
- "add x28, x28, #0x10\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x24, x24, #0x10\n"
".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "ldr q7, [x15, #0x30]\n"
+ "ldr q7, [x10, #0x30]\n"
".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
- "ldr q6, [x15, #0x40]\n"
+ "ldr q6, [x10, #0x40]\n"
".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
- "ldr q7, [x15, #0x50]\n"
+ "ldr q7, [x10, #0x50]\n"
".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n"
".inst 0x4f61f0cc // bfdot v12.4s, v6.8h, v1.h[1]\n"
- "ldr q6, [x15, #0x60]\n"
+ "ldr q6, [x10, #0x60]\n"
".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n"
".inst 0x4f61f0ed // bfdot v13.4s, v7.8h, v1.h[1]\n"
- "ldr q7, [x15, #0x70]\n"
+ "ldr q7, [x10, #0x70]\n"
".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n"
".inst 0x4f61f0ce // bfdot v14.4s, v6.8h, v1.h[1]\n"
- "ldr q6, [x15, #0x80]\n"
+ "ldr q6, [x10, #0x80]\n"
".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n"
".inst 0x4f61f0ef // bfdot v15.4s, v7.8h, v1.h[1]\n"
- "ldr q7, [x15, #0x90]\n"
+ "ldr q7, [x10, #0x90]\n"
".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n"
".inst 0x4f41f8cc // bfdot v12.4s, v6.8h, v1.h[2]\n"
- "ldr q6, [x15, #0xa0]\n"
+ "ldr q6, [x10, #0xa0]\n"
".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n"
".inst 0x4f41f8ed // bfdot v13.4s, v7.8h, v1.h[2]\n"
- "ldr q7, [x15, #0xb0]\n"
+ "ldr q7, [x10, #0xb0]\n"
".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n"
".inst 0x4f41f8ce // bfdot v14.4s, v6.8h, v1.h[2]\n"
- "ldr q6, [x15, #0xc0]\n"
+ "ldr q6, [x10, #0xc0]\n"
".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n"
".inst 0x4f41f8ef // bfdot v15.4s, v7.8h, v1.h[2]\n"
- "ldr q7, [x15, #0xd0]\n"
+ "ldr q7, [x10, #0xd0]\n"
".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n"
".inst 0x4f61f8cc // bfdot v12.4s, v6.8h, v1.h[3]\n"
- "ldr q6, [x15, #0xe0]\n"
+ "ldr q6, [x10, #0xe0]\n"
".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n"
".inst 0x4f61f8ed // bfdot v13.4s, v7.8h, v1.h[3]\n"
- "ldr q7, [x15, #0xf0]\n"
- "add x15, x15, #0x100\n"
+ "ldr q7, [x10, #0xf0]\n"
+ "add x10, x10, #0x100\n"
".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n"
".inst 0x4f61f8ce // bfdot v14.4s, v6.8h, v1.h[3]\n"
".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n"
".inst 0x4f61f8ef // bfdot v15.4s, v7.8h, v1.h[3]\n"
- "57:" // Height 2: Multiply loop: Main loop skip
- "cbz x11, 61f\n"
- "cmp x11, #0x2\n"
- "blt 59f\n"
- "58:" // Height 2: Multiply loop: Odd block loop
- "ldr s0, [x10], #0x4\n"
- "ldr s1, [x28], #0x4\n"
- "ldr q6, [x15, #0x0]\n"
+ "55:" // Height 2: Multiply loop: Main loop skip
+ "cbz x26, 59f\n"
+ "cmp x26, #0x2\n"
+ "blt 57f\n"
+ "56:" // Height 2: Multiply loop: Odd block loop
+ "ldr s0, [x25], #0x4\n"
+ "sub x26, x26, #0x2\n"
+ "ldr s1, [x24], #0x4\n"
+ "cmp x26, #0x2\n"
+ "ldr q6, [x10, #0x0]\n"
".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
- "ldr q7, [x15, #0x10]\n"
+ "ldr q7, [x10, #0x10]\n"
".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
- "ldr q6, [x15, #0x20]\n"
- "sub x11, x11, #0x2\n"
+ "ldr q6, [x10, #0x20]\n"
".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
- "cmp x11, #0x2\n"
".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
- "ldr q7, [x15, #0x30]\n"
+ "ldr q7, [x10, #0x30]\n"
+ "add x10, x10, #0x40\n"
".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
- "add x15, x15, #0x40\n"
".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
- "bge 58b\n"
- "cbz x11, 61f\n"
- "59:" // Height 2: Multiply loop: Skip odd blocks
- "ldr h0, [x10, #0x0]\n"
- "ldr h1, [x28, #0x0]\n"
- "60:" // Height 2: Multiply loop: Ragged operand read: Done
- "ldr q6, [x15, #0x0]\n"
+ "bge 56b\n"
+ "cbz x26, 59f\n"
+ "57:" // Height 2: Multiply loop: Skip odd blocks
+ "ldr h0, [x25, #0x0]\n"
+ "ldr h1, [x24, #0x0]\n"
+ "58:" // Height 2: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x10, #0x0]\n"
".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
- "ldr q7, [x15, #0x10]\n"
+ "ldr q7, [x10, #0x10]\n"
".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
- "ldr q6, [x15, #0x20]\n"
+ "ldr q6, [x10, #0x20]\n"
".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
- "ldr q7, [x15, #0x30]\n"
+ "ldr q7, [x10, #0x30]\n"
+ "add x10, x10, #0x40\n"
".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
- "add x15, x15, #0x40\n"
".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
- "61:" // Height 2: Multiply loop: No odd multiplies
+ "59:" // Height 2: Multiply loop: No odd multiplies
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x12, x12, #0x1\n"
- "cmp x12, x19\n"
- "bne 52b\n"
- "prfm pstl1keep, [x13, #0x0]\n"
- "prfm pstl1keep, [x9, #0x0]\n"
- "tbz %x[flags], #1, 62f\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 50b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "prfm pstl1keep, [x28, #0x0]\n"
+ "add x24, x28, x19, LSL #2\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "tbz %x[flags], #1, 60f\n"
"add x19, %x[args_ptr], %[offset_min]\n"
"ld1r { v1.4s }, [x19]\n"
"add x19, %x[args_ptr], %[offset_max]\n"
@@ -729,227 +717,215 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"fmax v14.4s, v14.4s, v1.4s\n"
"fmin v15.4s, v15.4s, v0.4s\n"
"fmax v15.4s, v15.4s, v1.4s\n"
- "62:" // Height 2: No activation
- "cmp x16, #0x10\n"
- "bge 71f\n"
- "tbz x16, #3, 66f\n"
- "st1 { v8.4s }, [x13], #0x10\n"
- "st1 { v9.4s }, [x13], #0x10\n"
- "st1 { v12.4s }, [x9], #0x10\n"
- "st1 { v13.4s }, [x9], #0x10\n"
- "tbz x16, #2, 64f\n"
- "st1 { v10.4s }, [x13], #0x10\n"
- "st1 { v14.4s }, [x9], #0x10\n"
- "tbz x16, #1, 63f\n"
- "str d11, [x13], #0x8\n"
- "str d15, [x9], #0x8\n"
- "tbz x16, #0, 70f\n"
- "st1 { v11.s }[2], [x13]\n"
- "st1 { v15.s }[2], [x9]\n"
- "b 70f\n"
- "63:" // Height 2: Partial direct writeback: partial_1_12
- "tbz x16, #0, 70f\n"
- "str s11, [x13, #0x0]\n"
- "str s15, [x9, #0x0]\n"
- "b 70f\n"
- "64:" // Height 2: Partial direct writeback: partial_2_8
- "tbz x16, #1, 65f\n"
- "str d10, [x13], #0x8\n"
- "str d14, [x9], #0x8\n"
- "tbz x16, #0, 70f\n"
- "st1 { v10.s }[2], [x13]\n"
- "st1 { v14.s }[2], [x9]\n"
- "b 70f\n"
- "65:" // Height 2: Partial direct writeback: partial_1_8
- "tbz x16, #0, 70f\n"
- "str s10, [x13, #0x0]\n"
- "str s14, [x9, #0x0]\n"
- "b 70f\n"
- "66:" // Height 2: Partial direct writeback: partial_4_0
- "tbz x16, #2, 68f\n"
- "st1 { v8.4s }, [x13], #0x10\n"
- "st1 { v12.4s }, [x9], #0x10\n"
- "tbz x16, #1, 67f\n"
- "str d9, [x13], #0x8\n"
- "str d13, [x9], #0x8\n"
- "tbz x16, #0, 70f\n"
- "st1 { v9.s }[2], [x13]\n"
- "st1 { v13.s }[2], [x9]\n"
- "b 70f\n"
- "67:" // Height 2: Partial direct writeback: partial_1_4
- "tbz x16, #0, 70f\n"
- "str s9, [x13, #0x0]\n"
- "str s13, [x9, #0x0]\n"
- "b 70f\n"
- "68:" // Height 2: Partial direct writeback: partial_2_0
- "tbz x16, #1, 69f\n"
- "str d8, [x13], #0x8\n"
- "str d12, [x9], #0x8\n"
- "tbz x16, #0, 70f\n"
- "st1 { v8.s }[2], [x13]\n"
- "st1 { v12.s }[2], [x9]\n"
+ "60:" // Height 2: No activation
+ "cmp x11, #0x10\n"
+ "bge 69f\n"
+ "tbz x11, #3, 64f\n"
+ "st1 { v8.4s }, [x28], #0x10\n"
+ "st1 { v9.4s }, [x28], #0x10\n"
+ "st1 { v12.4s }, [x24], #0x10\n"
+ "st1 { v13.4s }, [x24], #0x10\n"
+ "tbz x11, #2, 62f\n"
+ "st1 { v10.4s }, [x28], #0x10\n"
+ "st1 { v14.4s }, [x24], #0x10\n"
+ "tbz x11, #1, 61f\n"
+ "str d11, [x28], #0x8\n"
+ "str d15, [x24], #0x8\n"
+ "tbz x11, #0, 68f\n"
+ "st1 { v11.s }[2], [x28]\n"
+ "st1 { v15.s }[2], [x24]\n"
+ "b 68f\n"
+ "61:" // Height 2: Partial direct writeback: partial_1_12
+ "tbz x11, #0, 68f\n"
+ "str s11, [x28, #0x0]\n"
+ "str s15, [x24, #0x0]\n"
+ "b 68f\n"
+ "62:" // Height 2: Partial direct writeback: partial_2_8
+ "tbz x11, #1, 63f\n"
+ "str d10, [x28], #0x8\n"
+ "str d14, [x24], #0x8\n"
+ "tbz x11, #0, 68f\n"
+ "st1 { v10.s }[2], [x28]\n"
+ "st1 { v14.s }[2], [x24]\n"
+ "b 68f\n"
+ "63:" // Height 2: Partial direct writeback: partial_1_8
+ "tbz x11, #0, 68f\n"
+ "str s10, [x28, #0x0]\n"
+ "str s14, [x24, #0x0]\n"
+ "b 68f\n"
+ "64:" // Height 2: Partial direct writeback: partial_4_0
+ "tbz x11, #2, 66f\n"
+ "st1 { v8.4s }, [x28], #0x10\n"
+ "st1 { v12.4s }, [x24], #0x10\n"
+ "tbz x11, #1, 65f\n"
+ "str d9, [x28], #0x8\n"
+ "str d13, [x24], #0x8\n"
+ "tbz x11, #0, 68f\n"
+ "st1 { v9.s }[2], [x28]\n"
+ "st1 { v13.s }[2], [x24]\n"
+ "b 68f\n"
+ "65:" // Height 2: Partial direct writeback: partial_1_4
+ "tbz x11, #0, 68f\n"
+ "str s9, [x28, #0x0]\n"
+ "str s13, [x24, #0x0]\n"
+ "b 68f\n"
+ "66:" // Height 2: Partial direct writeback: partial_2_0
+ "tbz x11, #1, 67f\n"
+ "str d8, [x28], #0x8\n"
+ "str d12, [x24], #0x8\n"
+ "tbz x11, #0, 68f\n"
+ "st1 { v8.s }[2], [x28]\n"
+ "st1 { v12.s }[2], [x24]\n"
+ "b 68f\n"
+ "67:" // Height 2: Partial direct writeback: partial_1_0
+ "str s8, [x28, #0x0]\n"
+ "str s12, [x24, #0x0]\n"
+ "68:" // Height 2: Partial direct writeback: Done
"b 70f\n"
- "69:" // Height 2: Partial direct writeback: partial_1_0
- "str s8, [x13, #0x0]\n"
- "str s12, [x9, #0x0]\n"
- "70:" // Height 2: Partial direct writeback: Done
- "b 72f\n"
- "71:" // Height 2: Full writeback
- "str q8, [x13, #0x0]\n"
- "str q9, [x13, #0x10]\n"
- "str q10, [x13, #0x20]\n"
- "str q11, [x13, #0x30]\n"
- "str q12, [x9, #0x0]\n"
- "str q13, [x9, #0x10]\n"
- "str q14, [x9, #0x20]\n"
- "str q15, [x9, #0x30]\n"
- "add x13, x13, #0x40\n"
- "add x9, x9, #0x40\n"
- "72:" // Height 2: Writeback done
- "subs x16, x16, #0x10\n"
- "bgt 39b\n"
- "b 218f\n"
- "73:" // Height 3
- "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x14, %x[bias]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "tbz %x[flags], #2, 74f\n"
- "ldr x13, [%x[output_ptr], #0x0]\n"
- "add x13, x13, x19, LSL #2\n"
- "ldr x9, [%x[output_ptr], #0x8]\n"
- "ldr x27, [%x[output_ptr], #0x10]\n"
- "add x9, x9, x19, LSL #2\n"
- "add x27, x27, x19, LSL #2\n"
- "b 75f\n"
- "74:" // Height 3: setup direct output
- "mov x13, %x[output_ptr]\n"
- "add x9, x13, x19, LSL #2\n"
- "add x27, x9, x19, LSL #2\n"
- "75:" // Height 3: Column loop
- "cbz x14, 76f\n"
- "ldr q8, [x14, #0x0]\n"
+ "69:" // Height 2: Full writeback
+ "str q8, [x28, #0x0]\n"
+ "str q9, [x28, #0x10]\n"
+ "str q10, [x28, #0x20]\n"
+ "str q11, [x28, #0x30]\n"
+ "add x28, x28, #0x40\n"
+ "str q12, [x24, #0x0]\n"
+ "str q13, [x24, #0x10]\n"
+ "str q14, [x24, #0x20]\n"
+ "str q15, [x24, #0x30]\n"
+ "70:" // Height 2: Writeback done
+ "subs x11, x11, #0x10\n"
+ "bgt 37b\n"
+ "b 212f\n"
+ "71:" // Height 3
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x9, %x[bias]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "72:" // Height 3: Column loop
+ "cbz x9, 73f\n"
+ "ldr q8, [x9, #0x0]\n"
"mov v12.16b, v8.16b\n"
- "ldr q9, [x14, #0x10]\n"
+ "ldr q9, [x9, #0x10]\n"
"mov v16.16b, v8.16b\n"
- "ldr q10, [x14, #0x20]\n"
- "ldr q11, [x14, #0x30]\n"
+ "ldr q10, [x9, #0x20]\n"
+ "ldr q11, [x9, #0x30]\n"
"mov v13.16b, v9.16b\n"
- "add x14, x14, #0x40\n"
+ "add x9, x9, #0x40\n"
"mov v17.16b, v9.16b\n"
"mov v14.16b, v10.16b\n"
"mov v15.16b, v11.16b\n"
"mov v18.16b, v10.16b\n"
"mov v19.16b, v11.16b\n"
- "b 87f\n"
- "76:" // Height 3: no bias
- "tbz %x[flags], #0, 86f\n"
- "cmp x16, #0x10\n"
- "bge 85f\n"
- "tbz x16, #3, 80f\n"
- "ld1 { v8.4s }, [x13], #0x10\n"
- "ld1 { v12.4s }, [x9], #0x10\n"
- "ld1 { v16.4s }, [x27], #0x10\n"
- "ld1 { v9.4s }, [x13], #0x10\n"
- "ld1 { v13.4s }, [x9], #0x10\n"
- "ld1 { v17.4s }, [x27], #0x10\n"
- "tbz x16, #2, 78f\n"
- "ld1 { v10.4s }, [x13], #0x10\n"
- "ld1 { v14.4s }, [x9], #0x10\n"
- "ld1 { v18.4s }, [x27], #0x10\n"
- "tbz x16, #1, 77f\n"
- "mov x19, #0x38\n"
- "ldr d11, [x13], #0x8\n"
- "ldr d15, [x9], #0x8\n"
- "ldr d19, [x27], #0x8\n"
- "tbz x16, #0, 84f\n"
- "ld1 { v11.s }[2], [x13]\n"
- "ld1 { v15.s }[2], [x9]\n"
- "ld1 { v19.s }[2], [x27]\n"
"b 84f\n"
- "77:" // Height 3: Partial accumulate: partial_1_12
+ "73:" // Height 3: no bias
+ "tbz %x[flags], #0, 83f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x11, #0x10\n"
+ "add x24, x28, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "bge 82f\n"
+ "tbz x11, #3, 77f\n"
+ "ld1 { v8.4s }, [x28], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "ld1 { v9.4s }, [x28], #0x10\n"
+ "ld1 { v13.4s }, [x24], #0x10\n"
+ "ld1 { v17.4s }, [x23], #0x10\n"
+ "tbz x11, #2, 75f\n"
+ "ld1 { v10.4s }, [x28], #0x10\n"
+ "ld1 { v14.4s }, [x24], #0x10\n"
+ "ld1 { v18.4s }, [x23], #0x10\n"
+ "tbz x11, #1, 74f\n"
+ "mov x19, #0x38\n"
+ "ldr d11, [x28], #0x8\n"
+ "ldr d15, [x24], #0x8\n"
+ "ldr d19, [x23], #0x8\n"
+ "tbz x11, #0, 81f\n"
+ "ld1 { v11.s }[2], [x28]\n"
+ "ld1 { v15.s }[2], [x24]\n"
+ "ld1 { v19.s }[2], [x23]\n"
+ "b 81f\n"
+ "74:" // Height 3: Partial accumulate: partial_1_12
"mov x19, #0x30\n"
- "tbz x16, #0, 84f\n"
- "ldr s11, [x13, #0x0]\n"
- "ldr s15, [x9, #0x0]\n"
- "ldr s19, [x27, #0x0]\n"
- "b 84f\n"
- "78:" // Height 3: Partial accumulate: partial_2_8
- "tbz x16, #1, 79f\n"
- "ldr d10, [x13], #0x8\n"
- "ldr d14, [x9], #0x8\n"
- "ldr d18, [x27], #0x8\n"
+ "tbz x11, #0, 81f\n"
+ "ldr s11, [x28, #0x0]\n"
+ "ldr s15, [x24, #0x0]\n"
+ "ldr s19, [x23, #0x0]\n"
+ "b 81f\n"
+ "75:" // Height 3: Partial accumulate: partial_2_8
+ "tbz x11, #1, 76f\n"
+ "ldr d10, [x28], #0x8\n"
+ "ldr d14, [x24], #0x8\n"
"mov x19, #0x28\n"
- "tbz x16, #0, 84f\n"
- "ld1 { v10.s }[2], [x13]\n"
- "ld1 { v14.s }[2], [x9]\n"
- "ld1 { v18.s }[2], [x27]\n"
- "b 84f\n"
- "79:" // Height 3: Partial accumulate: partial_1_8
+ "ldr d18, [x23], #0x8\n"
+ "tbz x11, #0, 81f\n"
+ "ld1 { v10.s }[2], [x28]\n"
+ "ld1 { v14.s }[2], [x24]\n"
+ "ld1 { v18.s }[2], [x23]\n"
+ "b 81f\n"
+ "76:" // Height 3: Partial accumulate: partial_1_8
"mov x19, #0x20\n"
- "tbz x16, #0, 84f\n"
- "ldr s10, [x13, #0x0]\n"
- "ldr s14, [x9, #0x0]\n"
- "ldr s18, [x27, #0x0]\n"
- "b 84f\n"
- "80:" // Height 3: Partial accumulate: partial_4_0
- "tbz x16, #2, 82f\n"
- "ld1 { v8.4s }, [x13], #0x10\n"
- "ld1 { v12.4s }, [x9], #0x10\n"
- "ld1 { v16.4s }, [x27], #0x10\n"
- "tbz x16, #1, 81f\n"
+ "tbz x11, #0, 81f\n"
+ "ldr s10, [x28, #0x0]\n"
+ "ldr s14, [x24, #0x0]\n"
+ "ldr s18, [x23, #0x0]\n"
+ "b 81f\n"
+ "77:" // Height 3: Partial accumulate: partial_4_0
+ "tbz x11, #2, 79f\n"
+ "ld1 { v8.4s }, [x28], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "tbz x11, #1, 78f\n"
"mov x19, #0x18\n"
- "ldr d9, [x13], #0x8\n"
- "ldr d13, [x9], #0x8\n"
- "ldr d17, [x27], #0x8\n"
- "tbz x16, #0, 84f\n"
- "ld1 { v9.s }[2], [x13]\n"
- "ld1 { v13.s }[2], [x9]\n"
- "ld1 { v17.s }[2], [x27]\n"
- "b 84f\n"
- "81:" // Height 3: Partial accumulate: partial_1_4
+ "ldr d9, [x28], #0x8\n"
+ "ldr d13, [x24], #0x8\n"
+ "ldr d17, [x23], #0x8\n"
+ "tbz x11, #0, 81f\n"
+ "ld1 { v9.s }[2], [x28]\n"
+ "ld1 { v13.s }[2], [x24]\n"
+ "ld1 { v17.s }[2], [x23]\n"
+ "b 81f\n"
+ "78:" // Height 3: Partial accumulate: partial_1_4
"mov x19, #0x10\n"
- "tbz x16, #0, 84f\n"
- "ldr s9, [x13, #0x0]\n"
- "ldr s13, [x9, #0x0]\n"
- "ldr s17, [x27, #0x0]\n"
- "b 84f\n"
- "82:" // Height 3: Partial accumulate: partial_2_0
- "tbz x16, #1, 83f\n"
- "ldr d8, [x13], #0x8\n"
- "ldr d12, [x9], #0x8\n"
- "ldr d16, [x27], #0x8\n"
+ "tbz x11, #0, 81f\n"
+ "ldr s9, [x28, #0x0]\n"
+ "ldr s13, [x24, #0x0]\n"
+ "ldr s17, [x23, #0x0]\n"
+ "b 81f\n"
+ "79:" // Height 3: Partial accumulate: partial_2_0
+ "tbz x11, #1, 80f\n"
+ "ldr d8, [x28], #0x8\n"
+ "ldr d12, [x24], #0x8\n"
"mov x19, #0x8\n"
- "tbz x16, #0, 84f\n"
- "ld1 { v8.s }[2], [x13]\n"
- "ld1 { v12.s }[2], [x9]\n"
- "ld1 { v16.s }[2], [x27]\n"
- "b 84f\n"
- "83:" // Height 3: Partial accumulate: partial_1_0
+ "ldr d16, [x23], #0x8\n"
+ "tbz x11, #0, 81f\n"
+ "ld1 { v8.s }[2], [x28]\n"
+ "ld1 { v12.s }[2], [x24]\n"
+ "ld1 { v16.s }[2], [x23]\n"
+ "b 81f\n"
+ "80:" // Height 3: Partial accumulate: partial_1_0
+ "ldr s8, [x28, #0x0]\n"
"mov x19, #0x0\n"
- "ldr s8, [x13, #0x0]\n"
- "ldr s12, [x9, #0x0]\n"
- "ldr s16, [x27, #0x0]\n"
- "84:" // Height 3: Partial accumulate: Done
- "sub x13, x13, x19\n"
- "sub x9, x9, x19\n"
- "sub x27, x27, x19\n"
- "b 87f\n"
- "85:" // Height 3: full accumulate
- "ldr q8, [x13, #0x0]\n"
- "ldr q9, [x13, #0x10]\n"
- "ldr q10, [x13, #0x20]\n"
- "ldr q11, [x13, #0x30]\n"
- "ldr q12, [x9, #0x0]\n"
- "ldr q13, [x9, #0x10]\n"
- "ldr q14, [x9, #0x20]\n"
- "ldr q15, [x9, #0x30]\n"
- "ldr q16, [x27, #0x0]\n"
- "ldr q17, [x27, #0x10]\n"
- "ldr q18, [x27, #0x20]\n"
- "ldr q19, [x27, #0x30]\n"
- "b 87f\n"
- "86:" // Height 3: no accumulate
+ "ldr s12, [x24, #0x0]\n"
+ "ldr s16, [x23, #0x0]\n"
+ "81:" // Height 3: Partial accumulate: Done
+ "sub x28, x28, x19\n"
+ "b 84f\n"
+ "82:" // Height 3: full accumulate
+ "ldr q8, [x28, #0x0]\n"
+ "ldr q9, [x28, #0x10]\n"
+ "ldr q10, [x28, #0x20]\n"
+ "ldr q11, [x28, #0x30]\n"
+ "ldr q12, [x24, #0x0]\n"
+ "ldr q13, [x24, #0x10]\n"
+ "ldr q14, [x24, #0x20]\n"
+ "ldr q15, [x24, #0x30]\n"
+ "ldr q16, [x23, #0x0]\n"
+ "ldr q17, [x23, #0x10]\n"
+ "ldr q18, [x23, #0x20]\n"
+ "ldr q19, [x23, #0x30]\n"
+ "b 84f\n"
+ "83:" // Height 3: no accumulate
"movi v8.16b, #0x0\n"
"movi v9.16b, #0x0\n"
"movi v10.16b, #0x0\n"
@@ -962,247 +938,250 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"movi v17.16b, #0x0\n"
"movi v18.16b, #0x0\n"
"movi v19.16b, #0x0\n"
- "87:" // Height 3: setup done
- "mov x12, #0x0\n"
- "88:" // Height 3: String loop
+ "84:" // Height 3: setup done
+ "mov x27, #0x0\n"
+ "85:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w11, [x20, x12, LSL #0x2]\n"
- "tbz %x[flags], #3, 89f\n"
- "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 86f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x10, [x20, #0x0]\n"
- "ldr x28, [x20, #0x8]\n"
- "ldr x26, [x20, #0x10]\n"
- "cbnz x12, 90f\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "cbnz x27, 87f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x10, x10, x19, LSL #1\n"
- "add x28, x28, x19, LSL #1\n"
- "add x26, x26, x19, LSL #1\n"
- "b 90f\n"
- "89:" // Height 3: setup direct input
- "mov x10, %x[input_ptr]\n"
- "add x28, x10, x19, LSL #1\n"
- "add x26, x28, x19, LSL #1\n"
- "90:" // Height 3: input setup done
- "cmp x11, #0x8\n"
- "blt 93f\n"
- "cmp x11, #0x10\n"
- "blt 92f\n"
- "91:" // Height 3: Multiply loop: Main loop head
- "ldr q0, [x10, #0x0]\n"
- "ldr q1, [x28, #0x0]\n"
- "ldr q2, [x26, #0x0]\n"
- "ldr q6, [x15, #0x0]\n"
+ "add x25, x25, x19, LSL #1\n"
+ "add x24, x24, x19, LSL #1\n"
+ "add x23, x23, x19, LSL #1\n"
+ "b 87f\n"
+ "86:" // Height 3: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19, LSL #1\n"
+ "add x23, x24, x19, LSL #1\n"
+ "87:" // Height 3: input setup done
+ "cmp x26, #0x8\n"
+ "blt 90f\n"
+ "ldr q0, [x25, #0x0]\n"
+ "ldr q1, [x24, #0x0]\n"
+ "cmp x26, #0x10\n"
+ "ldr q2, [x23, #0x0]\n"
+ "ldr q6, [x10, #0x0]\n"
+ "blt 89f\n"
+ "88:" // Height 3: Multiply loop: Main loop head
".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
- "ldr q7, [x15, #0x10]\n"
+ "ldr q7, [x10, #0x10]\n"
+ "add x25, x25, #0x10\n"
".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
- "add x10, x10, #0x10\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x24, x24, #0x10\n"
".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n"
- "ldr q6, [x15, #0x20]\n"
+ "ldr q6, [x10, #0x20]\n"
+ "add x23, x23, #0x10\n"
".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
- "add x28, x28, #0x10\n"
- "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "sub x26, x26, #0x8\n"
".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
- "add x26, x26, #0x10\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "cmp x26, #0x10\n"
".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "ldr q7, [x15, #0x30]\n"
+ "ldr q7, [x10, #0x30]\n"
".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
- "sub x11, x11, #0x8\n"
".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
- "cmp x11, #0x10\n"
".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n"
- "ldr q6, [x15, #0x40]\n"
+ "ldr q6, [x10, #0x40]\n"
".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n"
- "ldr q7, [x15, #0x50]\n"
+ "ldr q7, [x10, #0x50]\n"
".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n"
".inst 0x4f61f0cc // bfdot v12.4s, v6.8h, v1.h[1]\n"
".inst 0x4f62f0d0 // bfdot v16.4s, v6.8h, v2.h[1]\n"
- "ldr q6, [x15, #0x60]\n"
+ "ldr q6, [x10, #0x60]\n"
".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n"
".inst 0x4f61f0ed // bfdot v13.4s, v7.8h, v1.h[1]\n"
".inst 0x4f62f0f1 // bfdot v17.4s, v7.8h, v2.h[1]\n"
- "ldr q7, [x15, #0x70]\n"
+ "ldr q7, [x10, #0x70]\n"
".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n"
".inst 0x4f61f0ce // bfdot v14.4s, v6.8h, v1.h[1]\n"
".inst 0x4f62f0d2 // bfdot v18.4s, v6.8h, v2.h[1]\n"
- "ldr q6, [x15, #0x80]\n"
+ "ldr q6, [x10, #0x80]\n"
".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n"
".inst 0x4f61f0ef // bfdot v15.4s, v7.8h, v1.h[1]\n"
".inst 0x4f62f0f3 // bfdot v19.4s, v7.8h, v2.h[1]\n"
- "ldr q7, [x15, #0x90]\n"
+ "ldr q7, [x10, #0x90]\n"
".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n"
".inst 0x4f41f8cc // bfdot v12.4s, v6.8h, v1.h[2]\n"
".inst 0x4f42f8d0 // bfdot v16.4s, v6.8h, v2.h[2]\n"
- "ldr q6, [x15, #0xa0]\n"
+ "ldr q6, [x10, #0xa0]\n"
".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n"
".inst 0x4f41f8ed // bfdot v13.4s, v7.8h, v1.h[2]\n"
".inst 0x4f42f8f1 // bfdot v17.4s, v7.8h, v2.h[2]\n"
- "ldr q7, [x15, #0xb0]\n"
+ "ldr q7, [x10, #0xb0]\n"
".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n"
".inst 0x4f41f8ce // bfdot v14.4s, v6.8h, v1.h[2]\n"
".inst 0x4f42f8d2 // bfdot v18.4s, v6.8h, v2.h[2]\n"
- "ldr q6, [x15, #0xc0]\n"
+ "ldr q6, [x10, #0xc0]\n"
".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n"
".inst 0x4f41f8ef // bfdot v15.4s, v7.8h, v1.h[2]\n"
".inst 0x4f42f8f3 // bfdot v19.4s, v7.8h, v2.h[2]\n"
- "ldr q7, [x15, #0xd0]\n"
+ "ldr q7, [x10, #0xd0]\n"
".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n"
".inst 0x4f61f8cc // bfdot v12.4s, v6.8h, v1.h[3]\n"
".inst 0x4f62f8d0 // bfdot v16.4s, v6.8h, v2.h[3]\n"
- "ldr q6, [x15, #0xe0]\n"
+ "ldr q6, [x10, #0xe0]\n"
".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n"
".inst 0x4f61f8ed // bfdot v13.4s, v7.8h, v1.h[3]\n"
".inst 0x4f62f8f1 // bfdot v17.4s, v7.8h, v2.h[3]\n"
- "ldr q7, [x15, #0xf0]\n"
- "add x15, x15, #0x100\n"
+ "ldr q7, [x10, #0xf0]\n"
+ "add x10, x10, #0x100\n"
".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n"
".inst 0x4f61f8ce // bfdot v14.4s, v6.8h, v1.h[3]\n"
".inst 0x4f62f8d2 // bfdot v18.4s, v6.8h, v2.h[3]\n"
+ "ldr q6, [x10, #0x0]\n"
".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n"
+ "ldr q0, [x25, #0x0]\n"
".inst 0x4f61f8ef // bfdot v15.4s, v7.8h, v1.h[3]\n"
+ "ldr q1, [x24, #0x0]\n"
".inst 0x4f62f8f3 // bfdot v19.4s, v7.8h, v2.h[3]\n"
- "bge 91b\n"
- "92:" // Height 3: Multiply loop: Single iteration only
- "sub x11, x11, #0x8\n"
- "ldr q0, [x10, #0x0]\n"
- "ldr q1, [x28, #0x0]\n"
- "ldr q2, [x26, #0x0]\n"
- "ldr q6, [x15, #0x0]\n"
+ "ldr q2, [x23, #0x0]\n"
+ "bge 88b\n"
+ "89:" // Height 3: Multiply loop: Single iteration only
".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
- "ldr q7, [x15, #0x10]\n"
+ "ldr q7, [x10, #0x10]\n"
+ "sub x26, x26, #0x8\n"
".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
- "add x10, x10, #0x10\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "add x25, x25, #0x10\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n"
- "ldr q6, [x15, #0x20]\n"
+ "add x24, x24, #0x10\n"
".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
- "add x28, x28, #0x10\n"
- "prfm pldl1keep, [x28, #0x80]\n"
+ "ldr q6, [x10, #0x20]\n"
+ "add x23, x23, #0x10\n"
".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
- "add x26, x26, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "ldr q7, [x15, #0x30]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "ldr q7, [x10, #0x30]\n"
".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n"
- "ldr q6, [x15, #0x40]\n"
+ "ldr q6, [x10, #0x40]\n"
".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n"
- "ldr q7, [x15, #0x50]\n"
+ "ldr q7, [x10, #0x50]\n"
".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n"
".inst 0x4f61f0cc // bfdot v12.4s, v6.8h, v1.h[1]\n"
".inst 0x4f62f0d0 // bfdot v16.4s, v6.8h, v2.h[1]\n"
- "ldr q6, [x15, #0x60]\n"
+ "ldr q6, [x10, #0x60]\n"
".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n"
".inst 0x4f61f0ed // bfdot v13.4s, v7.8h, v1.h[1]\n"
".inst 0x4f62f0f1 // bfdot v17.4s, v7.8h, v2.h[1]\n"
- "ldr q7, [x15, #0x70]\n"
+ "ldr q7, [x10, #0x70]\n"
".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n"
".inst 0x4f61f0ce // bfdot v14.4s, v6.8h, v1.h[1]\n"
".inst 0x4f62f0d2 // bfdot v18.4s, v6.8h, v2.h[1]\n"
- "ldr q6, [x15, #0x80]\n"
+ "ldr q6, [x10, #0x80]\n"
".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n"
".inst 0x4f61f0ef // bfdot v15.4s, v7.8h, v1.h[1]\n"
".inst 0x4f62f0f3 // bfdot v19.4s, v7.8h, v2.h[1]\n"
- "ldr q7, [x15, #0x90]\n"
+ "ldr q7, [x10, #0x90]\n"
".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n"
".inst 0x4f41f8cc // bfdot v12.4s, v6.8h, v1.h[2]\n"
".inst 0x4f42f8d0 // bfdot v16.4s, v6.8h, v2.h[2]\n"
- "ldr q6, [x15, #0xa0]\n"
+ "ldr q6, [x10, #0xa0]\n"
".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n"
".inst 0x4f41f8ed // bfdot v13.4s, v7.8h, v1.h[2]\n"
".inst 0x4f42f8f1 // bfdot v17.4s, v7.8h, v2.h[2]\n"
- "ldr q7, [x15, #0xb0]\n"
+ "ldr q7, [x10, #0xb0]\n"
".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n"
".inst 0x4f41f8ce // bfdot v14.4s, v6.8h, v1.h[2]\n"
".inst 0x4f42f8d2 // bfdot v18.4s, v6.8h, v2.h[2]\n"
- "ldr q6, [x15, #0xc0]\n"
+ "ldr q6, [x10, #0xc0]\n"
".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n"
".inst 0x4f41f8ef // bfdot v15.4s, v7.8h, v1.h[2]\n"
".inst 0x4f42f8f3 // bfdot v19.4s, v7.8h, v2.h[2]\n"
- "ldr q7, [x15, #0xd0]\n"
+ "ldr q7, [x10, #0xd0]\n"
".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n"
".inst 0x4f61f8cc // bfdot v12.4s, v6.8h, v1.h[3]\n"
".inst 0x4f62f8d0 // bfdot v16.4s, v6.8h, v2.h[3]\n"
- "ldr q6, [x15, #0xe0]\n"
+ "ldr q6, [x10, #0xe0]\n"
".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n"
".inst 0x4f61f8ed // bfdot v13.4s, v7.8h, v1.h[3]\n"
".inst 0x4f62f8f1 // bfdot v17.4s, v7.8h, v2.h[3]\n"
- "ldr q7, [x15, #0xf0]\n"
- "add x15, x15, #0x100\n"
+ "ldr q7, [x10, #0xf0]\n"
+ "add x10, x10, #0x100\n"
".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n"
".inst 0x4f61f8ce // bfdot v14.4s, v6.8h, v1.h[3]\n"
".inst 0x4f62f8d2 // bfdot v18.4s, v6.8h, v2.h[3]\n"
".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n"
".inst 0x4f61f8ef // bfdot v15.4s, v7.8h, v1.h[3]\n"
".inst 0x4f62f8f3 // bfdot v19.4s, v7.8h, v2.h[3]\n"
- "93:" // Height 3: Multiply loop: Main loop skip
- "cbz x11, 97f\n"
- "cmp x11, #0x2\n"
- "blt 95f\n"
- "94:" // Height 3: Multiply loop: Odd block loop
- "ldr s0, [x10], #0x4\n"
- "ldr s1, [x28], #0x4\n"
- "ldr s2, [x26], #0x4\n"
- "ldr q6, [x15, #0x0]\n"
+ "90:" // Height 3: Multiply loop: Main loop skip
+ "cbz x26, 94f\n"
+ "cmp x26, #0x2\n"
+ "blt 92f\n"
+ "91:" // Height 3: Multiply loop: Odd block loop
+ "ldr s0, [x25], #0x4\n"
+ "sub x26, x26, #0x2\n"
+ "ldr s1, [x24], #0x4\n"
+ "cmp x26, #0x2\n"
+ "ldr s2, [x23], #0x4\n"
+ "ldr q6, [x10, #0x0]\n"
".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
- "ldr q7, [x15, #0x10]\n"
+ "ldr q7, [x10, #0x10]\n"
".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
- "sub x11, x11, #0x2\n"
".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n"
- "ldr q6, [x15, #0x20]\n"
- "cmp x11, #0x2\n"
+ "ldr q6, [x10, #0x20]\n"
".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n"
- "ldr q7, [x15, #0x30]\n"
+ "ldr q7, [x10, #0x30]\n"
+ "add x10, x10, #0x40\n"
".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
- "add x15, x15, #0x40\n"
".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n"
".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n"
- "bge 94b\n"
- "cbz x11, 97f\n"
- "95:" // Height 3: Multiply loop: Skip odd blocks
- "ldr h0, [x10, #0x0]\n"
- "ldr h1, [x28, #0x0]\n"
- "ldr h2, [x26, #0x0]\n"
- "96:" // Height 3: Multiply loop: Ragged operand read: Done
- "ldr q6, [x15, #0x0]\n"
+ "bge 91b\n"
+ "cbz x26, 94f\n"
+ "92:" // Height 3: Multiply loop: Skip odd blocks
+ "ldr h0, [x25, #0x0]\n"
+ "ldr h1, [x24, #0x0]\n"
+ "ldr h2, [x23, #0x0]\n"
+ "93:" // Height 3: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x10, #0x0]\n"
".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
- "ldr q7, [x15, #0x10]\n"
+ "ldr q7, [x10, #0x10]\n"
".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n"
- "ldr q6, [x15, #0x20]\n"
+ "ldr q6, [x10, #0x20]\n"
".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n"
- "ldr q7, [x15, #0x30]\n"
+ "ldr q7, [x10, #0x30]\n"
+ "add x10, x10, #0x40\n"
".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
- "add x15, x15, #0x40\n"
".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n"
".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n"
- "97:" // Height 3: Multiply loop: No odd multiplies
+ "94:" // Height 3: Multiply loop: No odd multiplies
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x12, x12, #0x1\n"
- "cmp x12, x19\n"
- "bne 88b\n"
- "prfm pstl1keep, [x13, #0x0]\n"
- "prfm pstl1keep, [x9, #0x0]\n"
- "prfm pstl1keep, [x27, #0x0]\n"
- "tbz %x[flags], #1, 98f\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 85b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "prfm pstl1keep, [x28, #0x0]\n"
+ "add x24, x28, x19, LSL #2\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "add x23, x24, x19, LSL #2\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "tbz %x[flags], #1, 95f\n"
"add x19, %x[args_ptr], %[offset_min]\n"
"ld1r { v1.4s }, [x19]\n"
"add x19, %x[args_ptr], %[offset_max]\n"
@@ -1231,137 +1210,120 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"fmin v19.4s, v19.4s, v0.4s\n"
"fmax v18.4s, v18.4s, v1.4s\n"
"fmax v19.4s, v19.4s, v1.4s\n"
- "98:" // Height 3: No activation
- "cmp x16, #0x10\n"
- "bge 107f\n"
- "tbz x16, #3, 102f\n"
- "st1 { v8.4s }, [x13], #0x10\n"
- "st1 { v9.4s }, [x13], #0x10\n"
- "st1 { v12.4s }, [x9], #0x10\n"
- "st1 { v13.4s }, [x9], #0x10\n"
- "st1 { v16.4s }, [x27], #0x10\n"
- "st1 { v17.4s }, [x27], #0x10\n"
- "tbz x16, #2, 100f\n"
- "st1 { v10.4s }, [x13], #0x10\n"
- "st1 { v14.4s }, [x9], #0x10\n"
- "st1 { v18.4s }, [x27], #0x10\n"
- "tbz x16, #1, 99f\n"
- "str d11, [x13], #0x8\n"
- "str d15, [x9], #0x8\n"
- "str d19, [x27], #0x8\n"
- "tbz x16, #0, 106f\n"
- "st1 { v11.s }[2], [x13]\n"
- "st1 { v15.s }[2], [x9]\n"
- "st1 { v19.s }[2], [x27]\n"
- "b 106f\n"
- "99:" // Height 3: Partial direct writeback: partial_1_12
- "tbz x16, #0, 106f\n"
- "str s11, [x13, #0x0]\n"
- "str s15, [x9, #0x0]\n"
- "str s19, [x27, #0x0]\n"
- "b 106f\n"
- "100:" // Height 3: Partial direct writeback: partial_2_8
- "tbz x16, #1, 101f\n"
- "str d10, [x13], #0x8\n"
- "str d14, [x9], #0x8\n"
- "str d18, [x27], #0x8\n"
- "tbz x16, #0, 106f\n"
- "st1 { v10.s }[2], [x13]\n"
- "st1 { v14.s }[2], [x9]\n"
- "st1 { v18.s }[2], [x27]\n"
- "b 106f\n"
- "101:" // Height 3: Partial direct writeback: partial_1_8
- "tbz x16, #0, 106f\n"
- "str s10, [x13, #0x0]\n"
- "str s14, [x9, #0x0]\n"
- "str s18, [x27, #0x0]\n"
- "b 106f\n"
- "102:" // Height 3: Partial direct writeback: partial_4_0
- "tbz x16, #2, 104f\n"
- "st1 { v8.4s }, [x13], #0x10\n"
- "st1 { v12.4s }, [x9], #0x10\n"
- "st1 { v16.4s }, [x27], #0x10\n"
- "tbz x16, #1, 103f\n"
- "str d9, [x13], #0x8\n"
- "str d13, [x9], #0x8\n"
- "str d17, [x27], #0x8\n"
- "tbz x16, #0, 106f\n"
- "st1 { v9.s }[2], [x13]\n"
- "st1 { v13.s }[2], [x9]\n"
- "st1 { v17.s }[2], [x27]\n"
- "b 106f\n"
- "103:" // Height 3: Partial direct writeback: partial_1_4
- "tbz x16, #0, 106f\n"
- "str s9, [x13, #0x0]\n"
- "str s13, [x9, #0x0]\n"
- "str s17, [x27, #0x0]\n"
- "b 106f\n"
- "104:" // Height 3: Partial direct writeback: partial_2_0
- "tbz x16, #1, 105f\n"
- "str d8, [x13], #0x8\n"
- "str d12, [x9], #0x8\n"
- "str d16, [x27], #0x8\n"
- "tbz x16, #0, 106f\n"
- "st1 { v8.s }[2], [x13]\n"
- "st1 { v12.s }[2], [x9]\n"
- "st1 { v16.s }[2], [x27]\n"
- "b 106f\n"
- "105:" // Height 3: Partial direct writeback: partial_1_0
- "str s8, [x13, #0x0]\n"
- "str s12, [x9, #0x0]\n"
- "str s16, [x27, #0x0]\n"
- "106:" // Height 3: Partial direct writeback: Done
- "b 108f\n"
- "107:" // Height 3: Full writeback
- "str q8, [x13, #0x0]\n"
- "str q9, [x13, #0x10]\n"
- "str q10, [x13, #0x20]\n"
- "str q11, [x13, #0x30]\n"
- "str q12, [x9, #0x0]\n"
- "str q13, [x9, #0x10]\n"
- "str q14, [x9, #0x20]\n"
- "str q15, [x9, #0x30]\n"
- "str q16, [x27, #0x0]\n"
- "str q17, [x27, #0x10]\n"
- "str q18, [x27, #0x20]\n"
- "str q19, [x27, #0x30]\n"
- "add x13, x13, #0x40\n"
- "add x9, x9, #0x40\n"
- "add x27, x27, #0x40\n"
- "108:" // Height 3: Writeback done
- "subs x16, x16, #0x10\n"
- "bgt 75b\n"
- "b 218f\n"
- "109:" // Height 4
- "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x14, %x[bias]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "tbz %x[flags], #2, 110f\n"
- "ldr x13, [%x[output_ptr], #0x0]\n"
- "add x13, x13, x19, LSL #2\n"
- "ldr x9, [%x[output_ptr], #0x8]\n"
- "ldr x27, [%x[output_ptr], #0x10]\n"
- "add x9, x9, x19, LSL #2\n"
- "ldr x25, [%x[output_ptr], #0x18]\n"
- "add x27, x27, x19, LSL #2\n"
- "add x25, x25, x19, LSL #2\n"
- "b 111f\n"
- "110:" // Height 4: setup direct output
- "mov x13, %x[output_ptr]\n"
- "add x9, x13, x19, LSL #2\n"
- "add x27, x9, x19, LSL #2\n"
- "add x25, x27, x19, LSL #2\n"
- "111:" // Height 4: Column loop
- "cbz x14, 112f\n"
- "ldr q8, [x14, #0x0]\n"
+ "95:" // Height 3: No activation
+ "cmp x11, #0x10\n"
+ "bge 104f\n"
+ "tbz x11, #3, 99f\n"
+ "st1 { v8.4s }, [x28], #0x10\n"
+ "st1 { v9.4s }, [x28], #0x10\n"
+ "st1 { v12.4s }, [x24], #0x10\n"
+ "st1 { v13.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v17.4s }, [x23], #0x10\n"
+ "tbz x11, #2, 97f\n"
+ "st1 { v10.4s }, [x28], #0x10\n"
+ "st1 { v14.4s }, [x24], #0x10\n"
+ "st1 { v18.4s }, [x23], #0x10\n"
+ "tbz x11, #1, 96f\n"
+ "str d11, [x28], #0x8\n"
+ "str d15, [x24], #0x8\n"
+ "str d19, [x23], #0x8\n"
+ "tbz x11, #0, 103f\n"
+ "st1 { v11.s }[2], [x28]\n"
+ "st1 { v15.s }[2], [x24]\n"
+ "st1 { v19.s }[2], [x23]\n"
+ "b 103f\n"
+ "96:" // Height 3: Partial direct writeback: partial_1_12
+ "tbz x11, #0, 103f\n"
+ "str s11, [x28, #0x0]\n"
+ "str s15, [x24, #0x0]\n"
+ "str s19, [x23, #0x0]\n"
+ "b 103f\n"
+ "97:" // Height 3: Partial direct writeback: partial_2_8
+ "tbz x11, #1, 98f\n"
+ "str d10, [x28], #0x8\n"
+ "str d14, [x24], #0x8\n"
+ "str d18, [x23], #0x8\n"
+ "tbz x11, #0, 103f\n"
+ "st1 { v10.s }[2], [x28]\n"
+ "st1 { v14.s }[2], [x24]\n"
+ "st1 { v18.s }[2], [x23]\n"
+ "b 103f\n"
+ "98:" // Height 3: Partial direct writeback: partial_1_8
+ "tbz x11, #0, 103f\n"
+ "str s10, [x28, #0x0]\n"
+ "str s14, [x24, #0x0]\n"
+ "str s18, [x23, #0x0]\n"
+ "b 103f\n"
+ "99:" // Height 3: Partial direct writeback: partial_4_0
+ "tbz x11, #2, 101f\n"
+ "st1 { v8.4s }, [x28], #0x10\n"
+ "st1 { v12.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "tbz x11, #1, 100f\n"
+ "str d9, [x28], #0x8\n"
+ "str d13, [x24], #0x8\n"
+ "str d17, [x23], #0x8\n"
+ "tbz x11, #0, 103f\n"
+ "st1 { v9.s }[2], [x28]\n"
+ "st1 { v13.s }[2], [x24]\n"
+ "st1 { v17.s }[2], [x23]\n"
+ "b 103f\n"
+ "100:" // Height 3: Partial direct writeback: partial_1_4
+ "tbz x11, #0, 103f\n"
+ "str s9, [x28, #0x0]\n"
+ "str s13, [x24, #0x0]\n"
+ "str s17, [x23, #0x0]\n"
+ "b 103f\n"
+ "101:" // Height 3: Partial direct writeback: partial_2_0
+ "tbz x11, #1, 102f\n"
+ "str d8, [x28], #0x8\n"
+ "str d12, [x24], #0x8\n"
+ "str d16, [x23], #0x8\n"
+ "tbz x11, #0, 103f\n"
+ "st1 { v8.s }[2], [x28]\n"
+ "st1 { v12.s }[2], [x24]\n"
+ "st1 { v16.s }[2], [x23]\n"
+ "b 103f\n"
+ "102:" // Height 3: Partial direct writeback: partial_1_0
+ "str s8, [x28, #0x0]\n"
+ "str s12, [x24, #0x0]\n"
+ "str s16, [x23, #0x0]\n"
+ "103:" // Height 3: Partial direct writeback: Done
+ "b 105f\n"
+ "104:" // Height 3: Full writeback
+ "str q8, [x28, #0x0]\n"
+ "str q9, [x28, #0x10]\n"
+ "str q10, [x28, #0x20]\n"
+ "str q11, [x28, #0x30]\n"
+ "add x28, x28, #0x40\n"
+ "str q12, [x24, #0x0]\n"
+ "str q13, [x24, #0x10]\n"
+ "str q14, [x24, #0x20]\n"
+ "str q15, [x24, #0x30]\n"
+ "str q16, [x23, #0x0]\n"
+ "str q17, [x23, #0x10]\n"
+ "str q18, [x23, #0x20]\n"
+ "str q19, [x23, #0x30]\n"
+ "105:" // Height 3: Writeback done
+ "subs x11, x11, #0x10\n"
+ "bgt 72b\n"
+ "b 212f\n"
+ "106:" // Height 4
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x9, %x[bias]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "107:" // Height 4: Column loop
+ "cbz x9, 108f\n"
+ "ldr q8, [x9, #0x0]\n"
"mov v12.16b, v8.16b\n"
- "ldr q9, [x14, #0x10]\n"
+ "ldr q9, [x9, #0x10]\n"
"mov v16.16b, v8.16b\n"
- "ldr q10, [x14, #0x20]\n"
+ "ldr q10, [x9, #0x20]\n"
"mov v20.16b, v8.16b\n"
- "ldr q11, [x14, #0x30]\n"
- "add x14, x14, #0x40\n"
+ "ldr q11, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
"mov v13.16b, v9.16b\n"
"mov v17.16b, v9.16b\n"
"mov v14.16b, v10.16b\n"
@@ -1371,136 +1333,137 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"mov v21.16b, v9.16b\n"
"mov v22.16b, v10.16b\n"
"mov v23.16b, v11.16b\n"
- "b 123f\n"
- "112:" // Height 4: no bias
- "tbz %x[flags], #0, 122f\n"
- "cmp x16, #0x10\n"
- "bge 121f\n"
- "tbz x16, #3, 116f\n"
- "ld1 { v8.4s }, [x13], #0x10\n"
- "ld1 { v12.4s }, [x9], #0x10\n"
- "ld1 { v16.4s }, [x27], #0x10\n"
- "ld1 { v20.4s }, [x25], #0x10\n"
- "ld1 { v9.4s }, [x13], #0x10\n"
- "ld1 { v13.4s }, [x9], #0x10\n"
- "ld1 { v17.4s }, [x27], #0x10\n"
- "ld1 { v21.4s }, [x25], #0x10\n"
- "tbz x16, #2, 114f\n"
- "ld1 { v10.4s }, [x13], #0x10\n"
- "ld1 { v14.4s }, [x9], #0x10\n"
- "ld1 { v18.4s }, [x27], #0x10\n"
- "ld1 { v22.4s }, [x25], #0x10\n"
- "tbz x16, #1, 113f\n"
+ "b 119f\n"
+ "108:" // Height 4: no bias
+ "tbz %x[flags], #0, 118f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x11, #0x10\n"
+ "add x24, x28, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "bge 117f\n"
+ "tbz x11, #3, 112f\n"
+ "ld1 { v8.4s }, [x28], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "ld1 { v20.4s }, [x22], #0x10\n"
+ "ld1 { v9.4s }, [x28], #0x10\n"
+ "ld1 { v13.4s }, [x24], #0x10\n"
+ "ld1 { v17.4s }, [x23], #0x10\n"
+ "ld1 { v21.4s }, [x22], #0x10\n"
+ "tbz x11, #2, 110f\n"
+ "ld1 { v10.4s }, [x28], #0x10\n"
+ "ld1 { v14.4s }, [x24], #0x10\n"
+ "ld1 { v18.4s }, [x23], #0x10\n"
+ "ld1 { v22.4s }, [x22], #0x10\n"
+ "tbz x11, #1, 109f\n"
"mov x19, #0x38\n"
- "ldr d11, [x13], #0x8\n"
- "ldr d15, [x9], #0x8\n"
- "ldr d19, [x27], #0x8\n"
- "ldr d23, [x25], #0x8\n"
- "tbz x16, #0, 120f\n"
- "ld1 { v11.s }[2], [x13]\n"
- "ld1 { v15.s }[2], [x9]\n"
- "ld1 { v19.s }[2], [x27]\n"
- "ld1 { v23.s }[2], [x25]\n"
- "b 120f\n"
- "113:" // Height 4: Partial accumulate: partial_1_12
+ "ldr d11, [x28], #0x8\n"
+ "ldr d15, [x24], #0x8\n"
+ "ldr d19, [x23], #0x8\n"
+ "ldr d23, [x22], #0x8\n"
+ "tbz x11, #0, 116f\n"
+ "ld1 { v11.s }[2], [x28]\n"
+ "ld1 { v15.s }[2], [x24]\n"
+ "ld1 { v19.s }[2], [x23]\n"
+ "ld1 { v23.s }[2], [x22]\n"
+ "b 116f\n"
+ "109:" // Height 4: Partial accumulate: partial_1_12
"mov x19, #0x30\n"
- "tbz x16, #0, 120f\n"
- "ldr s11, [x13, #0x0]\n"
- "ldr s15, [x9, #0x0]\n"
- "ldr s19, [x27, #0x0]\n"
- "ldr s23, [x25, #0x0]\n"
- "b 120f\n"
- "114:" // Height 4: Partial accumulate: partial_2_8
- "tbz x16, #1, 115f\n"
- "ldr d10, [x13], #0x8\n"
- "ldr d14, [x9], #0x8\n"
- "ldr d18, [x27], #0x8\n"
- "ldr d22, [x25], #0x8\n"
+ "tbz x11, #0, 116f\n"
+ "ldr s11, [x28, #0x0]\n"
+ "ldr s15, [x24, #0x0]\n"
+ "ldr s19, [x23, #0x0]\n"
+ "ldr s23, [x22, #0x0]\n"
+ "b 116f\n"
+ "110:" // Height 4: Partial accumulate: partial_2_8
+ "tbz x11, #1, 111f\n"
+ "ldr d10, [x28], #0x8\n"
+ "ldr d14, [x24], #0x8\n"
"mov x19, #0x28\n"
- "tbz x16, #0, 120f\n"
- "ld1 { v10.s }[2], [x13]\n"
- "ld1 { v14.s }[2], [x9]\n"
- "ld1 { v18.s }[2], [x27]\n"
- "ld1 { v22.s }[2], [x25]\n"
- "b 120f\n"
- "115:" // Height 4: Partial accumulate: partial_1_8
+ "ldr d18, [x23], #0x8\n"
+ "ldr d22, [x22], #0x8\n"
+ "tbz x11, #0, 116f\n"
+ "ld1 { v10.s }[2], [x28]\n"
+ "ld1 { v14.s }[2], [x24]\n"
+ "ld1 { v18.s }[2], [x23]\n"
+ "ld1 { v22.s }[2], [x22]\n"
+ "b 116f\n"
+ "111:" // Height 4: Partial accumulate: partial_1_8
"mov x19, #0x20\n"
- "tbz x16, #0, 120f\n"
- "ldr s10, [x13, #0x0]\n"
- "ldr s14, [x9, #0x0]\n"
- "ldr s18, [x27, #0x0]\n"
- "ldr s22, [x25, #0x0]\n"
- "b 120f\n"
- "116:" // Height 4: Partial accumulate: partial_4_0
- "tbz x16, #2, 118f\n"
- "ld1 { v8.4s }, [x13], #0x10\n"
- "ld1 { v12.4s }, [x9], #0x10\n"
- "ld1 { v16.4s }, [x27], #0x10\n"
- "ld1 { v20.4s }, [x25], #0x10\n"
- "tbz x16, #1, 117f\n"
+ "tbz x11, #0, 116f\n"
+ "ldr s10, [x28, #0x0]\n"
+ "ldr s14, [x24, #0x0]\n"
+ "ldr s18, [x23, #0x0]\n"
+ "ldr s22, [x22, #0x0]\n"
+ "b 116f\n"
+ "112:" // Height 4: Partial accumulate: partial_4_0
+ "tbz x11, #2, 114f\n"
+ "ld1 { v8.4s }, [x28], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "ld1 { v20.4s }, [x22], #0x10\n"
+ "tbz x11, #1, 113f\n"
"mov x19, #0x18\n"
- "ldr d9, [x13], #0x8\n"
- "ldr d13, [x9], #0x8\n"
- "ldr d17, [x27], #0x8\n"
- "ldr d21, [x25], #0x8\n"
- "tbz x16, #0, 120f\n"
- "ld1 { v9.s }[2], [x13]\n"
- "ld1 { v13.s }[2], [x9]\n"
- "ld1 { v17.s }[2], [x27]\n"
- "ld1 { v21.s }[2], [x25]\n"
- "b 120f\n"
- "117:" // Height 4: Partial accumulate: partial_1_4
+ "ldr d9, [x28], #0x8\n"
+ "ldr d13, [x24], #0x8\n"
+ "ldr d17, [x23], #0x8\n"
+ "ldr d21, [x22], #0x8\n"
+ "tbz x11, #0, 116f\n"
+ "ld1 { v9.s }[2], [x28]\n"
+ "ld1 { v13.s }[2], [x24]\n"
+ "ld1 { v17.s }[2], [x23]\n"
+ "ld1 { v21.s }[2], [x22]\n"
+ "b 116f\n"
+ "113:" // Height 4: Partial accumulate: partial_1_4
"mov x19, #0x10\n"
- "tbz x16, #0, 120f\n"
- "ldr s9, [x13, #0x0]\n"
- "ldr s13, [x9, #0x0]\n"
- "ldr s17, [x27, #0x0]\n"
- "ldr s21, [x25, #0x0]\n"
- "b 120f\n"
- "118:" // Height 4: Partial accumulate: partial_2_0
- "tbz x16, #1, 119f\n"
- "ldr d8, [x13], #0x8\n"
- "ldr d12, [x9], #0x8\n"
- "ldr d16, [x27], #0x8\n"
- "ldr d20, [x25], #0x8\n"
+ "tbz x11, #0, 116f\n"
+ "ldr s9, [x28, #0x0]\n"
+ "ldr s13, [x24, #0x0]\n"
+ "ldr s17, [x23, #0x0]\n"
+ "ldr s21, [x22, #0x0]\n"
+ "b 116f\n"
+ "114:" // Height 4: Partial accumulate: partial_2_0
+ "tbz x11, #1, 115f\n"
+ "ldr d8, [x28], #0x8\n"
+ "ldr d12, [x24], #0x8\n"
"mov x19, #0x8\n"
- "tbz x16, #0, 120f\n"
- "ld1 { v8.s }[2], [x13]\n"
- "ld1 { v12.s }[2], [x9]\n"
- "ld1 { v16.s }[2], [x27]\n"
- "ld1 { v20.s }[2], [x25]\n"
- "b 120f\n"
- "119:" // Height 4: Partial accumulate: partial_1_0
+ "ldr d16, [x23], #0x8\n"
+ "ldr d20, [x22], #0x8\n"
+ "tbz x11, #0, 116f\n"
+ "ld1 { v8.s }[2], [x28]\n"
+ "ld1 { v12.s }[2], [x24]\n"
+ "ld1 { v16.s }[2], [x23]\n"
+ "ld1 { v20.s }[2], [x22]\n"
+ "b 116f\n"
+ "115:" // Height 4: Partial accumulate: partial_1_0
+ "ldr s8, [x28, #0x0]\n"
"mov x19, #0x0\n"
- "ldr s8, [x13, #0x0]\n"
- "ldr s12, [x9, #0x0]\n"
- "ldr s16, [x27, #0x0]\n"
- "ldr s20, [x25, #0x0]\n"
- "120:" // Height 4: Partial accumulate: Done
- "sub x13, x13, x19\n"
- "sub x9, x9, x19\n"
- "sub x27, x27, x19\n"
- "sub x25, x25, x19\n"
- "b 123f\n"
- "121:" // Height 4: full accumulate
- "ldr q8, [x13, #0x0]\n"
- "ldr q9, [x13, #0x10]\n"
- "ldr q10, [x13, #0x20]\n"
- "ldr q11, [x13, #0x30]\n"
- "ldr q12, [x9, #0x0]\n"
- "ldr q13, [x9, #0x10]\n"
- "ldr q14, [x9, #0x20]\n"
- "ldr q15, [x9, #0x30]\n"
- "ldr q16, [x27, #0x0]\n"
- "ldr q17, [x27, #0x10]\n"
- "ldr q18, [x27, #0x20]\n"
- "ldr q19, [x27, #0x30]\n"
- "ldr q20, [x25, #0x0]\n"
- "ldr q21, [x25, #0x10]\n"
- "ldr q22, [x25, #0x20]\n"
- "ldr q23, [x25, #0x30]\n"
- "b 123f\n"
- "122:" // Height 4: no accumulate
+ "ldr s12, [x24, #0x0]\n"
+ "ldr s16, [x23, #0x0]\n"
+ "ldr s20, [x22, #0x0]\n"
+ "116:" // Height 4: Partial accumulate: Done
+ "sub x28, x28, x19\n"
+ "b 119f\n"
+ "117:" // Height 4: full accumulate
+ "ldr q8, [x28, #0x0]\n"
+ "ldr q9, [x28, #0x10]\n"
+ "ldr q10, [x28, #0x20]\n"
+ "ldr q11, [x28, #0x30]\n"
+ "ldr q12, [x24, #0x0]\n"
+ "ldr q13, [x24, #0x10]\n"
+ "ldr q14, [x24, #0x20]\n"
+ "ldr q15, [x24, #0x30]\n"
+ "ldr q16, [x23, #0x0]\n"
+ "ldr q17, [x23, #0x10]\n"
+ "ldr q18, [x23, #0x20]\n"
+ "ldr q19, [x23, #0x30]\n"
+ "ldr q20, [x22, #0x0]\n"
+ "ldr q21, [x22, #0x10]\n"
+ "ldr q22, [x22, #0x20]\n"
+ "ldr q23, [x22, #0x30]\n"
+ "b 119f\n"
+ "118:" // Height 4: no accumulate
"movi v8.16b, #0x0\n"
"movi v9.16b, #0x0\n"
"movi v10.16b, #0x0\n"
@@ -1517,220 +1480,220 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"movi v21.16b, #0x0\n"
"movi v22.16b, #0x0\n"
"movi v23.16b, #0x0\n"
- "123:" // Height 4: setup done
- "mov x12, #0x0\n"
- "124:" // Height 4: String loop
+ "119:" // Height 4: setup done
+ "mov x27, #0x0\n"
+ "120:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w11, [x20, x12, LSL #0x2]\n"
- "tbz %x[flags], #3, 125f\n"
- "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 121f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x10, [x20, #0x0]\n"
- "ldr x28, [x20, #0x8]\n"
- "ldr x26, [x20, #0x10]\n"
- "ldr x24, [x20, #0x18]\n"
- "cbnz x12, 126f\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "ldr x22, [x20, #0x18]\n"
+ "cbnz x27, 122f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x10, x10, x19, LSL #1\n"
- "add x28, x28, x19, LSL #1\n"
- "add x26, x26, x19, LSL #1\n"
+ "add x25, x25, x19, LSL #1\n"
"add x24, x24, x19, LSL #1\n"
- "b 126f\n"
- "125:" // Height 4: setup direct input
- "mov x10, %x[input_ptr]\n"
- "add x28, x10, x19, LSL #1\n"
- "add x26, x28, x19, LSL #1\n"
- "add x24, x26, x19, LSL #1\n"
- "126:" // Height 4: input setup done
- "cmp x11, #0x8\n"
- "blt 129f\n"
- "cmp x11, #0x10\n"
- "blt 128f\n"
- "127:" // Height 4: Multiply loop: Main loop head
- "ldr q0, [x10, #0x0]\n"
- "ldr q1, [x28, #0x0]\n"
- "ldr q2, [x26, #0x0]\n"
- "ldr q3, [x24, #0x0]\n"
- "ldr q6, [x15, #0x0]\n"
+ "add x23, x23, x19, LSL #1\n"
+ "add x22, x22, x19, LSL #1\n"
+ "b 122f\n"
+ "121:" // Height 4: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19, LSL #1\n"
+ "add x23, x24, x19, LSL #1\n"
+ "add x22, x23, x19, LSL #1\n"
+ "122:" // Height 4: input setup done
+ "cmp x26, #0x8\n"
+ "blt 125f\n"
+ "ldr q0, [x25, #0x0]\n"
+ "ldr q1, [x24, #0x0]\n"
+ "cmp x26, #0x10\n"
+ "ldr q2, [x23, #0x0]\n"
+ "ldr q3, [x22, #0x0]\n"
+ "ldr q6, [x10, #0x0]\n"
+ "blt 124f\n"
+ "123:" // Height 4: Multiply loop: Main loop head
".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
- "ldr q7, [x15, #0x10]\n"
+ "ldr q7, [x10, #0x10]\n"
+ "add x25, x25, #0x10\n"
".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
- "add x10, x10, #0x10\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x24, x24, #0x10\n"
".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n"
- "add x28, x28, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "add x23, x23, #0x10\n"
".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "ldr q6, [x15, #0x20]\n"
+ "ldr q6, [x10, #0x20]\n"
+ "add x22, x22, #0x10\n"
".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
- "add x26, x26, #0x10\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "sub x26, x26, #0x8\n"
".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
- "add x24, x24, #0x10\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "cmp x26, #0x10\n"
".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "sub x11, x11, #0x8\n"
".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n"
- "ldr q7, [x15, #0x30]\n"
+ "ldr q7, [x10, #0x30]\n"
".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
- "cmp x11, #0x10\n"
".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n"
".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n"
- "ldr q6, [x15, #0x40]\n"
+ "ldr q6, [x10, #0x40]\n"
".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n"
".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n"
- "ldr q7, [x15, #0x50]\n"
+ "ldr q7, [x10, #0x50]\n"
".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n"
".inst 0x4f61f0cc // bfdot v12.4s, v6.8h, v1.h[1]\n"
".inst 0x4f62f0d0 // bfdot v16.4s, v6.8h, v2.h[1]\n"
".inst 0x4f63f0d4 // bfdot v20.4s, v6.8h, v3.h[1]\n"
- "ldr q6, [x15, #0x60]\n"
+ "ldr q6, [x10, #0x60]\n"
".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n"
".inst 0x4f61f0ed // bfdot v13.4s, v7.8h, v1.h[1]\n"
".inst 0x4f62f0f1 // bfdot v17.4s, v7.8h, v2.h[1]\n"
".inst 0x4f63f0f5 // bfdot v21.4s, v7.8h, v3.h[1]\n"
- "ldr q7, [x15, #0x70]\n"
+ "ldr q7, [x10, #0x70]\n"
".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n"
".inst 0x4f61f0ce // bfdot v14.4s, v6.8h, v1.h[1]\n"
".inst 0x4f62f0d2 // bfdot v18.4s, v6.8h, v2.h[1]\n"
".inst 0x4f63f0d6 // bfdot v22.4s, v6.8h, v3.h[1]\n"
- "ldr q6, [x15, #0x80]\n"
+ "ldr q6, [x10, #0x80]\n"
".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n"
".inst 0x4f61f0ef // bfdot v15.4s, v7.8h, v1.h[1]\n"
".inst 0x4f62f0f3 // bfdot v19.4s, v7.8h, v2.h[1]\n"
".inst 0x4f63f0f7 // bfdot v23.4s, v7.8h, v3.h[1]\n"
- "ldr q7, [x15, #0x90]\n"
+ "ldr q7, [x10, #0x90]\n"
".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n"
".inst 0x4f41f8cc // bfdot v12.4s, v6.8h, v1.h[2]\n"
".inst 0x4f42f8d0 // bfdot v16.4s, v6.8h, v2.h[2]\n"
".inst 0x4f43f8d4 // bfdot v20.4s, v6.8h, v3.h[2]\n"
- "ldr q6, [x15, #0xa0]\n"
+ "ldr q6, [x10, #0xa0]\n"
".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n"
".inst 0x4f41f8ed // bfdot v13.4s, v7.8h, v1.h[2]\n"
".inst 0x4f42f8f1 // bfdot v17.4s, v7.8h, v2.h[2]\n"
".inst 0x4f43f8f5 // bfdot v21.4s, v7.8h, v3.h[2]\n"
- "ldr q7, [x15, #0xb0]\n"
+ "ldr q7, [x10, #0xb0]\n"
".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n"
".inst 0x4f41f8ce // bfdot v14.4s, v6.8h, v1.h[2]\n"
".inst 0x4f42f8d2 // bfdot v18.4s, v6.8h, v2.h[2]\n"
".inst 0x4f43f8d6 // bfdot v22.4s, v6.8h, v3.h[2]\n"
- "ldr q6, [x15, #0xc0]\n"
+ "ldr q6, [x10, #0xc0]\n"
".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n"
".inst 0x4f41f8ef // bfdot v15.4s, v7.8h, v1.h[2]\n"
".inst 0x4f42f8f3 // bfdot v19.4s, v7.8h, v2.h[2]\n"
".inst 0x4f43f8f7 // bfdot v23.4s, v7.8h, v3.h[2]\n"
- "ldr q7, [x15, #0xd0]\n"
+ "ldr q7, [x10, #0xd0]\n"
".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n"
".inst 0x4f61f8cc // bfdot v12.4s, v6.8h, v1.h[3]\n"
".inst 0x4f62f8d0 // bfdot v16.4s, v6.8h, v2.h[3]\n"
".inst 0x4f63f8d4 // bfdot v20.4s, v6.8h, v3.h[3]\n"
- "ldr q6, [x15, #0xe0]\n"
+ "ldr q6, [x10, #0xe0]\n"
".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n"
".inst 0x4f61f8ed // bfdot v13.4s, v7.8h, v1.h[3]\n"
".inst 0x4f62f8f1 // bfdot v17.4s, v7.8h, v2.h[3]\n"
".inst 0x4f63f8f5 // bfdot v21.4s, v7.8h, v3.h[3]\n"
- "ldr q7, [x15, #0xf0]\n"
- "add x15, x15, #0x100\n"
+ "ldr q7, [x10, #0xf0]\n"
+ "add x10, x10, #0x100\n"
".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n"
".inst 0x4f61f8ce // bfdot v14.4s, v6.8h, v1.h[3]\n"
".inst 0x4f62f8d2 // bfdot v18.4s, v6.8h, v2.h[3]\n"
".inst 0x4f63f8d6 // bfdot v22.4s, v6.8h, v3.h[3]\n"
+ "ldr q6, [x10, #0x0]\n"
".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n"
+ "ldr q0, [x25, #0x0]\n"
".inst 0x4f61f8ef // bfdot v15.4s, v7.8h, v1.h[3]\n"
+ "ldr q1, [x24, #0x0]\n"
".inst 0x4f62f8f3 // bfdot v19.4s, v7.8h, v2.h[3]\n"
+ "ldr q2, [x23, #0x0]\n"
".inst 0x4f63f8f7 // bfdot v23.4s, v7.8h, v3.h[3]\n"
- "bge 127b\n"
- "128:" // Height 4: Multiply loop: Single iteration only
- "sub x11, x11, #0x8\n"
- "ldr q0, [x10, #0x0]\n"
- "ldr q1, [x28, #0x0]\n"
- "ldr q2, [x26, #0x0]\n"
- "ldr q3, [x24, #0x0]\n"
- "ldr q6, [x15, #0x0]\n"
+ "ldr q3, [x22, #0x0]\n"
+ "bge 123b\n"
+ "124:" // Height 4: Multiply loop: Single iteration only
".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
- "ldr q7, [x15, #0x10]\n"
+ "ldr q7, [x10, #0x10]\n"
+ "sub x26, x26, #0x8\n"
".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
- "add x10, x10, #0x10\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "add x25, x25, #0x10\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n"
- "add x28, x28, #0x10\n"
+ "add x24, x24, #0x10\n"
".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "ldr q6, [x15, #0x20]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "add x23, x23, #0x10\n"
".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
- "add x26, x26, #0x10\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "ldr q6, [x10, #0x20]\n"
+ "add x22, x22, #0x10\n"
".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
- "add x24, x24, #0x10\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n"
- "ldr q7, [x15, #0x30]\n"
+ "ldr q7, [x10, #0x30]\n"
".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n"
".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n"
- "ldr q6, [x15, #0x40]\n"
+ "ldr q6, [x10, #0x40]\n"
".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n"
".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n"
- "ldr q7, [x15, #0x50]\n"
+ "ldr q7, [x10, #0x50]\n"
".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n"
".inst 0x4f61f0cc // bfdot v12.4s, v6.8h, v1.h[1]\n"
".inst 0x4f62f0d0 // bfdot v16.4s, v6.8h, v2.h[1]\n"
".inst 0x4f63f0d4 // bfdot v20.4s, v6.8h, v3.h[1]\n"
- "ldr q6, [x15, #0x60]\n"
+ "ldr q6, [x10, #0x60]\n"
".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n"
".inst 0x4f61f0ed // bfdot v13.4s, v7.8h, v1.h[1]\n"
".inst 0x4f62f0f1 // bfdot v17.4s, v7.8h, v2.h[1]\n"
".inst 0x4f63f0f5 // bfdot v21.4s, v7.8h, v3.h[1]\n"
- "ldr q7, [x15, #0x70]\n"
+ "ldr q7, [x10, #0x70]\n"
".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n"
".inst 0x4f61f0ce // bfdot v14.4s, v6.8h, v1.h[1]\n"
".inst 0x4f62f0d2 // bfdot v18.4s, v6.8h, v2.h[1]\n"
".inst 0x4f63f0d6 // bfdot v22.4s, v6.8h, v3.h[1]\n"
- "ldr q6, [x15, #0x80]\n"
+ "ldr q6, [x10, #0x80]\n"
".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n"
".inst 0x4f61f0ef // bfdot v15.4s, v7.8h, v1.h[1]\n"
".inst 0x4f62f0f3 // bfdot v19.4s, v7.8h, v2.h[1]\n"
".inst 0x4f63f0f7 // bfdot v23.4s, v7.8h, v3.h[1]\n"
- "ldr q7, [x15, #0x90]\n"
+ "ldr q7, [x10, #0x90]\n"
".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n"
".inst 0x4f41f8cc // bfdot v12.4s, v6.8h, v1.h[2]\n"
".inst 0x4f42f8d0 // bfdot v16.4s, v6.8h, v2.h[2]\n"
".inst 0x4f43f8d4 // bfdot v20.4s, v6.8h, v3.h[2]\n"
- "ldr q6, [x15, #0xa0]\n"
+ "ldr q6, [x10, #0xa0]\n"
".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n"
".inst 0x4f41f8ed // bfdot v13.4s, v7.8h, v1.h[2]\n"
".inst 0x4f42f8f1 // bfdot v17.4s, v7.8h, v2.h[2]\n"
".inst 0x4f43f8f5 // bfdot v21.4s, v7.8h, v3.h[2]\n"
- "ldr q7, [x15, #0xb0]\n"
+ "ldr q7, [x10, #0xb0]\n"
".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n"
".inst 0x4f41f8ce // bfdot v14.4s, v6.8h, v1.h[2]\n"
".inst 0x4f42f8d2 // bfdot v18.4s, v6.8h, v2.h[2]\n"
".inst 0x4f43f8d6 // bfdot v22.4s, v6.8h, v3.h[2]\n"
- "ldr q6, [x15, #0xc0]\n"
+ "ldr q6, [x10, #0xc0]\n"
".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n"
".inst 0x4f41f8ef // bfdot v15.4s, v7.8h, v1.h[2]\n"
".inst 0x4f42f8f3 // bfdot v19.4s, v7.8h, v2.h[2]\n"
".inst 0x4f43f8f7 // bfdot v23.4s, v7.8h, v3.h[2]\n"
- "ldr q7, [x15, #0xd0]\n"
+ "ldr q7, [x10, #0xd0]\n"
".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n"
".inst 0x4f61f8cc // bfdot v12.4s, v6.8h, v1.h[3]\n"
".inst 0x4f62f8d0 // bfdot v16.4s, v6.8h, v2.h[3]\n"
".inst 0x4f63f8d4 // bfdot v20.4s, v6.8h, v3.h[3]\n"
- "ldr q6, [x15, #0xe0]\n"
+ "ldr q6, [x10, #0xe0]\n"
".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n"
".inst 0x4f61f8ed // bfdot v13.4s, v7.8h, v1.h[3]\n"
".inst 0x4f62f8f1 // bfdot v17.4s, v7.8h, v2.h[3]\n"
".inst 0x4f63f8f5 // bfdot v21.4s, v7.8h, v3.h[3]\n"
- "ldr q7, [x15, #0xf0]\n"
- "add x15, x15, #0x100\n"
+ "ldr q7, [x10, #0xf0]\n"
+ "add x10, x10, #0x100\n"
".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n"
".inst 0x4f61f8ce // bfdot v14.4s, v6.8h, v1.h[3]\n"
".inst 0x4f62f8d2 // bfdot v18.4s, v6.8h, v2.h[3]\n"
@@ -1739,31 +1702,31 @@ void a64_hybrid_bf16fp32_dot_6x16 (
".inst 0x4f61f8ef // bfdot v15.4s, v7.8h, v1.h[3]\n"
".inst 0x4f62f8f3 // bfdot v19.4s, v7.8h, v2.h[3]\n"
".inst 0x4f63f8f7 // bfdot v23.4s, v7.8h, v3.h[3]\n"
- "129:" // Height 4: Multiply loop: Main loop skip
- "cbz x11, 133f\n"
- "cmp x11, #0x2\n"
- "blt 131f\n"
- "130:" // Height 4: Multiply loop: Odd block loop
- "ldr s0, [x10], #0x4\n"
- "ldr s1, [x28], #0x4\n"
- "ldr s2, [x26], #0x4\n"
- "ldr s3, [x24], #0x4\n"
- "ldr q6, [x15, #0x0]\n"
+ "125:" // Height 4: Multiply loop: Main loop skip
+ "cbz x26, 129f\n"
+ "cmp x26, #0x2\n"
+ "blt 127f\n"
+ "126:" // Height 4: Multiply loop: Odd block loop
+ "ldr s0, [x25], #0x4\n"
+ "sub x26, x26, #0x2\n"
+ "ldr s1, [x24], #0x4\n"
+ "cmp x26, #0x2\n"
+ "ldr s2, [x23], #0x4\n"
+ "ldr s3, [x22], #0x4\n"
+ "ldr q6, [x10, #0x0]\n"
".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
- "ldr q7, [x15, #0x10]\n"
+ "ldr q7, [x10, #0x10]\n"
".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
- "sub x11, x11, #0x2\n"
".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n"
- "cmp x11, #0x2\n"
".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n"
- "ldr q6, [x15, #0x20]\n"
+ "ldr q6, [x10, #0x20]\n"
".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n"
".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n"
- "ldr q7, [x15, #0x30]\n"
+ "ldr q7, [x10, #0x30]\n"
+ "add x10, x10, #0x40\n"
".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
- "add x15, x15, #0x40\n"
".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n"
".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n"
@@ -1771,28 +1734,28 @@ void a64_hybrid_bf16fp32_dot_6x16 (
".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n"
".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n"
- "bge 130b\n"
- "cbz x11, 133f\n"
- "131:" // Height 4: Multiply loop: Skip odd blocks
- "ldr h0, [x10, #0x0]\n"
- "ldr h1, [x28, #0x0]\n"
- "ldr h2, [x26, #0x0]\n"
- "ldr h3, [x24, #0x0]\n"
- "132:" // Height 4: Multiply loop: Ragged operand read: Done
- "ldr q6, [x15, #0x0]\n"
+ "bge 126b\n"
+ "cbz x26, 129f\n"
+ "127:" // Height 4: Multiply loop: Skip odd blocks
+ "ldr h0, [x25, #0x0]\n"
+ "ldr h1, [x24, #0x0]\n"
+ "ldr h2, [x23, #0x0]\n"
+ "ldr h3, [x22, #0x0]\n"
+ "128:" // Height 4: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x10, #0x0]\n"
".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
- "ldr q7, [x15, #0x10]\n"
+ "ldr q7, [x10, #0x10]\n"
".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n"
".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n"
- "ldr q6, [x15, #0x20]\n"
+ "ldr q6, [x10, #0x20]\n"
".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n"
".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n"
- "ldr q7, [x15, #0x30]\n"
+ "ldr q7, [x10, #0x30]\n"
+ "add x10, x10, #0x40\n"
".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
- "add x15, x15, #0x40\n"
".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n"
".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n"
@@ -1800,16 +1763,20 @@ void a64_hybrid_bf16fp32_dot_6x16 (
".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n"
".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n"
- "133:" // Height 4: Multiply loop: No odd multiplies
+ "129:" // Height 4: Multiply loop: No odd multiplies
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x12, x12, #0x1\n"
- "cmp x12, x19\n"
- "bne 124b\n"
- "prfm pstl1keep, [x13, #0x0]\n"
- "prfm pstl1keep, [x9, #0x0]\n"
- "prfm pstl1keep, [x27, #0x0]\n"
- "prfm pstl1keep, [x25, #0x0]\n"
- "tbz %x[flags], #1, 134f\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 120b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "prfm pstl1keep, [x28, #0x0]\n"
+ "add x24, x28, x19, LSL #2\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "add x23, x24, x19, LSL #2\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "add x22, x23, x19, LSL #2\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "tbz %x[flags], #1, 130f\n"
"add x19, %x[args_ptr], %[offset_min]\n"
"ld1r { v1.4s }, [x19]\n"
"add x19, %x[args_ptr], %[offset_max]\n"
@@ -1846,162 +1813,141 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"fmax v21.4s, v21.4s, v1.4s\n"
"fmax v22.4s, v22.4s, v1.4s\n"
"fmax v23.4s, v23.4s, v1.4s\n"
- "134:" // Height 4: No activation
- "cmp x16, #0x10\n"
- "bge 143f\n"
- "tbz x16, #3, 138f\n"
- "st1 { v8.4s }, [x13], #0x10\n"
- "st1 { v9.4s }, [x13], #0x10\n"
- "st1 { v12.4s }, [x9], #0x10\n"
- "st1 { v13.4s }, [x9], #0x10\n"
- "st1 { v16.4s }, [x27], #0x10\n"
- "st1 { v17.4s }, [x27], #0x10\n"
- "st1 { v20.4s }, [x25], #0x10\n"
- "st1 { v21.4s }, [x25], #0x10\n"
- "tbz x16, #2, 136f\n"
- "st1 { v10.4s }, [x13], #0x10\n"
- "st1 { v14.4s }, [x9], #0x10\n"
- "st1 { v18.4s }, [x27], #0x10\n"
- "st1 { v22.4s }, [x25], #0x10\n"
- "tbz x16, #1, 135f\n"
- "str d11, [x13], #0x8\n"
- "str d15, [x9], #0x8\n"
- "str d19, [x27], #0x8\n"
- "str d23, [x25], #0x8\n"
- "tbz x16, #0, 142f\n"
- "st1 { v11.s }[2], [x13]\n"
- "st1 { v15.s }[2], [x9]\n"
- "st1 { v19.s }[2], [x27]\n"
- "st1 { v23.s }[2], [x25]\n"
- "b 142f\n"
- "135:" // Height 4: Partial direct writeback: partial_1_12
- "tbz x16, #0, 142f\n"
- "str s11, [x13, #0x0]\n"
- "str s15, [x9, #0x0]\n"
- "str s19, [x27, #0x0]\n"
- "str s23, [x25, #0x0]\n"
- "b 142f\n"
- "136:" // Height 4: Partial direct writeback: partial_2_8
- "tbz x16, #1, 137f\n"
- "str d10, [x13], #0x8\n"
- "str d14, [x9], #0x8\n"
- "str d18, [x27], #0x8\n"
- "str d22, [x25], #0x8\n"
- "tbz x16, #0, 142f\n"
- "st1 { v10.s }[2], [x13]\n"
- "st1 { v14.s }[2], [x9]\n"
- "st1 { v18.s }[2], [x27]\n"
- "st1 { v22.s }[2], [x25]\n"
- "b 142f\n"
- "137:" // Height 4: Partial direct writeback: partial_1_8
- "tbz x16, #0, 142f\n"
- "str s10, [x13, #0x0]\n"
- "str s14, [x9, #0x0]\n"
- "str s18, [x27, #0x0]\n"
- "str s22, [x25, #0x0]\n"
- "b 142f\n"
- "138:" // Height 4: Partial direct writeback: partial_4_0
- "tbz x16, #2, 140f\n"
- "st1 { v8.4s }, [x13], #0x10\n"
- "st1 { v12.4s }, [x9], #0x10\n"
- "st1 { v16.4s }, [x27], #0x10\n"
- "st1 { v20.4s }, [x25], #0x10\n"
- "tbz x16, #1, 139f\n"
- "str d9, [x13], #0x8\n"
- "str d13, [x9], #0x8\n"
- "str d17, [x27], #0x8\n"
- "str d21, [x25], #0x8\n"
- "tbz x16, #0, 142f\n"
- "st1 { v9.s }[2], [x13]\n"
- "st1 { v13.s }[2], [x9]\n"
- "st1 { v17.s }[2], [x27]\n"
- "st1 { v21.s }[2], [x25]\n"
- "b 142f\n"
- "139:" // Height 4: Partial direct writeback: partial_1_4
- "tbz x16, #0, 142f\n"
- "str s9, [x13, #0x0]\n"
- "str s13, [x9, #0x0]\n"
- "str s17, [x27, #0x0]\n"
- "str s21, [x25, #0x0]\n"
- "b 142f\n"
- "140:" // Height 4: Partial direct writeback: partial_2_0
- "tbz x16, #1, 141f\n"
- "str d8, [x13], #0x8\n"
- "str d12, [x9], #0x8\n"
- "str d16, [x27], #0x8\n"
- "str d20, [x25], #0x8\n"
- "tbz x16, #0, 142f\n"
- "st1 { v8.s }[2], [x13]\n"
- "st1 { v12.s }[2], [x9]\n"
- "st1 { v16.s }[2], [x27]\n"
- "st1 { v20.s }[2], [x25]\n"
- "b 142f\n"
- "141:" // Height 4: Partial direct writeback: partial_1_0
- "str s8, [x13, #0x0]\n"
- "str s12, [x9, #0x0]\n"
- "str s16, [x27, #0x0]\n"
- "str s20, [x25, #0x0]\n"
- "142:" // Height 4: Partial direct writeback: Done
- "b 144f\n"
- "143:" // Height 4: Full writeback
- "str q8, [x13, #0x0]\n"
- "str q9, [x13, #0x10]\n"
- "str q10, [x13, #0x20]\n"
- "str q11, [x13, #0x30]\n"
- "str q12, [x9, #0x0]\n"
- "str q13, [x9, #0x10]\n"
- "str q14, [x9, #0x20]\n"
- "str q15, [x9, #0x30]\n"
- "str q16, [x27, #0x0]\n"
- "str q17, [x27, #0x10]\n"
- "str q18, [x27, #0x20]\n"
- "str q19, [x27, #0x30]\n"
- "str q20, [x25, #0x0]\n"
- "str q21, [x25, #0x10]\n"
- "str q22, [x25, #0x20]\n"
- "str q23, [x25, #0x30]\n"
- "add x13, x13, #0x40\n"
- "add x9, x9, #0x40\n"
- "add x27, x27, #0x40\n"
- "add x25, x25, #0x40\n"
- "144:" // Height 4: Writeback done
- "subs x16, x16, #0x10\n"
- "bgt 111b\n"
- "b 218f\n"
- "145:" // Height 5
- "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x14, %x[bias]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "tbz %x[flags], #2, 146f\n"
- "ldr x13, [%x[output_ptr], #0x0]\n"
- "add x13, x13, x19, LSL #2\n"
- "ldr x9, [%x[output_ptr], #0x8]\n"
- "ldr x27, [%x[output_ptr], #0x10]\n"
- "add x9, x9, x19, LSL #2\n"
- "ldr x25, [%x[output_ptr], #0x18]\n"
- "ldr x23, [%x[output_ptr], #0x20]\n"
- "add x27, x27, x19, LSL #2\n"
- "add x25, x25, x19, LSL #2\n"
- "add x23, x23, x19, LSL #2\n"
- "b 147f\n"
- "146:" // Height 5: setup direct output
- "mov x13, %x[output_ptr]\n"
- "add x9, x13, x19, LSL #2\n"
- "add x27, x9, x19, LSL #2\n"
- "add x25, x27, x19, LSL #2\n"
- "add x23, x25, x19, LSL #2\n"
- "147:" // Height 5: Column loop
- "cbz x14, 148f\n"
- "ldr q8, [x14, #0x0]\n"
+ "130:" // Height 4: No activation
+ "cmp x11, #0x10\n"
+ "bge 139f\n"
+ "tbz x11, #3, 134f\n"
+ "st1 { v8.4s }, [x28], #0x10\n"
+ "st1 { v9.4s }, [x28], #0x10\n"
+ "st1 { v12.4s }, [x24], #0x10\n"
+ "st1 { v13.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v17.4s }, [x23], #0x10\n"
+ "st1 { v20.4s }, [x22], #0x10\n"
+ "st1 { v21.4s }, [x22], #0x10\n"
+ "tbz x11, #2, 132f\n"
+ "st1 { v10.4s }, [x28], #0x10\n"
+ "st1 { v14.4s }, [x24], #0x10\n"
+ "st1 { v18.4s }, [x23], #0x10\n"
+ "st1 { v22.4s }, [x22], #0x10\n"
+ "tbz x11, #1, 131f\n"
+ "str d11, [x28], #0x8\n"
+ "str d15, [x24], #0x8\n"
+ "str d19, [x23], #0x8\n"
+ "str d23, [x22], #0x8\n"
+ "tbz x11, #0, 138f\n"
+ "st1 { v11.s }[2], [x28]\n"
+ "st1 { v15.s }[2], [x24]\n"
+ "st1 { v19.s }[2], [x23]\n"
+ "st1 { v23.s }[2], [x22]\n"
+ "b 138f\n"
+ "131:" // Height 4: Partial direct writeback: partial_1_12
+ "tbz x11, #0, 138f\n"
+ "str s11, [x28, #0x0]\n"
+ "str s15, [x24, #0x0]\n"
+ "str s19, [x23, #0x0]\n"
+ "str s23, [x22, #0x0]\n"
+ "b 138f\n"
+ "132:" // Height 4: Partial direct writeback: partial_2_8
+ "tbz x11, #1, 133f\n"
+ "str d10, [x28], #0x8\n"
+ "str d14, [x24], #0x8\n"
+ "str d18, [x23], #0x8\n"
+ "str d22, [x22], #0x8\n"
+ "tbz x11, #0, 138f\n"
+ "st1 { v10.s }[2], [x28]\n"
+ "st1 { v14.s }[2], [x24]\n"
+ "st1 { v18.s }[2], [x23]\n"
+ "st1 { v22.s }[2], [x22]\n"
+ "b 138f\n"
+ "133:" // Height 4: Partial direct writeback: partial_1_8
+ "tbz x11, #0, 138f\n"
+ "str s10, [x28, #0x0]\n"
+ "str s14, [x24, #0x0]\n"
+ "str s18, [x23, #0x0]\n"
+ "str s22, [x22, #0x0]\n"
+ "b 138f\n"
+ "134:" // Height 4: Partial direct writeback: partial_4_0
+ "tbz x11, #2, 136f\n"
+ "st1 { v8.4s }, [x28], #0x10\n"
+ "st1 { v12.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v20.4s }, [x22], #0x10\n"
+ "tbz x11, #1, 135f\n"
+ "str d9, [x28], #0x8\n"
+ "str d13, [x24], #0x8\n"
+ "str d17, [x23], #0x8\n"
+ "str d21, [x22], #0x8\n"
+ "tbz x11, #0, 138f\n"
+ "st1 { v9.s }[2], [x28]\n"
+ "st1 { v13.s }[2], [x24]\n"
+ "st1 { v17.s }[2], [x23]\n"
+ "st1 { v21.s }[2], [x22]\n"
+ "b 138f\n"
+ "135:" // Height 4: Partial direct writeback: partial_1_4
+ "tbz x11, #0, 138f\n"
+ "str s9, [x28, #0x0]\n"
+ "str s13, [x24, #0x0]\n"
+ "str s17, [x23, #0x0]\n"
+ "str s21, [x22, #0x0]\n"
+ "b 138f\n"
+ "136:" // Height 4: Partial direct writeback: partial_2_0
+ "tbz x11, #1, 137f\n"
+ "str d8, [x28], #0x8\n"
+ "str d12, [x24], #0x8\n"
+ "str d16, [x23], #0x8\n"
+ "str d20, [x22], #0x8\n"
+ "tbz x11, #0, 138f\n"
+ "st1 { v8.s }[2], [x28]\n"
+ "st1 { v12.s }[2], [x24]\n"
+ "st1 { v16.s }[2], [x23]\n"
+ "st1 { v20.s }[2], [x22]\n"
+ "b 138f\n"
+ "137:" // Height 4: Partial direct writeback: partial_1_0
+ "str s8, [x28, #0x0]\n"
+ "str s12, [x24, #0x0]\n"
+ "str s16, [x23, #0x0]\n"
+ "str s20, [x22, #0x0]\n"
+ "138:" // Height 4: Partial direct writeback: Done
+ "b 140f\n"
+ "139:" // Height 4: Full writeback
+ "str q8, [x28, #0x0]\n"
+ "str q9, [x28, #0x10]\n"
+ "str q10, [x28, #0x20]\n"
+ "str q11, [x28, #0x30]\n"
+ "add x28, x28, #0x40\n"
+ "str q12, [x24, #0x0]\n"
+ "str q13, [x24, #0x10]\n"
+ "str q14, [x24, #0x20]\n"
+ "str q15, [x24, #0x30]\n"
+ "str q16, [x23, #0x0]\n"
+ "str q17, [x23, #0x10]\n"
+ "str q18, [x23, #0x20]\n"
+ "str q19, [x23, #0x30]\n"
+ "str q20, [x22, #0x0]\n"
+ "str q21, [x22, #0x10]\n"
+ "str q22, [x22, #0x20]\n"
+ "str q23, [x22, #0x30]\n"
+ "140:" // Height 4: Writeback done
+ "subs x11, x11, #0x10\n"
+ "bgt 107b\n"
+ "b 212f\n"
+ "141:" // Height 5
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x9, %x[bias]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "142:" // Height 5: Column loop
+ "cbz x9, 143f\n"
+ "ldr q8, [x9, #0x0]\n"
"mov v12.16b, v8.16b\n"
- "ldr q9, [x14, #0x10]\n"
+ "ldr q9, [x9, #0x10]\n"
"mov v16.16b, v8.16b\n"
- "ldr q10, [x14, #0x20]\n"
+ "ldr q10, [x9, #0x20]\n"
"mov v20.16b, v8.16b\n"
- "ldr q11, [x14, #0x30]\n"
+ "ldr q11, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
"mov v24.16b, v8.16b\n"
- "add x14, x14, #0x40\n"
"mov v13.16b, v9.16b\n"
"mov v17.16b, v9.16b\n"
"mov v14.16b, v10.16b\n"
@@ -2014,157 +1960,158 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"mov v25.16b, v9.16b\n"
"mov v26.16b, v10.16b\n"
"mov v27.16b, v11.16b\n"
- "b 159f\n"
- "148:" // Height 5: no bias
- "tbz %x[flags], #0, 158f\n"
- "cmp x16, #0x10\n"
- "bge 157f\n"
- "tbz x16, #3, 152f\n"
- "ld1 { v8.4s }, [x13], #0x10\n"
- "ld1 { v12.4s }, [x9], #0x10\n"
- "ld1 { v16.4s }, [x27], #0x10\n"
- "ld1 { v20.4s }, [x25], #0x10\n"
- "ld1 { v24.4s }, [x23], #0x10\n"
- "ld1 { v9.4s }, [x13], #0x10\n"
- "ld1 { v13.4s }, [x9], #0x10\n"
- "ld1 { v17.4s }, [x27], #0x10\n"
- "ld1 { v21.4s }, [x25], #0x10\n"
- "ld1 { v25.4s }, [x23], #0x10\n"
- "tbz x16, #2, 150f\n"
- "ld1 { v10.4s }, [x13], #0x10\n"
- "ld1 { v14.4s }, [x9], #0x10\n"
- "ld1 { v18.4s }, [x27], #0x10\n"
- "ld1 { v22.4s }, [x25], #0x10\n"
- "ld1 { v26.4s }, [x23], #0x10\n"
- "tbz x16, #1, 149f\n"
+ "b 154f\n"
+ "143:" // Height 5: no bias
+ "tbz %x[flags], #0, 153f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x11, #0x10\n"
+ "add x24, x28, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "add x21, x22, x19, LSL #2\n"
+ "bge 152f\n"
+ "tbz x11, #3, 147f\n"
+ "ld1 { v8.4s }, [x28], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "ld1 { v20.4s }, [x22], #0x10\n"
+ "ld1 { v24.4s }, [x21], #0x10\n"
+ "ld1 { v9.4s }, [x28], #0x10\n"
+ "ld1 { v13.4s }, [x24], #0x10\n"
+ "ld1 { v17.4s }, [x23], #0x10\n"
+ "ld1 { v21.4s }, [x22], #0x10\n"
+ "ld1 { v25.4s }, [x21], #0x10\n"
+ "tbz x11, #2, 145f\n"
+ "ld1 { v10.4s }, [x28], #0x10\n"
+ "ld1 { v14.4s }, [x24], #0x10\n"
+ "ld1 { v18.4s }, [x23], #0x10\n"
+ "ld1 { v22.4s }, [x22], #0x10\n"
+ "ld1 { v26.4s }, [x21], #0x10\n"
+ "tbz x11, #1, 144f\n"
"mov x19, #0x38\n"
- "ldr d11, [x13], #0x8\n"
- "ldr d15, [x9], #0x8\n"
- "ldr d19, [x27], #0x8\n"
- "ldr d23, [x25], #0x8\n"
- "ldr d27, [x23], #0x8\n"
- "tbz x16, #0, 156f\n"
- "ld1 { v11.s }[2], [x13]\n"
- "ld1 { v15.s }[2], [x9]\n"
- "ld1 { v19.s }[2], [x27]\n"
- "ld1 { v23.s }[2], [x25]\n"
- "ld1 { v27.s }[2], [x23]\n"
- "b 156f\n"
- "149:" // Height 5: Partial accumulate: partial_1_12
+ "ldr d11, [x28], #0x8\n"
+ "ldr d15, [x24], #0x8\n"
+ "ldr d19, [x23], #0x8\n"
+ "ldr d23, [x22], #0x8\n"
+ "ldr d27, [x21], #0x8\n"
+ "tbz x11, #0, 151f\n"
+ "ld1 { v11.s }[2], [x28]\n"
+ "ld1 { v15.s }[2], [x24]\n"
+ "ld1 { v19.s }[2], [x23]\n"
+ "ld1 { v23.s }[2], [x22]\n"
+ "ld1 { v27.s }[2], [x21]\n"
+ "b 151f\n"
+ "144:" // Height 5: Partial accumulate: partial_1_12
"mov x19, #0x30\n"
- "tbz x16, #0, 156f\n"
- "ldr s11, [x13, #0x0]\n"
- "ldr s15, [x9, #0x0]\n"
- "ldr s19, [x27, #0x0]\n"
- "ldr s23, [x25, #0x0]\n"
- "ldr s27, [x23, #0x0]\n"
- "b 156f\n"
- "150:" // Height 5: Partial accumulate: partial_2_8
- "tbz x16, #1, 151f\n"
- "ldr d10, [x13], #0x8\n"
- "ldr d14, [x9], #0x8\n"
- "ldr d18, [x27], #0x8\n"
- "ldr d22, [x25], #0x8\n"
- "ldr d26, [x23], #0x8\n"
+ "tbz x11, #0, 151f\n"
+ "ldr s11, [x28, #0x0]\n"
+ "ldr s15, [x24, #0x0]\n"
+ "ldr s19, [x23, #0x0]\n"
+ "ldr s23, [x22, #0x0]\n"
+ "ldr s27, [x21, #0x0]\n"
+ "b 151f\n"
+ "145:" // Height 5: Partial accumulate: partial_2_8
+ "tbz x11, #1, 146f\n"
+ "ldr d10, [x28], #0x8\n"
+ "ldr d14, [x24], #0x8\n"
"mov x19, #0x28\n"
- "tbz x16, #0, 156f\n"
- "ld1 { v10.s }[2], [x13]\n"
- "ld1 { v14.s }[2], [x9]\n"
- "ld1 { v18.s }[2], [x27]\n"
- "ld1 { v22.s }[2], [x25]\n"
- "ld1 { v26.s }[2], [x23]\n"
- "b 156f\n"
- "151:" // Height 5: Partial accumulate: partial_1_8
+ "ldr d18, [x23], #0x8\n"
+ "ldr d22, [x22], #0x8\n"
+ "ldr d26, [x21], #0x8\n"
+ "tbz x11, #0, 151f\n"
+ "ld1 { v10.s }[2], [x28]\n"
+ "ld1 { v14.s }[2], [x24]\n"
+ "ld1 { v18.s }[2], [x23]\n"
+ "ld1 { v22.s }[2], [x22]\n"
+ "ld1 { v26.s }[2], [x21]\n"
+ "b 151f\n"
+ "146:" // Height 5: Partial accumulate: partial_1_8
"mov x19, #0x20\n"
- "tbz x16, #0, 156f\n"
- "ldr s10, [x13, #0x0]\n"
- "ldr s14, [x9, #0x0]\n"
- "ldr s18, [x27, #0x0]\n"
- "ldr s22, [x25, #0x0]\n"
- "ldr s26, [x23, #0x0]\n"
- "b 156f\n"
- "152:" // Height 5: Partial accumulate: partial_4_0
- "tbz x16, #2, 154f\n"
- "ld1 { v8.4s }, [x13], #0x10\n"
- "ld1 { v12.4s }, [x9], #0x10\n"
- "ld1 { v16.4s }, [x27], #0x10\n"
- "ld1 { v20.4s }, [x25], #0x10\n"
- "ld1 { v24.4s }, [x23], #0x10\n"
- "tbz x16, #1, 153f\n"
+ "tbz x11, #0, 151f\n"
+ "ldr s10, [x28, #0x0]\n"
+ "ldr s14, [x24, #0x0]\n"
+ "ldr s18, [x23, #0x0]\n"
+ "ldr s22, [x22, #0x0]\n"
+ "ldr s26, [x21, #0x0]\n"
+ "b 151f\n"
+ "147:" // Height 5: Partial accumulate: partial_4_0
+ "tbz x11, #2, 149f\n"
+ "ld1 { v8.4s }, [x28], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "ld1 { v20.4s }, [x22], #0x10\n"
+ "ld1 { v24.4s }, [x21], #0x10\n"
+ "tbz x11, #1, 148f\n"
"mov x19, #0x18\n"
- "ldr d9, [x13], #0x8\n"
- "ldr d13, [x9], #0x8\n"
- "ldr d17, [x27], #0x8\n"
- "ldr d21, [x25], #0x8\n"
- "ldr d25, [x23], #0x8\n"
- "tbz x16, #0, 156f\n"
- "ld1 { v9.s }[2], [x13]\n"
- "ld1 { v13.s }[2], [x9]\n"
- "ld1 { v17.s }[2], [x27]\n"
- "ld1 { v21.s }[2], [x25]\n"
- "ld1 { v25.s }[2], [x23]\n"
- "b 156f\n"
- "153:" // Height 5: Partial accumulate: partial_1_4
+ "ldr d9, [x28], #0x8\n"
+ "ldr d13, [x24], #0x8\n"
+ "ldr d17, [x23], #0x8\n"
+ "ldr d21, [x22], #0x8\n"
+ "ldr d25, [x21], #0x8\n"
+ "tbz x11, #0, 151f\n"
+ "ld1 { v9.s }[2], [x28]\n"
+ "ld1 { v13.s }[2], [x24]\n"
+ "ld1 { v17.s }[2], [x23]\n"
+ "ld1 { v21.s }[2], [x22]\n"
+ "ld1 { v25.s }[2], [x21]\n"
+ "b 151f\n"
+ "148:" // Height 5: Partial accumulate: partial_1_4
"mov x19, #0x10\n"
- "tbz x16, #0, 156f\n"
- "ldr s9, [x13, #0x0]\n"
- "ldr s13, [x9, #0x0]\n"
- "ldr s17, [x27, #0x0]\n"
- "ldr s21, [x25, #0x0]\n"
- "ldr s25, [x23, #0x0]\n"
- "b 156f\n"
- "154:" // Height 5: Partial accumulate: partial_2_0
- "tbz x16, #1, 155f\n"
- "ldr d8, [x13], #0x8\n"
- "ldr d12, [x9], #0x8\n"
- "ldr d16, [x27], #0x8\n"
- "ldr d20, [x25], #0x8\n"
- "ldr d24, [x23], #0x8\n"
+ "tbz x11, #0, 151f\n"
+ "ldr s9, [x28, #0x0]\n"
+ "ldr s13, [x24, #0x0]\n"
+ "ldr s17, [x23, #0x0]\n"
+ "ldr s21, [x22, #0x0]\n"
+ "ldr s25, [x21, #0x0]\n"
+ "b 151f\n"
+ "149:" // Height 5: Partial accumulate: partial_2_0
+ "tbz x11, #1, 150f\n"
+ "ldr d8, [x28], #0x8\n"
+ "ldr d12, [x24], #0x8\n"
"mov x19, #0x8\n"
- "tbz x16, #0, 156f\n"
- "ld1 { v8.s }[2], [x13]\n"
- "ld1 { v12.s }[2], [x9]\n"
- "ld1 { v16.s }[2], [x27]\n"
- "ld1 { v20.s }[2], [x25]\n"
- "ld1 { v24.s }[2], [x23]\n"
- "b 156f\n"
- "155:" // Height 5: Partial accumulate: partial_1_0
+ "ldr d16, [x23], #0x8\n"
+ "ldr d20, [x22], #0x8\n"
+ "ldr d24, [x21], #0x8\n"
+ "tbz x11, #0, 151f\n"
+ "ld1 { v8.s }[2], [x28]\n"
+ "ld1 { v12.s }[2], [x24]\n"
+ "ld1 { v16.s }[2], [x23]\n"
+ "ld1 { v20.s }[2], [x22]\n"
+ "ld1 { v24.s }[2], [x21]\n"
+ "b 151f\n"
+ "150:" // Height 5: Partial accumulate: partial_1_0
+ "ldr s8, [x28, #0x0]\n"
"mov x19, #0x0\n"
- "ldr s8, [x13, #0x0]\n"
- "ldr s12, [x9, #0x0]\n"
- "ldr s16, [x27, #0x0]\n"
- "ldr s20, [x25, #0x0]\n"
- "ldr s24, [x23, #0x0]\n"
- "156:" // Height 5: Partial accumulate: Done
- "sub x13, x13, x19\n"
- "sub x9, x9, x19\n"
- "sub x27, x27, x19\n"
- "sub x25, x25, x19\n"
- "sub x23, x23, x19\n"
- "b 159f\n"
- "157:" // Height 5: full accumulate
- "ldr q8, [x13, #0x0]\n"
- "ldr q9, [x13, #0x10]\n"
- "ldr q10, [x13, #0x20]\n"
- "ldr q11, [x13, #0x30]\n"
- "ldr q12, [x9, #0x0]\n"
- "ldr q13, [x9, #0x10]\n"
- "ldr q14, [x9, #0x20]\n"
- "ldr q15, [x9, #0x30]\n"
- "ldr q16, [x27, #0x0]\n"
- "ldr q17, [x27, #0x10]\n"
- "ldr q18, [x27, #0x20]\n"
- "ldr q19, [x27, #0x30]\n"
- "ldr q20, [x25, #0x0]\n"
- "ldr q21, [x25, #0x10]\n"
- "ldr q22, [x25, #0x20]\n"
- "ldr q23, [x25, #0x30]\n"
- "ldr q24, [x23, #0x0]\n"
- "ldr q25, [x23, #0x10]\n"
- "ldr q26, [x23, #0x20]\n"
- "ldr q27, [x23, #0x30]\n"
- "b 159f\n"
- "158:" // Height 5: no accumulate
+ "ldr s12, [x24, #0x0]\n"
+ "ldr s16, [x23, #0x0]\n"
+ "ldr s20, [x22, #0x0]\n"
+ "ldr s24, [x21, #0x0]\n"
+ "151:" // Height 5: Partial accumulate: Done
+ "sub x28, x28, x19\n"
+ "b 154f\n"
+ "152:" // Height 5: full accumulate
+ "ldr q8, [x28, #0x0]\n"
+ "ldr q9, [x28, #0x10]\n"
+ "ldr q10, [x28, #0x20]\n"
+ "ldr q11, [x28, #0x30]\n"
+ "ldr q12, [x24, #0x0]\n"
+ "ldr q13, [x24, #0x10]\n"
+ "ldr q14, [x24, #0x20]\n"
+ "ldr q15, [x24, #0x30]\n"
+ "ldr q16, [x23, #0x0]\n"
+ "ldr q17, [x23, #0x10]\n"
+ "ldr q18, [x23, #0x20]\n"
+ "ldr q19, [x23, #0x30]\n"
+ "ldr q20, [x22, #0x0]\n"
+ "ldr q21, [x22, #0x10]\n"
+ "ldr q22, [x22, #0x20]\n"
+ "ldr q23, [x22, #0x30]\n"
+ "ldr q24, [x21, #0x0]\n"
+ "ldr q25, [x21, #0x10]\n"
+ "ldr q26, [x21, #0x20]\n"
+ "ldr q27, [x21, #0x30]\n"
+ "b 154f\n"
+ "153:" // Height 5: no accumulate
"movi v8.16b, #0x0\n"
"movi v9.16b, #0x0\n"
"movi v10.16b, #0x0\n"
@@ -2185,260 +2132,260 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"movi v25.16b, #0x0\n"
"movi v26.16b, #0x0\n"
"movi v27.16b, #0x0\n"
- "159:" // Height 5: setup done
- "mov x12, #0x0\n"
- "160:" // Height 5: String loop
+ "154:" // Height 5: setup done
+ "mov x27, #0x0\n"
+ "155:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w11, [x20, x12, LSL #0x2]\n"
- "tbz %x[flags], #3, 161f\n"
- "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 156f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x10, [x20, #0x0]\n"
- "ldr x28, [x20, #0x8]\n"
- "ldr x26, [x20, #0x10]\n"
- "ldr x24, [x20, #0x18]\n"
- "ldr x22, [x20, #0x20]\n"
- "cbnz x12, 162f\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "ldr x22, [x20, #0x18]\n"
+ "ldr x21, [x20, #0x20]\n"
+ "cbnz x27, 157f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x10, x10, x19, LSL #1\n"
- "add x28, x28, x19, LSL #1\n"
- "add x26, x26, x19, LSL #1\n"
+ "add x25, x25, x19, LSL #1\n"
"add x24, x24, x19, LSL #1\n"
+ "add x23, x23, x19, LSL #1\n"
"add x22, x22, x19, LSL #1\n"
- "b 162f\n"
- "161:" // Height 5: setup direct input
- "mov x10, %x[input_ptr]\n"
- "add x28, x10, x19, LSL #1\n"
- "add x26, x28, x19, LSL #1\n"
- "add x24, x26, x19, LSL #1\n"
- "add x22, x24, x19, LSL #1\n"
- "162:" // Height 5: input setup done
- "cmp x11, #0x8\n"
- "blt 165f\n"
- "cmp x11, #0x10\n"
- "blt 164f\n"
- "163:" // Height 5: Multiply loop: Main loop head
- "ldr q0, [x10, #0x0]\n"
- "ldr q1, [x28, #0x0]\n"
- "ldr q2, [x26, #0x0]\n"
- "ldr q3, [x24, #0x0]\n"
- "ldr q4, [x22, #0x0]\n"
- "ldr q6, [x15, #0x0]\n"
+ "add x21, x21, x19, LSL #1\n"
+ "b 157f\n"
+ "156:" // Height 5: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19, LSL #1\n"
+ "add x23, x24, x19, LSL #1\n"
+ "add x22, x23, x19, LSL #1\n"
+ "add x21, x22, x19, LSL #1\n"
+ "157:" // Height 5: input setup done
+ "cmp x26, #0x8\n"
+ "blt 160f\n"
+ "ldr q0, [x25, #0x0]\n"
+ "ldr q1, [x24, #0x0]\n"
+ "cmp x26, #0x10\n"
+ "ldr q2, [x23, #0x0]\n"
+ "ldr q3, [x22, #0x0]\n"
+ "ldr q4, [x21, #0x0]\n"
+ "ldr q6, [x10, #0x0]\n"
+ "blt 159f\n"
+ "158:" // Height 5: Multiply loop: Main loop head
".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
- "ldr q7, [x15, #0x10]\n"
+ "ldr q7, [x10, #0x10]\n"
+ "add x25, x25, #0x10\n"
".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
- "add x10, x10, #0x10\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x24, x24, #0x10\n"
".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n"
- "add x28, x28, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "add x23, x23, #0x10\n"
".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "add x26, x26, #0x10\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "add x22, x22, #0x10\n"
".inst 0x4f44f0d8 // bfdot v24.4s, v6.8h, v4.h[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "ldr q6, [x15, #0x20]\n"
+ "ldr q6, [x10, #0x20]\n"
+ "add x21, x21, #0x10\n"
".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
- "add x24, x24, #0x10\n"
- "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "sub x26, x26, #0x8\n"
".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
- "add x22, x22, #0x10\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
+ "cmp x26, #0x10\n"
".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "sub x11, x11, #0x8\n"
".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n"
- "cmp x11, #0x10\n"
".inst 0x4f44f0f9 // bfdot v25.4s, v7.8h, v4.h[0]\n"
- "ldr q7, [x15, #0x30]\n"
+ "ldr q7, [x10, #0x30]\n"
".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n"
".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n"
".inst 0x4f44f0da // bfdot v26.4s, v6.8h, v4.h[0]\n"
- "ldr q6, [x15, #0x40]\n"
+ "ldr q6, [x10, #0x40]\n"
".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n"
".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n"
".inst 0x4f44f0fb // bfdot v27.4s, v7.8h, v4.h[0]\n"
- "ldr q7, [x15, #0x50]\n"
+ "ldr q7, [x10, #0x50]\n"
".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n"
".inst 0x4f61f0cc // bfdot v12.4s, v6.8h, v1.h[1]\n"
".inst 0x4f62f0d0 // bfdot v16.4s, v6.8h, v2.h[1]\n"
".inst 0x4f63f0d4 // bfdot v20.4s, v6.8h, v3.h[1]\n"
".inst 0x4f64f0d8 // bfdot v24.4s, v6.8h, v4.h[1]\n"
- "ldr q6, [x15, #0x60]\n"
+ "ldr q6, [x10, #0x60]\n"
".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n"
".inst 0x4f61f0ed // bfdot v13.4s, v7.8h, v1.h[1]\n"
".inst 0x4f62f0f1 // bfdot v17.4s, v7.8h, v2.h[1]\n"
".inst 0x4f63f0f5 // bfdot v21.4s, v7.8h, v3.h[1]\n"
".inst 0x4f64f0f9 // bfdot v25.4s, v7.8h, v4.h[1]\n"
- "ldr q7, [x15, #0x70]\n"
+ "ldr q7, [x10, #0x70]\n"
".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n"
".inst 0x4f61f0ce // bfdot v14.4s, v6.8h, v1.h[1]\n"
".inst 0x4f62f0d2 // bfdot v18.4s, v6.8h, v2.h[1]\n"
".inst 0x4f63f0d6 // bfdot v22.4s, v6.8h, v3.h[1]\n"
".inst 0x4f64f0da // bfdot v26.4s, v6.8h, v4.h[1]\n"
- "ldr q6, [x15, #0x80]\n"
+ "ldr q6, [x10, #0x80]\n"
".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n"
".inst 0x4f61f0ef // bfdot v15.4s, v7.8h, v1.h[1]\n"
".inst 0x4f62f0f3 // bfdot v19.4s, v7.8h, v2.h[1]\n"
".inst 0x4f63f0f7 // bfdot v23.4s, v7.8h, v3.h[1]\n"
".inst 0x4f64f0fb // bfdot v27.4s, v7.8h, v4.h[1]\n"
- "ldr q7, [x15, #0x90]\n"
+ "ldr q7, [x10, #0x90]\n"
".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n"
".inst 0x4f41f8cc // bfdot v12.4s, v6.8h, v1.h[2]\n"
".inst 0x4f42f8d0 // bfdot v16.4s, v6.8h, v2.h[2]\n"
".inst 0x4f43f8d4 // bfdot v20.4s, v6.8h, v3.h[2]\n"
".inst 0x4f44f8d8 // bfdot v24.4s, v6.8h, v4.h[2]\n"
- "ldr q6, [x15, #0xa0]\n"
+ "ldr q6, [x10, #0xa0]\n"
".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n"
".inst 0x4f41f8ed // bfdot v13.4s, v7.8h, v1.h[2]\n"
".inst 0x4f42f8f1 // bfdot v17.4s, v7.8h, v2.h[2]\n"
".inst 0x4f43f8f5 // bfdot v21.4s, v7.8h, v3.h[2]\n"
".inst 0x4f44f8f9 // bfdot v25.4s, v7.8h, v4.h[2]\n"
- "ldr q7, [x15, #0xb0]\n"
+ "ldr q7, [x10, #0xb0]\n"
".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n"
".inst 0x4f41f8ce // bfdot v14.4s, v6.8h, v1.h[2]\n"
".inst 0x4f42f8d2 // bfdot v18.4s, v6.8h, v2.h[2]\n"
".inst 0x4f43f8d6 // bfdot v22.4s, v6.8h, v3.h[2]\n"
".inst 0x4f44f8da // bfdot v26.4s, v6.8h, v4.h[2]\n"
- "ldr q6, [x15, #0xc0]\n"
+ "ldr q6, [x10, #0xc0]\n"
".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n"
".inst 0x4f41f8ef // bfdot v15.4s, v7.8h, v1.h[2]\n"
".inst 0x4f42f8f3 // bfdot v19.4s, v7.8h, v2.h[2]\n"
".inst 0x4f43f8f7 // bfdot v23.4s, v7.8h, v3.h[2]\n"
".inst 0x4f44f8fb // bfdot v27.4s, v7.8h, v4.h[2]\n"
- "ldr q7, [x15, #0xd0]\n"
+ "ldr q7, [x10, #0xd0]\n"
".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n"
".inst 0x4f61f8cc // bfdot v12.4s, v6.8h, v1.h[3]\n"
".inst 0x4f62f8d0 // bfdot v16.4s, v6.8h, v2.h[3]\n"
".inst 0x4f63f8d4 // bfdot v20.4s, v6.8h, v3.h[3]\n"
".inst 0x4f64f8d8 // bfdot v24.4s, v6.8h, v4.h[3]\n"
- "ldr q6, [x15, #0xe0]\n"
+ "ldr q6, [x10, #0xe0]\n"
".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n"
".inst 0x4f61f8ed // bfdot v13.4s, v7.8h, v1.h[3]\n"
".inst 0x4f62f8f1 // bfdot v17.4s, v7.8h, v2.h[3]\n"
".inst 0x4f63f8f5 // bfdot v21.4s, v7.8h, v3.h[3]\n"
".inst 0x4f64f8f9 // bfdot v25.4s, v7.8h, v4.h[3]\n"
- "ldr q7, [x15, #0xf0]\n"
+ "ldr q7, [x10, #0xf0]\n"
+ "add x10, x10, #0x100\n"
".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n"
- "add x15, x15, #0x100\n"
".inst 0x4f61f8ce // bfdot v14.4s, v6.8h, v1.h[3]\n"
".inst 0x4f62f8d2 // bfdot v18.4s, v6.8h, v2.h[3]\n"
".inst 0x4f63f8d6 // bfdot v22.4s, v6.8h, v3.h[3]\n"
".inst 0x4f64f8da // bfdot v26.4s, v6.8h, v4.h[3]\n"
+ "ldr q6, [x10, #0x0]\n"
".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n"
+ "ldr q0, [x25, #0x0]\n"
".inst 0x4f61f8ef // bfdot v15.4s, v7.8h, v1.h[3]\n"
+ "ldr q1, [x24, #0x0]\n"
".inst 0x4f62f8f3 // bfdot v19.4s, v7.8h, v2.h[3]\n"
+ "ldr q2, [x23, #0x0]\n"
".inst 0x4f63f8f7 // bfdot v23.4s, v7.8h, v3.h[3]\n"
+ "ldr q3, [x22, #0x0]\n"
".inst 0x4f64f8fb // bfdot v27.4s, v7.8h, v4.h[3]\n"
- "bge 163b\n"
- "164:" // Height 5: Multiply loop: Single iteration only
- "sub x11, x11, #0x8\n"
- "ldr q0, [x10, #0x0]\n"
- "ldr q1, [x28, #0x0]\n"
- "ldr q2, [x26, #0x0]\n"
- "ldr q3, [x24, #0x0]\n"
- "ldr q4, [x22, #0x0]\n"
- "ldr q6, [x15, #0x0]\n"
+ "ldr q4, [x21, #0x0]\n"
+ "bge 158b\n"
+ "159:" // Height 5: Multiply loop: Single iteration only
".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
- "ldr q7, [x15, #0x10]\n"
+ "ldr q7, [x10, #0x10]\n"
+ "sub x26, x26, #0x8\n"
".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
- "add x10, x10, #0x10\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "add x25, x25, #0x10\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n"
- "add x28, x28, #0x10\n"
+ "add x24, x24, #0x10\n"
".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "add x26, x26, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "add x23, x23, #0x10\n"
".inst 0x4f44f0d8 // bfdot v24.4s, v6.8h, v4.h[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "ldr q6, [x15, #0x20]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "add x22, x22, #0x10\n"
".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
- "add x24, x24, #0x10\n"
- "prfm pldl1keep, [x24, #0x80]\n"
+ "ldr q6, [x10, #0x20]\n"
+ "add x21, x21, #0x10\n"
".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
- "add x22, x22, #0x10\n"
- ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n"
"prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n"
".inst 0x4f44f0f9 // bfdot v25.4s, v7.8h, v4.h[0]\n"
- "ldr q7, [x15, #0x30]\n"
+ "ldr q7, [x10, #0x30]\n"
".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n"
".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n"
".inst 0x4f44f0da // bfdot v26.4s, v6.8h, v4.h[0]\n"
- "ldr q6, [x15, #0x40]\n"
+ "ldr q6, [x10, #0x40]\n"
".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n"
".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n"
".inst 0x4f44f0fb // bfdot v27.4s, v7.8h, v4.h[0]\n"
- "ldr q7, [x15, #0x50]\n"
+ "ldr q7, [x10, #0x50]\n"
".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n"
".inst 0x4f61f0cc // bfdot v12.4s, v6.8h, v1.h[1]\n"
".inst 0x4f62f0d0 // bfdot v16.4s, v6.8h, v2.h[1]\n"
".inst 0x4f63f0d4 // bfdot v20.4s, v6.8h, v3.h[1]\n"
".inst 0x4f64f0d8 // bfdot v24.4s, v6.8h, v4.h[1]\n"
- "ldr q6, [x15, #0x60]\n"
+ "ldr q6, [x10, #0x60]\n"
".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n"
".inst 0x4f61f0ed // bfdot v13.4s, v7.8h, v1.h[1]\n"
".inst 0x4f62f0f1 // bfdot v17.4s, v7.8h, v2.h[1]\n"
".inst 0x4f63f0f5 // bfdot v21.4s, v7.8h, v3.h[1]\n"
".inst 0x4f64f0f9 // bfdot v25.4s, v7.8h, v4.h[1]\n"
- "ldr q7, [x15, #0x70]\n"
+ "ldr q7, [x10, #0x70]\n"
".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n"
".inst 0x4f61f0ce // bfdot v14.4s, v6.8h, v1.h[1]\n"
".inst 0x4f62f0d2 // bfdot v18.4s, v6.8h, v2.h[1]\n"
".inst 0x4f63f0d6 // bfdot v22.4s, v6.8h, v3.h[1]\n"
".inst 0x4f64f0da // bfdot v26.4s, v6.8h, v4.h[1]\n"
- "ldr q6, [x15, #0x80]\n"
+ "ldr q6, [x10, #0x80]\n"
".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n"
".inst 0x4f61f0ef // bfdot v15.4s, v7.8h, v1.h[1]\n"
".inst 0x4f62f0f3 // bfdot v19.4s, v7.8h, v2.h[1]\n"
".inst 0x4f63f0f7 // bfdot v23.4s, v7.8h, v3.h[1]\n"
".inst 0x4f64f0fb // bfdot v27.4s, v7.8h, v4.h[1]\n"
- "ldr q7, [x15, #0x90]\n"
+ "ldr q7, [x10, #0x90]\n"
".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n"
".inst 0x4f41f8cc // bfdot v12.4s, v6.8h, v1.h[2]\n"
".inst 0x4f42f8d0 // bfdot v16.4s, v6.8h, v2.h[2]\n"
".inst 0x4f43f8d4 // bfdot v20.4s, v6.8h, v3.h[2]\n"
".inst 0x4f44f8d8 // bfdot v24.4s, v6.8h, v4.h[2]\n"
- "ldr q6, [x15, #0xa0]\n"
+ "ldr q6, [x10, #0xa0]\n"
".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n"
".inst 0x4f41f8ed // bfdot v13.4s, v7.8h, v1.h[2]\n"
".inst 0x4f42f8f1 // bfdot v17.4s, v7.8h, v2.h[2]\n"
".inst 0x4f43f8f5 // bfdot v21.4s, v7.8h, v3.h[2]\n"
".inst 0x4f44f8f9 // bfdot v25.4s, v7.8h, v4.h[2]\n"
- "ldr q7, [x15, #0xb0]\n"
+ "ldr q7, [x10, #0xb0]\n"
".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n"
".inst 0x4f41f8ce // bfdot v14.4s, v6.8h, v1.h[2]\n"
".inst 0x4f42f8d2 // bfdot v18.4s, v6.8h, v2.h[2]\n"
".inst 0x4f43f8d6 // bfdot v22.4s, v6.8h, v3.h[2]\n"
".inst 0x4f44f8da // bfdot v26.4s, v6.8h, v4.h[2]\n"
- "ldr q6, [x15, #0xc0]\n"
+ "ldr q6, [x10, #0xc0]\n"
".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n"
".inst 0x4f41f8ef // bfdot v15.4s, v7.8h, v1.h[2]\n"
".inst 0x4f42f8f3 // bfdot v19.4s, v7.8h, v2.h[2]\n"
".inst 0x4f43f8f7 // bfdot v23.4s, v7.8h, v3.h[2]\n"
".inst 0x4f44f8fb // bfdot v27.4s, v7.8h, v4.h[2]\n"
- "ldr q7, [x15, #0xd0]\n"
+ "ldr q7, [x10, #0xd0]\n"
".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n"
".inst 0x4f61f8cc // bfdot v12.4s, v6.8h, v1.h[3]\n"
".inst 0x4f62f8d0 // bfdot v16.4s, v6.8h, v2.h[3]\n"
".inst 0x4f63f8d4 // bfdot v20.4s, v6.8h, v3.h[3]\n"
".inst 0x4f64f8d8 // bfdot v24.4s, v6.8h, v4.h[3]\n"
- "ldr q6, [x15, #0xe0]\n"
+ "ldr q6, [x10, #0xe0]\n"
".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n"
".inst 0x4f61f8ed // bfdot v13.4s, v7.8h, v1.h[3]\n"
".inst 0x4f62f8f1 // bfdot v17.4s, v7.8h, v2.h[3]\n"
".inst 0x4f63f8f5 // bfdot v21.4s, v7.8h, v3.h[3]\n"
".inst 0x4f64f8f9 // bfdot v25.4s, v7.8h, v4.h[3]\n"
- "ldr q7, [x15, #0xf0]\n"
+ "ldr q7, [x10, #0xf0]\n"
+ "add x10, x10, #0x100\n"
".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n"
- "add x15, x15, #0x100\n"
".inst 0x4f61f8ce // bfdot v14.4s, v6.8h, v1.h[3]\n"
".inst 0x4f62f8d2 // bfdot v18.4s, v6.8h, v2.h[3]\n"
".inst 0x4f63f8d6 // bfdot v22.4s, v6.8h, v3.h[3]\n"
@@ -2448,34 +2395,34 @@ void a64_hybrid_bf16fp32_dot_6x16 (
".inst 0x4f62f8f3 // bfdot v19.4s, v7.8h, v2.h[3]\n"
".inst 0x4f63f8f7 // bfdot v23.4s, v7.8h, v3.h[3]\n"
".inst 0x4f64f8fb // bfdot v27.4s, v7.8h, v4.h[3]\n"
- "165:" // Height 5: Multiply loop: Main loop skip
- "cbz x11, 169f\n"
- "cmp x11, #0x2\n"
- "blt 167f\n"
- "166:" // Height 5: Multiply loop: Odd block loop
- "ldr s0, [x10], #0x4\n"
- "ldr s1, [x28], #0x4\n"
- "ldr s2, [x26], #0x4\n"
- "ldr s3, [x24], #0x4\n"
- "ldr s4, [x22], #0x4\n"
- "ldr q6, [x15, #0x0]\n"
+ "160:" // Height 5: Multiply loop: Main loop skip
+ "cbz x26, 164f\n"
+ "cmp x26, #0x2\n"
+ "blt 162f\n"
+ "161:" // Height 5: Multiply loop: Odd block loop
+ "ldr s0, [x25], #0x4\n"
+ "sub x26, x26, #0x2\n"
+ "ldr s1, [x24], #0x4\n"
+ "cmp x26, #0x2\n"
+ "ldr s2, [x23], #0x4\n"
+ "ldr s3, [x22], #0x4\n"
+ "ldr s4, [x21], #0x4\n"
+ "ldr q6, [x10, #0x0]\n"
".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
- "ldr q7, [x15, #0x10]\n"
+ "ldr q7, [x10, #0x10]\n"
".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
- "sub x11, x11, #0x2\n"
".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n"
- "cmp x11, #0x2\n"
".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n"
".inst 0x4f44f0d8 // bfdot v24.4s, v6.8h, v4.h[0]\n"
- "ldr q6, [x15, #0x20]\n"
+ "ldr q6, [x10, #0x20]\n"
".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n"
".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n"
".inst 0x4f44f0f9 // bfdot v25.4s, v7.8h, v4.h[0]\n"
- "ldr q7, [x15, #0x30]\n"
+ "ldr q7, [x10, #0x30]\n"
+ "add x10, x10, #0x40\n"
".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
- "add x15, x15, #0x40\n"
".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n"
".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n"
@@ -2485,31 +2432,31 @@ void a64_hybrid_bf16fp32_dot_6x16 (
".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n"
".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n"
".inst 0x4f44f0fb // bfdot v27.4s, v7.8h, v4.h[0]\n"
- "bge 166b\n"
- "cbz x11, 169f\n"
- "167:" // Height 5: Multiply loop: Skip odd blocks
- "ldr h0, [x10, #0x0]\n"
- "ldr h1, [x28, #0x0]\n"
- "ldr h2, [x26, #0x0]\n"
- "ldr h3, [x24, #0x0]\n"
- "ldr h4, [x22, #0x0]\n"
- "168:" // Height 5: Multiply loop: Ragged operand read: Done
- "ldr q6, [x15, #0x0]\n"
+ "bge 161b\n"
+ "cbz x26, 164f\n"
+ "162:" // Height 5: Multiply loop: Skip odd blocks
+ "ldr h0, [x25, #0x0]\n"
+ "ldr h1, [x24, #0x0]\n"
+ "ldr h2, [x23, #0x0]\n"
+ "ldr h3, [x22, #0x0]\n"
+ "ldr h4, [x21, #0x0]\n"
+ "163:" // Height 5: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x10, #0x0]\n"
".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
- "ldr q7, [x15, #0x10]\n"
+ "ldr q7, [x10, #0x10]\n"
".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n"
".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n"
".inst 0x4f44f0d8 // bfdot v24.4s, v6.8h, v4.h[0]\n"
- "ldr q6, [x15, #0x20]\n"
+ "ldr q6, [x10, #0x20]\n"
".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n"
".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n"
".inst 0x4f44f0f9 // bfdot v25.4s, v7.8h, v4.h[0]\n"
- "ldr q7, [x15, #0x30]\n"
+ "ldr q7, [x10, #0x30]\n"
+ "add x10, x10, #0x40\n"
".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
- "add x15, x15, #0x40\n"
".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n"
".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n"
@@ -2519,17 +2466,22 @@ void a64_hybrid_bf16fp32_dot_6x16 (
".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n"
".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n"
".inst 0x4f44f0fb // bfdot v27.4s, v7.8h, v4.h[0]\n"
- "169:" // Height 5: Multiply loop: No odd multiplies
+ "164:" // Height 5: Multiply loop: No odd multiplies
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x12, x12, #0x1\n"
- "cmp x12, x19\n"
- "bne 160b\n"
- "prfm pstl1keep, [x13, #0x0]\n"
- "prfm pstl1keep, [x9, #0x0]\n"
- "prfm pstl1keep, [x27, #0x0]\n"
- "prfm pstl1keep, [x25, #0x0]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 155b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "prfm pstl1keep, [x28, #0x0]\n"
+ "add x24, x28, x19, LSL #2\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "add x23, x24, x19, LSL #2\n"
"prfm pstl1keep, [x23, #0x0]\n"
- "tbz %x[flags], #1, 170f\n"
+ "add x22, x23, x19, LSL #2\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "add x21, x22, x19, LSL #2\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "tbz %x[flags], #1, 165f\n"
"add x19, %x[args_ptr], %[offset_min]\n"
"ld1r { v1.4s }, [x19]\n"
"add x19, %x[args_ptr], %[offset_max]\n"
@@ -2574,193 +2526,169 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"fmax v26.4s, v26.4s, v1.4s\n"
"fmin v27.4s, v27.4s, v0.4s\n"
"fmax v27.4s, v27.4s, v1.4s\n"
- "170:" // Height 5: No activation
- "cmp x16, #0x10\n"
- "bge 179f\n"
- "tbz x16, #3, 174f\n"
- "st1 { v8.4s }, [x13], #0x10\n"
- "st1 { v9.4s }, [x13], #0x10\n"
- "st1 { v12.4s }, [x9], #0x10\n"
- "st1 { v13.4s }, [x9], #0x10\n"
- "st1 { v16.4s }, [x27], #0x10\n"
- "st1 { v17.4s }, [x27], #0x10\n"
- "st1 { v20.4s }, [x25], #0x10\n"
- "st1 { v21.4s }, [x25], #0x10\n"
- "st1 { v24.4s }, [x23], #0x10\n"
- "st1 { v25.4s }, [x23], #0x10\n"
- "tbz x16, #2, 172f\n"
- "st1 { v10.4s }, [x13], #0x10\n"
- "st1 { v14.4s }, [x9], #0x10\n"
- "st1 { v18.4s }, [x27], #0x10\n"
- "st1 { v22.4s }, [x25], #0x10\n"
- "st1 { v26.4s }, [x23], #0x10\n"
- "tbz x16, #1, 171f\n"
- "str d11, [x13], #0x8\n"
- "str d15, [x9], #0x8\n"
- "str d19, [x27], #0x8\n"
- "str d23, [x25], #0x8\n"
- "str d27, [x23], #0x8\n"
- "tbz x16, #0, 178f\n"
- "st1 { v11.s }[2], [x13]\n"
- "st1 { v15.s }[2], [x9]\n"
- "st1 { v19.s }[2], [x27]\n"
- "st1 { v23.s }[2], [x25]\n"
- "st1 { v27.s }[2], [x23]\n"
- "b 178f\n"
- "171:" // Height 5: Partial direct writeback: partial_1_12
- "tbz x16, #0, 178f\n"
- "str s11, [x13, #0x0]\n"
- "str s15, [x9, #0x0]\n"
- "str s19, [x27, #0x0]\n"
- "str s23, [x25, #0x0]\n"
- "str s27, [x23, #0x0]\n"
- "b 178f\n"
- "172:" // Height 5: Partial direct writeback: partial_2_8
- "tbz x16, #1, 173f\n"
- "str d10, [x13], #0x8\n"
- "str d14, [x9], #0x8\n"
- "str d18, [x27], #0x8\n"
- "str d22, [x25], #0x8\n"
- "str d26, [x23], #0x8\n"
- "tbz x16, #0, 178f\n"
- "st1 { v10.s }[2], [x13]\n"
- "st1 { v14.s }[2], [x9]\n"
- "st1 { v18.s }[2], [x27]\n"
- "st1 { v22.s }[2], [x25]\n"
- "st1 { v26.s }[2], [x23]\n"
- "b 178f\n"
- "173:" // Height 5: Partial direct writeback: partial_1_8
- "tbz x16, #0, 178f\n"
- "str s10, [x13, #0x0]\n"
- "str s14, [x9, #0x0]\n"
- "str s18, [x27, #0x0]\n"
- "str s22, [x25, #0x0]\n"
- "str s26, [x23, #0x0]\n"
- "b 178f\n"
- "174:" // Height 5: Partial direct writeback: partial_4_0
- "tbz x16, #2, 176f\n"
- "st1 { v8.4s }, [x13], #0x10\n"
- "st1 { v12.4s }, [x9], #0x10\n"
- "st1 { v16.4s }, [x27], #0x10\n"
- "st1 { v20.4s }, [x25], #0x10\n"
- "st1 { v24.4s }, [x23], #0x10\n"
- "tbz x16, #1, 175f\n"
- "str d9, [x13], #0x8\n"
- "str d13, [x9], #0x8\n"
- "str d17, [x27], #0x8\n"
- "str d21, [x25], #0x8\n"
- "str d25, [x23], #0x8\n"
- "tbz x16, #0, 178f\n"
- "st1 { v9.s }[2], [x13]\n"
- "st1 { v13.s }[2], [x9]\n"
- "st1 { v17.s }[2], [x27]\n"
- "st1 { v21.s }[2], [x25]\n"
- "st1 { v25.s }[2], [x23]\n"
- "b 178f\n"
- "175:" // Height 5: Partial direct writeback: partial_1_4
- "tbz x16, #0, 178f\n"
- "str s9, [x13, #0x0]\n"
- "str s13, [x9, #0x0]\n"
- "str s17, [x27, #0x0]\n"
- "str s21, [x25, #0x0]\n"
- "str s25, [x23, #0x0]\n"
- "b 178f\n"
- "176:" // Height 5: Partial direct writeback: partial_2_0
- "tbz x16, #1, 177f\n"
- "str d8, [x13], #0x8\n"
- "str d12, [x9], #0x8\n"
- "str d16, [x27], #0x8\n"
- "str d20, [x25], #0x8\n"
- "str d24, [x23], #0x8\n"
- "tbz x16, #0, 178f\n"
- "st1 { v8.s }[2], [x13]\n"
- "st1 { v12.s }[2], [x9]\n"
- "st1 { v16.s }[2], [x27]\n"
- "st1 { v20.s }[2], [x25]\n"
- "st1 { v24.s }[2], [x23]\n"
- "b 178f\n"
- "177:" // Height 5: Partial direct writeback: partial_1_0
- "str s8, [x13, #0x0]\n"
- "str s12, [x9, #0x0]\n"
- "str s16, [x27, #0x0]\n"
- "str s20, [x25, #0x0]\n"
- "str s24, [x23, #0x0]\n"
- "178:" // Height 5: Partial direct writeback: Done
- "b 180f\n"
- "179:" // Height 5: Full writeback
- "str q8, [x13, #0x0]\n"
- "str q9, [x13, #0x10]\n"
- "str q10, [x13, #0x20]\n"
- "str q11, [x13, #0x30]\n"
- "str q12, [x9, #0x0]\n"
- "str q13, [x9, #0x10]\n"
- "str q14, [x9, #0x20]\n"
- "str q15, [x9, #0x30]\n"
- "str q16, [x27, #0x0]\n"
- "str q17, [x27, #0x10]\n"
- "str q18, [x27, #0x20]\n"
- "str q19, [x27, #0x30]\n"
- "str q20, [x25, #0x0]\n"
- "str q21, [x25, #0x10]\n"
- "str q22, [x25, #0x20]\n"
- "str q23, [x25, #0x30]\n"
- "str q24, [x23, #0x0]\n"
- "str q25, [x23, #0x10]\n"
- "str q26, [x23, #0x20]\n"
- "str q27, [x23, #0x30]\n"
- "add x13, x13, #0x40\n"
- "add x9, x9, #0x40\n"
- "add x27, x27, #0x40\n"
- "add x25, x25, #0x40\n"
- "add x23, x23, #0x40\n"
- "180:" // Height 5: Writeback done
- "subs x16, x16, #0x10\n"
- "bgt 147b\n"
- "b 218f\n"
- "181:" // Height 6
- "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x14, %x[bias]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "tbz %x[flags], #2, 182f\n"
- "ldr x13, [%x[output_ptr], #0x0]\n"
- "add x13, x13, x19, LSL #2\n"
- "ldr x9, [%x[output_ptr], #0x8]\n"
- "ldr x27, [%x[output_ptr], #0x10]\n"
- "add x9, x9, x19, LSL #2\n"
- "ldr x25, [%x[output_ptr], #0x18]\n"
- "ldr x23, [%x[output_ptr], #0x20]\n"
- "add x27, x27, x19, LSL #2\n"
- "ldr x21, [%x[output_ptr], #0x28]\n"
- "add %x[output_ptr], %x[output_ptr], #0x30\n"
- "add x25, x25, x19, LSL #2\n"
- "add x23, x23, x19, LSL #2\n"
- "add x21, x21, x19, LSL #2\n"
- "b 183f\n"
- "182:" // Height 6: setup direct output
- "mov x13, %x[output_ptr]\n"
- "add x9, x13, x19, LSL #2\n"
- "add x27, x9, x19, LSL #2\n"
- "add x25, x27, x19, LSL #2\n"
- "add x23, x25, x19, LSL #2\n"
- "add x21, x23, x19, LSL #2\n"
- "add %x[output_ptr], x21, x19, LSL #2\n"
- "183:" // Height 6: Column loop
- "cbz x14, 184f\n"
- "ldr q8, [x14, #0x0]\n"
+ "165:" // Height 5: No activation
+ "cmp x11, #0x10\n"
+ "bge 174f\n"
+ "tbz x11, #3, 169f\n"
+ "st1 { v8.4s }, [x28], #0x10\n"
+ "st1 { v9.4s }, [x28], #0x10\n"
+ "st1 { v12.4s }, [x24], #0x10\n"
+ "st1 { v13.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v17.4s }, [x23], #0x10\n"
+ "st1 { v20.4s }, [x22], #0x10\n"
+ "st1 { v21.4s }, [x22], #0x10\n"
+ "st1 { v24.4s }, [x21], #0x10\n"
+ "st1 { v25.4s }, [x21], #0x10\n"
+ "tbz x11, #2, 167f\n"
+ "st1 { v10.4s }, [x28], #0x10\n"
+ "st1 { v14.4s }, [x24], #0x10\n"
+ "st1 { v18.4s }, [x23], #0x10\n"
+ "st1 { v22.4s }, [x22], #0x10\n"
+ "st1 { v26.4s }, [x21], #0x10\n"
+ "tbz x11, #1, 166f\n"
+ "str d11, [x28], #0x8\n"
+ "str d15, [x24], #0x8\n"
+ "str d19, [x23], #0x8\n"
+ "str d23, [x22], #0x8\n"
+ "str d27, [x21], #0x8\n"
+ "tbz x11, #0, 173f\n"
+ "st1 { v11.s }[2], [x28]\n"
+ "st1 { v15.s }[2], [x24]\n"
+ "st1 { v19.s }[2], [x23]\n"
+ "st1 { v23.s }[2], [x22]\n"
+ "st1 { v27.s }[2], [x21]\n"
+ "b 173f\n"
+ "166:" // Height 5: Partial direct writeback: partial_1_12
+ "tbz x11, #0, 173f\n"
+ "str s11, [x28, #0x0]\n"
+ "str s15, [x24, #0x0]\n"
+ "str s19, [x23, #0x0]\n"
+ "str s23, [x22, #0x0]\n"
+ "str s27, [x21, #0x0]\n"
+ "b 173f\n"
+ "167:" // Height 5: Partial direct writeback: partial_2_8
+ "tbz x11, #1, 168f\n"
+ "str d10, [x28], #0x8\n"
+ "str d14, [x24], #0x8\n"
+ "str d18, [x23], #0x8\n"
+ "str d22, [x22], #0x8\n"
+ "str d26, [x21], #0x8\n"
+ "tbz x11, #0, 173f\n"
+ "st1 { v10.s }[2], [x28]\n"
+ "st1 { v14.s }[2], [x24]\n"
+ "st1 { v18.s }[2], [x23]\n"
+ "st1 { v22.s }[2], [x22]\n"
+ "st1 { v26.s }[2], [x21]\n"
+ "b 173f\n"
+ "168:" // Height 5: Partial direct writeback: partial_1_8
+ "tbz x11, #0, 173f\n"
+ "str s10, [x28, #0x0]\n"
+ "str s14, [x24, #0x0]\n"
+ "str s18, [x23, #0x0]\n"
+ "str s22, [x22, #0x0]\n"
+ "str s26, [x21, #0x0]\n"
+ "b 173f\n"
+ "169:" // Height 5: Partial direct writeback: partial_4_0
+ "tbz x11, #2, 171f\n"
+ "st1 { v8.4s }, [x28], #0x10\n"
+ "st1 { v12.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v20.4s }, [x22], #0x10\n"
+ "st1 { v24.4s }, [x21], #0x10\n"
+ "tbz x11, #1, 170f\n"
+ "str d9, [x28], #0x8\n"
+ "str d13, [x24], #0x8\n"
+ "str d17, [x23], #0x8\n"
+ "str d21, [x22], #0x8\n"
+ "str d25, [x21], #0x8\n"
+ "tbz x11, #0, 173f\n"
+ "st1 { v9.s }[2], [x28]\n"
+ "st1 { v13.s }[2], [x24]\n"
+ "st1 { v17.s }[2], [x23]\n"
+ "st1 { v21.s }[2], [x22]\n"
+ "st1 { v25.s }[2], [x21]\n"
+ "b 173f\n"
+ "170:" // Height 5: Partial direct writeback: partial_1_4
+ "tbz x11, #0, 173f\n"
+ "str s9, [x28, #0x0]\n"
+ "str s13, [x24, #0x0]\n"
+ "str s17, [x23, #0x0]\n"
+ "str s21, [x22, #0x0]\n"
+ "str s25, [x21, #0x0]\n"
+ "b 173f\n"
+ "171:" // Height 5: Partial direct writeback: partial_2_0
+ "tbz x11, #1, 172f\n"
+ "str d8, [x28], #0x8\n"
+ "str d12, [x24], #0x8\n"
+ "str d16, [x23], #0x8\n"
+ "str d20, [x22], #0x8\n"
+ "str d24, [x21], #0x8\n"
+ "tbz x11, #0, 173f\n"
+ "st1 { v8.s }[2], [x28]\n"
+ "st1 { v12.s }[2], [x24]\n"
+ "st1 { v16.s }[2], [x23]\n"
+ "st1 { v20.s }[2], [x22]\n"
+ "st1 { v24.s }[2], [x21]\n"
+ "b 173f\n"
+ "172:" // Height 5: Partial direct writeback: partial_1_0
+ "str s8, [x28, #0x0]\n"
+ "str s12, [x24, #0x0]\n"
+ "str s16, [x23, #0x0]\n"
+ "str s20, [x22, #0x0]\n"
+ "str s24, [x21, #0x0]\n"
+ "173:" // Height 5: Partial direct writeback: Done
+ "b 175f\n"
+ "174:" // Height 5: Full writeback
+ "str q8, [x28, #0x0]\n"
+ "str q9, [x28, #0x10]\n"
+ "str q10, [x28, #0x20]\n"
+ "str q11, [x28, #0x30]\n"
+ "add x28, x28, #0x40\n"
+ "str q12, [x24, #0x0]\n"
+ "str q13, [x24, #0x10]\n"
+ "str q14, [x24, #0x20]\n"
+ "str q15, [x24, #0x30]\n"
+ "str q16, [x23, #0x0]\n"
+ "str q17, [x23, #0x10]\n"
+ "str q18, [x23, #0x20]\n"
+ "str q19, [x23, #0x30]\n"
+ "str q20, [x22, #0x0]\n"
+ "str q21, [x22, #0x10]\n"
+ "str q22, [x22, #0x20]\n"
+ "str q23, [x22, #0x30]\n"
+ "str q24, [x21, #0x0]\n"
+ "str q25, [x21, #0x10]\n"
+ "str q26, [x21, #0x20]\n"
+ "str q27, [x21, #0x30]\n"
+ "175:" // Height 5: Writeback done
+ "subs x11, x11, #0x10\n"
+ "bgt 142b\n"
+ "b 212f\n"
+ "176:" // Height 6
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x9, %x[bias]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov x19, #0x18\n"
+ "madd %x[output_ptr], x20, x19, %x[output_ptr]\n"
+ "177:" // Height 6: Column loop
+ "cbz x9, 178f\n"
+ "ldr q8, [x9, #0x0]\n"
"mov v12.16b, v8.16b\n"
- "ldr q9, [x14, #0x10]\n"
+ "ldr q9, [x9, #0x10]\n"
"mov v16.16b, v8.16b\n"
- "ldr q10, [x14, #0x20]\n"
+ "ldr q10, [x9, #0x20]\n"
"mov v20.16b, v8.16b\n"
- "ldr q11, [x14, #0x30]\n"
+ "ldr q11, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
"mov v24.16b, v8.16b\n"
- "add x14, x14, #0x40\n"
"mov v28.16b, v8.16b\n"
"mov v13.16b, v9.16b\n"
- "mov v17.16b, v9.16b\n"
"mov v14.16b, v10.16b\n"
"mov v15.16b, v11.16b\n"
+ "mov v17.16b, v9.16b\n"
"mov v18.16b, v10.16b\n"
"mov v19.16b, v11.16b\n"
"mov v21.16b, v9.16b\n"
@@ -2772,178 +2700,179 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"mov v29.16b, v9.16b\n"
"mov v30.16b, v10.16b\n"
"mov v31.16b, v11.16b\n"
- "b 195f\n"
- "184:" // Height 6: no bias
- "tbz %x[flags], #0, 194f\n"
- "cmp x16, #0x10\n"
- "bge 193f\n"
- "tbz x16, #3, 188f\n"
- "ld1 { v8.4s }, [x13], #0x10\n"
- "ld1 { v12.4s }, [x9], #0x10\n"
- "ld1 { v16.4s }, [x27], #0x10\n"
- "ld1 { v20.4s }, [x25], #0x10\n"
- "ld1 { v24.4s }, [x23], #0x10\n"
- "ld1 { v28.4s }, [x21], #0x10\n"
- "ld1 { v9.4s }, [x13], #0x10\n"
- "ld1 { v13.4s }, [x9], #0x10\n"
- "ld1 { v17.4s }, [x27], #0x10\n"
- "ld1 { v21.4s }, [x25], #0x10\n"
- "ld1 { v25.4s }, [x23], #0x10\n"
- "ld1 { v29.4s }, [x21], #0x10\n"
- "tbz x16, #2, 186f\n"
- "ld1 { v10.4s }, [x13], #0x10\n"
- "ld1 { v14.4s }, [x9], #0x10\n"
- "ld1 { v18.4s }, [x27], #0x10\n"
- "ld1 { v22.4s }, [x25], #0x10\n"
- "ld1 { v26.4s }, [x23], #0x10\n"
- "ld1 { v30.4s }, [x21], #0x10\n"
- "tbz x16, #1, 185f\n"
+ "b 189f\n"
+ "178:" // Height 6: no bias
+ "tbz %x[flags], #0, 188f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x11, #0x10\n"
+ "add x24, x28, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "add x21, x22, x19, LSL #2\n"
+ "add x20, x21, x19, LSL #2\n"
+ "bge 187f\n"
+ "tbz x11, #3, 182f\n"
+ "ld1 { v8.4s }, [x28], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "ld1 { v20.4s }, [x22], #0x10\n"
+ "ld1 { v24.4s }, [x21], #0x10\n"
+ "ld1 { v28.4s }, [x20], #0x10\n"
+ "ld1 { v9.4s }, [x28], #0x10\n"
+ "ld1 { v13.4s }, [x24], #0x10\n"
+ "ld1 { v17.4s }, [x23], #0x10\n"
+ "ld1 { v21.4s }, [x22], #0x10\n"
+ "ld1 { v25.4s }, [x21], #0x10\n"
+ "ld1 { v29.4s }, [x20], #0x10\n"
+ "tbz x11, #2, 180f\n"
+ "ld1 { v10.4s }, [x28], #0x10\n"
+ "ld1 { v14.4s }, [x24], #0x10\n"
+ "ld1 { v18.4s }, [x23], #0x10\n"
+ "ld1 { v22.4s }, [x22], #0x10\n"
+ "ld1 { v26.4s }, [x21], #0x10\n"
+ "ld1 { v30.4s }, [x20], #0x10\n"
+ "tbz x11, #1, 179f\n"
"mov x19, #0x38\n"
- "ldr d11, [x13], #0x8\n"
- "ldr d15, [x9], #0x8\n"
- "ldr d19, [x27], #0x8\n"
- "ldr d23, [x25], #0x8\n"
- "ldr d27, [x23], #0x8\n"
- "ldr d31, [x21], #0x8\n"
- "tbz x16, #0, 192f\n"
- "ld1 { v11.s }[2], [x13]\n"
- "ld1 { v15.s }[2], [x9]\n"
- "ld1 { v19.s }[2], [x27]\n"
- "ld1 { v23.s }[2], [x25]\n"
- "ld1 { v27.s }[2], [x23]\n"
- "ld1 { v31.s }[2], [x21]\n"
- "b 192f\n"
- "185:" // Height 6: Partial accumulate: partial_1_12
+ "ldr d11, [x28], #0x8\n"
+ "ldr d15, [x24], #0x8\n"
+ "ldr d19, [x23], #0x8\n"
+ "ldr d23, [x22], #0x8\n"
+ "ldr d27, [x21], #0x8\n"
+ "ldr d31, [x20], #0x8\n"
+ "tbz x11, #0, 186f\n"
+ "ld1 { v11.s }[2], [x28]\n"
+ "ld1 { v15.s }[2], [x24]\n"
+ "ld1 { v19.s }[2], [x23]\n"
+ "ld1 { v23.s }[2], [x22]\n"
+ "ld1 { v27.s }[2], [x21]\n"
+ "ld1 { v31.s }[2], [x20]\n"
+ "b 186f\n"
+ "179:" // Height 6: Partial accumulate: partial_1_12
"mov x19, #0x30\n"
- "tbz x16, #0, 192f\n"
- "ldr s11, [x13, #0x0]\n"
- "ldr s15, [x9, #0x0]\n"
- "ldr s19, [x27, #0x0]\n"
- "ldr s23, [x25, #0x0]\n"
- "ldr s27, [x23, #0x0]\n"
- "ldr s31, [x21, #0x0]\n"
- "b 192f\n"
- "186:" // Height 6: Partial accumulate: partial_2_8
- "tbz x16, #1, 187f\n"
- "ldr d10, [x13], #0x8\n"
- "ldr d14, [x9], #0x8\n"
- "ldr d18, [x27], #0x8\n"
- "ldr d22, [x25], #0x8\n"
- "ldr d26, [x23], #0x8\n"
- "ldr d30, [x21], #0x8\n"
+ "tbz x11, #0, 186f\n"
+ "ldr s11, [x28, #0x0]\n"
+ "ldr s15, [x24, #0x0]\n"
+ "ldr s19, [x23, #0x0]\n"
+ "ldr s23, [x22, #0x0]\n"
+ "ldr s27, [x21, #0x0]\n"
+ "ldr s31, [x20, #0x0]\n"
+ "b 186f\n"
+ "180:" // Height 6: Partial accumulate: partial_2_8
+ "tbz x11, #1, 181f\n"
+ "ldr d10, [x28], #0x8\n"
+ "ldr d14, [x24], #0x8\n"
"mov x19, #0x28\n"
- "tbz x16, #0, 192f\n"
- "ld1 { v10.s }[2], [x13]\n"
- "ld1 { v14.s }[2], [x9]\n"
- "ld1 { v18.s }[2], [x27]\n"
- "ld1 { v22.s }[2], [x25]\n"
- "ld1 { v26.s }[2], [x23]\n"
- "ld1 { v30.s }[2], [x21]\n"
- "b 192f\n"
- "187:" // Height 6: Partial accumulate: partial_1_8
+ "ldr d18, [x23], #0x8\n"
+ "ldr d22, [x22], #0x8\n"
+ "ldr d26, [x21], #0x8\n"
+ "ldr d30, [x20], #0x8\n"
+ "tbz x11, #0, 186f\n"
+ "ld1 { v10.s }[2], [x28]\n"
+ "ld1 { v14.s }[2], [x24]\n"
+ "ld1 { v18.s }[2], [x23]\n"
+ "ld1 { v22.s }[2], [x22]\n"
+ "ld1 { v26.s }[2], [x21]\n"
+ "ld1 { v30.s }[2], [x20]\n"
+ "b 186f\n"
+ "181:" // Height 6: Partial accumulate: partial_1_8
"mov x19, #0x20\n"
- "tbz x16, #0, 192f\n"
- "ldr s10, [x13, #0x0]\n"
- "ldr s14, [x9, #0x0]\n"
- "ldr s18, [x27, #0x0]\n"
- "ldr s22, [x25, #0x0]\n"
- "ldr s26, [x23, #0x0]\n"
- "ldr s30, [x21, #0x0]\n"
- "b 192f\n"
- "188:" // Height 6: Partial accumulate: partial_4_0
- "tbz x16, #2, 190f\n"
- "ld1 { v8.4s }, [x13], #0x10\n"
- "ld1 { v12.4s }, [x9], #0x10\n"
- "ld1 { v16.4s }, [x27], #0x10\n"
- "ld1 { v20.4s }, [x25], #0x10\n"
- "ld1 { v24.4s }, [x23], #0x10\n"
- "ld1 { v28.4s }, [x21], #0x10\n"
- "tbz x16, #1, 189f\n"
+ "tbz x11, #0, 186f\n"
+ "ldr s10, [x28, #0x0]\n"
+ "ldr s14, [x24, #0x0]\n"
+ "ldr s18, [x23, #0x0]\n"
+ "ldr s22, [x22, #0x0]\n"
+ "ldr s26, [x21, #0x0]\n"
+ "ldr s30, [x20, #0x0]\n"
+ "b 186f\n"
+ "182:" // Height 6: Partial accumulate: partial_4_0
+ "tbz x11, #2, 184f\n"
+ "ld1 { v8.4s }, [x28], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "ld1 { v20.4s }, [x22], #0x10\n"
+ "ld1 { v24.4s }, [x21], #0x10\n"
+ "ld1 { v28.4s }, [x20], #0x10\n"
+ "tbz x11, #1, 183f\n"
"mov x19, #0x18\n"
- "ldr d9, [x13], #0x8\n"
- "ldr d13, [x9], #0x8\n"
- "ldr d17, [x27], #0x8\n"
- "ldr d21, [x25], #0x8\n"
- "ldr d25, [x23], #0x8\n"
- "ldr d29, [x21], #0x8\n"
- "tbz x16, #0, 192f\n"
- "ld1 { v9.s }[2], [x13]\n"
- "ld1 { v13.s }[2], [x9]\n"
- "ld1 { v17.s }[2], [x27]\n"
- "ld1 { v21.s }[2], [x25]\n"
- "ld1 { v25.s }[2], [x23]\n"
- "ld1 { v29.s }[2], [x21]\n"
- "b 192f\n"
- "189:" // Height 6: Partial accumulate: partial_1_4
+ "ldr d9, [x28], #0x8\n"
+ "ldr d13, [x24], #0x8\n"
+ "ldr d17, [x23], #0x8\n"
+ "ldr d21, [x22], #0x8\n"
+ "ldr d25, [x21], #0x8\n"
+ "ldr d29, [x20], #0x8\n"
+ "tbz x11, #0, 186f\n"
+ "ld1 { v9.s }[2], [x28]\n"
+ "ld1 { v13.s }[2], [x24]\n"
+ "ld1 { v17.s }[2], [x23]\n"
+ "ld1 { v21.s }[2], [x22]\n"
+ "ld1 { v25.s }[2], [x21]\n"
+ "ld1 { v29.s }[2], [x20]\n"
+ "b 186f\n"
+ "183:" // Height 6: Partial accumulate: partial_1_4
"mov x19, #0x10\n"
- "tbz x16, #0, 192f\n"
- "ldr s9, [x13, #0x0]\n"
- "ldr s13, [x9, #0x0]\n"
- "ldr s17, [x27, #0x0]\n"
- "ldr s21, [x25, #0x0]\n"
- "ldr s25, [x23, #0x0]\n"
- "ldr s29, [x21, #0x0]\n"
- "b 192f\n"
- "190:" // Height 6: Partial accumulate: partial_2_0
- "tbz x16, #1, 191f\n"
- "ldr d8, [x13], #0x8\n"
- "ldr d12, [x9], #0x8\n"
- "ldr d16, [x27], #0x8\n"
- "ldr d20, [x25], #0x8\n"
- "ldr d24, [x23], #0x8\n"
- "ldr d28, [x21], #0x8\n"
+ "tbz x11, #0, 186f\n"
+ "ldr s9, [x28, #0x0]\n"
+ "ldr s13, [x24, #0x0]\n"
+ "ldr s17, [x23, #0x0]\n"
+ "ldr s21, [x22, #0x0]\n"
+ "ldr s25, [x21, #0x0]\n"
+ "ldr s29, [x20, #0x0]\n"
+ "b 186f\n"
+ "184:" // Height 6: Partial accumulate: partial_2_0
+ "tbz x11, #1, 185f\n"
+ "ldr d8, [x28], #0x8\n"
+ "ldr d12, [x24], #0x8\n"
"mov x19, #0x8\n"
- "tbz x16, #0, 192f\n"
- "ld1 { v8.s }[2], [x13]\n"
- "ld1 { v12.s }[2], [x9]\n"
- "ld1 { v16.s }[2], [x27]\n"
- "ld1 { v20.s }[2], [x25]\n"
- "ld1 { v24.s }[2], [x23]\n"
- "ld1 { v28.s }[2], [x21]\n"
- "b 192f\n"
- "191:" // Height 6: Partial accumulate: partial_1_0
+ "ldr d16, [x23], #0x8\n"
+ "ldr d20, [x22], #0x8\n"
+ "ldr d24, [x21], #0x8\n"
+ "ldr d28, [x20], #0x8\n"
+ "tbz x11, #0, 186f\n"
+ "ld1 { v8.s }[2], [x28]\n"
+ "ld1 { v12.s }[2], [x24]\n"
+ "ld1 { v16.s }[2], [x23]\n"
+ "ld1 { v20.s }[2], [x22]\n"
+ "ld1 { v24.s }[2], [x21]\n"
+ "ld1 { v28.s }[2], [x20]\n"
+ "b 186f\n"
+ "185:" // Height 6: Partial accumulate: partial_1_0
+ "ldr s8, [x28, #0x0]\n"
"mov x19, #0x0\n"
- "ldr s8, [x13, #0x0]\n"
- "ldr s12, [x9, #0x0]\n"
- "ldr s16, [x27, #0x0]\n"
- "ldr s20, [x25, #0x0]\n"
- "ldr s24, [x23, #0x0]\n"
- "ldr s28, [x21, #0x0]\n"
- "192:" // Height 6: Partial accumulate: Done
- "sub x13, x13, x19\n"
- "sub x9, x9, x19\n"
- "sub x27, x27, x19\n"
- "sub x25, x25, x19\n"
- "sub x23, x23, x19\n"
- "sub x21, x21, x19\n"
- "b 195f\n"
- "193:" // Height 6: full accumulate
- "ldr q8, [x13, #0x0]\n"
- "ldr q9, [x13, #0x10]\n"
- "ldr q10, [x13, #0x20]\n"
- "ldr q11, [x13, #0x30]\n"
- "ldr q12, [x9, #0x0]\n"
- "ldr q13, [x9, #0x10]\n"
- "ldr q14, [x9, #0x20]\n"
- "ldr q15, [x9, #0x30]\n"
- "ldr q16, [x27, #0x0]\n"
- "ldr q17, [x27, #0x10]\n"
- "ldr q18, [x27, #0x20]\n"
- "ldr q19, [x27, #0x30]\n"
- "ldr q20, [x25, #0x0]\n"
- "ldr q21, [x25, #0x10]\n"
- "ldr q22, [x25, #0x20]\n"
- "ldr q23, [x25, #0x30]\n"
- "ldr q24, [x23, #0x0]\n"
- "ldr q25, [x23, #0x10]\n"
- "ldr q26, [x23, #0x20]\n"
- "ldr q27, [x23, #0x30]\n"
- "ldr q28, [x21, #0x0]\n"
- "ldr q29, [x21, #0x10]\n"
- "ldr q30, [x21, #0x20]\n"
- "ldr q31, [x21, #0x30]\n"
- "b 195f\n"
- "194:" // Height 6: no accumulate
+ "ldr s12, [x24, #0x0]\n"
+ "ldr s16, [x23, #0x0]\n"
+ "ldr s20, [x22, #0x0]\n"
+ "ldr s24, [x21, #0x0]\n"
+ "ldr s28, [x20, #0x0]\n"
+ "186:" // Height 6: Partial accumulate: Done
+ "sub x28, x28, x19\n"
+ "b 189f\n"
+ "187:" // Height 6: full accumulate
+ "ldr q8, [x28, #0x0]\n"
+ "ldr q9, [x28, #0x10]\n"
+ "ldr q10, [x28, #0x20]\n"
+ "ldr q11, [x28, #0x30]\n"
+ "ldr q12, [x24, #0x0]\n"
+ "ldr q13, [x24, #0x10]\n"
+ "ldr q14, [x24, #0x20]\n"
+ "ldr q15, [x24, #0x30]\n"
+ "ldr q16, [x23, #0x0]\n"
+ "ldr q17, [x23, #0x10]\n"
+ "ldr q18, [x23, #0x20]\n"
+ "ldr q19, [x23, #0x30]\n"
+ "ldr q20, [x22, #0x0]\n"
+ "ldr q21, [x22, #0x10]\n"
+ "ldr q22, [x22, #0x20]\n"
+ "ldr q23, [x22, #0x30]\n"
+ "ldr q24, [x21, #0x0]\n"
+ "ldr q25, [x21, #0x10]\n"
+ "ldr q26, [x21, #0x20]\n"
+ "ldr q27, [x21, #0x30]\n"
+ "ldr q28, [x20, #0x0]\n"
+ "ldr q29, [x20, #0x10]\n"
+ "ldr q30, [x20, #0x20]\n"
+ "ldr q31, [x20, #0x30]\n"
+ "b 189f\n"
+ "188:" // Height 6: no accumulate
"movi v8.16b, #0x0\n"
"movi v9.16b, #0x0\n"
"movi v10.16b, #0x0\n"
@@ -2968,299 +2897,299 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"movi v29.16b, #0x0\n"
"movi v30.16b, #0x0\n"
"movi v31.16b, #0x0\n"
- "195:" // Height 6: setup done
- "mov x12, #0x0\n"
- "196:" // Height 6: String loop
+ "189:" // Height 6: setup done
+ "mov x27, #0x0\n"
+ "190:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w11, [x20, x12, LSL #0x2]\n"
- "tbz %x[flags], #3, 197f\n"
- "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 191f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x10, [x20, #0x0]\n"
- "ldr x28, [x20, #0x8]\n"
- "ldr x26, [x20, #0x10]\n"
- "ldr x24, [x20, #0x18]\n"
- "ldr x22, [x20, #0x20]\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "ldr x22, [x20, #0x18]\n"
+ "ldr x21, [x20, #0x20]\n"
"ldr x20, [x20, #0x28]\n"
- "cbnz x12, 198f\n"
+ "cbnz x27, 192f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x10, x10, x19, LSL #1\n"
- "add x28, x28, x19, LSL #1\n"
- "add x26, x26, x19, LSL #1\n"
+ "add x25, x25, x19, LSL #1\n"
"add x24, x24, x19, LSL #1\n"
+ "add x23, x23, x19, LSL #1\n"
"add x22, x22, x19, LSL #1\n"
+ "add x21, x21, x19, LSL #1\n"
"add x20, x20, x19, LSL #1\n"
- "b 198f\n"
- "197:" // Height 6: setup direct input
- "mov x10, %x[input_ptr]\n"
- "add x28, x10, x19, LSL #1\n"
- "add x26, x28, x19, LSL #1\n"
- "add x24, x26, x19, LSL #1\n"
- "add x22, x24, x19, LSL #1\n"
- "add x20, x22, x19, LSL #1\n"
- "198:" // Height 6: input setup done
- "cmp x11, #0x8\n"
- "blt 201f\n"
- "cmp x11, #0x10\n"
- "blt 200f\n"
- "199:" // Height 6: Multiply loop: Main loop head
- "ldr q0, [x10, #0x0]\n"
- "ldr q1, [x28, #0x0]\n"
- "ldr q2, [x26, #0x0]\n"
- "ldr q3, [x24, #0x0]\n"
- "ldr q4, [x22, #0x0]\n"
+ "b 192f\n"
+ "191:" // Height 6: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19, LSL #1\n"
+ "add x23, x24, x19, LSL #1\n"
+ "add x22, x23, x19, LSL #1\n"
+ "add x21, x22, x19, LSL #1\n"
+ "add x20, x21, x19, LSL #1\n"
+ "192:" // Height 6: input setup done
+ "cmp x26, #0x8\n"
+ "blt 195f\n"
+ "ldr q0, [x25, #0x0]\n"
+ "ldr q1, [x24, #0x0]\n"
+ "cmp x26, #0x10\n"
+ "ldr q2, [x23, #0x0]\n"
+ "ldr q3, [x22, #0x0]\n"
+ "ldr q4, [x21, #0x0]\n"
"ldr q5, [x20, #0x0]\n"
- "ldr q6, [x15, #0x0]\n"
+ "ldr q6, [x10, #0x0]\n"
+ "blt 194f\n"
+ "193:" // Height 6: Multiply loop: Main loop head
".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
- "ldr q7, [x15, #0x10]\n"
+ "ldr q7, [x10, #0x10]\n"
+ "add x25, x25, #0x10\n"
".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
- "add x10, x10, #0x10\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x24, x24, #0x10\n"
".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n"
- "add x28, x28, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "add x23, x23, #0x10\n"
".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "add x26, x26, #0x10\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "add x22, x22, #0x10\n"
".inst 0x4f44f0d8 // bfdot v24.4s, v6.8h, v4.h[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "add x24, x24, #0x10\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "add x21, x21, #0x10\n"
".inst 0x4f45f0dc // bfdot v28.4s, v6.8h, v5.h[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "ldr q6, [x15, #0x20]\n"
+ "ldr q6, [x10, #0x20]\n"
+ "add x20, x20, #0x10\n"
".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
- "add x22, x22, #0x10\n"
- "prfm pldl1keep, [x22, #0x80]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
+ "sub x26, x26, #0x8\n"
".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
- "add x20, x20, #0x10\n"
- ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n"
"prfm pldl1keep, [x20, #0x80]\n"
- "sub x11, x11, #0x8\n"
+ "cmp x26, #0x10\n"
+ ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n"
".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n"
- "cmp x11, #0x10\n"
".inst 0x4f44f0f9 // bfdot v25.4s, v7.8h, v4.h[0]\n"
".inst 0x4f45f0fd // bfdot v29.4s, v7.8h, v5.h[0]\n"
- "ldr q7, [x15, #0x30]\n"
+ "ldr q7, [x10, #0x30]\n"
".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n"
".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n"
".inst 0x4f44f0da // bfdot v26.4s, v6.8h, v4.h[0]\n"
".inst 0x4f45f0de // bfdot v30.4s, v6.8h, v5.h[0]\n"
- "ldr q6, [x15, #0x40]\n"
+ "ldr q6, [x10, #0x40]\n"
".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n"
".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n"
".inst 0x4f44f0fb // bfdot v27.4s, v7.8h, v4.h[0]\n"
".inst 0x4f45f0ff // bfdot v31.4s, v7.8h, v5.h[0]\n"
- "ldr q7, [x15, #0x50]\n"
+ "ldr q7, [x10, #0x50]\n"
".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n"
".inst 0x4f61f0cc // bfdot v12.4s, v6.8h, v1.h[1]\n"
".inst 0x4f62f0d0 // bfdot v16.4s, v6.8h, v2.h[1]\n"
".inst 0x4f63f0d4 // bfdot v20.4s, v6.8h, v3.h[1]\n"
".inst 0x4f64f0d8 // bfdot v24.4s, v6.8h, v4.h[1]\n"
".inst 0x4f65f0dc // bfdot v28.4s, v6.8h, v5.h[1]\n"
- "ldr q6, [x15, #0x60]\n"
+ "ldr q6, [x10, #0x60]\n"
".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n"
".inst 0x4f61f0ed // bfdot v13.4s, v7.8h, v1.h[1]\n"
".inst 0x4f62f0f1 // bfdot v17.4s, v7.8h, v2.h[1]\n"
".inst 0x4f63f0f5 // bfdot v21.4s, v7.8h, v3.h[1]\n"
".inst 0x4f64f0f9 // bfdot v25.4s, v7.8h, v4.h[1]\n"
".inst 0x4f65f0fd // bfdot v29.4s, v7.8h, v5.h[1]\n"
- "ldr q7, [x15, #0x70]\n"
+ "ldr q7, [x10, #0x70]\n"
".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n"
".inst 0x4f61f0ce // bfdot v14.4s, v6.8h, v1.h[1]\n"
".inst 0x4f62f0d2 // bfdot v18.4s, v6.8h, v2.h[1]\n"
".inst 0x4f63f0d6 // bfdot v22.4s, v6.8h, v3.h[1]\n"
".inst 0x4f64f0da // bfdot v26.4s, v6.8h, v4.h[1]\n"
".inst 0x4f65f0de // bfdot v30.4s, v6.8h, v5.h[1]\n"
- "ldr q6, [x15, #0x80]\n"
+ "ldr q6, [x10, #0x80]\n"
".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n"
".inst 0x4f61f0ef // bfdot v15.4s, v7.8h, v1.h[1]\n"
".inst 0x4f62f0f3 // bfdot v19.4s, v7.8h, v2.h[1]\n"
".inst 0x4f63f0f7 // bfdot v23.4s, v7.8h, v3.h[1]\n"
".inst 0x4f64f0fb // bfdot v27.4s, v7.8h, v4.h[1]\n"
".inst 0x4f65f0ff // bfdot v31.4s, v7.8h, v5.h[1]\n"
- "ldr q7, [x15, #0x90]\n"
+ "ldr q7, [x10, #0x90]\n"
".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n"
".inst 0x4f41f8cc // bfdot v12.4s, v6.8h, v1.h[2]\n"
".inst 0x4f42f8d0 // bfdot v16.4s, v6.8h, v2.h[2]\n"
".inst 0x4f43f8d4 // bfdot v20.4s, v6.8h, v3.h[2]\n"
".inst 0x4f44f8d8 // bfdot v24.4s, v6.8h, v4.h[2]\n"
".inst 0x4f45f8dc // bfdot v28.4s, v6.8h, v5.h[2]\n"
- "ldr q6, [x15, #0xa0]\n"
+ "ldr q6, [x10, #0xa0]\n"
".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n"
".inst 0x4f41f8ed // bfdot v13.4s, v7.8h, v1.h[2]\n"
".inst 0x4f42f8f1 // bfdot v17.4s, v7.8h, v2.h[2]\n"
".inst 0x4f43f8f5 // bfdot v21.4s, v7.8h, v3.h[2]\n"
".inst 0x4f44f8f9 // bfdot v25.4s, v7.8h, v4.h[2]\n"
".inst 0x4f45f8fd // bfdot v29.4s, v7.8h, v5.h[2]\n"
- "ldr q7, [x15, #0xb0]\n"
+ "ldr q7, [x10, #0xb0]\n"
".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n"
".inst 0x4f41f8ce // bfdot v14.4s, v6.8h, v1.h[2]\n"
".inst 0x4f42f8d2 // bfdot v18.4s, v6.8h, v2.h[2]\n"
".inst 0x4f43f8d6 // bfdot v22.4s, v6.8h, v3.h[2]\n"
".inst 0x4f44f8da // bfdot v26.4s, v6.8h, v4.h[2]\n"
".inst 0x4f45f8de // bfdot v30.4s, v6.8h, v5.h[2]\n"
- "ldr q6, [x15, #0xc0]\n"
+ "ldr q6, [x10, #0xc0]\n"
".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n"
".inst 0x4f41f8ef // bfdot v15.4s, v7.8h, v1.h[2]\n"
".inst 0x4f42f8f3 // bfdot v19.4s, v7.8h, v2.h[2]\n"
".inst 0x4f43f8f7 // bfdot v23.4s, v7.8h, v3.h[2]\n"
".inst 0x4f44f8fb // bfdot v27.4s, v7.8h, v4.h[2]\n"
".inst 0x4f45f8ff // bfdot v31.4s, v7.8h, v5.h[2]\n"
- "ldr q7, [x15, #0xd0]\n"
+ "ldr q7, [x10, #0xd0]\n"
".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n"
".inst 0x4f61f8cc // bfdot v12.4s, v6.8h, v1.h[3]\n"
".inst 0x4f62f8d0 // bfdot v16.4s, v6.8h, v2.h[3]\n"
".inst 0x4f63f8d4 // bfdot v20.4s, v6.8h, v3.h[3]\n"
".inst 0x4f64f8d8 // bfdot v24.4s, v6.8h, v4.h[3]\n"
".inst 0x4f65f8dc // bfdot v28.4s, v6.8h, v5.h[3]\n"
- "ldr q6, [x15, #0xe0]\n"
+ "ldr q6, [x10, #0xe0]\n"
".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n"
".inst 0x4f61f8ed // bfdot v13.4s, v7.8h, v1.h[3]\n"
".inst 0x4f62f8f1 // bfdot v17.4s, v7.8h, v2.h[3]\n"
".inst 0x4f63f8f5 // bfdot v21.4s, v7.8h, v3.h[3]\n"
".inst 0x4f64f8f9 // bfdot v25.4s, v7.8h, v4.h[3]\n"
".inst 0x4f65f8fd // bfdot v29.4s, v7.8h, v5.h[3]\n"
- "ldr q7, [x15, #0xf0]\n"
+ "ldr q7, [x10, #0xf0]\n"
+ "add x10, x10, #0x100\n"
".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n"
- "add x15, x15, #0x100\n"
".inst 0x4f61f8ce // bfdot v14.4s, v6.8h, v1.h[3]\n"
".inst 0x4f62f8d2 // bfdot v18.4s, v6.8h, v2.h[3]\n"
".inst 0x4f63f8d6 // bfdot v22.4s, v6.8h, v3.h[3]\n"
".inst 0x4f64f8da // bfdot v26.4s, v6.8h, v4.h[3]\n"
".inst 0x4f65f8de // bfdot v30.4s, v6.8h, v5.h[3]\n"
+ "ldr q6, [x10, #0x0]\n"
".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n"
+ "ldr q0, [x25, #0x0]\n"
".inst 0x4f61f8ef // bfdot v15.4s, v7.8h, v1.h[3]\n"
+ "ldr q1, [x24, #0x0]\n"
".inst 0x4f62f8f3 // bfdot v19.4s, v7.8h, v2.h[3]\n"
+ "ldr q2, [x23, #0x0]\n"
".inst 0x4f63f8f7 // bfdot v23.4s, v7.8h, v3.h[3]\n"
+ "ldr q3, [x22, #0x0]\n"
".inst 0x4f64f8fb // bfdot v27.4s, v7.8h, v4.h[3]\n"
+ "ldr q4, [x21, #0x0]\n"
".inst 0x4f65f8ff // bfdot v31.4s, v7.8h, v5.h[3]\n"
- "bge 199b\n"
- "200:" // Height 6: Multiply loop: Single iteration only
- "sub x11, x11, #0x8\n"
- "ldr q0, [x10, #0x0]\n"
- "ldr q1, [x28, #0x0]\n"
- "ldr q2, [x26, #0x0]\n"
- "ldr q3, [x24, #0x0]\n"
- "ldr q4, [x22, #0x0]\n"
"ldr q5, [x20, #0x0]\n"
- "ldr q6, [x15, #0x0]\n"
+ "bge 193b\n"
+ "194:" // Height 6: Multiply loop: Single iteration only
".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
- "ldr q7, [x15, #0x10]\n"
+ "ldr q7, [x10, #0x10]\n"
+ "sub x26, x26, #0x8\n"
".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
- "add x10, x10, #0x10\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "add x25, x25, #0x10\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n"
- "add x28, x28, #0x10\n"
- ".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "add x26, x26, #0x10\n"
- ".inst 0x4f44f0d8 // bfdot v24.4s, v6.8h, v4.h[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
"add x24, x24, #0x10\n"
- ".inst 0x4f45f0dc // bfdot v28.4s, v6.8h, v5.h[0]\n"
+ ".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n"
"prfm pldl1keep, [x24, #0x80]\n"
- "ldr q6, [x15, #0x20]\n"
- ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x4f44f0d8 // bfdot v24.4s, v6.8h, v4.h[0]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
"add x22, x22, #0x10\n"
+ ".inst 0x4f45f0dc // bfdot v28.4s, v6.8h, v5.h[0]\n"
"prfm pldl1keep, [x22, #0x80]\n"
- ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
+ "add x21, x21, #0x10\n"
+ ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
+ "ldr q6, [x10, #0x20]\n"
"add x20, x20, #0x10\n"
+ ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n"
"prfm pldl1keep, [x20, #0x80]\n"
".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n"
".inst 0x4f44f0f9 // bfdot v25.4s, v7.8h, v4.h[0]\n"
".inst 0x4f45f0fd // bfdot v29.4s, v7.8h, v5.h[0]\n"
- "ldr q7, [x15, #0x30]\n"
+ "ldr q7, [x10, #0x30]\n"
".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n"
".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n"
".inst 0x4f44f0da // bfdot v26.4s, v6.8h, v4.h[0]\n"
".inst 0x4f45f0de // bfdot v30.4s, v6.8h, v5.h[0]\n"
- "ldr q6, [x15, #0x40]\n"
+ "ldr q6, [x10, #0x40]\n"
".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n"
".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n"
".inst 0x4f44f0fb // bfdot v27.4s, v7.8h, v4.h[0]\n"
".inst 0x4f45f0ff // bfdot v31.4s, v7.8h, v5.h[0]\n"
- "ldr q7, [x15, #0x50]\n"
+ "ldr q7, [x10, #0x50]\n"
".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n"
".inst 0x4f61f0cc // bfdot v12.4s, v6.8h, v1.h[1]\n"
".inst 0x4f62f0d0 // bfdot v16.4s, v6.8h, v2.h[1]\n"
".inst 0x4f63f0d4 // bfdot v20.4s, v6.8h, v3.h[1]\n"
".inst 0x4f64f0d8 // bfdot v24.4s, v6.8h, v4.h[1]\n"
".inst 0x4f65f0dc // bfdot v28.4s, v6.8h, v5.h[1]\n"
- "ldr q6, [x15, #0x60]\n"
+ "ldr q6, [x10, #0x60]\n"
".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n"
".inst 0x4f61f0ed // bfdot v13.4s, v7.8h, v1.h[1]\n"
".inst 0x4f62f0f1 // bfdot v17.4s, v7.8h, v2.h[1]\n"
".inst 0x4f63f0f5 // bfdot v21.4s, v7.8h, v3.h[1]\n"
".inst 0x4f64f0f9 // bfdot v25.4s, v7.8h, v4.h[1]\n"
".inst 0x4f65f0fd // bfdot v29.4s, v7.8h, v5.h[1]\n"
- "ldr q7, [x15, #0x70]\n"
+ "ldr q7, [x10, #0x70]\n"
".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n"
".inst 0x4f61f0ce // bfdot v14.4s, v6.8h, v1.h[1]\n"
".inst 0x4f62f0d2 // bfdot v18.4s, v6.8h, v2.h[1]\n"
".inst 0x4f63f0d6 // bfdot v22.4s, v6.8h, v3.h[1]\n"
".inst 0x4f64f0da // bfdot v26.4s, v6.8h, v4.h[1]\n"
".inst 0x4f65f0de // bfdot v30.4s, v6.8h, v5.h[1]\n"
- "ldr q6, [x15, #0x80]\n"
+ "ldr q6, [x10, #0x80]\n"
".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n"
".inst 0x4f61f0ef // bfdot v15.4s, v7.8h, v1.h[1]\n"
".inst 0x4f62f0f3 // bfdot v19.4s, v7.8h, v2.h[1]\n"
".inst 0x4f63f0f7 // bfdot v23.4s, v7.8h, v3.h[1]\n"
".inst 0x4f64f0fb // bfdot v27.4s, v7.8h, v4.h[1]\n"
".inst 0x4f65f0ff // bfdot v31.4s, v7.8h, v5.h[1]\n"
- "ldr q7, [x15, #0x90]\n"
+ "ldr q7, [x10, #0x90]\n"
".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n"
".inst 0x4f41f8cc // bfdot v12.4s, v6.8h, v1.h[2]\n"
".inst 0x4f42f8d0 // bfdot v16.4s, v6.8h, v2.h[2]\n"
".inst 0x4f43f8d4 // bfdot v20.4s, v6.8h, v3.h[2]\n"
".inst 0x4f44f8d8 // bfdot v24.4s, v6.8h, v4.h[2]\n"
".inst 0x4f45f8dc // bfdot v28.4s, v6.8h, v5.h[2]\n"
- "ldr q6, [x15, #0xa0]\n"
+ "ldr q6, [x10, #0xa0]\n"
".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n"
".inst 0x4f41f8ed // bfdot v13.4s, v7.8h, v1.h[2]\n"
".inst 0x4f42f8f1 // bfdot v17.4s, v7.8h, v2.h[2]\n"
".inst 0x4f43f8f5 // bfdot v21.4s, v7.8h, v3.h[2]\n"
".inst 0x4f44f8f9 // bfdot v25.4s, v7.8h, v4.h[2]\n"
".inst 0x4f45f8fd // bfdot v29.4s, v7.8h, v5.h[2]\n"
- "ldr q7, [x15, #0xb0]\n"
+ "ldr q7, [x10, #0xb0]\n"
".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n"
".inst 0x4f41f8ce // bfdot v14.4s, v6.8h, v1.h[2]\n"
".inst 0x4f42f8d2 // bfdot v18.4s, v6.8h, v2.h[2]\n"
".inst 0x4f43f8d6 // bfdot v22.4s, v6.8h, v3.h[2]\n"
".inst 0x4f44f8da // bfdot v26.4s, v6.8h, v4.h[2]\n"
".inst 0x4f45f8de // bfdot v30.4s, v6.8h, v5.h[2]\n"
- "ldr q6, [x15, #0xc0]\n"
+ "ldr q6, [x10, #0xc0]\n"
".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n"
".inst 0x4f41f8ef // bfdot v15.4s, v7.8h, v1.h[2]\n"
".inst 0x4f42f8f3 // bfdot v19.4s, v7.8h, v2.h[2]\n"
".inst 0x4f43f8f7 // bfdot v23.4s, v7.8h, v3.h[2]\n"
".inst 0x4f44f8fb // bfdot v27.4s, v7.8h, v4.h[2]\n"
".inst 0x4f45f8ff // bfdot v31.4s, v7.8h, v5.h[2]\n"
- "ldr q7, [x15, #0xd0]\n"
+ "ldr q7, [x10, #0xd0]\n"
".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n"
".inst 0x4f61f8cc // bfdot v12.4s, v6.8h, v1.h[3]\n"
".inst 0x4f62f8d0 // bfdot v16.4s, v6.8h, v2.h[3]\n"
".inst 0x4f63f8d4 // bfdot v20.4s, v6.8h, v3.h[3]\n"
".inst 0x4f64f8d8 // bfdot v24.4s, v6.8h, v4.h[3]\n"
".inst 0x4f65f8dc // bfdot v28.4s, v6.8h, v5.h[3]\n"
- "ldr q6, [x15, #0xe0]\n"
+ "ldr q6, [x10, #0xe0]\n"
".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n"
".inst 0x4f61f8ed // bfdot v13.4s, v7.8h, v1.h[3]\n"
".inst 0x4f62f8f1 // bfdot v17.4s, v7.8h, v2.h[3]\n"
".inst 0x4f63f8f5 // bfdot v21.4s, v7.8h, v3.h[3]\n"
".inst 0x4f64f8f9 // bfdot v25.4s, v7.8h, v4.h[3]\n"
".inst 0x4f65f8fd // bfdot v29.4s, v7.8h, v5.h[3]\n"
- "ldr q7, [x15, #0xf0]\n"
+ "ldr q7, [x10, #0xf0]\n"
+ "add x10, x10, #0x100\n"
".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n"
- "add x15, x15, #0x100\n"
".inst 0x4f61f8ce // bfdot v14.4s, v6.8h, v1.h[3]\n"
".inst 0x4f62f8d2 // bfdot v18.4s, v6.8h, v2.h[3]\n"
".inst 0x4f63f8d6 // bfdot v22.4s, v6.8h, v3.h[3]\n"
@@ -3272,37 +3201,37 @@ void a64_hybrid_bf16fp32_dot_6x16 (
".inst 0x4f63f8f7 // bfdot v23.4s, v7.8h, v3.h[3]\n"
".inst 0x4f64f8fb // bfdot v27.4s, v7.8h, v4.h[3]\n"
".inst 0x4f65f8ff // bfdot v31.4s, v7.8h, v5.h[3]\n"
- "201:" // Height 6: Multiply loop: Main loop skip
- "cbz x11, 205f\n"
- "cmp x11, #0x2\n"
- "blt 203f\n"
- "202:" // Height 6: Multiply loop: Odd block loop
- "ldr s0, [x10], #0x4\n"
- "ldr s1, [x28], #0x4\n"
- "ldr s2, [x26], #0x4\n"
- "ldr s3, [x24], #0x4\n"
- "ldr s4, [x22], #0x4\n"
+ "195:" // Height 6: Multiply loop: Main loop skip
+ "cbz x26, 199f\n"
+ "cmp x26, #0x2\n"
+ "blt 197f\n"
+ "196:" // Height 6: Multiply loop: Odd block loop
+ "ldr s0, [x25], #0x4\n"
+ "sub x26, x26, #0x2\n"
+ "ldr s1, [x24], #0x4\n"
+ "cmp x26, #0x2\n"
+ "ldr s2, [x23], #0x4\n"
+ "ldr s3, [x22], #0x4\n"
+ "ldr s4, [x21], #0x4\n"
"ldr s5, [x20], #0x4\n"
- "ldr q6, [x15, #0x0]\n"
+ "ldr q6, [x10, #0x0]\n"
".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
- "ldr q7, [x15, #0x10]\n"
+ "ldr q7, [x10, #0x10]\n"
".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
- "sub x11, x11, #0x2\n"
".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n"
- "cmp x11, #0x2\n"
".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n"
".inst 0x4f44f0d8 // bfdot v24.4s, v6.8h, v4.h[0]\n"
".inst 0x4f45f0dc // bfdot v28.4s, v6.8h, v5.h[0]\n"
- "ldr q6, [x15, #0x20]\n"
+ "ldr q6, [x10, #0x20]\n"
".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n"
".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n"
".inst 0x4f44f0f9 // bfdot v25.4s, v7.8h, v4.h[0]\n"
".inst 0x4f45f0fd // bfdot v29.4s, v7.8h, v5.h[0]\n"
- "ldr q7, [x15, #0x30]\n"
+ "ldr q7, [x10, #0x30]\n"
+ "add x10, x10, #0x40\n"
".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
- "add x15, x15, #0x40\n"
".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n"
".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n"
@@ -3314,34 +3243,34 @@ void a64_hybrid_bf16fp32_dot_6x16 (
".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n"
".inst 0x4f44f0fb // bfdot v27.4s, v7.8h, v4.h[0]\n"
".inst 0x4f45f0ff // bfdot v31.4s, v7.8h, v5.h[0]\n"
- "bge 202b\n"
- "cbz x11, 205f\n"
- "203:" // Height 6: Multiply loop: Skip odd blocks
- "ldr h0, [x10, #0x0]\n"
- "ldr h1, [x28, #0x0]\n"
- "ldr h2, [x26, #0x0]\n"
- "ldr h3, [x24, #0x0]\n"
- "ldr h4, [x22, #0x0]\n"
+ "bge 196b\n"
+ "cbz x26, 199f\n"
+ "197:" // Height 6: Multiply loop: Skip odd blocks
+ "ldr h0, [x25, #0x0]\n"
+ "ldr h1, [x24, #0x0]\n"
+ "ldr h2, [x23, #0x0]\n"
+ "ldr h3, [x22, #0x0]\n"
+ "ldr h4, [x21, #0x0]\n"
"ldr h5, [x20, #0x0]\n"
- "204:" // Height 6: Multiply loop: Ragged operand read: Done
- "ldr q6, [x15, #0x0]\n"
+ "198:" // Height 6: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x10, #0x0]\n"
".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
- "ldr q7, [x15, #0x10]\n"
+ "ldr q7, [x10, #0x10]\n"
".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n"
".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n"
".inst 0x4f44f0d8 // bfdot v24.4s, v6.8h, v4.h[0]\n"
".inst 0x4f45f0dc // bfdot v28.4s, v6.8h, v5.h[0]\n"
- "ldr q6, [x15, #0x20]\n"
+ "ldr q6, [x10, #0x20]\n"
".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n"
".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n"
".inst 0x4f44f0f9 // bfdot v25.4s, v7.8h, v4.h[0]\n"
".inst 0x4f45f0fd // bfdot v29.4s, v7.8h, v5.h[0]\n"
- "ldr q7, [x15, #0x30]\n"
+ "ldr q7, [x10, #0x30]\n"
+ "add x10, x10, #0x40\n"
".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
- "add x15, x15, #0x40\n"
".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n"
".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n"
@@ -3353,18 +3282,24 @@ void a64_hybrid_bf16fp32_dot_6x16 (
".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n"
".inst 0x4f44f0fb // bfdot v27.4s, v7.8h, v4.h[0]\n"
".inst 0x4f45f0ff // bfdot v31.4s, v7.8h, v5.h[0]\n"
- "205:" // Height 6: Multiply loop: No odd multiplies
+ "199:" // Height 6: Multiply loop: No odd multiplies
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x12, x12, #0x1\n"
- "cmp x12, x19\n"
- "bne 196b\n"
- "prfm pstl1keep, [x13, #0x0]\n"
- "prfm pstl1keep, [x9, #0x0]\n"
- "prfm pstl1keep, [x27, #0x0]\n"
- "prfm pstl1keep, [x25, #0x0]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 190b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "prfm pstl1keep, [x28, #0x0]\n"
+ "add x24, x28, x19, LSL #2\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "add x23, x24, x19, LSL #2\n"
"prfm pstl1keep, [x23, #0x0]\n"
+ "add x22, x23, x19, LSL #2\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "add x21, x22, x19, LSL #2\n"
"prfm pstl1keep, [x21, #0x0]\n"
- "tbz %x[flags], #1, 206f\n"
+ "add x20, x21, x19, LSL #2\n"
+ "prfm pstl1keep, [x20, #0x0]\n"
+ "tbz %x[flags], #1, 200f\n"
"add x19, %x[args_ptr], %[offset_min]\n"
"ld1r { v1.4s }, [x19]\n"
"add x19, %x[args_ptr], %[offset_max]\n"
@@ -3417,185 +3352,180 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"fmin v31.4s, v31.4s, v0.4s\n"
"fmax v30.4s, v30.4s, v1.4s\n"
"fmax v31.4s, v31.4s, v1.4s\n"
- "206:" // Height 6: No activation
- "cmp x16, #0x10\n"
- "bge 215f\n"
- "tbz x16, #3, 210f\n"
- "st1 { v8.4s }, [x13], #0x10\n"
- "st1 { v9.4s }, [x13], #0x10\n"
- "st1 { v12.4s }, [x9], #0x10\n"
- "st1 { v13.4s }, [x9], #0x10\n"
- "st1 { v16.4s }, [x27], #0x10\n"
- "st1 { v17.4s }, [x27], #0x10\n"
- "st1 { v20.4s }, [x25], #0x10\n"
- "st1 { v21.4s }, [x25], #0x10\n"
- "st1 { v24.4s }, [x23], #0x10\n"
- "st1 { v25.4s }, [x23], #0x10\n"
- "st1 { v28.4s }, [x21], #0x10\n"
- "st1 { v29.4s }, [x21], #0x10\n"
- "tbz x16, #2, 208f\n"
- "st1 { v10.4s }, [x13], #0x10\n"
- "st1 { v14.4s }, [x9], #0x10\n"
- "st1 { v18.4s }, [x27], #0x10\n"
- "st1 { v22.4s }, [x25], #0x10\n"
- "st1 { v26.4s }, [x23], #0x10\n"
- "st1 { v30.4s }, [x21], #0x10\n"
- "tbz x16, #1, 207f\n"
- "str d11, [x13], #0x8\n"
- "str d15, [x9], #0x8\n"
- "str d19, [x27], #0x8\n"
- "str d23, [x25], #0x8\n"
- "str d27, [x23], #0x8\n"
- "str d31, [x21], #0x8\n"
- "tbz x16, #0, 214f\n"
- "st1 { v11.s }[2], [x13]\n"
- "st1 { v15.s }[2], [x9]\n"
- "st1 { v19.s }[2], [x27]\n"
- "st1 { v23.s }[2], [x25]\n"
- "st1 { v27.s }[2], [x23]\n"
- "st1 { v31.s }[2], [x21]\n"
- "b 214f\n"
- "207:" // Height 6: Partial direct writeback: partial_1_12
- "tbz x16, #0, 214f\n"
- "str s11, [x13, #0x0]\n"
- "str s15, [x9, #0x0]\n"
- "str s19, [x27, #0x0]\n"
- "str s23, [x25, #0x0]\n"
- "str s27, [x23, #0x0]\n"
- "str s31, [x21, #0x0]\n"
- "b 214f\n"
- "208:" // Height 6: Partial direct writeback: partial_2_8
- "tbz x16, #1, 209f\n"
- "str d10, [x13], #0x8\n"
- "str d14, [x9], #0x8\n"
- "str d18, [x27], #0x8\n"
- "str d22, [x25], #0x8\n"
- "str d26, [x23], #0x8\n"
- "str d30, [x21], #0x8\n"
- "tbz x16, #0, 214f\n"
- "st1 { v10.s }[2], [x13]\n"
- "st1 { v14.s }[2], [x9]\n"
- "st1 { v18.s }[2], [x27]\n"
- "st1 { v22.s }[2], [x25]\n"
- "st1 { v26.s }[2], [x23]\n"
- "st1 { v30.s }[2], [x21]\n"
- "b 214f\n"
- "209:" // Height 6: Partial direct writeback: partial_1_8
- "tbz x16, #0, 214f\n"
- "str s10, [x13, #0x0]\n"
- "str s14, [x9, #0x0]\n"
- "str s18, [x27, #0x0]\n"
- "str s22, [x25, #0x0]\n"
- "str s26, [x23, #0x0]\n"
- "str s30, [x21, #0x0]\n"
- "b 214f\n"
- "210:" // Height 6: Partial direct writeback: partial_4_0
- "tbz x16, #2, 212f\n"
- "st1 { v8.4s }, [x13], #0x10\n"
- "st1 { v12.4s }, [x9], #0x10\n"
- "st1 { v16.4s }, [x27], #0x10\n"
- "st1 { v20.4s }, [x25], #0x10\n"
- "st1 { v24.4s }, [x23], #0x10\n"
- "st1 { v28.4s }, [x21], #0x10\n"
- "tbz x16, #1, 211f\n"
- "str d9, [x13], #0x8\n"
- "str d13, [x9], #0x8\n"
- "str d17, [x27], #0x8\n"
- "str d21, [x25], #0x8\n"
- "str d25, [x23], #0x8\n"
- "str d29, [x21], #0x8\n"
- "tbz x16, #0, 214f\n"
- "st1 { v9.s }[2], [x13]\n"
- "st1 { v13.s }[2], [x9]\n"
- "st1 { v17.s }[2], [x27]\n"
- "st1 { v21.s }[2], [x25]\n"
- "st1 { v25.s }[2], [x23]\n"
- "st1 { v29.s }[2], [x21]\n"
- "b 214f\n"
- "211:" // Height 6: Partial direct writeback: partial_1_4
- "tbz x16, #0, 214f\n"
- "str s9, [x13, #0x0]\n"
- "str s13, [x9, #0x0]\n"
- "str s17, [x27, #0x0]\n"
- "str s21, [x25, #0x0]\n"
- "str s25, [x23, #0x0]\n"
- "str s29, [x21, #0x0]\n"
- "b 214f\n"
- "212:" // Height 6: Partial direct writeback: partial_2_0
- "tbz x16, #1, 213f\n"
- "str d8, [x13], #0x8\n"
- "str d12, [x9], #0x8\n"
- "str d16, [x27], #0x8\n"
- "str d20, [x25], #0x8\n"
- "str d24, [x23], #0x8\n"
- "str d28, [x21], #0x8\n"
- "tbz x16, #0, 214f\n"
- "st1 { v8.s }[2], [x13]\n"
- "st1 { v12.s }[2], [x9]\n"
- "st1 { v16.s }[2], [x27]\n"
- "st1 { v20.s }[2], [x25]\n"
- "st1 { v24.s }[2], [x23]\n"
- "st1 { v28.s }[2], [x21]\n"
- "b 214f\n"
- "213:" // Height 6: Partial direct writeback: partial_1_0
- "str s8, [x13, #0x0]\n"
- "str s12, [x9, #0x0]\n"
- "str s16, [x27, #0x0]\n"
- "str s20, [x25, #0x0]\n"
- "str s24, [x23, #0x0]\n"
- "str s28, [x21, #0x0]\n"
- "214:" // Height 6: Partial direct writeback: Done
- "b 216f\n"
- "215:" // Height 6: Full writeback
- "str q8, [x13, #0x0]\n"
- "str q9, [x13, #0x10]\n"
- "str q10, [x13, #0x20]\n"
- "str q11, [x13, #0x30]\n"
- "str q12, [x9, #0x0]\n"
- "str q13, [x9, #0x10]\n"
- "str q14, [x9, #0x20]\n"
- "str q15, [x9, #0x30]\n"
- "str q16, [x27, #0x0]\n"
- "str q17, [x27, #0x10]\n"
- "str q18, [x27, #0x20]\n"
- "str q19, [x27, #0x30]\n"
- "str q20, [x25, #0x0]\n"
- "str q21, [x25, #0x10]\n"
- "str q22, [x25, #0x20]\n"
- "str q23, [x25, #0x30]\n"
- "str q24, [x23, #0x0]\n"
- "str q25, [x23, #0x10]\n"
- "str q26, [x23, #0x20]\n"
- "str q27, [x23, #0x30]\n"
- "str q28, [x21, #0x0]\n"
- "str q29, [x21, #0x10]\n"
- "str q30, [x21, #0x20]\n"
- "str q31, [x21, #0x30]\n"
- "add x13, x13, #0x40\n"
- "add x9, x9, #0x40\n"
- "add x27, x27, #0x40\n"
- "add x25, x25, #0x40\n"
- "add x23, x23, #0x40\n"
- "add x21, x21, #0x40\n"
- "216:" // Height 6: Writeback done
- "subs x16, x16, #0x10\n"
- "bgt 183b\n"
+ "200:" // Height 6: No activation
+ "cmp x11, #0x10\n"
+ "bge 209f\n"
+ "tbz x11, #3, 204f\n"
+ "st1 { v8.4s }, [x28], #0x10\n"
+ "st1 { v9.4s }, [x28], #0x10\n"
+ "st1 { v12.4s }, [x24], #0x10\n"
+ "st1 { v13.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v17.4s }, [x23], #0x10\n"
+ "st1 { v20.4s }, [x22], #0x10\n"
+ "st1 { v21.4s }, [x22], #0x10\n"
+ "st1 { v24.4s }, [x21], #0x10\n"
+ "st1 { v25.4s }, [x21], #0x10\n"
+ "st1 { v28.4s }, [x20], #0x10\n"
+ "st1 { v29.4s }, [x20], #0x10\n"
+ "tbz x11, #2, 202f\n"
+ "st1 { v10.4s }, [x28], #0x10\n"
+ "st1 { v14.4s }, [x24], #0x10\n"
+ "st1 { v18.4s }, [x23], #0x10\n"
+ "st1 { v22.4s }, [x22], #0x10\n"
+ "st1 { v26.4s }, [x21], #0x10\n"
+ "st1 { v30.4s }, [x20], #0x10\n"
+ "tbz x11, #1, 201f\n"
+ "str d11, [x28], #0x8\n"
+ "str d15, [x24], #0x8\n"
+ "str d19, [x23], #0x8\n"
+ "str d23, [x22], #0x8\n"
+ "str d27, [x21], #0x8\n"
+ "str d31, [x20], #0x8\n"
+ "tbz x11, #0, 208f\n"
+ "st1 { v11.s }[2], [x28]\n"
+ "st1 { v15.s }[2], [x24]\n"
+ "st1 { v19.s }[2], [x23]\n"
+ "st1 { v23.s }[2], [x22]\n"
+ "st1 { v27.s }[2], [x21]\n"
+ "st1 { v31.s }[2], [x20]\n"
+ "b 208f\n"
+ "201:" // Height 6: Partial direct writeback: partial_1_12
+ "tbz x11, #0, 208f\n"
+ "str s11, [x28, #0x0]\n"
+ "str s15, [x24, #0x0]\n"
+ "str s19, [x23, #0x0]\n"
+ "str s23, [x22, #0x0]\n"
+ "str s27, [x21, #0x0]\n"
+ "str s31, [x20, #0x0]\n"
+ "b 208f\n"
+ "202:" // Height 6: Partial direct writeback: partial_2_8
+ "tbz x11, #1, 203f\n"
+ "str d10, [x28], #0x8\n"
+ "str d14, [x24], #0x8\n"
+ "str d18, [x23], #0x8\n"
+ "str d22, [x22], #0x8\n"
+ "str d26, [x21], #0x8\n"
+ "str d30, [x20], #0x8\n"
+ "tbz x11, #0, 208f\n"
+ "st1 { v10.s }[2], [x28]\n"
+ "st1 { v14.s }[2], [x24]\n"
+ "st1 { v18.s }[2], [x23]\n"
+ "st1 { v22.s }[2], [x22]\n"
+ "st1 { v26.s }[2], [x21]\n"
+ "st1 { v30.s }[2], [x20]\n"
+ "b 208f\n"
+ "203:" // Height 6: Partial direct writeback: partial_1_8
+ "tbz x11, #0, 208f\n"
+ "str s10, [x28, #0x0]\n"
+ "str s14, [x24, #0x0]\n"
+ "str s18, [x23, #0x0]\n"
+ "str s22, [x22, #0x0]\n"
+ "str s26, [x21, #0x0]\n"
+ "str s30, [x20, #0x0]\n"
+ "b 208f\n"
+ "204:" // Height 6: Partial direct writeback: partial_4_0
+ "tbz x11, #2, 206f\n"
+ "st1 { v8.4s }, [x28], #0x10\n"
+ "st1 { v12.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v20.4s }, [x22], #0x10\n"
+ "st1 { v24.4s }, [x21], #0x10\n"
+ "st1 { v28.4s }, [x20], #0x10\n"
+ "tbz x11, #1, 205f\n"
+ "str d9, [x28], #0x8\n"
+ "str d13, [x24], #0x8\n"
+ "str d17, [x23], #0x8\n"
+ "str d21, [x22], #0x8\n"
+ "str d25, [x21], #0x8\n"
+ "str d29, [x20], #0x8\n"
+ "tbz x11, #0, 208f\n"
+ "st1 { v9.s }[2], [x28]\n"
+ "st1 { v13.s }[2], [x24]\n"
+ "st1 { v17.s }[2], [x23]\n"
+ "st1 { v21.s }[2], [x22]\n"
+ "st1 { v25.s }[2], [x21]\n"
+ "st1 { v29.s }[2], [x20]\n"
+ "b 208f\n"
+ "205:" // Height 6: Partial direct writeback: partial_1_4
+ "tbz x11, #0, 208f\n"
+ "str s9, [x28, #0x0]\n"
+ "str s13, [x24, #0x0]\n"
+ "str s17, [x23, #0x0]\n"
+ "str s21, [x22, #0x0]\n"
+ "str s25, [x21, #0x0]\n"
+ "str s29, [x20, #0x0]\n"
+ "b 208f\n"
+ "206:" // Height 6: Partial direct writeback: partial_2_0
+ "tbz x11, #1, 207f\n"
+ "str d8, [x28], #0x8\n"
+ "str d12, [x24], #0x8\n"
+ "str d16, [x23], #0x8\n"
+ "str d20, [x22], #0x8\n"
+ "str d24, [x21], #0x8\n"
+ "str d28, [x20], #0x8\n"
+ "tbz x11, #0, 208f\n"
+ "st1 { v8.s }[2], [x28]\n"
+ "st1 { v12.s }[2], [x24]\n"
+ "st1 { v16.s }[2], [x23]\n"
+ "st1 { v20.s }[2], [x22]\n"
+ "st1 { v24.s }[2], [x21]\n"
+ "st1 { v28.s }[2], [x20]\n"
+ "b 208f\n"
+ "207:" // Height 6: Partial direct writeback: partial_1_0
+ "str s8, [x28, #0x0]\n"
+ "str s12, [x24, #0x0]\n"
+ "str s16, [x23, #0x0]\n"
+ "str s20, [x22, #0x0]\n"
+ "str s24, [x21, #0x0]\n"
+ "str s28, [x20, #0x0]\n"
+ "208:" // Height 6: Partial direct writeback: Done
+ "b 210f\n"
+ "209:" // Height 6: Full writeback
+ "str q8, [x28, #0x0]\n"
+ "str q9, [x28, #0x10]\n"
+ "str q10, [x28, #0x20]\n"
+ "str q11, [x28, #0x30]\n"
+ "add x28, x28, #0x40\n"
+ "str q12, [x24, #0x0]\n"
+ "str q13, [x24, #0x10]\n"
+ "str q14, [x24, #0x20]\n"
+ "str q15, [x24, #0x30]\n"
+ "str q16, [x23, #0x0]\n"
+ "str q17, [x23, #0x10]\n"
+ "str q18, [x23, #0x20]\n"
+ "str q19, [x23, #0x30]\n"
+ "str q20, [x22, #0x0]\n"
+ "str q21, [x22, #0x10]\n"
+ "str q22, [x22, #0x20]\n"
+ "str q23, [x22, #0x30]\n"
+ "str q24, [x21, #0x0]\n"
+ "str q25, [x21, #0x10]\n"
+ "str q26, [x21, #0x20]\n"
+ "str q27, [x21, #0x30]\n"
+ "str q28, [x20, #0x0]\n"
+ "str q29, [x20, #0x10]\n"
+ "str q30, [x20, #0x20]\n"
+ "str q31, [x20, #0x30]\n"
+ "210:" // Height 6: Writeback done
+ "subs x11, x11, #0x10\n"
+ "bgt 177b\n"
"subs %x[M], %x[M], #0x6\n"
- "beq 218f\n"
+ "beq 212f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "tbz %x[flags], #3, 217f\n"
+ "tbz %x[flags], #3, 211f\n"
"add x20, x20, #0x6\n"
"str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
"b 1b\n"
- "217:" // Update direct input
+ "211:" // Update direct input
"mov x19, #0xc\n"
"madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
"b 1b\n"
- "218:" // Exit
+ "212:" // Exit
: [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32.hpp
index ca2696bebd..674d71d626 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32.hpp
@@ -37,9 +37,9 @@
namespace arm_gemm
{
-
// Actual kernel implementations
void a64_hybrid_fp16_mla_6x32( ARGLIST );
+void a64_hybrid_fp16_mla_6x32_a55( ARGLIST );
class cls_a64_hybrid_fp16_mla_6x32
{
@@ -72,10 +72,11 @@ public:
StdTransformsFixed<operand_type, result_type, 6, 32, 1> transforms = {};
- static PerformanceParameters get_performance_parameters(const CPUInfo *ci) {
+ static PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
switch (ci->get_cpu_model()) {
case CPUModel::A55r1:
- return { 5.22 };
+ return { 6.94 };
default:
return { 14.53 };
}
@@ -83,9 +84,15 @@ public:
// Default to the generic kernel
kern_type kernel=a64_hybrid_fp16_mla_6x32;
-
- cls_a64_hybrid_fp16_mla_6x32(const CPUInfo *)
+ cls_a64_hybrid_fp16_mla_6x32(const CPUInfo *ci) // Chooses the kernel entry point from the CPU model reported by ci.
{
+ switch(ci->get_cpu_model()) {
+ default: // Every other model: leave `kernel` at its in-class initializer.
+ break;
+ case CPUModel::A55r1:
+ kernel=a64_hybrid_fp16_mla_6x32_a55; // A55r1: switch to the _a55 variant declared alongside the generic kernel.
+ break;
+ }
}
};
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/a55.cpp
new file mode 100644
index 0000000000..87c73740e7
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/a55.cpp
@@ -0,0 +1,5757 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#if defined(__aarch64__) && (defined(FP16_KERNELS) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC))
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+#include <limits>
+
+namespace arm_gemm {
+
+void a64_hybrid_fp16_mla_6x32_a55 (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<__fp16> A_arg,
+ size_t M, size_t N, const __fp16 *B_ptr, IndirectOutputArg<__fp16> output_arg,
+ const __fp16 *bias, Activation act, bool accumulate
+)
+{
+ struct KernelArgs { // Argument block read by the asm below via args_ptr + %[offsetof_*] / %[offset_min|max] (args_ptr presumably &ka; operand list not in view - confirm).
+ __fp16 maxval = static_cast<__fp16>(std::numeric_limits<float>::infinity()); // Upper clamp bound (fmin operand); +inf unless BoundedReLU overwrites it with act.param1.
+ __fp16 minval = - static_cast<__fp16>(std::numeric_limits<float>::infinity()); // Lower clamp bound (fmax operand); -inf unless ReLU/BoundedReLU set it to 0.
+ unsigned int num_strings = {}; // Copied from the num_strings argument; the asm string loop runs until its counter equals this.
+ const unsigned int *string_lengths = {}; // Copied from the argument; the asm loads a 32-bit length per string, indexed by the string counter.
+ size_t N = {}; // Copied from the N argument; the asm column loop subtracts 0x20 from it per iteration.
+ const __fp16 *B_ptr = {}; // Copied from the B_ptr argument; the asm loads it once per row-height block and advances it while multiplying.
+ size_t output_offset = {}; // output_arg.indirect.offset for indirect output, otherwise output_arg.direct.stride.
+ size_t input_initial_col = {}; // A_arg.indirect.start_col; only assigned for indirect input, and only added (scaled by 2) when the string counter is 0.
+ size_t input_offset = {}; // A_arg.indirect.start_row for indirect input, otherwise A_arg.direct.stride.
+ } ka;
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ if (accumulate) {
+ flags |= 0x1;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ switch(act.type) {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ ka.maxval = static_cast<__fp16>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ ka.minval = 0;
+ flags |= 0x2;
+ break;
+ }
+ __asm__ __volatile__(
+#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ ".arch armv8.2-a+fp16\n"
+#endif
+ "1:" // Row loop
+ "cmp %x[M], #0x6\n"
+ "bge 246f\n"
+ "cmp %x[M], #0x4\n"
+ "bgt 197f\n"
+ "beq 148f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 99f\n"
+ "beq 50f\n"
+ "ldr x8, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x17, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x16, %x[bias]\n"
+ "mov x15, %x[output_ptr]\n"
+ "2:" // Height 1: Column loop
+ "cbz x16, 3f\n"
+ "ldr q8, [x16, #0x0]\n"
+ "ldr q9, [x16, #0x10]\n"
+ "ldr q10, [x16, #0x20]\n"
+ "ldr q11, [x16, #0x30]\n"
+ "add x16, x16, #0x40\n"
+ "b 22f\n"
+ "3:" // Height 1: no bias
+ "tbz %x[flags], #0, 21f\n"
+ "cmp x8, #0x20\n"
+ "bge 20f\n"
+ "tbz x8, #4, 11f\n"
+ "ld1 { v8.8h }, [x15], #0x10\n"
+ "ld1 { v9.8h }, [x15], #0x10\n"
+ "tbz x8, #3, 7f\n"
+ "ld1 { v10.8h }, [x15], #0x10\n"
+ "tbz x8, #2, 5f\n"
+ "ldr d11, [x15], #0x8\n"
+ "tbz x8, #1, 4f\n"
+ "mov x19, #0x3c\n"
+ "ld1 { v11.s }[2], [x15], #0x4\n"
+ "tbz x8, #0, 19f\n"
+ "ld1 { v11.h }[6], [x15]\n"
+ "b 19f\n"
+ "4:" // Height 1: Partial accumulate: partial_1_28
+ "mov x19, #0x38\n"
+ "tbz x8, #0, 19f\n"
+ "ld1 { v11.h }[4], [x15]\n"
+ "b 19f\n"
+ "5:" // Height 1: Partial accumulate: partial_2_24
+ "tbz x8, #1, 6f\n"
+ "ldr s11, [x15], #0x4\n"
+ "mov x19, #0x34\n"
+ "tbz x8, #0, 19f\n"
+ "ld1 { v11.h }[2], [x15]\n"
+ "b 19f\n"
+ "6:" // Height 1: Partial accumulate: partial_1_24
+ "mov x19, #0x30\n"
+ "tbz x8, #0, 19f\n"
+ "ldr h11, [x15, #0x0]\n"
+ "b 19f\n"
+ "7:" // Height 1: Partial accumulate: partial_4_16
+ "tbz x8, #2, 9f\n"
+ "ldr d10, [x15], #0x8\n"
+ "tbz x8, #1, 8f\n"
+ "mov x19, #0x2c\n"
+ "ld1 { v10.s }[2], [x15], #0x4\n"
+ "tbz x8, #0, 19f\n"
+ "ld1 { v10.h }[6], [x15]\n"
+ "b 19f\n"
+ "8:" // Height 1: Partial accumulate: partial_1_20
+ "mov x19, #0x28\n"
+ "tbz x8, #0, 19f\n"
+ "ld1 { v10.h }[4], [x15]\n"
+ "b 19f\n"
+ "9:" // Height 1: Partial accumulate: partial_2_16
+ "tbz x8, #1, 10f\n"
+ "ldr s10, [x15], #0x4\n"
+ "mov x19, #0x24\n"
+ "tbz x8, #0, 19f\n"
+ "ld1 { v10.h }[2], [x15]\n"
+ "b 19f\n"
+ "10:" // Height 1: Partial accumulate: partial_1_16
+ "mov x19, #0x20\n"
+ "tbz x8, #0, 19f\n"
+ "ldr h10, [x15, #0x0]\n"
+ "b 19f\n"
+ "11:" // Height 1: Partial accumulate: partial_8_0
+ "tbz x8, #3, 15f\n"
+ "ld1 { v8.8h }, [x15], #0x10\n"
+ "tbz x8, #2, 13f\n"
+ "ldr d9, [x15], #0x8\n"
+ "tbz x8, #1, 12f\n"
+ "mov x19, #0x1c\n"
+ "ld1 { v9.s }[2], [x15], #0x4\n"
+ "tbz x8, #0, 19f\n"
+ "ld1 { v9.h }[6], [x15]\n"
+ "b 19f\n"
+ "12:" // Height 1: Partial accumulate: partial_1_12
+ "mov x19, #0x18\n"
+ "tbz x8, #0, 19f\n"
+ "ld1 { v9.h }[4], [x15]\n"
+ "b 19f\n"
+ "13:" // Height 1: Partial accumulate: partial_2_8
+ "tbz x8, #1, 14f\n"
+ "ldr s9, [x15], #0x4\n"
+ "mov x19, #0x14\n"
+ "tbz x8, #0, 19f\n"
+ "ld1 { v9.h }[2], [x15]\n"
+ "b 19f\n"
+ "14:" // Height 1: Partial accumulate: partial_1_8
+ "mov x19, #0x10\n"
+ "tbz x8, #0, 19f\n"
+ "ldr h9, [x15, #0x0]\n"
+ "b 19f\n"
+ "15:" // Height 1: Partial accumulate: partial_4_0
+ "tbz x8, #2, 17f\n"
+ "ldr d8, [x15], #0x8\n"
+ "tbz x8, #1, 16f\n"
+ "mov x19, #0xc\n"
+ "ld1 { v8.s }[2], [x15], #0x4\n"
+ "tbz x8, #0, 19f\n"
+ "ld1 { v8.h }[6], [x15]\n"
+ "b 19f\n"
+ "16:" // Height 1: Partial accumulate: partial_1_4
+ "mov x19, #0x8\n"
+ "tbz x8, #0, 19f\n"
+ "ld1 { v8.h }[4], [x15]\n"
+ "b 19f\n"
+ "17:" // Height 1: Partial accumulate: partial_2_0
+ "tbz x8, #1, 18f\n"
+ "ldr s8, [x15], #0x4\n"
+ "mov x19, #0x4\n"
+ "tbz x8, #0, 19f\n"
+ "ld1 { v8.h }[2], [x15]\n"
+ "b 19f\n"
+ "18:" // Height 1: Partial accumulate: partial_1_0
+ "ldr h8, [x15, #0x0]\n"
+ "mov x19, #0x0\n"
+ "19:" // Height 1: Partial accumulate: Done
+ "sub x15, x15, x19\n"
+ "b 22f\n"
+ "20:" // Height 1: full accumulate
+ "ldr q8, [x15, #0x0]\n"
+ "ldr q9, [x15, #0x10]\n"
+ "ldr q10, [x15, #0x20]\n"
+ "ldr q11, [x15, #0x30]\n"
+ "b 22f\n"
+ "21:" // Height 1: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "22:" // Height 1: setup done
+ "mov x14, #0x0\n"
+ "23:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w13, [x20, x14, LSL #0x2]\n"
+ "tbz %x[flags], #3, 24f\n"
+ "ldr x20, [%x[input_ptr], x14, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x12, [x20, #0x0]\n"
+ "cbnz x14, 25f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x12, x12, x19, LSL #1\n"
+ "b 25f\n"
+ "24:" // Height 1: setup direct input
+ "mov x12, %x[input_ptr]\n"
+ "25:" // Height 1: input setup done
+ "cmp x13, #0x8\n"
+ "blt 28f\n"
+ "ldr q0, [x12, #0x0]\n"
+ "ldr q6, [x17, #0x0]\n"
+ "cmp x13, #0x10\n"
+ "blt 27f\n"
+ "26:" // Height 1: Multiply loop: Main loop head
+ "fmla v8.8h, v6.8h, v0.h[0]\n"
+ "ldr d7, [x17, #0x10]\n"
+ "ldr x11, [x17, #0x18]\n"
+ "add x12, x12, #0x10\n"
+ "ldr d6, [x17, #0x20]\n"
+ "sub x13, x13, #0x8\n"
+ "ldr x10, [x17, #0x28]\n"
+ "cmp x13, #0x10\n"
+ "mov v7.d[1], x11\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "ldr x11, [x17, #0x38]\n"
+ "fmla v9.8h, v7.8h, v0.h[0]\n"
+ "mov v6.d[1], x10\n"
+ "ldr d7, [x17, #0x30]\n"
+ "fmla v10.8h, v6.8h, v0.h[0]\n"
+ "ldr d6, [x17, #0x40]\n"
+ "ldr x10, [x17, #0x48]\n"
+ "mov v7.d[1], x11\n"
+ "ldr x11, [x17, #0x58]\n"
+ "ldr x9, [x12, #0x8]\n"
+ "fmla v11.8h, v7.8h, v0.h[0]\n"
+ "mov v6.d[1], x10\n"
+ "ldr d7, [x17, #0x50]\n"
+ "fmla v8.8h, v6.8h, v0.h[1]\n"
+ "ldr d6, [x17, #0x60]\n"
+ "ldr x10, [x17, #0x68]\n"
+ "mov v7.d[1], x11\n"
+ "ldr x11, [x17, #0x78]\n"
+ "fmla v9.8h, v7.8h, v0.h[1]\n"
+ "mov v6.d[1], x10\n"
+ "ldr d7, [x17, #0x70]\n"
+ "fmla v10.8h, v6.8h, v0.h[1]\n"
+ "ldr d6, [x17, #0x80]\n"
+ "ldr x10, [x17, #0x88]\n"
+ "mov v7.d[1], x11\n"
+ "ldr x11, [x17, #0x98]\n"
+ "fmla v11.8h, v7.8h, v0.h[1]\n"
+ "mov v6.d[1], x10\n"
+ "ldr d7, [x17, #0x90]\n"
+ "fmla v8.8h, v6.8h, v0.h[2]\n"
+ "ldr d6, [x17, #0xa0]\n"
+ "ldr x10, [x17, #0xa8]\n"
+ "mov v7.d[1], x11\n"
+ "ldr x11, [x17, #0xb8]\n"
+ "fmla v9.8h, v7.8h, v0.h[2]\n"
+ "mov v6.d[1], x10\n"
+ "ldr d7, [x17, #0xb0]\n"
+ "fmla v10.8h, v6.8h, v0.h[2]\n"
+ "ldr d6, [x17, #0xc0]\n"
+ "ldr x10, [x17, #0xc8]\n"
+ "mov v7.d[1], x11\n"
+ "ldr x11, [x17, #0xd8]\n"
+ "fmla v11.8h, v7.8h, v0.h[2]\n"
+ "mov v6.d[1], x10\n"
+ "ldr d7, [x17, #0xd0]\n"
+ "fmla v8.8h, v6.8h, v0.h[3]\n"
+ "ldr d6, [x17, #0xe0]\n"
+ "ldr x10, [x17, #0xe8]\n"
+ "mov v7.d[1], x11\n"
+ "ldr x11, [x17, #0xf8]\n"
+ "fmla v9.8h, v7.8h, v0.h[3]\n"
+ "mov v6.d[1], x10\n"
+ "ldr d7, [x17, #0xf0]\n"
+ "fmla v10.8h, v6.8h, v0.h[3]\n"
+ "ldr d6, [x17, #0x100]\n"
+ "ldr x10, [x17, #0x108]\n"
+ "mov v7.d[1], x11\n"
+ "ldr x11, [x17, #0x118]\n"
+ "fmla v11.8h, v7.8h, v0.h[3]\n"
+ "mov v6.d[1], x10\n"
+ "ldr d7, [x17, #0x110]\n"
+ "fmla v8.8h, v6.8h, v0.h[4]\n"
+ "ldr d6, [x17, #0x120]\n"
+ "ldr x10, [x17, #0x128]\n"
+ "mov v7.d[1], x11\n"
+ "ldr x11, [x17, #0x138]\n"
+ "fmla v9.8h, v7.8h, v0.h[4]\n"
+ "mov v6.d[1], x10\n"
+ "ldr d7, [x17, #0x130]\n"
+ "fmla v10.8h, v6.8h, v0.h[4]\n"
+ "ldr d6, [x17, #0x140]\n"
+ "ldr x10, [x17, #0x148]\n"
+ "mov v7.d[1], x11\n"
+ "ldr x11, [x17, #0x158]\n"
+ "fmla v11.8h, v7.8h, v0.h[4]\n"
+ "mov v6.d[1], x10\n"
+ "ldr d7, [x17, #0x150]\n"
+ "fmla v8.8h, v6.8h, v0.h[5]\n"
+ "ldr d6, [x17, #0x160]\n"
+ "ldr x10, [x17, #0x168]\n"
+ "mov v7.d[1], x11\n"
+ "ldr x11, [x17, #0x178]\n"
+ "fmla v9.8h, v7.8h, v0.h[5]\n"
+ "mov v6.d[1], x10\n"
+ "ldr d7, [x17, #0x170]\n"
+ "fmla v10.8h, v6.8h, v0.h[5]\n"
+ "ldr d6, [x17, #0x180]\n"
+ "ldr x10, [x17, #0x188]\n"
+ "mov v7.d[1], x11\n"
+ "ldr x11, [x17, #0x198]\n"
+ "fmla v11.8h, v7.8h, v0.h[5]\n"
+ "mov v6.d[1], x10\n"
+ "ldr d7, [x17, #0x190]\n"
+ "fmla v8.8h, v6.8h, v0.h[6]\n"
+ "ldr d6, [x17, #0x1a0]\n"
+ "ldr x10, [x17, #0x1a8]\n"
+ "mov v7.d[1], x11\n"
+ "ldr x11, [x17, #0x1b8]\n"
+ "fmla v9.8h, v7.8h, v0.h[6]\n"
+ "mov v6.d[1], x10\n"
+ "ldr d7, [x17, #0x1b0]\n"
+ "fmla v10.8h, v6.8h, v0.h[6]\n"
+ "ldr d6, [x17, #0x1c0]\n"
+ "ldr x10, [x17, #0x1c8]\n"
+ "mov v7.d[1], x11\n"
+ "ldr x11, [x17, #0x1d8]\n"
+ "fmla v11.8h, v7.8h, v0.h[6]\n"
+ "mov v6.d[1], x10\n"
+ "ldr d7, [x17, #0x1d0]\n"
+ "fmla v8.8h, v6.8h, v0.h[7]\n"
+ "ldr d6, [x17, #0x1e0]\n"
+ "ldr x10, [x17, #0x1e8]\n"
+ "mov v7.d[1], x11\n"
+ "ldr x11, [x17, #0x1f8]\n"
+ "fmla v9.8h, v7.8h, v0.h[7]\n"
+ "mov v6.d[1], x10\n"
+ "ldr d7, [x17, #0x1f0]\n"
+ "add x17, x17, #0x200\n"
+ "fmla v10.8h, v6.8h, v0.h[7]\n"
+ "ldr d6, [x17, #0x0]\n"
+ "ldr x10, [x17, #0x8]\n"
+ "mov v7.d[1], x11\n"
+ "fmla v11.8h, v7.8h, v0.h[7]\n"
+ "mov v6.d[1], x10\n"
+ "ldr d0, [x12, #0x0]\n"
+ "mov v0.d[1], x9\n"
+ "bge 26b\n"
+ "27:" // Height 1: Multiply loop: Single iteration only
+ "fmla v8.8h, v6.8h, v0.h[0]\n"
+ "ldr q7, [x17, #0x10]\n"
+ "ldr q6, [x17, #0x20]\n"
+ "sub x13, x13, #0x8\n"
+ "add x12, x12, #0x10\n"
+ "fmla v9.8h, v7.8h, v0.h[0]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "fmla v10.8h, v6.8h, v0.h[0]\n"
+ "ldr q7, [x17, #0x30]\n"
+ "ldr q6, [x17, #0x40]\n"
+ "fmla v11.8h, v7.8h, v0.h[0]\n"
+ "ldr q7, [x17, #0x50]\n"
+ "fmla v8.8h, v6.8h, v0.h[1]\n"
+ "ldr q6, [x17, #0x60]\n"
+ "fmla v9.8h, v7.8h, v0.h[1]\n"
+ "ldr q7, [x17, #0x70]\n"
+ "fmla v10.8h, v6.8h, v0.h[1]\n"
+ "ldr q6, [x17, #0x80]\n"
+ "fmla v11.8h, v7.8h, v0.h[1]\n"
+ "ldr q7, [x17, #0x90]\n"
+ "fmla v8.8h, v6.8h, v0.h[2]\n"
+ "ldr q6, [x17, #0xa0]\n"
+ "fmla v9.8h, v7.8h, v0.h[2]\n"
+ "ldr q7, [x17, #0xb0]\n"
+ "fmla v10.8h, v6.8h, v0.h[2]\n"
+ "ldr q6, [x17, #0xc0]\n"
+ "fmla v11.8h, v7.8h, v0.h[2]\n"
+ "ldr q7, [x17, #0xd0]\n"
+ "fmla v8.8h, v6.8h, v0.h[3]\n"
+ "ldr q6, [x17, #0xe0]\n"
+ "fmla v9.8h, v7.8h, v0.h[3]\n"
+ "ldr q7, [x17, #0xf0]\n"
+ "fmla v10.8h, v6.8h, v0.h[3]\n"
+ "ldr q6, [x17, #0x100]\n"
+ "fmla v11.8h, v7.8h, v0.h[3]\n"
+ "ldr q7, [x17, #0x110]\n"
+ "fmla v8.8h, v6.8h, v0.h[4]\n"
+ "ldr q6, [x17, #0x120]\n"
+ "fmla v9.8h, v7.8h, v0.h[4]\n"
+ "ldr q7, [x17, #0x130]\n"
+ "fmla v10.8h, v6.8h, v0.h[4]\n"
+ "ldr q6, [x17, #0x140]\n"
+ "fmla v11.8h, v7.8h, v0.h[4]\n"
+ "ldr q7, [x17, #0x150]\n"
+ "fmla v8.8h, v6.8h, v0.h[5]\n"
+ "ldr q6, [x17, #0x160]\n"
+ "fmla v9.8h, v7.8h, v0.h[5]\n"
+ "ldr q7, [x17, #0x170]\n"
+ "fmla v10.8h, v6.8h, v0.h[5]\n"
+ "ldr q6, [x17, #0x180]\n"
+ "fmla v11.8h, v7.8h, v0.h[5]\n"
+ "ldr q7, [x17, #0x190]\n"
+ "fmla v8.8h, v6.8h, v0.h[6]\n"
+ "ldr q6, [x17, #0x1a0]\n"
+ "fmla v9.8h, v7.8h, v0.h[6]\n"
+ "ldr q7, [x17, #0x1b0]\n"
+ "fmla v10.8h, v6.8h, v0.h[6]\n"
+ "ldr q6, [x17, #0x1c0]\n"
+ "fmla v11.8h, v7.8h, v0.h[6]\n"
+ "ldr q7, [x17, #0x1d0]\n"
+ "fmla v8.8h, v6.8h, v0.h[7]\n"
+ "ldr q6, [x17, #0x1e0]\n"
+ "fmla v9.8h, v7.8h, v0.h[7]\n"
+ "ldr q7, [x17, #0x1f0]\n"
+ "fmla v10.8h, v6.8h, v0.h[7]\n"
+ "add x17, x17, #0x200\n"
+ "fmla v11.8h, v7.8h, v0.h[7]\n"
+ "28:" // Height 1: Multiply loop: Main loop skip
+ "cbz x13, 30f\n"
+ "29:" // Height 1: Multiply loop: Odd block loop
+ "ldr h0, [x12], #0x2\n"
+ "sub x13, x13, #0x1\n"
+ "ldr q6, [x17, #0x0]\n"
+ "ldr q7, [x17, #0x10]\n"
+ "fmla v8.8h, v6.8h, v0.h[0]\n"
+ "ldr q6, [x17, #0x20]\n"
+ "fmla v9.8h, v7.8h, v0.h[0]\n"
+ "ldr q7, [x17, #0x30]\n"
+ "add x17, x17, #0x40\n"
+ "fmla v10.8h, v6.8h, v0.h[0]\n"
+ "fmla v11.8h, v7.8h, v0.h[0]\n"
+ "cbnz x13, 29b\n"
+ "30:" // Height 1: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x14, x14, #0x1\n"
+ "cmp x14, x19\n"
+ "bne 23b\n"
+ "prfm pstl1keep, [x15, #0x0]\n"
+ "tbz %x[flags], #1, 31f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v1.8h }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v0.8h }, [x19]\n"
+ "fmin v8.8h, v8.8h, v0.8h\n"
+ "fmin v9.8h, v9.8h, v0.8h\n"
+ "fmin v10.8h, v10.8h, v0.8h\n"
+ "fmin v11.8h, v11.8h, v0.8h\n"
+ "fmax v8.8h, v8.8h, v1.8h\n"
+ "fmax v9.8h, v9.8h, v1.8h\n"
+ "fmax v10.8h, v10.8h, v1.8h\n"
+ "fmax v11.8h, v11.8h, v1.8h\n"
+ "31:" // Height 1: No activation
+ "cmp x8, #0x20\n"
+ "bge 48f\n"
+ "tbz x8, #4, 39f\n"
+ "st1 { v8.8h }, [x15], #0x10\n"
+ "st1 { v9.8h }, [x15], #0x10\n"
+ "tbz x8, #3, 35f\n"
+ "st1 { v10.8h }, [x15], #0x10\n"
+ "tbz x8, #2, 33f\n"
+ "str d11, [x15], #0x8\n"
+ "tbz x8, #1, 32f\n"
+ "st1 { v11.s }[2], [x15], #0x4\n"
+ "tbz x8, #0, 47f\n"
+ "st1 { v11.h }[6], [x15]\n"
+ "b 47f\n"
+ "32:" // Height 1: Partial direct writeback: partial_1_28
+ "tbz x8, #0, 47f\n"
+ "st1 { v11.h }[4], [x15]\n"
+ "b 47f\n"
+ "33:" // Height 1: Partial direct writeback: partial_2_24
+ "tbz x8, #1, 34f\n"
+ "str s11, [x15], #0x4\n"
+ "tbz x8, #0, 47f\n"
+ "st1 { v11.h }[2], [x15]\n"
+ "b 47f\n"
+ "34:" // Height 1: Partial direct writeback: partial_1_24
+ "tbz x8, #0, 47f\n"
+ "str h11, [x15, #0x0]\n"
+ "b 47f\n"
+ "35:" // Height 1: Partial direct writeback: partial_4_16
+ "tbz x8, #2, 37f\n"
+ "str d10, [x15], #0x8\n"
+ "tbz x8, #1, 36f\n"
+ "st1 { v10.s }[2], [x15], #0x4\n"
+ "tbz x8, #0, 47f\n"
+ "st1 { v10.h }[6], [x15]\n"
+ "b 47f\n"
+ "36:" // Height 1: Partial direct writeback: partial_1_20
+ "tbz x8, #0, 47f\n"
+ "st1 { v10.h }[4], [x15]\n"
+ "b 47f\n"
+ "37:" // Height 1: Partial direct writeback: partial_2_16
+ "tbz x8, #1, 38f\n"
+ "str s10, [x15], #0x4\n"
+ "tbz x8, #0, 47f\n"
+ "st1 { v10.h }[2], [x15]\n"
+ "b 47f\n"
+ "38:" // Height 1: Partial direct writeback: partial_1_16
+ "tbz x8, #0, 47f\n"
+ "str h10, [x15, #0x0]\n"
+ "b 47f\n"
+ "39:" // Height 1: Partial direct writeback: partial_8_0
+ "tbz x8, #3, 43f\n"
+ "st1 { v8.8h }, [x15], #0x10\n"
+ "tbz x8, #2, 41f\n"
+ "str d9, [x15], #0x8\n"
+ "tbz x8, #1, 40f\n"
+ "st1 { v9.s }[2], [x15], #0x4\n"
+ "tbz x8, #0, 47f\n"
+ "st1 { v9.h }[6], [x15]\n"
+ "b 47f\n"
+ "40:" // Height 1: Partial direct writeback: partial_1_12
+ "tbz x8, #0, 47f\n"
+ "st1 { v9.h }[4], [x15]\n"
+ "b 47f\n"
+ "41:" // Height 1: Partial direct writeback: partial_2_8
+ "tbz x8, #1, 42f\n"
+ "str s9, [x15], #0x4\n"
+ "tbz x8, #0, 47f\n"
+ "st1 { v9.h }[2], [x15]\n"
+ "b 47f\n"
+ "42:" // Height 1: Partial direct writeback: partial_1_8
+ "tbz x8, #0, 47f\n"
+ "str h9, [x15, #0x0]\n"
+ "b 47f\n"
+ "43:" // Height 1: Partial direct writeback: partial_4_0
+ "tbz x8, #2, 45f\n"
+ "str d8, [x15], #0x8\n"
+ "tbz x8, #1, 44f\n"
+ "st1 { v8.s }[2], [x15], #0x4\n"
+ "tbz x8, #0, 47f\n"
+ "st1 { v8.h }[6], [x15]\n"
+ "b 47f\n"
+ "44:" // Height 1: Partial direct writeback: partial_1_4
+ "tbz x8, #0, 47f\n"
+ "st1 { v8.h }[4], [x15]\n"
+ "b 47f\n"
+ "45:" // Height 1: Partial direct writeback: partial_2_0
+ "tbz x8, #1, 46f\n"
+ "str s8, [x15], #0x4\n"
+ "tbz x8, #0, 47f\n"
+ "st1 { v8.h }[2], [x15]\n"
+ "b 47f\n"
+ "46:" // Height 1: Partial direct writeback: partial_1_0
+ "str h8, [x15, #0x0]\n"
+ "47:" // Height 1: Partial direct writeback: Done
+ "b 49f\n"
+ "48:" // Height 1: Full writeback
+ "str q8, [x15, #0x0]\n"
+ "str q9, [x15, #0x10]\n"
+ "str q10, [x15, #0x20]\n"
+ "str q11, [x15, #0x30]\n"
+ "add x15, x15, #0x40\n"
+ "49:" // Height 1: Writeback done
+ "subs x8, x8, #0x20\n"
+ "bgt 2b\n"
+ "b 296f\n"
+ "50:" // Height 2
+ "ldr x8, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x16, %x[bias]\n"
+ "ldr x17, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x15, %x[output_ptr]\n"
+ "51:" // Height 2: Column loop
+ "cbz x16, 52f\n"
+ "ldr q8, [x16, #0x0]\n"
+ "ldr q9, [x16, #0x10]\n"
+ "ldr q10, [x16, #0x20]\n"
+ "mov v12.16b, v8.16b\n"
+ "ldr q11, [x16, #0x30]\n"
+ "mov v13.16b, v9.16b\n"
+ "add x16, x16, #0x40\n"
+ "mov v14.16b, v10.16b\n"
+ "mov v15.16b, v11.16b\n"
+ "b 71f\n"
+ "52:" // Height 2: no bias
+ "tbz %x[flags], #0, 70f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x8, #0x20\n"
+ "add x25, x15, x19, LSL #1\n"
+ "bge 69f\n"
+ "tbz x8, #4, 60f\n"
+ "ld1 { v8.8h }, [x15], #0x10\n"
+ "ld1 { v12.8h }, [x25], #0x10\n"
+ "ld1 { v9.8h }, [x15], #0x10\n"
+ "ld1 { v13.8h }, [x25], #0x10\n"
+ "tbz x8, #3, 56f\n"
+ "ld1 { v10.8h }, [x15], #0x10\n"
+ "ld1 { v14.8h }, [x25], #0x10\n"
+ "tbz x8, #2, 54f\n"
+ "ldr d11, [x15], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
+ "tbz x8, #1, 53f\n"
+ "mov x19, #0x3c\n"
+ "ld1 { v11.s }[2], [x15], #0x4\n"
+ "ld1 { v15.s }[2], [x25], #0x4\n"
+ "tbz x8, #0, 68f\n"
+ "ld1 { v11.h }[6], [x15]\n"
+ "ld1 { v15.h }[6], [x25]\n"
+ "b 68f\n"
+ "53:" // Height 2: Partial accumulate: partial_1_28
+ "mov x19, #0x38\n"
+ "tbz x8, #0, 68f\n"
+ "ld1 { v11.h }[4], [x15]\n"
+ "ld1 { v15.h }[4], [x25]\n"
+ "b 68f\n"
+ "54:" // Height 2: Partial accumulate: partial_2_24
+ "tbz x8, #1, 55f\n"
+ "ldr s11, [x15], #0x4\n"
+ "ldr s15, [x25], #0x4\n"
+ "mov x19, #0x34\n"
+ "tbz x8, #0, 68f\n"
+ "ld1 { v11.h }[2], [x15]\n"
+ "ld1 { v15.h }[2], [x25]\n"
+ "b 68f\n"
+ "55:" // Height 2: Partial accumulate: partial_1_24
+ "mov x19, #0x30\n"
+ "tbz x8, #0, 68f\n"
+ "ldr h11, [x15, #0x0]\n"
+ "ldr h15, [x25, #0x0]\n"
+ "b 68f\n"
+ "56:" // Height 2: Partial accumulate: partial_4_16
+ "tbz x8, #2, 58f\n"
+ "ldr d10, [x15], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
+ "tbz x8, #1, 57f\n"
+ "mov x19, #0x2c\n"
+ "ld1 { v10.s }[2], [x15], #0x4\n"
+ "ld1 { v14.s }[2], [x25], #0x4\n"
+ "tbz x8, #0, 68f\n"
+ "ld1 { v10.h }[6], [x15]\n"
+ "ld1 { v14.h }[6], [x25]\n"
+ "b 68f\n"
+ "57:" // Height 2: Partial accumulate: partial_1_20
+ "mov x19, #0x28\n"
+ "tbz x8, #0, 68f\n"
+ "ld1 { v10.h }[4], [x15]\n"
+ "ld1 { v14.h }[4], [x25]\n"
+ "b 68f\n"
+ "58:" // Height 2: Partial accumulate: partial_2_16
+ "tbz x8, #1, 59f\n"
+ "ldr s10, [x15], #0x4\n"
+ "ldr s14, [x25], #0x4\n"
+ "mov x19, #0x24\n"
+ "tbz x8, #0, 68f\n"
+ "ld1 { v10.h }[2], [x15]\n"
+ "ld1 { v14.h }[2], [x25]\n"
+ "b 68f\n"
+ "59:" // Height 2: Partial accumulate: partial_1_16
+ "mov x19, #0x20\n"
+ "tbz x8, #0, 68f\n"
+ "ldr h10, [x15, #0x0]\n"
+ "ldr h14, [x25, #0x0]\n"
+ "b 68f\n"
+ "60:" // Height 2: Partial accumulate: partial_8_0
+ "tbz x8, #3, 64f\n"
+ "ld1 { v8.8h }, [x15], #0x10\n"
+ "ld1 { v12.8h }, [x25], #0x10\n"
+ "tbz x8, #2, 62f\n"
+ "ldr d9, [x15], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
+ "tbz x8, #1, 61f\n"
+ "mov x19, #0x1c\n"
+ "ld1 { v9.s }[2], [x15], #0x4\n"
+ "ld1 { v13.s }[2], [x25], #0x4\n"
+ "tbz x8, #0, 68f\n"
+ "ld1 { v9.h }[6], [x15]\n"
+ "ld1 { v13.h }[6], [x25]\n"
+ "b 68f\n"
+ "61:" // Height 2: Partial accumulate: partial_1_12
+ "mov x19, #0x18\n"
+ "tbz x8, #0, 68f\n"
+ "ld1 { v9.h }[4], [x15]\n"
+ "ld1 { v13.h }[4], [x25]\n"
+ "b 68f\n"
+ "62:" // Height 2: Partial accumulate: partial_2_8
+ "tbz x8, #1, 63f\n"
+ "ldr s9, [x15], #0x4\n"
+ "ldr s13, [x25], #0x4\n"
+ "mov x19, #0x14\n"
+ "tbz x8, #0, 68f\n"
+ "ld1 { v9.h }[2], [x15]\n"
+ "ld1 { v13.h }[2], [x25]\n"
+ "b 68f\n"
+ "63:" // Height 2: Partial accumulate: partial_1_8
+ "mov x19, #0x10\n"
+ "tbz x8, #0, 68f\n"
+ "ldr h9, [x15, #0x0]\n"
+ "ldr h13, [x25, #0x0]\n"
+ "b 68f\n"
+ "64:" // Height 2: Partial accumulate: partial_4_0
+ "tbz x8, #2, 66f\n"
+ "ldr d8, [x15], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
+ "tbz x8, #1, 65f\n"
+ "mov x19, #0xc\n"
+ "ld1 { v8.s }[2], [x15], #0x4\n"
+ "ld1 { v12.s }[2], [x25], #0x4\n"
+ "tbz x8, #0, 68f\n"
+ "ld1 { v8.h }[6], [x15]\n"
+ "ld1 { v12.h }[6], [x25]\n"
+ "b 68f\n"
+ "65:" // Height 2: Partial accumulate: partial_1_4
+ "mov x19, #0x8\n"
+ "tbz x8, #0, 68f\n"
+ "ld1 { v8.h }[4], [x15]\n"
+ "ld1 { v12.h }[4], [x25]\n"
+ "b 68f\n"
+ "66:" // Height 2: Partial accumulate: partial_2_0
+ "tbz x8, #1, 67f\n"
+ "ldr s8, [x15], #0x4\n"
+ "ldr s12, [x25], #0x4\n"
+ "mov x19, #0x4\n"
+ "tbz x8, #0, 68f\n"
+ "ld1 { v8.h }[2], [x15]\n"
+ "ld1 { v12.h }[2], [x25]\n"
+ "b 68f\n"
+ "67:" // Height 2: Partial accumulate: partial_1_0
+ "ldr h8, [x15, #0x0]\n"
+ "mov x19, #0x0\n"
+ "ldr h12, [x25, #0x0]\n"
+ "68:" // Height 2: Partial accumulate: Done
+ "sub x15, x15, x19\n"
+ "b 71f\n"
+ "69:" // Height 2: full accumulate
+ "ldr q8, [x15, #0x0]\n"
+ "ldr q9, [x15, #0x10]\n"
+ "ldr q10, [x15, #0x20]\n"
+ "ldr q11, [x15, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
+ "b 71f\n"
+ "70:" // Height 2: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "71:" // Height 2: setup done
+ "mov x14, #0x0\n"
+ "72:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w13, [x20, x14, LSL #0x2]\n"
+ "tbz %x[flags], #3, 73f\n"
+ "ldr x20, [%x[input_ptr], x14, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x12, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "cbnz x14, 74f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x12, x12, x19, LSL #1\n"
+ "add x28, x28, x19, LSL #1\n"
+ "b 74f\n"
+ "73:" // Height 2: setup direct input
+ "mov x12, %x[input_ptr]\n"
+ "add x28, x12, x19, LSL #1\n"
+ "74:" // Height 2: input setup done
+ "cmp x13, #0x8\n"
+ "blt 77f\n"
+ "ldr q0, [x12, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "cmp x13, #0x10\n"
+ "ldr q6, [x17, #0x0]\n"
+ "blt 76f\n"
+ "75:" // Height 2: Multiply loop: Main loop head
+ "fmla v8.8h, v6.8h, v0.h[0]\n"
+ "ldr d7, [x17, #0x10]\n"
+ "fmla v12.8h, v6.8h, v1.h[0]\n"
+ "ldr x11, [x17, #0x18]\n"
+ "ldr d6, [x17, #0x20]\n"
+ "add x12, x12, #0x10\n"
+ "ldr x10, [x17, #0x28]\n"
+ "add x28, x28, #0x10\n"
+ "mov v7.d[1], x11\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "sub x13, x13, #0x8\n"
+ "fmla v9.8h, v7.8h, v0.h[0]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v13.8h, v7.8h, v1.h[0]\n"
+ "ldr d7, [x17, #0x30]\n"
+ "fmla v10.8h, v6.8h, v0.h[0]\n"
+ "ldr x11, [x17, #0x38]\n"
+ "fmla v14.8h, v6.8h, v1.h[0]\n"
+ "ldr d6, [x17, #0x40]\n"
+ "ldr x10, [x17, #0x48]\n"
+ "cmp x13, #0x10\n"
+ "mov v7.d[1], x11\n"
+ "ldr x11, [x17, #0x58]\n"
+ "ldr x9, [x12, #0x8]\n"
+ "fmla v11.8h, v7.8h, v0.h[0]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v15.8h, v7.8h, v1.h[0]\n"
+ "ldr d7, [x17, #0x50]\n"
+ "fmla v8.8h, v6.8h, v0.h[1]\n"
+ "ldr x10, [x17, #0x68]\n"
+ "fmla v12.8h, v6.8h, v1.h[1]\n"
+ "ldr d6, [x17, #0x60]\n"
+ "mov v7.d[1], x11\n"
+ "ldr x11, [x17, #0x78]\n"
+ "ldr x27, [x28, #0x8]\n"
+ "fmla v9.8h, v7.8h, v0.h[1]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v13.8h, v7.8h, v1.h[1]\n"
+ "ldr d7, [x17, #0x70]\n"
+ "fmla v10.8h, v6.8h, v0.h[1]\n"
+ "ldr x10, [x17, #0x88]\n"
+ "fmla v14.8h, v6.8h, v1.h[1]\n"
+ "ldr d6, [x17, #0x80]\n"
+ "mov v7.d[1], x11\n"
+ "ldr x11, [x17, #0x98]\n"
+ "fmla v11.8h, v7.8h, v0.h[1]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v15.8h, v7.8h, v1.h[1]\n"
+ "ldr d7, [x17, #0x90]\n"
+ "fmla v8.8h, v6.8h, v0.h[2]\n"
+ "ldr x10, [x17, #0xa8]\n"
+ "fmla v12.8h, v6.8h, v1.h[2]\n"
+ "ldr d6, [x17, #0xa0]\n"
+ "mov v7.d[1], x11\n"
+ "ldr x11, [x17, #0xb8]\n"
+ "fmla v9.8h, v7.8h, v0.h[2]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v13.8h, v7.8h, v1.h[2]\n"
+ "ldr d7, [x17, #0xb0]\n"
+ "fmla v10.8h, v6.8h, v0.h[2]\n"
+ "ldr x10, [x17, #0xc8]\n"
+ "fmla v14.8h, v6.8h, v1.h[2]\n"
+ "ldr d6, [x17, #0xc0]\n"
+ "mov v7.d[1], x11\n"
+ "ldr x11, [x17, #0xd8]\n"
+ "fmla v11.8h, v7.8h, v0.h[2]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v15.8h, v7.8h, v1.h[2]\n"
+ "ldr d7, [x17, #0xd0]\n"
+ "fmla v8.8h, v6.8h, v0.h[3]\n"
+ "ldr x10, [x17, #0xe8]\n"
+ "fmla v12.8h, v6.8h, v1.h[3]\n"
+ "ldr d6, [x17, #0xe0]\n"
+ "mov v7.d[1], x11\n"
+ "ldr x11, [x17, #0xf8]\n"
+ "fmla v9.8h, v7.8h, v0.h[3]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v13.8h, v7.8h, v1.h[3]\n"
+ "ldr d7, [x17, #0xf0]\n"
+ "fmla v10.8h, v6.8h, v0.h[3]\n"
+ "ldr x10, [x17, #0x108]\n"
+ "fmla v14.8h, v6.8h, v1.h[3]\n"
+ "ldr d6, [x17, #0x100]\n"
+ "mov v7.d[1], x11\n"
+ "ldr x11, [x17, #0x118]\n"
+ "fmla v11.8h, v7.8h, v0.h[3]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v15.8h, v7.8h, v1.h[3]\n"
+ "ldr d7, [x17, #0x110]\n"
+ "fmla v8.8h, v6.8h, v0.h[4]\n"
+ "ldr x10, [x17, #0x128]\n"
+ "fmla v12.8h, v6.8h, v1.h[4]\n"
+ "ldr d6, [x17, #0x120]\n"
+ "mov v7.d[1], x11\n"
+ "ldr x11, [x17, #0x138]\n"
+ "fmla v9.8h, v7.8h, v0.h[4]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v13.8h, v7.8h, v1.h[4]\n"
+ "ldr d7, [x17, #0x130]\n"
+ "fmla v10.8h, v6.8h, v0.h[4]\n"
+ "ldr x10, [x17, #0x148]\n"
+ "fmla v14.8h, v6.8h, v1.h[4]\n"
+ "ldr d6, [x17, #0x140]\n"
+ "mov v7.d[1], x11\n"
+ "ldr x11, [x17, #0x158]\n"
+ "fmla v11.8h, v7.8h, v0.h[4]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v15.8h, v7.8h, v1.h[4]\n"
+ "ldr d7, [x17, #0x150]\n"
+ "fmla v8.8h, v6.8h, v0.h[5]\n"
+ "ldr x10, [x17, #0x168]\n"
+ "fmla v12.8h, v6.8h, v1.h[5]\n"
+ "ldr d6, [x17, #0x160]\n"
+ "mov v7.d[1], x11\n"
+ "ldr x11, [x17, #0x178]\n"
+ "fmla v9.8h, v7.8h, v0.h[5]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v13.8h, v7.8h, v1.h[5]\n"
+ "ldr d7, [x17, #0x170]\n"
+ "fmla v10.8h, v6.8h, v0.h[5]\n"
+ "ldr x10, [x17, #0x188]\n"
+ "fmla v14.8h, v6.8h, v1.h[5]\n"
+ "ldr d6, [x17, #0x180]\n"
+ "mov v7.d[1], x11\n"
+ "ldr x11, [x17, #0x198]\n"
+ "fmla v11.8h, v7.8h, v0.h[5]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v15.8h, v7.8h, v1.h[5]\n"
+ "ldr d7, [x17, #0x190]\n"
+ "fmla v8.8h, v6.8h, v0.h[6]\n"
+ "ldr x10, [x17, #0x1a8]\n"
+ "fmla v12.8h, v6.8h, v1.h[6]\n"
+ "ldr d6, [x17, #0x1a0]\n"
+ "mov v7.d[1], x11\n"
+ "ldr x11, [x17, #0x1b8]\n"
+ "fmla v9.8h, v7.8h, v0.h[6]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v13.8h, v7.8h, v1.h[6]\n"
+ "ldr d7, [x17, #0x1b0]\n"
+ "fmla v10.8h, v6.8h, v0.h[6]\n"
+ "ldr x10, [x17, #0x1c8]\n"
+ "fmla v14.8h, v6.8h, v1.h[6]\n"
+ "ldr d6, [x17, #0x1c0]\n"
+ "mov v7.d[1], x11\n"
+ "ldr x11, [x17, #0x1d8]\n"
+ "fmla v11.8h, v7.8h, v0.h[6]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v15.8h, v7.8h, v1.h[6]\n"
+ "ldr d7, [x17, #0x1d0]\n"
+ "fmla v8.8h, v6.8h, v0.h[7]\n"
+ "ldr x10, [x17, #0x1e8]\n"
+ "fmla v12.8h, v6.8h, v1.h[7]\n"
+ "ldr d6, [x17, #0x1e0]\n"
+ "mov v7.d[1], x11\n"
+ "ldr x11, [x17, #0x1f8]\n"
+ "fmla v9.8h, v7.8h, v0.h[7]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v13.8h, v7.8h, v1.h[7]\n"
+ "ldr d7, [x17, #0x1f0]\n"
+ "fmla v10.8h, v6.8h, v0.h[7]\n"
+ "add x17, x17, #0x200\n"
+ "fmla v14.8h, v6.8h, v1.h[7]\n"
+ "ldr d6, [x17, #0x0]\n"
+ "mov v7.d[1], x11\n"
+ "ldr x10, [x17, #0x8]\n"
+ "fmla v11.8h, v7.8h, v0.h[7]\n"
+ "ldr d0, [x12, #0x0]\n"
+ "fmla v15.8h, v7.8h, v1.h[7]\n"
+ "mov v6.d[1], x10\n"
+ "ldr d1, [x28, #0x0]\n"
+ "mov v0.d[1], x9\n"
+ "mov v1.d[1], x27\n"
+ "bge 75b\n"
+ "76:" // Height 2: Multiply loop: Single iteration only
+ "fmla v8.8h, v6.8h, v0.h[0]\n"
+ "ldr q7, [x17, #0x10]\n"
+ "fmla v12.8h, v6.8h, v1.h[0]\n"
+ "ldr q6, [x17, #0x20]\n"
+ "sub x13, x13, #0x8\n"
+ "add x12, x12, #0x10\n"
+ "fmla v9.8h, v7.8h, v0.h[0]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "fmla v13.8h, v7.8h, v1.h[0]\n"
+ "ldr q7, [x17, #0x30]\n"
+ "fmla v10.8h, v6.8h, v0.h[0]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v14.8h, v6.8h, v1.h[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "fmla v11.8h, v7.8h, v0.h[0]\n"
+ "ldr q6, [x17, #0x40]\n"
+ "fmla v15.8h, v7.8h, v1.h[0]\n"
+ "ldr q7, [x17, #0x50]\n"
+ "fmla v8.8h, v6.8h, v0.h[1]\n"
+ "fmla v12.8h, v6.8h, v1.h[1]\n"
+ "ldr q6, [x17, #0x60]\n"
+ "fmla v9.8h, v7.8h, v0.h[1]\n"
+ "fmla v13.8h, v7.8h, v1.h[1]\n"
+ "ldr q7, [x17, #0x70]\n"
+ "fmla v10.8h, v6.8h, v0.h[1]\n"
+ "fmla v14.8h, v6.8h, v1.h[1]\n"
+ "ldr q6, [x17, #0x80]\n"
+ "fmla v11.8h, v7.8h, v0.h[1]\n"
+ "fmla v15.8h, v7.8h, v1.h[1]\n"
+ "ldr q7, [x17, #0x90]\n"
+ "fmla v8.8h, v6.8h, v0.h[2]\n"
+ "fmla v12.8h, v6.8h, v1.h[2]\n"
+ "ldr q6, [x17, #0xa0]\n"
+ "fmla v9.8h, v7.8h, v0.h[2]\n"
+ "fmla v13.8h, v7.8h, v1.h[2]\n"
+ "ldr q7, [x17, #0xb0]\n"
+ "fmla v10.8h, v6.8h, v0.h[2]\n"
+ "fmla v14.8h, v6.8h, v1.h[2]\n"
+ "ldr q6, [x17, #0xc0]\n"
+ "fmla v11.8h, v7.8h, v0.h[2]\n"
+ "fmla v15.8h, v7.8h, v1.h[2]\n"
+ "ldr q7, [x17, #0xd0]\n"
+ "fmla v8.8h, v6.8h, v0.h[3]\n"
+ "fmla v12.8h, v6.8h, v1.h[3]\n"
+ "ldr q6, [x17, #0xe0]\n"
+ "fmla v9.8h, v7.8h, v0.h[3]\n"
+ "fmla v13.8h, v7.8h, v1.h[3]\n"
+ "ldr q7, [x17, #0xf0]\n"
+ "fmla v10.8h, v6.8h, v0.h[3]\n"
+ "fmla v14.8h, v6.8h, v1.h[3]\n"
+ "ldr q6, [x17, #0x100]\n"
+ "fmla v11.8h, v7.8h, v0.h[3]\n"
+ "fmla v15.8h, v7.8h, v1.h[3]\n"
+ "ldr q7, [x17, #0x110]\n"
+ "fmla v8.8h, v6.8h, v0.h[4]\n"
+ "fmla v12.8h, v6.8h, v1.h[4]\n"
+ "ldr q6, [x17, #0x120]\n"
+ "fmla v9.8h, v7.8h, v0.h[4]\n"
+ "fmla v13.8h, v7.8h, v1.h[4]\n"
+ "ldr q7, [x17, #0x130]\n"
+ "fmla v10.8h, v6.8h, v0.h[4]\n"
+ "fmla v14.8h, v6.8h, v1.h[4]\n"
+ "ldr q6, [x17, #0x140]\n"
+ "fmla v11.8h, v7.8h, v0.h[4]\n"
+ "fmla v15.8h, v7.8h, v1.h[4]\n"
+ "ldr q7, [x17, #0x150]\n"
+ "fmla v8.8h, v6.8h, v0.h[5]\n"
+ "fmla v12.8h, v6.8h, v1.h[5]\n"
+ "ldr q6, [x17, #0x160]\n"
+ "fmla v9.8h, v7.8h, v0.h[5]\n"
+ "fmla v13.8h, v7.8h, v1.h[5]\n"
+ "ldr q7, [x17, #0x170]\n"
+ "fmla v10.8h, v6.8h, v0.h[5]\n"
+ "fmla v14.8h, v6.8h, v1.h[5]\n"
+ "ldr q6, [x17, #0x180]\n"
+ "fmla v11.8h, v7.8h, v0.h[5]\n"
+ "fmla v15.8h, v7.8h, v1.h[5]\n"
+ "ldr q7, [x17, #0x190]\n"
+ "fmla v8.8h, v6.8h, v0.h[6]\n"
+ "fmla v12.8h, v6.8h, v1.h[6]\n"
+ "ldr q6, [x17, #0x1a0]\n"
+ "fmla v9.8h, v7.8h, v0.h[6]\n"
+ "fmla v13.8h, v7.8h, v1.h[6]\n"
+ "ldr q7, [x17, #0x1b0]\n"
+ "fmla v10.8h, v6.8h, v0.h[6]\n"
+ "fmla v14.8h, v6.8h, v1.h[6]\n"
+ "ldr q6, [x17, #0x1c0]\n"
+ "fmla v11.8h, v7.8h, v0.h[6]\n"
+ "fmla v15.8h, v7.8h, v1.h[6]\n"
+ "ldr q7, [x17, #0x1d0]\n"
+ "fmla v8.8h, v6.8h, v0.h[7]\n"
+ "fmla v12.8h, v6.8h, v1.h[7]\n"
+ "ldr q6, [x17, #0x1e0]\n"
+ "fmla v9.8h, v7.8h, v0.h[7]\n"
+ "fmla v13.8h, v7.8h, v1.h[7]\n"
+ "ldr q7, [x17, #0x1f0]\n"
+ "fmla v10.8h, v6.8h, v0.h[7]\n"
+ "add x17, x17, #0x200\n"
+ "fmla v14.8h, v6.8h, v1.h[7]\n"
+ "fmla v11.8h, v7.8h, v0.h[7]\n"
+ "fmla v15.8h, v7.8h, v1.h[7]\n"
+ "77:" // Height 2: Multiply loop: Main loop skip
+ "cbz x13, 79f\n"
+ "78:" // Height 2: Multiply loop: Odd block loop
+ "ldr h0, [x12], #0x2\n"
+ "sub x13, x13, #0x1\n"
+ "ldr h1, [x28], #0x2\n"
+ "ldr q6, [x17, #0x0]\n"
+ "ldr q7, [x17, #0x10]\n"
+ "fmla v8.8h, v6.8h, v0.h[0]\n"
+ "fmla v12.8h, v6.8h, v1.h[0]\n"
+ "ldr q6, [x17, #0x20]\n"
+ "fmla v9.8h, v7.8h, v0.h[0]\n"
+ "fmla v13.8h, v7.8h, v1.h[0]\n"
+ "ldr q7, [x17, #0x30]\n"
+ "fmla v10.8h, v6.8h, v0.h[0]\n"
+ "add x17, x17, #0x40\n"
+ "fmla v14.8h, v6.8h, v1.h[0]\n"
+ "fmla v11.8h, v7.8h, v0.h[0]\n"
+ "fmla v15.8h, v7.8h, v1.h[0]\n"
+ "cbnz x13, 78b\n"
+ "79:" // Height 2: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x14, x14, #0x1\n"
+ "cmp x14, x19\n"
+ "bne 72b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "prfm pstl1keep, [x15, #0x0]\n"
+ "add x25, x15, x19, LSL #1\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "tbz %x[flags], #1, 80f\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v1.8h }, [x20]\n"
+ "ld1r { v0.8h }, [x19]\n"
+ "fmin v8.8h, v8.8h, v0.8h\n"
+ "fmin v9.8h, v9.8h, v0.8h\n"
+ "fmin v10.8h, v10.8h, v0.8h\n"
+ "fmin v11.8h, v11.8h, v0.8h\n"
+ "fmin v12.8h, v12.8h, v0.8h\n"
+ "fmin v13.8h, v13.8h, v0.8h\n"
+ "fmin v14.8h, v14.8h, v0.8h\n"
+ "fmin v15.8h, v15.8h, v0.8h\n"
+ "fmax v8.8h, v8.8h, v1.8h\n"
+ "fmax v9.8h, v9.8h, v1.8h\n"
+ "fmax v10.8h, v10.8h, v1.8h\n"
+ "fmax v11.8h, v11.8h, v1.8h\n"
+ "fmax v12.8h, v12.8h, v1.8h\n"
+ "fmax v13.8h, v13.8h, v1.8h\n"
+ "fmax v14.8h, v14.8h, v1.8h\n"
+ "fmax v15.8h, v15.8h, v1.8h\n"
+ "80:" // Height 2: No activation
+ "cmp x8, #0x20\n"
+ "bge 97f\n"
+ "tbz x8, #4, 88f\n"
+ "st1 { v8.8h }, [x15], #0x10\n"
+ "st1 { v9.8h }, [x15], #0x10\n"
+ "st1 { v12.8h }, [x25], #0x10\n"
+ "st1 { v13.8h }, [x25], #0x10\n"
+ "tbz x8, #3, 84f\n"
+ "st1 { v10.8h }, [x15], #0x10\n"
+ "st1 { v14.8h }, [x25], #0x10\n"
+ "tbz x8, #2, 82f\n"
+ "str d11, [x15], #0x8\n"
+ "str d15, [x25], #0x8\n"
+ "tbz x8, #1, 81f\n"
+ "st1 { v11.s }[2], [x15], #0x4\n"
+ "st1 { v15.s }[2], [x25], #0x4\n"
+ "tbz x8, #0, 96f\n"
+ "st1 { v11.h }[6], [x15]\n"
+ "st1 { v15.h }[6], [x25]\n"
+ "b 96f\n"
+ "81:" // Height 2: Partial direct writeback: partial_1_28
+ "tbz x8, #0, 96f\n"
+ "st1 { v11.h }[4], [x15]\n"
+ "st1 { v15.h }[4], [x25]\n"
+ "b 96f\n"
+ "82:" // Height 2: Partial direct writeback: partial_2_24
+ "tbz x8, #1, 83f\n"
+ "str s11, [x15], #0x4\n"
+ "str s15, [x25], #0x4\n"
+ "tbz x8, #0, 96f\n"
+ "st1 { v11.h }[2], [x15]\n"
+ "st1 { v15.h }[2], [x25]\n"
+ "b 96f\n"
+ "83:" // Height 2: Partial direct writeback: partial_1_24
+ "tbz x8, #0, 96f\n"
+ "str h11, [x15, #0x0]\n"
+ "str h15, [x25, #0x0]\n"
+ "b 96f\n"
+ "84:" // Height 2: Partial direct writeback: partial_4_16
+ "tbz x8, #2, 86f\n"
+ "str d10, [x15], #0x8\n"
+ "str d14, [x25], #0x8\n"
+ "tbz x8, #1, 85f\n"
+ "st1 { v10.s }[2], [x15], #0x4\n"
+ "st1 { v14.s }[2], [x25], #0x4\n"
+ "tbz x8, #0, 96f\n"
+ "st1 { v10.h }[6], [x15]\n"
+ "st1 { v14.h }[6], [x25]\n"
+ "b 96f\n"
+ "85:" // Height 2: Partial direct writeback: partial_1_20
+ "tbz x8, #0, 96f\n"
+ "st1 { v10.h }[4], [x15]\n"
+ "st1 { v14.h }[4], [x25]\n"
+ "b 96f\n"
+ "86:" // Height 2: Partial direct writeback: partial_2_16
+ "tbz x8, #1, 87f\n"
+ "str s10, [x15], #0x4\n"
+ "str s14, [x25], #0x4\n"
+ "tbz x8, #0, 96f\n"
+ "st1 { v10.h }[2], [x15]\n"
+ "st1 { v14.h }[2], [x25]\n"
+ "b 96f\n"
+ "87:" // Height 2: Partial direct writeback: partial_1_16
+ "tbz x8, #0, 96f\n"
+ "str h10, [x15, #0x0]\n"
+ "str h14, [x25, #0x0]\n"
+ "b 96f\n"
+ "88:" // Height 2: Partial direct writeback: partial_8_0
+ "tbz x8, #3, 92f\n"
+ "st1 { v8.8h }, [x15], #0x10\n"
+ "st1 { v12.8h }, [x25], #0x10\n"
+ "tbz x8, #2, 90f\n"
+ "str d9, [x15], #0x8\n"
+ "str d13, [x25], #0x8\n"
+ "tbz x8, #1, 89f\n"
+ "st1 { v9.s }[2], [x15], #0x4\n"
+ "st1 { v13.s }[2], [x25], #0x4\n"
+ "tbz x8, #0, 96f\n"
+ "st1 { v9.h }[6], [x15]\n"
+ "st1 { v13.h }[6], [x25]\n"
+ "b 96f\n"
+ "89:" // Height 2: Partial direct writeback: partial_1_12
+ "tbz x8, #0, 96f\n"
+ "st1 { v9.h }[4], [x15]\n"
+ "st1 { v13.h }[4], [x25]\n"
+ "b 96f\n"
+ "90:" // Height 2: Partial direct writeback: partial_2_8
+ "tbz x8, #1, 91f\n"
+ "str s9, [x15], #0x4\n"
+ "str s13, [x25], #0x4\n"
+ "tbz x8, #0, 96f\n"
+ "st1 { v9.h }[2], [x15]\n"
+ "st1 { v13.h }[2], [x25]\n"
+ "b 96f\n"
+ "91:" // Height 2: Partial direct writeback: partial_1_8
+ "tbz x8, #0, 96f\n"
+ "str h9, [x15, #0x0]\n"
+ "str h13, [x25, #0x0]\n"
+ "b 96f\n"
+ "92:" // Height 2: Partial direct writeback: partial_4_0
+ "tbz x8, #2, 94f\n"
+ "str d8, [x15], #0x8\n"
+ "str d12, [x25], #0x8\n"
+ "tbz x8, #1, 93f\n"
+ "st1 { v8.s }[2], [x15], #0x4\n"
+ "st1 { v12.s }[2], [x25], #0x4\n"
+ "tbz x8, #0, 96f\n"
+ "st1 { v8.h }[6], [x15]\n"
+ "st1 { v12.h }[6], [x25]\n"
+ "b 96f\n"
+ "93:" // Height 2: Partial direct writeback: partial_1_4
+ "tbz x8, #0, 96f\n"
+ "st1 { v8.h }[4], [x15]\n"
+ "st1 { v12.h }[4], [x25]\n"
+ "b 96f\n"
+ "94:" // Height 2: Partial direct writeback: partial_2_0
+ "tbz x8, #1, 95f\n"
+ "str s8, [x15], #0x4\n"
+ "str s12, [x25], #0x4\n"
+ "tbz x8, #0, 96f\n"
+ "st1 { v8.h }[2], [x15]\n"
+ "st1 { v12.h }[2], [x25]\n"
+ "b 96f\n"
+ "95:" // Height 2: Partial direct writeback: partial_1_0
+ "str h8, [x15, #0x0]\n"
+ "str h12, [x25, #0x0]\n"
+ "96:" // Height 2: Partial direct writeback: Done
+ "b 98f\n"
+ "97:" // Height 2: Full writeback
+ "str q8, [x15, #0x0]\n"
+ "str q9, [x15, #0x10]\n"
+ "str q10, [x15, #0x20]\n"
+ "str q11, [x15, #0x30]\n"
+ "add x15, x15, #0x40\n"
+ "str q12, [x25, #0x0]\n"
+ "str q13, [x25, #0x10]\n"
+ "str q14, [x25, #0x20]\n"
+ "str q15, [x25, #0x30]\n"
+ "98:" // Height 2: Writeback done
+ "subs x8, x8, #0x20\n"
+ "bgt 51b\n"
+ "b 296f\n"
+ "99:" // Height 3: entry for the three-row variant (accumulators: row 0 = v8-v11, row 1 = v12-v15, row 2 = v16-v19)
+ "ldr x8, [%x[args_ptr], %[offsetof_N]]\n" // x8 = columns still to produce; reduced by 0x20 at the end of every column-loop pass
+ "mov x16, %x[bias]\n" // x16 = bias pointer; a zero pointer selects the no-bias path below
+ "ldr x17, [%x[args_ptr], %[offsetof_B_ptr]]\n" // x17 = B operand pointer, advanced as the multiply loops consume it
+ "mov x15, %x[output_ptr]\n" // x15 = output pointer for row 0
+ "100:" // Height 3: Column loop (each pass covers up to 0x20 half-precision columns)
+ "cbz x16, 101f\n"
+ "ldr q8, [x16, #0x0]\n" // bias path: 4 x 16 bytes of bias seed the row-0 accumulators
+ "ldr q9, [x16, #0x10]\n"
+ "ldr q10, [x16, #0x20]\n"
+ "mov v12.16b, v8.16b\n" // rows 1 and 2 start from the same bias values as row 0
+ "mov v16.16b, v8.16b\n"
+ "mov v13.16b, v9.16b\n"
+ "mov v17.16b, v9.16b\n"
+ "mov v14.16b, v10.16b\n"
+ "mov v18.16b, v10.16b\n"
+ "ldr q11, [x16, #0x30]\n"
+ "add x16, x16, #0x40\n" // step the bias pointer to the next block of 0x40 bytes
+ "mov v15.16b, v11.16b\n"
+ "mov v19.16b, v11.16b\n"
+ "b 120f\n"
+ "101:" // Height 3: no bias
+ "tbz %x[flags], #0, 119f\n" // flags bit 0 clear: skip the reload and start from zeroed accumulators
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x8, #0x20\n"
+ "add x25, x15, x19, LSL #1\n" // x25 = row 1 output pointer (x15 + output_offset * 2 bytes)
+ "add x24, x25, x19, LSL #1\n" // x24 = row 2 output pointer
+ "bge 118f\n" // 0x20 or more columns left: reload a full block
+ "tbz x8, #4, 109f\n" // partial block: bits 4..0 of x8 select loads of 16, 8, 4, 2 and 1 elements
+ "ld1 { v8.8h }, [x15], #0x10\n"
+ "ld1 { v12.8h }, [x25], #0x10\n"
+ "ld1 { v16.8h }, [x24], #0x10\n"
+ "ld1 { v9.8h }, [x15], #0x10\n"
+ "ld1 { v13.8h }, [x25], #0x10\n"
+ "ld1 { v17.8h }, [x24], #0x10\n"
+ "tbz x8, #3, 105f\n"
+ "ld1 { v10.8h }, [x15], #0x10\n"
+ "ld1 { v14.8h }, [x25], #0x10\n"
+ "ld1 { v18.8h }, [x24], #0x10\n"
+ "tbz x8, #2, 103f\n"
+ "ldr d11, [x15], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
+ "ldr d19, [x24], #0x8\n"
+ "tbz x8, #1, 102f\n"
+ "ld1 { v11.s }[2], [x15], #0x4\n"
+ "mov x19, #0x3c\n" // x19 = bytes x15 has been post-incremented on this path; used to rewind x15 at label 117
+ "ld1 { v15.s }[2], [x25], #0x4\n"
+ "ld1 { v19.s }[2], [x24], #0x4\n"
+ "tbz x8, #0, 117f\n"
+ "ld1 { v11.h }[6], [x15]\n"
+ "ld1 { v15.h }[6], [x25]\n"
+ "ld1 { v19.h }[6], [x24]\n"
+ "b 117f\n"
+ "102:" // Height 3: Partial accumulate: partial_1_28
+ "mov x19, #0x38\n"
+ "tbz x8, #0, 117f\n"
+ "ld1 { v11.h }[4], [x15]\n"
+ "ld1 { v15.h }[4], [x25]\n"
+ "ld1 { v19.h }[4], [x24]\n"
+ "b 117f\n"
+ "103:" // Height 3: Partial accumulate: partial_2_24
+ "tbz x8, #1, 104f\n"
+ "ldr s11, [x15], #0x4\n"
+ "ldr s15, [x25], #0x4\n"
+ "mov x19, #0x34\n"
+ "ldr s19, [x24], #0x4\n"
+ "tbz x8, #0, 117f\n"
+ "ld1 { v11.h }[2], [x15]\n"
+ "ld1 { v15.h }[2], [x25]\n"
+ "ld1 { v19.h }[2], [x24]\n"
+ "b 117f\n"
+ "104:" // Height 3: Partial accumulate: partial_1_24
+ "mov x19, #0x30\n"
+ "tbz x8, #0, 117f\n"
+ "ldr h11, [x15, #0x0]\n"
+ "ldr h15, [x25, #0x0]\n"
+ "ldr h19, [x24, #0x0]\n"
+ "b 117f\n"
+ "105:" // Height 3: Partial accumulate: partial_4_16
+ "tbz x8, #2, 107f\n"
+ "ldr d10, [x15], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
+ "ldr d18, [x24], #0x8\n"
+ "tbz x8, #1, 106f\n"
+ "ld1 { v10.s }[2], [x15], #0x4\n"
+ "mov x19, #0x2c\n"
+ "ld1 { v14.s }[2], [x25], #0x4\n"
+ "ld1 { v18.s }[2], [x24], #0x4\n"
+ "tbz x8, #0, 117f\n"
+ "ld1 { v10.h }[6], [x15]\n"
+ "ld1 { v14.h }[6], [x25]\n"
+ "ld1 { v18.h }[6], [x24]\n"
+ "b 117f\n"
+ "106:" // Height 3: Partial accumulate: partial_1_20
+ "mov x19, #0x28\n"
+ "tbz x8, #0, 117f\n"
+ "ld1 { v10.h }[4], [x15]\n"
+ "ld1 { v14.h }[4], [x25]\n"
+ "ld1 { v18.h }[4], [x24]\n"
+ "b 117f\n"
+ "107:" // Height 3: Partial accumulate: partial_2_16
+ "tbz x8, #1, 108f\n"
+ "ldr s10, [x15], #0x4\n"
+ "ldr s14, [x25], #0x4\n"
+ "mov x19, #0x24\n"
+ "ldr s18, [x24], #0x4\n"
+ "tbz x8, #0, 117f\n"
+ "ld1 { v10.h }[2], [x15]\n"
+ "ld1 { v14.h }[2], [x25]\n"
+ "ld1 { v18.h }[2], [x24]\n"
+ "b 117f\n"
+ "108:" // Height 3: Partial accumulate: partial_1_16
+ "mov x19, #0x20\n"
+ "tbz x8, #0, 117f\n"
+ "ldr h10, [x15, #0x0]\n"
+ "ldr h14, [x25, #0x0]\n"
+ "ldr h18, [x24, #0x0]\n"
+ "b 117f\n"
+ "109:" // Height 3: Partial accumulate: partial_8_0
+ "tbz x8, #3, 113f\n"
+ "ld1 { v8.8h }, [x15], #0x10\n"
+ "ld1 { v12.8h }, [x25], #0x10\n"
+ "ld1 { v16.8h }, [x24], #0x10\n"
+ "tbz x8, #2, 111f\n"
+ "ldr d9, [x15], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
+ "ldr d17, [x24], #0x8\n"
+ "tbz x8, #1, 110f\n"
+ "ld1 { v9.s }[2], [x15], #0x4\n"
+ "mov x19, #0x1c\n"
+ "ld1 { v13.s }[2], [x25], #0x4\n"
+ "ld1 { v17.s }[2], [x24], #0x4\n"
+ "tbz x8, #0, 117f\n"
+ "ld1 { v9.h }[6], [x15]\n"
+ "ld1 { v13.h }[6], [x25]\n"
+ "ld1 { v17.h }[6], [x24]\n"
+ "b 117f\n"
+ "110:" // Height 3: Partial accumulate: partial_1_12
+ "mov x19, #0x18\n"
+ "tbz x8, #0, 117f\n"
+ "ld1 { v9.h }[4], [x15]\n"
+ "ld1 { v13.h }[4], [x25]\n"
+ "ld1 { v17.h }[4], [x24]\n"
+ "b 117f\n"
+ "111:" // Height 3: Partial accumulate: partial_2_8
+ "tbz x8, #1, 112f\n"
+ "ldr s9, [x15], #0x4\n"
+ "ldr s13, [x25], #0x4\n"
+ "mov x19, #0x14\n"
+ "ldr s17, [x24], #0x4\n"
+ "tbz x8, #0, 117f\n"
+ "ld1 { v9.h }[2], [x15]\n"
+ "ld1 { v13.h }[2], [x25]\n"
+ "ld1 { v17.h }[2], [x24]\n"
+ "b 117f\n"
+ "112:" // Height 3: Partial accumulate: partial_1_8
+ "mov x19, #0x10\n"
+ "tbz x8, #0, 117f\n"
+ "ldr h9, [x15, #0x0]\n"
+ "ldr h13, [x25, #0x0]\n"
+ "ldr h17, [x24, #0x0]\n"
+ "b 117f\n"
+ "113:" // Height 3: Partial accumulate: partial_4_0
+ "tbz x8, #2, 115f\n"
+ "ldr d8, [x15], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
+ "ldr d16, [x24], #0x8\n"
+ "tbz x8, #1, 114f\n"
+ "ld1 { v8.s }[2], [x15], #0x4\n"
+ "mov x19, #0xc\n"
+ "ld1 { v12.s }[2], [x25], #0x4\n"
+ "ld1 { v16.s }[2], [x24], #0x4\n"
+ "tbz x8, #0, 117f\n"
+ "ld1 { v8.h }[6], [x15]\n"
+ "ld1 { v12.h }[6], [x25]\n"
+ "ld1 { v16.h }[6], [x24]\n"
+ "b 117f\n"
+ "114:" // Height 3: Partial accumulate: partial_1_4
+ "mov x19, #0x8\n"
+ "tbz x8, #0, 117f\n"
+ "ld1 { v8.h }[4], [x15]\n"
+ "ld1 { v12.h }[4], [x25]\n"
+ "ld1 { v16.h }[4], [x24]\n"
+ "b 117f\n"
+ "115:" // Height 3: Partial accumulate: partial_2_0
+ "tbz x8, #1, 116f\n"
+ "ldr s8, [x15], #0x4\n"
+ "ldr s12, [x25], #0x4\n"
+ "mov x19, #0x4\n"
+ "ldr s16, [x24], #0x4\n"
+ "tbz x8, #0, 117f\n"
+ "ld1 { v8.h }[2], [x15]\n"
+ "ld1 { v12.h }[2], [x25]\n"
+ "ld1 { v16.h }[2], [x24]\n"
+ "b 117f\n"
+ "116:" // Height 3: Partial accumulate: partial_1_0 (bits 4..1 of x8 are clear here; the single element is loaded without testing bit 0)
+ "ldr h8, [x15, #0x0]\n"
+ "mov x19, #0x0\n"
+ "ldr h12, [x25, #0x0]\n"
+ "ldr h16, [x24, #0x0]\n"
+ "117:" // Height 3: Partial accumulate: Done
+ "sub x15, x15, x19\n" // rewind x15 to the start of the block; x25/x24 are not rewound (they are recomputed from x15 before writeback)
+ "b 120f\n"
+ "118:" // Height 3: full accumulate (reload a complete 3-row block of 0x40 bytes per row, pointers left unchanged)
+ "ldr q8, [x15, #0x0]\n"
+ "ldr q9, [x15, #0x10]\n"
+ "ldr q10, [x15, #0x20]\n"
+ "ldr q11, [x15, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
+ "ldr q16, [x24, #0x0]\n"
+ "ldr q17, [x24, #0x10]\n"
+ "ldr q18, [x24, #0x20]\n"
+ "ldr q19, [x24, #0x30]\n"
+ "b 120f\n"
+ "119:" // Height 3: no accumulate (clear all twelve accumulators v8-v19)
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "120:" // Height 3: setup done
+ "mov x14, #0x0\n" // x14 = string index
+ "121:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w13, [x20, x14, LSL #0x2]\n" // x13 = string_lengths[x14] (32-bit entries); counts the multiplies left in this string
+ "tbz %x[flags], #3, 122f\n" // flags bit 3 set: input_ptr is a table of per-string row-pointer arrays
+ "ldr x20, [%x[input_ptr], x14, LSL #0x3]\n" // x20 = input_ptr[x14]
+ "add x20, x20, x19, LSL #3\n" // skip input_offset 8-byte pointer entries
+ "ldr x12, [x20, #0x0]\n" // x12 / x28 / x26 = input pointers for rows 0 / 1 / 2
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "cbnz x14, 123f\n" // the initial column offset is applied to the first string only
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x12, x12, x19, LSL #1\n" // 2 bytes per element
+ "add x28, x28, x19, LSL #1\n"
+ "add x26, x26, x19, LSL #1\n"
+ "b 123f\n"
+ "122:" // Height 3: setup direct input
+ "mov x12, %x[input_ptr]\n"
+ "add x28, x12, x19, LSL #1\n" // consecutive rows are input_offset * 2 bytes apart
+ "add x26, x28, x19, LSL #1\n"
+ "123:" // Height 3: input setup done
+ "cmp x13, #0x8\n"
+ "blt 126f\n" // fewer than 8 multiplies: go straight to the one-at-a-time loop
+ "ldr q0, [x12, #0x0]\n" // preload 8 input elements per row into v0 / v1 / v2
+ "ldr q1, [x28, #0x0]\n"
+ "cmp x13, #0x10\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q6, [x17, #0x0]\n" // preload the first B vector
+ "blt 125f\n" // fewer than 16: only the single-iteration block runs
+ "124:" // Height 3: Multiply loop: Main loop head (8 multiply steps, lanes h[0]..h[7], per pass; B vectors are loaded as two 64-bit halves and merged with mov v.d[1] - presumably scheduled for Cortex-A55 per the commit title, confirm)
+ "fmla v8.8h, v6.8h, v0.h[0]\n"
+ "ldr d7, [x17, #0x10]\n" // low 64 bits of the next B vector
+ "fmla v12.8h, v6.8h, v1.h[0]\n"
+ "ldr x11, [x17, #0x18]\n" // high 64 bits, merged into v7 by the mov below
+ "fmla v16.8h, v6.8h, v2.h[0]\n"
+ "ldr d6, [x17, #0x20]\n"
+ "ldr x10, [x17, #0x28]\n"
+ "add x12, x12, #0x10\n" // advance row 0 input by 8 elements
+ "mov v7.d[1], x11\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "ldr x11, [x17, #0x38]\n"
+ "add x28, x28, #0x10\n" // advance row 1 input
+ "fmla v9.8h, v7.8h, v0.h[0]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v13.8h, v7.8h, v1.h[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "fmla v17.8h, v7.8h, v2.h[0]\n"
+ "ldr d7, [x17, #0x30]\n"
+ "fmla v10.8h, v6.8h, v0.h[0]\n"
+ "ldr x10, [x17, #0x48]\n"
+ "fmla v14.8h, v6.8h, v1.h[0]\n"
+ "ldr x9, [x12, #0x8]\n" // x9 = high 64 bits of the next row 0 input vector (x12 already advanced)
+ "fmla v18.8h, v6.8h, v2.h[0]\n"
+ "mov v7.d[1], x11\n"
+ "ldr d6, [x17, #0x40]\n"
+ "add x26, x26, #0x10\n" // advance row 2 input
+ "fmla v11.8h, v7.8h, v0.h[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "fmla v15.8h, v7.8h, v1.h[0]\n"
+ "ldr x11, [x17, #0x58]\n"
+ "fmla v19.8h, v7.8h, v2.h[0]\n"
+ "mov v6.d[1], x10\n"
+ "ldr d7, [x17, #0x50]\n"
+ "sub x13, x13, #0x8\n" // 8 multiplies consumed by this pass
+ "fmla v8.8h, v6.8h, v0.h[1]\n"
+ "ldr x10, [x17, #0x68]\n"
+ "fmla v12.8h, v6.8h, v1.h[1]\n"
+ "ldr x27, [x28, #0x8]\n" // x27 = high 64 bits of the next row 1 input vector
+ "fmla v16.8h, v6.8h, v2.h[1]\n"
+ "mov v7.d[1], x11\n"
+ "ldr d6, [x17, #0x60]\n"
+ "cmp x13, #0x10\n" // loop condition; no flag-setting instruction follows before the bge at the bottom
+ "fmla v9.8h, v7.8h, v0.h[1]\n"
+ "ldr x11, [x17, #0x78]\n"
+ "fmla v13.8h, v7.8h, v1.h[1]\n"
+ "ldr x25, [x26, #0x8]\n" // x25 used as scratch: high 64 bits of the next row 2 input vector (recomputed as an output pointer after the string loop)
+ "fmla v17.8h, v7.8h, v2.h[1]\n"
+ "mov v6.d[1], x10\n"
+ "ldr d7, [x17, #0x70]\n"
+ "fmla v10.8h, v6.8h, v0.h[1]\n"
+ "ldr x10, [x17, #0x88]\n"
+ "fmla v14.8h, v6.8h, v1.h[1]\n"
+ "fmla v18.8h, v6.8h, v2.h[1]\n"
+ "mov v7.d[1], x11\n"
+ "ldr d6, [x17, #0x80]\n"
+ "fmla v11.8h, v7.8h, v0.h[1]\n"
+ "ldr x11, [x17, #0x98]\n"
+ "fmla v15.8h, v7.8h, v1.h[1]\n"
+ "fmla v19.8h, v7.8h, v2.h[1]\n"
+ "mov v6.d[1], x10\n"
+ "ldr d7, [x17, #0x90]\n"
+ "fmla v8.8h, v6.8h, v0.h[2]\n"
+ "ldr x10, [x17, #0xa8]\n"
+ "fmla v12.8h, v6.8h, v1.h[2]\n"
+ "fmla v16.8h, v6.8h, v2.h[2]\n"
+ "mov v7.d[1], x11\n"
+ "ldr d6, [x17, #0xa0]\n"
+ "fmla v9.8h, v7.8h, v0.h[2]\n"
+ "ldr x11, [x17, #0xb8]\n"
+ "fmla v13.8h, v7.8h, v1.h[2]\n"
+ "fmla v17.8h, v7.8h, v2.h[2]\n"
+ "mov v6.d[1], x10\n"
+ "ldr d7, [x17, #0xb0]\n"
+ "fmla v10.8h, v6.8h, v0.h[2]\n"
+ "ldr x10, [x17, #0xc8]\n"
+ "fmla v14.8h, v6.8h, v1.h[2]\n"
+ "fmla v18.8h, v6.8h, v2.h[2]\n"
+ "mov v7.d[1], x11\n"
+ "ldr d6, [x17, #0xc0]\n"
+ "fmla v11.8h, v7.8h, v0.h[2]\n"
+ "ldr x11, [x17, #0xd8]\n"
+ "fmla v15.8h, v7.8h, v1.h[2]\n"
+ "fmla v19.8h, v7.8h, v2.h[2]\n"
+ "mov v6.d[1], x10\n"
+ "ldr d7, [x17, #0xd0]\n"
+ "fmla v8.8h, v6.8h, v0.h[3]\n"
+ "ldr x10, [x17, #0xe8]\n"
+ "fmla v12.8h, v6.8h, v1.h[3]\n"
+ "fmla v16.8h, v6.8h, v2.h[3]\n"
+ "mov v7.d[1], x11\n"
+ "ldr d6, [x17, #0xe0]\n"
+ "fmla v9.8h, v7.8h, v0.h[3]\n"
+ "ldr x11, [x17, #0xf8]\n"
+ "fmla v13.8h, v7.8h, v1.h[3]\n"
+ "fmla v17.8h, v7.8h, v2.h[3]\n"
+ "mov v6.d[1], x10\n"
+ "ldr d7, [x17, #0xf0]\n"
+ "fmla v10.8h, v6.8h, v0.h[3]\n"
+ "ldr x10, [x17, #0x108]\n"
+ "fmla v14.8h, v6.8h, v1.h[3]\n"
+ "fmla v18.8h, v6.8h, v2.h[3]\n"
+ "mov v7.d[1], x11\n"
+ "ldr d6, [x17, #0x100]\n"
+ "fmla v11.8h, v7.8h, v0.h[3]\n"
+ "ldr x11, [x17, #0x118]\n"
+ "fmla v15.8h, v7.8h, v1.h[3]\n"
+ "fmla v19.8h, v7.8h, v2.h[3]\n"
+ "mov v6.d[1], x10\n"
+ "ldr d7, [x17, #0x110]\n"
+ "fmla v8.8h, v6.8h, v0.h[4]\n"
+ "ldr x10, [x17, #0x128]\n"
+ "fmla v12.8h, v6.8h, v1.h[4]\n"
+ "fmla v16.8h, v6.8h, v2.h[4]\n"
+ "mov v7.d[1], x11\n"
+ "ldr d6, [x17, #0x120]\n"
+ "fmla v9.8h, v7.8h, v0.h[4]\n"
+ "ldr x11, [x17, #0x138]\n"
+ "fmla v13.8h, v7.8h, v1.h[4]\n"
+ "fmla v17.8h, v7.8h, v2.h[4]\n"
+ "mov v6.d[1], x10\n"
+ "ldr d7, [x17, #0x130]\n"
+ "fmla v10.8h, v6.8h, v0.h[4]\n"
+ "ldr x10, [x17, #0x148]\n"
+ "fmla v14.8h, v6.8h, v1.h[4]\n"
+ "fmla v18.8h, v6.8h, v2.h[4]\n"
+ "mov v7.d[1], x11\n"
+ "ldr d6, [x17, #0x140]\n"
+ "fmla v11.8h, v7.8h, v0.h[4]\n"
+ "ldr x11, [x17, #0x158]\n"
+ "fmla v15.8h, v7.8h, v1.h[4]\n"
+ "fmla v19.8h, v7.8h, v2.h[4]\n"
+ "mov v6.d[1], x10\n"
+ "ldr d7, [x17, #0x150]\n"
+ "fmla v8.8h, v6.8h, v0.h[5]\n"
+ "ldr x10, [x17, #0x168]\n"
+ "fmla v12.8h, v6.8h, v1.h[5]\n"
+ "fmla v16.8h, v6.8h, v2.h[5]\n"
+ "mov v7.d[1], x11\n"
+ "ldr d6, [x17, #0x160]\n"
+ "fmla v9.8h, v7.8h, v0.h[5]\n"
+ "ldr x11, [x17, #0x178]\n"
+ "fmla v13.8h, v7.8h, v1.h[5]\n"
+ "fmla v17.8h, v7.8h, v2.h[5]\n"
+ "mov v6.d[1], x10\n"
+ "ldr d7, [x17, #0x170]\n"
+ "fmla v10.8h, v6.8h, v0.h[5]\n"
+ "ldr x10, [x17, #0x188]\n"
+ "fmla v14.8h, v6.8h, v1.h[5]\n"
+ "fmla v18.8h, v6.8h, v2.h[5]\n"
+ "mov v7.d[1], x11\n"
+ "ldr d6, [x17, #0x180]\n"
+ "fmla v11.8h, v7.8h, v0.h[5]\n"
+ "ldr x11, [x17, #0x198]\n"
+ "fmla v15.8h, v7.8h, v1.h[5]\n"
+ "fmla v19.8h, v7.8h, v2.h[5]\n"
+ "mov v6.d[1], x10\n"
+ "ldr d7, [x17, #0x190]\n"
+ "fmla v8.8h, v6.8h, v0.h[6]\n"
+ "ldr x10, [x17, #0x1a8]\n"
+ "fmla v12.8h, v6.8h, v1.h[6]\n"
+ "fmla v16.8h, v6.8h, v2.h[6]\n"
+ "mov v7.d[1], x11\n"
+ "ldr d6, [x17, #0x1a0]\n"
+ "fmla v9.8h, v7.8h, v0.h[6]\n"
+ "ldr x11, [x17, #0x1b8]\n"
+ "fmla v13.8h, v7.8h, v1.h[6]\n"
+ "fmla v17.8h, v7.8h, v2.h[6]\n"
+ "mov v6.d[1], x10\n"
+ "ldr d7, [x17, #0x1b0]\n"
+ "fmla v10.8h, v6.8h, v0.h[6]\n"
+ "ldr x10, [x17, #0x1c8]\n"
+ "fmla v14.8h, v6.8h, v1.h[6]\n"
+ "fmla v18.8h, v6.8h, v2.h[6]\n"
+ "mov v7.d[1], x11\n"
+ "ldr d6, [x17, #0x1c0]\n"
+ "fmla v11.8h, v7.8h, v0.h[6]\n"
+ "ldr x11, [x17, #0x1d8]\n"
+ "fmla v15.8h, v7.8h, v1.h[6]\n"
+ "fmla v19.8h, v7.8h, v2.h[6]\n"
+ "mov v6.d[1], x10\n"
+ "ldr d7, [x17, #0x1d0]\n"
+ "fmla v8.8h, v6.8h, v0.h[7]\n"
+ "ldr x10, [x17, #0x1e8]\n"
+ "fmla v12.8h, v6.8h, v1.h[7]\n"
+ "fmla v16.8h, v6.8h, v2.h[7]\n"
+ "mov v7.d[1], x11\n"
+ "ldr d6, [x17, #0x1e0]\n"
+ "fmla v9.8h, v7.8h, v0.h[7]\n"
+ "ldr x11, [x17, #0x1f8]\n"
+ "fmla v13.8h, v7.8h, v1.h[7]\n"
+ "fmla v17.8h, v7.8h, v2.h[7]\n"
+ "mov v6.d[1], x10\n"
+ "ldr d7, [x17, #0x1f0]\n"
+ "add x17, x17, #0x200\n" // B pointer moves past the 0x200 bytes used by this pass (8 steps x 0x40)
+ "fmla v10.8h, v6.8h, v0.h[7]\n"
+ "ldr x10, [x17, #0x8]\n" // start loading the first B vector of the next pass
+ "fmla v14.8h, v6.8h, v1.h[7]\n"
+ "fmla v18.8h, v6.8h, v2.h[7]\n"
+ "mov v7.d[1], x11\n"
+ "ldr d6, [x17, #0x0]\n"
+ "fmla v11.8h, v7.8h, v0.h[7]\n"
+ "ldr d0, [x12, #0x0]\n" // reload v0 / v1 / v2 for the next pass: low halves here, high halves from x9 / x27 / x25
+ "fmla v15.8h, v7.8h, v1.h[7]\n"
+ "ldr d1, [x28, #0x0]\n"
+ "fmla v19.8h, v7.8h, v2.h[7]\n"
+ "mov v6.d[1], x10\n"
+ "mov v0.d[1], x9\n"
+ "ldr d2, [x26, #0x0]\n"
+ "mov v1.d[1], x27\n"
+ "mov v2.d[1], x25\n"
+ "bge 124b\n" // repeat while at least 0x10 multiplies remain (flags from the cmp x13, #0x10 above)
+ "125:" // Height 3: Multiply loop: Single iteration only (final group of 8 multiply steps; whole-q B loads, no preload of further inputs)
+ "fmla v8.8h, v6.8h, v0.h[0]\n"
+ "ldr q7, [x17, #0x10]\n"
+ "fmla v12.8h, v6.8h, v1.h[0]\n"
+ "sub x13, x13, #0x8\n" // fewer than 8 multiplies remain after this block
+ "fmla v16.8h, v6.8h, v2.h[0]\n"
+ "ldr q6, [x17, #0x20]\n"
+ "fmla v9.8h, v7.8h, v0.h[0]\n"
+ "add x12, x12, #0x10\n" // step the row input pointers past the 8 elements already held in v0 / v1 / v2
+ "fmla v13.8h, v7.8h, v1.h[0]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "fmla v17.8h, v7.8h, v2.h[0]\n"
+ "ldr q7, [x17, #0x30]\n"
+ "fmla v10.8h, v6.8h, v0.h[0]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v14.8h, v6.8h, v1.h[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "fmla v18.8h, v6.8h, v2.h[0]\n"
+ "ldr q6, [x17, #0x40]\n"
+ "fmla v11.8h, v7.8h, v0.h[0]\n"
+ "add x26, x26, #0x10\n"
+ "fmla v15.8h, v7.8h, v1.h[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "fmla v19.8h, v7.8h, v2.h[0]\n"
+ "ldr q7, [x17, #0x50]\n"
+ "fmla v8.8h, v6.8h, v0.h[1]\n"
+ "fmla v12.8h, v6.8h, v1.h[1]\n"
+ "fmla v16.8h, v6.8h, v2.h[1]\n"
+ "ldr q6, [x17, #0x60]\n"
+ "fmla v9.8h, v7.8h, v0.h[1]\n"
+ "fmla v13.8h, v7.8h, v1.h[1]\n"
+ "fmla v17.8h, v7.8h, v2.h[1]\n"
+ "ldr q7, [x17, #0x70]\n"
+ "fmla v10.8h, v6.8h, v0.h[1]\n"
+ "fmla v14.8h, v6.8h, v1.h[1]\n"
+ "fmla v18.8h, v6.8h, v2.h[1]\n"
+ "ldr q6, [x17, #0x80]\n"
+ "fmla v11.8h, v7.8h, v0.h[1]\n"
+ "fmla v15.8h, v7.8h, v1.h[1]\n"
+ "fmla v19.8h, v7.8h, v2.h[1]\n"
+ "ldr q7, [x17, #0x90]\n"
+ "fmla v8.8h, v6.8h, v0.h[2]\n"
+ "fmla v12.8h, v6.8h, v1.h[2]\n"
+ "fmla v16.8h, v6.8h, v2.h[2]\n"
+ "ldr q6, [x17, #0xa0]\n"
+ "fmla v9.8h, v7.8h, v0.h[2]\n"
+ "fmla v13.8h, v7.8h, v1.h[2]\n"
+ "fmla v17.8h, v7.8h, v2.h[2]\n"
+ "ldr q7, [x17, #0xb0]\n"
+ "fmla v10.8h, v6.8h, v0.h[2]\n"
+ "fmla v14.8h, v6.8h, v1.h[2]\n"
+ "fmla v18.8h, v6.8h, v2.h[2]\n"
+ "ldr q6, [x17, #0xc0]\n"
+ "fmla v11.8h, v7.8h, v0.h[2]\n"
+ "fmla v15.8h, v7.8h, v1.h[2]\n"
+ "fmla v19.8h, v7.8h, v2.h[2]\n"
+ "ldr q7, [x17, #0xd0]\n"
+ "fmla v8.8h, v6.8h, v0.h[3]\n"
+ "fmla v12.8h, v6.8h, v1.h[3]\n"
+ "fmla v16.8h, v6.8h, v2.h[3]\n"
+ "ldr q6, [x17, #0xe0]\n"
+ "fmla v9.8h, v7.8h, v0.h[3]\n"
+ "fmla v13.8h, v7.8h, v1.h[3]\n"
+ "fmla v17.8h, v7.8h, v2.h[3]\n"
+ "ldr q7, [x17, #0xf0]\n"
+ "fmla v10.8h, v6.8h, v0.h[3]\n"
+ "fmla v14.8h, v6.8h, v1.h[3]\n"
+ "fmla v18.8h, v6.8h, v2.h[3]\n"
+ "ldr q6, [x17, #0x100]\n"
+ "fmla v11.8h, v7.8h, v0.h[3]\n"
+ "fmla v15.8h, v7.8h, v1.h[3]\n"
+ "fmla v19.8h, v7.8h, v2.h[3]\n"
+ "ldr q7, [x17, #0x110]\n"
+ "fmla v8.8h, v6.8h, v0.h[4]\n"
+ "fmla v12.8h, v6.8h, v1.h[4]\n"
+ "fmla v16.8h, v6.8h, v2.h[4]\n"
+ "ldr q6, [x17, #0x120]\n"
+ "fmla v9.8h, v7.8h, v0.h[4]\n"
+ "fmla v13.8h, v7.8h, v1.h[4]\n"
+ "fmla v17.8h, v7.8h, v2.h[4]\n"
+ "ldr q7, [x17, #0x130]\n"
+ "fmla v10.8h, v6.8h, v0.h[4]\n"
+ "fmla v14.8h, v6.8h, v1.h[4]\n"
+ "fmla v18.8h, v6.8h, v2.h[4]\n"
+ "ldr q6, [x17, #0x140]\n"
+ "fmla v11.8h, v7.8h, v0.h[4]\n"
+ "fmla v15.8h, v7.8h, v1.h[4]\n"
+ "fmla v19.8h, v7.8h, v2.h[4]\n"
+ "ldr q7, [x17, #0x150]\n"
+ "fmla v8.8h, v6.8h, v0.h[5]\n"
+ "fmla v12.8h, v6.8h, v1.h[5]\n"
+ "fmla v16.8h, v6.8h, v2.h[5]\n"
+ "ldr q6, [x17, #0x160]\n"
+ "fmla v9.8h, v7.8h, v0.h[5]\n"
+ "fmla v13.8h, v7.8h, v1.h[5]\n"
+ "fmla v17.8h, v7.8h, v2.h[5]\n"
+ "ldr q7, [x17, #0x170]\n"
+ "fmla v10.8h, v6.8h, v0.h[5]\n"
+ "fmla v14.8h, v6.8h, v1.h[5]\n"
+ "fmla v18.8h, v6.8h, v2.h[5]\n"
+ "ldr q6, [x17, #0x180]\n"
+ "fmla v11.8h, v7.8h, v0.h[5]\n"
+ "fmla v15.8h, v7.8h, v1.h[5]\n"
+ "fmla v19.8h, v7.8h, v2.h[5]\n"
+ "ldr q7, [x17, #0x190]\n"
+ "fmla v8.8h, v6.8h, v0.h[6]\n"
+ "fmla v12.8h, v6.8h, v1.h[6]\n"
+ "fmla v16.8h, v6.8h, v2.h[6]\n"
+ "ldr q6, [x17, #0x1a0]\n"
+ "fmla v9.8h, v7.8h, v0.h[6]\n"
+ "fmla v13.8h, v7.8h, v1.h[6]\n"
+ "fmla v17.8h, v7.8h, v2.h[6]\n"
+ "ldr q7, [x17, #0x1b0]\n"
+ "fmla v10.8h, v6.8h, v0.h[6]\n"
+ "fmla v14.8h, v6.8h, v1.h[6]\n"
+ "fmla v18.8h, v6.8h, v2.h[6]\n"
+ "ldr q6, [x17, #0x1c0]\n"
+ "fmla v11.8h, v7.8h, v0.h[6]\n"
+ "fmla v15.8h, v7.8h, v1.h[6]\n"
+ "fmla v19.8h, v7.8h, v2.h[6]\n"
+ "ldr q7, [x17, #0x1d0]\n"
+ "fmla v8.8h, v6.8h, v0.h[7]\n"
+ "fmla v12.8h, v6.8h, v1.h[7]\n"
+ "fmla v16.8h, v6.8h, v2.h[7]\n"
+ "ldr q6, [x17, #0x1e0]\n"
+ "fmla v9.8h, v7.8h, v0.h[7]\n"
+ "fmla v13.8h, v7.8h, v1.h[7]\n"
+ "fmla v17.8h, v7.8h, v2.h[7]\n"
+ "ldr q7, [x17, #0x1f0]\n"
+ "fmla v10.8h, v6.8h, v0.h[7]\n"
+ "add x17, x17, #0x200\n" // B pointer moves past the 0x200 bytes used by this block
+ "fmla v14.8h, v6.8h, v1.h[7]\n"
+ "fmla v18.8h, v6.8h, v2.h[7]\n"
+ "fmla v11.8h, v7.8h, v0.h[7]\n"
+ "fmla v15.8h, v7.8h, v1.h[7]\n"
+ "fmla v19.8h, v7.8h, v2.h[7]\n"
+ "126:" // Height 3: Multiply loop: Main loop skip
+ "cbz x13, 128f\n" // nothing left in this string
+ "127:" // Height 3: Multiply loop: Odd block loop (one multiply step per pass)
+ "ldr h0, [x12], #0x2\n" // one input element per row, pointer post-incremented by 2 bytes
+ "sub x13, x13, #0x1\n"
+ "ldr h1, [x28], #0x2\n"
+ "ldr h2, [x26], #0x2\n"
+ "ldr q6, [x17, #0x0]\n"
+ "ldr q7, [x17, #0x10]\n"
+ "fmla v8.8h, v6.8h, v0.h[0]\n"
+ "fmla v12.8h, v6.8h, v1.h[0]\n"
+ "fmla v16.8h, v6.8h, v2.h[0]\n"
+ "ldr q6, [x17, #0x20]\n"
+ "fmla v9.8h, v7.8h, v0.h[0]\n"
+ "fmla v13.8h, v7.8h, v1.h[0]\n"
+ "fmla v17.8h, v7.8h, v2.h[0]\n"
+ "ldr q7, [x17, #0x30]\n"
+ "fmla v10.8h, v6.8h, v0.h[0]\n"
+ "add x17, x17, #0x40\n" // one step consumes 0x40 bytes of B (four q vectors)
+ "fmla v14.8h, v6.8h, v1.h[0]\n"
+ "fmla v18.8h, v6.8h, v2.h[0]\n"
+ "fmla v11.8h, v7.8h, v0.h[0]\n"
+ "fmla v15.8h, v7.8h, v1.h[0]\n"
+ "fmla v19.8h, v7.8h, v2.h[0]\n"
+ "cbnz x13, 127b\n"
+ "128:" // Height 3: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x14, x14, #0x1\n" // next string
+ "cmp x14, x19\n"
+ "bne 121b\n" // keep accumulating until every string has been processed
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "prfm pstl1keep, [x15, #0x0]\n"
+ "add x25, x15, x19, LSL #1\n" // x25 = row 1 output pointer (x15 + output_offset * 2 bytes)
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "add x24, x25, x19, LSL #1\n" // x24 = row 2 output pointer
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "tbz %x[flags], #1, 129f\n" // flags bit 1 clear: skip the clamp
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v1.8h }, [x20]\n" // v1 = lower bound replicated to all lanes
+ "ld1r { v0.8h }, [x19]\n" // v0 = upper bound replicated to all lanes
+ "fmin v8.8h, v8.8h, v0.8h\n" // clamp all twelve accumulators: fmin against the upper bound, then fmax against the lower bound
+ "fmin v9.8h, v9.8h, v0.8h\n"
+ "fmin v10.8h, v10.8h, v0.8h\n"
+ "fmin v11.8h, v11.8h, v0.8h\n"
+ "fmin v12.8h, v12.8h, v0.8h\n"
+ "fmin v13.8h, v13.8h, v0.8h\n"
+ "fmin v14.8h, v14.8h, v0.8h\n"
+ "fmin v15.8h, v15.8h, v0.8h\n"
+ "fmin v16.8h, v16.8h, v0.8h\n"
+ "fmin v17.8h, v17.8h, v0.8h\n"
+ "fmax v8.8h, v8.8h, v1.8h\n"
+ "fmax v9.8h, v9.8h, v1.8h\n"
+ "fmax v10.8h, v10.8h, v1.8h\n"
+ "fmax v11.8h, v11.8h, v1.8h\n"
+ "fmax v12.8h, v12.8h, v1.8h\n"
+ "fmax v13.8h, v13.8h, v1.8h\n"
+ "fmax v14.8h, v14.8h, v1.8h\n"
+ "fmax v15.8h, v15.8h, v1.8h\n"
+ "fmax v16.8h, v16.8h, v1.8h\n"
+ "fmax v17.8h, v17.8h, v1.8h\n"
+ "fmin v18.8h, v18.8h, v0.8h\n"
+ "fmin v19.8h, v19.8h, v0.8h\n"
+ "fmax v18.8h, v18.8h, v1.8h\n"
+ "fmax v19.8h, v19.8h, v1.8h\n"
+ "129:" // Height 3: No activation
+ "cmp x8, #0x20\n"
+ "bge 146f\n" // 0x20 or more columns left: store a full block
+ "tbz x8, #4, 137f\n" // partial block: bits 4..0 of x8 select stores of 16, 8, 4, 2 and 1 elements per row
+ "st1 { v8.8h }, [x15], #0x10\n"
+ "st1 { v9.8h }, [x15], #0x10\n"
+ "st1 { v12.8h }, [x25], #0x10\n"
+ "st1 { v13.8h }, [x25], #0x10\n"
+ "st1 { v16.8h }, [x24], #0x10\n"
+ "st1 { v17.8h }, [x24], #0x10\n"
+ "tbz x8, #3, 133f\n"
+ "st1 { v10.8h }, [x15], #0x10\n"
+ "st1 { v14.8h }, [x25], #0x10\n"
+ "st1 { v18.8h }, [x24], #0x10\n"
+ "tbz x8, #2, 131f\n"
+ "str d11, [x15], #0x8\n"
+ "str d15, [x25], #0x8\n"
+ "str d19, [x24], #0x8\n"
+ "tbz x8, #1, 130f\n"
+ "st1 { v11.s }[2], [x15], #0x4\n"
+ "st1 { v15.s }[2], [x25], #0x4\n"
+ "st1 { v19.s }[2], [x24], #0x4\n"
+ "tbz x8, #0, 145f\n"
+ "st1 { v11.h }[6], [x15]\n"
+ "st1 { v15.h }[6], [x25]\n"
+ "st1 { v19.h }[6], [x24]\n"
+ "b 145f\n"
+ "130:" // Height 3: Partial direct writeback: partial_1_28
+ "tbz x8, #0, 145f\n"
+ "st1 { v11.h }[4], [x15]\n"
+ "st1 { v15.h }[4], [x25]\n"
+ "st1 { v19.h }[4], [x24]\n"
+ "b 145f\n"
+ "131:" // Height 3: Partial direct writeback: partial_2_24
+ "tbz x8, #1, 132f\n"
+ "str s11, [x15], #0x4\n"
+ "str s15, [x25], #0x4\n"
+ "str s19, [x24], #0x4\n"
+ "tbz x8, #0, 145f\n"
+ "st1 { v11.h }[2], [x15]\n"
+ "st1 { v15.h }[2], [x25]\n"
+ "st1 { v19.h }[2], [x24]\n"
+ "b 145f\n"
+ "132:" // Height 3: Partial direct writeback: partial_1_24
+ "tbz x8, #0, 145f\n"
+ "str h11, [x15, #0x0]\n"
+ "str h15, [x25, #0x0]\n"
+ "str h19, [x24, #0x0]\n"
+ "b 145f\n"
+ "133:" // Height 3: Partial direct writeback: partial_4_16
+ "tbz x8, #2, 135f\n"
+ "str d10, [x15], #0x8\n"
+ "str d14, [x25], #0x8\n"
+ "str d18, [x24], #0x8\n"
+ "tbz x8, #1, 134f\n"
+ "st1 { v10.s }[2], [x15], #0x4\n"
+ "st1 { v14.s }[2], [x25], #0x4\n"
+ "st1 { v18.s }[2], [x24], #0x4\n"
+ "tbz x8, #0, 145f\n"
+ "st1 { v10.h }[6], [x15]\n"
+ "st1 { v14.h }[6], [x25]\n"
+ "st1 { v18.h }[6], [x24]\n"
+ "b 145f\n"
+ "134:" // Height 3: Partial direct writeback: partial_1_20
+ "tbz x8, #0, 145f\n"
+ "st1 { v10.h }[4], [x15]\n"
+ "st1 { v14.h }[4], [x25]\n"
+ "st1 { v18.h }[4], [x24]\n"
+ "b 145f\n"
+ "135:" // Height 3: Partial direct writeback: partial_2_16
+ "tbz x8, #1, 136f\n"
+ "str s10, [x15], #0x4\n"
+ "str s14, [x25], #0x4\n"
+ "str s18, [x24], #0x4\n"
+ "tbz x8, #0, 145f\n"
+ "st1 { v10.h }[2], [x15]\n"
+ "st1 { v14.h }[2], [x25]\n"
+ "st1 { v18.h }[2], [x24]\n"
+ "b 145f\n"
+ "136:" // Height 3: Partial direct writeback: partial_1_16
+ "tbz x8, #0, 145f\n"
+ "str h10, [x15, #0x0]\n"
+ "str h14, [x25, #0x0]\n"
+ "str h18, [x24, #0x0]\n"
+ "b 145f\n"
+ "137:" // Height 3: Partial direct writeback: partial_8_0
+ "tbz x8, #3, 141f\n"
+ "st1 { v8.8h }, [x15], #0x10\n"
+ "st1 { v12.8h }, [x25], #0x10\n"
+ "st1 { v16.8h }, [x24], #0x10\n"
+ "tbz x8, #2, 139f\n"
+ "str d9, [x15], #0x8\n"
+ "str d13, [x25], #0x8\n"
+ "str d17, [x24], #0x8\n"
+ "tbz x8, #1, 138f\n"
+ "st1 { v9.s }[2], [x15], #0x4\n"
+ "st1 { v13.s }[2], [x25], #0x4\n"
+ "st1 { v17.s }[2], [x24], #0x4\n"
+ "tbz x8, #0, 145f\n"
+ "st1 { v9.h }[6], [x15]\n"
+ "st1 { v13.h }[6], [x25]\n"
+ "st1 { v17.h }[6], [x24]\n"
+ "b 145f\n"
+ "138:" // Height 3: Partial direct writeback: partial_1_12
+ "tbz x8, #0, 145f\n"
+ "st1 { v9.h }[4], [x15]\n"
+ "st1 { v13.h }[4], [x25]\n"
+ "st1 { v17.h }[4], [x24]\n"
+ "b 145f\n"
+ "139:" // Height 3: Partial direct writeback: partial_2_8
+ "tbz x8, #1, 140f\n"
+ "str s9, [x15], #0x4\n"
+ "str s13, [x25], #0x4\n"
+ "str s17, [x24], #0x4\n"
+ "tbz x8, #0, 145f\n"
+ "st1 { v9.h }[2], [x15]\n"
+ "st1 { v13.h }[2], [x25]\n"
+ "st1 { v17.h }[2], [x24]\n"
+ "b 145f\n"
+ "140:" // Height 3: Partial direct writeback: partial_1_8
+ "tbz x8, #0, 145f\n"
+ "str h9, [x15, #0x0]\n"
+ "str h13, [x25, #0x0]\n"
+ "str h17, [x24, #0x0]\n"
+ "b 145f\n"
+ "141:" // Height 3: Partial direct writeback: partial_4_0
+ "tbz x8, #2, 143f\n"
+ "str d8, [x15], #0x8\n"
+ "str d12, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
+ "tbz x8, #1, 142f\n"
+ "st1 { v8.s }[2], [x15], #0x4\n"
+ "st1 { v12.s }[2], [x25], #0x4\n"
+ "st1 { v16.s }[2], [x24], #0x4\n"
+ "tbz x8, #0, 145f\n"
+ "st1 { v8.h }[6], [x15]\n"
+ "st1 { v12.h }[6], [x25]\n"
+ "st1 { v16.h }[6], [x24]\n"
+ "b 145f\n"
+ "142:" // Height 3: Partial direct writeback: partial_1_4
+ "tbz x8, #0, 145f\n"
+ "st1 { v8.h }[4], [x15]\n"
+ "st1 { v12.h }[4], [x25]\n"
+ "st1 { v16.h }[4], [x24]\n"
+ "b 145f\n"
+ "143:" // Height 3: Partial direct writeback: partial_2_0
+ "tbz x8, #1, 144f\n"
+ "str s8, [x15], #0x4\n"
+ "str s12, [x25], #0x4\n"
+ "str s16, [x24], #0x4\n"
+ "tbz x8, #0, 145f\n"
+ "st1 { v8.h }[2], [x15]\n"
+ "st1 { v12.h }[2], [x25]\n"
+ "st1 { v16.h }[2], [x24]\n"
+ "b 145f\n"
+ "144:" // Height 3: Partial direct writeback: partial_1_0 (bits 4..1 of x8 are clear here; the single element is stored without testing bit 0)
+ "str h8, [x15, #0x0]\n"
+ "str h12, [x25, #0x0]\n"
+ "str h16, [x24, #0x0]\n"
+ "145:" // Height 3: Partial direct writeback: Done (output pointers are left where the stores moved them; x8 < 0x20 here, so the subs/bgt that follows ends the column loop)
+ "b 147f\n"
+ "146:" // Height 3: Full writeback
+ "str q8, [x15, #0x0]\n"
+ "str q9, [x15, #0x10]\n"
+ "str q10, [x15, #0x20]\n"
+ "str q11, [x15, #0x30]\n"
+ "add x15, x15, #0x40\n"
+ "str q12, [x25, #0x0]\n"
+ "str q13, [x25, #0x10]\n"
+ "str q14, [x25, #0x20]\n"
+ "str q15, [x25, #0x30]\n"
+ "str q16, [x24, #0x0]\n"
+ "str q17, [x24, #0x10]\n"
+ "str q18, [x24, #0x20]\n"
+ "str q19, [x24, #0x30]\n"
+ "147:" // Height 3: Writeback done
+ "subs x8, x8, #0x20\n"
+ "bgt 100b\n"
+ "b 296f\n"
+ "148:" // Height 4: entry for the four-row variant; sets up the registers the column loop relies on
+ "ldr x8, [%x[args_ptr], %[offsetof_N]]\n" // x8 = N from the args block; columns still to produce (reduced by 0x20 per column-loop pass)
+ "mov x16, %x[bias]\n" // x16 = bias pointer; zero selects the no-bias path below
+ "ldr x17, [%x[args_ptr], %[offsetof_B_ptr]]\n" // x17 = B_ptr from the args block; advanced by the multiply loops as B data is consumed
+ "mov x15, %x[output_ptr]\n" // x15 = output pointer for row 0
+ "149:" // Height 4: Column loop (each pass handles up to 0x20 fp16 columns for all four rows)
+ "cbz x16, 150f\n"
+ "ldr q8, [x16, #0x0]\n" // v8-v11 (row 0 accumulators) are seeded with 0x40 bytes of bias
+ "ldr q9, [x16, #0x10]\n"
+ "ldr q10, [x16, #0x20]\n"
+ "mov v12.16b, v8.16b\n" // rows 1-3 (v12-v15, v16-v19, v20-v23) start from copies of the same bias vectors
+ "mov v16.16b, v8.16b\n"
+ "mov v13.16b, v9.16b\n"
+ "mov v17.16b, v9.16b\n"
+ "mov v14.16b, v10.16b\n"
+ "mov v18.16b, v10.16b\n"
+ "mov v20.16b, v8.16b\n"
+ "mov v21.16b, v9.16b\n"
+ "mov v22.16b, v10.16b\n"
+ "ldr q11, [x16, #0x30]\n"
+ "add x16, x16, #0x40\n" // step the bias pointer on to the next block of columns
+ "mov v15.16b, v11.16b\n"
+ "mov v19.16b, v11.16b\n"
+ "mov v23.16b, v11.16b\n"
+ "b 169f\n" // accumulators are initialised; skip the accumulate / zero-init paths
+ "150:" // Height 4: no bias
+ "tbz %x[flags], #0, 168f\n" // flags bit 0 clear: start from zeroed accumulators instead of reading the output
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" // x19 = output_offset, scaled by 2 bytes per element below
+ "cmp x8, #0x20\n"
+ "add x25, x15, x19, LSL #1\n" // x25 = row 1 output pointer
+ "add x24, x25, x19, LSL #1\n" // x24 = row 2 output pointer
+ "add x23, x24, x19, LSL #1\n" // x23 = row 3 output pointer
+ "bge 167f\n" // 0x20 or more columns left: load whole rows with plain q loads
+ "tbz x8, #4, 158f\n" // below here x8 < 0x20; its bits 4..0 select 16/8/4/2/1-element pieces to load
+ "ld1 { v8.8h }, [x15], #0x10\n"
+ "ld1 { v12.8h }, [x25], #0x10\n"
+ "ld1 { v16.8h }, [x24], #0x10\n"
+ "ld1 { v9.8h }, [x15], #0x10\n"
+ "ld1 { v13.8h }, [x25], #0x10\n"
+ "ld1 { v17.8h }, [x24], #0x10\n"
+ "ld1 { v20.8h }, [x23], #0x10\n"
+ "ld1 { v21.8h }, [x23], #0x10\n"
+ "tbz x8, #3, 154f\n"
+ "ld1 { v10.8h }, [x15], #0x10\n"
+ "ld1 { v14.8h }, [x25], #0x10\n"
+ "ld1 { v18.8h }, [x24], #0x10\n"
+ "ld1 { v22.8h }, [x23], #0x10\n"
+ "tbz x8, #2, 152f\n"
+ "ldr d11, [x15], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
+ "ldr d19, [x24], #0x8\n"
+ "ldr d23, [x23], #0x8\n"
+ "tbz x8, #1, 151f\n"
+ "ld1 { v11.s }[2], [x15], #0x4\n"
+ "mov x19, #0x3c\n" // x19 = bytes x15 has been post-incremented by on this path; subtracted again at label 166
+ "ld1 { v15.s }[2], [x25], #0x4\n"
+ "ld1 { v19.s }[2], [x24], #0x4\n"
+ "ld1 { v23.s }[2], [x23], #0x4\n"
+ "tbz x8, #0, 166f\n"
+ "ld1 { v11.h }[6], [x15]\n"
+ "ld1 { v15.h }[6], [x25]\n"
+ "ld1 { v19.h }[6], [x24]\n"
+ "ld1 { v23.h }[6], [x23]\n"
+ "b 166f\n"
+ "151:" // Height 4: Partial accumulate: partial_1_28
+ "mov x19, #0x38\n"
+ "tbz x8, #0, 166f\n"
+ "ld1 { v11.h }[4], [x15]\n"
+ "ld1 { v15.h }[4], [x25]\n"
+ "ld1 { v19.h }[4], [x24]\n"
+ "ld1 { v23.h }[4], [x23]\n"
+ "b 166f\n"
+ "152:" // Height 4: Partial accumulate: partial_2_24
+ "tbz x8, #1, 153f\n"
+ "ldr s11, [x15], #0x4\n"
+ "ldr s15, [x25], #0x4\n"
+ "mov x19, #0x34\n"
+ "ldr s19, [x24], #0x4\n"
+ "ldr s23, [x23], #0x4\n"
+ "tbz x8, #0, 166f\n"
+ "ld1 { v11.h }[2], [x15]\n"
+ "ld1 { v15.h }[2], [x25]\n"
+ "ld1 { v19.h }[2], [x24]\n"
+ "ld1 { v23.h }[2], [x23]\n"
+ "b 166f\n"
+ "153:" // Height 4: Partial accumulate: partial_1_24
+ "mov x19, #0x30\n"
+ "tbz x8, #0, 166f\n"
+ "ldr h11, [x15, #0x0]\n"
+ "ldr h15, [x25, #0x0]\n"
+ "ldr h19, [x24, #0x0]\n"
+ "ldr h23, [x23, #0x0]\n"
+ "b 166f\n"
+ "154:" // Height 4: Partial accumulate: partial_4_16
+ "tbz x8, #2, 156f\n"
+ "ldr d10, [x15], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
+ "ldr d18, [x24], #0x8\n"
+ "ldr d22, [x23], #0x8\n"
+ "tbz x8, #1, 155f\n"
+ "ld1 { v10.s }[2], [x15], #0x4\n"
+ "mov x19, #0x2c\n"
+ "ld1 { v14.s }[2], [x25], #0x4\n"
+ "ld1 { v18.s }[2], [x24], #0x4\n"
+ "ld1 { v22.s }[2], [x23], #0x4\n"
+ "tbz x8, #0, 166f\n"
+ "ld1 { v10.h }[6], [x15]\n"
+ "ld1 { v14.h }[6], [x25]\n"
+ "ld1 { v18.h }[6], [x24]\n"
+ "ld1 { v22.h }[6], [x23]\n"
+ "b 166f\n"
+ "155:" // Height 4: Partial accumulate: partial_1_20
+ "mov x19, #0x28\n"
+ "tbz x8, #0, 166f\n"
+ "ld1 { v10.h }[4], [x15]\n"
+ "ld1 { v14.h }[4], [x25]\n"
+ "ld1 { v18.h }[4], [x24]\n"
+ "ld1 { v22.h }[4], [x23]\n"
+ "b 166f\n"
+ "156:" // Height 4: Partial accumulate: partial_2_16
+ "tbz x8, #1, 157f\n"
+ "ldr s10, [x15], #0x4\n"
+ "ldr s14, [x25], #0x4\n"
+ "mov x19, #0x24\n"
+ "ldr s18, [x24], #0x4\n"
+ "ldr s22, [x23], #0x4\n"
+ "tbz x8, #0, 166f\n"
+ "ld1 { v10.h }[2], [x15]\n"
+ "ld1 { v14.h }[2], [x25]\n"
+ "ld1 { v18.h }[2], [x24]\n"
+ "ld1 { v22.h }[2], [x23]\n"
+ "b 166f\n"
+ "157:" // Height 4: Partial accumulate: partial_1_16
+ "mov x19, #0x20\n"
+ "tbz x8, #0, 166f\n"
+ "ldr h10, [x15, #0x0]\n"
+ "ldr h14, [x25, #0x0]\n"
+ "ldr h18, [x24, #0x0]\n"
+ "ldr h22, [x23, #0x0]\n"
+ "b 166f\n"
+ "158:" // Height 4: Partial accumulate: partial_8_0
+ "tbz x8, #3, 162f\n"
+ "ld1 { v8.8h }, [x15], #0x10\n"
+ "ld1 { v12.8h }, [x25], #0x10\n"
+ "ld1 { v16.8h }, [x24], #0x10\n"
+ "ld1 { v20.8h }, [x23], #0x10\n"
+ "tbz x8, #2, 160f\n"
+ "ldr d9, [x15], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
+ "ldr d17, [x24], #0x8\n"
+ "ldr d21, [x23], #0x8\n"
+ "tbz x8, #1, 159f\n"
+ "ld1 { v9.s }[2], [x15], #0x4\n"
+ "mov x19, #0x1c\n"
+ "ld1 { v13.s }[2], [x25], #0x4\n"
+ "ld1 { v17.s }[2], [x24], #0x4\n"
+ "ld1 { v21.s }[2], [x23], #0x4\n"
+ "tbz x8, #0, 166f\n"
+ "ld1 { v9.h }[6], [x15]\n"
+ "ld1 { v13.h }[6], [x25]\n"
+ "ld1 { v17.h }[6], [x24]\n"
+ "ld1 { v21.h }[6], [x23]\n"
+ "b 166f\n"
+ "159:" // Height 4: Partial accumulate: partial_1_12
+ "mov x19, #0x18\n"
+ "tbz x8, #0, 166f\n"
+ "ld1 { v9.h }[4], [x15]\n"
+ "ld1 { v13.h }[4], [x25]\n"
+ "ld1 { v17.h }[4], [x24]\n"
+ "ld1 { v21.h }[4], [x23]\n"
+ "b 166f\n"
+ "160:" // Height 4: Partial accumulate: partial_2_8
+ "tbz x8, #1, 161f\n"
+ "ldr s9, [x15], #0x4\n"
+ "ldr s13, [x25], #0x4\n"
+ "mov x19, #0x14\n"
+ "ldr s17, [x24], #0x4\n"
+ "ldr s21, [x23], #0x4\n"
+ "tbz x8, #0, 166f\n"
+ "ld1 { v9.h }[2], [x15]\n"
+ "ld1 { v13.h }[2], [x25]\n"
+ "ld1 { v17.h }[2], [x24]\n"
+ "ld1 { v21.h }[2], [x23]\n"
+ "b 166f\n"
+ "161:" // Height 4: Partial accumulate: partial_1_8
+ "mov x19, #0x10\n"
+ "tbz x8, #0, 166f\n"
+ "ldr h9, [x15, #0x0]\n"
+ "ldr h13, [x25, #0x0]\n"
+ "ldr h17, [x24, #0x0]\n"
+ "ldr h21, [x23, #0x0]\n"
+ "b 166f\n"
+ "162:" // Height 4: Partial accumulate: partial_4_0
+ "tbz x8, #2, 164f\n"
+ "ldr d8, [x15], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
+ "ldr d16, [x24], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
+ "tbz x8, #1, 163f\n"
+ "ld1 { v8.s }[2], [x15], #0x4\n"
+ "mov x19, #0xc\n"
+ "ld1 { v12.s }[2], [x25], #0x4\n"
+ "ld1 { v16.s }[2], [x24], #0x4\n"
+ "ld1 { v20.s }[2], [x23], #0x4\n"
+ "tbz x8, #0, 166f\n"
+ "ld1 { v8.h }[6], [x15]\n"
+ "ld1 { v12.h }[6], [x25]\n"
+ "ld1 { v16.h }[6], [x24]\n"
+ "ld1 { v20.h }[6], [x23]\n"
+ "b 166f\n"
+ "163:" // Height 4: Partial accumulate: partial_1_4
+ "mov x19, #0x8\n"
+ "tbz x8, #0, 166f\n"
+ "ld1 { v8.h }[4], [x15]\n"
+ "ld1 { v12.h }[4], [x25]\n"
+ "ld1 { v16.h }[4], [x24]\n"
+ "ld1 { v20.h }[4], [x23]\n"
+ "b 166f\n"
+ "164:" // Height 4: Partial accumulate: partial_2_0
+ "tbz x8, #1, 165f\n"
+ "ldr s8, [x15], #0x4\n"
+ "ldr s12, [x25], #0x4\n"
+ "mov x19, #0x4\n"
+ "ldr s16, [x24], #0x4\n"
+ "ldr s20, [x23], #0x4\n"
+ "tbz x8, #0, 166f\n"
+ "ld1 { v8.h }[2], [x15]\n"
+ "ld1 { v12.h }[2], [x25]\n"
+ "ld1 { v16.h }[2], [x24]\n"
+ "ld1 { v20.h }[2], [x23]\n"
+ "b 166f\n"
+ "165:" // Height 4: Partial accumulate: partial_1_0 (bits 4..1 all clear, so the single element is loaded without testing bit 0)
+ "ldr h8, [x15, #0x0]\n"
+ "mov x19, #0x0\n"
+ "ldr h12, [x25, #0x0]\n"
+ "ldr h16, [x24, #0x0]\n"
+ "ldr h20, [x23, #0x0]\n"
+ "166:" // Height 4: Partial accumulate: Done
+ "sub x15, x15, x19\n" // rewind x15 to the row-0 start; x25/x24/x23 are not rewound here, they are rebuilt from x15 before writeback
+ "b 169f\n"
+ "167:" // Height 4: full accumulate (reached when x8 >= 0x20): preload all 16 accumulators from the output rows
+ "ldr q8, [x15, #0x0]\n" // row 0 (x15) -> v8-v11
+ "ldr q9, [x15, #0x10]\n"
+ "ldr q10, [x15, #0x20]\n"
+ "ldr q11, [x15, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n" // row 1 (x25) -> v12-v15
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
+ "ldr q16, [x24, #0x0]\n" // row 2 (x24) -> v16-v19
+ "ldr q17, [x24, #0x10]\n"
+ "ldr q18, [x24, #0x20]\n"
+ "ldr q19, [x24, #0x30]\n"
+ "ldr q20, [x23, #0x0]\n" // row 3 (x23) -> v20-v23
+ "ldr q21, [x23, #0x10]\n"
+ "ldr q22, [x23, #0x20]\n"
+ "ldr q23, [x23, #0x30]\n"
+ "b 169f\n" // skip the zero-init path
+ "168:" // Height 4: no accumulate (no bias and flags bit 0 clear): zero all 16 accumulators v8-v23
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "movi v20.16b, #0x0\n"
+ "movi v21.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
+ "169:" // Height 4: setup done
+ "mov x14, #0x0\n" // x14 = string index, compared against num_strings at the bottom of the string loop
+ "170:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w13, [x20, x14, LSL #0x2]\n" // x13 = string_lengths[x14] (32-bit entries); K elements left in this string
+ "tbz %x[flags], #3, 171f\n" // flags bit 3 clear: input_ptr is the data itself rather than a table of row pointers
+ "ldr x20, [%x[input_ptr], x14, LSL #0x3]\n" // indirect case: fetch this string's pointer-table entry (8-byte entries)
+ "add x20, x20, x19, LSL #3\n" // skip input_offset 8-byte entries in that table
+ "ldr x12, [x20, #0x0]\n" // x12/x28/x26/x24 = A pointers for rows 0..3
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "cbnz x14, 172f\n" // the initial-column adjustment is applied to the first string only
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x12, x12, x19, LSL #1\n" // advance each row pointer by input_initial_col elements of 2 bytes
+ "add x28, x28, x19, LSL #1\n"
+ "add x26, x26, x19, LSL #1\n"
+ "add x24, x24, x19, LSL #1\n"
+ "b 172f\n"
+ "171:" // Height 4: setup direct input
+ "mov x12, %x[input_ptr]\n" // row 0 starts at input_ptr; successive rows are input_offset * 2 bytes apart
+ "add x28, x12, x19, LSL #1\n"
+ "add x26, x28, x19, LSL #1\n"
+ "add x24, x26, x19, LSL #1\n"
+ "172:" // Height 4: input setup done
+ "cmp x13, #0x8\n"
+ "blt 175f\n" // fewer than 8 K elements in this string: handle them one at a time in the odd block loop
+ "ldr q0, [x12, #0x0]\n" // v0..v3 = 8 fp16 A values for rows 0..3
+ "ldr q1, [x28, #0x0]\n"
+ "cmp x13, #0x10\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q6, [x17, #0x0]\n" // first B vector for this block of K
+ "blt 174f\n" // fewer than 16 left: only one full 8-wide step to do, use the non-looping copy
+ "173:" // Height 4: Multiply loop: Main loop head (each pass: 8 K elements, 0x200 bytes of B, 128 fmla)
+ "fmla v8.8h, v6.8h, v0.h[0]\n"
+ "ldr d7, [x17, #0x10]\n" // B vectors are fetched as a 64-bit low half (ldr d) plus a 64-bit GPR load moved into d[1]; presumably the A55-tuned load split - TODO confirm
+ "fmla v12.8h, v6.8h, v1.h[0]\n"
+ "ldr x11, [x17, #0x18]\n"
+ "fmla v16.8h, v6.8h, v2.h[0]\n"
+ "ldr x10, [x17, #0x28]\n"
+ "fmla v20.8h, v6.8h, v3.h[0]\n"
+ "ldr d6, [x17, #0x20]\n"
+ "mov v7.d[1], x11\n"
+ "ldr x11, [x17, #0x38]\n"
+ "add x12, x12, #0x10\n" // advance row 0 A pointer to the next 8 elements
+ "add x28, x28, #0x10\n"
+ "fmla v9.8h, v7.8h, v0.h[0]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v13.8h, v7.8h, v1.h[0]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "fmla v17.8h, v7.8h, v2.h[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "fmla v21.8h, v7.8h, v3.h[0]\n"
+ "ldr d7, [x17, #0x30]\n"
+ "fmla v10.8h, v6.8h, v0.h[0]\n"
+ "ldr x10, [x17, #0x48]\n"
+ "fmla v14.8h, v6.8h, v1.h[0]\n"
+ "ldr x9, [x12, #0x8]\n" // x9 = high 64 bits of the next row-0 A vector (x12 already advanced); merged into v0 at the loop tail
+ "fmla v18.8h, v6.8h, v2.h[0]\n"
+ "mov v7.d[1], x11\n"
+ "fmla v22.8h, v6.8h, v3.h[0]\n"
+ "ldr d6, [x17, #0x40]\n"
+ "fmla v11.8h, v7.8h, v0.h[0]\n"
+ "ldr x11, [x17, #0x58]\n"
+ "fmla v15.8h, v7.8h, v1.h[0]\n"
+ "ldr x27, [x28, #0x8]\n" // x27 = high half of the next row-1 A vector
+ "fmla v19.8h, v7.8h, v2.h[0]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v23.8h, v7.8h, v3.h[0]\n"
+ "ldr d7, [x17, #0x50]\n"
+ "fmla v8.8h, v6.8h, v0.h[1]\n"
+ "ldr x10, [x17, #0x68]\n"
+ "fmla v12.8h, v6.8h, v1.h[1]\n"
+ "add x26, x26, #0x10\n"
+ "fmla v16.8h, v6.8h, v2.h[1]\n"
+ "mov v7.d[1], x11\n"
+ "fmla v20.8h, v6.8h, v3.h[1]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "fmla v9.8h, v7.8h, v0.h[1]\n"
+ "ldr d6, [x17, #0x60]\n"
+ "fmla v13.8h, v7.8h, v1.h[1]\n"
+ "ldr x11, [x17, #0x78]\n"
+ "fmla v17.8h, v7.8h, v2.h[1]\n"
+ "ldr x25, [x26, #0x8]\n" // x25 = high half of the next row-2 A vector (x25 is rebuilt as an output pointer after the string loop)
+ "fmla v21.8h, v7.8h, v3.h[1]\n"
+ "mov v6.d[1], x10\n"
+ "ldr d7, [x17, #0x70]\n"
+ "add x24, x24, #0x10\n"
+ "fmla v10.8h, v6.8h, v0.h[1]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "fmla v14.8h, v6.8h, v1.h[1]\n"
+ "ldr x10, [x17, #0x88]\n"
+ "fmla v18.8h, v6.8h, v2.h[1]\n"
+ "mov v7.d[1], x11\n"
+ "fmla v22.8h, v6.8h, v3.h[1]\n"
+ "ldr d6, [x17, #0x80]\n"
+ "fmla v11.8h, v7.8h, v0.h[1]\n"
+ "ldr x11, [x17, #0x98]\n"
+ "fmla v15.8h, v7.8h, v1.h[1]\n"
+ "ldr x23, [x24, #0x8]\n" // x23 = high half of the next row-3 A vector (also rebuilt as an output pointer later)
+ "fmla v19.8h, v7.8h, v2.h[1]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v23.8h, v7.8h, v3.h[1]\n"
+ "ldr d7, [x17, #0x90]\n"
+ "fmla v8.8h, v6.8h, v0.h[2]\n"
+ "ldr x10, [x17, #0xa8]\n"
+ "fmla v12.8h, v6.8h, v1.h[2]\n"
+ "sub x13, x13, #0x8\n" // eight K elements are consumed per pass
+ "fmla v16.8h, v6.8h, v2.h[2]\n"
+ "mov v7.d[1], x11\n"
+ "fmla v20.8h, v6.8h, v3.h[2]\n"
+ "ldr d6, [x17, #0xa0]\n"
+ "fmla v9.8h, v7.8h, v0.h[2]\n"
+ "ldr x11, [x17, #0xb8]\n"
+ "fmla v13.8h, v7.8h, v1.h[2]\n"
+ "cmp x13, #0x10\n" // flags set here feed the bge at the loop tail; nothing in between writes the flags
+ "fmla v17.8h, v7.8h, v2.h[2]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v21.8h, v7.8h, v3.h[2]\n"
+ "ldr d7, [x17, #0xb0]\n"
+ "fmla v10.8h, v6.8h, v0.h[2]\n"
+ "ldr x10, [x17, #0xc8]\n"
+ "fmla v14.8h, v6.8h, v1.h[2]\n"
+ "fmla v18.8h, v6.8h, v2.h[2]\n"
+ "mov v7.d[1], x11\n"
+ "fmla v22.8h, v6.8h, v3.h[2]\n"
+ "ldr d6, [x17, #0xc0]\n"
+ "fmla v11.8h, v7.8h, v0.h[2]\n"
+ "ldr x11, [x17, #0xd8]\n"
+ "fmla v15.8h, v7.8h, v1.h[2]\n"
+ "fmla v19.8h, v7.8h, v2.h[2]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v23.8h, v7.8h, v3.h[2]\n"
+ "ldr d7, [x17, #0xd0]\n"
+ "fmla v8.8h, v6.8h, v0.h[3]\n"
+ "ldr x10, [x17, #0xe8]\n"
+ "fmla v12.8h, v6.8h, v1.h[3]\n"
+ "fmla v16.8h, v6.8h, v2.h[3]\n"
+ "mov v7.d[1], x11\n"
+ "fmla v20.8h, v6.8h, v3.h[3]\n"
+ "ldr d6, [x17, #0xe0]\n"
+ "fmla v9.8h, v7.8h, v0.h[3]\n"
+ "ldr x11, [x17, #0xf8]\n"
+ "fmla v13.8h, v7.8h, v1.h[3]\n"
+ "fmla v17.8h, v7.8h, v2.h[3]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v21.8h, v7.8h, v3.h[3]\n"
+ "ldr d7, [x17, #0xf0]\n"
+ "fmla v10.8h, v6.8h, v0.h[3]\n"
+ "ldr x10, [x17, #0x108]\n"
+ "fmla v14.8h, v6.8h, v1.h[3]\n"
+ "fmla v18.8h, v6.8h, v2.h[3]\n"
+ "mov v7.d[1], x11\n"
+ "fmla v22.8h, v6.8h, v3.h[3]\n"
+ "ldr d6, [x17, #0x100]\n"
+ "fmla v11.8h, v7.8h, v0.h[3]\n"
+ "ldr x11, [x17, #0x118]\n"
+ "fmla v15.8h, v7.8h, v1.h[3]\n"
+ "fmla v19.8h, v7.8h, v2.h[3]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v23.8h, v7.8h, v3.h[3]\n"
+ "ldr d7, [x17, #0x110]\n"
+ "fmla v8.8h, v6.8h, v0.h[4]\n"
+ "ldr x10, [x17, #0x128]\n"
+ "fmla v12.8h, v6.8h, v1.h[4]\n"
+ "fmla v16.8h, v6.8h, v2.h[4]\n"
+ "mov v7.d[1], x11\n"
+ "fmla v20.8h, v6.8h, v3.h[4]\n"
+ "ldr d6, [x17, #0x120]\n"
+ "fmla v9.8h, v7.8h, v0.h[4]\n"
+ "ldr x11, [x17, #0x138]\n"
+ "fmla v13.8h, v7.8h, v1.h[4]\n"
+ "fmla v17.8h, v7.8h, v2.h[4]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v21.8h, v7.8h, v3.h[4]\n"
+ "ldr d7, [x17, #0x130]\n"
+ "fmla v10.8h, v6.8h, v0.h[4]\n"
+ "ldr x10, [x17, #0x148]\n"
+ "fmla v14.8h, v6.8h, v1.h[4]\n"
+ "fmla v18.8h, v6.8h, v2.h[4]\n"
+ "mov v7.d[1], x11\n"
+ "fmla v22.8h, v6.8h, v3.h[4]\n"
+ "ldr d6, [x17, #0x140]\n"
+ "fmla v11.8h, v7.8h, v0.h[4]\n"
+ "ldr x11, [x17, #0x158]\n"
+ "fmla v15.8h, v7.8h, v1.h[4]\n"
+ "fmla v19.8h, v7.8h, v2.h[4]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v23.8h, v7.8h, v3.h[4]\n"
+ "ldr d7, [x17, #0x150]\n"
+ "fmla v8.8h, v6.8h, v0.h[5]\n"
+ "ldr x10, [x17, #0x168]\n"
+ "fmla v12.8h, v6.8h, v1.h[5]\n"
+ "fmla v16.8h, v6.8h, v2.h[5]\n"
+ "mov v7.d[1], x11\n"
+ "fmla v20.8h, v6.8h, v3.h[5]\n"
+ "ldr d6, [x17, #0x160]\n"
+ "fmla v9.8h, v7.8h, v0.h[5]\n"
+ "ldr x11, [x17, #0x178]\n"
+ "fmla v13.8h, v7.8h, v1.h[5]\n"
+ "fmla v17.8h, v7.8h, v2.h[5]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v21.8h, v7.8h, v3.h[5]\n"
+ "ldr d7, [x17, #0x170]\n"
+ "fmla v10.8h, v6.8h, v0.h[5]\n"
+ "ldr x10, [x17, #0x188]\n"
+ "fmla v14.8h, v6.8h, v1.h[5]\n"
+ "fmla v18.8h, v6.8h, v2.h[5]\n"
+ "mov v7.d[1], x11\n"
+ "fmla v22.8h, v6.8h, v3.h[5]\n"
+ "ldr d6, [x17, #0x180]\n"
+ "fmla v11.8h, v7.8h, v0.h[5]\n"
+ "ldr x11, [x17, #0x198]\n"
+ "fmla v15.8h, v7.8h, v1.h[5]\n"
+ "fmla v19.8h, v7.8h, v2.h[5]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v23.8h, v7.8h, v3.h[5]\n"
+ "ldr d7, [x17, #0x190]\n"
+ "fmla v8.8h, v6.8h, v0.h[6]\n"
+ "ldr x10, [x17, #0x1a8]\n"
+ "fmla v12.8h, v6.8h, v1.h[6]\n"
+ "fmla v16.8h, v6.8h, v2.h[6]\n"
+ "mov v7.d[1], x11\n"
+ "fmla v20.8h, v6.8h, v3.h[6]\n"
+ "ldr d6, [x17, #0x1a0]\n"
+ "fmla v9.8h, v7.8h, v0.h[6]\n"
+ "ldr x11, [x17, #0x1b8]\n"
+ "fmla v13.8h, v7.8h, v1.h[6]\n"
+ "fmla v17.8h, v7.8h, v2.h[6]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v21.8h, v7.8h, v3.h[6]\n"
+ "ldr d7, [x17, #0x1b0]\n"
+ "fmla v10.8h, v6.8h, v0.h[6]\n"
+ "ldr x10, [x17, #0x1c8]\n"
+ "fmla v14.8h, v6.8h, v1.h[6]\n"
+ "fmla v18.8h, v6.8h, v2.h[6]\n"
+ "mov v7.d[1], x11\n"
+ "fmla v22.8h, v6.8h, v3.h[6]\n"
+ "ldr d6, [x17, #0x1c0]\n"
+ "fmla v11.8h, v7.8h, v0.h[6]\n"
+ "ldr x11, [x17, #0x1d8]\n"
+ "fmla v15.8h, v7.8h, v1.h[6]\n"
+ "fmla v19.8h, v7.8h, v2.h[6]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v23.8h, v7.8h, v3.h[6]\n"
+ "ldr d7, [x17, #0x1d0]\n"
+ "fmla v8.8h, v6.8h, v0.h[7]\n"
+ "ldr x10, [x17, #0x1e8]\n"
+ "fmla v12.8h, v6.8h, v1.h[7]\n"
+ "fmla v16.8h, v6.8h, v2.h[7]\n"
+ "mov v7.d[1], x11\n"
+ "fmla v20.8h, v6.8h, v3.h[7]\n"
+ "ldr d6, [x17, #0x1e0]\n"
+ "fmla v9.8h, v7.8h, v0.h[7]\n"
+ "ldr x11, [x17, #0x1f8]\n"
+ "fmla v13.8h, v7.8h, v1.h[7]\n"
+ "fmla v17.8h, v7.8h, v2.h[7]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v21.8h, v7.8h, v3.h[7]\n"
+ "ldr d7, [x17, #0x1f0]\n"
+ "fmla v10.8h, v6.8h, v0.h[7]\n"
+ "add x17, x17, #0x200\n" // B pointer moves past the 32 vectors used this pass; later offsets address the next pass's data
+ "fmla v14.8h, v6.8h, v1.h[7]\n"
+ "ldr x10, [x17, #0x8]\n"
+ "fmla v18.8h, v6.8h, v2.h[7]\n"
+ "mov v7.d[1], x11\n"
+ "fmla v22.8h, v6.8h, v3.h[7]\n"
+ "ldr d6, [x17, #0x0]\n" // v6 = first B vector of the next pass (low half here, high half from x10 below)
+ "fmla v11.8h, v7.8h, v0.h[7]\n"
+ "ldr d0, [x12, #0x0]\n" // reload v0..v3 for the next pass: low halves from memory, high halves from x9/x27/x25/x23
+ "fmla v15.8h, v7.8h, v1.h[7]\n"
+ "ldr d1, [x28, #0x0]\n"
+ "fmla v19.8h, v7.8h, v2.h[7]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v23.8h, v7.8h, v3.h[7]\n"
+ "mov v0.d[1], x9\n"
+ "mov v1.d[1], x27\n"
+ "ldr d2, [x26, #0x0]\n"
+ "ldr d3, [x24, #0x0]\n"
+ "mov v2.d[1], x25\n"
+ "mov v3.d[1], x23\n"
+ "bge 173b\n" // at least 16 K elements remain: another looping pass; otherwise fall through to the single iteration
+ "174:" // Height 4: Multiply loop: Single iteration only (last full 8-element step; v0-v3 and v6 are already loaded and nothing is preloaded for a following pass)
+ "fmla v8.8h, v6.8h, v0.h[0]\n"
+ "ldr q7, [x17, #0x10]\n" // B vectors are fetched with whole q loads here, unlike the split loads of the looping body
+ "fmla v12.8h, v6.8h, v1.h[0]\n"
+ "sub x13, x13, #0x8\n" // x13 now holds the leftover (0..7) K elements for the odd block loop
+ "fmla v16.8h, v6.8h, v2.h[0]\n"
+ "add x12, x12, #0x10\n" // step the A row pointers past the 8 elements already held in v0..v3
+ "fmla v20.8h, v6.8h, v3.h[0]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "fmla v9.8h, v7.8h, v0.h[0]\n"
+ "ldr q6, [x17, #0x20]\n"
+ "fmla v13.8h, v7.8h, v1.h[0]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v17.8h, v7.8h, v2.h[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "fmla v21.8h, v7.8h, v3.h[0]\n"
+ "ldr q7, [x17, #0x30]\n"
+ "fmla v10.8h, v6.8h, v0.h[0]\n"
+ "add x26, x26, #0x10\n"
+ "fmla v14.8h, v6.8h, v1.h[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "fmla v18.8h, v6.8h, v2.h[0]\n"
+ "add x24, x24, #0x10\n"
+ "fmla v22.8h, v6.8h, v3.h[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "fmla v11.8h, v7.8h, v0.h[0]\n"
+ "ldr q6, [x17, #0x40]\n"
+ "fmla v15.8h, v7.8h, v1.h[0]\n"
+ "fmla v19.8h, v7.8h, v2.h[0]\n"
+ "fmla v23.8h, v7.8h, v3.h[0]\n"
+ "ldr q7, [x17, #0x50]\n"
+ "fmla v8.8h, v6.8h, v0.h[1]\n"
+ "fmla v12.8h, v6.8h, v1.h[1]\n"
+ "fmla v16.8h, v6.8h, v2.h[1]\n"
+ "fmla v20.8h, v6.8h, v3.h[1]\n"
+ "ldr q6, [x17, #0x60]\n"
+ "fmla v9.8h, v7.8h, v0.h[1]\n"
+ "fmla v13.8h, v7.8h, v1.h[1]\n"
+ "fmla v17.8h, v7.8h, v2.h[1]\n"
+ "fmla v21.8h, v7.8h, v3.h[1]\n"
+ "ldr q7, [x17, #0x70]\n"
+ "fmla v10.8h, v6.8h, v0.h[1]\n"
+ "fmla v14.8h, v6.8h, v1.h[1]\n"
+ "fmla v18.8h, v6.8h, v2.h[1]\n"
+ "fmla v22.8h, v6.8h, v3.h[1]\n"
+ "ldr q6, [x17, #0x80]\n"
+ "fmla v11.8h, v7.8h, v0.h[1]\n"
+ "fmla v15.8h, v7.8h, v1.h[1]\n"
+ "fmla v19.8h, v7.8h, v2.h[1]\n"
+ "fmla v23.8h, v7.8h, v3.h[1]\n"
+ "ldr q7, [x17, #0x90]\n"
+ "fmla v8.8h, v6.8h, v0.h[2]\n"
+ "fmla v12.8h, v6.8h, v1.h[2]\n"
+ "fmla v16.8h, v6.8h, v2.h[2]\n"
+ "fmla v20.8h, v6.8h, v3.h[2]\n"
+ "ldr q6, [x17, #0xa0]\n"
+ "fmla v9.8h, v7.8h, v0.h[2]\n"
+ "fmla v13.8h, v7.8h, v1.h[2]\n"
+ "fmla v17.8h, v7.8h, v2.h[2]\n"
+ "fmla v21.8h, v7.8h, v3.h[2]\n"
+ "ldr q7, [x17, #0xb0]\n"
+ "fmla v10.8h, v6.8h, v0.h[2]\n"
+ "fmla v14.8h, v6.8h, v1.h[2]\n"
+ "fmla v18.8h, v6.8h, v2.h[2]\n"
+ "fmla v22.8h, v6.8h, v3.h[2]\n"
+ "ldr q6, [x17, #0xc0]\n"
+ "fmla v11.8h, v7.8h, v0.h[2]\n"
+ "fmla v15.8h, v7.8h, v1.h[2]\n"
+ "fmla v19.8h, v7.8h, v2.h[2]\n"
+ "fmla v23.8h, v7.8h, v3.h[2]\n"
+ "ldr q7, [x17, #0xd0]\n"
+ "fmla v8.8h, v6.8h, v0.h[3]\n"
+ "fmla v12.8h, v6.8h, v1.h[3]\n"
+ "fmla v16.8h, v6.8h, v2.h[3]\n"
+ "fmla v20.8h, v6.8h, v3.h[3]\n"
+ "ldr q6, [x17, #0xe0]\n"
+ "fmla v9.8h, v7.8h, v0.h[3]\n"
+ "fmla v13.8h, v7.8h, v1.h[3]\n"
+ "fmla v17.8h, v7.8h, v2.h[3]\n"
+ "fmla v21.8h, v7.8h, v3.h[3]\n"
+ "ldr q7, [x17, #0xf0]\n"
+ "fmla v10.8h, v6.8h, v0.h[3]\n"
+ "fmla v14.8h, v6.8h, v1.h[3]\n"
+ "fmla v18.8h, v6.8h, v2.h[3]\n"
+ "fmla v22.8h, v6.8h, v3.h[3]\n"
+ "ldr q6, [x17, #0x100]\n"
+ "fmla v11.8h, v7.8h, v0.h[3]\n"
+ "fmla v15.8h, v7.8h, v1.h[3]\n"
+ "fmla v19.8h, v7.8h, v2.h[3]\n"
+ "fmla v23.8h, v7.8h, v3.h[3]\n"
+ "ldr q7, [x17, #0x110]\n"
+ "fmla v8.8h, v6.8h, v0.h[4]\n"
+ "fmla v12.8h, v6.8h, v1.h[4]\n"
+ "fmla v16.8h, v6.8h, v2.h[4]\n"
+ "fmla v20.8h, v6.8h, v3.h[4]\n"
+ "ldr q6, [x17, #0x120]\n"
+ "fmla v9.8h, v7.8h, v0.h[4]\n"
+ "fmla v13.8h, v7.8h, v1.h[4]\n"
+ "fmla v17.8h, v7.8h, v2.h[4]\n"
+ "fmla v21.8h, v7.8h, v3.h[4]\n"
+ "ldr q7, [x17, #0x130]\n"
+ "fmla v10.8h, v6.8h, v0.h[4]\n"
+ "fmla v14.8h, v6.8h, v1.h[4]\n"
+ "fmla v18.8h, v6.8h, v2.h[4]\n"
+ "fmla v22.8h, v6.8h, v3.h[4]\n"
+ "ldr q6, [x17, #0x140]\n"
+ "fmla v11.8h, v7.8h, v0.h[4]\n"
+ "fmla v15.8h, v7.8h, v1.h[4]\n"
+ "fmla v19.8h, v7.8h, v2.h[4]\n"
+ "fmla v23.8h, v7.8h, v3.h[4]\n"
+ "ldr q7, [x17, #0x150]\n"
+ "fmla v8.8h, v6.8h, v0.h[5]\n"
+ "fmla v12.8h, v6.8h, v1.h[5]\n"
+ "fmla v16.8h, v6.8h, v2.h[5]\n"
+ "fmla v20.8h, v6.8h, v3.h[5]\n"
+ "ldr q6, [x17, #0x160]\n"
+ "fmla v9.8h, v7.8h, v0.h[5]\n"
+ "fmla v13.8h, v7.8h, v1.h[5]\n"
+ "fmla v17.8h, v7.8h, v2.h[5]\n"
+ "fmla v21.8h, v7.8h, v3.h[5]\n"
+ "ldr q7, [x17, #0x170]\n"
+ "fmla v10.8h, v6.8h, v0.h[5]\n"
+ "fmla v14.8h, v6.8h, v1.h[5]\n"
+ "fmla v18.8h, v6.8h, v2.h[5]\n"
+ "fmla v22.8h, v6.8h, v3.h[5]\n"
+ "ldr q6, [x17, #0x180]\n"
+ "fmla v11.8h, v7.8h, v0.h[5]\n"
+ "fmla v15.8h, v7.8h, v1.h[5]\n"
+ "fmla v19.8h, v7.8h, v2.h[5]\n"
+ "fmla v23.8h, v7.8h, v3.h[5]\n"
+ "ldr q7, [x17, #0x190]\n"
+ "fmla v8.8h, v6.8h, v0.h[6]\n"
+ "fmla v12.8h, v6.8h, v1.h[6]\n"
+ "fmla v16.8h, v6.8h, v2.h[6]\n"
+ "fmla v20.8h, v6.8h, v3.h[6]\n"
+ "ldr q6, [x17, #0x1a0]\n"
+ "fmla v9.8h, v7.8h, v0.h[6]\n"
+ "fmla v13.8h, v7.8h, v1.h[6]\n"
+ "fmla v17.8h, v7.8h, v2.h[6]\n"
+ "fmla v21.8h, v7.8h, v3.h[6]\n"
+ "ldr q7, [x17, #0x1b0]\n"
+ "fmla v10.8h, v6.8h, v0.h[6]\n"
+ "fmla v14.8h, v6.8h, v1.h[6]\n"
+ "fmla v18.8h, v6.8h, v2.h[6]\n"
+ "fmla v22.8h, v6.8h, v3.h[6]\n"
+ "ldr q6, [x17, #0x1c0]\n"
+ "fmla v11.8h, v7.8h, v0.h[6]\n"
+ "fmla v15.8h, v7.8h, v1.h[6]\n"
+ "fmla v19.8h, v7.8h, v2.h[6]\n"
+ "fmla v23.8h, v7.8h, v3.h[6]\n"
+ "ldr q7, [x17, #0x1d0]\n"
+ "fmla v8.8h, v6.8h, v0.h[7]\n"
+ "fmla v12.8h, v6.8h, v1.h[7]\n"
+ "fmla v16.8h, v6.8h, v2.h[7]\n"
+ "fmla v20.8h, v6.8h, v3.h[7]\n"
+ "ldr q6, [x17, #0x1e0]\n"
+ "fmla v9.8h, v7.8h, v0.h[7]\n"
+ "fmla v13.8h, v7.8h, v1.h[7]\n"
+ "fmla v17.8h, v7.8h, v2.h[7]\n"
+ "fmla v21.8h, v7.8h, v3.h[7]\n"
+ "ldr q7, [x17, #0x1f0]\n"
+ "fmla v10.8h, v6.8h, v0.h[7]\n"
+ "add x17, x17, #0x200\n" // B pointer moves past the 32 vectors consumed by this step
+ "fmla v14.8h, v6.8h, v1.h[7]\n"
+ "fmla v18.8h, v6.8h, v2.h[7]\n"
+ "fmla v22.8h, v6.8h, v3.h[7]\n"
+ "fmla v11.8h, v7.8h, v0.h[7]\n"
+ "fmla v15.8h, v7.8h, v1.h[7]\n"
+ "fmla v19.8h, v7.8h, v2.h[7]\n"
+ "fmla v23.8h, v7.8h, v3.h[7]\n"
+ "175:" // Height 4: Multiply loop: Main loop skip
+ "cbz x13, 177f\n" // no leftover K elements in this string
+ "176:" // Height 4: Multiply loop: Odd block loop (one K element per pass)
+ "ldr h0, [x12], #0x2\n" // one fp16 A value per row, post-incrementing each row pointer by 2 bytes
+ "sub x13, x13, #0x1\n"
+ "ldr h1, [x28], #0x2\n"
+ "ldr h2, [x26], #0x2\n"
+ "ldr h3, [x24], #0x2\n"
+ "ldr q6, [x17, #0x0]\n" // four B vectors (0x40 bytes) are used per K element
+ "ldr q7, [x17, #0x10]\n"
+ "fmla v8.8h, v6.8h, v0.h[0]\n"
+ "fmla v12.8h, v6.8h, v1.h[0]\n"
+ "fmla v16.8h, v6.8h, v2.h[0]\n"
+ "fmla v20.8h, v6.8h, v3.h[0]\n"
+ "ldr q6, [x17, #0x20]\n"
+ "fmla v9.8h, v7.8h, v0.h[0]\n"
+ "fmla v13.8h, v7.8h, v1.h[0]\n"
+ "fmla v17.8h, v7.8h, v2.h[0]\n"
+ "fmla v21.8h, v7.8h, v3.h[0]\n"
+ "ldr q7, [x17, #0x30]\n"
+ "fmla v10.8h, v6.8h, v0.h[0]\n"
+ "add x17, x17, #0x40\n"
+ "fmla v14.8h, v6.8h, v1.h[0]\n"
+ "fmla v18.8h, v6.8h, v2.h[0]\n"
+ "fmla v22.8h, v6.8h, v3.h[0]\n"
+ "fmla v11.8h, v7.8h, v0.h[0]\n"
+ "fmla v15.8h, v7.8h, v1.h[0]\n"
+ "fmla v19.8h, v7.8h, v2.h[0]\n"
+ "fmla v23.8h, v7.8h, v3.h[0]\n"
+ "cbnz x13, 176b\n" // repeat until the string's remaining element count reaches zero
+ "177:" // Height 4: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x14, x14, #0x1\n"
+ "cmp x14, x19\n"
+ "bne 170b\n" // more strings to process: back to the top of the string loop
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "prfm pstl1keep, [x15, #0x0]\n"
+ "add x25, x15, x19, LSL #1\n" // rebuild the row 1..3 output pointers; x25/x24/x23 were reused by the multiply loops
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "add x24, x25, x19, LSL #1\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "add x23, x24, x19, LSL #1\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "tbz %x[flags], #1, 178f\n" // flags bit 1 clear: skip the min/max clamp
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v1.8h }, [x20]\n" // v1 = halfword at offset_min replicated to all 8 lanes (lower bound)
+ "ld1r { v0.8h }, [x19]\n" // v0 = halfword at offset_max replicated to all 8 lanes (upper bound)
+ "fmin v8.8h, v8.8h, v0.8h\n" // every accumulator v8-v23 gets fmin against v0 and fmax against v1
+ "fmin v9.8h, v9.8h, v0.8h\n"
+ "fmin v10.8h, v10.8h, v0.8h\n"
+ "fmin v11.8h, v11.8h, v0.8h\n"
+ "fmin v12.8h, v12.8h, v0.8h\n"
+ "fmin v13.8h, v13.8h, v0.8h\n"
+ "fmin v14.8h, v14.8h, v0.8h\n"
+ "fmin v15.8h, v15.8h, v0.8h\n"
+ "fmin v16.8h, v16.8h, v0.8h\n"
+ "fmin v17.8h, v17.8h, v0.8h\n"
+ "fmax v8.8h, v8.8h, v1.8h\n"
+ "fmax v9.8h, v9.8h, v1.8h\n"
+ "fmax v10.8h, v10.8h, v1.8h\n"
+ "fmax v11.8h, v11.8h, v1.8h\n"
+ "fmax v12.8h, v12.8h, v1.8h\n"
+ "fmax v13.8h, v13.8h, v1.8h\n"
+ "fmax v14.8h, v14.8h, v1.8h\n"
+ "fmax v15.8h, v15.8h, v1.8h\n"
+ "fmax v16.8h, v16.8h, v1.8h\n"
+ "fmax v17.8h, v17.8h, v1.8h\n"
+ "fmin v18.8h, v18.8h, v0.8h\n"
+ "fmin v19.8h, v19.8h, v0.8h\n"
+ "fmin v20.8h, v20.8h, v0.8h\n"
+ "fmin v21.8h, v21.8h, v0.8h\n"
+ "fmin v22.8h, v22.8h, v0.8h\n"
+ "fmin v23.8h, v23.8h, v0.8h\n"
+ "fmax v18.8h, v18.8h, v1.8h\n"
+ "fmax v19.8h, v19.8h, v1.8h\n"
+ "fmax v20.8h, v20.8h, v1.8h\n"
+ "fmax v21.8h, v21.8h, v1.8h\n"
+ "fmax v22.8h, v22.8h, v1.8h\n"
+ "fmax v23.8h, v23.8h, v1.8h\n"
+ "178:" // Height 4: No activation
+ "cmp x8, #0x20\n"
+ "bge 195f\n" // 0x20 or more columns left: store whole rows with plain q stores
+ "tbz x8, #4, 186f\n" // below here x8 < 0x20; its bits 4..0 select 16/8/4/2/1-element pieces to store per row
+ "st1 { v8.8h }, [x15], #0x10\n"
+ "st1 { v9.8h }, [x15], #0x10\n"
+ "st1 { v12.8h }, [x25], #0x10\n"
+ "st1 { v13.8h }, [x25], #0x10\n"
+ "st1 { v16.8h }, [x24], #0x10\n"
+ "st1 { v17.8h }, [x24], #0x10\n"
+ "st1 { v20.8h }, [x23], #0x10\n"
+ "st1 { v21.8h }, [x23], #0x10\n"
+ "tbz x8, #3, 182f\n"
+ "st1 { v10.8h }, [x15], #0x10\n"
+ "st1 { v14.8h }, [x25], #0x10\n"
+ "st1 { v18.8h }, [x24], #0x10\n"
+ "st1 { v22.8h }, [x23], #0x10\n"
+ "tbz x8, #2, 180f\n"
+ "str d11, [x15], #0x8\n"
+ "str d15, [x25], #0x8\n"
+ "str d19, [x24], #0x8\n"
+ "str d23, [x23], #0x8\n"
+ "tbz x8, #1, 179f\n"
+ "st1 { v11.s }[2], [x15], #0x4\n"
+ "st1 { v15.s }[2], [x25], #0x4\n"
+ "st1 { v19.s }[2], [x24], #0x4\n"
+ "st1 { v23.s }[2], [x23], #0x4\n"
+ "tbz x8, #0, 194f\n"
+ "st1 { v11.h }[6], [x15]\n"
+ "st1 { v15.h }[6], [x25]\n"
+ "st1 { v19.h }[6], [x24]\n"
+ "st1 { v23.h }[6], [x23]\n"
+ "b 194f\n"
+ "179:" // Height 4: Partial direct writeback: partial_1_28
+ "tbz x8, #0, 194f\n"
+ "st1 { v11.h }[4], [x15]\n"
+ "st1 { v15.h }[4], [x25]\n"
+ "st1 { v19.h }[4], [x24]\n"
+ "st1 { v23.h }[4], [x23]\n"
+ "b 194f\n"
+ "180:" // Height 4: Partial direct writeback: partial_2_24
+ "tbz x8, #1, 181f\n"
+ "str s11, [x15], #0x4\n"
+ "str s15, [x25], #0x4\n"
+ "str s19, [x24], #0x4\n"
+ "str s23, [x23], #0x4\n"
+ "tbz x8, #0, 194f\n"
+ "st1 { v11.h }[2], [x15]\n"
+ "st1 { v15.h }[2], [x25]\n"
+ "st1 { v19.h }[2], [x24]\n"
+ "st1 { v23.h }[2], [x23]\n"
+ "b 194f\n"
+ "181:" // Height 4: Partial direct writeback: partial_1_24
+ "tbz x8, #0, 194f\n"
+ "str h11, [x15, #0x0]\n"
+ "str h15, [x25, #0x0]\n"
+ "str h19, [x24, #0x0]\n"
+ "str h23, [x23, #0x0]\n"
+ "b 194f\n"
+ "182:" // Height 4: Partial direct writeback: partial_4_16
+ "tbz x8, #2, 184f\n"
+ "str d10, [x15], #0x8\n"
+ "str d14, [x25], #0x8\n"
+ "str d18, [x24], #0x8\n"
+ "str d22, [x23], #0x8\n"
+ "tbz x8, #1, 183f\n"
+ "st1 { v10.s }[2], [x15], #0x4\n"
+ "st1 { v14.s }[2], [x25], #0x4\n"
+ "st1 { v18.s }[2], [x24], #0x4\n"
+ "st1 { v22.s }[2], [x23], #0x4\n"
+ "tbz x8, #0, 194f\n"
+ "st1 { v10.h }[6], [x15]\n"
+ "st1 { v14.h }[6], [x25]\n"
+ "st1 { v18.h }[6], [x24]\n"
+ "st1 { v22.h }[6], [x23]\n"
+ "b 194f\n"
+ "183:" // Height 4: Partial direct writeback: partial_1_20
+ "tbz x8, #0, 194f\n"
+ "st1 { v10.h }[4], [x15]\n"
+ "st1 { v14.h }[4], [x25]\n"
+ "st1 { v18.h }[4], [x24]\n"
+ "st1 { v22.h }[4], [x23]\n"
+ "b 194f\n"
+ "184:" // Height 4: Partial direct writeback: partial_2_16
+ "tbz x8, #1, 185f\n"
+ "str s10, [x15], #0x4\n"
+ "str s14, [x25], #0x4\n"
+ "str s18, [x24], #0x4\n"
+ "str s22, [x23], #0x4\n"
+ "tbz x8, #0, 194f\n"
+ "st1 { v10.h }[2], [x15]\n"
+ "st1 { v14.h }[2], [x25]\n"
+ "st1 { v18.h }[2], [x24]\n"
+ "st1 { v22.h }[2], [x23]\n"
+ "b 194f\n"
+ "185:" // Height 4: Partial direct writeback: partial_1_16
+ "tbz x8, #0, 194f\n"
+ "str h10, [x15, #0x0]\n"
+ "str h14, [x25, #0x0]\n"
+ "str h18, [x24, #0x0]\n"
+ "str h22, [x23, #0x0]\n"
+ "b 194f\n"
+ "186:" // Height 4: Partial direct writeback: partial_8_0
+ "tbz x8, #3, 190f\n"
+ "st1 { v8.8h }, [x15], #0x10\n"
+ "st1 { v12.8h }, [x25], #0x10\n"
+ "st1 { v16.8h }, [x24], #0x10\n"
+ "st1 { v20.8h }, [x23], #0x10\n"
+ "tbz x8, #2, 188f\n"
+ "str d9, [x15], #0x8\n"
+ "str d13, [x25], #0x8\n"
+ "str d17, [x24], #0x8\n"
+ "str d21, [x23], #0x8\n"
+ "tbz x8, #1, 187f\n"
+ "st1 { v9.s }[2], [x15], #0x4\n"
+ "st1 { v13.s }[2], [x25], #0x4\n"
+ "st1 { v17.s }[2], [x24], #0x4\n"
+ "st1 { v21.s }[2], [x23], #0x4\n"
+ "tbz x8, #0, 194f\n"
+ "st1 { v9.h }[6], [x15]\n"
+ "st1 { v13.h }[6], [x25]\n"
+ "st1 { v17.h }[6], [x24]\n"
+ "st1 { v21.h }[6], [x23]\n"
+ "b 194f\n"
+ "187:" // Height 4: Partial direct writeback: partial_1_12
+ "tbz x8, #0, 194f\n"
+ "st1 { v9.h }[4], [x15]\n"
+ "st1 { v13.h }[4], [x25]\n"
+ "st1 { v17.h }[4], [x24]\n"
+ "st1 { v21.h }[4], [x23]\n"
+ "b 194f\n"
+ "188:" // Height 4: Partial direct writeback: partial_2_8
+ "tbz x8, #1, 189f\n"
+ "str s9, [x15], #0x4\n"
+ "str s13, [x25], #0x4\n"
+ "str s17, [x24], #0x4\n"
+ "str s21, [x23], #0x4\n"
+ "tbz x8, #0, 194f\n"
+ "st1 { v9.h }[2], [x15]\n"
+ "st1 { v13.h }[2], [x25]\n"
+ "st1 { v17.h }[2], [x24]\n"
+ "st1 { v21.h }[2], [x23]\n"
+ "b 194f\n"
+ "189:" // Height 4: Partial direct writeback: partial_1_8
+ "tbz x8, #0, 194f\n"
+ "str h9, [x15, #0x0]\n"
+ "str h13, [x25, #0x0]\n"
+ "str h17, [x24, #0x0]\n"
+ "str h21, [x23, #0x0]\n"
+ "b 194f\n"
+ "190:" // Height 4: Partial direct writeback: partial_4_0
+ "tbz x8, #2, 192f\n"
+ "str d8, [x15], #0x8\n"
+ "str d12, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
+ "str d20, [x23], #0x8\n"
+ "tbz x8, #1, 191f\n"
+ "st1 { v8.s }[2], [x15], #0x4\n"
+ "st1 { v12.s }[2], [x25], #0x4\n"
+ "st1 { v16.s }[2], [x24], #0x4\n"
+ "st1 { v20.s }[2], [x23], #0x4\n"
+ "tbz x8, #0, 194f\n"
+ "st1 { v8.h }[6], [x15]\n"
+ "st1 { v12.h }[6], [x25]\n"
+ "st1 { v16.h }[6], [x24]\n"
+ "st1 { v20.h }[6], [x23]\n"
+ "b 194f\n"
+ "191:" // Height 4: Partial direct writeback: partial_1_4
+ "tbz x8, #0, 194f\n"
+ "st1 { v8.h }[4], [x15]\n"
+ "st1 { v12.h }[4], [x25]\n"
+ "st1 { v16.h }[4], [x24]\n"
+ "st1 { v20.h }[4], [x23]\n"
+ "b 194f\n"
+ "192:" // Height 4: Partial direct writeback: partial_2_0
+ "tbz x8, #1, 193f\n"
+ "str s8, [x15], #0x4\n"
+ "str s12, [x25], #0x4\n"
+ "str s16, [x24], #0x4\n"
+ "str s20, [x23], #0x4\n"
+ "tbz x8, #0, 194f\n"
+ "st1 { v8.h }[2], [x15]\n"
+ "st1 { v12.h }[2], [x25]\n"
+ "st1 { v16.h }[2], [x24]\n"
+ "st1 { v20.h }[2], [x23]\n"
+ "b 194f\n"
+ "193:" // Height 4: Partial direct writeback: partial_1_0 (bits 4..1 all clear, so the single element is stored without testing bit 0)
+ "str h8, [x15, #0x0]\n"
+ "str h12, [x25, #0x0]\n"
+ "str h16, [x24, #0x0]\n"
+ "str h20, [x23, #0x0]\n"
+ "194:" // Height 4: Partial direct writeback: Done
+ "b 196f\n" // skip the full-row stores; x15 is left post-incremented, and since x8 < 0x20 here the subs/bgt at 196 ends the column loop
+ "195:" // Height 4: Full writeback (x8 >= 0x20): store 0x40 bytes to each of the four output rows
+ "str q8, [x15, #0x0]\n"
+ "str q9, [x15, #0x10]\n"
+ "str q10, [x15, #0x20]\n"
+ "str q11, [x15, #0x30]\n"
+ "add x15, x15, #0x40\n" // only the row-0 pointer is advanced; x25/x24/x23 are rebuilt from x15 on the next column pass
+ "str q12, [x25, #0x0]\n"
+ "str q13, [x25, #0x10]\n"
+ "str q14, [x25, #0x20]\n"
+ "str q15, [x25, #0x30]\n"
+ "str q16, [x24, #0x0]\n"
+ "str q17, [x24, #0x10]\n"
+ "str q18, [x24, #0x20]\n"
+ "str q19, [x24, #0x30]\n"
+ "str q20, [x23, #0x0]\n"
+ "str q21, [x23, #0x10]\n"
+ "str q22, [x23, #0x20]\n"
+ "str q23, [x23, #0x30]\n"
+ "196:" // Height 4: Writeback done
+ "subs x8, x8, #0x20\n" // 0x20 columns accounted for by this pass
+ "bgt 149b\n" // columns remain: run the column loop again
+ "b 296f\n" // otherwise leave this variant via label 296 (defined outside this chunk)
+ "197:" // Height 5: entry - load the working registers used by the 5-row path
+ "ldr x8, [%x[args_ptr], %[offsetof_N]]\n" // x8 = N field of the args block, compared with 0x20 and bit-tested below to pick full or partial column handling
+ "mov x16, %x[bias]\n" // x16 = bias pointer, tested for null at label 198
+ "ldr x17, [%x[args_ptr], %[offsetof_B_ptr]]\n" // x17 = B_ptr field, the base for the v6/v7 loads in the multiply loops
+ "mov x15, %x[output_ptr]\n" // x15 = output pointer for row 0
+ "198:" // Height 5: Column loop
+ "cbz x16, 199f\n" // null bias pointer: go to the no-bias path
+ "ldr q8, [x16, #0x0]\n" // bias bytes 0x00-0x2f -> v8-v10 (row 0 accumulators)
+ "ldr q9, [x16, #0x10]\n"
+ "ldr q10, [x16, #0x20]\n"
+ "mov v12.16b, v8.16b\n" // copy the same bias into the matching accumulators of rows 1-4 (v12.., v16.., v20.., v24..)
+ "mov v16.16b, v8.16b\n"
+ "mov v13.16b, v9.16b\n"
+ "mov v17.16b, v9.16b\n"
+ "mov v14.16b, v10.16b\n"
+ "mov v18.16b, v10.16b\n"
+ "mov v20.16b, v8.16b\n"
+ "mov v21.16b, v9.16b\n"
+ "mov v22.16b, v10.16b\n"
+ "mov v24.16b, v8.16b\n"
+ "mov v25.16b, v9.16b\n"
+ "mov v26.16b, v10.16b\n"
+ "ldr q11, [x16, #0x30]\n" // last 16 bias bytes -> v11
+ "add x16, x16, #0x40\n" // advance the bias pointer past the 64 bytes consumed
+ "mov v15.16b, v11.16b\n" // replicate v11 into v15, v19, v23, v27
+ "mov v19.16b, v11.16b\n"
+ "mov v23.16b, v11.16b\n"
+ "mov v27.16b, v11.16b\n"
+ "b 218f\n" // accumulators initialised: skip the accumulate and zero-init paths
+ "199:" // Height 5: no bias
+ "tbz %x[flags], #0, 217f\n" // flags bit 0 clear: do not read the existing output, zero the accumulators at label 217 instead
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" // x19 = output_offset field
+ "cmp x8, #0x20\n" // flags are consumed by the bge below (the adds in between do not set flags)
+ "add x25, x15, x19, LSL #1\n" // row pointers x25, x24, x23, x22: each is output_offset * 2 bytes after the previous row
+ "add x24, x25, x19, LSL #1\n"
+ "add x23, x24, x19, LSL #1\n"
+ "add x22, x23, x19, LSL #1\n"
+ "bge 216f\n" // x8 at least 0x20: full-width load at label 216, otherwise fall through to the bit-tested partial loads
+ "tbz x8, #4, 207f\n"
+ "ld1 { v8.8h }, [x15], #0x10\n"
+ "ld1 { v12.8h }, [x25], #0x10\n"
+ "ld1 { v16.8h }, [x24], #0x10\n"
+ "ld1 { v9.8h }, [x15], #0x10\n"
+ "ld1 { v13.8h }, [x25], #0x10\n"
+ "ld1 { v17.8h }, [x24], #0x10\n"
+ "ld1 { v20.8h }, [x23], #0x10\n"
+ "ld1 { v24.8h }, [x22], #0x10\n"
+ "ld1 { v21.8h }, [x23], #0x10\n"
+ "ld1 { v25.8h }, [x22], #0x10\n"
+ "tbz x8, #3, 203f\n"
+ "ld1 { v10.8h }, [x15], #0x10\n"
+ "ld1 { v14.8h }, [x25], #0x10\n"
+ "ld1 { v18.8h }, [x24], #0x10\n"
+ "ld1 { v22.8h }, [x23], #0x10\n"
+ "ld1 { v26.8h }, [x22], #0x10\n"
+ "tbz x8, #2, 201f\n"
+ "ldr d11, [x15], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
+ "ldr d19, [x24], #0x8\n"
+ "ldr d23, [x23], #0x8\n"
+ "ldr d27, [x22], #0x8\n"
+ "tbz x8, #1, 200f\n"
+ "ld1 { v11.s }[2], [x15], #0x4\n"
+ "mov x19, #0x3c\n"
+ "ld1 { v15.s }[2], [x25], #0x4\n"
+ "ld1 { v19.s }[2], [x24], #0x4\n"
+ "ld1 { v23.s }[2], [x23], #0x4\n"
+ "ld1 { v27.s }[2], [x22], #0x4\n"
+ "tbz x8, #0, 215f\n"
+ "ld1 { v11.h }[6], [x15]\n"
+ "ld1 { v15.h }[6], [x25]\n"
+ "ld1 { v19.h }[6], [x24]\n"
+ "ld1 { v23.h }[6], [x23]\n"
+ "ld1 { v27.h }[6], [x22]\n"
+ "b 215f\n"
+ "200:" // Height 5: Partial accumulate: partial_1_28
+ "mov x19, #0x38\n"
+ "tbz x8, #0, 215f\n"
+ "ld1 { v11.h }[4], [x15]\n"
+ "ld1 { v15.h }[4], [x25]\n"
+ "ld1 { v19.h }[4], [x24]\n"
+ "ld1 { v23.h }[4], [x23]\n"
+ "ld1 { v27.h }[4], [x22]\n"
+ "b 215f\n"
+ "201:" // Height 5: Partial accumulate: partial_2_24
+ "tbz x8, #1, 202f\n"
+ "ldr s11, [x15], #0x4\n"
+ "ldr s15, [x25], #0x4\n"
+ "mov x19, #0x34\n"
+ "ldr s19, [x24], #0x4\n"
+ "ldr s23, [x23], #0x4\n"
+ "ldr s27, [x22], #0x4\n"
+ "tbz x8, #0, 215f\n"
+ "ld1 { v11.h }[2], [x15]\n"
+ "ld1 { v15.h }[2], [x25]\n"
+ "ld1 { v19.h }[2], [x24]\n"
+ "ld1 { v23.h }[2], [x23]\n"
+ "ld1 { v27.h }[2], [x22]\n"
+ "b 215f\n"
+ "202:" // Height 5: Partial accumulate: partial_1_24
+ "mov x19, #0x30\n"
+ "tbz x8, #0, 215f\n"
+ "ldr h11, [x15, #0x0]\n"
+ "ldr h15, [x25, #0x0]\n"
+ "ldr h19, [x24, #0x0]\n"
+ "ldr h23, [x23, #0x0]\n"
+ "ldr h27, [x22, #0x0]\n"
+ "b 215f\n"
+ "203:" // Height 5: Partial accumulate: partial_4_16
+ "tbz x8, #2, 205f\n"
+ "ldr d10, [x15], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
+ "ldr d18, [x24], #0x8\n"
+ "ldr d22, [x23], #0x8\n"
+ "ldr d26, [x22], #0x8\n"
+ "tbz x8, #1, 204f\n"
+ "ld1 { v10.s }[2], [x15], #0x4\n"
+ "mov x19, #0x2c\n"
+ "ld1 { v14.s }[2], [x25], #0x4\n"
+ "ld1 { v18.s }[2], [x24], #0x4\n"
+ "ld1 { v22.s }[2], [x23], #0x4\n"
+ "ld1 { v26.s }[2], [x22], #0x4\n"
+ "tbz x8, #0, 215f\n"
+ "ld1 { v10.h }[6], [x15]\n"
+ "ld1 { v14.h }[6], [x25]\n"
+ "ld1 { v18.h }[6], [x24]\n"
+ "ld1 { v22.h }[6], [x23]\n"
+ "ld1 { v26.h }[6], [x22]\n"
+ "b 215f\n"
+ "204:" // Height 5: Partial accumulate: partial_1_20
+ "mov x19, #0x28\n"
+ "tbz x8, #0, 215f\n"
+ "ld1 { v10.h }[4], [x15]\n"
+ "ld1 { v14.h }[4], [x25]\n"
+ "ld1 { v18.h }[4], [x24]\n"
+ "ld1 { v22.h }[4], [x23]\n"
+ "ld1 { v26.h }[4], [x22]\n"
+ "b 215f\n"
+ "205:" // Height 5: Partial accumulate: partial_2_16
+ "tbz x8, #1, 206f\n"
+ "ldr s10, [x15], #0x4\n"
+ "ldr s14, [x25], #0x4\n"
+ "mov x19, #0x24\n"
+ "ldr s18, [x24], #0x4\n"
+ "ldr s22, [x23], #0x4\n"
+ "ldr s26, [x22], #0x4\n"
+ "tbz x8, #0, 215f\n"
+ "ld1 { v10.h }[2], [x15]\n"
+ "ld1 { v14.h }[2], [x25]\n"
+ "ld1 { v18.h }[2], [x24]\n"
+ "ld1 { v22.h }[2], [x23]\n"
+ "ld1 { v26.h }[2], [x22]\n"
+ "b 215f\n"
+ "206:" // Height 5: Partial accumulate: partial_1_16
+ "mov x19, #0x20\n"
+ "tbz x8, #0, 215f\n"
+ "ldr h10, [x15, #0x0]\n"
+ "ldr h14, [x25, #0x0]\n"
+ "ldr h18, [x24, #0x0]\n"
+ "ldr h22, [x23, #0x0]\n"
+ "ldr h26, [x22, #0x0]\n"
+ "b 215f\n"
+ "207:" // Height 5: Partial accumulate: partial_8_0
+ "tbz x8, #3, 211f\n"
+ "ld1 { v8.8h }, [x15], #0x10\n"
+ "ld1 { v12.8h }, [x25], #0x10\n"
+ "ld1 { v16.8h }, [x24], #0x10\n"
+ "ld1 { v20.8h }, [x23], #0x10\n"
+ "ld1 { v24.8h }, [x22], #0x10\n"
+ "tbz x8, #2, 209f\n"
+ "ldr d9, [x15], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
+ "ldr d17, [x24], #0x8\n"
+ "ldr d21, [x23], #0x8\n"
+ "ldr d25, [x22], #0x8\n"
+ "tbz x8, #1, 208f\n"
+ "ld1 { v9.s }[2], [x15], #0x4\n"
+ "mov x19, #0x1c\n"
+ "ld1 { v13.s }[2], [x25], #0x4\n"
+ "ld1 { v17.s }[2], [x24], #0x4\n"
+ "ld1 { v21.s }[2], [x23], #0x4\n"
+ "ld1 { v25.s }[2], [x22], #0x4\n"
+ "tbz x8, #0, 215f\n"
+ "ld1 { v9.h }[6], [x15]\n"
+ "ld1 { v13.h }[6], [x25]\n"
+ "ld1 { v17.h }[6], [x24]\n"
+ "ld1 { v21.h }[6], [x23]\n"
+ "ld1 { v25.h }[6], [x22]\n"
+ "b 215f\n"
+ "208:" // Height 5: Partial accumulate: partial_1_12
+ "mov x19, #0x18\n"
+ "tbz x8, #0, 215f\n"
+ "ld1 { v9.h }[4], [x15]\n"
+ "ld1 { v13.h }[4], [x25]\n"
+ "ld1 { v17.h }[4], [x24]\n"
+ "ld1 { v21.h }[4], [x23]\n"
+ "ld1 { v25.h }[4], [x22]\n"
+ "b 215f\n"
+ "209:" // Height 5: Partial accumulate: partial_2_8
+ "tbz x8, #1, 210f\n"
+ "ldr s9, [x15], #0x4\n"
+ "ldr s13, [x25], #0x4\n"
+ "mov x19, #0x14\n"
+ "ldr s17, [x24], #0x4\n"
+ "ldr s21, [x23], #0x4\n"
+ "ldr s25, [x22], #0x4\n"
+ "tbz x8, #0, 215f\n"
+ "ld1 { v9.h }[2], [x15]\n"
+ "ld1 { v13.h }[2], [x25]\n"
+ "ld1 { v17.h }[2], [x24]\n"
+ "ld1 { v21.h }[2], [x23]\n"
+ "ld1 { v25.h }[2], [x22]\n"
+ "b 215f\n"
+ "210:" // Height 5: Partial accumulate: partial_1_8
+ "mov x19, #0x10\n"
+ "tbz x8, #0, 215f\n"
+ "ldr h9, [x15, #0x0]\n"
+ "ldr h13, [x25, #0x0]\n"
+ "ldr h17, [x24, #0x0]\n"
+ "ldr h21, [x23, #0x0]\n"
+ "ldr h25, [x22, #0x0]\n"
+ "b 215f\n"
+ "211:" // Height 5: Partial accumulate: partial_4_0
+ "tbz x8, #2, 213f\n"
+ "ldr d8, [x15], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
+ "ldr d16, [x24], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
+ "ldr d24, [x22], #0x8\n"
+ "tbz x8, #1, 212f\n"
+ "ld1 { v8.s }[2], [x15], #0x4\n"
+ "mov x19, #0xc\n"
+ "ld1 { v12.s }[2], [x25], #0x4\n"
+ "ld1 { v16.s }[2], [x24], #0x4\n"
+ "ld1 { v20.s }[2], [x23], #0x4\n"
+ "ld1 { v24.s }[2], [x22], #0x4\n"
+ "tbz x8, #0, 215f\n"
+ "ld1 { v8.h }[6], [x15]\n"
+ "ld1 { v12.h }[6], [x25]\n"
+ "ld1 { v16.h }[6], [x24]\n"
+ "ld1 { v20.h }[6], [x23]\n"
+ "ld1 { v24.h }[6], [x22]\n"
+ "b 215f\n"
+ "212:" // Height 5: Partial accumulate: partial_1_4
+ "mov x19, #0x8\n"
+ "tbz x8, #0, 215f\n"
+ "ld1 { v8.h }[4], [x15]\n"
+ "ld1 { v12.h }[4], [x25]\n"
+ "ld1 { v16.h }[4], [x24]\n"
+ "ld1 { v20.h }[4], [x23]\n"
+ "ld1 { v24.h }[4], [x22]\n"
+ "b 215f\n"
+ "213:" // Height 5: Partial accumulate: partial_2_0
+ "tbz x8, #1, 214f\n"
+ "ldr s8, [x15], #0x4\n"
+ "ldr s12, [x25], #0x4\n"
+ "mov x19, #0x4\n"
+ "ldr s16, [x24], #0x4\n"
+ "ldr s20, [x23], #0x4\n"
+ "ldr s24, [x22], #0x4\n"
+ "tbz x8, #0, 215f\n"
+ "ld1 { v8.h }[2], [x15]\n"
+ "ld1 { v12.h }[2], [x25]\n"
+ "ld1 { v16.h }[2], [x24]\n"
+ "ld1 { v20.h }[2], [x23]\n"
+ "ld1 { v24.h }[2], [x22]\n"
+ "b 215f\n"
+ "214:" // Height 5: Partial accumulate: partial_1_0
+ "ldr h8, [x15, #0x0]\n"
+ "mov x19, #0x0\n"
+ "ldr h12, [x25, #0x0]\n"
+ "ldr h16, [x24, #0x0]\n"
+ "ldr h20, [x23, #0x0]\n"
+ "ldr h24, [x22, #0x0]\n"
+ "215:" // Height 5: Partial accumulate: Done
+ "sub x15, x15, x19\n" // rewind x15 by the byte count each partial path left in x19, undoing the post-increments of the partial loads (only x15 is rewound here)
+ "b 218f\n" // skip the full-accumulate and zero-init paths
+ "216:" // Height 5: full accumulate
+ "ldr q8, [x15, #0x0]\n"
+ "ldr q9, [x15, #0x10]\n"
+ "ldr q10, [x15, #0x20]\n"
+ "ldr q11, [x15, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
+ "ldr q16, [x24, #0x0]\n"
+ "ldr q17, [x24, #0x10]\n"
+ "ldr q18, [x24, #0x20]\n"
+ "ldr q19, [x24, #0x30]\n"
+ "ldr q20, [x23, #0x0]\n"
+ "ldr q21, [x23, #0x10]\n"
+ "ldr q22, [x23, #0x20]\n"
+ "ldr q23, [x23, #0x30]\n"
+ "ldr q24, [x22, #0x0]\n"
+ "ldr q25, [x22, #0x10]\n"
+ "ldr q26, [x22, #0x20]\n"
+ "ldr q27, [x22, #0x30]\n"
+ "b 218f\n"
+ "217:" // Height 5: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "movi v20.16b, #0x0\n"
+ "movi v21.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v26.16b, #0x0\n"
+ "movi v27.16b, #0x0\n"
+ "218:" // Height 5: setup done
+ "mov x14, #0x0\n" // x14 = string index, starts at 0
+ "219:" // Height 5: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" // x20 = string_lengths field (pointer to 4-byte entries, see the indexed load below)
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" // x19 = input_offset field
+ "ldr w13, [x20, x14, LSL #0x2]\n" // x13 = 32-bit length entry for string x14, counted down by the multiply loops
+ "tbz %x[flags], #3, 220f\n" // flags bit 3 clear: direct input setup at label 220, set: fetch row pointers from a table
+ "ldr x20, [%x[input_ptr], x14, LSL #0x3]\n" // x20 = 8-byte entry number x14 of the table at input_ptr
+ "add x20, x20, x19, LSL #3\n" // skip input_offset 8-byte entries
+ "ldr x12, [x20, #0x0]\n" // five consecutive row pointers -> x12, x28, x26, x24, x22
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "cbnz x14, 221f\n" // the initial column offset is applied for string 0 only
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" // x19 = input_initial_col field
+ "add x12, x12, x19, LSL #1\n" // advance every row pointer by input_initial_col * 2 bytes
+ "add x28, x28, x19, LSL #1\n"
+ "add x26, x26, x19, LSL #1\n"
+ "add x24, x24, x19, LSL #1\n"
+ "add x22, x22, x19, LSL #1\n"
+ "b 221f\n"
+ "220:" // Height 5: setup direct input
+ "mov x12, %x[input_ptr]\n" // row 0 starts at input_ptr
+ "add x28, x12, x19, LSL #1\n" // each following row is input_offset * 2 bytes after the previous one
+ "add x26, x28, x19, LSL #1\n"
+ "add x24, x26, x19, LSL #1\n"
+ "add x22, x24, x19, LSL #1\n"
+ "221:" // Height 5: input setup done
+ "cmp x13, #0x8\n"
+ "blt 224f\n" // fewer than 8 steps left in x13: skip the 8-step blocks and go to the odd loop check
+ "ldr q0, [x12, #0x0]\n" // preload 16 bytes from each of the five input rows into v0-v4
+ "ldr q1, [x28, #0x0]\n"
+ "cmp x13, #0x10\n" // flags are consumed by the blt 223f below (the loads in between do not set flags)
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q4, [x22, #0x0]\n"
+ "ldr q6, [x17, #0x0]\n" // preload the first 16 bytes of B into v6
+ "blt 223f\n" // fewer than 16 steps left: run the single 8-step iteration only, otherwise fall into the main loop
+ "222:" // Height 5: Multiply loop: Main loop head
+ "fmla v8.8h, v6.8h, v0.h[0]\n"
+ "ldr d7, [x17, #0x10]\n"
+ "fmla v12.8h, v6.8h, v1.h[0]\n"
+ "ldr x11, [x17, #0x18]\n"
+ "fmla v16.8h, v6.8h, v2.h[0]\n"
+ "ldr x10, [x17, #0x28]\n"
+ "fmla v20.8h, v6.8h, v3.h[0]\n"
+ "add x12, x12, #0x10\n"
+ "fmla v24.8h, v6.8h, v4.h[0]\n"
+ "mov v7.d[1], x11\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v9.8h, v7.8h, v0.h[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "fmla v13.8h, v7.8h, v1.h[0]\n"
+ "ldr d6, [x17, #0x20]\n"
+ "fmla v17.8h, v7.8h, v2.h[0]\n"
+ "ldr x11, [x17, #0x38]\n"
+ "fmla v21.8h, v7.8h, v3.h[0]\n"
+ "ldr x9, [x12, #0x8]\n"
+ "fmla v25.8h, v7.8h, v4.h[0]\n"
+ "mov v6.d[1], x10\n"
+ "ldr d7, [x17, #0x30]\n"
+ "add x26, x26, #0x10\n"
+ "fmla v10.8h, v6.8h, v0.h[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "fmla v14.8h, v6.8h, v1.h[0]\n"
+ "ldr x10, [x17, #0x48]\n"
+ "fmla v18.8h, v6.8h, v2.h[0]\n"
+ "mov v7.d[1], x11\n"
+ "fmla v22.8h, v6.8h, v3.h[0]\n"
+ "ldr x11, [x17, #0x58]\n"
+ "fmla v26.8h, v6.8h, v4.h[0]\n"
+ "ldr d6, [x17, #0x40]\n"
+ "fmla v11.8h, v7.8h, v0.h[0]\n"
+ "ldr x27, [x28, #0x8]\n"
+ "fmla v15.8h, v7.8h, v1.h[0]\n"
+ "ldr x25, [x26, #0x8]\n"
+ "fmla v19.8h, v7.8h, v2.h[0]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v23.8h, v7.8h, v3.h[0]\n"
+ "ldr x10, [x17, #0x68]\n"
+ "fmla v27.8h, v7.8h, v4.h[0]\n"
+ "ldr d7, [x17, #0x50]\n"
+ "fmla v8.8h, v6.8h, v0.h[1]\n"
+ "add x24, x24, #0x10\n"
+ "fmla v12.8h, v6.8h, v1.h[1]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "fmla v16.8h, v6.8h, v2.h[1]\n"
+ "mov v7.d[1], x11\n"
+ "fmla v20.8h, v6.8h, v3.h[1]\n"
+ "ldr x11, [x17, #0x78]\n"
+ "fmla v24.8h, v6.8h, v4.h[1]\n"
+ "ldr d6, [x17, #0x60]\n"
+ "fmla v9.8h, v7.8h, v0.h[1]\n"
+ "ldr x23, [x24, #0x8]\n"
+ "fmla v13.8h, v7.8h, v1.h[1]\n"
+ "add x22, x22, #0x10\n"
+ "fmla v17.8h, v7.8h, v2.h[1]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v21.8h, v7.8h, v3.h[1]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "fmla v25.8h, v7.8h, v4.h[1]\n"
+ "ldr d7, [x17, #0x70]\n"
+ "fmla v10.8h, v6.8h, v0.h[1]\n"
+ "ldr x10, [x17, #0x88]\n"
+ "fmla v14.8h, v6.8h, v1.h[1]\n"
+ "ldr x21, [x22, #0x8]\n"
+ "fmla v18.8h, v6.8h, v2.h[1]\n"
+ "mov v7.d[1], x11\n"
+ "fmla v22.8h, v6.8h, v3.h[1]\n"
+ "ldr x11, [x17, #0x98]\n"
+ "fmla v26.8h, v6.8h, v4.h[1]\n"
+ "ldr d6, [x17, #0x80]\n"
+ "fmla v11.8h, v7.8h, v0.h[1]\n"
+ "sub x13, x13, #0x8\n"
+ "fmla v15.8h, v7.8h, v1.h[1]\n"
+ "cmp x13, #0x10\n"
+ "fmla v19.8h, v7.8h, v2.h[1]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v23.8h, v7.8h, v3.h[1]\n"
+ "ldr x10, [x17, #0xa8]\n"
+ "fmla v27.8h, v7.8h, v4.h[1]\n"
+ "ldr d7, [x17, #0x90]\n"
+ "fmla v8.8h, v6.8h, v0.h[2]\n"
+ "fmla v12.8h, v6.8h, v1.h[2]\n"
+ "fmla v16.8h, v6.8h, v2.h[2]\n"
+ "mov v7.d[1], x11\n"
+ "fmla v20.8h, v6.8h, v3.h[2]\n"
+ "ldr x11, [x17, #0xb8]\n"
+ "fmla v24.8h, v6.8h, v4.h[2]\n"
+ "ldr d6, [x17, #0xa0]\n"
+ "fmla v9.8h, v7.8h, v0.h[2]\n"
+ "fmla v13.8h, v7.8h, v1.h[2]\n"
+ "fmla v17.8h, v7.8h, v2.h[2]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v21.8h, v7.8h, v3.h[2]\n"
+ "ldr x10, [x17, #0xc8]\n"
+ "fmla v25.8h, v7.8h, v4.h[2]\n"
+ "ldr d7, [x17, #0xb0]\n"
+ "fmla v10.8h, v6.8h, v0.h[2]\n"
+ "fmla v14.8h, v6.8h, v1.h[2]\n"
+ "fmla v18.8h, v6.8h, v2.h[2]\n"
+ "mov v7.d[1], x11\n"
+ "fmla v22.8h, v6.8h, v3.h[2]\n"
+ "ldr x11, [x17, #0xd8]\n"
+ "fmla v26.8h, v6.8h, v4.h[2]\n"
+ "ldr d6, [x17, #0xc0]\n"
+ "fmla v11.8h, v7.8h, v0.h[2]\n"
+ "fmla v15.8h, v7.8h, v1.h[2]\n"
+ "fmla v19.8h, v7.8h, v2.h[2]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v23.8h, v7.8h, v3.h[2]\n"
+ "ldr x10, [x17, #0xe8]\n"
+ "fmla v27.8h, v7.8h, v4.h[2]\n"
+ "ldr d7, [x17, #0xd0]\n"
+ "fmla v8.8h, v6.8h, v0.h[3]\n"
+ "fmla v12.8h, v6.8h, v1.h[3]\n"
+ "fmla v16.8h, v6.8h, v2.h[3]\n"
+ "mov v7.d[1], x11\n"
+ "fmla v20.8h, v6.8h, v3.h[3]\n"
+ "ldr x11, [x17, #0xf8]\n"
+ "fmla v24.8h, v6.8h, v4.h[3]\n"
+ "ldr d6, [x17, #0xe0]\n"
+ "fmla v9.8h, v7.8h, v0.h[3]\n"
+ "fmla v13.8h, v7.8h, v1.h[3]\n"
+ "fmla v17.8h, v7.8h, v2.h[3]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v21.8h, v7.8h, v3.h[3]\n"
+ "ldr x10, [x17, #0x108]\n"
+ "fmla v25.8h, v7.8h, v4.h[3]\n"
+ "ldr d7, [x17, #0xf0]\n"
+ "fmla v10.8h, v6.8h, v0.h[3]\n"
+ "fmla v14.8h, v6.8h, v1.h[3]\n"
+ "fmla v18.8h, v6.8h, v2.h[3]\n"
+ "mov v7.d[1], x11\n"
+ "fmla v22.8h, v6.8h, v3.h[3]\n"
+ "ldr x11, [x17, #0x118]\n"
+ "fmla v26.8h, v6.8h, v4.h[3]\n"
+ "ldr d6, [x17, #0x100]\n"
+ "fmla v11.8h, v7.8h, v0.h[3]\n"
+ "fmla v15.8h, v7.8h, v1.h[3]\n"
+ "fmla v19.8h, v7.8h, v2.h[3]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v23.8h, v7.8h, v3.h[3]\n"
+ "ldr x10, [x17, #0x128]\n"
+ "fmla v27.8h, v7.8h, v4.h[3]\n"
+ "ldr d7, [x17, #0x110]\n"
+ "fmla v8.8h, v6.8h, v0.h[4]\n"
+ "fmla v12.8h, v6.8h, v1.h[4]\n"
+ "fmla v16.8h, v6.8h, v2.h[4]\n"
+ "mov v7.d[1], x11\n"
+ "fmla v20.8h, v6.8h, v3.h[4]\n"
+ "ldr x11, [x17, #0x138]\n"
+ "fmla v24.8h, v6.8h, v4.h[4]\n"
+ "ldr d6, [x17, #0x120]\n"
+ "fmla v9.8h, v7.8h, v0.h[4]\n"
+ "fmla v13.8h, v7.8h, v1.h[4]\n"
+ "fmla v17.8h, v7.8h, v2.h[4]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v21.8h, v7.8h, v3.h[4]\n"
+ "ldr x10, [x17, #0x148]\n"
+ "fmla v25.8h, v7.8h, v4.h[4]\n"
+ "ldr d7, [x17, #0x130]\n"
+ "fmla v10.8h, v6.8h, v0.h[4]\n"
+ "fmla v14.8h, v6.8h, v1.h[4]\n"
+ "fmla v18.8h, v6.8h, v2.h[4]\n"
+ "mov v7.d[1], x11\n"
+ "fmla v22.8h, v6.8h, v3.h[4]\n"
+ "ldr x11, [x17, #0x158]\n"
+ "fmla v26.8h, v6.8h, v4.h[4]\n"
+ "ldr d6, [x17, #0x140]\n"
+ "fmla v11.8h, v7.8h, v0.h[4]\n"
+ "fmla v15.8h, v7.8h, v1.h[4]\n"
+ "fmla v19.8h, v7.8h, v2.h[4]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v23.8h, v7.8h, v3.h[4]\n"
+ "ldr x10, [x17, #0x168]\n"
+ "fmla v27.8h, v7.8h, v4.h[4]\n"
+ "ldr d7, [x17, #0x150]\n"
+ "fmla v8.8h, v6.8h, v0.h[5]\n"
+ "fmla v12.8h, v6.8h, v1.h[5]\n"
+ "fmla v16.8h, v6.8h, v2.h[5]\n"
+ "mov v7.d[1], x11\n"
+ "fmla v20.8h, v6.8h, v3.h[5]\n"
+ "ldr x11, [x17, #0x178]\n"
+ "fmla v24.8h, v6.8h, v4.h[5]\n"
+ "ldr d6, [x17, #0x160]\n"
+ "fmla v9.8h, v7.8h, v0.h[5]\n"
+ "fmla v13.8h, v7.8h, v1.h[5]\n"
+ "fmla v17.8h, v7.8h, v2.h[5]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v21.8h, v7.8h, v3.h[5]\n"
+ "ldr x10, [x17, #0x188]\n"
+ "fmla v25.8h, v7.8h, v4.h[5]\n"
+ "ldr d7, [x17, #0x170]\n"
+ "fmla v10.8h, v6.8h, v0.h[5]\n"
+ "fmla v14.8h, v6.8h, v1.h[5]\n"
+ "fmla v18.8h, v6.8h, v2.h[5]\n"
+ "mov v7.d[1], x11\n"
+ "fmla v22.8h, v6.8h, v3.h[5]\n"
+ "ldr x11, [x17, #0x198]\n"
+ "fmla v26.8h, v6.8h, v4.h[5]\n"
+ "ldr d6, [x17, #0x180]\n"
+ "fmla v11.8h, v7.8h, v0.h[5]\n"
+ "fmla v15.8h, v7.8h, v1.h[5]\n"
+ "fmla v19.8h, v7.8h, v2.h[5]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v23.8h, v7.8h, v3.h[5]\n"
+ "ldr x10, [x17, #0x1a8]\n"
+ "fmla v27.8h, v7.8h, v4.h[5]\n"
+ "ldr d7, [x17, #0x190]\n"
+ "fmla v8.8h, v6.8h, v0.h[6]\n"
+ "fmla v12.8h, v6.8h, v1.h[6]\n"
+ "fmla v16.8h, v6.8h, v2.h[6]\n"
+ "mov v7.d[1], x11\n"
+ "fmla v20.8h, v6.8h, v3.h[6]\n"
+ "ldr x11, [x17, #0x1b8]\n"
+ "fmla v24.8h, v6.8h, v4.h[6]\n"
+ "ldr d6, [x17, #0x1a0]\n"
+ "fmla v9.8h, v7.8h, v0.h[6]\n"
+ "fmla v13.8h, v7.8h, v1.h[6]\n"
+ "fmla v17.8h, v7.8h, v2.h[6]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v21.8h, v7.8h, v3.h[6]\n"
+ "ldr x10, [x17, #0x1c8]\n"
+ "fmla v25.8h, v7.8h, v4.h[6]\n"
+ "ldr d7, [x17, #0x1b0]\n"
+ "fmla v10.8h, v6.8h, v0.h[6]\n"
+ "fmla v14.8h, v6.8h, v1.h[6]\n"
+ "fmla v18.8h, v6.8h, v2.h[6]\n"
+ "mov v7.d[1], x11\n"
+ "fmla v22.8h, v6.8h, v3.h[6]\n"
+ "ldr x11, [x17, #0x1d8]\n"
+ "fmla v26.8h, v6.8h, v4.h[6]\n"
+ "ldr d6, [x17, #0x1c0]\n"
+ "fmla v11.8h, v7.8h, v0.h[6]\n"
+ "fmla v15.8h, v7.8h, v1.h[6]\n"
+ "fmla v19.8h, v7.8h, v2.h[6]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v23.8h, v7.8h, v3.h[6]\n"
+ "ldr x10, [x17, #0x1e8]\n"
+ "fmla v27.8h, v7.8h, v4.h[6]\n"
+ "ldr d7, [x17, #0x1d0]\n"
+ "fmla v8.8h, v6.8h, v0.h[7]\n"
+ "fmla v12.8h, v6.8h, v1.h[7]\n"
+ "fmla v16.8h, v6.8h, v2.h[7]\n"
+ "mov v7.d[1], x11\n"
+ "fmla v20.8h, v6.8h, v3.h[7]\n"
+ "ldr x11, [x17, #0x1f8]\n"
+ "fmla v24.8h, v6.8h, v4.h[7]\n"
+ "ldr d6, [x17, #0x1e0]\n"
+ "fmla v9.8h, v7.8h, v0.h[7]\n"
+ "fmla v13.8h, v7.8h, v1.h[7]\n"
+ "fmla v17.8h, v7.8h, v2.h[7]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v21.8h, v7.8h, v3.h[7]\n"
+ "fmla v25.8h, v7.8h, v4.h[7]\n"
+ "ldr d7, [x17, #0x1f0]\n"
+ "fmla v10.8h, v6.8h, v0.h[7]\n"
+ "add x17, x17, #0x200\n"
+ "fmla v14.8h, v6.8h, v1.h[7]\n"
+ "ldr x10, [x17, #0x8]\n"
+ "fmla v18.8h, v6.8h, v2.h[7]\n"
+ "mov v7.d[1], x11\n"
+ "fmla v22.8h, v6.8h, v3.h[7]\n"
+ "fmla v26.8h, v6.8h, v4.h[7]\n"
+ "ldr d6, [x17, #0x0]\n"
+ "fmla v11.8h, v7.8h, v0.h[7]\n"
+ "ldr d0, [x12, #0x0]\n"
+ "fmla v15.8h, v7.8h, v1.h[7]\n"
+ "ldr d1, [x28, #0x0]\n"
+ "fmla v19.8h, v7.8h, v2.h[7]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v23.8h, v7.8h, v3.h[7]\n"
+ "mov v0.d[1], x9\n"
+ "fmla v27.8h, v7.8h, v4.h[7]\n"
+ "mov v1.d[1], x27\n"
+ "ldr d2, [x26, #0x0]\n"
+ "ldr d3, [x24, #0x0]\n"
+ "ldr d4, [x22, #0x0]\n"
+ "mov v2.d[1], x25\n"
+ "mov v3.d[1], x23\n"
+ "mov v4.d[1], x21\n"
+ "bge 222b\n"
+ "223:" // Height 5: Multiply loop: Single iteration only
+ "fmla v8.8h, v6.8h, v0.h[0]\n"
+ "ldr q7, [x17, #0x10]\n"
+ "fmla v12.8h, v6.8h, v1.h[0]\n"
+ "sub x13, x13, #0x8\n"
+ "fmla v16.8h, v6.8h, v2.h[0]\n"
+ "add x12, x12, #0x10\n"
+ "fmla v20.8h, v6.8h, v3.h[0]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "fmla v24.8h, v6.8h, v4.h[0]\n"
+ "ldr q6, [x17, #0x20]\n"
+ "fmla v9.8h, v7.8h, v0.h[0]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v13.8h, v7.8h, v1.h[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "fmla v17.8h, v7.8h, v2.h[0]\n"
+ "add x26, x26, #0x10\n"
+ "fmla v21.8h, v7.8h, v3.h[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "fmla v25.8h, v7.8h, v4.h[0]\n"
+ "ldr q7, [x17, #0x30]\n"
+ "fmla v10.8h, v6.8h, v0.h[0]\n"
+ "add x24, x24, #0x10\n"
+ "fmla v14.8h, v6.8h, v1.h[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "fmla v18.8h, v6.8h, v2.h[0]\n"
+ "add x22, x22, #0x10\n"
+ "fmla v22.8h, v6.8h, v3.h[0]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "fmla v26.8h, v6.8h, v4.h[0]\n"
+ "ldr q6, [x17, #0x40]\n"
+ "fmla v11.8h, v7.8h, v0.h[0]\n"
+ "fmla v15.8h, v7.8h, v1.h[0]\n"
+ "fmla v19.8h, v7.8h, v2.h[0]\n"
+ "fmla v23.8h, v7.8h, v3.h[0]\n"
+ "fmla v27.8h, v7.8h, v4.h[0]\n"
+ "ldr q7, [x17, #0x50]\n"
+ "fmla v8.8h, v6.8h, v0.h[1]\n"
+ "fmla v12.8h, v6.8h, v1.h[1]\n"
+ "fmla v16.8h, v6.8h, v2.h[1]\n"
+ "fmla v20.8h, v6.8h, v3.h[1]\n"
+ "fmla v24.8h, v6.8h, v4.h[1]\n"
+ "ldr q6, [x17, #0x60]\n"
+ "fmla v9.8h, v7.8h, v0.h[1]\n"
+ "fmla v13.8h, v7.8h, v1.h[1]\n"
+ "fmla v17.8h, v7.8h, v2.h[1]\n"
+ "fmla v21.8h, v7.8h, v3.h[1]\n"
+ "fmla v25.8h, v7.8h, v4.h[1]\n"
+ "ldr q7, [x17, #0x70]\n"
+ "fmla v10.8h, v6.8h, v0.h[1]\n"
+ "fmla v14.8h, v6.8h, v1.h[1]\n"
+ "fmla v18.8h, v6.8h, v2.h[1]\n"
+ "fmla v22.8h, v6.8h, v3.h[1]\n"
+ "fmla v26.8h, v6.8h, v4.h[1]\n"
+ "ldr q6, [x17, #0x80]\n"
+ "fmla v11.8h, v7.8h, v0.h[1]\n"
+ "fmla v15.8h, v7.8h, v1.h[1]\n"
+ "fmla v19.8h, v7.8h, v2.h[1]\n"
+ "fmla v23.8h, v7.8h, v3.h[1]\n"
+ "fmla v27.8h, v7.8h, v4.h[1]\n"
+ "ldr q7, [x17, #0x90]\n"
+ "fmla v8.8h, v6.8h, v0.h[2]\n"
+ "fmla v12.8h, v6.8h, v1.h[2]\n"
+ "fmla v16.8h, v6.8h, v2.h[2]\n"
+ "fmla v20.8h, v6.8h, v3.h[2]\n"
+ "fmla v24.8h, v6.8h, v4.h[2]\n"
+ "ldr q6, [x17, #0xa0]\n"
+ "fmla v9.8h, v7.8h, v0.h[2]\n"
+ "fmla v13.8h, v7.8h, v1.h[2]\n"
+ "fmla v17.8h, v7.8h, v2.h[2]\n"
+ "fmla v21.8h, v7.8h, v3.h[2]\n"
+ "fmla v25.8h, v7.8h, v4.h[2]\n"
+ "ldr q7, [x17, #0xb0]\n"
+ "fmla v10.8h, v6.8h, v0.h[2]\n"
+ "fmla v14.8h, v6.8h, v1.h[2]\n"
+ "fmla v18.8h, v6.8h, v2.h[2]\n"
+ "fmla v22.8h, v6.8h, v3.h[2]\n"
+ "fmla v26.8h, v6.8h, v4.h[2]\n"
+ "ldr q6, [x17, #0xc0]\n"
+ "fmla v11.8h, v7.8h, v0.h[2]\n"
+ "fmla v15.8h, v7.8h, v1.h[2]\n"
+ "fmla v19.8h, v7.8h, v2.h[2]\n"
+ "fmla v23.8h, v7.8h, v3.h[2]\n"
+ "fmla v27.8h, v7.8h, v4.h[2]\n"
+ "ldr q7, [x17, #0xd0]\n"
+ "fmla v8.8h, v6.8h, v0.h[3]\n"
+ "fmla v12.8h, v6.8h, v1.h[3]\n"
+ "fmla v16.8h, v6.8h, v2.h[3]\n"
+ "fmla v20.8h, v6.8h, v3.h[3]\n"
+ "fmla v24.8h, v6.8h, v4.h[3]\n"
+ "ldr q6, [x17, #0xe0]\n"
+ "fmla v9.8h, v7.8h, v0.h[3]\n"
+ "fmla v13.8h, v7.8h, v1.h[3]\n"
+ "fmla v17.8h, v7.8h, v2.h[3]\n"
+ "fmla v21.8h, v7.8h, v3.h[3]\n"
+ "fmla v25.8h, v7.8h, v4.h[3]\n"
+ "ldr q7, [x17, #0xf0]\n"
+ "fmla v10.8h, v6.8h, v0.h[3]\n"
+ "fmla v14.8h, v6.8h, v1.h[3]\n"
+ "fmla v18.8h, v6.8h, v2.h[3]\n"
+ "fmla v22.8h, v6.8h, v3.h[3]\n"
+ "fmla v26.8h, v6.8h, v4.h[3]\n"
+ "ldr q6, [x17, #0x100]\n"
+ "fmla v11.8h, v7.8h, v0.h[3]\n"
+ "fmla v15.8h, v7.8h, v1.h[3]\n"
+ "fmla v19.8h, v7.8h, v2.h[3]\n"
+ "fmla v23.8h, v7.8h, v3.h[3]\n"
+ "fmla v27.8h, v7.8h, v4.h[3]\n"
+ "ldr q7, [x17, #0x110]\n"
+ "fmla v8.8h, v6.8h, v0.h[4]\n"
+ "fmla v12.8h, v6.8h, v1.h[4]\n"
+ "fmla v16.8h, v6.8h, v2.h[4]\n"
+ "fmla v20.8h, v6.8h, v3.h[4]\n"
+ "fmla v24.8h, v6.8h, v4.h[4]\n"
+ "ldr q6, [x17, #0x120]\n"
+ "fmla v9.8h, v7.8h, v0.h[4]\n"
+ "fmla v13.8h, v7.8h, v1.h[4]\n"
+ "fmla v17.8h, v7.8h, v2.h[4]\n"
+ "fmla v21.8h, v7.8h, v3.h[4]\n"
+ "fmla v25.8h, v7.8h, v4.h[4]\n"
+ "ldr q7, [x17, #0x130]\n"
+ "fmla v10.8h, v6.8h, v0.h[4]\n"
+ "fmla v14.8h, v6.8h, v1.h[4]\n"
+ "fmla v18.8h, v6.8h, v2.h[4]\n"
+ "fmla v22.8h, v6.8h, v3.h[4]\n"
+ "fmla v26.8h, v6.8h, v4.h[4]\n"
+ "ldr q6, [x17, #0x140]\n"
+ "fmla v11.8h, v7.8h, v0.h[4]\n"
+ "fmla v15.8h, v7.8h, v1.h[4]\n"
+ "fmla v19.8h, v7.8h, v2.h[4]\n"
+ "fmla v23.8h, v7.8h, v3.h[4]\n"
+ "fmla v27.8h, v7.8h, v4.h[4]\n"
+ "ldr q7, [x17, #0x150]\n"
+ "fmla v8.8h, v6.8h, v0.h[5]\n"
+ "fmla v12.8h, v6.8h, v1.h[5]\n"
+ "fmla v16.8h, v6.8h, v2.h[5]\n"
+ "fmla v20.8h, v6.8h, v3.h[5]\n"
+ "fmla v24.8h, v6.8h, v4.h[5]\n"
+ "ldr q6, [x17, #0x160]\n"
+ "fmla v9.8h, v7.8h, v0.h[5]\n"
+ "fmla v13.8h, v7.8h, v1.h[5]\n"
+ "fmla v17.8h, v7.8h, v2.h[5]\n"
+ "fmla v21.8h, v7.8h, v3.h[5]\n"
+ "fmla v25.8h, v7.8h, v4.h[5]\n"
+ "ldr q7, [x17, #0x170]\n"
+ "fmla v10.8h, v6.8h, v0.h[5]\n"
+ "fmla v14.8h, v6.8h, v1.h[5]\n"
+ "fmla v18.8h, v6.8h, v2.h[5]\n"
+ "fmla v22.8h, v6.8h, v3.h[5]\n"
+ "fmla v26.8h, v6.8h, v4.h[5]\n"
+ "ldr q6, [x17, #0x180]\n"
+ "fmla v11.8h, v7.8h, v0.h[5]\n"
+ "fmla v15.8h, v7.8h, v1.h[5]\n"
+ "fmla v19.8h, v7.8h, v2.h[5]\n"
+ "fmla v23.8h, v7.8h, v3.h[5]\n"
+ "fmla v27.8h, v7.8h, v4.h[5]\n"
+ "ldr q7, [x17, #0x190]\n"
+ "fmla v8.8h, v6.8h, v0.h[6]\n"
+ "fmla v12.8h, v6.8h, v1.h[6]\n"
+ "fmla v16.8h, v6.8h, v2.h[6]\n"
+ "fmla v20.8h, v6.8h, v3.h[6]\n"
+ "fmla v24.8h, v6.8h, v4.h[6]\n"
+ "ldr q6, [x17, #0x1a0]\n"
+ "fmla v9.8h, v7.8h, v0.h[6]\n"
+ "fmla v13.8h, v7.8h, v1.h[6]\n"
+ "fmla v17.8h, v7.8h, v2.h[6]\n"
+ "fmla v21.8h, v7.8h, v3.h[6]\n"
+ "fmla v25.8h, v7.8h, v4.h[6]\n"
+ "ldr q7, [x17, #0x1b0]\n"
+ "fmla v10.8h, v6.8h, v0.h[6]\n"
+ "fmla v14.8h, v6.8h, v1.h[6]\n"
+ "fmla v18.8h, v6.8h, v2.h[6]\n"
+ "fmla v22.8h, v6.8h, v3.h[6]\n"
+ "fmla v26.8h, v6.8h, v4.h[6]\n"
+ "ldr q6, [x17, #0x1c0]\n"
+ "fmla v11.8h, v7.8h, v0.h[6]\n"
+ "fmla v15.8h, v7.8h, v1.h[6]\n"
+ "fmla v19.8h, v7.8h, v2.h[6]\n"
+ "fmla v23.8h, v7.8h, v3.h[6]\n"
+ "fmla v27.8h, v7.8h, v4.h[6]\n"
+ "ldr q7, [x17, #0x1d0]\n"
+ "fmla v8.8h, v6.8h, v0.h[7]\n"
+ "fmla v12.8h, v6.8h, v1.h[7]\n"
+ "fmla v16.8h, v6.8h, v2.h[7]\n"
+ "fmla v20.8h, v6.8h, v3.h[7]\n"
+ "fmla v24.8h, v6.8h, v4.h[7]\n"
+ "ldr q6, [x17, #0x1e0]\n"
+ "fmla v9.8h, v7.8h, v0.h[7]\n"
+ "fmla v13.8h, v7.8h, v1.h[7]\n"
+ "fmla v17.8h, v7.8h, v2.h[7]\n"
+ "fmla v21.8h, v7.8h, v3.h[7]\n"
+ "fmla v25.8h, v7.8h, v4.h[7]\n"
+ "ldr q7, [x17, #0x1f0]\n"
+ "fmla v10.8h, v6.8h, v0.h[7]\n"
+ "add x17, x17, #0x200\n"
+ "fmla v14.8h, v6.8h, v1.h[7]\n"
+ "fmla v18.8h, v6.8h, v2.h[7]\n"
+ "fmla v22.8h, v6.8h, v3.h[7]\n"
+ "fmla v26.8h, v6.8h, v4.h[7]\n"
+ "fmla v11.8h, v7.8h, v0.h[7]\n"
+ "fmla v15.8h, v7.8h, v1.h[7]\n"
+ "fmla v19.8h, v7.8h, v2.h[7]\n"
+ "fmla v23.8h, v7.8h, v3.h[7]\n"
+ "fmla v27.8h, v7.8h, v4.h[7]\n"
+ "224:" // Height 5: Multiply loop: Main loop skip
+ "cbz x13, 226f\n" // nothing left in x13: skip the odd loop
+ "225:" // Height 5: Multiply loop: Odd block loop - one step of x13 per pass
+ "ldr h0, [x12], #0x2\n" // one 2-byte input element per row into lane 0 of v0-v4, post-incrementing each row pointer
+ "sub x13, x13, #0x1\n"
+ "ldr h1, [x28], #0x2\n"
+ "ldr h2, [x26], #0x2\n"
+ "ldr h3, [x24], #0x2\n"
+ "ldr h4, [x22], #0x2\n"
+ "ldr q6, [x17, #0x0]\n" // B for this step is 64 bytes, read as four 16-byte pieces alternating between v6 and v7
+ "ldr q7, [x17, #0x10]\n"
+ "fmla v8.8h, v6.8h, v0.h[0]\n" // B bytes 0x00-0x0f times each row element -> first accumulator of every row
+ "fmla v12.8h, v6.8h, v1.h[0]\n"
+ "fmla v16.8h, v6.8h, v2.h[0]\n"
+ "fmla v20.8h, v6.8h, v3.h[0]\n"
+ "fmla v24.8h, v6.8h, v4.h[0]\n"
+ "ldr q6, [x17, #0x20]\n" // v6 is free again: fetch B bytes 0x20-0x2f
+ "fmla v9.8h, v7.8h, v0.h[0]\n" // B bytes 0x10-0x1f -> second accumulator of every row
+ "fmla v13.8h, v7.8h, v1.h[0]\n"
+ "fmla v17.8h, v7.8h, v2.h[0]\n"
+ "fmla v21.8h, v7.8h, v3.h[0]\n"
+ "fmla v25.8h, v7.8h, v4.h[0]\n"
+ "ldr q7, [x17, #0x30]\n" // v7 is free again: fetch B bytes 0x30-0x3f
+ "fmla v10.8h, v6.8h, v0.h[0]\n" // B bytes 0x20-0x2f -> third accumulator of every row
+ "add x17, x17, #0x40\n" // advance B by 64 bytes (all four loads for this step have been issued)
+ "fmla v14.8h, v6.8h, v1.h[0]\n"
+ "fmla v18.8h, v6.8h, v2.h[0]\n"
+ "fmla v22.8h, v6.8h, v3.h[0]\n"
+ "fmla v26.8h, v6.8h, v4.h[0]\n"
+ "fmla v11.8h, v7.8h, v0.h[0]\n" // B bytes 0x30-0x3f -> fourth accumulator of every row
+ "fmla v15.8h, v7.8h, v1.h[0]\n"
+ "fmla v19.8h, v7.8h, v2.h[0]\n"
+ "fmla v23.8h, v7.8h, v3.h[0]\n"
+ "fmla v27.8h, v7.8h, v4.h[0]\n"
+ "cbnz x13, 225b\n" // repeat until x13 reaches zero
+ "226:" // Height 5: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" // x19 = 32-bit num_strings field
+ "add x14, x14, #0x1\n" // next string index
+ "cmp x14, x19\n"
+ "bne 219b\n" // more strings to process: back to the string loop head
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" // x19 = output_offset field
+ "prfm pstl1keep, [x15, #0x0]\n" // store-prefetch the start of each output row as its pointer becomes available
+ "add x25, x15, x19, LSL #1\n" // rebuild output row pointers x25, x24, x23, x22 (x25, x24, x23 and x22 were overwritten in the multiply code above)
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "add x24, x25, x19, LSL #1\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "add x23, x24, x19, LSL #1\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "add x22, x23, x19, LSL #1\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "tbz %x[flags], #1, 227f\n" // flags bit 1 clear: skip the min/max clamp
+ "add x20, %x[args_ptr], %[offset_min]\n" // x20 = address of the value at offset_min in the args block
+ "add x19, %x[args_ptr], %[offset_max]\n" // x19 = address of the value at offset_max in the args block
+ "ld1r { v1.8h }, [x20]\n" // v1 = value at offset_min replicated to all 8 halfword lanes (v1 no longer holds input data)
+ "ld1r { v0.8h }, [x19]\n" // v0 = value at offset_max replicated to all 8 halfword lanes (v0 no longer holds input data)
+ "fmin v8.8h, v8.8h, v0.8h\n" // v8-v17: cap each lane at the offset_max value
+ "fmin v9.8h, v9.8h, v0.8h\n"
+ "fmin v10.8h, v10.8h, v0.8h\n"
+ "fmin v11.8h, v11.8h, v0.8h\n"
+ "fmin v12.8h, v12.8h, v0.8h\n"
+ "fmin v13.8h, v13.8h, v0.8h\n"
+ "fmin v14.8h, v14.8h, v0.8h\n"
+ "fmin v15.8h, v15.8h, v0.8h\n"
+ "fmin v16.8h, v16.8h, v0.8h\n"
+ "fmin v17.8h, v17.8h, v0.8h\n"
+ "fmax v8.8h, v8.8h, v1.8h\n" // v8-v17: raise each lane to at least the offset_min value
+ "fmax v9.8h, v9.8h, v1.8h\n"
+ "fmax v10.8h, v10.8h, v1.8h\n"
+ "fmax v11.8h, v11.8h, v1.8h\n"
+ "fmax v12.8h, v12.8h, v1.8h\n"
+ "fmax v13.8h, v13.8h, v1.8h\n"
+ "fmax v14.8h, v14.8h, v1.8h\n"
+ "fmax v15.8h, v15.8h, v1.8h\n"
+ "fmax v16.8h, v16.8h, v1.8h\n"
+ "fmax v17.8h, v17.8h, v1.8h\n"
+ "fmin v18.8h, v18.8h, v0.8h\n" // v18-v27: same cap at the offset_max value
+ "fmin v19.8h, v19.8h, v0.8h\n"
+ "fmin v20.8h, v20.8h, v0.8h\n"
+ "fmin v21.8h, v21.8h, v0.8h\n"
+ "fmin v22.8h, v22.8h, v0.8h\n"
+ "fmin v23.8h, v23.8h, v0.8h\n"
+ "fmin v24.8h, v24.8h, v0.8h\n"
+ "fmin v25.8h, v25.8h, v0.8h\n"
+ "fmin v26.8h, v26.8h, v0.8h\n"
+ "fmin v27.8h, v27.8h, v0.8h\n"
+ "fmax v18.8h, v18.8h, v1.8h\n" // v18-v27: same floor at the offset_min value
+ "fmax v19.8h, v19.8h, v1.8h\n"
+ "fmax v20.8h, v20.8h, v1.8h\n"
+ "fmax v21.8h, v21.8h, v1.8h\n"
+ "fmax v22.8h, v22.8h, v1.8h\n"
+ "fmax v23.8h, v23.8h, v1.8h\n"
+ "fmax v24.8h, v24.8h, v1.8h\n"
+ "fmax v25.8h, v25.8h, v1.8h\n"
+ "fmax v26.8h, v26.8h, v1.8h\n"
+ "fmax v27.8h, v27.8h, v1.8h\n"
+ "227:" // Height 5: No activation
+ "cmp x8, #0x20\n"
+ "bge 244f\n"
+ "tbz x8, #4, 235f\n"
+ "st1 { v8.8h }, [x15], #0x10\n"
+ "st1 { v9.8h }, [x15], #0x10\n"
+ "st1 { v12.8h }, [x25], #0x10\n"
+ "st1 { v13.8h }, [x25], #0x10\n"
+ "st1 { v16.8h }, [x24], #0x10\n"
+ "st1 { v17.8h }, [x24], #0x10\n"
+ "st1 { v20.8h }, [x23], #0x10\n"
+ "st1 { v21.8h }, [x23], #0x10\n"
+ "st1 { v24.8h }, [x22], #0x10\n"
+ "st1 { v25.8h }, [x22], #0x10\n"
+ "tbz x8, #3, 231f\n"
+ "st1 { v10.8h }, [x15], #0x10\n"
+ "st1 { v14.8h }, [x25], #0x10\n"
+ "st1 { v18.8h }, [x24], #0x10\n"
+ "st1 { v22.8h }, [x23], #0x10\n"
+ "st1 { v26.8h }, [x22], #0x10\n"
+ "tbz x8, #2, 229f\n"
+ "str d11, [x15], #0x8\n"
+ "str d15, [x25], #0x8\n"
+ "str d19, [x24], #0x8\n"
+ "str d23, [x23], #0x8\n"
+ "str d27, [x22], #0x8\n"
+ "tbz x8, #1, 228f\n"
+ "st1 { v11.s }[2], [x15], #0x4\n"
+ "st1 { v15.s }[2], [x25], #0x4\n"
+ "st1 { v19.s }[2], [x24], #0x4\n"
+ "st1 { v23.s }[2], [x23], #0x4\n"
+ "st1 { v27.s }[2], [x22], #0x4\n"
+ "tbz x8, #0, 243f\n"
+ "st1 { v11.h }[6], [x15]\n"
+ "st1 { v15.h }[6], [x25]\n"
+ "st1 { v19.h }[6], [x24]\n"
+ "st1 { v23.h }[6], [x23]\n"
+ "st1 { v27.h }[6], [x22]\n"
+ "b 243f\n"
+ "228:" // Height 5: Partial direct writeback: partial_1_28
+ "tbz x8, #0, 243f\n"
+ "st1 { v11.h }[4], [x15]\n"
+ "st1 { v15.h }[4], [x25]\n"
+ "st1 { v19.h }[4], [x24]\n"
+ "st1 { v23.h }[4], [x23]\n"
+ "st1 { v27.h }[4], [x22]\n"
+ "b 243f\n"
+ "229:" // Height 5: Partial direct writeback: partial_2_24
+ "tbz x8, #1, 230f\n"
+ "str s11, [x15], #0x4\n"
+ "str s15, [x25], #0x4\n"
+ "str s19, [x24], #0x4\n"
+ "str s23, [x23], #0x4\n"
+ "str s27, [x22], #0x4\n"
+ "tbz x8, #0, 243f\n"
+ "st1 { v11.h }[2], [x15]\n"
+ "st1 { v15.h }[2], [x25]\n"
+ "st1 { v19.h }[2], [x24]\n"
+ "st1 { v23.h }[2], [x23]\n"
+ "st1 { v27.h }[2], [x22]\n"
+ "b 243f\n"
+ "230:" // Height 5: Partial direct writeback: partial_1_24
+ "tbz x8, #0, 243f\n"
+ "str h11, [x15, #0x0]\n"
+ "str h15, [x25, #0x0]\n"
+ "str h19, [x24, #0x0]\n"
+ "str h23, [x23, #0x0]\n"
+ "str h27, [x22, #0x0]\n"
+ "b 243f\n"
+ "231:" // Height 5: Partial direct writeback: partial_4_16 - tail stored from v10/v14/v18/v22/v26
+ "tbz x8, #2, 233f\n" // bit 2 of x8 clear -> no group of four elements
+ "str d10, [x15], #0x8\n" // store lanes 0-3 (8 bytes) and post-increment each pointer by 8
+ "str d14, [x25], #0x8\n"
+ "str d18, [x24], #0x8\n"
+ "str d22, [x23], #0x8\n"
+ "str d26, [x22], #0x8\n"
+ "tbz x8, #1, 232f\n" // bit 1 clear -> no further pair
+ "st1 { v10.s }[2], [x15], #0x4\n" // 32-bit lane 2 = halfword lanes 4-5, post-increment by 4
+ "st1 { v14.s }[2], [x25], #0x4\n"
+ "st1 { v18.s }[2], [x24], #0x4\n"
+ "st1 { v22.s }[2], [x23], #0x4\n"
+ "st1 { v26.s }[2], [x22], #0x4\n"
+ "tbz x8, #0, 243f\n" // bit 0 clear -> finished
+ "st1 { v10.h }[6], [x15]\n" // last odd element from lane 6
+ "st1 { v14.h }[6], [x25]\n"
+ "st1 { v18.h }[6], [x24]\n"
+ "st1 { v22.h }[6], [x23]\n"
+ "st1 { v26.h }[6], [x22]\n"
+ "b 243f\n"
+ "232:" // Height 5: Partial direct writeback: partial_1_20 - at most one element, from lane 4
+ "tbz x8, #0, 243f\n" // bit 0 clear -> nothing to store
+ "st1 { v10.h }[4], [x15]\n"
+ "st1 { v14.h }[4], [x25]\n"
+ "st1 { v18.h }[4], [x24]\n"
+ "st1 { v22.h }[4], [x23]\n"
+ "st1 { v26.h }[4], [x22]\n"
+ "b 243f\n"
+ "233:" // Height 5: Partial direct writeback: partial_2_16 - pair and/or single element from the low lanes
+ "tbz x8, #1, 234f\n" // bit 1 clear -> no pair
+ "str s10, [x15], #0x4\n" // lanes 0-1, post-increment by 4
+ "str s14, [x25], #0x4\n"
+ "str s18, [x24], #0x4\n"
+ "str s22, [x23], #0x4\n"
+ "str s26, [x22], #0x4\n"
+ "tbz x8, #0, 243f\n" // bit 0 clear -> finished
+ "st1 { v10.h }[2], [x15]\n" // odd element from lane 2
+ "st1 { v14.h }[2], [x25]\n"
+ "st1 { v18.h }[2], [x24]\n"
+ "st1 { v22.h }[2], [x23]\n"
+ "st1 { v26.h }[2], [x22]\n"
+ "b 243f\n"
+ "234:" // Height 5: Partial direct writeback: partial_1_16 - at most one element, from lane 0
+ "tbz x8, #0, 243f\n" // bit 0 clear -> nothing to store
+ "str h10, [x15, #0x0]\n"
+ "str h14, [x25, #0x0]\n"
+ "str h18, [x24, #0x0]\n"
+ "str h22, [x23, #0x0]\n"
+ "str h26, [x22, #0x0]\n"
+ "b 243f\n"
+ "235:" // Height 5: Partial direct writeback: partial_8_0 - fewer than 16 elements: optional full first vector, then a tail from v9/v13/v17/v21/v25
+ "tbz x8, #3, 239f\n" // bit 3 of x8 clear -> fewer than 8 elements, handled from label 239 on
+ "st1 { v8.8h }, [x15], #0x10\n" // store all 8 halfwords of the first accumulator column, post-increment by 16
+ "st1 { v12.8h }, [x25], #0x10\n"
+ "st1 { v16.8h }, [x24], #0x10\n"
+ "st1 { v20.8h }, [x23], #0x10\n"
+ "st1 { v24.8h }, [x22], #0x10\n"
+ "tbz x8, #2, 237f\n" // bit 2 clear -> no group of four in the second vector
+ "str d9, [x15], #0x8\n" // lanes 0-3 of the second accumulator column
+ "str d13, [x25], #0x8\n"
+ "str d17, [x24], #0x8\n"
+ "str d21, [x23], #0x8\n"
+ "str d25, [x22], #0x8\n"
+ "tbz x8, #1, 236f\n" // bit 1 clear -> no further pair
+ "st1 { v9.s }[2], [x15], #0x4\n" // halfword lanes 4-5
+ "st1 { v13.s }[2], [x25], #0x4\n"
+ "st1 { v17.s }[2], [x24], #0x4\n"
+ "st1 { v21.s }[2], [x23], #0x4\n"
+ "st1 { v25.s }[2], [x22], #0x4\n"
+ "tbz x8, #0, 243f\n" // bit 0 clear -> finished
+ "st1 { v9.h }[6], [x15]\n" // last odd element from lane 6
+ "st1 { v13.h }[6], [x25]\n"
+ "st1 { v17.h }[6], [x24]\n"
+ "st1 { v21.h }[6], [x23]\n"
+ "st1 { v25.h }[6], [x22]\n"
+ "b 243f\n"
+ "236:" // Height 5: Partial direct writeback: partial_1_12 - at most one element, from lane 4
+ "tbz x8, #0, 243f\n" // bit 0 clear -> nothing to store
+ "st1 { v9.h }[4], [x15]\n"
+ "st1 { v13.h }[4], [x25]\n"
+ "st1 { v17.h }[4], [x24]\n"
+ "st1 { v21.h }[4], [x23]\n"
+ "st1 { v25.h }[4], [x22]\n"
+ "b 243f\n"
+ "237:" // Height 5: Partial direct writeback: partial_2_8 - pair and/or single element from the low lanes
+ "tbz x8, #1, 238f\n" // bit 1 clear -> no pair
+ "str s9, [x15], #0x4\n" // lanes 0-1, post-increment by 4
+ "str s13, [x25], #0x4\n"
+ "str s17, [x24], #0x4\n"
+ "str s21, [x23], #0x4\n"
+ "str s25, [x22], #0x4\n"
+ "tbz x8, #0, 243f\n" // bit 0 clear -> finished
+ "st1 { v9.h }[2], [x15]\n" // odd element from lane 2
+ "st1 { v13.h }[2], [x25]\n"
+ "st1 { v17.h }[2], [x24]\n"
+ "st1 { v21.h }[2], [x23]\n"
+ "st1 { v25.h }[2], [x22]\n"
+ "b 243f\n"
+ "238:" // Height 5: Partial direct writeback: partial_1_8 - at most one element, from lane 0
+ "tbz x8, #0, 243f\n" // bit 0 clear -> nothing to store
+ "str h9, [x15, #0x0]\n"
+ "str h13, [x25, #0x0]\n"
+ "str h17, [x24, #0x0]\n"
+ "str h21, [x23, #0x0]\n"
+ "str h25, [x22, #0x0]\n"
+ "b 243f\n"
+ "239:" // Height 5: Partial direct writeback: partial_4_0 - fewer than 8 elements, all taken from v8/v12/v16/v20/v24
+ "tbz x8, #2, 241f\n" // bit 2 of x8 clear -> fewer than 4 elements
+ "str d8, [x15], #0x8\n" // lanes 0-3 (8 bytes), post-increment each pointer by 8
+ "str d12, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
+ "str d20, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
+ "tbz x8, #1, 240f\n" // bit 1 clear -> no further pair
+ "st1 { v8.s }[2], [x15], #0x4\n" // halfword lanes 4-5
+ "st1 { v12.s }[2], [x25], #0x4\n"
+ "st1 { v16.s }[2], [x24], #0x4\n"
+ "st1 { v20.s }[2], [x23], #0x4\n"
+ "st1 { v24.s }[2], [x22], #0x4\n"
+ "tbz x8, #0, 243f\n" // bit 0 clear -> finished
+ "st1 { v8.h }[6], [x15]\n" // last odd element from lane 6
+ "st1 { v12.h }[6], [x25]\n"
+ "st1 { v16.h }[6], [x24]\n"
+ "st1 { v20.h }[6], [x23]\n"
+ "st1 { v24.h }[6], [x22]\n"
+ "b 243f\n"
+ "240:" // Height 5: Partial direct writeback: partial_1_4 - at most one element, from lane 4
+ "tbz x8, #0, 243f\n" // bit 0 clear -> nothing to store
+ "st1 { v8.h }[4], [x15]\n"
+ "st1 { v12.h }[4], [x25]\n"
+ "st1 { v16.h }[4], [x24]\n"
+ "st1 { v20.h }[4], [x23]\n"
+ "st1 { v24.h }[4], [x22]\n"
+ "b 243f\n"
+ "241:" // Height 5: Partial direct writeback: partial_2_0 - pair and/or single element from the low lanes
+ "tbz x8, #1, 242f\n" // bit 1 clear -> no pair
+ "str s8, [x15], #0x4\n" // lanes 0-1, post-increment by 4
+ "str s12, [x25], #0x4\n"
+ "str s16, [x24], #0x4\n"
+ "str s20, [x23], #0x4\n"
+ "str s24, [x22], #0x4\n"
+ "tbz x8, #0, 243f\n" // bit 0 clear -> finished
+ "st1 { v8.h }[2], [x15]\n" // odd element from lane 2
+ "st1 { v12.h }[2], [x25]\n"
+ "st1 { v16.h }[2], [x24]\n"
+ "st1 { v20.h }[2], [x23]\n"
+ "st1 { v24.h }[2], [x22]\n"
+ "b 243f\n"
+ "242:" // Height 5: Partial direct writeback: partial_1_0 - stores lane 0 unconditionally (no bit-0 test on this path; presumably x8 is known to be non-zero here - confirm against the loop entry outside this chunk)
+ "str h8, [x15, #0x0]\n"
+ "str h12, [x25, #0x0]\n"
+ "str h16, [x24, #0x0]\n"
+ "str h20, [x23, #0x0]\n"
+ "str h24, [x22, #0x0]\n"
+ "243:" // Height 5: Partial direct writeback: Done
+ "b 245f\n" // jump over the full-writeback block at label 244
+ "244:" // Height 5: Full writeback - store four 128-bit accumulators (0x40 bytes) to each of the five destination pointers
+ "str q8, [x15, #0x0]\n" // first destination: v8..v11
+ "str q9, [x15, #0x10]\n"
+ "str q10, [x15, #0x20]\n"
+ "str q11, [x15, #0x30]\n"
+ "add x15, x15, #0x40\n" // advance x15 past the 0x40 bytes just written; the other four pointers are not advanced here
+ "str q12, [x25, #0x0]\n" // second destination: v12..v15
+ "str q13, [x25, #0x10]\n"
+ "str q14, [x25, #0x20]\n"
+ "str q15, [x25, #0x30]\n"
+ "str q16, [x24, #0x0]\n" // third destination: v16..v19
+ "str q17, [x24, #0x10]\n"
+ "str q18, [x24, #0x20]\n"
+ "str q19, [x24, #0x30]\n"
+ "str q20, [x23, #0x0]\n" // fourth destination: v20..v23
+ "str q21, [x23, #0x10]\n"
+ "str q22, [x23, #0x20]\n"
+ "str q23, [x23, #0x30]\n"
+ "str q24, [x22, #0x0]\n" // fifth destination: v24..v27
+ "str q25, [x22, #0x10]\n"
+ "str q26, [x22, #0x20]\n"
+ "str q27, [x22, #0x30]\n"
+ "245:" // Height 5: Writeback done
+ "subs x8, x8, #0x20\n" // x8 -= 32 and set flags
+ "bgt 198b\n" // x8 still positive -> branch back to label 198 (outside this chunk)
+ "b 296f\n" // otherwise leave via label 296 (outside this chunk)
+ "246:" // Height 6 - entry: load per-call state into registers
+ "ldr x8, [%x[args_ptr], %[offsetof_N]]\n" // x8 = N field of the argument block
+ "mov x16, %x[bias]\n" // x16 = bias pointer (tested for null at label 247)
+ "ldr x17, [%x[args_ptr], %[offsetof_B_ptr]]\n" // x17 = B pointer field
+ "mov x15, %x[output_ptr]\n" // x15 = working copy of the output pointer
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" // x20 = output_offset field
+ "mov x19, #0xc\n" // multiplier 12 for the madd below
+ "madd %x[output_ptr], x20, x19, %x[output_ptr]\n" // output_ptr += 12 * output_offset (12 = 6 x 2, matching the LSL #1 stride scaling used at label 248)
+ "247:" // Height 6: Column loop - initialise the 24 accumulators v8..v31 from the bias when a bias pointer is present
+ "cbz x16, 248f\n" // null bias pointer -> take the no-bias path
+ "ldr q8, [x16, #0x0]\n" // first three 128-bit bias vectors
+ "ldr q9, [x16, #0x10]\n"
+ "ldr q10, [x16, #0x20]\n"
+ "mov v12.16b, v8.16b\n" // replicate the bias vectors into the other five accumulator groups
+ "mov v16.16b, v8.16b\n"
+ "mov v13.16b, v9.16b\n"
+ "mov v17.16b, v9.16b\n"
+ "mov v14.16b, v10.16b\n"
+ "mov v18.16b, v10.16b\n"
+ "mov v20.16b, v8.16b\n"
+ "mov v21.16b, v9.16b\n"
+ "mov v22.16b, v10.16b\n"
+ "mov v24.16b, v8.16b\n"
+ "mov v25.16b, v9.16b\n"
+ "mov v26.16b, v10.16b\n"
+ "mov v28.16b, v8.16b\n"
+ "mov v29.16b, v9.16b\n"
+ "mov v30.16b, v10.16b\n"
+ "ldr q11, [x16, #0x30]\n" // fourth bias vector
+ "add x16, x16, #0x40\n" // advance the bias pointer by the 0x40 bytes consumed
+ "mov v15.16b, v11.16b\n"
+ "mov v19.16b, v11.16b\n"
+ "mov v23.16b, v11.16b\n"
+ "mov v27.16b, v11.16b\n"
+ "mov v31.16b, v11.16b\n"
+ "b 267f\n" // accumulators ready -> setup done
+ "248:" // Height 6: no bias - either zero the accumulators or preload them from the existing output
+ "tbz %x[flags], #0, 266f\n" // flags bit 0 clear -> no accumulate, zero everything at label 266
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" // x19 = output_offset field
+ "cmp x8, #0x20\n" // compare x8 with 32; consumed by the bge below
+ "add x25, x15, x19, LSL #1\n" // x25, x24, x23, x22, x21 = x15 + k * (x19 * 2) for k = 1..5
+ "add x24, x25, x19, LSL #1\n"
+ "add x23, x24, x19, LSL #1\n"
+ "add x22, x23, x19, LSL #1\n"
+ "add x21, x22, x19, LSL #1\n"
+ "bge 265f\n" // x8 >= 32 -> load four full vectors per row at label 265
+ "tbz x8, #4, 256f\n" // bit 4 of x8 clear -> fewer than 16 elements, start at partial_8_0
+ "ld1 { v8.8h }, [x15], #0x10\n" // load the first two full vectors of every row, post-incrementing each pointer by 16
+ "ld1 { v12.8h }, [x25], #0x10\n"
+ "ld1 { v16.8h }, [x24], #0x10\n"
+ "ld1 { v9.8h }, [x15], #0x10\n"
+ "ld1 { v13.8h }, [x25], #0x10\n"
+ "ld1 { v17.8h }, [x24], #0x10\n"
+ "ld1 { v20.8h }, [x23], #0x10\n"
+ "ld1 { v24.8h }, [x22], #0x10\n"
+ "ld1 { v28.8h }, [x21], #0x10\n"
+ "ld1 { v21.8h }, [x23], #0x10\n"
+ "ld1 { v25.8h }, [x22], #0x10\n"
+ "ld1 { v29.8h }, [x21], #0x10\n"
+ "tbz x8, #3, 252f\n" // bit 3 clear -> no full third vector, continue at partial_4_16
+ "ld1 { v10.8h }, [x15], #0x10\n" // third full vector of every row
+ "ld1 { v14.8h }, [x25], #0x10\n"
+ "ld1 { v18.8h }, [x24], #0x10\n"
+ "ld1 { v22.8h }, [x23], #0x10\n"
+ "ld1 { v26.8h }, [x22], #0x10\n"
+ "ld1 { v30.8h }, [x21], #0x10\n"
+ "tbz x8, #2, 250f\n" // bit 2 clear -> no group of four in the fourth vector
+ "ldr d11, [x15], #0x8\n" // lanes 0-3 of the fourth vector, post-increment by 8
+ "ldr d15, [x25], #0x8\n"
+ "ldr d19, [x24], #0x8\n"
+ "ldr d23, [x23], #0x8\n"
+ "ldr d27, [x22], #0x8\n"
+ "ldr d31, [x21], #0x8\n"
+ "tbz x8, #1, 249f\n" // bit 1 clear -> no further pair
+ "ld1 { v11.s }[2], [x15], #0x4\n" // halfword lanes 4-5, post-increment by 4
+ "mov x19, #0x3c\n" // x19 = bytes x15 has been advanced on this path (subtracted again at label 264)
+ "ld1 { v15.s }[2], [x25], #0x4\n"
+ "ld1 { v19.s }[2], [x24], #0x4\n"
+ "ld1 { v23.s }[2], [x23], #0x4\n"
+ "ld1 { v27.s }[2], [x22], #0x4\n"
+ "ld1 { v31.s }[2], [x21], #0x4\n"
+ "tbz x8, #0, 264f\n" // bit 0 clear -> finished
+ "ld1 { v11.h }[6], [x15]\n" // last odd element into lane 6
+ "ld1 { v15.h }[6], [x25]\n"
+ "ld1 { v19.h }[6], [x24]\n"
+ "ld1 { v23.h }[6], [x23]\n"
+ "ld1 { v27.h }[6], [x22]\n"
+ "ld1 { v31.h }[6], [x21]\n"
+ "b 264f\n"
+ "249:" // Height 6: Partial accumulate: partial_1_28 - at most one element, into lane 4
+ "mov x19, #0x38\n" // x15 has been advanced by 0x38 bytes on this path
+ "tbz x8, #0, 264f\n" // bit 0 clear -> nothing more to load
+ "ld1 { v11.h }[4], [x15]\n"
+ "ld1 { v15.h }[4], [x25]\n"
+ "ld1 { v19.h }[4], [x24]\n"
+ "ld1 { v23.h }[4], [x23]\n"
+ "ld1 { v27.h }[4], [x22]\n"
+ "ld1 { v31.h }[4], [x21]\n"
+ "b 264f\n"
+ "250:" // Height 6: Partial accumulate: partial_2_24 - pair and/or single element into the low lanes of the fourth vector
+ "tbz x8, #1, 251f\n" // bit 1 clear -> no pair
+ "ldr s11, [x15], #0x4\n" // lanes 0-1, post-increment by 4
+ "ldr s15, [x25], #0x4\n"
+ "mov x19, #0x34\n" // x15 has been advanced by 0x34 bytes on this path
+ "ldr s19, [x24], #0x4\n"
+ "ldr s23, [x23], #0x4\n"
+ "ldr s27, [x22], #0x4\n"
+ "ldr s31, [x21], #0x4\n"
+ "tbz x8, #0, 264f\n" // bit 0 clear -> finished
+ "ld1 { v11.h }[2], [x15]\n" // odd element into lane 2
+ "ld1 { v15.h }[2], [x25]\n"
+ "ld1 { v19.h }[2], [x24]\n"
+ "ld1 { v23.h }[2], [x23]\n"
+ "ld1 { v27.h }[2], [x22]\n"
+ "ld1 { v31.h }[2], [x21]\n"
+ "b 264f\n"
+ "251:" // Height 6: Partial accumulate: partial_1_24 - at most one element, into lane 0
+ "mov x19, #0x30\n" // x15 has been advanced by 0x30 bytes on this path
+ "tbz x8, #0, 264f\n" // bit 0 clear -> nothing more to load
+ "ldr h11, [x15, #0x0]\n"
+ "ldr h15, [x25, #0x0]\n"
+ "ldr h19, [x24, #0x0]\n"
+ "ldr h23, [x23, #0x0]\n"
+ "ldr h27, [x22, #0x0]\n"
+ "ldr h31, [x21, #0x0]\n"
+ "b 264f\n"
+ "252:" // Height 6: Partial accumulate: partial_4_16 - tail loaded into v10/v14/v18/v22/v26/v30
+ "tbz x8, #2, 254f\n" // bit 2 of x8 clear -> no group of four
+ "ldr d10, [x15], #0x8\n" // lanes 0-3, post-increment each pointer by 8
+ "ldr d14, [x25], #0x8\n"
+ "ldr d18, [x24], #0x8\n"
+ "ldr d22, [x23], #0x8\n"
+ "ldr d26, [x22], #0x8\n"
+ "ldr d30, [x21], #0x8\n"
+ "tbz x8, #1, 253f\n" // bit 1 clear -> no further pair
+ "ld1 { v10.s }[2], [x15], #0x4\n" // halfword lanes 4-5, post-increment by 4
+ "mov x19, #0x2c\n" // x19 = bytes x15 has been advanced on this path (subtracted again at label 264)
+ "ld1 { v14.s }[2], [x25], #0x4\n"
+ "ld1 { v18.s }[2], [x24], #0x4\n"
+ "ld1 { v22.s }[2], [x23], #0x4\n"
+ "ld1 { v26.s }[2], [x22], #0x4\n"
+ "ld1 { v30.s }[2], [x21], #0x4\n"
+ "tbz x8, #0, 264f\n" // bit 0 clear -> finished
+ "ld1 { v10.h }[6], [x15]\n" // last odd element into lane 6
+ "ld1 { v14.h }[6], [x25]\n"
+ "ld1 { v18.h }[6], [x24]\n"
+ "ld1 { v22.h }[6], [x23]\n"
+ "ld1 { v26.h }[6], [x22]\n"
+ "ld1 { v30.h }[6], [x21]\n"
+ "b 264f\n"
+ "253:" // Height 6: Partial accumulate: partial_1_20 - at most one element, into lane 4
+ "mov x19, #0x28\n" // x15 has been advanced by 0x28 bytes on this path
+ "tbz x8, #0, 264f\n" // bit 0 clear -> nothing more to load
+ "ld1 { v10.h }[4], [x15]\n"
+ "ld1 { v14.h }[4], [x25]\n"
+ "ld1 { v18.h }[4], [x24]\n"
+ "ld1 { v22.h }[4], [x23]\n"
+ "ld1 { v26.h }[4], [x22]\n"
+ "ld1 { v30.h }[4], [x21]\n"
+ "b 264f\n"
+ "254:" // Height 6: Partial accumulate: partial_2_16 - pair and/or single element into the low lanes
+ "tbz x8, #1, 255f\n" // bit 1 clear -> no pair
+ "ldr s10, [x15], #0x4\n" // lanes 0-1, post-increment by 4
+ "ldr s14, [x25], #0x4\n"
+ "mov x19, #0x24\n" // x15 has been advanced by 0x24 bytes on this path
+ "ldr s18, [x24], #0x4\n"
+ "ldr s22, [x23], #0x4\n"
+ "ldr s26, [x22], #0x4\n"
+ "ldr s30, [x21], #0x4\n"
+ "tbz x8, #0, 264f\n" // bit 0 clear -> finished
+ "ld1 { v10.h }[2], [x15]\n" // odd element into lane 2
+ "ld1 { v14.h }[2], [x25]\n"
+ "ld1 { v18.h }[2], [x24]\n"
+ "ld1 { v22.h }[2], [x23]\n"
+ "ld1 { v26.h }[2], [x22]\n"
+ "ld1 { v30.h }[2], [x21]\n"
+ "b 264f\n"
+ "255:" // Height 6: Partial accumulate: partial_1_16 - at most one element, into lane 0
+ "mov x19, #0x20\n" // x15 has been advanced by 0x20 bytes on this path
+ "tbz x8, #0, 264f\n" // bit 0 clear -> nothing more to load
+ "ldr h10, [x15, #0x0]\n"
+ "ldr h14, [x25, #0x0]\n"
+ "ldr h18, [x24, #0x0]\n"
+ "ldr h22, [x23, #0x0]\n"
+ "ldr h26, [x22, #0x0]\n"
+ "ldr h30, [x21, #0x0]\n"
+ "b 264f\n"
+ "256:" // Height 6: Partial accumulate: partial_8_0 - fewer than 16 elements: optional full first vector, then a tail into v9/v13/v17/v21/v25/v29
+ "tbz x8, #3, 260f\n" // bit 3 of x8 clear -> fewer than 8 elements, continue at partial_4_0
+ "ld1 { v8.8h }, [x15], #0x10\n" // full first vector of every row, post-increment by 16
+ "ld1 { v12.8h }, [x25], #0x10\n"
+ "ld1 { v16.8h }, [x24], #0x10\n"
+ "ld1 { v20.8h }, [x23], #0x10\n"
+ "ld1 { v24.8h }, [x22], #0x10\n"
+ "ld1 { v28.8h }, [x21], #0x10\n"
+ "tbz x8, #2, 258f\n" // bit 2 clear -> no group of four in the second vector
+ "ldr d9, [x15], #0x8\n" // lanes 0-3 of the second vector, post-increment by 8
+ "ldr d13, [x25], #0x8\n"
+ "ldr d17, [x24], #0x8\n"
+ "ldr d21, [x23], #0x8\n"
+ "ldr d25, [x22], #0x8\n"
+ "ldr d29, [x21], #0x8\n"
+ "tbz x8, #1, 257f\n" // bit 1 clear -> no further pair
+ "ld1 { v9.s }[2], [x15], #0x4\n" // halfword lanes 4-5, post-increment by 4
+ "mov x19, #0x1c\n" // x19 = bytes x15 has been advanced on this path (subtracted again at label 264)
+ "ld1 { v13.s }[2], [x25], #0x4\n"
+ "ld1 { v17.s }[2], [x24], #0x4\n"
+ "ld1 { v21.s }[2], [x23], #0x4\n"
+ "ld1 { v25.s }[2], [x22], #0x4\n"
+ "ld1 { v29.s }[2], [x21], #0x4\n"
+ "tbz x8, #0, 264f\n" // bit 0 clear -> finished
+ "ld1 { v9.h }[6], [x15]\n" // last odd element into lane 6
+ "ld1 { v13.h }[6], [x25]\n"
+ "ld1 { v17.h }[6], [x24]\n"
+ "ld1 { v21.h }[6], [x23]\n"
+ "ld1 { v25.h }[6], [x22]\n"
+ "ld1 { v29.h }[6], [x21]\n"
+ "b 264f\n"
+ "257:" // Height 6: Partial accumulate: partial_1_12 - at most one element, into lane 4
+ "mov x19, #0x18\n" // x15 has been advanced by 0x18 bytes on this path
+ "tbz x8, #0, 264f\n" // bit 0 clear -> nothing more to load
+ "ld1 { v9.h }[4], [x15]\n"
+ "ld1 { v13.h }[4], [x25]\n"
+ "ld1 { v17.h }[4], [x24]\n"
+ "ld1 { v21.h }[4], [x23]\n"
+ "ld1 { v25.h }[4], [x22]\n"
+ "ld1 { v29.h }[4], [x21]\n"
+ "b 264f\n"
+ "258:" // Height 6: Partial accumulate: partial_2_8 - pair and/or single element into the low lanes
+ "tbz x8, #1, 259f\n" // bit 1 clear -> no pair
+ "ldr s9, [x15], #0x4\n" // lanes 0-1, post-increment by 4
+ "ldr s13, [x25], #0x4\n"
+ "mov x19, #0x14\n" // x15 has been advanced by 0x14 bytes on this path
+ "ldr s17, [x24], #0x4\n"
+ "ldr s21, [x23], #0x4\n"
+ "ldr s25, [x22], #0x4\n"
+ "ldr s29, [x21], #0x4\n"
+ "tbz x8, #0, 264f\n" // bit 0 clear -> finished
+ "ld1 { v9.h }[2], [x15]\n" // odd element into lane 2
+ "ld1 { v13.h }[2], [x25]\n"
+ "ld1 { v17.h }[2], [x24]\n"
+ "ld1 { v21.h }[2], [x23]\n"
+ "ld1 { v25.h }[2], [x22]\n"
+ "ld1 { v29.h }[2], [x21]\n"
+ "b 264f\n"
+ "259:" // Height 6: Partial accumulate: partial_1_8 - at most one element, into lane 0
+ "mov x19, #0x10\n" // x15 has been advanced by 0x10 bytes on this path
+ "tbz x8, #0, 264f\n" // bit 0 clear -> nothing more to load
+ "ldr h9, [x15, #0x0]\n"
+ "ldr h13, [x25, #0x0]\n"
+ "ldr h17, [x24, #0x0]\n"
+ "ldr h21, [x23, #0x0]\n"
+ "ldr h25, [x22, #0x0]\n"
+ "ldr h29, [x21, #0x0]\n"
+ "b 264f\n"
+ "260:" // Height 6: Partial accumulate: partial_4_0 - fewer than 8 elements, all loaded into v8/v12/v16/v20/v24/v28
+ "tbz x8, #2, 262f\n" // bit 2 of x8 clear -> fewer than 4 elements
+ "ldr d8, [x15], #0x8\n" // lanes 0-3, post-increment each pointer by 8
+ "ldr d12, [x25], #0x8\n"
+ "ldr d16, [x24], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
+ "ldr d24, [x22], #0x8\n"
+ "ldr d28, [x21], #0x8\n"
+ "tbz x8, #1, 261f\n" // bit 1 clear -> no further pair
+ "ld1 { v8.s }[2], [x15], #0x4\n" // halfword lanes 4-5, post-increment by 4
+ "mov x19, #0xc\n" // x19 = bytes x15 has been advanced on this path
+ "ld1 { v12.s }[2], [x25], #0x4\n"
+ "ld1 { v16.s }[2], [x24], #0x4\n"
+ "ld1 { v20.s }[2], [x23], #0x4\n"
+ "ld1 { v24.s }[2], [x22], #0x4\n"
+ "ld1 { v28.s }[2], [x21], #0x4\n"
+ "tbz x8, #0, 264f\n" // bit 0 clear -> finished
+ "ld1 { v8.h }[6], [x15]\n" // last odd element into lane 6
+ "ld1 { v12.h }[6], [x25]\n"
+ "ld1 { v16.h }[6], [x24]\n"
+ "ld1 { v20.h }[6], [x23]\n"
+ "ld1 { v24.h }[6], [x22]\n"
+ "ld1 { v28.h }[6], [x21]\n"
+ "b 264f\n"
+ "261:" // Height 6: Partial accumulate: partial_1_4 - at most one element, into lane 4
+ "mov x19, #0x8\n" // x15 has been advanced by 8 bytes on this path
+ "tbz x8, #0, 264f\n" // bit 0 clear -> nothing more to load
+ "ld1 { v8.h }[4], [x15]\n"
+ "ld1 { v12.h }[4], [x25]\n"
+ "ld1 { v16.h }[4], [x24]\n"
+ "ld1 { v20.h }[4], [x23]\n"
+ "ld1 { v24.h }[4], [x22]\n"
+ "ld1 { v28.h }[4], [x21]\n"
+ "b 264f\n"
+ "262:" // Height 6: Partial accumulate: partial_2_0 - pair and/or single element into the low lanes
+ "tbz x8, #1, 263f\n" // bit 1 clear -> no pair
+ "ldr s8, [x15], #0x4\n" // lanes 0-1, post-increment by 4
+ "ldr s12, [x25], #0x4\n"
+ "mov x19, #0x4\n" // x15 has been advanced by 4 bytes on this path
+ "ldr s16, [x24], #0x4\n"
+ "ldr s20, [x23], #0x4\n"
+ "ldr s24, [x22], #0x4\n"
+ "ldr s28, [x21], #0x4\n"
+ "tbz x8, #0, 264f\n" // bit 0 clear -> finished
+ "ld1 { v8.h }[2], [x15]\n" // odd element into lane 2
+ "ld1 { v12.h }[2], [x25]\n"
+ "ld1 { v16.h }[2], [x24]\n"
+ "ld1 { v20.h }[2], [x23]\n"
+ "ld1 { v24.h }[2], [x22]\n"
+ "ld1 { v28.h }[2], [x21]\n"
+ "b 264f\n"
+ "263:" // Height 6: Partial accumulate: partial_1_0 - loads lane 0 unconditionally (no bit-0 test on this path)
+ "ldr h8, [x15, #0x0]\n"
+ "mov x19, #0x0\n" // x15 has not been advanced on this path
+ "ldr h12, [x25, #0x0]\n"
+ "ldr h16, [x24, #0x0]\n"
+ "ldr h20, [x23, #0x0]\n"
+ "ldr h24, [x22, #0x0]\n"
+ "ldr h28, [x21, #0x0]\n"
+ "264:" // Height 6: Partial accumulate: Done
+ "sub x15, x15, x19\n" // rewind x15 by the bytes consumed above; only x15 is restored, x25..x21 are left advanced
+ "b 267f\n" // skip the full-accumulate and zeroing blocks
+ "265:" // Height 6: full accumulate - preload all 24 accumulators with four 128-bit vectors from each of the six row pointers
+ "ldr q8, [x15, #0x0]\n" // x15 -> v8..v11
+ "ldr q9, [x15, #0x10]\n"
+ "ldr q10, [x15, #0x20]\n"
+ "ldr q11, [x15, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n" // x25 -> v12..v15
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
+ "ldr q16, [x24, #0x0]\n" // x24 -> v16..v19
+ "ldr q17, [x24, #0x10]\n"
+ "ldr q18, [x24, #0x20]\n"
+ "ldr q19, [x24, #0x30]\n"
+ "ldr q20, [x23, #0x0]\n" // x23 -> v20..v23
+ "ldr q21, [x23, #0x10]\n"
+ "ldr q22, [x23, #0x20]\n"
+ "ldr q23, [x23, #0x30]\n"
+ "ldr q24, [x22, #0x0]\n" // x22 -> v24..v27
+ "ldr q25, [x22, #0x10]\n"
+ "ldr q26, [x22, #0x20]\n"
+ "ldr q27, [x22, #0x30]\n"
+ "ldr q28, [x21, #0x0]\n" // x21 -> v28..v31
+ "ldr q29, [x21, #0x10]\n"
+ "ldr q30, [x21, #0x20]\n"
+ "ldr q31, [x21, #0x30]\n"
+ "b 267f\n" // skip the zeroing block
+ "266:" // Height 6: no accumulate - clear all 24 accumulators v8..v31
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "movi v20.16b, #0x0\n"
+ "movi v21.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v26.16b, #0x0\n"
+ "movi v27.16b, #0x0\n"
+ "movi v28.16b, #0x0\n"
+ "movi v29.16b, #0x0\n"
+ "movi v30.16b, #0x0\n"
+ "movi v31.16b, #0x0\n"
+ "267:" // Height 6: setup done
+ "mov x14, #0x0\n" // x14 = 0; used as the index into string_lengths / input_ptr at label 268
+ "268:" // Height 6: String loop - fetch the length for index x14 and derive six input pointers (x12, x28, x26, x24, x22, x20)
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" // x20 = string_lengths field (used as a pointer below)
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" // x19 = input_offset field
+ "ldr w13, [x20, x14, LSL #0x2]\n" // w13 = 32-bit entry number x14 of string_lengths
+ "tbz %x[flags], #3, 269f\n" // flags bit 3 clear -> direct input addressing at label 269
+ "ldr x20, [%x[input_ptr], x14, LSL #0x3]\n" // indirect case: x20 = 64-bit entry number x14 of input_ptr
+ "add x20, x20, x19, LSL #3\n" // skip x19 eight-byte entries
+ "ldr x12, [x20, #0x0]\n" // six consecutive pointers; x24 and x22 are reused here, overwriting the values set at label 248
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x20, [x20, #0x28]\n" // x20 is overwritten last because it is also the table base
+ "cbnz x14, 270f\n" // the initial-column adjustment below is applied only when x14 == 0
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" // x19 = input_initial_col field
+ "add x12, x12, x19, LSL #1\n" // advance every pointer by x19 two-byte elements
+ "add x28, x28, x19, LSL #1\n"
+ "add x26, x26, x19, LSL #1\n"
+ "add x24, x24, x19, LSL #1\n"
+ "add x22, x22, x19, LSL #1\n"
+ "add x20, x20, x19, LSL #1\n"
+ "b 270f\n"
+ "269:" // Height 6: setup direct input - pointers are input_ptr plus multiples of x19 * 2 bytes
+ "mov x12, %x[input_ptr]\n"
+ "add x28, x12, x19, LSL #1\n" // each following pointer = previous + x19 * 2
+ "add x26, x28, x19, LSL #1\n"
+ "add x24, x26, x19, LSL #1\n"
+ "add x22, x24, x19, LSL #1\n"
+ "add x20, x22, x19, LSL #1\n"
+ "270:" // Height 6: input setup done
+ "cmp x13, #0x8\n"
+ "blt 273f\n" // fewer than 8 elements in this string -> label 273 (outside this chunk)
+ "ldr q0, [x12, #0x0]\n" // preload one 128-bit vector from each of the six input pointers into v0..v5
+ "ldr q1, [x28, #0x0]\n"
+ "cmp x13, #0x10\n" // flags for the blt below; the loads in between do not modify them
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q4, [x22, #0x0]\n"
+ "ldr q5, [x20, #0x0]\n"
+ "ldr q6, [x17, #0x0]\n" // first 128-bit vector from the B pointer
+ "blt 272f\n" // fewer than 16 left -> single-iteration block at label 272
+ "271:" // Height 6: Multiply loop: Main loop head - per pass: 8 lanes of v0..v5 times 0x200 bytes from x17, accumulated into v8..v31 with half-precision fmla
+ "fmla v8.8h, v6.8h, v0.h[0]\n" // accumulator for row r / column group c is v(8 + 4r + c); each fmla multiplies a vector from x17 by one lane of v0..v5
+ "ldr d7, [x17, #0x10]\n" // every 128-bit vector from x17 is fetched as a 64-bit low half (ldr d) ...
+ "fmla v12.8h, v6.8h, v1.h[0]\n"
+ "ldr x11, [x17, #0x18]\n" // ... plus a 64-bit high half through a general register
+ "fmla v16.8h, v6.8h, v2.h[0]\n"
+ "ldr x10, [x17, #0x28]\n"
+ "fmla v20.8h, v6.8h, v3.h[0]\n"
+ "add x12, x12, #0x10\n" // advance the first input pointer by 16 bytes
+ "fmla v24.8h, v6.8h, v4.h[0]\n"
+ "mov v7.d[1], x11\n" // insert the high half, completing v7
+ "fmla v28.8h, v6.8h, v5.h[0]\n"
+ "prfm pldl1keep, [x12, #0x80]\n" // prefetch 0x80 bytes ahead on this input pointer
+ "fmla v9.8h, v7.8h, v0.h[0]\n"
+ "ldr d6, [x17, #0x20]\n"
+ "fmla v13.8h, v7.8h, v1.h[0]\n"
+ "ldr x11, [x17, #0x38]\n"
+ "fmla v17.8h, v7.8h, v2.h[0]\n"
+ "ldr x9, [x12, #0x8]\n" // high half of the next v0, kept in x9 until the end of the pass
+ "fmla v21.8h, v7.8h, v3.h[0]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v25.8h, v7.8h, v4.h[0]\n"
+ "ldr x10, [x17, #0x48]\n"
+ "fmla v29.8h, v7.8h, v5.h[0]\n"
+ "ldr d7, [x17, #0x30]\n"
+ "fmla v10.8h, v6.8h, v0.h[0]\n"
+ "add x28, x28, #0x10\n" // advance the second input pointer
+ "fmla v14.8h, v6.8h, v1.h[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "fmla v18.8h, v6.8h, v2.h[0]\n"
+ "mov v7.d[1], x11\n"
+ "fmla v22.8h, v6.8h, v3.h[0]\n"
+ "ldr x11, [x17, #0x58]\n"
+ "fmla v26.8h, v6.8h, v4.h[0]\n"
+ "ldr x27, [x28, #0x8]\n" // high half of the next v1
+ "fmla v30.8h, v6.8h, v5.h[0]\n"
+ "ldr d6, [x17, #0x40]\n"
+ "fmla v11.8h, v7.8h, v0.h[0]\n"
+ "add x26, x26, #0x10\n" // advance the third input pointer
+ "fmla v15.8h, v7.8h, v1.h[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "fmla v19.8h, v7.8h, v2.h[0]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v23.8h, v7.8h, v3.h[0]\n"
+ "ldr x10, [x17, #0x68]\n"
+ "fmla v27.8h, v7.8h, v4.h[0]\n"
+ "ldr x25, [x26, #0x8]\n" // high half of the next v2 (x25 is reused as a temporary here)
+ "fmla v31.8h, v7.8h, v5.h[0]\n"
+ "ldr d7, [x17, #0x50]\n"
+ "fmla v8.8h, v6.8h, v0.h[1]\n" // lane 1 of the input vectors starts here (B offsets 0x40..0x7f)
+ "add x24, x24, #0x10\n" // advance the fourth input pointer
+ "fmla v12.8h, v6.8h, v1.h[1]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "fmla v16.8h, v6.8h, v2.h[1]\n"
+ "mov v7.d[1], x11\n"
+ "fmla v20.8h, v6.8h, v3.h[1]\n"
+ "ldr x11, [x17, #0x78]\n"
+ "fmla v24.8h, v6.8h, v4.h[1]\n"
+ "ldr x23, [x24, #0x8]\n" // high half of the next v3
+ "fmla v28.8h, v6.8h, v5.h[1]\n"
+ "ldr d6, [x17, #0x60]\n"
+ "fmla v9.8h, v7.8h, v0.h[1]\n"
+ "add x22, x22, #0x10\n" // advance the fifth input pointer
+ "fmla v13.8h, v7.8h, v1.h[1]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "fmla v17.8h, v7.8h, v2.h[1]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v21.8h, v7.8h, v3.h[1]\n"
+ "ldr x10, [x17, #0x88]\n"
+ "fmla v25.8h, v7.8h, v4.h[1]\n"
+ "ldr x21, [x22, #0x8]\n" // high half of the next v4
+ "fmla v29.8h, v7.8h, v5.h[1]\n"
+ "ldr d7, [x17, #0x70]\n"
+ "fmla v10.8h, v6.8h, v0.h[1]\n"
+ "add x20, x20, #0x10\n" // advance the sixth input pointer
+ "fmla v14.8h, v6.8h, v1.h[1]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "fmla v18.8h, v6.8h, v2.h[1]\n"
+ "mov v7.d[1], x11\n"
+ "fmla v22.8h, v6.8h, v3.h[1]\n"
+ "ldr x11, [x17, #0x98]\n"
+ "fmla v26.8h, v6.8h, v4.h[1]\n"
+ "ldr x19, [x20, #0x8]\n" // high half of the next v5
+ "fmla v30.8h, v6.8h, v5.h[1]\n"
+ "ldr d6, [x17, #0x80]\n"
+ "fmla v11.8h, v7.8h, v0.h[1]\n"
+ "sub x13, x13, #0x8\n" // eight elements of this string consumed by this pass
+ "fmla v15.8h, v7.8h, v1.h[1]\n"
+ "cmp x13, #0x10\n" // flags for the bge at the bottom; nothing in between sets flags
+ "fmla v19.8h, v7.8h, v2.h[1]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v23.8h, v7.8h, v3.h[1]\n"
+ "ldr x10, [x17, #0xa8]\n"
+ "fmla v27.8h, v7.8h, v4.h[1]\n"
+ "fmla v31.8h, v7.8h, v5.h[1]\n"
+ "ldr d7, [x17, #0x90]\n"
+ "fmla v8.8h, v6.8h, v0.h[2]\n" // lane 2 (B offsets 0x80..0xbf)
+ "fmla v12.8h, v6.8h, v1.h[2]\n"
+ "fmla v16.8h, v6.8h, v2.h[2]\n"
+ "mov v7.d[1], x11\n"
+ "fmla v20.8h, v6.8h, v3.h[2]\n"
+ "ldr x11, [x17, #0xb8]\n"
+ "fmla v24.8h, v6.8h, v4.h[2]\n"
+ "fmla v28.8h, v6.8h, v5.h[2]\n"
+ "ldr d6, [x17, #0xa0]\n"
+ "fmla v9.8h, v7.8h, v0.h[2]\n"
+ "fmla v13.8h, v7.8h, v1.h[2]\n"
+ "fmla v17.8h, v7.8h, v2.h[2]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v21.8h, v7.8h, v3.h[2]\n"
+ "ldr x10, [x17, #0xc8]\n"
+ "fmla v25.8h, v7.8h, v4.h[2]\n"
+ "fmla v29.8h, v7.8h, v5.h[2]\n"
+ "ldr d7, [x17, #0xb0]\n"
+ "fmla v10.8h, v6.8h, v0.h[2]\n"
+ "fmla v14.8h, v6.8h, v1.h[2]\n"
+ "fmla v18.8h, v6.8h, v2.h[2]\n"
+ "mov v7.d[1], x11\n"
+ "fmla v22.8h, v6.8h, v3.h[2]\n"
+ "ldr x11, [x17, #0xd8]\n"
+ "fmla v26.8h, v6.8h, v4.h[2]\n"
+ "fmla v30.8h, v6.8h, v5.h[2]\n"
+ "ldr d6, [x17, #0xc0]\n"
+ "fmla v11.8h, v7.8h, v0.h[2]\n"
+ "fmla v15.8h, v7.8h, v1.h[2]\n"
+ "fmla v19.8h, v7.8h, v2.h[2]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v23.8h, v7.8h, v3.h[2]\n"
+ "ldr x10, [x17, #0xe8]\n"
+ "fmla v27.8h, v7.8h, v4.h[2]\n"
+ "fmla v31.8h, v7.8h, v5.h[2]\n"
+ "ldr d7, [x17, #0xd0]\n"
+ "fmla v8.8h, v6.8h, v0.h[3]\n" // lane 3 (B offsets 0xc0..0xff)
+ "fmla v12.8h, v6.8h, v1.h[3]\n"
+ "fmla v16.8h, v6.8h, v2.h[3]\n"
+ "mov v7.d[1], x11\n"
+ "fmla v20.8h, v6.8h, v3.h[3]\n"
+ "ldr x11, [x17, #0xf8]\n"
+ "fmla v24.8h, v6.8h, v4.h[3]\n"
+ "fmla v28.8h, v6.8h, v5.h[3]\n"
+ "ldr d6, [x17, #0xe0]\n"
+ "fmla v9.8h, v7.8h, v0.h[3]\n"
+ "fmla v13.8h, v7.8h, v1.h[3]\n"
+ "fmla v17.8h, v7.8h, v2.h[3]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v21.8h, v7.8h, v3.h[3]\n"
+ "ldr x10, [x17, #0x108]\n"
+ "fmla v25.8h, v7.8h, v4.h[3]\n"
+ "fmla v29.8h, v7.8h, v5.h[3]\n"
+ "ldr d7, [x17, #0xf0]\n"
+ "fmla v10.8h, v6.8h, v0.h[3]\n"
+ "fmla v14.8h, v6.8h, v1.h[3]\n"
+ "fmla v18.8h, v6.8h, v2.h[3]\n"
+ "mov v7.d[1], x11\n"
+ "fmla v22.8h, v6.8h, v3.h[3]\n"
+ "ldr x11, [x17, #0x118]\n"
+ "fmla v26.8h, v6.8h, v4.h[3]\n"
+ "fmla v30.8h, v6.8h, v5.h[3]\n"
+ "ldr d6, [x17, #0x100]\n"
+ "fmla v11.8h, v7.8h, v0.h[3]\n"
+ "fmla v15.8h, v7.8h, v1.h[3]\n"
+ "fmla v19.8h, v7.8h, v2.h[3]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v23.8h, v7.8h, v3.h[3]\n"
+ "ldr x10, [x17, #0x128]\n"
+ "fmla v27.8h, v7.8h, v4.h[3]\n"
+ "fmla v31.8h, v7.8h, v5.h[3]\n"
+ "ldr d7, [x17, #0x110]\n"
+ "fmla v8.8h, v6.8h, v0.h[4]\n" // lane 4 (B offsets 0x100..0x13f)
+ "fmla v12.8h, v6.8h, v1.h[4]\n"
+ "fmla v16.8h, v6.8h, v2.h[4]\n"
+ "mov v7.d[1], x11\n"
+ "fmla v20.8h, v6.8h, v3.h[4]\n"
+ "ldr x11, [x17, #0x138]\n"
+ "fmla v24.8h, v6.8h, v4.h[4]\n"
+ "fmla v28.8h, v6.8h, v5.h[4]\n"
+ "ldr d6, [x17, #0x120]\n"
+ "fmla v9.8h, v7.8h, v0.h[4]\n"
+ "fmla v13.8h, v7.8h, v1.h[4]\n"
+ "fmla v17.8h, v7.8h, v2.h[4]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v21.8h, v7.8h, v3.h[4]\n"
+ "ldr x10, [x17, #0x148]\n"
+ "fmla v25.8h, v7.8h, v4.h[4]\n"
+ "fmla v29.8h, v7.8h, v5.h[4]\n"
+ "ldr d7, [x17, #0x130]\n"
+ "fmla v10.8h, v6.8h, v0.h[4]\n"
+ "fmla v14.8h, v6.8h, v1.h[4]\n"
+ "fmla v18.8h, v6.8h, v2.h[4]\n"
+ "mov v7.d[1], x11\n"
+ "fmla v22.8h, v6.8h, v3.h[4]\n"
+ "ldr x11, [x17, #0x158]\n"
+ "fmla v26.8h, v6.8h, v4.h[4]\n"
+ "fmla v30.8h, v6.8h, v5.h[4]\n"
+ "ldr d6, [x17, #0x140]\n"
+ "fmla v11.8h, v7.8h, v0.h[4]\n"
+ "fmla v15.8h, v7.8h, v1.h[4]\n"
+ "fmla v19.8h, v7.8h, v2.h[4]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v23.8h, v7.8h, v3.h[4]\n"
+ "ldr x10, [x17, #0x168]\n"
+ "fmla v27.8h, v7.8h, v4.h[4]\n"
+ "fmla v31.8h, v7.8h, v5.h[4]\n"
+ "ldr d7, [x17, #0x150]\n"
+ "fmla v8.8h, v6.8h, v0.h[5]\n" // lane 5 (B offsets 0x140..0x17f)
+ "fmla v12.8h, v6.8h, v1.h[5]\n"
+ "fmla v16.8h, v6.8h, v2.h[5]\n"
+ "mov v7.d[1], x11\n"
+ "fmla v20.8h, v6.8h, v3.h[5]\n"
+ "ldr x11, [x17, #0x178]\n"
+ "fmla v24.8h, v6.8h, v4.h[5]\n"
+ "fmla v28.8h, v6.8h, v5.h[5]\n"
+ "ldr d6, [x17, #0x160]\n"
+ "fmla v9.8h, v7.8h, v0.h[5]\n"
+ "fmla v13.8h, v7.8h, v1.h[5]\n"
+ "fmla v17.8h, v7.8h, v2.h[5]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v21.8h, v7.8h, v3.h[5]\n"
+ "ldr x10, [x17, #0x188]\n"
+ "fmla v25.8h, v7.8h, v4.h[5]\n"
+ "fmla v29.8h, v7.8h, v5.h[5]\n"
+ "ldr d7, [x17, #0x170]\n"
+ "fmla v10.8h, v6.8h, v0.h[5]\n"
+ "fmla v14.8h, v6.8h, v1.h[5]\n"
+ "fmla v18.8h, v6.8h, v2.h[5]\n"
+ "mov v7.d[1], x11\n"
+ "fmla v22.8h, v6.8h, v3.h[5]\n"
+ "ldr x11, [x17, #0x198]\n"
+ "fmla v26.8h, v6.8h, v4.h[5]\n"
+ "fmla v30.8h, v6.8h, v5.h[5]\n"
+ "ldr d6, [x17, #0x180]\n"
+ "fmla v11.8h, v7.8h, v0.h[5]\n"
+ "fmla v15.8h, v7.8h, v1.h[5]\n"
+ "fmla v19.8h, v7.8h, v2.h[5]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v23.8h, v7.8h, v3.h[5]\n"
+ "ldr x10, [x17, #0x1a8]\n"
+ "fmla v27.8h, v7.8h, v4.h[5]\n"
+ "fmla v31.8h, v7.8h, v5.h[5]\n"
+ "ldr d7, [x17, #0x190]\n"
+ "fmla v8.8h, v6.8h, v0.h[6]\n" // lane 6 (B offsets 0x180..0x1bf)
+ "fmla v12.8h, v6.8h, v1.h[6]\n"
+ "fmla v16.8h, v6.8h, v2.h[6]\n"
+ "mov v7.d[1], x11\n"
+ "fmla v20.8h, v6.8h, v3.h[6]\n"
+ "ldr x11, [x17, #0x1b8]\n"
+ "fmla v24.8h, v6.8h, v4.h[6]\n"
+ "fmla v28.8h, v6.8h, v5.h[6]\n"
+ "ldr d6, [x17, #0x1a0]\n"
+ "fmla v9.8h, v7.8h, v0.h[6]\n"
+ "fmla v13.8h, v7.8h, v1.h[6]\n"
+ "fmla v17.8h, v7.8h, v2.h[6]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v21.8h, v7.8h, v3.h[6]\n"
+ "ldr x10, [x17, #0x1c8]\n"
+ "fmla v25.8h, v7.8h, v4.h[6]\n"
+ "fmla v29.8h, v7.8h, v5.h[6]\n"
+ "ldr d7, [x17, #0x1b0]\n"
+ "fmla v10.8h, v6.8h, v0.h[6]\n"
+ "fmla v14.8h, v6.8h, v1.h[6]\n"
+ "fmla v18.8h, v6.8h, v2.h[6]\n"
+ "mov v7.d[1], x11\n"
+ "fmla v22.8h, v6.8h, v3.h[6]\n"
+ "ldr x11, [x17, #0x1d8]\n"
+ "fmla v26.8h, v6.8h, v4.h[6]\n"
+ "fmla v30.8h, v6.8h, v5.h[6]\n"
+ "ldr d6, [x17, #0x1c0]\n"
+ "fmla v11.8h, v7.8h, v0.h[6]\n"
+ "fmla v15.8h, v7.8h, v1.h[6]\n"
+ "fmla v19.8h, v7.8h, v2.h[6]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v23.8h, v7.8h, v3.h[6]\n"
+ "ldr x10, [x17, #0x1e8]\n"
+ "fmla v27.8h, v7.8h, v4.h[6]\n"
+ "fmla v31.8h, v7.8h, v5.h[6]\n"
+ "ldr d7, [x17, #0x1d0]\n"
+ "fmla v8.8h, v6.8h, v0.h[7]\n" // lane 7 (B offsets 0x1c0..0x1ff)
+ "fmla v12.8h, v6.8h, v1.h[7]\n"
+ "fmla v16.8h, v6.8h, v2.h[7]\n"
+ "mov v7.d[1], x11\n"
+ "fmla v20.8h, v6.8h, v3.h[7]\n"
+ "ldr x11, [x17, #0x1f8]\n"
+ "fmla v24.8h, v6.8h, v4.h[7]\n"
+ "fmla v28.8h, v6.8h, v5.h[7]\n"
+ "ldr d6, [x17, #0x1e0]\n"
+ "fmla v9.8h, v7.8h, v0.h[7]\n"
+ "fmla v13.8h, v7.8h, v1.h[7]\n"
+ "fmla v17.8h, v7.8h, v2.h[7]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v21.8h, v7.8h, v3.h[7]\n"
+ "fmla v25.8h, v7.8h, v4.h[7]\n"
+ "fmla v29.8h, v7.8h, v5.h[7]\n"
+ "ldr d7, [x17, #0x1f0]\n"
+ "fmla v10.8h, v6.8h, v0.h[7]\n"
+ "add x17, x17, #0x200\n" // advance the B pointer past the 0x200 bytes used by this pass
+ "fmla v14.8h, v6.8h, v1.h[7]\n"
+ "ldr x10, [x17, #0x8]\n" // high half of the first B vector of the next pass (x17 already advanced)
+ "fmla v18.8h, v6.8h, v2.h[7]\n"
+ "mov v7.d[1], x11\n"
+ "fmla v22.8h, v6.8h, v3.h[7]\n"
+ "fmla v26.8h, v6.8h, v4.h[7]\n"
+ "fmla v30.8h, v6.8h, v5.h[7]\n"
+ "ldr d6, [x17, #0x0]\n" // low half of the next pass's first B vector, after the last use of the old v6
+ "fmla v11.8h, v7.8h, v0.h[7]\n"
+ "ldr d0, [x12, #0x0]\n" // reload each input vector right after its last use: low half here ...
+ "fmla v15.8h, v7.8h, v1.h[7]\n"
+ "ldr d1, [x28, #0x0]\n"
+ "fmla v19.8h, v7.8h, v2.h[7]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v23.8h, v7.8h, v3.h[7]\n"
+ "mov v0.d[1], x9\n" // ... high half from the register loaded earlier in the pass
+ "fmla v27.8h, v7.8h, v4.h[7]\n"
+ "mov v1.d[1], x27\n"
+ "fmla v31.8h, v7.8h, v5.h[7]\n"
+ "ldr d2, [x26, #0x0]\n"
+ "ldr d3, [x24, #0x0]\n"
+ "ldr d4, [x22, #0x0]\n"
+ "mov v2.d[1], x25\n"
+ "ldr d5, [x20, #0x0]\n"
+ "mov v3.d[1], x23\n"
+ "mov v4.d[1], x21\n"
+ "mov v5.d[1], x19\n"
+ "bge 271b\n" // uses the flags from "cmp x13, #0x10" above: repeat while at least 16 elements remain
+ "272:" // Height 6: Multiply loop: Single iteration only
+ "fmla v8.8h, v6.8h, v0.h[0]\n"
+ "ldr q7, [x17, #0x10]\n"
+ "fmla v12.8h, v6.8h, v1.h[0]\n"
+ "sub x13, x13, #0x8\n"
+ "fmla v16.8h, v6.8h, v2.h[0]\n"
+ "add x12, x12, #0x10\n"
+ "fmla v20.8h, v6.8h, v3.h[0]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "fmla v24.8h, v6.8h, v4.h[0]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v28.8h, v6.8h, v5.h[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "fmla v9.8h, v7.8h, v0.h[0]\n"
+ "ldr q6, [x17, #0x20]\n"
+ "fmla v13.8h, v7.8h, v1.h[0]\n"
+ "add x26, x26, #0x10\n"
+ "fmla v17.8h, v7.8h, v2.h[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "fmla v21.8h, v7.8h, v3.h[0]\n"
+ "add x24, x24, #0x10\n"
+ "fmla v25.8h, v7.8h, v4.h[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "fmla v29.8h, v7.8h, v5.h[0]\n"
+ "ldr q7, [x17, #0x30]\n"
+ "fmla v10.8h, v6.8h, v0.h[0]\n"
+ "add x22, x22, #0x10\n"
+ "fmla v14.8h, v6.8h, v1.h[0]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "fmla v18.8h, v6.8h, v2.h[0]\n"
+ "add x20, x20, #0x10\n"
+ "fmla v22.8h, v6.8h, v3.h[0]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "fmla v26.8h, v6.8h, v4.h[0]\n"
+ "fmla v30.8h, v6.8h, v5.h[0]\n"
+ "ldr q6, [x17, #0x40]\n"
+ "fmla v11.8h, v7.8h, v0.h[0]\n"
+ "fmla v15.8h, v7.8h, v1.h[0]\n"
+ "fmla v19.8h, v7.8h, v2.h[0]\n"
+ "fmla v23.8h, v7.8h, v3.h[0]\n"
+ "fmla v27.8h, v7.8h, v4.h[0]\n"
+ "fmla v31.8h, v7.8h, v5.h[0]\n"
+ "ldr q7, [x17, #0x50]\n"
+ "fmla v8.8h, v6.8h, v0.h[1]\n"
+ "fmla v12.8h, v6.8h, v1.h[1]\n"
+ "fmla v16.8h, v6.8h, v2.h[1]\n"
+ "fmla v20.8h, v6.8h, v3.h[1]\n"
+ "fmla v24.8h, v6.8h, v4.h[1]\n"
+ "fmla v28.8h, v6.8h, v5.h[1]\n"
+ "ldr q6, [x17, #0x60]\n"
+ "fmla v9.8h, v7.8h, v0.h[1]\n"
+ "fmla v13.8h, v7.8h, v1.h[1]\n"
+ "fmla v17.8h, v7.8h, v2.h[1]\n"
+ "fmla v21.8h, v7.8h, v3.h[1]\n"
+ "fmla v25.8h, v7.8h, v4.h[1]\n"
+ "fmla v29.8h, v7.8h, v5.h[1]\n"
+ "ldr q7, [x17, #0x70]\n"
+ "fmla v10.8h, v6.8h, v0.h[1]\n"
+ "fmla v14.8h, v6.8h, v1.h[1]\n"
+ "fmla v18.8h, v6.8h, v2.h[1]\n"
+ "fmla v22.8h, v6.8h, v3.h[1]\n"
+ "fmla v26.8h, v6.8h, v4.h[1]\n"
+ "fmla v30.8h, v6.8h, v5.h[1]\n"
+ "ldr q6, [x17, #0x80]\n"
+ "fmla v11.8h, v7.8h, v0.h[1]\n"
+ "fmla v15.8h, v7.8h, v1.h[1]\n"
+ "fmla v19.8h, v7.8h, v2.h[1]\n"
+ "fmla v23.8h, v7.8h, v3.h[1]\n"
+ "fmla v27.8h, v7.8h, v4.h[1]\n"
+ "fmla v31.8h, v7.8h, v5.h[1]\n"
+ "ldr q7, [x17, #0x90]\n"
+ "fmla v8.8h, v6.8h, v0.h[2]\n"
+ "fmla v12.8h, v6.8h, v1.h[2]\n"
+ "fmla v16.8h, v6.8h, v2.h[2]\n"
+ "fmla v20.8h, v6.8h, v3.h[2]\n"
+ "fmla v24.8h, v6.8h, v4.h[2]\n"
+ "fmla v28.8h, v6.8h, v5.h[2]\n"
+ "ldr q6, [x17, #0xa0]\n"
+ "fmla v9.8h, v7.8h, v0.h[2]\n"
+ "fmla v13.8h, v7.8h, v1.h[2]\n"
+ "fmla v17.8h, v7.8h, v2.h[2]\n"
+ "fmla v21.8h, v7.8h, v3.h[2]\n"
+ "fmla v25.8h, v7.8h, v4.h[2]\n"
+ "fmla v29.8h, v7.8h, v5.h[2]\n"
+ "ldr q7, [x17, #0xb0]\n"
+ "fmla v10.8h, v6.8h, v0.h[2]\n"
+ "fmla v14.8h, v6.8h, v1.h[2]\n"
+ "fmla v18.8h, v6.8h, v2.h[2]\n"
+ "fmla v22.8h, v6.8h, v3.h[2]\n"
+ "fmla v26.8h, v6.8h, v4.h[2]\n"
+ "fmla v30.8h, v6.8h, v5.h[2]\n"
+ "ldr q6, [x17, #0xc0]\n"
+ "fmla v11.8h, v7.8h, v0.h[2]\n"
+ "fmla v15.8h, v7.8h, v1.h[2]\n"
+ "fmla v19.8h, v7.8h, v2.h[2]\n"
+ "fmla v23.8h, v7.8h, v3.h[2]\n"
+ "fmla v27.8h, v7.8h, v4.h[2]\n"
+ "fmla v31.8h, v7.8h, v5.h[2]\n"
+ "ldr q7, [x17, #0xd0]\n"
+ "fmla v8.8h, v6.8h, v0.h[3]\n"
+ "fmla v12.8h, v6.8h, v1.h[3]\n"
+ "fmla v16.8h, v6.8h, v2.h[3]\n"
+ "fmla v20.8h, v6.8h, v3.h[3]\n"
+ "fmla v24.8h, v6.8h, v4.h[3]\n"
+ "fmla v28.8h, v6.8h, v5.h[3]\n"
+ "ldr q6, [x17, #0xe0]\n"
+ "fmla v9.8h, v7.8h, v0.h[3]\n"
+ "fmla v13.8h, v7.8h, v1.h[3]\n"
+ "fmla v17.8h, v7.8h, v2.h[3]\n"
+ "fmla v21.8h, v7.8h, v3.h[3]\n"
+ "fmla v25.8h, v7.8h, v4.h[3]\n"
+ "fmla v29.8h, v7.8h, v5.h[3]\n"
+ "ldr q7, [x17, #0xf0]\n"
+ "fmla v10.8h, v6.8h, v0.h[3]\n"
+ "fmla v14.8h, v6.8h, v1.h[3]\n"
+ "fmla v18.8h, v6.8h, v2.h[3]\n"
+ "fmla v22.8h, v6.8h, v3.h[3]\n"
+ "fmla v26.8h, v6.8h, v4.h[3]\n"
+ "fmla v30.8h, v6.8h, v5.h[3]\n"
+ "ldr q6, [x17, #0x100]\n"
+ "fmla v11.8h, v7.8h, v0.h[3]\n"
+ "fmla v15.8h, v7.8h, v1.h[3]\n"
+ "fmla v19.8h, v7.8h, v2.h[3]\n"
+ "fmla v23.8h, v7.8h, v3.h[3]\n"
+ "fmla v27.8h, v7.8h, v4.h[3]\n"
+ "fmla v31.8h, v7.8h, v5.h[3]\n"
+ "ldr q7, [x17, #0x110]\n"
+ "fmla v8.8h, v6.8h, v0.h[4]\n"
+ "fmla v12.8h, v6.8h, v1.h[4]\n"
+ "fmla v16.8h, v6.8h, v2.h[4]\n"
+ "fmla v20.8h, v6.8h, v3.h[4]\n"
+ "fmla v24.8h, v6.8h, v4.h[4]\n"
+ "fmla v28.8h, v6.8h, v5.h[4]\n"
+ "ldr q6, [x17, #0x120]\n"
+ "fmla v9.8h, v7.8h, v0.h[4]\n"
+ "fmla v13.8h, v7.8h, v1.h[4]\n"
+ "fmla v17.8h, v7.8h, v2.h[4]\n"
+ "fmla v21.8h, v7.8h, v3.h[4]\n"
+ "fmla v25.8h, v7.8h, v4.h[4]\n"
+ "fmla v29.8h, v7.8h, v5.h[4]\n"
+ "ldr q7, [x17, #0x130]\n"
+ "fmla v10.8h, v6.8h, v0.h[4]\n"
+ "fmla v14.8h, v6.8h, v1.h[4]\n"
+ "fmla v18.8h, v6.8h, v2.h[4]\n"
+ "fmla v22.8h, v6.8h, v3.h[4]\n"
+ "fmla v26.8h, v6.8h, v4.h[4]\n"
+ "fmla v30.8h, v6.8h, v5.h[4]\n"
+ "ldr q6, [x17, #0x140]\n"
+ "fmla v11.8h, v7.8h, v0.h[4]\n"
+ "fmla v15.8h, v7.8h, v1.h[4]\n"
+ "fmla v19.8h, v7.8h, v2.h[4]\n"
+ "fmla v23.8h, v7.8h, v3.h[4]\n"
+ "fmla v27.8h, v7.8h, v4.h[4]\n"
+ "fmla v31.8h, v7.8h, v5.h[4]\n"
+ "ldr q7, [x17, #0x150]\n"
+ "fmla v8.8h, v6.8h, v0.h[5]\n"
+ "fmla v12.8h, v6.8h, v1.h[5]\n"
+ "fmla v16.8h, v6.8h, v2.h[5]\n"
+ "fmla v20.8h, v6.8h, v3.h[5]\n"
+ "fmla v24.8h, v6.8h, v4.h[5]\n"
+ "fmla v28.8h, v6.8h, v5.h[5]\n"
+ "ldr q6, [x17, #0x160]\n"
+ "fmla v9.8h, v7.8h, v0.h[5]\n"
+ "fmla v13.8h, v7.8h, v1.h[5]\n"
+ "fmla v17.8h, v7.8h, v2.h[5]\n"
+ "fmla v21.8h, v7.8h, v3.h[5]\n"
+ "fmla v25.8h, v7.8h, v4.h[5]\n"
+ "fmla v29.8h, v7.8h, v5.h[5]\n"
+ "ldr q7, [x17, #0x170]\n"
+ "fmla v10.8h, v6.8h, v0.h[5]\n"
+ "fmla v14.8h, v6.8h, v1.h[5]\n"
+ "fmla v18.8h, v6.8h, v2.h[5]\n"
+ "fmla v22.8h, v6.8h, v3.h[5]\n"
+ "fmla v26.8h, v6.8h, v4.h[5]\n"
+ "fmla v30.8h, v6.8h, v5.h[5]\n"
+ "ldr q6, [x17, #0x180]\n"
+ "fmla v11.8h, v7.8h, v0.h[5]\n"
+ "fmla v15.8h, v7.8h, v1.h[5]\n"
+ "fmla v19.8h, v7.8h, v2.h[5]\n"
+ "fmla v23.8h, v7.8h, v3.h[5]\n"
+ "fmla v27.8h, v7.8h, v4.h[5]\n"
+ "fmla v31.8h, v7.8h, v5.h[5]\n"
+ "ldr q7, [x17, #0x190]\n"
+ "fmla v8.8h, v6.8h, v0.h[6]\n"
+ "fmla v12.8h, v6.8h, v1.h[6]\n"
+ "fmla v16.8h, v6.8h, v2.h[6]\n"
+ "fmla v20.8h, v6.8h, v3.h[6]\n"
+ "fmla v24.8h, v6.8h, v4.h[6]\n"
+ "fmla v28.8h, v6.8h, v5.h[6]\n"
+ "ldr q6, [x17, #0x1a0]\n"
+ "fmla v9.8h, v7.8h, v0.h[6]\n"
+ "fmla v13.8h, v7.8h, v1.h[6]\n"
+ "fmla v17.8h, v7.8h, v2.h[6]\n"
+ "fmla v21.8h, v7.8h, v3.h[6]\n"
+ "fmla v25.8h, v7.8h, v4.h[6]\n"
+ "fmla v29.8h, v7.8h, v5.h[6]\n"
+ "ldr q7, [x17, #0x1b0]\n"
+ "fmla v10.8h, v6.8h, v0.h[6]\n"
+ "fmla v14.8h, v6.8h, v1.h[6]\n"
+ "fmla v18.8h, v6.8h, v2.h[6]\n"
+ "fmla v22.8h, v6.8h, v3.h[6]\n"
+ "fmla v26.8h, v6.8h, v4.h[6]\n"
+ "fmla v30.8h, v6.8h, v5.h[6]\n"
+ "ldr q6, [x17, #0x1c0]\n"
+ "fmla v11.8h, v7.8h, v0.h[6]\n"
+ "fmla v15.8h, v7.8h, v1.h[6]\n"
+ "fmla v19.8h, v7.8h, v2.h[6]\n"
+ "fmla v23.8h, v7.8h, v3.h[6]\n"
+ "fmla v27.8h, v7.8h, v4.h[6]\n"
+ "fmla v31.8h, v7.8h, v5.h[6]\n"
+ "ldr q7, [x17, #0x1d0]\n"
+ "fmla v8.8h, v6.8h, v0.h[7]\n"
+ "fmla v12.8h, v6.8h, v1.h[7]\n"
+ "fmla v16.8h, v6.8h, v2.h[7]\n"
+ "fmla v20.8h, v6.8h, v3.h[7]\n"
+ "fmla v24.8h, v6.8h, v4.h[7]\n"
+ "fmla v28.8h, v6.8h, v5.h[7]\n"
+ "ldr q6, [x17, #0x1e0]\n"
+ "fmla v9.8h, v7.8h, v0.h[7]\n"
+ "fmla v13.8h, v7.8h, v1.h[7]\n"
+ "fmla v17.8h, v7.8h, v2.h[7]\n"
+ "fmla v21.8h, v7.8h, v3.h[7]\n"
+ "fmla v25.8h, v7.8h, v4.h[7]\n"
+ "fmla v29.8h, v7.8h, v5.h[7]\n"
+ "ldr q7, [x17, #0x1f0]\n"
+ "fmla v10.8h, v6.8h, v0.h[7]\n"
+ "add x17, x17, #0x200\n"
+ "fmla v14.8h, v6.8h, v1.h[7]\n"
+ "fmla v18.8h, v6.8h, v2.h[7]\n"
+ "fmla v22.8h, v6.8h, v3.h[7]\n"
+ "fmla v26.8h, v6.8h, v4.h[7]\n"
+ "fmla v30.8h, v6.8h, v5.h[7]\n"
+ "fmla v11.8h, v7.8h, v0.h[7]\n"
+ "fmla v15.8h, v7.8h, v1.h[7]\n"
+ "fmla v19.8h, v7.8h, v2.h[7]\n"
+ "fmla v23.8h, v7.8h, v3.h[7]\n"
+ "fmla v27.8h, v7.8h, v4.h[7]\n"
+ "fmla v31.8h, v7.8h, v5.h[7]\n"
+ "273:" // Height 6: Multiply loop: Main loop skip
+ "cbz x13, 275f\n"
+ "274:" // Height 6: Multiply loop: Odd block loop
+ "ldr h0, [x12], #0x2\n"
+ "sub x13, x13, #0x1\n"
+ "ldr h1, [x28], #0x2\n"
+ "ldr h2, [x26], #0x2\n"
+ "ldr h3, [x24], #0x2\n"
+ "ldr h4, [x22], #0x2\n"
+ "ldr h5, [x20], #0x2\n"
+ "ldr q6, [x17, #0x0]\n"
+ "ldr q7, [x17, #0x10]\n"
+ "fmla v8.8h, v6.8h, v0.h[0]\n"
+ "fmla v12.8h, v6.8h, v1.h[0]\n"
+ "fmla v16.8h, v6.8h, v2.h[0]\n"
+ "fmla v20.8h, v6.8h, v3.h[0]\n"
+ "fmla v24.8h, v6.8h, v4.h[0]\n"
+ "fmla v28.8h, v6.8h, v5.h[0]\n"
+ "ldr q6, [x17, #0x20]\n"
+ "fmla v9.8h, v7.8h, v0.h[0]\n"
+ "fmla v13.8h, v7.8h, v1.h[0]\n"
+ "fmla v17.8h, v7.8h, v2.h[0]\n"
+ "fmla v21.8h, v7.8h, v3.h[0]\n"
+ "fmla v25.8h, v7.8h, v4.h[0]\n"
+ "fmla v29.8h, v7.8h, v5.h[0]\n"
+ "ldr q7, [x17, #0x30]\n"
+ "fmla v10.8h, v6.8h, v0.h[0]\n"
+ "add x17, x17, #0x40\n"
+ "fmla v14.8h, v6.8h, v1.h[0]\n"
+ "fmla v18.8h, v6.8h, v2.h[0]\n"
+ "fmla v22.8h, v6.8h, v3.h[0]\n"
+ "fmla v26.8h, v6.8h, v4.h[0]\n"
+ "fmla v30.8h, v6.8h, v5.h[0]\n"
+ "fmla v11.8h, v7.8h, v0.h[0]\n"
+ "fmla v15.8h, v7.8h, v1.h[0]\n"
+ "fmla v19.8h, v7.8h, v2.h[0]\n"
+ "fmla v23.8h, v7.8h, v3.h[0]\n"
+ "fmla v27.8h, v7.8h, v4.h[0]\n"
+ "fmla v31.8h, v7.8h, v5.h[0]\n"
+ "cbnz x13, 274b\n"
+ "275:" // Height 6: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x14, x14, #0x1\n"
+ "cmp x14, x19\n"
+ "bne 268b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "prfm pstl1keep, [x15, #0x0]\n"
+ "add x25, x15, x19, LSL #1\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "add x24, x25, x19, LSL #1\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "add x23, x24, x19, LSL #1\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "add x22, x23, x19, LSL #1\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "add x21, x22, x19, LSL #1\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "tbz %x[flags], #1, 276f\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v1.8h }, [x20]\n"
+ "ld1r { v0.8h }, [x19]\n"
+ "fmin v8.8h, v8.8h, v0.8h\n"
+ "fmin v9.8h, v9.8h, v0.8h\n"
+ "fmin v10.8h, v10.8h, v0.8h\n"
+ "fmin v11.8h, v11.8h, v0.8h\n"
+ "fmin v12.8h, v12.8h, v0.8h\n"
+ "fmin v13.8h, v13.8h, v0.8h\n"
+ "fmin v14.8h, v14.8h, v0.8h\n"
+ "fmin v15.8h, v15.8h, v0.8h\n"
+ "fmin v16.8h, v16.8h, v0.8h\n"
+ "fmin v17.8h, v17.8h, v0.8h\n"
+ "fmax v8.8h, v8.8h, v1.8h\n"
+ "fmax v9.8h, v9.8h, v1.8h\n"
+ "fmax v10.8h, v10.8h, v1.8h\n"
+ "fmax v11.8h, v11.8h, v1.8h\n"
+ "fmax v12.8h, v12.8h, v1.8h\n"
+ "fmax v13.8h, v13.8h, v1.8h\n"
+ "fmax v14.8h, v14.8h, v1.8h\n"
+ "fmax v15.8h, v15.8h, v1.8h\n"
+ "fmax v16.8h, v16.8h, v1.8h\n"
+ "fmax v17.8h, v17.8h, v1.8h\n"
+ "fmin v18.8h, v18.8h, v0.8h\n"
+ "fmin v19.8h, v19.8h, v0.8h\n"
+ "fmin v20.8h, v20.8h, v0.8h\n"
+ "fmin v21.8h, v21.8h, v0.8h\n"
+ "fmin v22.8h, v22.8h, v0.8h\n"
+ "fmin v23.8h, v23.8h, v0.8h\n"
+ "fmin v24.8h, v24.8h, v0.8h\n"
+ "fmin v25.8h, v25.8h, v0.8h\n"
+ "fmin v26.8h, v26.8h, v0.8h\n"
+ "fmin v27.8h, v27.8h, v0.8h\n"
+ "fmax v18.8h, v18.8h, v1.8h\n"
+ "fmax v19.8h, v19.8h, v1.8h\n"
+ "fmax v20.8h, v20.8h, v1.8h\n"
+ "fmax v21.8h, v21.8h, v1.8h\n"
+ "fmax v22.8h, v22.8h, v1.8h\n"
+ "fmax v23.8h, v23.8h, v1.8h\n"
+ "fmax v24.8h, v24.8h, v1.8h\n"
+ "fmax v25.8h, v25.8h, v1.8h\n"
+ "fmax v26.8h, v26.8h, v1.8h\n"
+ "fmax v27.8h, v27.8h, v1.8h\n"
+ "fmin v28.8h, v28.8h, v0.8h\n"
+ "fmin v29.8h, v29.8h, v0.8h\n"
+ "fmin v30.8h, v30.8h, v0.8h\n"
+ "fmin v31.8h, v31.8h, v0.8h\n"
+ "fmax v28.8h, v28.8h, v1.8h\n"
+ "fmax v29.8h, v29.8h, v1.8h\n"
+ "fmax v30.8h, v30.8h, v1.8h\n"
+ "fmax v31.8h, v31.8h, v1.8h\n"
+ "276:" // Height 6: No activation
+ "cmp x8, #0x20\n"
+ "bge 293f\n"
+ "tbz x8, #4, 284f\n"
+ "st1 { v8.8h }, [x15], #0x10\n"
+ "st1 { v9.8h }, [x15], #0x10\n"
+ "st1 { v12.8h }, [x25], #0x10\n"
+ "st1 { v13.8h }, [x25], #0x10\n"
+ "st1 { v16.8h }, [x24], #0x10\n"
+ "st1 { v17.8h }, [x24], #0x10\n"
+ "st1 { v20.8h }, [x23], #0x10\n"
+ "st1 { v21.8h }, [x23], #0x10\n"
+ "st1 { v24.8h }, [x22], #0x10\n"
+ "st1 { v25.8h }, [x22], #0x10\n"
+ "st1 { v28.8h }, [x21], #0x10\n"
+ "st1 { v29.8h }, [x21], #0x10\n"
+ "tbz x8, #3, 280f\n"
+ "st1 { v10.8h }, [x15], #0x10\n"
+ "st1 { v14.8h }, [x25], #0x10\n"
+ "st1 { v18.8h }, [x24], #0x10\n"
+ "st1 { v22.8h }, [x23], #0x10\n"
+ "st1 { v26.8h }, [x22], #0x10\n"
+ "st1 { v30.8h }, [x21], #0x10\n"
+ "tbz x8, #2, 278f\n"
+ "str d11, [x15], #0x8\n"
+ "str d15, [x25], #0x8\n"
+ "str d19, [x24], #0x8\n"
+ "str d23, [x23], #0x8\n"
+ "str d27, [x22], #0x8\n"
+ "str d31, [x21], #0x8\n"
+ "tbz x8, #1, 277f\n"
+ "st1 { v11.s }[2], [x15], #0x4\n"
+ "st1 { v15.s }[2], [x25], #0x4\n"
+ "st1 { v19.s }[2], [x24], #0x4\n"
+ "st1 { v23.s }[2], [x23], #0x4\n"
+ "st1 { v27.s }[2], [x22], #0x4\n"
+ "st1 { v31.s }[2], [x21], #0x4\n"
+ "tbz x8, #0, 292f\n"
+ "st1 { v11.h }[6], [x15]\n"
+ "st1 { v15.h }[6], [x25]\n"
+ "st1 { v19.h }[6], [x24]\n"
+ "st1 { v23.h }[6], [x23]\n"
+ "st1 { v27.h }[6], [x22]\n"
+ "st1 { v31.h }[6], [x21]\n"
+ "b 292f\n"
+ "277:" // Height 6: Partial direct writeback: partial_1_28
+ "tbz x8, #0, 292f\n"
+ "st1 { v11.h }[4], [x15]\n"
+ "st1 { v15.h }[4], [x25]\n"
+ "st1 { v19.h }[4], [x24]\n"
+ "st1 { v23.h }[4], [x23]\n"
+ "st1 { v27.h }[4], [x22]\n"
+ "st1 { v31.h }[4], [x21]\n"
+ "b 292f\n"
+ "278:" // Height 6: Partial direct writeback: partial_2_24
+ "tbz x8, #1, 279f\n"
+ "str s11, [x15], #0x4\n"
+ "str s15, [x25], #0x4\n"
+ "str s19, [x24], #0x4\n"
+ "str s23, [x23], #0x4\n"
+ "str s27, [x22], #0x4\n"
+ "str s31, [x21], #0x4\n"
+ "tbz x8, #0, 292f\n"
+ "st1 { v11.h }[2], [x15]\n"
+ "st1 { v15.h }[2], [x25]\n"
+ "st1 { v19.h }[2], [x24]\n"
+ "st1 { v23.h }[2], [x23]\n"
+ "st1 { v27.h }[2], [x22]\n"
+ "st1 { v31.h }[2], [x21]\n"
+ "b 292f\n"
+ "279:" // Height 6: Partial direct writeback: partial_1_24
+ "tbz x8, #0, 292f\n"
+ "str h11, [x15, #0x0]\n"
+ "str h15, [x25, #0x0]\n"
+ "str h19, [x24, #0x0]\n"
+ "str h23, [x23, #0x0]\n"
+ "str h27, [x22, #0x0]\n"
+ "str h31, [x21, #0x0]\n"
+ "b 292f\n"
+ "280:" // Height 6: Partial direct writeback: partial_4_16
+ "tbz x8, #2, 282f\n"
+ "str d10, [x15], #0x8\n"
+ "str d14, [x25], #0x8\n"
+ "str d18, [x24], #0x8\n"
+ "str d22, [x23], #0x8\n"
+ "str d26, [x22], #0x8\n"
+ "str d30, [x21], #0x8\n"
+ "tbz x8, #1, 281f\n"
+ "st1 { v10.s }[2], [x15], #0x4\n"
+ "st1 { v14.s }[2], [x25], #0x4\n"
+ "st1 { v18.s }[2], [x24], #0x4\n"
+ "st1 { v22.s }[2], [x23], #0x4\n"
+ "st1 { v26.s }[2], [x22], #0x4\n"
+ "st1 { v30.s }[2], [x21], #0x4\n"
+ "tbz x8, #0, 292f\n"
+ "st1 { v10.h }[6], [x15]\n"
+ "st1 { v14.h }[6], [x25]\n"
+ "st1 { v18.h }[6], [x24]\n"
+ "st1 { v22.h }[6], [x23]\n"
+ "st1 { v26.h }[6], [x22]\n"
+ "st1 { v30.h }[6], [x21]\n"
+ "b 292f\n"
+ "281:" // Height 6: Partial direct writeback: partial_1_20
+ "tbz x8, #0, 292f\n"
+ "st1 { v10.h }[4], [x15]\n"
+ "st1 { v14.h }[4], [x25]\n"
+ "st1 { v18.h }[4], [x24]\n"
+ "st1 { v22.h }[4], [x23]\n"
+ "st1 { v26.h }[4], [x22]\n"
+ "st1 { v30.h }[4], [x21]\n"
+ "b 292f\n"
+ "282:" // Height 6: Partial direct writeback: partial_2_16
+ "tbz x8, #1, 283f\n"
+ "str s10, [x15], #0x4\n"
+ "str s14, [x25], #0x4\n"
+ "str s18, [x24], #0x4\n"
+ "str s22, [x23], #0x4\n"
+ "str s26, [x22], #0x4\n"
+ "str s30, [x21], #0x4\n"
+ "tbz x8, #0, 292f\n"
+ "st1 { v10.h }[2], [x15]\n"
+ "st1 { v14.h }[2], [x25]\n"
+ "st1 { v18.h }[2], [x24]\n"
+ "st1 { v22.h }[2], [x23]\n"
+ "st1 { v26.h }[2], [x22]\n"
+ "st1 { v30.h }[2], [x21]\n"
+ "b 292f\n"
+ "283:" // Height 6: Partial direct writeback: partial_1_16
+ "tbz x8, #0, 292f\n"
+ "str h10, [x15, #0x0]\n"
+ "str h14, [x25, #0x0]\n"
+ "str h18, [x24, #0x0]\n"
+ "str h22, [x23, #0x0]\n"
+ "str h26, [x22, #0x0]\n"
+ "str h30, [x21, #0x0]\n"
+ "b 292f\n"
+ "284:" // Height 6: Partial direct writeback: partial_8_0
+ "tbz x8, #3, 288f\n"
+ "st1 { v8.8h }, [x15], #0x10\n"
+ "st1 { v12.8h }, [x25], #0x10\n"
+ "st1 { v16.8h }, [x24], #0x10\n"
+ "st1 { v20.8h }, [x23], #0x10\n"
+ "st1 { v24.8h }, [x22], #0x10\n"
+ "st1 { v28.8h }, [x21], #0x10\n"
+ "tbz x8, #2, 286f\n"
+ "str d9, [x15], #0x8\n"
+ "str d13, [x25], #0x8\n"
+ "str d17, [x24], #0x8\n"
+ "str d21, [x23], #0x8\n"
+ "str d25, [x22], #0x8\n"
+ "str d29, [x21], #0x8\n"
+ "tbz x8, #1, 285f\n"
+ "st1 { v9.s }[2], [x15], #0x4\n"
+ "st1 { v13.s }[2], [x25], #0x4\n"
+ "st1 { v17.s }[2], [x24], #0x4\n"
+ "st1 { v21.s }[2], [x23], #0x4\n"
+ "st1 { v25.s }[2], [x22], #0x4\n"
+ "st1 { v29.s }[2], [x21], #0x4\n"
+ "tbz x8, #0, 292f\n"
+ "st1 { v9.h }[6], [x15]\n"
+ "st1 { v13.h }[6], [x25]\n"
+ "st1 { v17.h }[6], [x24]\n"
+ "st1 { v21.h }[6], [x23]\n"
+ "st1 { v25.h }[6], [x22]\n"
+ "st1 { v29.h }[6], [x21]\n"
+ "b 292f\n"
+ "285:" // Height 6: Partial direct writeback: partial_1_12
+ "tbz x8, #0, 292f\n"
+ "st1 { v9.h }[4], [x15]\n"
+ "st1 { v13.h }[4], [x25]\n"
+ "st1 { v17.h }[4], [x24]\n"
+ "st1 { v21.h }[4], [x23]\n"
+ "st1 { v25.h }[4], [x22]\n"
+ "st1 { v29.h }[4], [x21]\n"
+ "b 292f\n"
+ "286:" // Height 6: Partial direct writeback: partial_2_8
+ "tbz x8, #1, 287f\n"
+ "str s9, [x15], #0x4\n"
+ "str s13, [x25], #0x4\n"
+ "str s17, [x24], #0x4\n"
+ "str s21, [x23], #0x4\n"
+ "str s25, [x22], #0x4\n"
+ "str s29, [x21], #0x4\n"
+ "tbz x8, #0, 292f\n"
+ "st1 { v9.h }[2], [x15]\n"
+ "st1 { v13.h }[2], [x25]\n"
+ "st1 { v17.h }[2], [x24]\n"
+ "st1 { v21.h }[2], [x23]\n"
+ "st1 { v25.h }[2], [x22]\n"
+ "st1 { v29.h }[2], [x21]\n"
+ "b 292f\n"
+ "287:" // Height 6: Partial direct writeback: partial_1_8
+ "tbz x8, #0, 292f\n"
+ "str h9, [x15, #0x0]\n"
+ "str h13, [x25, #0x0]\n"
+ "str h17, [x24, #0x0]\n"
+ "str h21, [x23, #0x0]\n"
+ "str h25, [x22, #0x0]\n"
+ "str h29, [x21, #0x0]\n"
+ "b 292f\n"
+ "288:" // Height 6: Partial direct writeback: partial_4_0
+ "tbz x8, #2, 290f\n"
+ "str d8, [x15], #0x8\n"
+ "str d12, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
+ "str d20, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
+ "str d28, [x21], #0x8\n"
+ "tbz x8, #1, 289f\n"
+ "st1 { v8.s }[2], [x15], #0x4\n"
+ "st1 { v12.s }[2], [x25], #0x4\n"
+ "st1 { v16.s }[2], [x24], #0x4\n"
+ "st1 { v20.s }[2], [x23], #0x4\n"
+ "st1 { v24.s }[2], [x22], #0x4\n"
+ "st1 { v28.s }[2], [x21], #0x4\n"
+ "tbz x8, #0, 292f\n"
+ "st1 { v8.h }[6], [x15]\n"
+ "st1 { v12.h }[6], [x25]\n"
+ "st1 { v16.h }[6], [x24]\n"
+ "st1 { v20.h }[6], [x23]\n"
+ "st1 { v24.h }[6], [x22]\n"
+ "st1 { v28.h }[6], [x21]\n"
+ "b 292f\n"
+ "289:" // Height 6: Partial direct writeback: partial_1_4
+ "tbz x8, #0, 292f\n"
+ "st1 { v8.h }[4], [x15]\n"
+ "st1 { v12.h }[4], [x25]\n"
+ "st1 { v16.h }[4], [x24]\n"
+ "st1 { v20.h }[4], [x23]\n"
+ "st1 { v24.h }[4], [x22]\n"
+ "st1 { v28.h }[4], [x21]\n"
+ "b 292f\n"
+ "290:" // Height 6: Partial direct writeback: partial_2_0
+ "tbz x8, #1, 291f\n"
+ "str s8, [x15], #0x4\n"
+ "str s12, [x25], #0x4\n"
+ "str s16, [x24], #0x4\n"
+ "str s20, [x23], #0x4\n"
+ "str s24, [x22], #0x4\n"
+ "str s28, [x21], #0x4\n"
+ "tbz x8, #0, 292f\n"
+ "st1 { v8.h }[2], [x15]\n"
+ "st1 { v12.h }[2], [x25]\n"
+ "st1 { v16.h }[2], [x24]\n"
+ "st1 { v20.h }[2], [x23]\n"
+ "st1 { v24.h }[2], [x22]\n"
+ "st1 { v28.h }[2], [x21]\n"
+ "b 292f\n"
+ "291:" // Height 6: Partial direct writeback: partial_1_0
+ "str h8, [x15, #0x0]\n"
+ "str h12, [x25, #0x0]\n"
+ "str h16, [x24, #0x0]\n"
+ "str h20, [x23, #0x0]\n"
+ "str h24, [x22, #0x0]\n"
+ "str h28, [x21, #0x0]\n"
+ "292:" // Height 6: Partial direct writeback: Done
+ "b 294f\n"
+ "293:" // Height 6: Full writeback
+ "str q8, [x15, #0x0]\n"
+ "str q9, [x15, #0x10]\n"
+ "str q10, [x15, #0x20]\n"
+ "str q11, [x15, #0x30]\n"
+ "add x15, x15, #0x40\n"
+ "str q12, [x25, #0x0]\n"
+ "str q13, [x25, #0x10]\n"
+ "str q14, [x25, #0x20]\n"
+ "str q15, [x25, #0x30]\n"
+ "str q16, [x24, #0x0]\n"
+ "str q17, [x24, #0x10]\n"
+ "str q18, [x24, #0x20]\n"
+ "str q19, [x24, #0x30]\n"
+ "str q20, [x23, #0x0]\n"
+ "str q21, [x23, #0x10]\n"
+ "str q22, [x23, #0x20]\n"
+ "str q23, [x23, #0x30]\n"
+ "str q24, [x22, #0x0]\n"
+ "str q25, [x22, #0x10]\n"
+ "str q26, [x22, #0x20]\n"
+ "str q27, [x22, #0x30]\n"
+ "str q28, [x21, #0x0]\n"
+ "str q29, [x21, #0x10]\n"
+ "str q30, [x21, #0x20]\n"
+ "str q31, [x21, #0x30]\n"
+ "294:" // Height 6: Writeback done
+ "subs x8, x8, #0x20\n"
+ "bgt 247b\n"
+ "subs %x[M], %x[M], #0x6\n"
+ "beq 296f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 295f\n"
+ "add x20, x20, #0x6\n"
+ "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "295:" // Update direct input
+ "mov x19, #0xc\n"
+ "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "b 1b\n"
+ "296:" // Exit
+
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/generic.cpp
index 32e341566f..6e51773166 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/generic.cpp
@@ -27,6 +27,7 @@
#include "../../utils.hpp"
#include <cassert>
+#include <limits>
namespace arm_gemm {
@@ -96,342 +97,336 @@ void a64_hybrid_fp16_mla_6x32 (
#endif
"1:" // Row loop
"cmp %x[M], #0x6\n"
- "bge 251f\n"
+ "bge 246f\n"
"cmp %x[M], #0x4\n"
- "bgt 201f\n"
- "beq 151f\n"
+ "bgt 197f\n"
+ "beq 148f\n"
"cmp %x[M], #0x2\n"
- "bgt 101f\n"
- "beq 51f\n"
- "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x14, %x[bias]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "tbz %x[flags], #2, 2f\n"
- "ldr x13, [%x[output_ptr], #0x0]\n"
- "add x13, x13, x19, LSL #1\n"
- "b 3f\n"
- "2:" // Height 1: setup direct output
- "mov x13, %x[output_ptr]\n"
- "3:" // Height 1: Column loop
- "cbz x14, 4f\n"
- "ldr q8, [x14, #0x0]\n"
- "ldr q9, [x14, #0x10]\n"
- "ldr q10, [x14, #0x20]\n"
- "ldr q11, [x14, #0x30]\n"
- "add x14, x14, #0x40\n"
- "b 23f\n"
- "4:" // Height 1: no bias
- "tbz %x[flags], #0, 22f\n"
- "cmp x16, #0x20\n"
- "bge 21f\n"
- "tbz x16, #4, 12f\n"
- "ld1 { v8.8h }, [x13], #0x10\n"
- "ld1 { v9.8h }, [x13], #0x10\n"
- "tbz x16, #3, 8f\n"
- "ld1 { v10.8h }, [x13], #0x10\n"
- "tbz x16, #2, 6f\n"
- "ldr d11, [x13], #0x8\n"
- "tbz x16, #1, 5f\n"
+ "bgt 99f\n"
+ "beq 50f\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[bias]\n"
+ "mov x28, %x[output_ptr]\n"
+ "2:" // Height 1: Column loop
+ "cbz x9, 3f\n"
+ "ldr q8, [x9, #0x0]\n"
+ "ldr q9, [x9, #0x10]\n"
+ "ldr q10, [x9, #0x20]\n"
+ "ldr q11, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
+ "b 22f\n"
+ "3:" // Height 1: no bias
+ "tbz %x[flags], #0, 21f\n"
+ "cmp x11, #0x20\n"
+ "bge 20f\n"
+ "tbz x11, #4, 11f\n"
+ "ld1 { v8.8h }, [x28], #0x10\n"
+ "ld1 { v9.8h }, [x28], #0x10\n"
+ "tbz x11, #3, 7f\n"
+ "ld1 { v10.8h }, [x28], #0x10\n"
+ "tbz x11, #2, 5f\n"
+ "ldr d11, [x28], #0x8\n"
+ "tbz x11, #1, 4f\n"
"mov x19, #0x3c\n"
- "ld1 { v11.s }[2], [x13], #0x4\n"
- "tbz x16, #0, 20f\n"
- "ld1 { v11.h }[6], [x13]\n"
- "b 20f\n"
- "5:" // Height 1: Partial accumulate: partial_1_28
+ "ld1 { v11.s }[2], [x28], #0x4\n"
+ "tbz x11, #0, 19f\n"
+ "ld1 { v11.h }[6], [x28]\n"
+ "b 19f\n"
+ "4:" // Height 1: Partial accumulate: partial_1_28
"mov x19, #0x38\n"
- "tbz x16, #0, 20f\n"
- "ld1 { v11.h }[4], [x13]\n"
- "b 20f\n"
- "6:" // Height 1: Partial accumulate: partial_2_24
- "tbz x16, #1, 7f\n"
- "ldr s11, [x13], #0x4\n"
+ "tbz x11, #0, 19f\n"
+ "ld1 { v11.h }[4], [x28]\n"
+ "b 19f\n"
+ "5:" // Height 1: Partial accumulate: partial_2_24
+ "tbz x11, #1, 6f\n"
+ "ldr s11, [x28], #0x4\n"
"mov x19, #0x34\n"
- "tbz x16, #0, 20f\n"
- "ld1 { v11.h }[2], [x13]\n"
- "b 20f\n"
- "7:" // Height 1: Partial accumulate: partial_1_24
+ "tbz x11, #0, 19f\n"
+ "ld1 { v11.h }[2], [x28]\n"
+ "b 19f\n"
+ "6:" // Height 1: Partial accumulate: partial_1_24
"mov x19, #0x30\n"
- "tbz x16, #0, 20f\n"
- "ldr h11, [x13, #0x0]\n"
- "b 20f\n"
- "8:" // Height 1: Partial accumulate: partial_4_16
- "tbz x16, #2, 10f\n"
- "ldr d10, [x13], #0x8\n"
- "tbz x16, #1, 9f\n"
+ "tbz x11, #0, 19f\n"
+ "ldr h11, [x28, #0x0]\n"
+ "b 19f\n"
+ "7:" // Height 1: Partial accumulate: partial_4_16
+ "tbz x11, #2, 9f\n"
+ "ldr d10, [x28], #0x8\n"
+ "tbz x11, #1, 8f\n"
+ "ld1 { v10.s }[2], [x28], #0x4\n"
"mov x19, #0x2c\n"
- "ld1 { v10.s }[2], [x13], #0x4\n"
- "tbz x16, #0, 20f\n"
- "ld1 { v10.h }[6], [x13]\n"
- "b 20f\n"
- "9:" // Height 1: Partial accumulate: partial_1_20
+ "tbz x11, #0, 19f\n"
+ "ld1 { v10.h }[6], [x28]\n"
+ "b 19f\n"
+ "8:" // Height 1: Partial accumulate: partial_1_20
"mov x19, #0x28\n"
- "tbz x16, #0, 20f\n"
- "ld1 { v10.h }[4], [x13]\n"
- "b 20f\n"
- "10:" // Height 1: Partial accumulate: partial_2_16
- "tbz x16, #1, 11f\n"
- "ldr s10, [x13], #0x4\n"
+ "tbz x11, #0, 19f\n"
+ "ld1 { v10.h }[4], [x28]\n"
+ "b 19f\n"
+ "9:" // Height 1: Partial accumulate: partial_2_16
+ "tbz x11, #1, 10f\n"
+ "ldr s10, [x28], #0x4\n"
"mov x19, #0x24\n"
- "tbz x16, #0, 20f\n"
- "ld1 { v10.h }[2], [x13]\n"
- "b 20f\n"
- "11:" // Height 1: Partial accumulate: partial_1_16
+ "tbz x11, #0, 19f\n"
+ "ld1 { v10.h }[2], [x28]\n"
+ "b 19f\n"
+ "10:" // Height 1: Partial accumulate: partial_1_16
"mov x19, #0x20\n"
- "tbz x16, #0, 20f\n"
- "ldr h10, [x13, #0x0]\n"
- "b 20f\n"
- "12:" // Height 1: Partial accumulate: partial_8_0
- "tbz x16, #3, 16f\n"
- "ld1 { v8.8h }, [x13], #0x10\n"
- "tbz x16, #2, 14f\n"
- "ldr d9, [x13], #0x8\n"
- "tbz x16, #1, 13f\n"
+ "tbz x11, #0, 19f\n"
+ "ldr h10, [x28, #0x0]\n"
+ "b 19f\n"
+ "11:" // Height 1: Partial accumulate: partial_8_0
+ "tbz x11, #3, 15f\n"
+ "ld1 { v8.8h }, [x28], #0x10\n"
+ "tbz x11, #2, 13f\n"
+ "ldr d9, [x28], #0x8\n"
+ "tbz x11, #1, 12f\n"
"mov x19, #0x1c\n"
- "ld1 { v9.s }[2], [x13], #0x4\n"
- "tbz x16, #0, 20f\n"
- "ld1 { v9.h }[6], [x13]\n"
- "b 20f\n"
- "13:" // Height 1: Partial accumulate: partial_1_12
+ "ld1 { v9.s }[2], [x28], #0x4\n"
+ "tbz x11, #0, 19f\n"
+ "ld1 { v9.h }[6], [x28]\n"
+ "b 19f\n"
+ "12:" // Height 1: Partial accumulate: partial_1_12
"mov x19, #0x18\n"
- "tbz x16, #0, 20f\n"
- "ld1 { v9.h }[4], [x13]\n"
- "b 20f\n"
- "14:" // Height 1: Partial accumulate: partial_2_8
- "tbz x16, #1, 15f\n"
- "ldr s9, [x13], #0x4\n"
+ "tbz x11, #0, 19f\n"
+ "ld1 { v9.h }[4], [x28]\n"
+ "b 19f\n"
+ "13:" // Height 1: Partial accumulate: partial_2_8
+ "tbz x11, #1, 14f\n"
+ "ldr s9, [x28], #0x4\n"
"mov x19, #0x14\n"
- "tbz x16, #0, 20f\n"
- "ld1 { v9.h }[2], [x13]\n"
- "b 20f\n"
- "15:" // Height 1: Partial accumulate: partial_1_8
+ "tbz x11, #0, 19f\n"
+ "ld1 { v9.h }[2], [x28]\n"
+ "b 19f\n"
+ "14:" // Height 1: Partial accumulate: partial_1_8
"mov x19, #0x10\n"
- "tbz x16, #0, 20f\n"
- "ldr h9, [x13, #0x0]\n"
- "b 20f\n"
- "16:" // Height 1: Partial accumulate: partial_4_0
- "tbz x16, #2, 18f\n"
- "ldr d8, [x13], #0x8\n"
- "tbz x16, #1, 17f\n"
+ "tbz x11, #0, 19f\n"
+ "ldr h9, [x28, #0x0]\n"
+ "b 19f\n"
+ "15:" // Height 1: Partial accumulate: partial_4_0
+ "tbz x11, #2, 17f\n"
+ "ldr d8, [x28], #0x8\n"
+ "tbz x11, #1, 16f\n"
+ "ld1 { v8.s }[2], [x28], #0x4\n"
"mov x19, #0xc\n"
- "ld1 { v8.s }[2], [x13], #0x4\n"
- "tbz x16, #0, 20f\n"
- "ld1 { v8.h }[6], [x13]\n"
- "b 20f\n"
- "17:" // Height 1: Partial accumulate: partial_1_4
+ "tbz x11, #0, 19f\n"
+ "ld1 { v8.h }[6], [x28]\n"
+ "b 19f\n"
+ "16:" // Height 1: Partial accumulate: partial_1_4
"mov x19, #0x8\n"
- "tbz x16, #0, 20f\n"
- "ld1 { v8.h }[4], [x13]\n"
- "b 20f\n"
- "18:" // Height 1: Partial accumulate: partial_2_0
- "tbz x16, #1, 19f\n"
- "ldr s8, [x13], #0x4\n"
+ "tbz x11, #0, 19f\n"
+ "ld1 { v8.h }[4], [x28]\n"
+ "b 19f\n"
+ "17:" // Height 1: Partial accumulate: partial_2_0
+ "tbz x11, #1, 18f\n"
+ "ldr s8, [x28], #0x4\n"
"mov x19, #0x4\n"
- "tbz x16, #0, 20f\n"
- "ld1 { v8.h }[2], [x13]\n"
- "b 20f\n"
- "19:" // Height 1: Partial accumulate: partial_1_0
+ "tbz x11, #0, 19f\n"
+ "ld1 { v8.h }[2], [x28]\n"
+ "b 19f\n"
+ "18:" // Height 1: Partial accumulate: partial_1_0
+ "ldr h8, [x28, #0x0]\n"
"mov x19, #0x0\n"
- "ldr h8, [x13, #0x0]\n"
- "20:" // Height 1: Partial accumulate: Done
- "sub x13, x13, x19\n"
- "b 23f\n"
- "21:" // Height 1: full accumulate
- "ldr q8, [x13, #0x0]\n"
- "ldr q9, [x13, #0x10]\n"
- "ldr q10, [x13, #0x20]\n"
- "ldr q11, [x13, #0x30]\n"
- "b 23f\n"
- "22:" // Height 1: no accumulate
+ "19:" // Height 1: Partial accumulate: Done
+ "sub x28, x28, x19\n"
+ "b 22f\n"
+ "20:" // Height 1: full accumulate
+ "ldr q8, [x28, #0x0]\n"
+ "ldr q9, [x28, #0x10]\n"
+ "ldr q10, [x28, #0x20]\n"
+ "ldr q11, [x28, #0x30]\n"
+ "b 22f\n"
+ "21:" // Height 1: no accumulate
"movi v8.16b, #0x0\n"
"movi v9.16b, #0x0\n"
"movi v10.16b, #0x0\n"
"movi v11.16b, #0x0\n"
- "23:" // Height 1: setup done
- "mov x12, #0x0\n"
- "24:" // Height 1: String loop
+ "22:" // Height 1: setup done
+ "mov x27, #0x0\n"
+ "23:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w11, [x20, x12, LSL #0x2]\n"
- "tbz %x[flags], #3, 25f\n"
- "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 24f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x10, [x20, #0x0]\n"
- "cbnz x12, 26f\n"
+ "ldr x25, [x20, #0x0]\n"
+ "cbnz x27, 25f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x10, x10, x19, LSL #1\n"
- "b 26f\n"
- "25:" // Height 1: setup direct input
- "mov x10, %x[input_ptr]\n"
- "26:" // Height 1: input setup done
- "cmp x11, #0x8\n"
- "blt 29f\n"
- "cmp x11, #0x10\n"
+ "add x25, x25, x19, LSL #1\n"
+ "b 25f\n"
+ "24:" // Height 1: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "25:" // Height 1: input setup done
+ "cmp x26, #0x8\n"
"blt 28f\n"
- "27:" // Height 1: Multiply loop: Main loop head
- "ldr q0, [x10, #0x0]\n"
- "ldr q6, [x15, #0x0]\n"
+ "ldr q0, [x25, #0x0]\n"
+ "ldr q6, [x10, #0x0]\n"
+ "cmp x26, #0x10\n"
+ "blt 27f\n"
+ "26:" // Height 1: Multiply loop: Main loop head
"fmla v8.8h, v6.8h, v0.h[0]\n"
- "ldr q7, [x15, #0x10]\n"
- "ldr q6, [x15, #0x20]\n"
+ "ldr q7, [x10, #0x10]\n"
+ "add x25, x25, #0x10\n"
"fmla v9.8h, v7.8h, v0.h[0]\n"
- "ldr q7, [x15, #0x30]\n"
+ "ldr q6, [x10, #0x20]\n"
+ "sub x26, x26, #0x8\n"
"fmla v10.8h, v6.8h, v0.h[0]\n"
- "ldr q6, [x15, #0x40]\n"
- "add x10, x10, #0x10\n"
+ "ldr q7, [x10, #0x30]\n"
+ "cmp x26, #0x10\n"
"fmla v11.8h, v7.8h, v0.h[0]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
- "ldr q7, [x15, #0x50]\n"
+ "ldr q6, [x10, #0x40]\n"
+ "ldr q7, [x10, #0x50]\n"
"fmla v8.8h, v6.8h, v0.h[1]\n"
- "ldr q6, [x15, #0x60]\n"
- "sub x11, x11, #0x8\n"
+ "ldr q6, [x10, #0x60]\n"
"fmla v9.8h, v7.8h, v0.h[1]\n"
- "ldr q7, [x15, #0x70]\n"
- "cmp x11, #0x10\n"
+ "ldr q7, [x10, #0x70]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
"fmla v10.8h, v6.8h, v0.h[1]\n"
- "ldr q6, [x15, #0x80]\n"
+ "ldr q6, [x10, #0x80]\n"
"fmla v11.8h, v7.8h, v0.h[1]\n"
- "ldr q7, [x15, #0x90]\n"
+ "ldr q7, [x10, #0x90]\n"
"fmla v8.8h, v6.8h, v0.h[2]\n"
- "ldr q6, [x15, #0xa0]\n"
+ "ldr q6, [x10, #0xa0]\n"
"fmla v9.8h, v7.8h, v0.h[2]\n"
- "ldr q7, [x15, #0xb0]\n"
+ "ldr q7, [x10, #0xb0]\n"
"fmla v10.8h, v6.8h, v0.h[2]\n"
- "ldr q6, [x15, #0xc0]\n"
+ "ldr q6, [x10, #0xc0]\n"
"fmla v11.8h, v7.8h, v0.h[2]\n"
- "ldr q7, [x15, #0xd0]\n"
+ "ldr q7, [x10, #0xd0]\n"
"fmla v8.8h, v6.8h, v0.h[3]\n"
- "ldr q6, [x15, #0xe0]\n"
+ "ldr q6, [x10, #0xe0]\n"
"fmla v9.8h, v7.8h, v0.h[3]\n"
- "ldr q7, [x15, #0xf0]\n"
+ "ldr q7, [x10, #0xf0]\n"
"fmla v10.8h, v6.8h, v0.h[3]\n"
- "ldr q6, [x15, #0x100]\n"
+ "ldr q6, [x10, #0x100]\n"
"fmla v11.8h, v7.8h, v0.h[3]\n"
- "ldr q7, [x15, #0x110]\n"
+ "ldr q7, [x10, #0x110]\n"
"fmla v8.8h, v6.8h, v0.h[4]\n"
- "ldr q6, [x15, #0x120]\n"
+ "ldr q6, [x10, #0x120]\n"
"fmla v9.8h, v7.8h, v0.h[4]\n"
- "ldr q7, [x15, #0x130]\n"
+ "ldr q7, [x10, #0x130]\n"
"fmla v10.8h, v6.8h, v0.h[4]\n"
- "ldr q6, [x15, #0x140]\n"
+ "ldr q6, [x10, #0x140]\n"
"fmla v11.8h, v7.8h, v0.h[4]\n"
- "ldr q7, [x15, #0x150]\n"
+ "ldr q7, [x10, #0x150]\n"
"fmla v8.8h, v6.8h, v0.h[5]\n"
- "ldr q6, [x15, #0x160]\n"
+ "ldr q6, [x10, #0x160]\n"
"fmla v9.8h, v7.8h, v0.h[5]\n"
- "ldr q7, [x15, #0x170]\n"
+ "ldr q7, [x10, #0x170]\n"
"fmla v10.8h, v6.8h, v0.h[5]\n"
- "ldr q6, [x15, #0x180]\n"
+ "ldr q6, [x10, #0x180]\n"
"fmla v11.8h, v7.8h, v0.h[5]\n"
- "ldr q7, [x15, #0x190]\n"
+ "ldr q7, [x10, #0x190]\n"
"fmla v8.8h, v6.8h, v0.h[6]\n"
- "ldr q6, [x15, #0x1a0]\n"
+ "ldr q6, [x10, #0x1a0]\n"
"fmla v9.8h, v7.8h, v0.h[6]\n"
- "ldr q7, [x15, #0x1b0]\n"
+ "ldr q7, [x10, #0x1b0]\n"
"fmla v10.8h, v6.8h, v0.h[6]\n"
- "ldr q6, [x15, #0x1c0]\n"
+ "ldr q6, [x10, #0x1c0]\n"
"fmla v11.8h, v7.8h, v0.h[6]\n"
- "ldr q7, [x15, #0x1d0]\n"
+ "ldr q7, [x10, #0x1d0]\n"
"fmla v8.8h, v6.8h, v0.h[7]\n"
- "ldr q6, [x15, #0x1e0]\n"
+ "ldr q6, [x10, #0x1e0]\n"
"fmla v9.8h, v7.8h, v0.h[7]\n"
- "ldr q7, [x15, #0x1f0]\n"
- "add x15, x15, #0x200\n"
+ "ldr q7, [x10, #0x1f0]\n"
+ "add x10, x10, #0x200\n"
"fmla v10.8h, v6.8h, v0.h[7]\n"
+ "ldr q6, [x10, #0x0]\n"
"fmla v11.8h, v7.8h, v0.h[7]\n"
- "bge 27b\n"
- "28:" // Height 1: Multiply loop: Single iteration only
- "sub x11, x11, #0x8\n"
- "ldr q0, [x10, #0x0]\n"
- "ldr q6, [x15, #0x0]\n"
+ "ldr q0, [x25, #0x0]\n"
+ "bge 26b\n"
+ "27:" // Height 1: Multiply loop: Single iteration only
"fmla v8.8h, v6.8h, v0.h[0]\n"
- "ldr q7, [x15, #0x10]\n"
- "ldr q6, [x15, #0x20]\n"
+ "ldr q7, [x10, #0x10]\n"
+ "sub x26, x26, #0x8\n"
"fmla v9.8h, v7.8h, v0.h[0]\n"
- "ldr q7, [x15, #0x30]\n"
+ "ldr q6, [x10, #0x20]\n"
+ "add x25, x25, #0x10\n"
"fmla v10.8h, v6.8h, v0.h[0]\n"
- "ldr q6, [x15, #0x40]\n"
- "add x10, x10, #0x10\n"
+ "ldr q7, [x10, #0x30]\n"
+ "ldr q6, [x10, #0x40]\n"
"fmla v11.8h, v7.8h, v0.h[0]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
- "ldr q7, [x15, #0x50]\n"
+ "ldr q7, [x10, #0x50]\n"
"fmla v8.8h, v6.8h, v0.h[1]\n"
- "ldr q6, [x15, #0x60]\n"
+ "ldr q6, [x10, #0x60]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
"fmla v9.8h, v7.8h, v0.h[1]\n"
- "ldr q7, [x15, #0x70]\n"
+ "ldr q7, [x10, #0x70]\n"
"fmla v10.8h, v6.8h, v0.h[1]\n"
- "ldr q6, [x15, #0x80]\n"
+ "ldr q6, [x10, #0x80]\n"
"fmla v11.8h, v7.8h, v0.h[1]\n"
- "ldr q7, [x15, #0x90]\n"
+ "ldr q7, [x10, #0x90]\n"
"fmla v8.8h, v6.8h, v0.h[2]\n"
- "ldr q6, [x15, #0xa0]\n"
+ "ldr q6, [x10, #0xa0]\n"
"fmla v9.8h, v7.8h, v0.h[2]\n"
- "ldr q7, [x15, #0xb0]\n"
+ "ldr q7, [x10, #0xb0]\n"
"fmla v10.8h, v6.8h, v0.h[2]\n"
- "ldr q6, [x15, #0xc0]\n"
+ "ldr q6, [x10, #0xc0]\n"
"fmla v11.8h, v7.8h, v0.h[2]\n"
- "ldr q7, [x15, #0xd0]\n"
+ "ldr q7, [x10, #0xd0]\n"
"fmla v8.8h, v6.8h, v0.h[3]\n"
- "ldr q6, [x15, #0xe0]\n"
+ "ldr q6, [x10, #0xe0]\n"
"fmla v9.8h, v7.8h, v0.h[3]\n"
- "ldr q7, [x15, #0xf0]\n"
+ "ldr q7, [x10, #0xf0]\n"
"fmla v10.8h, v6.8h, v0.h[3]\n"
- "ldr q6, [x15, #0x100]\n"
+ "ldr q6, [x10, #0x100]\n"
"fmla v11.8h, v7.8h, v0.h[3]\n"
- "ldr q7, [x15, #0x110]\n"
+ "ldr q7, [x10, #0x110]\n"
"fmla v8.8h, v6.8h, v0.h[4]\n"
- "ldr q6, [x15, #0x120]\n"
+ "ldr q6, [x10, #0x120]\n"
"fmla v9.8h, v7.8h, v0.h[4]\n"
- "ldr q7, [x15, #0x130]\n"
+ "ldr q7, [x10, #0x130]\n"
"fmla v10.8h, v6.8h, v0.h[4]\n"
- "ldr q6, [x15, #0x140]\n"
+ "ldr q6, [x10, #0x140]\n"
"fmla v11.8h, v7.8h, v0.h[4]\n"
- "ldr q7, [x15, #0x150]\n"
+ "ldr q7, [x10, #0x150]\n"
"fmla v8.8h, v6.8h, v0.h[5]\n"
- "ldr q6, [x15, #0x160]\n"
+ "ldr q6, [x10, #0x160]\n"
"fmla v9.8h, v7.8h, v0.h[5]\n"
- "ldr q7, [x15, #0x170]\n"
+ "ldr q7, [x10, #0x170]\n"
"fmla v10.8h, v6.8h, v0.h[5]\n"
- "ldr q6, [x15, #0x180]\n"
+ "ldr q6, [x10, #0x180]\n"
"fmla v11.8h, v7.8h, v0.h[5]\n"
- "ldr q7, [x15, #0x190]\n"
+ "ldr q7, [x10, #0x190]\n"
"fmla v8.8h, v6.8h, v0.h[6]\n"
- "ldr q6, [x15, #0x1a0]\n"
+ "ldr q6, [x10, #0x1a0]\n"
"fmla v9.8h, v7.8h, v0.h[6]\n"
- "ldr q7, [x15, #0x1b0]\n"
+ "ldr q7, [x10, #0x1b0]\n"
"fmla v10.8h, v6.8h, v0.h[6]\n"
- "ldr q6, [x15, #0x1c0]\n"
+ "ldr q6, [x10, #0x1c0]\n"
"fmla v11.8h, v7.8h, v0.h[6]\n"
- "ldr q7, [x15, #0x1d0]\n"
+ "ldr q7, [x10, #0x1d0]\n"
"fmla v8.8h, v6.8h, v0.h[7]\n"
- "ldr q6, [x15, #0x1e0]\n"
+ "ldr q6, [x10, #0x1e0]\n"
"fmla v9.8h, v7.8h, v0.h[7]\n"
- "ldr q7, [x15, #0x1f0]\n"
- "add x15, x15, #0x200\n"
+ "ldr q7, [x10, #0x1f0]\n"
+ "add x10, x10, #0x200\n"
"fmla v10.8h, v6.8h, v0.h[7]\n"
"fmla v11.8h, v7.8h, v0.h[7]\n"
- "29:" // Height 1: Multiply loop: Main loop skip
- "cbz x11, 31f\n"
- "30:" // Height 1: Multiply loop: Odd block loop
- "ldr h0, [x10], #0x2\n"
- "ldr q6, [x15, #0x0]\n"
+ "28:" // Height 1: Multiply loop: Main loop skip
+ "cbz x26, 30f\n"
+ "29:" // Height 1: Multiply loop: Odd block loop
+ "ldr h0, [x25], #0x2\n"
+ "sub x26, x26, #0x1\n"
+ "ldr q6, [x10, #0x0]\n"
"fmla v8.8h, v6.8h, v0.h[0]\n"
- "ldr q7, [x15, #0x10]\n"
- "ldr q6, [x15, #0x20]\n"
+ "ldr q7, [x10, #0x10]\n"
+ "ldr q6, [x10, #0x20]\n"
"fmla v9.8h, v7.8h, v0.h[0]\n"
- "ldr q7, [x15, #0x30]\n"
+ "ldr q7, [x10, #0x30]\n"
+ "add x10, x10, #0x40\n"
"fmla v10.8h, v6.8h, v0.h[0]\n"
- "sub x11, x11, #0x1\n"
- "add x15, x15, #0x40\n"
"fmla v11.8h, v7.8h, v0.h[0]\n"
- "cbnz x11, 30b\n"
- "31:" // Height 1: Multiply loop: No odd multiplies
+ "cbnz x26, 29b\n"
+ "30:" // Height 1: Multiply loop: No odd multiplies
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x12, x12, #0x1\n"
- "cmp x12, x19\n"
- "bne 24b\n"
- "prfm pstl1keep, [x13, #0x0]\n"
- "tbz %x[flags], #1, 32f\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 23b\n"
+ "prfm pstl1keep, [x28, #0x0]\n"
+ "tbz %x[flags], #1, 31f\n"
"add x19, %x[args_ptr], %[offset_min]\n"
"ld1r { v1.8h }, [x19]\n"
"add x19, %x[args_ptr], %[offset_max]\n"
@@ -444,298 +439,290 @@ void a64_hybrid_fp16_mla_6x32 (
"fmax v9.8h, v9.8h, v1.8h\n"
"fmax v10.8h, v10.8h, v1.8h\n"
"fmax v11.8h, v11.8h, v1.8h\n"
- "32:" // Height 1: No activation
- "cmp x16, #0x20\n"
- "bge 49f\n"
- "tbz x16, #4, 40f\n"
- "st1 { v8.8h }, [x13], #0x10\n"
- "st1 { v9.8h }, [x13], #0x10\n"
- "tbz x16, #3, 36f\n"
- "st1 { v10.8h }, [x13], #0x10\n"
- "tbz x16, #2, 34f\n"
- "str d11, [x13], #0x8\n"
- "tbz x16, #1, 33f\n"
- "st1 { v11.s }[2], [x13], #0x4\n"
- "tbz x16, #0, 48f\n"
- "st1 { v11.h }[6], [x13]\n"
- "b 48f\n"
- "33:" // Height 1: Partial direct writeback: partial_1_28
- "tbz x16, #0, 48f\n"
- "st1 { v11.h }[4], [x13]\n"
- "b 48f\n"
- "34:" // Height 1: Partial direct writeback: partial_2_24
- "tbz x16, #1, 35f\n"
- "str s11, [x13], #0x4\n"
- "tbz x16, #0, 48f\n"
- "st1 { v11.h }[2], [x13]\n"
- "b 48f\n"
- "35:" // Height 1: Partial direct writeback: partial_1_24
- "tbz x16, #0, 48f\n"
- "str h11, [x13, #0x0]\n"
- "b 48f\n"
- "36:" // Height 1: Partial direct writeback: partial_4_16
- "tbz x16, #2, 38f\n"
- "str d10, [x13], #0x8\n"
- "tbz x16, #1, 37f\n"
- "st1 { v10.s }[2], [x13], #0x4\n"
- "tbz x16, #0, 48f\n"
- "st1 { v10.h }[6], [x13]\n"
- "b 48f\n"
- "37:" // Height 1: Partial direct writeback: partial_1_20
- "tbz x16, #0, 48f\n"
- "st1 { v10.h }[4], [x13]\n"
- "b 48f\n"
- "38:" // Height 1: Partial direct writeback: partial_2_16
- "tbz x16, #1, 39f\n"
- "str s10, [x13], #0x4\n"
- "tbz x16, #0, 48f\n"
- "st1 { v10.h }[2], [x13]\n"
- "b 48f\n"
- "39:" // Height 1: Partial direct writeback: partial_1_16
- "tbz x16, #0, 48f\n"
- "str h10, [x13, #0x0]\n"
- "b 48f\n"
- "40:" // Height 1: Partial direct writeback: partial_8_0
- "tbz x16, #3, 44f\n"
- "st1 { v8.8h }, [x13], #0x10\n"
- "tbz x16, #2, 42f\n"
- "str d9, [x13], #0x8\n"
- "tbz x16, #1, 41f\n"
- "st1 { v9.s }[2], [x13], #0x4\n"
- "tbz x16, #0, 48f\n"
- "st1 { v9.h }[6], [x13]\n"
- "b 48f\n"
- "41:" // Height 1: Partial direct writeback: partial_1_12
- "tbz x16, #0, 48f\n"
- "st1 { v9.h }[4], [x13]\n"
- "b 48f\n"
- "42:" // Height 1: Partial direct writeback: partial_2_8
- "tbz x16, #1, 43f\n"
- "str s9, [x13], #0x4\n"
- "tbz x16, #0, 48f\n"
- "st1 { v9.h }[2], [x13]\n"
- "b 48f\n"
- "43:" // Height 1: Partial direct writeback: partial_1_8
- "tbz x16, #0, 48f\n"
- "str h9, [x13, #0x0]\n"
- "b 48f\n"
- "44:" // Height 1: Partial direct writeback: partial_4_0
- "tbz x16, #2, 46f\n"
- "str d8, [x13], #0x8\n"
- "tbz x16, #1, 45f\n"
- "st1 { v8.s }[2], [x13], #0x4\n"
- "tbz x16, #0, 48f\n"
- "st1 { v8.h }[6], [x13]\n"
- "b 48f\n"
- "45:" // Height 1: Partial direct writeback: partial_1_4
- "tbz x16, #0, 48f\n"
- "st1 { v8.h }[4], [x13]\n"
- "b 48f\n"
- "46:" // Height 1: Partial direct writeback: partial_2_0
- "tbz x16, #1, 47f\n"
- "str s8, [x13], #0x4\n"
- "tbz x16, #0, 48f\n"
- "st1 { v8.h }[2], [x13]\n"
- "b 48f\n"
- "47:" // Height 1: Partial direct writeback: partial_1_0
- "str h8, [x13, #0x0]\n"
- "48:" // Height 1: Partial direct writeback: Done
- "b 50f\n"
- "49:" // Height 1: Full writeback
- "str q8, [x13, #0x0]\n"
- "str q9, [x13, #0x10]\n"
- "str q10, [x13, #0x20]\n"
- "str q11, [x13, #0x30]\n"
- "add x13, x13, #0x40\n"
- "50:" // Height 1: Writeback done
- "subs x16, x16, #0x20\n"
- "bgt 3b\n"
- "b 302f\n"
- "51:" // Height 2
- "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x14, %x[bias]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "tbz %x[flags], #2, 52f\n"
- "ldr x13, [%x[output_ptr], #0x0]\n"
- "add x13, x13, x19, LSL #1\n"
- "ldr x9, [%x[output_ptr], #0x8]\n"
- "add x9, x9, x19, LSL #1\n"
- "b 53f\n"
- "52:" // Height 2: setup direct output
- "mov x13, %x[output_ptr]\n"
- "add x9, x13, x19, LSL #1\n"
- "53:" // Height 2: Column loop
- "cbz x14, 54f\n"
- "ldr q8, [x14, #0x0]\n"
+ "31:" // Height 1: No activation
+ "cmp x11, #0x20\n"
+ "bge 48f\n"
+ "tbz x11, #4, 39f\n"
+ "st1 { v8.8h }, [x28], #0x10\n"
+ "st1 { v9.8h }, [x28], #0x10\n"
+ "tbz x11, #3, 35f\n"
+ "st1 { v10.8h }, [x28], #0x10\n"
+ "tbz x11, #2, 33f\n"
+ "str d11, [x28], #0x8\n"
+ "tbz x11, #1, 32f\n"
+ "st1 { v11.s }[2], [x28], #0x4\n"
+ "tbz x11, #0, 47f\n"
+ "st1 { v11.h }[6], [x28]\n"
+ "b 47f\n"
+ "32:" // Height 1: Partial direct writeback: partial_1_28
+ "tbz x11, #0, 47f\n"
+ "st1 { v11.h }[4], [x28]\n"
+ "b 47f\n"
+ "33:" // Height 1: Partial direct writeback: partial_2_24
+ "tbz x11, #1, 34f\n"
+ "str s11, [x28], #0x4\n"
+ "tbz x11, #0, 47f\n"
+ "st1 { v11.h }[2], [x28]\n"
+ "b 47f\n"
+ "34:" // Height 1: Partial direct writeback: partial_1_24
+ "tbz x11, #0, 47f\n"
+ "str h11, [x28, #0x0]\n"
+ "b 47f\n"
+ "35:" // Height 1: Partial direct writeback: partial_4_16
+ "tbz x11, #2, 37f\n"
+ "str d10, [x28], #0x8\n"
+ "tbz x11, #1, 36f\n"
+ "st1 { v10.s }[2], [x28], #0x4\n"
+ "tbz x11, #0, 47f\n"
+ "st1 { v10.h }[6], [x28]\n"
+ "b 47f\n"
+ "36:" // Height 1: Partial direct writeback: partial_1_20
+ "tbz x11, #0, 47f\n"
+ "st1 { v10.h }[4], [x28]\n"
+ "b 47f\n"
+ "37:" // Height 1: Partial direct writeback: partial_2_16
+ "tbz x11, #1, 38f\n"
+ "str s10, [x28], #0x4\n"
+ "tbz x11, #0, 47f\n"
+ "st1 { v10.h }[2], [x28]\n"
+ "b 47f\n"
+ "38:" // Height 1: Partial direct writeback: partial_1_16
+ "tbz x11, #0, 47f\n"
+ "str h10, [x28, #0x0]\n"
+ "b 47f\n"
+ "39:" // Height 1: Partial direct writeback: partial_8_0
+ "tbz x11, #3, 43f\n"
+ "st1 { v8.8h }, [x28], #0x10\n"
+ "tbz x11, #2, 41f\n"
+ "str d9, [x28], #0x8\n"
+ "tbz x11, #1, 40f\n"
+ "st1 { v9.s }[2], [x28], #0x4\n"
+ "tbz x11, #0, 47f\n"
+ "st1 { v9.h }[6], [x28]\n"
+ "b 47f\n"
+ "40:" // Height 1: Partial direct writeback: partial_1_12
+ "tbz x11, #0, 47f\n"
+ "st1 { v9.h }[4], [x28]\n"
+ "b 47f\n"
+ "41:" // Height 1: Partial direct writeback: partial_2_8
+ "tbz x11, #1, 42f\n"
+ "str s9, [x28], #0x4\n"
+ "tbz x11, #0, 47f\n"
+ "st1 { v9.h }[2], [x28]\n"
+ "b 47f\n"
+ "42:" // Height 1: Partial direct writeback: partial_1_8
+ "tbz x11, #0, 47f\n"
+ "str h9, [x28, #0x0]\n"
+ "b 47f\n"
+ "43:" // Height 1: Partial direct writeback: partial_4_0
+ "tbz x11, #2, 45f\n"
+ "str d8, [x28], #0x8\n"
+ "tbz x11, #1, 44f\n"
+ "st1 { v8.s }[2], [x28], #0x4\n"
+ "tbz x11, #0, 47f\n"
+ "st1 { v8.h }[6], [x28]\n"
+ "b 47f\n"
+ "44:" // Height 1: Partial direct writeback: partial_1_4
+ "tbz x11, #0, 47f\n"
+ "st1 { v8.h }[4], [x28]\n"
+ "b 47f\n"
+ "45:" // Height 1: Partial direct writeback: partial_2_0
+ "tbz x11, #1, 46f\n"
+ "str s8, [x28], #0x4\n"
+ "tbz x11, #0, 47f\n"
+ "st1 { v8.h }[2], [x28]\n"
+ "b 47f\n"
+ "46:" // Height 1: Partial direct writeback: partial_1_0
+ "str h8, [x28, #0x0]\n"
+ "47:" // Height 1: Partial direct writeback: Done
+ "b 49f\n"
+ "48:" // Height 1: Full writeback
+ "str q8, [x28, #0x0]\n"
+ "str q9, [x28, #0x10]\n"
+ "str q10, [x28, #0x20]\n"
+ "str q11, [x28, #0x30]\n"
+ "add x28, x28, #0x40\n"
+ "49:" // Height 1: Writeback done
+ "subs x11, x11, #0x20\n"
+ "bgt 2b\n"
+ "b 296f\n"
+ "50:" // Height 2
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x9, %x[bias]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "51:" // Height 2: Column loop
+ "cbz x9, 52f\n"
+ "ldr q8, [x9, #0x0]\n"
"mov v12.16b, v8.16b\n"
- "ldr q9, [x14, #0x10]\n"
- "ldr q10, [x14, #0x20]\n"
+ "ldr q9, [x9, #0x10]\n"
+ "ldr q10, [x9, #0x20]\n"
"mov v13.16b, v9.16b\n"
- "ldr q11, [x14, #0x30]\n"
+ "ldr q11, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
"mov v14.16b, v10.16b\n"
- "add x14, x14, #0x40\n"
"mov v15.16b, v11.16b\n"
- "b 73f\n"
- "54:" // Height 2: no bias
- "tbz %x[flags], #0, 72f\n"
- "cmp x16, #0x20\n"
- "bge 71f\n"
- "tbz x16, #4, 62f\n"
- "ld1 { v8.8h }, [x13], #0x10\n"
- "ld1 { v12.8h }, [x9], #0x10\n"
- "ld1 { v9.8h }, [x13], #0x10\n"
- "ld1 { v13.8h }, [x9], #0x10\n"
- "tbz x16, #3, 58f\n"
- "ld1 { v10.8h }, [x13], #0x10\n"
- "ld1 { v14.8h }, [x9], #0x10\n"
- "tbz x16, #2, 56f\n"
- "ldr d11, [x13], #0x8\n"
- "ldr d15, [x9], #0x8\n"
- "tbz x16, #1, 55f\n"
+ "b 71f\n"
+ "52:" // Height 2: no bias
+ "tbz %x[flags], #0, 70f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x11, #0x20\n"
+ "add x24, x28, x19, LSL #1\n"
+ "bge 69f\n"
+ "tbz x11, #4, 60f\n"
+ "ld1 { v8.8h }, [x28], #0x10\n"
+ "ld1 { v12.8h }, [x24], #0x10\n"
+ "ld1 { v9.8h }, [x28], #0x10\n"
+ "ld1 { v13.8h }, [x24], #0x10\n"
+ "tbz x11, #3, 56f\n"
+ "ld1 { v10.8h }, [x28], #0x10\n"
+ "ld1 { v14.8h }, [x24], #0x10\n"
+ "tbz x11, #2, 54f\n"
+ "ldr d11, [x28], #0x8\n"
+ "ldr d15, [x24], #0x8\n"
+ "tbz x11, #1, 53f\n"
"mov x19, #0x3c\n"
- "ld1 { v11.s }[2], [x13], #0x4\n"
- "ld1 { v15.s }[2], [x9], #0x4\n"
- "tbz x16, #0, 70f\n"
- "ld1 { v11.h }[6], [x13]\n"
- "ld1 { v15.h }[6], [x9]\n"
- "b 70f\n"
- "55:" // Height 2: Partial accumulate: partial_1_28
+ "ld1 { v11.s }[2], [x28], #0x4\n"
+ "ld1 { v15.s }[2], [x24], #0x4\n"
+ "tbz x11, #0, 68f\n"
+ "ld1 { v11.h }[6], [x28]\n"
+ "ld1 { v15.h }[6], [x24]\n"
+ "b 68f\n"
+ "53:" // Height 2: Partial accumulate: partial_1_28
"mov x19, #0x38\n"
- "tbz x16, #0, 70f\n"
- "ld1 { v11.h }[4], [x13]\n"
- "ld1 { v15.h }[4], [x9]\n"
- "b 70f\n"
- "56:" // Height 2: Partial accumulate: partial_2_24
- "tbz x16, #1, 57f\n"
- "ldr s11, [x13], #0x4\n"
- "ldr s15, [x9], #0x4\n"
+ "tbz x11, #0, 68f\n"
+ "ld1 { v11.h }[4], [x28]\n"
+ "ld1 { v15.h }[4], [x24]\n"
+ "b 68f\n"
+ "54:" // Height 2: Partial accumulate: partial_2_24
+ "tbz x11, #1, 55f\n"
+ "ldr s11, [x28], #0x4\n"
+ "ldr s15, [x24], #0x4\n"
"mov x19, #0x34\n"
- "tbz x16, #0, 70f\n"
- "ld1 { v11.h }[2], [x13]\n"
- "ld1 { v15.h }[2], [x9]\n"
- "b 70f\n"
- "57:" // Height 2: Partial accumulate: partial_1_24
+ "tbz x11, #0, 68f\n"
+ "ld1 { v11.h }[2], [x28]\n"
+ "ld1 { v15.h }[2], [x24]\n"
+ "b 68f\n"
+ "55:" // Height 2: Partial accumulate: partial_1_24
"mov x19, #0x30\n"
- "tbz x16, #0, 70f\n"
- "ldr h11, [x13, #0x0]\n"
- "ldr h15, [x9, #0x0]\n"
- "b 70f\n"
- "58:" // Height 2: Partial accumulate: partial_4_16
- "tbz x16, #2, 60f\n"
- "ldr d10, [x13], #0x8\n"
- "ldr d14, [x9], #0x8\n"
- "tbz x16, #1, 59f\n"
+ "tbz x11, #0, 68f\n"
+ "ldr h11, [x28, #0x0]\n"
+ "ldr h15, [x24, #0x0]\n"
+ "b 68f\n"
+ "56:" // Height 2: Partial accumulate: partial_4_16
+ "tbz x11, #2, 58f\n"
+ "ldr d10, [x28], #0x8\n"
+ "ldr d14, [x24], #0x8\n"
+ "tbz x11, #1, 57f\n"
"mov x19, #0x2c\n"
- "ld1 { v10.s }[2], [x13], #0x4\n"
- "ld1 { v14.s }[2], [x9], #0x4\n"
- "tbz x16, #0, 70f\n"
- "ld1 { v10.h }[6], [x13]\n"
- "ld1 { v14.h }[6], [x9]\n"
- "b 70f\n"
- "59:" // Height 2: Partial accumulate: partial_1_20
+ "ld1 { v10.s }[2], [x28], #0x4\n"
+ "ld1 { v14.s }[2], [x24], #0x4\n"
+ "tbz x11, #0, 68f\n"
+ "ld1 { v10.h }[6], [x28]\n"
+ "ld1 { v14.h }[6], [x24]\n"
+ "b 68f\n"
+ "57:" // Height 2: Partial accumulate: partial_1_20
"mov x19, #0x28\n"
- "tbz x16, #0, 70f\n"
- "ld1 { v10.h }[4], [x13]\n"
- "ld1 { v14.h }[4], [x9]\n"
- "b 70f\n"
- "60:" // Height 2: Partial accumulate: partial_2_16
- "tbz x16, #1, 61f\n"
- "ldr s10, [x13], #0x4\n"
- "ldr s14, [x9], #0x4\n"
+ "tbz x11, #0, 68f\n"
+ "ld1 { v10.h }[4], [x28]\n"
+ "ld1 { v14.h }[4], [x24]\n"
+ "b 68f\n"
+ "58:" // Height 2: Partial accumulate: partial_2_16
+ "tbz x11, #1, 59f\n"
+ "ldr s10, [x28], #0x4\n"
+ "ldr s14, [x24], #0x4\n"
"mov x19, #0x24\n"
- "tbz x16, #0, 70f\n"
- "ld1 { v10.h }[2], [x13]\n"
- "ld1 { v14.h }[2], [x9]\n"
- "b 70f\n"
- "61:" // Height 2: Partial accumulate: partial_1_16
+ "tbz x11, #0, 68f\n"
+ "ld1 { v10.h }[2], [x28]\n"
+ "ld1 { v14.h }[2], [x24]\n"
+ "b 68f\n"
+ "59:" // Height 2: Partial accumulate: partial_1_16
"mov x19, #0x20\n"
- "tbz x16, #0, 70f\n"
- "ldr h10, [x13, #0x0]\n"
- "ldr h14, [x9, #0x0]\n"
- "b 70f\n"
- "62:" // Height 2: Partial accumulate: partial_8_0
- "tbz x16, #3, 66f\n"
- "ld1 { v8.8h }, [x13], #0x10\n"
- "ld1 { v12.8h }, [x9], #0x10\n"
- "tbz x16, #2, 64f\n"
- "ldr d9, [x13], #0x8\n"
- "ldr d13, [x9], #0x8\n"
- "tbz x16, #1, 63f\n"
+ "tbz x11, #0, 68f\n"
+ "ldr h10, [x28, #0x0]\n"
+ "ldr h14, [x24, #0x0]\n"
+ "b 68f\n"
+ "60:" // Height 2: Partial accumulate: partial_8_0
+ "tbz x11, #3, 64f\n"
+ "ld1 { v8.8h }, [x28], #0x10\n"
+ "ld1 { v12.8h }, [x24], #0x10\n"
+ "tbz x11, #2, 62f\n"
+ "ldr d9, [x28], #0x8\n"
+ "ldr d13, [x24], #0x8\n"
+ "tbz x11, #1, 61f\n"
"mov x19, #0x1c\n"
- "ld1 { v9.s }[2], [x13], #0x4\n"
- "ld1 { v13.s }[2], [x9], #0x4\n"
- "tbz x16, #0, 70f\n"
- "ld1 { v9.h }[6], [x13]\n"
- "ld1 { v13.h }[6], [x9]\n"
- "b 70f\n"
- "63:" // Height 2: Partial accumulate: partial_1_12
+ "ld1 { v9.s }[2], [x28], #0x4\n"
+ "ld1 { v13.s }[2], [x24], #0x4\n"
+ "tbz x11, #0, 68f\n"
+ "ld1 { v9.h }[6], [x28]\n"
+ "ld1 { v13.h }[6], [x24]\n"
+ "b 68f\n"
+ "61:" // Height 2: Partial accumulate: partial_1_12
"mov x19, #0x18\n"
- "tbz x16, #0, 70f\n"
- "ld1 { v9.h }[4], [x13]\n"
- "ld1 { v13.h }[4], [x9]\n"
- "b 70f\n"
- "64:" // Height 2: Partial accumulate: partial_2_8
- "tbz x16, #1, 65f\n"
- "ldr s9, [x13], #0x4\n"
- "ldr s13, [x9], #0x4\n"
+ "tbz x11, #0, 68f\n"
+ "ld1 { v9.h }[4], [x28]\n"
+ "ld1 { v13.h }[4], [x24]\n"
+ "b 68f\n"
+ "62:" // Height 2: Partial accumulate: partial_2_8
+ "tbz x11, #1, 63f\n"
+ "ldr s9, [x28], #0x4\n"
+ "ldr s13, [x24], #0x4\n"
"mov x19, #0x14\n"
- "tbz x16, #0, 70f\n"
- "ld1 { v9.h }[2], [x13]\n"
- "ld1 { v13.h }[2], [x9]\n"
- "b 70f\n"
- "65:" // Height 2: Partial accumulate: partial_1_8
+ "tbz x11, #0, 68f\n"
+ "ld1 { v9.h }[2], [x28]\n"
+ "ld1 { v13.h }[2], [x24]\n"
+ "b 68f\n"
+ "63:" // Height 2: Partial accumulate: partial_1_8
"mov x19, #0x10\n"
- "tbz x16, #0, 70f\n"
- "ldr h9, [x13, #0x0]\n"
- "ldr h13, [x9, #0x0]\n"
- "b 70f\n"
- "66:" // Height 2: Partial accumulate: partial_4_0
- "tbz x16, #2, 68f\n"
- "ldr d8, [x13], #0x8\n"
- "ldr d12, [x9], #0x8\n"
- "tbz x16, #1, 67f\n"
+ "tbz x11, #0, 68f\n"
+ "ldr h9, [x28, #0x0]\n"
+ "ldr h13, [x24, #0x0]\n"
+ "b 68f\n"
+ "64:" // Height 2: Partial accumulate: partial_4_0
+ "tbz x11, #2, 66f\n"
+ "ldr d8, [x28], #0x8\n"
+ "ldr d12, [x24], #0x8\n"
+ "tbz x11, #1, 65f\n"
"mov x19, #0xc\n"
- "ld1 { v8.s }[2], [x13], #0x4\n"
- "ld1 { v12.s }[2], [x9], #0x4\n"
- "tbz x16, #0, 70f\n"
- "ld1 { v8.h }[6], [x13]\n"
- "ld1 { v12.h }[6], [x9]\n"
- "b 70f\n"
- "67:" // Height 2: Partial accumulate: partial_1_4
+ "ld1 { v8.s }[2], [x28], #0x4\n"
+ "ld1 { v12.s }[2], [x24], #0x4\n"
+ "tbz x11, #0, 68f\n"
+ "ld1 { v8.h }[6], [x28]\n"
+ "ld1 { v12.h }[6], [x24]\n"
+ "b 68f\n"
+ "65:" // Height 2: Partial accumulate: partial_1_4
"mov x19, #0x8\n"
- "tbz x16, #0, 70f\n"
- "ld1 { v8.h }[4], [x13]\n"
- "ld1 { v12.h }[4], [x9]\n"
- "b 70f\n"
- "68:" // Height 2: Partial accumulate: partial_2_0
- "tbz x16, #1, 69f\n"
- "ldr s8, [x13], #0x4\n"
- "ldr s12, [x9], #0x4\n"
+ "tbz x11, #0, 68f\n"
+ "ld1 { v8.h }[4], [x28]\n"
+ "ld1 { v12.h }[4], [x24]\n"
+ "b 68f\n"
+ "66:" // Height 2: Partial accumulate: partial_2_0
+ "tbz x11, #1, 67f\n"
+ "ldr s8, [x28], #0x4\n"
+ "ldr s12, [x24], #0x4\n"
"mov x19, #0x4\n"
- "tbz x16, #0, 70f\n"
- "ld1 { v8.h }[2], [x13]\n"
- "ld1 { v12.h }[2], [x9]\n"
- "b 70f\n"
- "69:" // Height 2: Partial accumulate: partial_1_0
+ "tbz x11, #0, 68f\n"
+ "ld1 { v8.h }[2], [x28]\n"
+ "ld1 { v12.h }[2], [x24]\n"
+ "b 68f\n"
+ "67:" // Height 2: Partial accumulate: partial_1_0
+ "ldr h8, [x28, #0x0]\n"
"mov x19, #0x0\n"
- "ldr h8, [x13, #0x0]\n"
- "ldr h12, [x9, #0x0]\n"
- "70:" // Height 2: Partial accumulate: Done
- "sub x13, x13, x19\n"
- "sub x9, x9, x19\n"
- "b 73f\n"
- "71:" // Height 2: full accumulate
- "ldr q8, [x13, #0x0]\n"
- "ldr q9, [x13, #0x10]\n"
- "ldr q10, [x13, #0x20]\n"
- "ldr q11, [x13, #0x30]\n"
- "ldr q12, [x9, #0x0]\n"
- "ldr q13, [x9, #0x10]\n"
- "ldr q14, [x9, #0x20]\n"
- "ldr q15, [x9, #0x30]\n"
- "b 73f\n"
- "72:" // Height 2: no accumulate
+ "ldr h12, [x24, #0x0]\n"
+ "68:" // Height 2: Partial accumulate: Done
+ "sub x28, x28, x19\n"
+ "b 71f\n"
+ "69:" // Height 2: full accumulate
+ "ldr q8, [x28, #0x0]\n"
+ "ldr q9, [x28, #0x10]\n"
+ "ldr q10, [x28, #0x20]\n"
+ "ldr q11, [x28, #0x30]\n"
+ "ldr q12, [x24, #0x0]\n"
+ "ldr q13, [x24, #0x10]\n"
+ "ldr q14, [x24, #0x20]\n"
+ "ldr q15, [x24, #0x30]\n"
+ "b 71f\n"
+ "70:" // Height 2: no accumulate
"movi v8.16b, #0x0\n"
"movi v9.16b, #0x0\n"
"movi v10.16b, #0x0\n"
@@ -744,270 +731,272 @@ void a64_hybrid_fp16_mla_6x32 (
"movi v13.16b, #0x0\n"
"movi v14.16b, #0x0\n"
"movi v15.16b, #0x0\n"
- "73:" // Height 2: setup done
- "mov x12, #0x0\n"
- "74:" // Height 2: String loop
+ "71:" // Height 2: setup done
+ "mov x27, #0x0\n"
+ "72:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w11, [x20, x12, LSL #0x2]\n"
- "tbz %x[flags], #3, 75f\n"
- "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 73f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x10, [x20, #0x0]\n"
- "ldr x28, [x20, #0x8]\n"
- "cbnz x12, 76f\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "cbnz x27, 74f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x10, x10, x19, LSL #1\n"
- "add x28, x28, x19, LSL #1\n"
- "b 76f\n"
- "75:" // Height 2: setup direct input
- "mov x10, %x[input_ptr]\n"
- "add x28, x10, x19, LSL #1\n"
- "76:" // Height 2: input setup done
- "cmp x11, #0x8\n"
- "blt 79f\n"
- "cmp x11, #0x10\n"
- "blt 78f\n"
- "77:" // Height 2: Multiply loop: Main loop head
- "ldr q0, [x10, #0x0]\n"
- "ldr q1, [x28, #0x0]\n"
- "ldr q6, [x15, #0x0]\n"
+ "add x25, x25, x19, LSL #1\n"
+ "add x24, x24, x19, LSL #1\n"
+ "b 74f\n"
+ "73:" // Height 2: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19, LSL #1\n"
+ "74:" // Height 2: input setup done
+ "cmp x26, #0x8\n"
+ "blt 77f\n"
+ "ldr q0, [x25, #0x0]\n"
+ "ldr q1, [x24, #0x0]\n"
+ "cmp x26, #0x10\n"
+ "ldr q6, [x10, #0x0]\n"
+ "blt 76f\n"
+ "75:" // Height 2: Multiply loop: Main loop head
"fmla v8.8h, v6.8h, v0.h[0]\n"
- "ldr q7, [x15, #0x10]\n"
+ "ldr q7, [x10, #0x10]\n"
+ "add x25, x25, #0x10\n"
"fmla v12.8h, v6.8h, v1.h[0]\n"
- "ldr q6, [x15, #0x20]\n"
- "add x10, x10, #0x10\n"
+ "ldr q6, [x10, #0x20]\n"
+ "add x24, x24, #0x10\n"
"fmla v9.8h, v7.8h, v0.h[0]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
- "add x28, x28, #0x10\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "sub x26, x26, #0x8\n"
"fmla v13.8h, v7.8h, v1.h[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "ldr q7, [x15, #0x30]\n"
+ "ldr q7, [x10, #0x30]\n"
+ "cmp x26, #0x10\n"
"fmla v10.8h, v6.8h, v0.h[0]\n"
- "sub x11, x11, #0x8\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
"fmla v14.8h, v6.8h, v1.h[0]\n"
- "ldr q6, [x15, #0x40]\n"
- "cmp x11, #0x10\n"
+ "ldr q6, [x10, #0x40]\n"
"fmla v11.8h, v7.8h, v0.h[0]\n"
"fmla v15.8h, v7.8h, v1.h[0]\n"
- "ldr q7, [x15, #0x50]\n"
+ "ldr q7, [x10, #0x50]\n"
"fmla v8.8h, v6.8h, v0.h[1]\n"
"fmla v12.8h, v6.8h, v1.h[1]\n"
- "ldr q6, [x15, #0x60]\n"
+ "ldr q6, [x10, #0x60]\n"
"fmla v9.8h, v7.8h, v0.h[1]\n"
"fmla v13.8h, v7.8h, v1.h[1]\n"
- "ldr q7, [x15, #0x70]\n"
+ "ldr q7, [x10, #0x70]\n"
"fmla v10.8h, v6.8h, v0.h[1]\n"
"fmla v14.8h, v6.8h, v1.h[1]\n"
- "ldr q6, [x15, #0x80]\n"
+ "ldr q6, [x10, #0x80]\n"
"fmla v11.8h, v7.8h, v0.h[1]\n"
"fmla v15.8h, v7.8h, v1.h[1]\n"
- "ldr q7, [x15, #0x90]\n"
+ "ldr q7, [x10, #0x90]\n"
"fmla v8.8h, v6.8h, v0.h[2]\n"
"fmla v12.8h, v6.8h, v1.h[2]\n"
- "ldr q6, [x15, #0xa0]\n"
+ "ldr q6, [x10, #0xa0]\n"
"fmla v9.8h, v7.8h, v0.h[2]\n"
"fmla v13.8h, v7.8h, v1.h[2]\n"
- "ldr q7, [x15, #0xb0]\n"
+ "ldr q7, [x10, #0xb0]\n"
"fmla v10.8h, v6.8h, v0.h[2]\n"
"fmla v14.8h, v6.8h, v1.h[2]\n"
- "ldr q6, [x15, #0xc0]\n"
+ "ldr q6, [x10, #0xc0]\n"
"fmla v11.8h, v7.8h, v0.h[2]\n"
"fmla v15.8h, v7.8h, v1.h[2]\n"
- "ldr q7, [x15, #0xd0]\n"
+ "ldr q7, [x10, #0xd0]\n"
"fmla v8.8h, v6.8h, v0.h[3]\n"
"fmla v12.8h, v6.8h, v1.h[3]\n"
- "ldr q6, [x15, #0xe0]\n"
+ "ldr q6, [x10, #0xe0]\n"
"fmla v9.8h, v7.8h, v0.h[3]\n"
"fmla v13.8h, v7.8h, v1.h[3]\n"
- "ldr q7, [x15, #0xf0]\n"
+ "ldr q7, [x10, #0xf0]\n"
"fmla v10.8h, v6.8h, v0.h[3]\n"
"fmla v14.8h, v6.8h, v1.h[3]\n"
- "ldr q6, [x15, #0x100]\n"
+ "ldr q6, [x10, #0x100]\n"
"fmla v11.8h, v7.8h, v0.h[3]\n"
"fmla v15.8h, v7.8h, v1.h[3]\n"
- "ldr q7, [x15, #0x110]\n"
+ "ldr q7, [x10, #0x110]\n"
"fmla v8.8h, v6.8h, v0.h[4]\n"
"fmla v12.8h, v6.8h, v1.h[4]\n"
- "ldr q6, [x15, #0x120]\n"
+ "ldr q6, [x10, #0x120]\n"
"fmla v9.8h, v7.8h, v0.h[4]\n"
"fmla v13.8h, v7.8h, v1.h[4]\n"
- "ldr q7, [x15, #0x130]\n"
+ "ldr q7, [x10, #0x130]\n"
"fmla v10.8h, v6.8h, v0.h[4]\n"
"fmla v14.8h, v6.8h, v1.h[4]\n"
- "ldr q6, [x15, #0x140]\n"
+ "ldr q6, [x10, #0x140]\n"
"fmla v11.8h, v7.8h, v0.h[4]\n"
"fmla v15.8h, v7.8h, v1.h[4]\n"
- "ldr q7, [x15, #0x150]\n"
+ "ldr q7, [x10, #0x150]\n"
"fmla v8.8h, v6.8h, v0.h[5]\n"
"fmla v12.8h, v6.8h, v1.h[5]\n"
- "ldr q6, [x15, #0x160]\n"
+ "ldr q6, [x10, #0x160]\n"
"fmla v9.8h, v7.8h, v0.h[5]\n"
"fmla v13.8h, v7.8h, v1.h[5]\n"
- "ldr q7, [x15, #0x170]\n"
+ "ldr q7, [x10, #0x170]\n"
"fmla v10.8h, v6.8h, v0.h[5]\n"
"fmla v14.8h, v6.8h, v1.h[5]\n"
- "ldr q6, [x15, #0x180]\n"
+ "ldr q6, [x10, #0x180]\n"
"fmla v11.8h, v7.8h, v0.h[5]\n"
"fmla v15.8h, v7.8h, v1.h[5]\n"
- "ldr q7, [x15, #0x190]\n"
+ "ldr q7, [x10, #0x190]\n"
"fmla v8.8h, v6.8h, v0.h[6]\n"
"fmla v12.8h, v6.8h, v1.h[6]\n"
- "ldr q6, [x15, #0x1a0]\n"
+ "ldr q6, [x10, #0x1a0]\n"
"fmla v9.8h, v7.8h, v0.h[6]\n"
"fmla v13.8h, v7.8h, v1.h[6]\n"
- "ldr q7, [x15, #0x1b0]\n"
+ "ldr q7, [x10, #0x1b0]\n"
"fmla v10.8h, v6.8h, v0.h[6]\n"
"fmla v14.8h, v6.8h, v1.h[6]\n"
- "ldr q6, [x15, #0x1c0]\n"
+ "ldr q6, [x10, #0x1c0]\n"
"fmla v11.8h, v7.8h, v0.h[6]\n"
"fmla v15.8h, v7.8h, v1.h[6]\n"
- "ldr q7, [x15, #0x1d0]\n"
+ "ldr q7, [x10, #0x1d0]\n"
"fmla v8.8h, v6.8h, v0.h[7]\n"
"fmla v12.8h, v6.8h, v1.h[7]\n"
- "ldr q6, [x15, #0x1e0]\n"
+ "ldr q6, [x10, #0x1e0]\n"
"fmla v9.8h, v7.8h, v0.h[7]\n"
"fmla v13.8h, v7.8h, v1.h[7]\n"
- "ldr q7, [x15, #0x1f0]\n"
- "add x15, x15, #0x200\n"
+ "ldr q7, [x10, #0x1f0]\n"
+ "add x10, x10, #0x200\n"
"fmla v10.8h, v6.8h, v0.h[7]\n"
"fmla v14.8h, v6.8h, v1.h[7]\n"
+ "ldr q6, [x10, #0x0]\n"
"fmla v11.8h, v7.8h, v0.h[7]\n"
+ "ldr q0, [x25, #0x0]\n"
"fmla v15.8h, v7.8h, v1.h[7]\n"
- "bge 77b\n"
- "78:" // Height 2: Multiply loop: Single iteration only
- "sub x11, x11, #0x8\n"
- "ldr q0, [x10, #0x0]\n"
- "ldr q1, [x28, #0x0]\n"
- "ldr q6, [x15, #0x0]\n"
+ "ldr q1, [x24, #0x0]\n"
+ "bge 75b\n"
+ "76:" // Height 2: Multiply loop: Single iteration only
"fmla v8.8h, v6.8h, v0.h[0]\n"
- "ldr q7, [x15, #0x10]\n"
+ "ldr q7, [x10, #0x10]\n"
+ "sub x26, x26, #0x8\n"
"fmla v12.8h, v6.8h, v1.h[0]\n"
- "ldr q6, [x15, #0x20]\n"
- "add x10, x10, #0x10\n"
+ "ldr q6, [x10, #0x20]\n"
+ "add x25, x25, #0x10\n"
"fmla v9.8h, v7.8h, v0.h[0]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
- "add x28, x28, #0x10\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x24, x24, #0x10\n"
"fmla v13.8h, v7.8h, v1.h[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "ldr q7, [x15, #0x30]\n"
+ "ldr q7, [x10, #0x30]\n"
"fmla v10.8h, v6.8h, v0.h[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
"fmla v14.8h, v6.8h, v1.h[0]\n"
- "ldr q6, [x15, #0x40]\n"
+ "ldr q6, [x10, #0x40]\n"
"fmla v11.8h, v7.8h, v0.h[0]\n"
"fmla v15.8h, v7.8h, v1.h[0]\n"
- "ldr q7, [x15, #0x50]\n"
+ "ldr q7, [x10, #0x50]\n"
"fmla v8.8h, v6.8h, v0.h[1]\n"
"fmla v12.8h, v6.8h, v1.h[1]\n"
- "ldr q6, [x15, #0x60]\n"
+ "ldr q6, [x10, #0x60]\n"
"fmla v9.8h, v7.8h, v0.h[1]\n"
"fmla v13.8h, v7.8h, v1.h[1]\n"
- "ldr q7, [x15, #0x70]\n"
+ "ldr q7, [x10, #0x70]\n"
"fmla v10.8h, v6.8h, v0.h[1]\n"
"fmla v14.8h, v6.8h, v1.h[1]\n"
- "ldr q6, [x15, #0x80]\n"
+ "ldr q6, [x10, #0x80]\n"
"fmla v11.8h, v7.8h, v0.h[1]\n"
"fmla v15.8h, v7.8h, v1.h[1]\n"
- "ldr q7, [x15, #0x90]\n"
+ "ldr q7, [x10, #0x90]\n"
"fmla v8.8h, v6.8h, v0.h[2]\n"
"fmla v12.8h, v6.8h, v1.h[2]\n"
- "ldr q6, [x15, #0xa0]\n"
+ "ldr q6, [x10, #0xa0]\n"
"fmla v9.8h, v7.8h, v0.h[2]\n"
"fmla v13.8h, v7.8h, v1.h[2]\n"
- "ldr q7, [x15, #0xb0]\n"
+ "ldr q7, [x10, #0xb0]\n"
"fmla v10.8h, v6.8h, v0.h[2]\n"
"fmla v14.8h, v6.8h, v1.h[2]\n"
- "ldr q6, [x15, #0xc0]\n"
+ "ldr q6, [x10, #0xc0]\n"
"fmla v11.8h, v7.8h, v0.h[2]\n"
"fmla v15.8h, v7.8h, v1.h[2]\n"
- "ldr q7, [x15, #0xd0]\n"
+ "ldr q7, [x10, #0xd0]\n"
"fmla v8.8h, v6.8h, v0.h[3]\n"
"fmla v12.8h, v6.8h, v1.h[3]\n"
- "ldr q6, [x15, #0xe0]\n"
+ "ldr q6, [x10, #0xe0]\n"
"fmla v9.8h, v7.8h, v0.h[3]\n"
"fmla v13.8h, v7.8h, v1.h[3]\n"
- "ldr q7, [x15, #0xf0]\n"
+ "ldr q7, [x10, #0xf0]\n"
"fmla v10.8h, v6.8h, v0.h[3]\n"
"fmla v14.8h, v6.8h, v1.h[3]\n"
- "ldr q6, [x15, #0x100]\n"
+ "ldr q6, [x10, #0x100]\n"
"fmla v11.8h, v7.8h, v0.h[3]\n"
"fmla v15.8h, v7.8h, v1.h[3]\n"
- "ldr q7, [x15, #0x110]\n"
+ "ldr q7, [x10, #0x110]\n"
"fmla v8.8h, v6.8h, v0.h[4]\n"
"fmla v12.8h, v6.8h, v1.h[4]\n"
- "ldr q6, [x15, #0x120]\n"
+ "ldr q6, [x10, #0x120]\n"
"fmla v9.8h, v7.8h, v0.h[4]\n"
"fmla v13.8h, v7.8h, v1.h[4]\n"
- "ldr q7, [x15, #0x130]\n"
+ "ldr q7, [x10, #0x130]\n"
"fmla v10.8h, v6.8h, v0.h[4]\n"
"fmla v14.8h, v6.8h, v1.h[4]\n"
- "ldr q6, [x15, #0x140]\n"
+ "ldr q6, [x10, #0x140]\n"
"fmla v11.8h, v7.8h, v0.h[4]\n"
"fmla v15.8h, v7.8h, v1.h[4]\n"
- "ldr q7, [x15, #0x150]\n"
+ "ldr q7, [x10, #0x150]\n"
"fmla v8.8h, v6.8h, v0.h[5]\n"
"fmla v12.8h, v6.8h, v1.h[5]\n"
- "ldr q6, [x15, #0x160]\n"
+ "ldr q6, [x10, #0x160]\n"
"fmla v9.8h, v7.8h, v0.h[5]\n"
"fmla v13.8h, v7.8h, v1.h[5]\n"
- "ldr q7, [x15, #0x170]\n"
+ "ldr q7, [x10, #0x170]\n"
"fmla v10.8h, v6.8h, v0.h[5]\n"
"fmla v14.8h, v6.8h, v1.h[5]\n"
- "ldr q6, [x15, #0x180]\n"
+ "ldr q6, [x10, #0x180]\n"
"fmla v11.8h, v7.8h, v0.h[5]\n"
"fmla v15.8h, v7.8h, v1.h[5]\n"
- "ldr q7, [x15, #0x190]\n"
+ "ldr q7, [x10, #0x190]\n"
"fmla v8.8h, v6.8h, v0.h[6]\n"
"fmla v12.8h, v6.8h, v1.h[6]\n"
- "ldr q6, [x15, #0x1a0]\n"
+ "ldr q6, [x10, #0x1a0]\n"
"fmla v9.8h, v7.8h, v0.h[6]\n"
"fmla v13.8h, v7.8h, v1.h[6]\n"
- "ldr q7, [x15, #0x1b0]\n"
+ "ldr q7, [x10, #0x1b0]\n"
"fmla v10.8h, v6.8h, v0.h[6]\n"
"fmla v14.8h, v6.8h, v1.h[6]\n"
- "ldr q6, [x15, #0x1c0]\n"
+ "ldr q6, [x10, #0x1c0]\n"
"fmla v11.8h, v7.8h, v0.h[6]\n"
"fmla v15.8h, v7.8h, v1.h[6]\n"
- "ldr q7, [x15, #0x1d0]\n"
+ "ldr q7, [x10, #0x1d0]\n"
"fmla v8.8h, v6.8h, v0.h[7]\n"
"fmla v12.8h, v6.8h, v1.h[7]\n"
- "ldr q6, [x15, #0x1e0]\n"
+ "ldr q6, [x10, #0x1e0]\n"
"fmla v9.8h, v7.8h, v0.h[7]\n"
"fmla v13.8h, v7.8h, v1.h[7]\n"
- "ldr q7, [x15, #0x1f0]\n"
- "add x15, x15, #0x200\n"
+ "ldr q7, [x10, #0x1f0]\n"
+ "add x10, x10, #0x200\n"
"fmla v10.8h, v6.8h, v0.h[7]\n"
"fmla v14.8h, v6.8h, v1.h[7]\n"
"fmla v11.8h, v7.8h, v0.h[7]\n"
"fmla v15.8h, v7.8h, v1.h[7]\n"
- "79:" // Height 2: Multiply loop: Main loop skip
- "cbz x11, 81f\n"
- "80:" // Height 2: Multiply loop: Odd block loop
- "ldr h0, [x10], #0x2\n"
- "ldr h1, [x28], #0x2\n"
- "ldr q6, [x15, #0x0]\n"
+ "77:" // Height 2: Multiply loop: Main loop skip
+ "cbz x26, 79f\n"
+ "78:" // Height 2: Multiply loop: Odd block loop
+ "ldr h0, [x25], #0x2\n"
+ "sub x26, x26, #0x1\n"
+ "ldr h1, [x24], #0x2\n"
+ "ldr q6, [x10, #0x0]\n"
"fmla v8.8h, v6.8h, v0.h[0]\n"
- "ldr q7, [x15, #0x10]\n"
+ "ldr q7, [x10, #0x10]\n"
"fmla v12.8h, v6.8h, v1.h[0]\n"
- "ldr q6, [x15, #0x20]\n"
- "sub x11, x11, #0x1\n"
+ "ldr q6, [x10, #0x20]\n"
"fmla v9.8h, v7.8h, v0.h[0]\n"
"fmla v13.8h, v7.8h, v1.h[0]\n"
- "ldr q7, [x15, #0x30]\n"
+ "ldr q7, [x10, #0x30]\n"
+ "add x10, x10, #0x40\n"
"fmla v10.8h, v6.8h, v0.h[0]\n"
- "add x15, x15, #0x40\n"
"fmla v14.8h, v6.8h, v1.h[0]\n"
"fmla v11.8h, v7.8h, v0.h[0]\n"
"fmla v15.8h, v7.8h, v1.h[0]\n"
- "cbnz x11, 80b\n"
- "81:" // Height 2: Multiply loop: No odd multiplies
+ "cbnz x26, 78b\n"
+ "79:" // Height 2: Multiply loop: No odd multiplies
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x12, x12, #0x1\n"
- "cmp x12, x19\n"
- "bne 74b\n"
- "prfm pstl1keep, [x13, #0x0]\n"
- "prfm pstl1keep, [x9, #0x0]\n"
- "tbz %x[flags], #1, 82f\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 72b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "prfm pstl1keep, [x28, #0x0]\n"
+ "add x24, x28, x19, LSL #1\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "tbz %x[flags], #1, 80f\n"
"add x19, %x[args_ptr], %[offset_min]\n"
"ld1r { v1.8h }, [x19]\n"
"add x19, %x[args_ptr], %[offset_max]\n"
@@ -1028,379 +1017,367 @@ void a64_hybrid_fp16_mla_6x32 (
"fmax v14.8h, v14.8h, v1.8h\n"
"fmin v15.8h, v15.8h, v0.8h\n"
"fmax v15.8h, v15.8h, v1.8h\n"
- "82:" // Height 2: No activation
- "cmp x16, #0x20\n"
- "bge 99f\n"
- "tbz x16, #4, 90f\n"
- "st1 { v8.8h }, [x13], #0x10\n"
- "st1 { v9.8h }, [x13], #0x10\n"
- "st1 { v12.8h }, [x9], #0x10\n"
- "st1 { v13.8h }, [x9], #0x10\n"
- "tbz x16, #3, 86f\n"
- "st1 { v10.8h }, [x13], #0x10\n"
- "st1 { v14.8h }, [x9], #0x10\n"
- "tbz x16, #2, 84f\n"
- "str d11, [x13], #0x8\n"
- "str d15, [x9], #0x8\n"
- "tbz x16, #1, 83f\n"
- "st1 { v11.s }[2], [x13], #0x4\n"
- "st1 { v15.s }[2], [x9], #0x4\n"
- "tbz x16, #0, 98f\n"
- "st1 { v11.h }[6], [x13]\n"
- "st1 { v15.h }[6], [x9]\n"
- "b 98f\n"
- "83:" // Height 2: Partial direct writeback: partial_1_28
- "tbz x16, #0, 98f\n"
- "st1 { v11.h }[4], [x13]\n"
- "st1 { v15.h }[4], [x9]\n"
- "b 98f\n"
- "84:" // Height 2: Partial direct writeback: partial_2_24
- "tbz x16, #1, 85f\n"
- "str s11, [x13], #0x4\n"
- "str s15, [x9], #0x4\n"
- "tbz x16, #0, 98f\n"
- "st1 { v11.h }[2], [x13]\n"
- "st1 { v15.h }[2], [x9]\n"
- "b 98f\n"
- "85:" // Height 2: Partial direct writeback: partial_1_24
- "tbz x16, #0, 98f\n"
- "str h11, [x13, #0x0]\n"
- "str h15, [x9, #0x0]\n"
- "b 98f\n"
- "86:" // Height 2: Partial direct writeback: partial_4_16
- "tbz x16, #2, 88f\n"
- "str d10, [x13], #0x8\n"
- "str d14, [x9], #0x8\n"
- "tbz x16, #1, 87f\n"
- "st1 { v10.s }[2], [x13], #0x4\n"
- "st1 { v14.s }[2], [x9], #0x4\n"
- "tbz x16, #0, 98f\n"
- "st1 { v10.h }[6], [x13]\n"
- "st1 { v14.h }[6], [x9]\n"
- "b 98f\n"
- "87:" // Height 2: Partial direct writeback: partial_1_20
- "tbz x16, #0, 98f\n"
- "st1 { v10.h }[4], [x13]\n"
- "st1 { v14.h }[4], [x9]\n"
- "b 98f\n"
- "88:" // Height 2: Partial direct writeback: partial_2_16
- "tbz x16, #1, 89f\n"
- "str s10, [x13], #0x4\n"
- "str s14, [x9], #0x4\n"
- "tbz x16, #0, 98f\n"
- "st1 { v10.h }[2], [x13]\n"
- "st1 { v14.h }[2], [x9]\n"
- "b 98f\n"
- "89:" // Height 2: Partial direct writeback: partial_1_16
- "tbz x16, #0, 98f\n"
- "str h10, [x13, #0x0]\n"
- "str h14, [x9, #0x0]\n"
- "b 98f\n"
- "90:" // Height 2: Partial direct writeback: partial_8_0
- "tbz x16, #3, 94f\n"
- "st1 { v8.8h }, [x13], #0x10\n"
- "st1 { v12.8h }, [x9], #0x10\n"
- "tbz x16, #2, 92f\n"
- "str d9, [x13], #0x8\n"
- "str d13, [x9], #0x8\n"
- "tbz x16, #1, 91f\n"
- "st1 { v9.s }[2], [x13], #0x4\n"
- "st1 { v13.s }[2], [x9], #0x4\n"
- "tbz x16, #0, 98f\n"
- "st1 { v9.h }[6], [x13]\n"
- "st1 { v13.h }[6], [x9]\n"
+ "80:" // Height 2: No activation
+ "cmp x11, #0x20\n"
+ "bge 97f\n"
+ "tbz x11, #4, 88f\n"
+ "st1 { v8.8h }, [x28], #0x10\n"
+ "st1 { v9.8h }, [x28], #0x10\n"
+ "st1 { v12.8h }, [x24], #0x10\n"
+ "st1 { v13.8h }, [x24], #0x10\n"
+ "tbz x11, #3, 84f\n"
+ "st1 { v10.8h }, [x28], #0x10\n"
+ "st1 { v14.8h }, [x24], #0x10\n"
+ "tbz x11, #2, 82f\n"
+ "str d11, [x28], #0x8\n"
+ "str d15, [x24], #0x8\n"
+ "tbz x11, #1, 81f\n"
+ "st1 { v11.s }[2], [x28], #0x4\n"
+ "st1 { v15.s }[2], [x24], #0x4\n"
+ "tbz x11, #0, 96f\n"
+ "st1 { v11.h }[6], [x28]\n"
+ "st1 { v15.h }[6], [x24]\n"
+ "b 96f\n"
+ "81:" // Height 2: Partial direct writeback: partial_1_28
+ "tbz x11, #0, 96f\n"
+ "st1 { v11.h }[4], [x28]\n"
+ "st1 { v15.h }[4], [x24]\n"
+ "b 96f\n"
+ "82:" // Height 2: Partial direct writeback: partial_2_24
+ "tbz x11, #1, 83f\n"
+ "str s11, [x28], #0x4\n"
+ "str s15, [x24], #0x4\n"
+ "tbz x11, #0, 96f\n"
+ "st1 { v11.h }[2], [x28]\n"
+ "st1 { v15.h }[2], [x24]\n"
+ "b 96f\n"
+ "83:" // Height 2: Partial direct writeback: partial_1_24
+ "tbz x11, #0, 96f\n"
+ "str h11, [x28, #0x0]\n"
+ "str h15, [x24, #0x0]\n"
+ "b 96f\n"
+ "84:" // Height 2: Partial direct writeback: partial_4_16
+ "tbz x11, #2, 86f\n"
+ "str d10, [x28], #0x8\n"
+ "str d14, [x24], #0x8\n"
+ "tbz x11, #1, 85f\n"
+ "st1 { v10.s }[2], [x28], #0x4\n"
+ "st1 { v14.s }[2], [x24], #0x4\n"
+ "tbz x11, #0, 96f\n"
+ "st1 { v10.h }[6], [x28]\n"
+ "st1 { v14.h }[6], [x24]\n"
+ "b 96f\n"
+ "85:" // Height 2: Partial direct writeback: partial_1_20
+ "tbz x11, #0, 96f\n"
+ "st1 { v10.h }[4], [x28]\n"
+ "st1 { v14.h }[4], [x24]\n"
+ "b 96f\n"
+ "86:" // Height 2: Partial direct writeback: partial_2_16
+ "tbz x11, #1, 87f\n"
+ "str s10, [x28], #0x4\n"
+ "str s14, [x24], #0x4\n"
+ "tbz x11, #0, 96f\n"
+ "st1 { v10.h }[2], [x28]\n"
+ "st1 { v14.h }[2], [x24]\n"
+ "b 96f\n"
+ "87:" // Height 2: Partial direct writeback: partial_1_16
+ "tbz x11, #0, 96f\n"
+ "str h10, [x28, #0x0]\n"
+ "str h14, [x24, #0x0]\n"
+ "b 96f\n"
+ "88:" // Height 2: Partial direct writeback: partial_8_0
+ "tbz x11, #3, 92f\n"
+ "st1 { v8.8h }, [x28], #0x10\n"
+ "st1 { v12.8h }, [x24], #0x10\n"
+ "tbz x11, #2, 90f\n"
+ "str d9, [x28], #0x8\n"
+ "str d13, [x24], #0x8\n"
+ "tbz x11, #1, 89f\n"
+ "st1 { v9.s }[2], [x28], #0x4\n"
+ "st1 { v13.s }[2], [x24], #0x4\n"
+ "tbz x11, #0, 96f\n"
+ "st1 { v9.h }[6], [x28]\n"
+ "st1 { v13.h }[6], [x24]\n"
+ "b 96f\n"
+ "89:" // Height 2: Partial direct writeback: partial_1_12
+ "tbz x11, #0, 96f\n"
+ "st1 { v9.h }[4], [x28]\n"
+ "st1 { v13.h }[4], [x24]\n"
+ "b 96f\n"
+ "90:" // Height 2: Partial direct writeback: partial_2_8
+ "tbz x11, #1, 91f\n"
+ "str s9, [x28], #0x4\n"
+ "str s13, [x24], #0x4\n"
+ "tbz x11, #0, 96f\n"
+ "st1 { v9.h }[2], [x28]\n"
+ "st1 { v13.h }[2], [x24]\n"
+ "b 96f\n"
+ "91:" // Height 2: Partial direct writeback: partial_1_8
+ "tbz x11, #0, 96f\n"
+ "str h9, [x28, #0x0]\n"
+ "str h13, [x24, #0x0]\n"
+ "b 96f\n"
+ "92:" // Height 2: Partial direct writeback: partial_4_0
+ "tbz x11, #2, 94f\n"
+ "str d8, [x28], #0x8\n"
+ "str d12, [x24], #0x8\n"
+ "tbz x11, #1, 93f\n"
+ "st1 { v8.s }[2], [x28], #0x4\n"
+ "st1 { v12.s }[2], [x24], #0x4\n"
+ "tbz x11, #0, 96f\n"
+ "st1 { v8.h }[6], [x28]\n"
+ "st1 { v12.h }[6], [x24]\n"
+ "b 96f\n"
+ "93:" // Height 2: Partial direct writeback: partial_1_4
+ "tbz x11, #0, 96f\n"
+ "st1 { v8.h }[4], [x28]\n"
+ "st1 { v12.h }[4], [x24]\n"
+ "b 96f\n"
+ "94:" // Height 2: Partial direct writeback: partial_2_0
+ "tbz x11, #1, 95f\n"
+ "str s8, [x28], #0x4\n"
+ "str s12, [x24], #0x4\n"
+ "tbz x11, #0, 96f\n"
+ "st1 { v8.h }[2], [x28]\n"
+ "st1 { v12.h }[2], [x24]\n"
+ "b 96f\n"
+ "95:" // Height 2: Partial direct writeback: partial_1_0
+ "str h8, [x28, #0x0]\n"
+ "str h12, [x24, #0x0]\n"
+ "96:" // Height 2: Partial direct writeback: Done
"b 98f\n"
- "91:" // Height 2: Partial direct writeback: partial_1_12
- "tbz x16, #0, 98f\n"
- "st1 { v9.h }[4], [x13]\n"
- "st1 { v13.h }[4], [x9]\n"
- "b 98f\n"
- "92:" // Height 2: Partial direct writeback: partial_2_8
- "tbz x16, #1, 93f\n"
- "str s9, [x13], #0x4\n"
- "str s13, [x9], #0x4\n"
- "tbz x16, #0, 98f\n"
- "st1 { v9.h }[2], [x13]\n"
- "st1 { v13.h }[2], [x9]\n"
- "b 98f\n"
- "93:" // Height 2: Partial direct writeback: partial_1_8
- "tbz x16, #0, 98f\n"
- "str h9, [x13, #0x0]\n"
- "str h13, [x9, #0x0]\n"
- "b 98f\n"
- "94:" // Height 2: Partial direct writeback: partial_4_0
- "tbz x16, #2, 96f\n"
- "str d8, [x13], #0x8\n"
- "str d12, [x9], #0x8\n"
- "tbz x16, #1, 95f\n"
- "st1 { v8.s }[2], [x13], #0x4\n"
- "st1 { v12.s }[2], [x9], #0x4\n"
- "tbz x16, #0, 98f\n"
- "st1 { v8.h }[6], [x13]\n"
- "st1 { v12.h }[6], [x9]\n"
- "b 98f\n"
- "95:" // Height 2: Partial direct writeback: partial_1_4
- "tbz x16, #0, 98f\n"
- "st1 { v8.h }[4], [x13]\n"
- "st1 { v12.h }[4], [x9]\n"
- "b 98f\n"
- "96:" // Height 2: Partial direct writeback: partial_2_0
- "tbz x16, #1, 97f\n"
- "str s8, [x13], #0x4\n"
- "str s12, [x9], #0x4\n"
- "tbz x16, #0, 98f\n"
- "st1 { v8.h }[2], [x13]\n"
- "st1 { v12.h }[2], [x9]\n"
- "b 98f\n"
- "97:" // Height 2: Partial direct writeback: partial_1_0
- "str h8, [x13, #0x0]\n"
- "str h12, [x9, #0x0]\n"
- "98:" // Height 2: Partial direct writeback: Done
- "b 100f\n"
- "99:" // Height 2: Full writeback
- "str q8, [x13, #0x0]\n"
- "str q9, [x13, #0x10]\n"
- "str q10, [x13, #0x20]\n"
- "str q11, [x13, #0x30]\n"
- "str q12, [x9, #0x0]\n"
- "str q13, [x9, #0x10]\n"
- "str q14, [x9, #0x20]\n"
- "str q15, [x9, #0x30]\n"
- "add x13, x13, #0x40\n"
- "add x9, x9, #0x40\n"
- "100:" // Height 2: Writeback done
- "subs x16, x16, #0x20\n"
- "bgt 53b\n"
- "b 302f\n"
- "101:" // Height 3
- "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x14, %x[bias]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "tbz %x[flags], #2, 102f\n"
- "ldr x13, [%x[output_ptr], #0x0]\n"
- "add x13, x13, x19, LSL #1\n"
- "ldr x9, [%x[output_ptr], #0x8]\n"
- "ldr x27, [%x[output_ptr], #0x10]\n"
- "add x9, x9, x19, LSL #1\n"
- "add x27, x27, x19, LSL #1\n"
- "b 103f\n"
- "102:" // Height 3: setup direct output
- "mov x13, %x[output_ptr]\n"
- "add x9, x13, x19, LSL #1\n"
- "add x27, x9, x19, LSL #1\n"
- "103:" // Height 3: Column loop
- "cbz x14, 104f\n"
- "ldr q8, [x14, #0x0]\n"
+ "97:" // Height 2: Full writeback
+ "str q8, [x28, #0x0]\n"
+ "str q9, [x28, #0x10]\n"
+ "str q10, [x28, #0x20]\n"
+ "str q11, [x28, #0x30]\n"
+ "add x28, x28, #0x40\n"
+ "str q12, [x24, #0x0]\n"
+ "str q13, [x24, #0x10]\n"
+ "str q14, [x24, #0x20]\n"
+ "str q15, [x24, #0x30]\n"
+ "98:" // Height 2: Writeback done
+ "subs x11, x11, #0x20\n"
+ "bgt 51b\n"
+ "b 296f\n"
+ "99:" // Height 3
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x9, %x[bias]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "100:" // Height 3: Column loop
+ "cbz x9, 101f\n"
+ "ldr q8, [x9, #0x0]\n"
"mov v12.16b, v8.16b\n"
- "ldr q9, [x14, #0x10]\n"
+ "ldr q9, [x9, #0x10]\n"
"mov v16.16b, v8.16b\n"
- "ldr q10, [x14, #0x20]\n"
- "ldr q11, [x14, #0x30]\n"
+ "ldr q10, [x9, #0x20]\n"
+ "ldr q11, [x9, #0x30]\n"
"mov v13.16b, v9.16b\n"
- "add x14, x14, #0x40\n"
+ "add x9, x9, #0x40\n"
"mov v17.16b, v9.16b\n"
"mov v14.16b, v10.16b\n"
"mov v15.16b, v11.16b\n"
"mov v18.16b, v10.16b\n"
"mov v19.16b, v11.16b\n"
- "b 123f\n"
- "104:" // Height 3: no bias
- "tbz %x[flags], #0, 122f\n"
- "cmp x16, #0x20\n"
- "bge 121f\n"
- "tbz x16, #4, 112f\n"
- "ld1 { v8.8h }, [x13], #0x10\n"
- "ld1 { v12.8h }, [x9], #0x10\n"
- "ld1 { v16.8h }, [x27], #0x10\n"
- "ld1 { v9.8h }, [x13], #0x10\n"
- "ld1 { v13.8h }, [x9], #0x10\n"
- "ld1 { v17.8h }, [x27], #0x10\n"
- "tbz x16, #3, 108f\n"
- "ld1 { v10.8h }, [x13], #0x10\n"
- "ld1 { v14.8h }, [x9], #0x10\n"
- "ld1 { v18.8h }, [x27], #0x10\n"
- "tbz x16, #2, 106f\n"
- "ldr d11, [x13], #0x8\n"
- "ldr d15, [x9], #0x8\n"
- "ldr d19, [x27], #0x8\n"
- "tbz x16, #1, 105f\n"
- "mov x19, #0x3c\n"
- "ld1 { v11.s }[2], [x13], #0x4\n"
- "ld1 { v15.s }[2], [x9], #0x4\n"
- "ld1 { v19.s }[2], [x27], #0x4\n"
- "tbz x16, #0, 120f\n"
- "ld1 { v11.h }[6], [x13]\n"
- "ld1 { v15.h }[6], [x9]\n"
- "ld1 { v19.h }[6], [x27]\n"
"b 120f\n"
- "105:" // Height 3: Partial accumulate: partial_1_28
+ "101:" // Height 3: no bias
+ "tbz %x[flags], #0, 119f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x11, #0x20\n"
+ "add x24, x28, x19, LSL #1\n"
+ "add x23, x24, x19, LSL #1\n"
+ "bge 118f\n"
+ "tbz x11, #4, 109f\n"
+ "ld1 { v8.8h }, [x28], #0x10\n"
+ "ld1 { v12.8h }, [x24], #0x10\n"
+ "ld1 { v16.8h }, [x23], #0x10\n"
+ "ld1 { v9.8h }, [x28], #0x10\n"
+ "ld1 { v13.8h }, [x24], #0x10\n"
+ "ld1 { v17.8h }, [x23], #0x10\n"
+ "tbz x11, #3, 105f\n"
+ "ld1 { v10.8h }, [x28], #0x10\n"
+ "ld1 { v14.8h }, [x24], #0x10\n"
+ "ld1 { v18.8h }, [x23], #0x10\n"
+ "tbz x11, #2, 103f\n"
+ "ldr d11, [x28], #0x8\n"
+ "ldr d15, [x24], #0x8\n"
+ "ldr d19, [x23], #0x8\n"
+ "tbz x11, #1, 102f\n"
+ "mov x19, #0x3c\n"
+ "ld1 { v11.s }[2], [x28], #0x4\n"
+ "ld1 { v15.s }[2], [x24], #0x4\n"
+ "ld1 { v19.s }[2], [x23], #0x4\n"
+ "tbz x11, #0, 117f\n"
+ "ld1 { v11.h }[6], [x28]\n"
+ "ld1 { v15.h }[6], [x24]\n"
+ "ld1 { v19.h }[6], [x23]\n"
+ "b 117f\n"
+ "102:" // Height 3: Partial accumulate: partial_1_28
"mov x19, #0x38\n"
- "tbz x16, #0, 120f\n"
- "ld1 { v11.h }[4], [x13]\n"
- "ld1 { v15.h }[4], [x9]\n"
- "ld1 { v19.h }[4], [x27]\n"
- "b 120f\n"
- "106:" // Height 3: Partial accumulate: partial_2_24
- "tbz x16, #1, 107f\n"
- "ldr s11, [x13], #0x4\n"
- "ldr s15, [x9], #0x4\n"
- "ldr s19, [x27], #0x4\n"
+ "tbz x11, #0, 117f\n"
+ "ld1 { v11.h }[4], [x28]\n"
+ "ld1 { v15.h }[4], [x24]\n"
+ "ld1 { v19.h }[4], [x23]\n"
+ "b 117f\n"
+ "103:" // Height 3: Partial accumulate: partial_2_24
+ "tbz x11, #1, 104f\n"
+ "ldr s11, [x28], #0x4\n"
+ "ldr s15, [x24], #0x4\n"
"mov x19, #0x34\n"
- "tbz x16, #0, 120f\n"
- "ld1 { v11.h }[2], [x13]\n"
- "ld1 { v15.h }[2], [x9]\n"
- "ld1 { v19.h }[2], [x27]\n"
- "b 120f\n"
- "107:" // Height 3: Partial accumulate: partial_1_24
+ "ldr s19, [x23], #0x4\n"
+ "tbz x11, #0, 117f\n"
+ "ld1 { v11.h }[2], [x28]\n"
+ "ld1 { v15.h }[2], [x24]\n"
+ "ld1 { v19.h }[2], [x23]\n"
+ "b 117f\n"
+ "104:" // Height 3: Partial accumulate: partial_1_24
"mov x19, #0x30\n"
- "tbz x16, #0, 120f\n"
- "ldr h11, [x13, #0x0]\n"
- "ldr h15, [x9, #0x0]\n"
- "ldr h19, [x27, #0x0]\n"
- "b 120f\n"
- "108:" // Height 3: Partial accumulate: partial_4_16
- "tbz x16, #2, 110f\n"
- "ldr d10, [x13], #0x8\n"
- "ldr d14, [x9], #0x8\n"
- "ldr d18, [x27], #0x8\n"
- "tbz x16, #1, 109f\n"
+ "tbz x11, #0, 117f\n"
+ "ldr h11, [x28, #0x0]\n"
+ "ldr h15, [x24, #0x0]\n"
+ "ldr h19, [x23, #0x0]\n"
+ "b 117f\n"
+ "105:" // Height 3: Partial accumulate: partial_4_16
+ "tbz x11, #2, 107f\n"
+ "ldr d10, [x28], #0x8\n"
+ "ldr d14, [x24], #0x8\n"
+ "ldr d18, [x23], #0x8\n"
+ "tbz x11, #1, 106f\n"
"mov x19, #0x2c\n"
- "ld1 { v10.s }[2], [x13], #0x4\n"
- "ld1 { v14.s }[2], [x9], #0x4\n"
- "ld1 { v18.s }[2], [x27], #0x4\n"
- "tbz x16, #0, 120f\n"
- "ld1 { v10.h }[6], [x13]\n"
- "ld1 { v14.h }[6], [x9]\n"
- "ld1 { v18.h }[6], [x27]\n"
- "b 120f\n"
- "109:" // Height 3: Partial accumulate: partial_1_20
+ "ld1 { v10.s }[2], [x28], #0x4\n"
+ "ld1 { v14.s }[2], [x24], #0x4\n"
+ "ld1 { v18.s }[2], [x23], #0x4\n"
+ "tbz x11, #0, 117f\n"
+ "ld1 { v10.h }[6], [x28]\n"
+ "ld1 { v14.h }[6], [x24]\n"
+ "ld1 { v18.h }[6], [x23]\n"
+ "b 117f\n"
+ "106:" // Height 3: Partial accumulate: partial_1_20
"mov x19, #0x28\n"
- "tbz x16, #0, 120f\n"
- "ld1 { v10.h }[4], [x13]\n"
- "ld1 { v14.h }[4], [x9]\n"
- "ld1 { v18.h }[4], [x27]\n"
- "b 120f\n"
- "110:" // Height 3: Partial accumulate: partial_2_16
- "tbz x16, #1, 111f\n"
- "ldr s10, [x13], #0x4\n"
- "ldr s14, [x9], #0x4\n"
- "ldr s18, [x27], #0x4\n"
+ "tbz x11, #0, 117f\n"
+ "ld1 { v10.h }[4], [x28]\n"
+ "ld1 { v14.h }[4], [x24]\n"
+ "ld1 { v18.h }[4], [x23]\n"
+ "b 117f\n"
+ "107:" // Height 3: Partial accumulate: partial_2_16
+ "tbz x11, #1, 108f\n"
+ "ldr s10, [x28], #0x4\n"
+ "ldr s14, [x24], #0x4\n"
"mov x19, #0x24\n"
- "tbz x16, #0, 120f\n"
- "ld1 { v10.h }[2], [x13]\n"
- "ld1 { v14.h }[2], [x9]\n"
- "ld1 { v18.h }[2], [x27]\n"
- "b 120f\n"
- "111:" // Height 3: Partial accumulate: partial_1_16
+ "ldr s18, [x23], #0x4\n"
+ "tbz x11, #0, 117f\n"
+ "ld1 { v10.h }[2], [x28]\n"
+ "ld1 { v14.h }[2], [x24]\n"
+ "ld1 { v18.h }[2], [x23]\n"
+ "b 117f\n"
+ "108:" // Height 3: Partial accumulate: partial_1_16
"mov x19, #0x20\n"
- "tbz x16, #0, 120f\n"
- "ldr h10, [x13, #0x0]\n"
- "ldr h14, [x9, #0x0]\n"
- "ldr h18, [x27, #0x0]\n"
- "b 120f\n"
- "112:" // Height 3: Partial accumulate: partial_8_0
- "tbz x16, #3, 116f\n"
- "ld1 { v8.8h }, [x13], #0x10\n"
- "ld1 { v12.8h }, [x9], #0x10\n"
- "ld1 { v16.8h }, [x27], #0x10\n"
- "tbz x16, #2, 114f\n"
- "ldr d9, [x13], #0x8\n"
- "ldr d13, [x9], #0x8\n"
- "ldr d17, [x27], #0x8\n"
- "tbz x16, #1, 113f\n"
+ "tbz x11, #0, 117f\n"
+ "ldr h10, [x28, #0x0]\n"
+ "ldr h14, [x24, #0x0]\n"
+ "ldr h18, [x23, #0x0]\n"
+ "b 117f\n"
+ "109:" // Height 3: Partial accumulate: partial_8_0
+ "tbz x11, #3, 113f\n"
+ "ld1 { v8.8h }, [x28], #0x10\n"
+ "ld1 { v12.8h }, [x24], #0x10\n"
+ "ld1 { v16.8h }, [x23], #0x10\n"
+ "tbz x11, #2, 111f\n"
+ "ldr d9, [x28], #0x8\n"
+ "ldr d13, [x24], #0x8\n"
+ "ldr d17, [x23], #0x8\n"
+ "tbz x11, #1, 110f\n"
"mov x19, #0x1c\n"
- "ld1 { v9.s }[2], [x13], #0x4\n"
- "ld1 { v13.s }[2], [x9], #0x4\n"
- "ld1 { v17.s }[2], [x27], #0x4\n"
- "tbz x16, #0, 120f\n"
- "ld1 { v9.h }[6], [x13]\n"
- "ld1 { v13.h }[6], [x9]\n"
- "ld1 { v17.h }[6], [x27]\n"
- "b 120f\n"
- "113:" // Height 3: Partial accumulate: partial_1_12
+ "ld1 { v9.s }[2], [x28], #0x4\n"
+ "ld1 { v13.s }[2], [x24], #0x4\n"
+ "ld1 { v17.s }[2], [x23], #0x4\n"
+ "tbz x11, #0, 117f\n"
+ "ld1 { v9.h }[6], [x28]\n"
+ "ld1 { v13.h }[6], [x24]\n"
+ "ld1 { v17.h }[6], [x23]\n"
+ "b 117f\n"
+ "110:" // Height 3: Partial accumulate: partial_1_12
"mov x19, #0x18\n"
- "tbz x16, #0, 120f\n"
- "ld1 { v9.h }[4], [x13]\n"
- "ld1 { v13.h }[4], [x9]\n"
- "ld1 { v17.h }[4], [x27]\n"
- "b 120f\n"
- "114:" // Height 3: Partial accumulate: partial_2_8
- "tbz x16, #1, 115f\n"
- "ldr s9, [x13], #0x4\n"
- "ldr s13, [x9], #0x4\n"
- "ldr s17, [x27], #0x4\n"
+ "tbz x11, #0, 117f\n"
+ "ld1 { v9.h }[4], [x28]\n"
+ "ld1 { v13.h }[4], [x24]\n"
+ "ld1 { v17.h }[4], [x23]\n"
+ "b 117f\n"
+ "111:" // Height 3: Partial accumulate: partial_2_8
+ "tbz x11, #1, 112f\n"
+ "ldr s9, [x28], #0x4\n"
+ "ldr s13, [x24], #0x4\n"
"mov x19, #0x14\n"
- "tbz x16, #0, 120f\n"
- "ld1 { v9.h }[2], [x13]\n"
- "ld1 { v13.h }[2], [x9]\n"
- "ld1 { v17.h }[2], [x27]\n"
- "b 120f\n"
- "115:" // Height 3: Partial accumulate: partial_1_8
+ "ldr s17, [x23], #0x4\n"
+ "tbz x11, #0, 117f\n"
+ "ld1 { v9.h }[2], [x28]\n"
+ "ld1 { v13.h }[2], [x24]\n"
+ "ld1 { v17.h }[2], [x23]\n"
+ "b 117f\n"
+ "112:" // Height 3: Partial accumulate: partial_1_8
"mov x19, #0x10\n"
- "tbz x16, #0, 120f\n"
- "ldr h9, [x13, #0x0]\n"
- "ldr h13, [x9, #0x0]\n"
- "ldr h17, [x27, #0x0]\n"
- "b 120f\n"
- "116:" // Height 3: Partial accumulate: partial_4_0
- "tbz x16, #2, 118f\n"
- "ldr d8, [x13], #0x8\n"
- "ldr d12, [x9], #0x8\n"
- "ldr d16, [x27], #0x8\n"
- "tbz x16, #1, 117f\n"
+ "tbz x11, #0, 117f\n"
+ "ldr h9, [x28, #0x0]\n"
+ "ldr h13, [x24, #0x0]\n"
+ "ldr h17, [x23, #0x0]\n"
+ "b 117f\n"
+ "113:" // Height 3: Partial accumulate: partial_4_0
+ "tbz x11, #2, 115f\n"
+ "ldr d8, [x28], #0x8\n"
+ "ldr d12, [x24], #0x8\n"
+ "ldr d16, [x23], #0x8\n"
+ "tbz x11, #1, 114f\n"
"mov x19, #0xc\n"
- "ld1 { v8.s }[2], [x13], #0x4\n"
- "ld1 { v12.s }[2], [x9], #0x4\n"
- "ld1 { v16.s }[2], [x27], #0x4\n"
- "tbz x16, #0, 120f\n"
- "ld1 { v8.h }[6], [x13]\n"
- "ld1 { v12.h }[6], [x9]\n"
- "ld1 { v16.h }[6], [x27]\n"
- "b 120f\n"
- "117:" // Height 3: Partial accumulate: partial_1_4
+ "ld1 { v8.s }[2], [x28], #0x4\n"
+ "ld1 { v12.s }[2], [x24], #0x4\n"
+ "ld1 { v16.s }[2], [x23], #0x4\n"
+ "tbz x11, #0, 117f\n"
+ "ld1 { v8.h }[6], [x28]\n"
+ "ld1 { v12.h }[6], [x24]\n"
+ "ld1 { v16.h }[6], [x23]\n"
+ "b 117f\n"
+ "114:" // Height 3: Partial accumulate: partial_1_4
"mov x19, #0x8\n"
- "tbz x16, #0, 120f\n"
- "ld1 { v8.h }[4], [x13]\n"
- "ld1 { v12.h }[4], [x9]\n"
- "ld1 { v16.h }[4], [x27]\n"
- "b 120f\n"
- "118:" // Height 3: Partial accumulate: partial_2_0
- "tbz x16, #1, 119f\n"
- "ldr s8, [x13], #0x4\n"
- "ldr s12, [x9], #0x4\n"
- "ldr s16, [x27], #0x4\n"
+ "tbz x11, #0, 117f\n"
+ "ld1 { v8.h }[4], [x28]\n"
+ "ld1 { v12.h }[4], [x24]\n"
+ "ld1 { v16.h }[4], [x23]\n"
+ "b 117f\n"
+ "115:" // Height 3: Partial accumulate: partial_2_0
+ "tbz x11, #1, 116f\n"
+ "ldr s8, [x28], #0x4\n"
+ "ldr s12, [x24], #0x4\n"
"mov x19, #0x4\n"
- "tbz x16, #0, 120f\n"
- "ld1 { v8.h }[2], [x13]\n"
- "ld1 { v12.h }[2], [x9]\n"
- "ld1 { v16.h }[2], [x27]\n"
- "b 120f\n"
- "119:" // Height 3: Partial accumulate: partial_1_0
+ "ldr s16, [x23], #0x4\n"
+ "tbz x11, #0, 117f\n"
+ "ld1 { v8.h }[2], [x28]\n"
+ "ld1 { v12.h }[2], [x24]\n"
+ "ld1 { v16.h }[2], [x23]\n"
+ "b 117f\n"
+ "116:" // Height 3: Partial accumulate: partial_1_0
+ "ldr h8, [x28, #0x0]\n"
"mov x19, #0x0\n"
- "ldr h8, [x13, #0x0]\n"
- "ldr h12, [x9, #0x0]\n"
- "ldr h16, [x27, #0x0]\n"
- "120:" // Height 3: Partial accumulate: Done
- "sub x13, x13, x19\n"
- "sub x9, x9, x19\n"
- "sub x27, x27, x19\n"
- "b 123f\n"
- "121:" // Height 3: full accumulate
- "ldr q8, [x13, #0x0]\n"
- "ldr q9, [x13, #0x10]\n"
- "ldr q10, [x13, #0x20]\n"
- "ldr q11, [x13, #0x30]\n"
- "ldr q12, [x9, #0x0]\n"
- "ldr q13, [x9, #0x10]\n"
- "ldr q14, [x9, #0x20]\n"
- "ldr q15, [x9, #0x30]\n"
- "ldr q16, [x27, #0x0]\n"
- "ldr q17, [x27, #0x10]\n"
- "ldr q18, [x27, #0x20]\n"
- "ldr q19, [x27, #0x30]\n"
- "b 123f\n"
- "122:" // Height 3: no accumulate
+ "ldr h12, [x24, #0x0]\n"
+ "ldr h16, [x23, #0x0]\n"
+ "117:" // Height 3: Partial accumulate: Done
+ "sub x28, x28, x19\n"
+ "b 120f\n"
+ "118:" // Height 3: full accumulate
+ "ldr q8, [x28, #0x0]\n"
+ "ldr q9, [x28, #0x10]\n"
+ "ldr q10, [x28, #0x20]\n"
+ "ldr q11, [x28, #0x30]\n"
+ "ldr q12, [x24, #0x0]\n"
+ "ldr q13, [x24, #0x10]\n"
+ "ldr q14, [x24, #0x20]\n"
+ "ldr q15, [x24, #0x30]\n"
+ "ldr q16, [x23, #0x0]\n"
+ "ldr q17, [x23, #0x10]\n"
+ "ldr q18, [x23, #0x20]\n"
+ "ldr q19, [x23, #0x30]\n"
+ "b 120f\n"
+ "119:" // Height 3: no accumulate
"movi v8.16b, #0x0\n"
"movi v9.16b, #0x0\n"
"movi v10.16b, #0x0\n"
@@ -1413,349 +1390,352 @@ void a64_hybrid_fp16_mla_6x32 (
"movi v17.16b, #0x0\n"
"movi v18.16b, #0x0\n"
"movi v19.16b, #0x0\n"
- "123:" // Height 3: setup done
- "mov x12, #0x0\n"
- "124:" // Height 3: String loop
+ "120:" // Height 3: setup done
+ "mov x27, #0x0\n"
+ "121:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w11, [x20, x12, LSL #0x2]\n"
- "tbz %x[flags], #3, 125f\n"
- "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 122f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x10, [x20, #0x0]\n"
- "ldr x28, [x20, #0x8]\n"
- "ldr x26, [x20, #0x10]\n"
- "cbnz x12, 126f\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "cbnz x27, 123f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x10, x10, x19, LSL #1\n"
- "add x28, x28, x19, LSL #1\n"
- "add x26, x26, x19, LSL #1\n"
- "b 126f\n"
- "125:" // Height 3: setup direct input
- "mov x10, %x[input_ptr]\n"
- "add x28, x10, x19, LSL #1\n"
- "add x26, x28, x19, LSL #1\n"
- "126:" // Height 3: input setup done
- "cmp x11, #0x8\n"
- "blt 129f\n"
- "cmp x11, #0x10\n"
- "blt 128f\n"
- "127:" // Height 3: Multiply loop: Main loop head
- "ldr q0, [x10, #0x0]\n"
- "ldr q1, [x28, #0x0]\n"
- "ldr q2, [x26, #0x0]\n"
- "ldr q6, [x15, #0x0]\n"
+ "add x25, x25, x19, LSL #1\n"
+ "add x24, x24, x19, LSL #1\n"
+ "add x23, x23, x19, LSL #1\n"
+ "b 123f\n"
+ "122:" // Height 3: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19, LSL #1\n"
+ "add x23, x24, x19, LSL #1\n"
+ "123:" // Height 3: input setup done
+ "cmp x26, #0x8\n"
+ "blt 126f\n"
+ "ldr q0, [x25, #0x0]\n"
+ "ldr q1, [x24, #0x0]\n"
+ "cmp x26, #0x10\n"
+ "ldr q2, [x23, #0x0]\n"
+ "ldr q6, [x10, #0x0]\n"
+ "blt 125f\n"
+ "124:" // Height 3: Multiply loop: Main loop head
"fmla v8.8h, v6.8h, v0.h[0]\n"
- "ldr q7, [x15, #0x10]\n"
+ "ldr q7, [x10, #0x10]\n"
+ "add x25, x25, #0x10\n"
"fmla v12.8h, v6.8h, v1.h[0]\n"
- "add x10, x10, #0x10\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x24, x24, #0x10\n"
"fmla v16.8h, v6.8h, v2.h[0]\n"
- "ldr q6, [x15, #0x20]\n"
+ "ldr q6, [x10, #0x20]\n"
+ "add x23, x23, #0x10\n"
"fmla v9.8h, v7.8h, v0.h[0]\n"
- "add x28, x28, #0x10\n"
- "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "sub x26, x26, #0x8\n"
"fmla v13.8h, v7.8h, v1.h[0]\n"
- "add x26, x26, #0x10\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "cmp x26, #0x10\n"
"fmla v17.8h, v7.8h, v2.h[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "ldr q7, [x15, #0x30]\n"
+ "ldr q7, [x10, #0x30]\n"
"fmla v10.8h, v6.8h, v0.h[0]\n"
- "sub x11, x11, #0x8\n"
"fmla v14.8h, v6.8h, v1.h[0]\n"
- "cmp x11, #0x10\n"
"fmla v18.8h, v6.8h, v2.h[0]\n"
- "ldr q6, [x15, #0x40]\n"
+ "ldr q6, [x10, #0x40]\n"
"fmla v11.8h, v7.8h, v0.h[0]\n"
"fmla v15.8h, v7.8h, v1.h[0]\n"
"fmla v19.8h, v7.8h, v2.h[0]\n"
- "ldr q7, [x15, #0x50]\n"
+ "ldr q7, [x10, #0x50]\n"
"fmla v8.8h, v6.8h, v0.h[1]\n"
"fmla v12.8h, v6.8h, v1.h[1]\n"
"fmla v16.8h, v6.8h, v2.h[1]\n"
- "ldr q6, [x15, #0x60]\n"
+ "ldr q6, [x10, #0x60]\n"
"fmla v9.8h, v7.8h, v0.h[1]\n"
"fmla v13.8h, v7.8h, v1.h[1]\n"
"fmla v17.8h, v7.8h, v2.h[1]\n"
- "ldr q7, [x15, #0x70]\n"
+ "ldr q7, [x10, #0x70]\n"
"fmla v10.8h, v6.8h, v0.h[1]\n"
"fmla v14.8h, v6.8h, v1.h[1]\n"
"fmla v18.8h, v6.8h, v2.h[1]\n"
- "ldr q6, [x15, #0x80]\n"
+ "ldr q6, [x10, #0x80]\n"
"fmla v11.8h, v7.8h, v0.h[1]\n"
"fmla v15.8h, v7.8h, v1.h[1]\n"
"fmla v19.8h, v7.8h, v2.h[1]\n"
- "ldr q7, [x15, #0x90]\n"
+ "ldr q7, [x10, #0x90]\n"
"fmla v8.8h, v6.8h, v0.h[2]\n"
"fmla v12.8h, v6.8h, v1.h[2]\n"
"fmla v16.8h, v6.8h, v2.h[2]\n"
- "ldr q6, [x15, #0xa0]\n"
+ "ldr q6, [x10, #0xa0]\n"
"fmla v9.8h, v7.8h, v0.h[2]\n"
"fmla v13.8h, v7.8h, v1.h[2]\n"
"fmla v17.8h, v7.8h, v2.h[2]\n"
- "ldr q7, [x15, #0xb0]\n"
+ "ldr q7, [x10, #0xb0]\n"
"fmla v10.8h, v6.8h, v0.h[2]\n"
"fmla v14.8h, v6.8h, v1.h[2]\n"
"fmla v18.8h, v6.8h, v2.h[2]\n"
- "ldr q6, [x15, #0xc0]\n"
+ "ldr q6, [x10, #0xc0]\n"
"fmla v11.8h, v7.8h, v0.h[2]\n"
"fmla v15.8h, v7.8h, v1.h[2]\n"
"fmla v19.8h, v7.8h, v2.h[2]\n"
- "ldr q7, [x15, #0xd0]\n"
+ "ldr q7, [x10, #0xd0]\n"
"fmla v8.8h, v6.8h, v0.h[3]\n"
"fmla v12.8h, v6.8h, v1.h[3]\n"
"fmla v16.8h, v6.8h, v2.h[3]\n"
- "ldr q6, [x15, #0xe0]\n"
+ "ldr q6, [x10, #0xe0]\n"
"fmla v9.8h, v7.8h, v0.h[3]\n"
"fmla v13.8h, v7.8h, v1.h[3]\n"
"fmla v17.8h, v7.8h, v2.h[3]\n"
- "ldr q7, [x15, #0xf0]\n"
+ "ldr q7, [x10, #0xf0]\n"
"fmla v10.8h, v6.8h, v0.h[3]\n"
"fmla v14.8h, v6.8h, v1.h[3]\n"
"fmla v18.8h, v6.8h, v2.h[3]\n"
- "ldr q6, [x15, #0x100]\n"
+ "ldr q6, [x10, #0x100]\n"
"fmla v11.8h, v7.8h, v0.h[3]\n"
"fmla v15.8h, v7.8h, v1.h[3]\n"
"fmla v19.8h, v7.8h, v2.h[3]\n"
- "ldr q7, [x15, #0x110]\n"
+ "ldr q7, [x10, #0x110]\n"
"fmla v8.8h, v6.8h, v0.h[4]\n"
"fmla v12.8h, v6.8h, v1.h[4]\n"
"fmla v16.8h, v6.8h, v2.h[4]\n"
- "ldr q6, [x15, #0x120]\n"
+ "ldr q6, [x10, #0x120]\n"
"fmla v9.8h, v7.8h, v0.h[4]\n"
"fmla v13.8h, v7.8h, v1.h[4]\n"
"fmla v17.8h, v7.8h, v2.h[4]\n"
- "ldr q7, [x15, #0x130]\n"
+ "ldr q7, [x10, #0x130]\n"
"fmla v10.8h, v6.8h, v0.h[4]\n"
"fmla v14.8h, v6.8h, v1.h[4]\n"
"fmla v18.8h, v6.8h, v2.h[4]\n"
- "ldr q6, [x15, #0x140]\n"
+ "ldr q6, [x10, #0x140]\n"
"fmla v11.8h, v7.8h, v0.h[4]\n"
"fmla v15.8h, v7.8h, v1.h[4]\n"
"fmla v19.8h, v7.8h, v2.h[4]\n"
- "ldr q7, [x15, #0x150]\n"
+ "ldr q7, [x10, #0x150]\n"
"fmla v8.8h, v6.8h, v0.h[5]\n"
"fmla v12.8h, v6.8h, v1.h[5]\n"
"fmla v16.8h, v6.8h, v2.h[5]\n"
- "ldr q6, [x15, #0x160]\n"
+ "ldr q6, [x10, #0x160]\n"
"fmla v9.8h, v7.8h, v0.h[5]\n"
"fmla v13.8h, v7.8h, v1.h[5]\n"
"fmla v17.8h, v7.8h, v2.h[5]\n"
- "ldr q7, [x15, #0x170]\n"
+ "ldr q7, [x10, #0x170]\n"
"fmla v10.8h, v6.8h, v0.h[5]\n"
"fmla v14.8h, v6.8h, v1.h[5]\n"
"fmla v18.8h, v6.8h, v2.h[5]\n"
- "ldr q6, [x15, #0x180]\n"
+ "ldr q6, [x10, #0x180]\n"
"fmla v11.8h, v7.8h, v0.h[5]\n"
"fmla v15.8h, v7.8h, v1.h[5]\n"
"fmla v19.8h, v7.8h, v2.h[5]\n"
- "ldr q7, [x15, #0x190]\n"
+ "ldr q7, [x10, #0x190]\n"
"fmla v8.8h, v6.8h, v0.h[6]\n"
"fmla v12.8h, v6.8h, v1.h[6]\n"
"fmla v16.8h, v6.8h, v2.h[6]\n"
- "ldr q6, [x15, #0x1a0]\n"
+ "ldr q6, [x10, #0x1a0]\n"
"fmla v9.8h, v7.8h, v0.h[6]\n"
"fmla v13.8h, v7.8h, v1.h[6]\n"
"fmla v17.8h, v7.8h, v2.h[6]\n"
- "ldr q7, [x15, #0x1b0]\n"
+ "ldr q7, [x10, #0x1b0]\n"
"fmla v10.8h, v6.8h, v0.h[6]\n"
"fmla v14.8h, v6.8h, v1.h[6]\n"
"fmla v18.8h, v6.8h, v2.h[6]\n"
- "ldr q6, [x15, #0x1c0]\n"
+ "ldr q6, [x10, #0x1c0]\n"
"fmla v11.8h, v7.8h, v0.h[6]\n"
"fmla v15.8h, v7.8h, v1.h[6]\n"
"fmla v19.8h, v7.8h, v2.h[6]\n"
- "ldr q7, [x15, #0x1d0]\n"
+ "ldr q7, [x10, #0x1d0]\n"
"fmla v8.8h, v6.8h, v0.h[7]\n"
"fmla v12.8h, v6.8h, v1.h[7]\n"
"fmla v16.8h, v6.8h, v2.h[7]\n"
- "ldr q6, [x15, #0x1e0]\n"
+ "ldr q6, [x10, #0x1e0]\n"
"fmla v9.8h, v7.8h, v0.h[7]\n"
"fmla v13.8h, v7.8h, v1.h[7]\n"
"fmla v17.8h, v7.8h, v2.h[7]\n"
- "ldr q7, [x15, #0x1f0]\n"
- "add x15, x15, #0x200\n"
+ "ldr q7, [x10, #0x1f0]\n"
+ "add x10, x10, #0x200\n"
"fmla v10.8h, v6.8h, v0.h[7]\n"
"fmla v14.8h, v6.8h, v1.h[7]\n"
"fmla v18.8h, v6.8h, v2.h[7]\n"
+ "ldr q6, [x10, #0x0]\n"
"fmla v11.8h, v7.8h, v0.h[7]\n"
+ "ldr q0, [x25, #0x0]\n"
"fmla v15.8h, v7.8h, v1.h[7]\n"
+ "ldr q1, [x24, #0x0]\n"
"fmla v19.8h, v7.8h, v2.h[7]\n"
- "bge 127b\n"
- "128:" // Height 3: Multiply loop: Single iteration only
- "sub x11, x11, #0x8\n"
- "ldr q0, [x10, #0x0]\n"
- "ldr q1, [x28, #0x0]\n"
- "ldr q2, [x26, #0x0]\n"
- "ldr q6, [x15, #0x0]\n"
+ "ldr q2, [x23, #0x0]\n"
+ "bge 124b\n"
+ "125:" // Height 3: Multiply loop: Single iteration only
"fmla v8.8h, v6.8h, v0.h[0]\n"
- "ldr q7, [x15, #0x10]\n"
+ "ldr q7, [x10, #0x10]\n"
+ "sub x26, x26, #0x8\n"
"fmla v12.8h, v6.8h, v1.h[0]\n"
- "add x10, x10, #0x10\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "add x25, x25, #0x10\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
"fmla v16.8h, v6.8h, v2.h[0]\n"
- "ldr q6, [x15, #0x20]\n"
+ "add x24, x24, #0x10\n"
"fmla v9.8h, v7.8h, v0.h[0]\n"
- "add x28, x28, #0x10\n"
- "prfm pldl1keep, [x28, #0x80]\n"
+ "ldr q6, [x10, #0x20]\n"
+ "add x23, x23, #0x10\n"
"fmla v13.8h, v7.8h, v1.h[0]\n"
- "add x26, x26, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
"fmla v17.8h, v7.8h, v2.h[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "ldr q7, [x15, #0x30]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "ldr q7, [x10, #0x30]\n"
"fmla v10.8h, v6.8h, v0.h[0]\n"
"fmla v14.8h, v6.8h, v1.h[0]\n"
"fmla v18.8h, v6.8h, v2.h[0]\n"
- "ldr q6, [x15, #0x40]\n"
+ "ldr q6, [x10, #0x40]\n"
"fmla v11.8h, v7.8h, v0.h[0]\n"
"fmla v15.8h, v7.8h, v1.h[0]\n"
"fmla v19.8h, v7.8h, v2.h[0]\n"
- "ldr q7, [x15, #0x50]\n"
+ "ldr q7, [x10, #0x50]\n"
"fmla v8.8h, v6.8h, v0.h[1]\n"
"fmla v12.8h, v6.8h, v1.h[1]\n"
"fmla v16.8h, v6.8h, v2.h[1]\n"
- "ldr q6, [x15, #0x60]\n"
+ "ldr q6, [x10, #0x60]\n"
"fmla v9.8h, v7.8h, v0.h[1]\n"
"fmla v13.8h, v7.8h, v1.h[1]\n"
"fmla v17.8h, v7.8h, v2.h[1]\n"
- "ldr q7, [x15, #0x70]\n"
+ "ldr q7, [x10, #0x70]\n"
"fmla v10.8h, v6.8h, v0.h[1]\n"
"fmla v14.8h, v6.8h, v1.h[1]\n"
"fmla v18.8h, v6.8h, v2.h[1]\n"
- "ldr q6, [x15, #0x80]\n"
+ "ldr q6, [x10, #0x80]\n"
"fmla v11.8h, v7.8h, v0.h[1]\n"
"fmla v15.8h, v7.8h, v1.h[1]\n"
"fmla v19.8h, v7.8h, v2.h[1]\n"
- "ldr q7, [x15, #0x90]\n"
+ "ldr q7, [x10, #0x90]\n"
"fmla v8.8h, v6.8h, v0.h[2]\n"
"fmla v12.8h, v6.8h, v1.h[2]\n"
"fmla v16.8h, v6.8h, v2.h[2]\n"
- "ldr q6, [x15, #0xa0]\n"
+ "ldr q6, [x10, #0xa0]\n"
"fmla v9.8h, v7.8h, v0.h[2]\n"
"fmla v13.8h, v7.8h, v1.h[2]\n"
"fmla v17.8h, v7.8h, v2.h[2]\n"
- "ldr q7, [x15, #0xb0]\n"
+ "ldr q7, [x10, #0xb0]\n"
"fmla v10.8h, v6.8h, v0.h[2]\n"
"fmla v14.8h, v6.8h, v1.h[2]\n"
"fmla v18.8h, v6.8h, v2.h[2]\n"
- "ldr q6, [x15, #0xc0]\n"
+ "ldr q6, [x10, #0xc0]\n"
"fmla v11.8h, v7.8h, v0.h[2]\n"
"fmla v15.8h, v7.8h, v1.h[2]\n"
"fmla v19.8h, v7.8h, v2.h[2]\n"
- "ldr q7, [x15, #0xd0]\n"
+ "ldr q7, [x10, #0xd0]\n"
"fmla v8.8h, v6.8h, v0.h[3]\n"
"fmla v12.8h, v6.8h, v1.h[3]\n"
"fmla v16.8h, v6.8h, v2.h[3]\n"
- "ldr q6, [x15, #0xe0]\n"
+ "ldr q6, [x10, #0xe0]\n"
"fmla v9.8h, v7.8h, v0.h[3]\n"
"fmla v13.8h, v7.8h, v1.h[3]\n"
"fmla v17.8h, v7.8h, v2.h[3]\n"
- "ldr q7, [x15, #0xf0]\n"
+ "ldr q7, [x10, #0xf0]\n"
"fmla v10.8h, v6.8h, v0.h[3]\n"
"fmla v14.8h, v6.8h, v1.h[3]\n"
"fmla v18.8h, v6.8h, v2.h[3]\n"
- "ldr q6, [x15, #0x100]\n"
+ "ldr q6, [x10, #0x100]\n"
"fmla v11.8h, v7.8h, v0.h[3]\n"
"fmla v15.8h, v7.8h, v1.h[3]\n"
"fmla v19.8h, v7.8h, v2.h[3]\n"
- "ldr q7, [x15, #0x110]\n"
+ "ldr q7, [x10, #0x110]\n"
"fmla v8.8h, v6.8h, v0.h[4]\n"
"fmla v12.8h, v6.8h, v1.h[4]\n"
"fmla v16.8h, v6.8h, v2.h[4]\n"
- "ldr q6, [x15, #0x120]\n"
+ "ldr q6, [x10, #0x120]\n"
"fmla v9.8h, v7.8h, v0.h[4]\n"
"fmla v13.8h, v7.8h, v1.h[4]\n"
"fmla v17.8h, v7.8h, v2.h[4]\n"
- "ldr q7, [x15, #0x130]\n"
+ "ldr q7, [x10, #0x130]\n"
"fmla v10.8h, v6.8h, v0.h[4]\n"
"fmla v14.8h, v6.8h, v1.h[4]\n"
"fmla v18.8h, v6.8h, v2.h[4]\n"
- "ldr q6, [x15, #0x140]\n"
+ "ldr q6, [x10, #0x140]\n"
"fmla v11.8h, v7.8h, v0.h[4]\n"
"fmla v15.8h, v7.8h, v1.h[4]\n"
"fmla v19.8h, v7.8h, v2.h[4]\n"
- "ldr q7, [x15, #0x150]\n"
+ "ldr q7, [x10, #0x150]\n"
"fmla v8.8h, v6.8h, v0.h[5]\n"
"fmla v12.8h, v6.8h, v1.h[5]\n"
"fmla v16.8h, v6.8h, v2.h[5]\n"
- "ldr q6, [x15, #0x160]\n"
+ "ldr q6, [x10, #0x160]\n"
"fmla v9.8h, v7.8h, v0.h[5]\n"
"fmla v13.8h, v7.8h, v1.h[5]\n"
"fmla v17.8h, v7.8h, v2.h[5]\n"
- "ldr q7, [x15, #0x170]\n"
+ "ldr q7, [x10, #0x170]\n"
"fmla v10.8h, v6.8h, v0.h[5]\n"
"fmla v14.8h, v6.8h, v1.h[5]\n"
"fmla v18.8h, v6.8h, v2.h[5]\n"
- "ldr q6, [x15, #0x180]\n"
+ "ldr q6, [x10, #0x180]\n"
"fmla v11.8h, v7.8h, v0.h[5]\n"
"fmla v15.8h, v7.8h, v1.h[5]\n"
"fmla v19.8h, v7.8h, v2.h[5]\n"
- "ldr q7, [x15, #0x190]\n"
+ "ldr q7, [x10, #0x190]\n"
"fmla v8.8h, v6.8h, v0.h[6]\n"
"fmla v12.8h, v6.8h, v1.h[6]\n"
"fmla v16.8h, v6.8h, v2.h[6]\n"
- "ldr q6, [x15, #0x1a0]\n"
+ "ldr q6, [x10, #0x1a0]\n"
"fmla v9.8h, v7.8h, v0.h[6]\n"
"fmla v13.8h, v7.8h, v1.h[6]\n"
"fmla v17.8h, v7.8h, v2.h[6]\n"
- "ldr q7, [x15, #0x1b0]\n"
+ "ldr q7, [x10, #0x1b0]\n"
"fmla v10.8h, v6.8h, v0.h[6]\n"
"fmla v14.8h, v6.8h, v1.h[6]\n"
"fmla v18.8h, v6.8h, v2.h[6]\n"
- "ldr q6, [x15, #0x1c0]\n"
+ "ldr q6, [x10, #0x1c0]\n"
"fmla v11.8h, v7.8h, v0.h[6]\n"
"fmla v15.8h, v7.8h, v1.h[6]\n"
"fmla v19.8h, v7.8h, v2.h[6]\n"
- "ldr q7, [x15, #0x1d0]\n"
+ "ldr q7, [x10, #0x1d0]\n"
"fmla v8.8h, v6.8h, v0.h[7]\n"
"fmla v12.8h, v6.8h, v1.h[7]\n"
"fmla v16.8h, v6.8h, v2.h[7]\n"
- "ldr q6, [x15, #0x1e0]\n"
+ "ldr q6, [x10, #0x1e0]\n"
"fmla v9.8h, v7.8h, v0.h[7]\n"
"fmla v13.8h, v7.8h, v1.h[7]\n"
"fmla v17.8h, v7.8h, v2.h[7]\n"
- "ldr q7, [x15, #0x1f0]\n"
- "add x15, x15, #0x200\n"
+ "ldr q7, [x10, #0x1f0]\n"
+ "add x10, x10, #0x200\n"
"fmla v10.8h, v6.8h, v0.h[7]\n"
"fmla v14.8h, v6.8h, v1.h[7]\n"
"fmla v18.8h, v6.8h, v2.h[7]\n"
"fmla v11.8h, v7.8h, v0.h[7]\n"
"fmla v15.8h, v7.8h, v1.h[7]\n"
"fmla v19.8h, v7.8h, v2.h[7]\n"
- "129:" // Height 3: Multiply loop: Main loop skip
- "cbz x11, 131f\n"
- "130:" // Height 3: Multiply loop: Odd block loop
- "ldr h0, [x10], #0x2\n"
- "ldr h1, [x28], #0x2\n"
- "ldr h2, [x26], #0x2\n"
- "ldr q6, [x15, #0x0]\n"
+ "126:" // Height 3: Multiply loop: Main loop skip
+ "cbz x26, 128f\n"
+ "127:" // Height 3: Multiply loop: Odd block loop
+ "ldr h0, [x25], #0x2\n"
+ "sub x26, x26, #0x1\n"
+ "ldr h1, [x24], #0x2\n"
+ "ldr h2, [x23], #0x2\n"
+ "ldr q6, [x10, #0x0]\n"
"fmla v8.8h, v6.8h, v0.h[0]\n"
- "ldr q7, [x15, #0x10]\n"
+ "ldr q7, [x10, #0x10]\n"
"fmla v12.8h, v6.8h, v1.h[0]\n"
- "sub x11, x11, #0x1\n"
"fmla v16.8h, v6.8h, v2.h[0]\n"
- "ldr q6, [x15, #0x20]\n"
+ "ldr q6, [x10, #0x20]\n"
"fmla v9.8h, v7.8h, v0.h[0]\n"
"fmla v13.8h, v7.8h, v1.h[0]\n"
"fmla v17.8h, v7.8h, v2.h[0]\n"
- "ldr q7, [x15, #0x30]\n"
+ "ldr q7, [x10, #0x30]\n"
+ "add x10, x10, #0x40\n"
"fmla v10.8h, v6.8h, v0.h[0]\n"
- "add x15, x15, #0x40\n"
"fmla v14.8h, v6.8h, v1.h[0]\n"
"fmla v18.8h, v6.8h, v2.h[0]\n"
"fmla v11.8h, v7.8h, v0.h[0]\n"
"fmla v15.8h, v7.8h, v1.h[0]\n"
"fmla v19.8h, v7.8h, v2.h[0]\n"
- "cbnz x11, 130b\n"
- "131:" // Height 3: Multiply loop: No odd multiplies
+ "cbnz x26, 127b\n"
+ "128:" // Height 3: Multiply loop: No odd multiplies
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x12, x12, #0x1\n"
- "cmp x12, x19\n"
- "bne 124b\n"
- "prfm pstl1keep, [x13, #0x0]\n"
- "prfm pstl1keep, [x9, #0x0]\n"
- "prfm pstl1keep, [x27, #0x0]\n"
- "tbz %x[flags], #1, 132f\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 121b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "prfm pstl1keep, [x28, #0x0]\n"
+ "add x24, x28, x19, LSL #1\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "add x23, x24, x19, LSL #1\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "tbz %x[flags], #1, 129f\n"
"add x19, %x[args_ptr], %[offset_min]\n"
"ld1r { v1.8h }, [x19]\n"
"add x19, %x[args_ptr], %[offset_max]\n"
@@ -1784,217 +1764,200 @@ void a64_hybrid_fp16_mla_6x32 (
"fmin v19.8h, v19.8h, v0.8h\n"
"fmax v18.8h, v18.8h, v1.8h\n"
"fmax v19.8h, v19.8h, v1.8h\n"
- "132:" // Height 3: No activation
- "cmp x16, #0x20\n"
- "bge 149f\n"
- "tbz x16, #4, 140f\n"
- "st1 { v8.8h }, [x13], #0x10\n"
- "st1 { v9.8h }, [x13], #0x10\n"
- "st1 { v12.8h }, [x9], #0x10\n"
- "st1 { v13.8h }, [x9], #0x10\n"
- "st1 { v16.8h }, [x27], #0x10\n"
- "st1 { v17.8h }, [x27], #0x10\n"
- "tbz x16, #3, 136f\n"
- "st1 { v10.8h }, [x13], #0x10\n"
- "st1 { v14.8h }, [x9], #0x10\n"
- "st1 { v18.8h }, [x27], #0x10\n"
- "tbz x16, #2, 134f\n"
- "str d11, [x13], #0x8\n"
- "str d15, [x9], #0x8\n"
- "str d19, [x27], #0x8\n"
- "tbz x16, #1, 133f\n"
- "st1 { v11.s }[2], [x13], #0x4\n"
- "st1 { v15.s }[2], [x9], #0x4\n"
- "st1 { v19.s }[2], [x27], #0x4\n"
- "tbz x16, #0, 148f\n"
- "st1 { v11.h }[6], [x13]\n"
- "st1 { v15.h }[6], [x9]\n"
- "st1 { v19.h }[6], [x27]\n"
- "b 148f\n"
- "133:" // Height 3: Partial direct writeback: partial_1_28
- "tbz x16, #0, 148f\n"
- "st1 { v11.h }[4], [x13]\n"
- "st1 { v15.h }[4], [x9]\n"
- "st1 { v19.h }[4], [x27]\n"
- "b 148f\n"
- "134:" // Height 3: Partial direct writeback: partial_2_24
- "tbz x16, #1, 135f\n"
- "str s11, [x13], #0x4\n"
- "str s15, [x9], #0x4\n"
- "str s19, [x27], #0x4\n"
- "tbz x16, #0, 148f\n"
- "st1 { v11.h }[2], [x13]\n"
- "st1 { v15.h }[2], [x9]\n"
- "st1 { v19.h }[2], [x27]\n"
- "b 148f\n"
- "135:" // Height 3: Partial direct writeback: partial_1_24
- "tbz x16, #0, 148f\n"
- "str h11, [x13, #0x0]\n"
- "str h15, [x9, #0x0]\n"
- "str h19, [x27, #0x0]\n"
- "b 148f\n"
- "136:" // Height 3: Partial direct writeback: partial_4_16
- "tbz x16, #2, 138f\n"
- "str d10, [x13], #0x8\n"
- "str d14, [x9], #0x8\n"
- "str d18, [x27], #0x8\n"
- "tbz x16, #1, 137f\n"
- "st1 { v10.s }[2], [x13], #0x4\n"
- "st1 { v14.s }[2], [x9], #0x4\n"
- "st1 { v18.s }[2], [x27], #0x4\n"
- "tbz x16, #0, 148f\n"
- "st1 { v10.h }[6], [x13]\n"
- "st1 { v14.h }[6], [x9]\n"
- "st1 { v18.h }[6], [x27]\n"
- "b 148f\n"
- "137:" // Height 3: Partial direct writeback: partial_1_20
- "tbz x16, #0, 148f\n"
- "st1 { v10.h }[4], [x13]\n"
- "st1 { v14.h }[4], [x9]\n"
- "st1 { v18.h }[4], [x27]\n"
- "b 148f\n"
- "138:" // Height 3: Partial direct writeback: partial_2_16
- "tbz x16, #1, 139f\n"
- "str s10, [x13], #0x4\n"
- "str s14, [x9], #0x4\n"
- "str s18, [x27], #0x4\n"
- "tbz x16, #0, 148f\n"
- "st1 { v10.h }[2], [x13]\n"
- "st1 { v14.h }[2], [x9]\n"
- "st1 { v18.h }[2], [x27]\n"
- "b 148f\n"
- "139:" // Height 3: Partial direct writeback: partial_1_16
- "tbz x16, #0, 148f\n"
- "str h10, [x13, #0x0]\n"
- "str h14, [x9, #0x0]\n"
- "str h18, [x27, #0x0]\n"
- "b 148f\n"
- "140:" // Height 3: Partial direct writeback: partial_8_0
- "tbz x16, #3, 144f\n"
- "st1 { v8.8h }, [x13], #0x10\n"
- "st1 { v12.8h }, [x9], #0x10\n"
- "st1 { v16.8h }, [x27], #0x10\n"
- "tbz x16, #2, 142f\n"
- "str d9, [x13], #0x8\n"
- "str d13, [x9], #0x8\n"
- "str d17, [x27], #0x8\n"
- "tbz x16, #1, 141f\n"
- "st1 { v9.s }[2], [x13], #0x4\n"
- "st1 { v13.s }[2], [x9], #0x4\n"
- "st1 { v17.s }[2], [x27], #0x4\n"
- "tbz x16, #0, 148f\n"
- "st1 { v9.h }[6], [x13]\n"
- "st1 { v13.h }[6], [x9]\n"
- "st1 { v17.h }[6], [x27]\n"
- "b 148f\n"
- "141:" // Height 3: Partial direct writeback: partial_1_12
- "tbz x16, #0, 148f\n"
- "st1 { v9.h }[4], [x13]\n"
- "st1 { v13.h }[4], [x9]\n"
- "st1 { v17.h }[4], [x27]\n"
- "b 148f\n"
- "142:" // Height 3: Partial direct writeback: partial_2_8
- "tbz x16, #1, 143f\n"
- "str s9, [x13], #0x4\n"
- "str s13, [x9], #0x4\n"
- "str s17, [x27], #0x4\n"
- "tbz x16, #0, 148f\n"
- "st1 { v9.h }[2], [x13]\n"
- "st1 { v13.h }[2], [x9]\n"
- "st1 { v17.h }[2], [x27]\n"
- "b 148f\n"
- "143:" // Height 3: Partial direct writeback: partial_1_8
- "tbz x16, #0, 148f\n"
- "str h9, [x13, #0x0]\n"
- "str h13, [x9, #0x0]\n"
- "str h17, [x27, #0x0]\n"
- "b 148f\n"
- "144:" // Height 3: Partial direct writeback: partial_4_0
- "tbz x16, #2, 146f\n"
- "str d8, [x13], #0x8\n"
- "str d12, [x9], #0x8\n"
- "str d16, [x27], #0x8\n"
- "tbz x16, #1, 145f\n"
- "st1 { v8.s }[2], [x13], #0x4\n"
- "st1 { v12.s }[2], [x9], #0x4\n"
- "st1 { v16.s }[2], [x27], #0x4\n"
- "tbz x16, #0, 148f\n"
- "st1 { v8.h }[6], [x13]\n"
- "st1 { v12.h }[6], [x9]\n"
- "st1 { v16.h }[6], [x27]\n"
- "b 148f\n"
- "145:" // Height 3: Partial direct writeback: partial_1_4
- "tbz x16, #0, 148f\n"
- "st1 { v8.h }[4], [x13]\n"
- "st1 { v12.h }[4], [x9]\n"
- "st1 { v16.h }[4], [x27]\n"
- "b 148f\n"
- "146:" // Height 3: Partial direct writeback: partial_2_0
- "tbz x16, #1, 147f\n"
- "str s8, [x13], #0x4\n"
- "str s12, [x9], #0x4\n"
- "str s16, [x27], #0x4\n"
- "tbz x16, #0, 148f\n"
- "st1 { v8.h }[2], [x13]\n"
- "st1 { v12.h }[2], [x9]\n"
- "st1 { v16.h }[2], [x27]\n"
- "b 148f\n"
- "147:" // Height 3: Partial direct writeback: partial_1_0
- "str h8, [x13, #0x0]\n"
- "str h12, [x9, #0x0]\n"
- "str h16, [x27, #0x0]\n"
- "148:" // Height 3: Partial direct writeback: Done
- "b 150f\n"
- "149:" // Height 3: Full writeback
- "str q8, [x13, #0x0]\n"
- "str q9, [x13, #0x10]\n"
- "str q10, [x13, #0x20]\n"
- "str q11, [x13, #0x30]\n"
- "str q12, [x9, #0x0]\n"
- "str q13, [x9, #0x10]\n"
- "str q14, [x9, #0x20]\n"
- "str q15, [x9, #0x30]\n"
- "str q16, [x27, #0x0]\n"
- "str q17, [x27, #0x10]\n"
- "str q18, [x27, #0x20]\n"
- "str q19, [x27, #0x30]\n"
- "add x13, x13, #0x40\n"
- "add x9, x9, #0x40\n"
- "add x27, x27, #0x40\n"
- "150:" // Height 3: Writeback done
- "subs x16, x16, #0x20\n"
- "bgt 103b\n"
- "b 302f\n"
- "151:" // Height 4
- "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x14, %x[bias]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "tbz %x[flags], #2, 152f\n"
- "ldr x13, [%x[output_ptr], #0x0]\n"
- "add x13, x13, x19, LSL #1\n"
- "ldr x9, [%x[output_ptr], #0x8]\n"
- "ldr x27, [%x[output_ptr], #0x10]\n"
- "add x9, x9, x19, LSL #1\n"
- "ldr x25, [%x[output_ptr], #0x18]\n"
- "add x27, x27, x19, LSL #1\n"
- "add x25, x25, x19, LSL #1\n"
- "b 153f\n"
- "152:" // Height 4: setup direct output
- "mov x13, %x[output_ptr]\n"
- "add x9, x13, x19, LSL #1\n"
- "add x27, x9, x19, LSL #1\n"
- "add x25, x27, x19, LSL #1\n"
- "153:" // Height 4: Column loop
- "cbz x14, 154f\n"
- "ldr q8, [x14, #0x0]\n"
+ "129:" // Height 3: No activation
+ "cmp x11, #0x20\n"
+ "bge 146f\n"
+ "tbz x11, #4, 137f\n"
+ "st1 { v8.8h }, [x28], #0x10\n"
+ "st1 { v9.8h }, [x28], #0x10\n"
+ "st1 { v12.8h }, [x24], #0x10\n"
+ "st1 { v13.8h }, [x24], #0x10\n"
+ "st1 { v16.8h }, [x23], #0x10\n"
+ "st1 { v17.8h }, [x23], #0x10\n"
+ "tbz x11, #3, 133f\n"
+ "st1 { v10.8h }, [x28], #0x10\n"
+ "st1 { v14.8h }, [x24], #0x10\n"
+ "st1 { v18.8h }, [x23], #0x10\n"
+ "tbz x11, #2, 131f\n"
+ "str d11, [x28], #0x8\n"
+ "str d15, [x24], #0x8\n"
+ "str d19, [x23], #0x8\n"
+ "tbz x11, #1, 130f\n"
+ "st1 { v11.s }[2], [x28], #0x4\n"
+ "st1 { v15.s }[2], [x24], #0x4\n"
+ "st1 { v19.s }[2], [x23], #0x4\n"
+ "tbz x11, #0, 145f\n"
+ "st1 { v11.h }[6], [x28]\n"
+ "st1 { v15.h }[6], [x24]\n"
+ "st1 { v19.h }[6], [x23]\n"
+ "b 145f\n"
+ "130:" // Height 3: Partial direct writeback: partial_1_28
+ "tbz x11, #0, 145f\n"
+ "st1 { v11.h }[4], [x28]\n"
+ "st1 { v15.h }[4], [x24]\n"
+ "st1 { v19.h }[4], [x23]\n"
+ "b 145f\n"
+ "131:" // Height 3: Partial direct writeback: partial_2_24
+ "tbz x11, #1, 132f\n"
+ "str s11, [x28], #0x4\n"
+ "str s15, [x24], #0x4\n"
+ "str s19, [x23], #0x4\n"
+ "tbz x11, #0, 145f\n"
+ "st1 { v11.h }[2], [x28]\n"
+ "st1 { v15.h }[2], [x24]\n"
+ "st1 { v19.h }[2], [x23]\n"
+ "b 145f\n"
+ "132:" // Height 3: Partial direct writeback: partial_1_24
+ "tbz x11, #0, 145f\n"
+ "str h11, [x28, #0x0]\n"
+ "str h15, [x24, #0x0]\n"
+ "str h19, [x23, #0x0]\n"
+ "b 145f\n"
+ "133:" // Height 3: Partial direct writeback: partial_4_16
+ "tbz x11, #2, 135f\n"
+ "str d10, [x28], #0x8\n"
+ "str d14, [x24], #0x8\n"
+ "str d18, [x23], #0x8\n"
+ "tbz x11, #1, 134f\n"
+ "st1 { v10.s }[2], [x28], #0x4\n"
+ "st1 { v14.s }[2], [x24], #0x4\n"
+ "st1 { v18.s }[2], [x23], #0x4\n"
+ "tbz x11, #0, 145f\n"
+ "st1 { v10.h }[6], [x28]\n"
+ "st1 { v14.h }[6], [x24]\n"
+ "st1 { v18.h }[6], [x23]\n"
+ "b 145f\n"
+ "134:" // Height 3: Partial direct writeback: partial_1_20
+ "tbz x11, #0, 145f\n"
+ "st1 { v10.h }[4], [x28]\n"
+ "st1 { v14.h }[4], [x24]\n"
+ "st1 { v18.h }[4], [x23]\n"
+ "b 145f\n"
+ "135:" // Height 3: Partial direct writeback: partial_2_16
+ "tbz x11, #1, 136f\n"
+ "str s10, [x28], #0x4\n"
+ "str s14, [x24], #0x4\n"
+ "str s18, [x23], #0x4\n"
+ "tbz x11, #0, 145f\n"
+ "st1 { v10.h }[2], [x28]\n"
+ "st1 { v14.h }[2], [x24]\n"
+ "st1 { v18.h }[2], [x23]\n"
+ "b 145f\n"
+ "136:" // Height 3: Partial direct writeback: partial_1_16
+ "tbz x11, #0, 145f\n"
+ "str h10, [x28, #0x0]\n"
+ "str h14, [x24, #0x0]\n"
+ "str h18, [x23, #0x0]\n"
+ "b 145f\n"
+ "137:" // Height 3: Partial direct writeback: partial_8_0
+ "tbz x11, #3, 141f\n"
+ "st1 { v8.8h }, [x28], #0x10\n"
+ "st1 { v12.8h }, [x24], #0x10\n"
+ "st1 { v16.8h }, [x23], #0x10\n"
+ "tbz x11, #2, 139f\n"
+ "str d9, [x28], #0x8\n"
+ "str d13, [x24], #0x8\n"
+ "str d17, [x23], #0x8\n"
+ "tbz x11, #1, 138f\n"
+ "st1 { v9.s }[2], [x28], #0x4\n"
+ "st1 { v13.s }[2], [x24], #0x4\n"
+ "st1 { v17.s }[2], [x23], #0x4\n"
+ "tbz x11, #0, 145f\n"
+ "st1 { v9.h }[6], [x28]\n"
+ "st1 { v13.h }[6], [x24]\n"
+ "st1 { v17.h }[6], [x23]\n"
+ "b 145f\n"
+ "138:" // Height 3: Partial direct writeback: partial_1_12
+ "tbz x11, #0, 145f\n"
+ "st1 { v9.h }[4], [x28]\n"
+ "st1 { v13.h }[4], [x24]\n"
+ "st1 { v17.h }[4], [x23]\n"
+ "b 145f\n"
+ "139:" // Height 3: Partial direct writeback: partial_2_8
+ "tbz x11, #1, 140f\n"
+ "str s9, [x28], #0x4\n"
+ "str s13, [x24], #0x4\n"
+ "str s17, [x23], #0x4\n"
+ "tbz x11, #0, 145f\n"
+ "st1 { v9.h }[2], [x28]\n"
+ "st1 { v13.h }[2], [x24]\n"
+ "st1 { v17.h }[2], [x23]\n"
+ "b 145f\n"
+ "140:" // Height 3: Partial direct writeback: partial_1_8
+ "tbz x11, #0, 145f\n"
+ "str h9, [x28, #0x0]\n"
+ "str h13, [x24, #0x0]\n"
+ "str h17, [x23, #0x0]\n"
+ "b 145f\n"
+ "141:" // Height 3: Partial direct writeback: partial_4_0
+ "tbz x11, #2, 143f\n"
+ "str d8, [x28], #0x8\n"
+ "str d12, [x24], #0x8\n"
+ "str d16, [x23], #0x8\n"
+ "tbz x11, #1, 142f\n"
+ "st1 { v8.s }[2], [x28], #0x4\n"
+ "st1 { v12.s }[2], [x24], #0x4\n"
+ "st1 { v16.s }[2], [x23], #0x4\n"
+ "tbz x11, #0, 145f\n"
+ "st1 { v8.h }[6], [x28]\n"
+ "st1 { v12.h }[6], [x24]\n"
+ "st1 { v16.h }[6], [x23]\n"
+ "b 145f\n"
+ "142:" // Height 3: Partial direct writeback: partial_1_4
+ "tbz x11, #0, 145f\n"
+ "st1 { v8.h }[4], [x28]\n"
+ "st1 { v12.h }[4], [x24]\n"
+ "st1 { v16.h }[4], [x23]\n"
+ "b 145f\n"
+ "143:" // Height 3: Partial direct writeback: partial_2_0
+ "tbz x11, #1, 144f\n"
+ "str s8, [x28], #0x4\n"
+ "str s12, [x24], #0x4\n"
+ "str s16, [x23], #0x4\n"
+ "tbz x11, #0, 145f\n"
+ "st1 { v8.h }[2], [x28]\n"
+ "st1 { v12.h }[2], [x24]\n"
+ "st1 { v16.h }[2], [x23]\n"
+ "b 145f\n"
+ "144:" // Height 3: Partial direct writeback: partial_1_0
+ "str h8, [x28, #0x0]\n"
+ "str h12, [x24, #0x0]\n"
+ "str h16, [x23, #0x0]\n"
+ "145:" // Height 3: Partial direct writeback: Done
+ "b 147f\n"
+ "146:" // Height 3: Full writeback
+ "str q8, [x28, #0x0]\n"
+ "str q9, [x28, #0x10]\n"
+ "str q10, [x28, #0x20]\n"
+ "str q11, [x28, #0x30]\n"
+ "add x28, x28, #0x40\n"
+ "str q12, [x24, #0x0]\n"
+ "str q13, [x24, #0x10]\n"
+ "str q14, [x24, #0x20]\n"
+ "str q15, [x24, #0x30]\n"
+ "str q16, [x23, #0x0]\n"
+ "str q17, [x23, #0x10]\n"
+ "str q18, [x23, #0x20]\n"
+ "str q19, [x23, #0x30]\n"
+ "147:" // Height 3: Writeback done
+ "subs x11, x11, #0x20\n"
+ "bgt 100b\n"
+ "b 296f\n"
+ "148:" // Height 4
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x9, %x[bias]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "149:" // Height 4: Column loop
+ "cbz x9, 150f\n"
+ "ldr q8, [x9, #0x0]\n"
"mov v12.16b, v8.16b\n"
- "ldr q9, [x14, #0x10]\n"
+ "ldr q9, [x9, #0x10]\n"
"mov v16.16b, v8.16b\n"
- "ldr q10, [x14, #0x20]\n"
+ "ldr q10, [x9, #0x20]\n"
"mov v20.16b, v8.16b\n"
- "ldr q11, [x14, #0x30]\n"
- "add x14, x14, #0x40\n"
+ "ldr q11, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
"mov v13.16b, v9.16b\n"
"mov v17.16b, v9.16b\n"
"mov v14.16b, v10.16b\n"
@@ -2004,240 +1967,241 @@ void a64_hybrid_fp16_mla_6x32 (
"mov v21.16b, v9.16b\n"
"mov v22.16b, v10.16b\n"
"mov v23.16b, v11.16b\n"
- "b 173f\n"
- "154:" // Height 4: no bias
- "tbz %x[flags], #0, 172f\n"
- "cmp x16, #0x20\n"
- "bge 171f\n"
- "tbz x16, #4, 162f\n"
- "ld1 { v8.8h }, [x13], #0x10\n"
- "ld1 { v12.8h }, [x9], #0x10\n"
- "ld1 { v16.8h }, [x27], #0x10\n"
- "ld1 { v20.8h }, [x25], #0x10\n"
- "ld1 { v9.8h }, [x13], #0x10\n"
- "ld1 { v13.8h }, [x9], #0x10\n"
- "ld1 { v17.8h }, [x27], #0x10\n"
- "ld1 { v21.8h }, [x25], #0x10\n"
- "tbz x16, #3, 158f\n"
- "ld1 { v10.8h }, [x13], #0x10\n"
- "ld1 { v14.8h }, [x9], #0x10\n"
- "ld1 { v18.8h }, [x27], #0x10\n"
- "ld1 { v22.8h }, [x25], #0x10\n"
- "tbz x16, #2, 156f\n"
- "ldr d11, [x13], #0x8\n"
- "ldr d15, [x9], #0x8\n"
- "ldr d19, [x27], #0x8\n"
- "ldr d23, [x25], #0x8\n"
- "tbz x16, #1, 155f\n"
+ "b 169f\n"
+ "150:" // Height 4: no bias
+ "tbz %x[flags], #0, 168f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x11, #0x20\n"
+ "add x24, x28, x19, LSL #1\n"
+ "add x23, x24, x19, LSL #1\n"
+ "add x22, x23, x19, LSL #1\n"
+ "bge 167f\n"
+ "tbz x11, #4, 158f\n"
+ "ld1 { v8.8h }, [x28], #0x10\n"
+ "ld1 { v12.8h }, [x24], #0x10\n"
+ "ld1 { v16.8h }, [x23], #0x10\n"
+ "ld1 { v20.8h }, [x22], #0x10\n"
+ "ld1 { v9.8h }, [x28], #0x10\n"
+ "ld1 { v13.8h }, [x24], #0x10\n"
+ "ld1 { v17.8h }, [x23], #0x10\n"
+ "ld1 { v21.8h }, [x22], #0x10\n"
+ "tbz x11, #3, 154f\n"
+ "ld1 { v10.8h }, [x28], #0x10\n"
+ "ld1 { v14.8h }, [x24], #0x10\n"
+ "ld1 { v18.8h }, [x23], #0x10\n"
+ "ld1 { v22.8h }, [x22], #0x10\n"
+ "tbz x11, #2, 152f\n"
+ "ldr d11, [x28], #0x8\n"
+ "ldr d15, [x24], #0x8\n"
+ "ldr d19, [x23], #0x8\n"
+ "ldr d23, [x22], #0x8\n"
+ "tbz x11, #1, 151f\n"
"mov x19, #0x3c\n"
- "ld1 { v11.s }[2], [x13], #0x4\n"
- "ld1 { v15.s }[2], [x9], #0x4\n"
- "ld1 { v19.s }[2], [x27], #0x4\n"
- "ld1 { v23.s }[2], [x25], #0x4\n"
- "tbz x16, #0, 170f\n"
- "ld1 { v11.h }[6], [x13]\n"
- "ld1 { v15.h }[6], [x9]\n"
- "ld1 { v19.h }[6], [x27]\n"
- "ld1 { v23.h }[6], [x25]\n"
- "b 170f\n"
- "155:" // Height 4: Partial accumulate: partial_1_28
+ "ld1 { v11.s }[2], [x28], #0x4\n"
+ "ld1 { v15.s }[2], [x24], #0x4\n"
+ "ld1 { v19.s }[2], [x23], #0x4\n"
+ "ld1 { v23.s }[2], [x22], #0x4\n"
+ "tbz x11, #0, 166f\n"
+ "ld1 { v11.h }[6], [x28]\n"
+ "ld1 { v15.h }[6], [x24]\n"
+ "ld1 { v19.h }[6], [x23]\n"
+ "ld1 { v23.h }[6], [x22]\n"
+ "b 166f\n"
+ "151:" // Height 4: Partial accumulate: partial_1_28
"mov x19, #0x38\n"
- "tbz x16, #0, 170f\n"
- "ld1 { v11.h }[4], [x13]\n"
- "ld1 { v15.h }[4], [x9]\n"
- "ld1 { v19.h }[4], [x27]\n"
- "ld1 { v23.h }[4], [x25]\n"
- "b 170f\n"
- "156:" // Height 4: Partial accumulate: partial_2_24
- "tbz x16, #1, 157f\n"
- "ldr s11, [x13], #0x4\n"
- "ldr s15, [x9], #0x4\n"
- "ldr s19, [x27], #0x4\n"
- "ldr s23, [x25], #0x4\n"
+ "tbz x11, #0, 166f\n"
+ "ld1 { v11.h }[4], [x28]\n"
+ "ld1 { v15.h }[4], [x24]\n"
+ "ld1 { v19.h }[4], [x23]\n"
+ "ld1 { v23.h }[4], [x22]\n"
+ "b 166f\n"
+ "152:" // Height 4: Partial accumulate: partial_2_24
+ "tbz x11, #1, 153f\n"
+ "ldr s11, [x28], #0x4\n"
+ "ldr s15, [x24], #0x4\n"
"mov x19, #0x34\n"
- "tbz x16, #0, 170f\n"
- "ld1 { v11.h }[2], [x13]\n"
- "ld1 { v15.h }[2], [x9]\n"
- "ld1 { v19.h }[2], [x27]\n"
- "ld1 { v23.h }[2], [x25]\n"
- "b 170f\n"
- "157:" // Height 4: Partial accumulate: partial_1_24
+ "ldr s19, [x23], #0x4\n"
+ "ldr s23, [x22], #0x4\n"
+ "tbz x11, #0, 166f\n"
+ "ld1 { v11.h }[2], [x28]\n"
+ "ld1 { v15.h }[2], [x24]\n"
+ "ld1 { v19.h }[2], [x23]\n"
+ "ld1 { v23.h }[2], [x22]\n"
+ "b 166f\n"
+ "153:" // Height 4: Partial accumulate: partial_1_24
"mov x19, #0x30\n"
- "tbz x16, #0, 170f\n"
- "ldr h11, [x13, #0x0]\n"
- "ldr h15, [x9, #0x0]\n"
- "ldr h19, [x27, #0x0]\n"
- "ldr h23, [x25, #0x0]\n"
- "b 170f\n"
- "158:" // Height 4: Partial accumulate: partial_4_16
- "tbz x16, #2, 160f\n"
- "ldr d10, [x13], #0x8\n"
- "ldr d14, [x9], #0x8\n"
- "ldr d18, [x27], #0x8\n"
- "ldr d22, [x25], #0x8\n"
- "tbz x16, #1, 159f\n"
- "ld1 { v10.s }[2], [x13], #0x4\n"
- "ld1 { v14.s }[2], [x9], #0x4\n"
- "ld1 { v18.s }[2], [x27], #0x4\n"
- "ld1 { v22.s }[2], [x25], #0x4\n"
+ "tbz x11, #0, 166f\n"
+ "ldr h11, [x28, #0x0]\n"
+ "ldr h15, [x24, #0x0]\n"
+ "ldr h19, [x23, #0x0]\n"
+ "ldr h23, [x22, #0x0]\n"
+ "b 166f\n"
+ "154:" // Height 4: Partial accumulate: partial_4_16
+ "tbz x11, #2, 156f\n"
+ "ldr d10, [x28], #0x8\n"
+ "ldr d14, [x24], #0x8\n"
+ "ldr d18, [x23], #0x8\n"
+ "ldr d22, [x22], #0x8\n"
+ "tbz x11, #1, 155f\n"
"mov x19, #0x2c\n"
- "tbz x16, #0, 170f\n"
- "ld1 { v10.h }[6], [x13]\n"
- "ld1 { v14.h }[6], [x9]\n"
- "ld1 { v18.h }[6], [x27]\n"
- "ld1 { v22.h }[6], [x25]\n"
- "b 170f\n"
- "159:" // Height 4: Partial accumulate: partial_1_20
+ "ld1 { v10.s }[2], [x28], #0x4\n"
+ "ld1 { v14.s }[2], [x24], #0x4\n"
+ "ld1 { v18.s }[2], [x23], #0x4\n"
+ "ld1 { v22.s }[2], [x22], #0x4\n"
+ "tbz x11, #0, 166f\n"
+ "ld1 { v10.h }[6], [x28]\n"
+ "ld1 { v14.h }[6], [x24]\n"
+ "ld1 { v18.h }[6], [x23]\n"
+ "ld1 { v22.h }[6], [x22]\n"
+ "b 166f\n"
+ "155:" // Height 4: Partial accumulate: partial_1_20
"mov x19, #0x28\n"
- "tbz x16, #0, 170f\n"
- "ld1 { v10.h }[4], [x13]\n"
- "ld1 { v14.h }[4], [x9]\n"
- "ld1 { v18.h }[4], [x27]\n"
- "ld1 { v22.h }[4], [x25]\n"
- "b 170f\n"
- "160:" // Height 4: Partial accumulate: partial_2_16
- "tbz x16, #1, 161f\n"
- "ldr s10, [x13], #0x4\n"
- "ldr s14, [x9], #0x4\n"
- "ldr s18, [x27], #0x4\n"
- "ldr s22, [x25], #0x4\n"
+ "tbz x11, #0, 166f\n"
+ "ld1 { v10.h }[4], [x28]\n"
+ "ld1 { v14.h }[4], [x24]\n"
+ "ld1 { v18.h }[4], [x23]\n"
+ "ld1 { v22.h }[4], [x22]\n"
+ "b 166f\n"
+ "156:" // Height 4: Partial accumulate: partial_2_16
+ "tbz x11, #1, 157f\n"
+ "ldr s10, [x28], #0x4\n"
+ "ldr s14, [x24], #0x4\n"
"mov x19, #0x24\n"
- "tbz x16, #0, 170f\n"
- "ld1 { v10.h }[2], [x13]\n"
- "ld1 { v14.h }[2], [x9]\n"
- "ld1 { v18.h }[2], [x27]\n"
- "ld1 { v22.h }[2], [x25]\n"
- "b 170f\n"
- "161:" // Height 4: Partial accumulate: partial_1_16
+ "ldr s18, [x23], #0x4\n"
+ "ldr s22, [x22], #0x4\n"
+ "tbz x11, #0, 166f\n"
+ "ld1 { v10.h }[2], [x28]\n"
+ "ld1 { v14.h }[2], [x24]\n"
+ "ld1 { v18.h }[2], [x23]\n"
+ "ld1 { v22.h }[2], [x22]\n"
+ "b 166f\n"
+ "157:" // Height 4: Partial accumulate: partial_1_16
"mov x19, #0x20\n"
- "tbz x16, #0, 170f\n"
- "ldr h10, [x13, #0x0]\n"
- "ldr h14, [x9, #0x0]\n"
- "ldr h18, [x27, #0x0]\n"
- "ldr h22, [x25, #0x0]\n"
- "b 170f\n"
- "162:" // Height 4: Partial accumulate: partial_8_0
- "tbz x16, #3, 166f\n"
- "ld1 { v8.8h }, [x13], #0x10\n"
- "ld1 { v12.8h }, [x9], #0x10\n"
- "ld1 { v16.8h }, [x27], #0x10\n"
- "ld1 { v20.8h }, [x25], #0x10\n"
- "tbz x16, #2, 164f\n"
- "ldr d9, [x13], #0x8\n"
- "ldr d13, [x9], #0x8\n"
- "ldr d17, [x27], #0x8\n"
- "ldr d21, [x25], #0x8\n"
- "tbz x16, #1, 163f\n"
+ "tbz x11, #0, 166f\n"
+ "ldr h10, [x28, #0x0]\n"
+ "ldr h14, [x24, #0x0]\n"
+ "ldr h18, [x23, #0x0]\n"
+ "ldr h22, [x22, #0x0]\n"
+ "b 166f\n"
+ "158:" // Height 4: Partial accumulate: partial_8_0
+ "tbz x11, #3, 162f\n"
+ "ld1 { v8.8h }, [x28], #0x10\n"
+ "ld1 { v12.8h }, [x24], #0x10\n"
+ "ld1 { v16.8h }, [x23], #0x10\n"
+ "ld1 { v20.8h }, [x22], #0x10\n"
+ "tbz x11, #2, 160f\n"
+ "ldr d9, [x28], #0x8\n"
+ "ldr d13, [x24], #0x8\n"
+ "ldr d17, [x23], #0x8\n"
+ "ldr d21, [x22], #0x8\n"
+ "tbz x11, #1, 159f\n"
"mov x19, #0x1c\n"
- "ld1 { v9.s }[2], [x13], #0x4\n"
- "ld1 { v13.s }[2], [x9], #0x4\n"
- "ld1 { v17.s }[2], [x27], #0x4\n"
- "ld1 { v21.s }[2], [x25], #0x4\n"
- "tbz x16, #0, 170f\n"
- "ld1 { v9.h }[6], [x13]\n"
- "ld1 { v13.h }[6], [x9]\n"
- "ld1 { v17.h }[6], [x27]\n"
- "ld1 { v21.h }[6], [x25]\n"
- "b 170f\n"
- "163:" // Height 4: Partial accumulate: partial_1_12
+ "ld1 { v9.s }[2], [x28], #0x4\n"
+ "ld1 { v13.s }[2], [x24], #0x4\n"
+ "ld1 { v17.s }[2], [x23], #0x4\n"
+ "ld1 { v21.s }[2], [x22], #0x4\n"
+ "tbz x11, #0, 166f\n"
+ "ld1 { v9.h }[6], [x28]\n"
+ "ld1 { v13.h }[6], [x24]\n"
+ "ld1 { v17.h }[6], [x23]\n"
+ "ld1 { v21.h }[6], [x22]\n"
+ "b 166f\n"
+ "159:" // Height 4: Partial accumulate: partial_1_12
"mov x19, #0x18\n"
- "tbz x16, #0, 170f\n"
- "ld1 { v9.h }[4], [x13]\n"
- "ld1 { v13.h }[4], [x9]\n"
- "ld1 { v17.h }[4], [x27]\n"
- "ld1 { v21.h }[4], [x25]\n"
- "b 170f\n"
- "164:" // Height 4: Partial accumulate: partial_2_8
- "tbz x16, #1, 165f\n"
- "ldr s9, [x13], #0x4\n"
- "ldr s13, [x9], #0x4\n"
- "ldr s17, [x27], #0x4\n"
- "ldr s21, [x25], #0x4\n"
+ "tbz x11, #0, 166f\n"
+ "ld1 { v9.h }[4], [x28]\n"
+ "ld1 { v13.h }[4], [x24]\n"
+ "ld1 { v17.h }[4], [x23]\n"
+ "ld1 { v21.h }[4], [x22]\n"
+ "b 166f\n"
+ "160:" // Height 4: Partial accumulate: partial_2_8
+ "tbz x11, #1, 161f\n"
+ "ldr s9, [x28], #0x4\n"
+ "ldr s13, [x24], #0x4\n"
"mov x19, #0x14\n"
- "tbz x16, #0, 170f\n"
- "ld1 { v9.h }[2], [x13]\n"
- "ld1 { v13.h }[2], [x9]\n"
- "ld1 { v17.h }[2], [x27]\n"
- "ld1 { v21.h }[2], [x25]\n"
- "b 170f\n"
- "165:" // Height 4: Partial accumulate: partial_1_8
+ "ldr s17, [x23], #0x4\n"
+ "ldr s21, [x22], #0x4\n"
+ "tbz x11, #0, 166f\n"
+ "ld1 { v9.h }[2], [x28]\n"
+ "ld1 { v13.h }[2], [x24]\n"
+ "ld1 { v17.h }[2], [x23]\n"
+ "ld1 { v21.h }[2], [x22]\n"
+ "b 166f\n"
+ "161:" // Height 4: Partial accumulate: partial_1_8
"mov x19, #0x10\n"
- "tbz x16, #0, 170f\n"
- "ldr h9, [x13, #0x0]\n"
- "ldr h13, [x9, #0x0]\n"
- "ldr h17, [x27, #0x0]\n"
- "ldr h21, [x25, #0x0]\n"
- "b 170f\n"
- "166:" // Height 4: Partial accumulate: partial_4_0
- "tbz x16, #2, 168f\n"
- "ldr d8, [x13], #0x8\n"
- "ldr d12, [x9], #0x8\n"
- "ldr d16, [x27], #0x8\n"
- "ldr d20, [x25], #0x8\n"
- "tbz x16, #1, 167f\n"
- "ld1 { v8.s }[2], [x13], #0x4\n"
- "ld1 { v12.s }[2], [x9], #0x4\n"
- "ld1 { v16.s }[2], [x27], #0x4\n"
- "ld1 { v20.s }[2], [x25], #0x4\n"
+ "tbz x11, #0, 166f\n"
+ "ldr h9, [x28, #0x0]\n"
+ "ldr h13, [x24, #0x0]\n"
+ "ldr h17, [x23, #0x0]\n"
+ "ldr h21, [x22, #0x0]\n"
+ "b 166f\n"
+ "162:" // Height 4: Partial accumulate: partial_4_0
+ "tbz x11, #2, 164f\n"
+ "ldr d8, [x28], #0x8\n"
+ "ldr d12, [x24], #0x8\n"
+ "ldr d16, [x23], #0x8\n"
+ "ldr d20, [x22], #0x8\n"
+ "tbz x11, #1, 163f\n"
"mov x19, #0xc\n"
- "tbz x16, #0, 170f\n"
- "ld1 { v8.h }[6], [x13]\n"
- "ld1 { v12.h }[6], [x9]\n"
- "ld1 { v16.h }[6], [x27]\n"
- "ld1 { v20.h }[6], [x25]\n"
- "b 170f\n"
- "167:" // Height 4: Partial accumulate: partial_1_4
+ "ld1 { v8.s }[2], [x28], #0x4\n"
+ "ld1 { v12.s }[2], [x24], #0x4\n"
+ "ld1 { v16.s }[2], [x23], #0x4\n"
+ "ld1 { v20.s }[2], [x22], #0x4\n"
+ "tbz x11, #0, 166f\n"
+ "ld1 { v8.h }[6], [x28]\n"
+ "ld1 { v12.h }[6], [x24]\n"
+ "ld1 { v16.h }[6], [x23]\n"
+ "ld1 { v20.h }[6], [x22]\n"
+ "b 166f\n"
+ "163:" // Height 4: Partial accumulate: partial_1_4
"mov x19, #0x8\n"
- "tbz x16, #0, 170f\n"
- "ld1 { v8.h }[4], [x13]\n"
- "ld1 { v12.h }[4], [x9]\n"
- "ld1 { v16.h }[4], [x27]\n"
- "ld1 { v20.h }[4], [x25]\n"
- "b 170f\n"
- "168:" // Height 4: Partial accumulate: partial_2_0
- "tbz x16, #1, 169f\n"
- "ldr s8, [x13], #0x4\n"
- "ldr s12, [x9], #0x4\n"
- "ldr s16, [x27], #0x4\n"
- "ldr s20, [x25], #0x4\n"
+ "tbz x11, #0, 166f\n"
+ "ld1 { v8.h }[4], [x28]\n"
+ "ld1 { v12.h }[4], [x24]\n"
+ "ld1 { v16.h }[4], [x23]\n"
+ "ld1 { v20.h }[4], [x22]\n"
+ "b 166f\n"
+ "164:" // Height 4: Partial accumulate: partial_2_0
+ "tbz x11, #1, 165f\n"
+ "ldr s8, [x28], #0x4\n"
+ "ldr s12, [x24], #0x4\n"
"mov x19, #0x4\n"
- "tbz x16, #0, 170f\n"
- "ld1 { v8.h }[2], [x13]\n"
- "ld1 { v12.h }[2], [x9]\n"
- "ld1 { v16.h }[2], [x27]\n"
- "ld1 { v20.h }[2], [x25]\n"
- "b 170f\n"
- "169:" // Height 4: Partial accumulate: partial_1_0
+ "ldr s16, [x23], #0x4\n"
+ "ldr s20, [x22], #0x4\n"
+ "tbz x11, #0, 166f\n"
+ "ld1 { v8.h }[2], [x28]\n"
+ "ld1 { v12.h }[2], [x24]\n"
+ "ld1 { v16.h }[2], [x23]\n"
+ "ld1 { v20.h }[2], [x22]\n"
+ "b 166f\n"
+ "165:" // Height 4: Partial accumulate: partial_1_0
+ "ldr h8, [x28, #0x0]\n"
"mov x19, #0x0\n"
- "ldr h8, [x13, #0x0]\n"
- "ldr h12, [x9, #0x0]\n"
- "ldr h16, [x27, #0x0]\n"
- "ldr h20, [x25, #0x0]\n"
- "170:" // Height 4: Partial accumulate: Done
- "sub x13, x13, x19\n"
- "sub x9, x9, x19\n"
- "sub x27, x27, x19\n"
- "sub x25, x25, x19\n"
- "b 173f\n"
- "171:" // Height 4: full accumulate
- "ldr q8, [x13, #0x0]\n"
- "ldr q9, [x13, #0x10]\n"
- "ldr q10, [x13, #0x20]\n"
- "ldr q11, [x13, #0x30]\n"
- "ldr q12, [x9, #0x0]\n"
- "ldr q13, [x9, #0x10]\n"
- "ldr q14, [x9, #0x20]\n"
- "ldr q15, [x9, #0x30]\n"
- "ldr q16, [x27, #0x0]\n"
- "ldr q17, [x27, #0x10]\n"
- "ldr q18, [x27, #0x20]\n"
- "ldr q19, [x27, #0x30]\n"
- "ldr q20, [x25, #0x0]\n"
- "ldr q21, [x25, #0x10]\n"
- "ldr q22, [x25, #0x20]\n"
- "ldr q23, [x25, #0x30]\n"
- "b 173f\n"
- "172:" // Height 4: no accumulate
+ "ldr h12, [x24, #0x0]\n"
+ "ldr h16, [x23, #0x0]\n"
+ "ldr h20, [x22, #0x0]\n"
+ "166:" // Height 4: Partial accumulate: Done
+ "sub x28, x28, x19\n"
+ "b 169f\n"
+ "167:" // Height 4: full accumulate
+ "ldr q8, [x28, #0x0]\n"
+ "ldr q9, [x28, #0x10]\n"
+ "ldr q10, [x28, #0x20]\n"
+ "ldr q11, [x28, #0x30]\n"
+ "ldr q12, [x24, #0x0]\n"
+ "ldr q13, [x24, #0x10]\n"
+ "ldr q14, [x24, #0x20]\n"
+ "ldr q15, [x24, #0x30]\n"
+ "ldr q16, [x23, #0x0]\n"
+ "ldr q17, [x23, #0x10]\n"
+ "ldr q18, [x23, #0x20]\n"
+ "ldr q19, [x23, #0x30]\n"
+ "ldr q20, [x22, #0x0]\n"
+ "ldr q21, [x22, #0x10]\n"
+ "ldr q22, [x22, #0x20]\n"
+ "ldr q23, [x22, #0x30]\n"
+ "b 169f\n"
+ "168:" // Height 4: no accumulate
"movi v8.16b, #0x0\n"
"movi v9.16b, #0x0\n"
"movi v10.16b, #0x0\n"
@@ -2254,380 +2218,380 @@ void a64_hybrid_fp16_mla_6x32 (
"movi v21.16b, #0x0\n"
"movi v22.16b, #0x0\n"
"movi v23.16b, #0x0\n"
- "173:" // Height 4: setup done
- "mov x12, #0x0\n"
- "174:" // Height 4: String loop
+ "169:" // Height 4: setup done
+ "mov x27, #0x0\n"
+ "170:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w11, [x20, x12, LSL #0x2]\n"
- "tbz %x[flags], #3, 175f\n"
- "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 171f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x10, [x20, #0x0]\n"
- "ldr x28, [x20, #0x8]\n"
- "ldr x26, [x20, #0x10]\n"
- "ldr x24, [x20, #0x18]\n"
- "cbnz x12, 176f\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "ldr x22, [x20, #0x18]\n"
+ "cbnz x27, 172f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x10, x10, x19, LSL #1\n"
- "add x28, x28, x19, LSL #1\n"
- "add x26, x26, x19, LSL #1\n"
+ "add x25, x25, x19, LSL #1\n"
"add x24, x24, x19, LSL #1\n"
- "b 176f\n"
- "175:" // Height 4: setup direct input
- "mov x10, %x[input_ptr]\n"
- "add x28, x10, x19, LSL #1\n"
- "add x26, x28, x19, LSL #1\n"
- "add x24, x26, x19, LSL #1\n"
- "176:" // Height 4: input setup done
- "cmp x11, #0x8\n"
- "blt 179f\n"
- "cmp x11, #0x10\n"
- "blt 178f\n"
- "177:" // Height 4: Multiply loop: Main loop head
- "ldr q0, [x10, #0x0]\n"
- "ldr q1, [x28, #0x0]\n"
- "ldr q2, [x26, #0x0]\n"
- "ldr q3, [x24, #0x0]\n"
- "ldr q6, [x15, #0x0]\n"
+ "add x23, x23, x19, LSL #1\n"
+ "add x22, x22, x19, LSL #1\n"
+ "b 172f\n"
+ "171:" // Height 4: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19, LSL #1\n"
+ "add x23, x24, x19, LSL #1\n"
+ "add x22, x23, x19, LSL #1\n"
+ "172:" // Height 4: input setup done
+ "cmp x26, #0x8\n"
+ "blt 175f\n"
+ "ldr q0, [x25, #0x0]\n"
+ "ldr q1, [x24, #0x0]\n"
+ "cmp x26, #0x10\n"
+ "ldr q2, [x23, #0x0]\n"
+ "ldr q3, [x22, #0x0]\n"
+ "ldr q6, [x10, #0x0]\n"
+ "blt 174f\n"
+ "173:" // Height 4: Multiply loop: Main loop head
"fmla v8.8h, v6.8h, v0.h[0]\n"
- "ldr q7, [x15, #0x10]\n"
+ "ldr q7, [x10, #0x10]\n"
+ "add x25, x25, #0x10\n"
"fmla v12.8h, v6.8h, v1.h[0]\n"
- "add x10, x10, #0x10\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x24, x24, #0x10\n"
"fmla v16.8h, v6.8h, v2.h[0]\n"
- "add x28, x28, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "add x23, x23, #0x10\n"
"fmla v20.8h, v6.8h, v3.h[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "ldr q6, [x15, #0x20]\n"
+ "ldr q6, [x10, #0x20]\n"
+ "add x22, x22, #0x10\n"
"fmla v9.8h, v7.8h, v0.h[0]\n"
- "add x26, x26, #0x10\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "sub x26, x26, #0x8\n"
"fmla v13.8h, v7.8h, v1.h[0]\n"
- "add x24, x24, #0x10\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "cmp x26, #0x10\n"
"fmla v17.8h, v7.8h, v2.h[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "sub x11, x11, #0x8\n"
"fmla v21.8h, v7.8h, v3.h[0]\n"
- "ldr q7, [x15, #0x30]\n"
+ "ldr q7, [x10, #0x30]\n"
"fmla v10.8h, v6.8h, v0.h[0]\n"
- "cmp x11, #0x10\n"
"fmla v14.8h, v6.8h, v1.h[0]\n"
"fmla v18.8h, v6.8h, v2.h[0]\n"
"fmla v22.8h, v6.8h, v3.h[0]\n"
- "ldr q6, [x15, #0x40]\n"
+ "ldr q6, [x10, #0x40]\n"
"fmla v11.8h, v7.8h, v0.h[0]\n"
"fmla v15.8h, v7.8h, v1.h[0]\n"
"fmla v19.8h, v7.8h, v2.h[0]\n"
"fmla v23.8h, v7.8h, v3.h[0]\n"
- "ldr q7, [x15, #0x50]\n"
+ "ldr q7, [x10, #0x50]\n"
"fmla v8.8h, v6.8h, v0.h[1]\n"
"fmla v12.8h, v6.8h, v1.h[1]\n"
"fmla v16.8h, v6.8h, v2.h[1]\n"
"fmla v20.8h, v6.8h, v3.h[1]\n"
- "ldr q6, [x15, #0x60]\n"
+ "ldr q6, [x10, #0x60]\n"
"fmla v9.8h, v7.8h, v0.h[1]\n"
"fmla v13.8h, v7.8h, v1.h[1]\n"
"fmla v17.8h, v7.8h, v2.h[1]\n"
"fmla v21.8h, v7.8h, v3.h[1]\n"
- "ldr q7, [x15, #0x70]\n"
+ "ldr q7, [x10, #0x70]\n"
"fmla v10.8h, v6.8h, v0.h[1]\n"
"fmla v14.8h, v6.8h, v1.h[1]\n"
"fmla v18.8h, v6.8h, v2.h[1]\n"
"fmla v22.8h, v6.8h, v3.h[1]\n"
- "ldr q6, [x15, #0x80]\n"
+ "ldr q6, [x10, #0x80]\n"
"fmla v11.8h, v7.8h, v0.h[1]\n"
"fmla v15.8h, v7.8h, v1.h[1]\n"
"fmla v19.8h, v7.8h, v2.h[1]\n"
"fmla v23.8h, v7.8h, v3.h[1]\n"
- "ldr q7, [x15, #0x90]\n"
+ "ldr q7, [x10, #0x90]\n"
"fmla v8.8h, v6.8h, v0.h[2]\n"
"fmla v12.8h, v6.8h, v1.h[2]\n"
"fmla v16.8h, v6.8h, v2.h[2]\n"
"fmla v20.8h, v6.8h, v3.h[2]\n"
- "ldr q6, [x15, #0xa0]\n"
+ "ldr q6, [x10, #0xa0]\n"
"fmla v9.8h, v7.8h, v0.h[2]\n"
"fmla v13.8h, v7.8h, v1.h[2]\n"
"fmla v17.8h, v7.8h, v2.h[2]\n"
"fmla v21.8h, v7.8h, v3.h[2]\n"
- "ldr q7, [x15, #0xb0]\n"
+ "ldr q7, [x10, #0xb0]\n"
"fmla v10.8h, v6.8h, v0.h[2]\n"
"fmla v14.8h, v6.8h, v1.h[2]\n"
"fmla v18.8h, v6.8h, v2.h[2]\n"
"fmla v22.8h, v6.8h, v3.h[2]\n"
- "ldr q6, [x15, #0xc0]\n"
+ "ldr q6, [x10, #0xc0]\n"
"fmla v11.8h, v7.8h, v0.h[2]\n"
"fmla v15.8h, v7.8h, v1.h[2]\n"
"fmla v19.8h, v7.8h, v2.h[2]\n"
"fmla v23.8h, v7.8h, v3.h[2]\n"
- "ldr q7, [x15, #0xd0]\n"
+ "ldr q7, [x10, #0xd0]\n"
"fmla v8.8h, v6.8h, v0.h[3]\n"
"fmla v12.8h, v6.8h, v1.h[3]\n"
"fmla v16.8h, v6.8h, v2.h[3]\n"
"fmla v20.8h, v6.8h, v3.h[3]\n"
- "ldr q6, [x15, #0xe0]\n"
+ "ldr q6, [x10, #0xe0]\n"
"fmla v9.8h, v7.8h, v0.h[3]\n"
"fmla v13.8h, v7.8h, v1.h[3]\n"
"fmla v17.8h, v7.8h, v2.h[3]\n"
"fmla v21.8h, v7.8h, v3.h[3]\n"
- "ldr q7, [x15, #0xf0]\n"
+ "ldr q7, [x10, #0xf0]\n"
"fmla v10.8h, v6.8h, v0.h[3]\n"
"fmla v14.8h, v6.8h, v1.h[3]\n"
"fmla v18.8h, v6.8h, v2.h[3]\n"
"fmla v22.8h, v6.8h, v3.h[3]\n"
- "ldr q6, [x15, #0x100]\n"
+ "ldr q6, [x10, #0x100]\n"
"fmla v11.8h, v7.8h, v0.h[3]\n"
"fmla v15.8h, v7.8h, v1.h[3]\n"
"fmla v19.8h, v7.8h, v2.h[3]\n"
"fmla v23.8h, v7.8h, v3.h[3]\n"
- "ldr q7, [x15, #0x110]\n"
+ "ldr q7, [x10, #0x110]\n"
"fmla v8.8h, v6.8h, v0.h[4]\n"
"fmla v12.8h, v6.8h, v1.h[4]\n"
"fmla v16.8h, v6.8h, v2.h[4]\n"
"fmla v20.8h, v6.8h, v3.h[4]\n"
- "ldr q6, [x15, #0x120]\n"
+ "ldr q6, [x10, #0x120]\n"
"fmla v9.8h, v7.8h, v0.h[4]\n"
"fmla v13.8h, v7.8h, v1.h[4]\n"
"fmla v17.8h, v7.8h, v2.h[4]\n"
"fmla v21.8h, v7.8h, v3.h[4]\n"
- "ldr q7, [x15, #0x130]\n"
+ "ldr q7, [x10, #0x130]\n"
"fmla v10.8h, v6.8h, v0.h[4]\n"
"fmla v14.8h, v6.8h, v1.h[4]\n"
"fmla v18.8h, v6.8h, v2.h[4]\n"
"fmla v22.8h, v6.8h, v3.h[4]\n"
- "ldr q6, [x15, #0x140]\n"
+ "ldr q6, [x10, #0x140]\n"
"fmla v11.8h, v7.8h, v0.h[4]\n"
"fmla v15.8h, v7.8h, v1.h[4]\n"
"fmla v19.8h, v7.8h, v2.h[4]\n"
"fmla v23.8h, v7.8h, v3.h[4]\n"
- "ldr q7, [x15, #0x150]\n"
+ "ldr q7, [x10, #0x150]\n"
"fmla v8.8h, v6.8h, v0.h[5]\n"
"fmla v12.8h, v6.8h, v1.h[5]\n"
"fmla v16.8h, v6.8h, v2.h[5]\n"
"fmla v20.8h, v6.8h, v3.h[5]\n"
- "ldr q6, [x15, #0x160]\n"
+ "ldr q6, [x10, #0x160]\n"
"fmla v9.8h, v7.8h, v0.h[5]\n"
"fmla v13.8h, v7.8h, v1.h[5]\n"
"fmla v17.8h, v7.8h, v2.h[5]\n"
"fmla v21.8h, v7.8h, v3.h[5]\n"
- "ldr q7, [x15, #0x170]\n"
+ "ldr q7, [x10, #0x170]\n"
"fmla v10.8h, v6.8h, v0.h[5]\n"
"fmla v14.8h, v6.8h, v1.h[5]\n"
"fmla v18.8h, v6.8h, v2.h[5]\n"
"fmla v22.8h, v6.8h, v3.h[5]\n"
- "ldr q6, [x15, #0x180]\n"
+ "ldr q6, [x10, #0x180]\n"
"fmla v11.8h, v7.8h, v0.h[5]\n"
"fmla v15.8h, v7.8h, v1.h[5]\n"
"fmla v19.8h, v7.8h, v2.h[5]\n"
"fmla v23.8h, v7.8h, v3.h[5]\n"
- "ldr q7, [x15, #0x190]\n"
+ "ldr q7, [x10, #0x190]\n"
"fmla v8.8h, v6.8h, v0.h[6]\n"
"fmla v12.8h, v6.8h, v1.h[6]\n"
"fmla v16.8h, v6.8h, v2.h[6]\n"
"fmla v20.8h, v6.8h, v3.h[6]\n"
- "ldr q6, [x15, #0x1a0]\n"
+ "ldr q6, [x10, #0x1a0]\n"
"fmla v9.8h, v7.8h, v0.h[6]\n"
"fmla v13.8h, v7.8h, v1.h[6]\n"
"fmla v17.8h, v7.8h, v2.h[6]\n"
"fmla v21.8h, v7.8h, v3.h[6]\n"
- "ldr q7, [x15, #0x1b0]\n"
+ "ldr q7, [x10, #0x1b0]\n"
"fmla v10.8h, v6.8h, v0.h[6]\n"
"fmla v14.8h, v6.8h, v1.h[6]\n"
"fmla v18.8h, v6.8h, v2.h[6]\n"
"fmla v22.8h, v6.8h, v3.h[6]\n"
- "ldr q6, [x15, #0x1c0]\n"
+ "ldr q6, [x10, #0x1c0]\n"
"fmla v11.8h, v7.8h, v0.h[6]\n"
"fmla v15.8h, v7.8h, v1.h[6]\n"
"fmla v19.8h, v7.8h, v2.h[6]\n"
"fmla v23.8h, v7.8h, v3.h[6]\n"
- "ldr q7, [x15, #0x1d0]\n"
+ "ldr q7, [x10, #0x1d0]\n"
"fmla v8.8h, v6.8h, v0.h[7]\n"
"fmla v12.8h, v6.8h, v1.h[7]\n"
"fmla v16.8h, v6.8h, v2.h[7]\n"
"fmla v20.8h, v6.8h, v3.h[7]\n"
- "ldr q6, [x15, #0x1e0]\n"
+ "ldr q6, [x10, #0x1e0]\n"
"fmla v9.8h, v7.8h, v0.h[7]\n"
"fmla v13.8h, v7.8h, v1.h[7]\n"
"fmla v17.8h, v7.8h, v2.h[7]\n"
"fmla v21.8h, v7.8h, v3.h[7]\n"
- "ldr q7, [x15, #0x1f0]\n"
- "add x15, x15, #0x200\n"
+ "ldr q7, [x10, #0x1f0]\n"
+ "add x10, x10, #0x200\n"
"fmla v10.8h, v6.8h, v0.h[7]\n"
"fmla v14.8h, v6.8h, v1.h[7]\n"
"fmla v18.8h, v6.8h, v2.h[7]\n"
"fmla v22.8h, v6.8h, v3.h[7]\n"
+ "ldr q6, [x10, #0x0]\n"
"fmla v11.8h, v7.8h, v0.h[7]\n"
+ "ldr q0, [x25, #0x0]\n"
"fmla v15.8h, v7.8h, v1.h[7]\n"
+ "ldr q1, [x24, #0x0]\n"
"fmla v19.8h, v7.8h, v2.h[7]\n"
+ "ldr q2, [x23, #0x0]\n"
"fmla v23.8h, v7.8h, v3.h[7]\n"
- "bge 177b\n"
- "178:" // Height 4: Multiply loop: Single iteration only
- "sub x11, x11, #0x8\n"
- "ldr q0, [x10, #0x0]\n"
- "ldr q1, [x28, #0x0]\n"
- "ldr q2, [x26, #0x0]\n"
- "ldr q3, [x24, #0x0]\n"
- "ldr q6, [x15, #0x0]\n"
+ "ldr q3, [x22, #0x0]\n"
+ "bge 173b\n"
+ "174:" // Height 4: Multiply loop: Single iteration only
"fmla v8.8h, v6.8h, v0.h[0]\n"
- "ldr q7, [x15, #0x10]\n"
+ "ldr q7, [x10, #0x10]\n"
+ "sub x26, x26, #0x8\n"
"fmla v12.8h, v6.8h, v1.h[0]\n"
- "add x10, x10, #0x10\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "add x25, x25, #0x10\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
"fmla v16.8h, v6.8h, v2.h[0]\n"
- "add x28, x28, #0x10\n"
+ "add x24, x24, #0x10\n"
"fmla v20.8h, v6.8h, v3.h[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "ldr q6, [x15, #0x20]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "add x23, x23, #0x10\n"
"fmla v9.8h, v7.8h, v0.h[0]\n"
- "add x26, x26, #0x10\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "ldr q6, [x10, #0x20]\n"
+ "add x22, x22, #0x10\n"
"fmla v13.8h, v7.8h, v1.h[0]\n"
- "add x24, x24, #0x10\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
"fmla v17.8h, v7.8h, v2.h[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
"fmla v21.8h, v7.8h, v3.h[0]\n"
- "ldr q7, [x15, #0x30]\n"
+ "ldr q7, [x10, #0x30]\n"
"fmla v10.8h, v6.8h, v0.h[0]\n"
"fmla v14.8h, v6.8h, v1.h[0]\n"
"fmla v18.8h, v6.8h, v2.h[0]\n"
"fmla v22.8h, v6.8h, v3.h[0]\n"
- "ldr q6, [x15, #0x40]\n"
+ "ldr q6, [x10, #0x40]\n"
"fmla v11.8h, v7.8h, v0.h[0]\n"
"fmla v15.8h, v7.8h, v1.h[0]\n"
"fmla v19.8h, v7.8h, v2.h[0]\n"
"fmla v23.8h, v7.8h, v3.h[0]\n"
- "ldr q7, [x15, #0x50]\n"
+ "ldr q7, [x10, #0x50]\n"
"fmla v8.8h, v6.8h, v0.h[1]\n"
"fmla v12.8h, v6.8h, v1.h[1]\n"
"fmla v16.8h, v6.8h, v2.h[1]\n"
"fmla v20.8h, v6.8h, v3.h[1]\n"
- "ldr q6, [x15, #0x60]\n"
+ "ldr q6, [x10, #0x60]\n"
"fmla v9.8h, v7.8h, v0.h[1]\n"
"fmla v13.8h, v7.8h, v1.h[1]\n"
"fmla v17.8h, v7.8h, v2.h[1]\n"
"fmla v21.8h, v7.8h, v3.h[1]\n"
- "ldr q7, [x15, #0x70]\n"
+ "ldr q7, [x10, #0x70]\n"
"fmla v10.8h, v6.8h, v0.h[1]\n"
"fmla v14.8h, v6.8h, v1.h[1]\n"
"fmla v18.8h, v6.8h, v2.h[1]\n"
"fmla v22.8h, v6.8h, v3.h[1]\n"
- "ldr q6, [x15, #0x80]\n"
+ "ldr q6, [x10, #0x80]\n"
"fmla v11.8h, v7.8h, v0.h[1]\n"
"fmla v15.8h, v7.8h, v1.h[1]\n"
"fmla v19.8h, v7.8h, v2.h[1]\n"
"fmla v23.8h, v7.8h, v3.h[1]\n"
- "ldr q7, [x15, #0x90]\n"
+ "ldr q7, [x10, #0x90]\n"
"fmla v8.8h, v6.8h, v0.h[2]\n"
"fmla v12.8h, v6.8h, v1.h[2]\n"
"fmla v16.8h, v6.8h, v2.h[2]\n"
"fmla v20.8h, v6.8h, v3.h[2]\n"
- "ldr q6, [x15, #0xa0]\n"
+ "ldr q6, [x10, #0xa0]\n"
"fmla v9.8h, v7.8h, v0.h[2]\n"
"fmla v13.8h, v7.8h, v1.h[2]\n"
"fmla v17.8h, v7.8h, v2.h[2]\n"
"fmla v21.8h, v7.8h, v3.h[2]\n"
- "ldr q7, [x15, #0xb0]\n"
+ "ldr q7, [x10, #0xb0]\n"
"fmla v10.8h, v6.8h, v0.h[2]\n"
"fmla v14.8h, v6.8h, v1.h[2]\n"
"fmla v18.8h, v6.8h, v2.h[2]\n"
"fmla v22.8h, v6.8h, v3.h[2]\n"
- "ldr q6, [x15, #0xc0]\n"
+ "ldr q6, [x10, #0xc0]\n"
"fmla v11.8h, v7.8h, v0.h[2]\n"
"fmla v15.8h, v7.8h, v1.h[2]\n"
"fmla v19.8h, v7.8h, v2.h[2]\n"
"fmla v23.8h, v7.8h, v3.h[2]\n"
- "ldr q7, [x15, #0xd0]\n"
+ "ldr q7, [x10, #0xd0]\n"
"fmla v8.8h, v6.8h, v0.h[3]\n"
"fmla v12.8h, v6.8h, v1.h[3]\n"
"fmla v16.8h, v6.8h, v2.h[3]\n"
"fmla v20.8h, v6.8h, v3.h[3]\n"
- "ldr q6, [x15, #0xe0]\n"
+ "ldr q6, [x10, #0xe0]\n"
"fmla v9.8h, v7.8h, v0.h[3]\n"
"fmla v13.8h, v7.8h, v1.h[3]\n"
"fmla v17.8h, v7.8h, v2.h[3]\n"
"fmla v21.8h, v7.8h, v3.h[3]\n"
- "ldr q7, [x15, #0xf0]\n"
+ "ldr q7, [x10, #0xf0]\n"
"fmla v10.8h, v6.8h, v0.h[3]\n"
"fmla v14.8h, v6.8h, v1.h[3]\n"
"fmla v18.8h, v6.8h, v2.h[3]\n"
"fmla v22.8h, v6.8h, v3.h[3]\n"
- "ldr q6, [x15, #0x100]\n"
+ "ldr q6, [x10, #0x100]\n"
"fmla v11.8h, v7.8h, v0.h[3]\n"
"fmla v15.8h, v7.8h, v1.h[3]\n"
"fmla v19.8h, v7.8h, v2.h[3]\n"
"fmla v23.8h, v7.8h, v3.h[3]\n"
- "ldr q7, [x15, #0x110]\n"
+ "ldr q7, [x10, #0x110]\n"
"fmla v8.8h, v6.8h, v0.h[4]\n"
"fmla v12.8h, v6.8h, v1.h[4]\n"
"fmla v16.8h, v6.8h, v2.h[4]\n"
"fmla v20.8h, v6.8h, v3.h[4]\n"
- "ldr q6, [x15, #0x120]\n"
+ "ldr q6, [x10, #0x120]\n"
"fmla v9.8h, v7.8h, v0.h[4]\n"
"fmla v13.8h, v7.8h, v1.h[4]\n"
"fmla v17.8h, v7.8h, v2.h[4]\n"
"fmla v21.8h, v7.8h, v3.h[4]\n"
- "ldr q7, [x15, #0x130]\n"
+ "ldr q7, [x10, #0x130]\n"
"fmla v10.8h, v6.8h, v0.h[4]\n"
"fmla v14.8h, v6.8h, v1.h[4]\n"
"fmla v18.8h, v6.8h, v2.h[4]\n"
"fmla v22.8h, v6.8h, v3.h[4]\n"
- "ldr q6, [x15, #0x140]\n"
+ "ldr q6, [x10, #0x140]\n"
"fmla v11.8h, v7.8h, v0.h[4]\n"
"fmla v15.8h, v7.8h, v1.h[4]\n"
"fmla v19.8h, v7.8h, v2.h[4]\n"
"fmla v23.8h, v7.8h, v3.h[4]\n"
- "ldr q7, [x15, #0x150]\n"
+ "ldr q7, [x10, #0x150]\n"
"fmla v8.8h, v6.8h, v0.h[5]\n"
"fmla v12.8h, v6.8h, v1.h[5]\n"
"fmla v16.8h, v6.8h, v2.h[5]\n"
"fmla v20.8h, v6.8h, v3.h[5]\n"
- "ldr q6, [x15, #0x160]\n"
+ "ldr q6, [x10, #0x160]\n"
"fmla v9.8h, v7.8h, v0.h[5]\n"
"fmla v13.8h, v7.8h, v1.h[5]\n"
"fmla v17.8h, v7.8h, v2.h[5]\n"
"fmla v21.8h, v7.8h, v3.h[5]\n"
- "ldr q7, [x15, #0x170]\n"
+ "ldr q7, [x10, #0x170]\n"
"fmla v10.8h, v6.8h, v0.h[5]\n"
"fmla v14.8h, v6.8h, v1.h[5]\n"
"fmla v18.8h, v6.8h, v2.h[5]\n"
"fmla v22.8h, v6.8h, v3.h[5]\n"
- "ldr q6, [x15, #0x180]\n"
+ "ldr q6, [x10, #0x180]\n"
"fmla v11.8h, v7.8h, v0.h[5]\n"
"fmla v15.8h, v7.8h, v1.h[5]\n"
"fmla v19.8h, v7.8h, v2.h[5]\n"
"fmla v23.8h, v7.8h, v3.h[5]\n"
- "ldr q7, [x15, #0x190]\n"
+ "ldr q7, [x10, #0x190]\n"
"fmla v8.8h, v6.8h, v0.h[6]\n"
"fmla v12.8h, v6.8h, v1.h[6]\n"
"fmla v16.8h, v6.8h, v2.h[6]\n"
"fmla v20.8h, v6.8h, v3.h[6]\n"
- "ldr q6, [x15, #0x1a0]\n"
+ "ldr q6, [x10, #0x1a0]\n"
"fmla v9.8h, v7.8h, v0.h[6]\n"
"fmla v13.8h, v7.8h, v1.h[6]\n"
"fmla v17.8h, v7.8h, v2.h[6]\n"
"fmla v21.8h, v7.8h, v3.h[6]\n"
- "ldr q7, [x15, #0x1b0]\n"
+ "ldr q7, [x10, #0x1b0]\n"
"fmla v10.8h, v6.8h, v0.h[6]\n"
"fmla v14.8h, v6.8h, v1.h[6]\n"
"fmla v18.8h, v6.8h, v2.h[6]\n"
"fmla v22.8h, v6.8h, v3.h[6]\n"
- "ldr q6, [x15, #0x1c0]\n"
+ "ldr q6, [x10, #0x1c0]\n"
"fmla v11.8h, v7.8h, v0.h[6]\n"
"fmla v15.8h, v7.8h, v1.h[6]\n"
"fmla v19.8h, v7.8h, v2.h[6]\n"
"fmla v23.8h, v7.8h, v3.h[6]\n"
- "ldr q7, [x15, #0x1d0]\n"
+ "ldr q7, [x10, #0x1d0]\n"
"fmla v8.8h, v6.8h, v0.h[7]\n"
"fmla v12.8h, v6.8h, v1.h[7]\n"
"fmla v16.8h, v6.8h, v2.h[7]\n"
"fmla v20.8h, v6.8h, v3.h[7]\n"
- "ldr q6, [x15, #0x1e0]\n"
+ "ldr q6, [x10, #0x1e0]\n"
"fmla v9.8h, v7.8h, v0.h[7]\n"
"fmla v13.8h, v7.8h, v1.h[7]\n"
"fmla v17.8h, v7.8h, v2.h[7]\n"
"fmla v21.8h, v7.8h, v3.h[7]\n"
- "ldr q7, [x15, #0x1f0]\n"
- "add x15, x15, #0x200\n"
+ "ldr q7, [x10, #0x1f0]\n"
+ "add x10, x10, #0x200\n"
"fmla v10.8h, v6.8h, v0.h[7]\n"
"fmla v14.8h, v6.8h, v1.h[7]\n"
"fmla v18.8h, v6.8h, v2.h[7]\n"
@@ -2636,28 +2600,28 @@ void a64_hybrid_fp16_mla_6x32 (
"fmla v15.8h, v7.8h, v1.h[7]\n"
"fmla v19.8h, v7.8h, v2.h[7]\n"
"fmla v23.8h, v7.8h, v3.h[7]\n"
- "179:" // Height 4: Multiply loop: Main loop skip
- "cbz x11, 181f\n"
- "180:" // Height 4: Multiply loop: Odd block loop
- "ldr h0, [x10], #0x2\n"
- "ldr h1, [x28], #0x2\n"
- "ldr h2, [x26], #0x2\n"
- "ldr h3, [x24], #0x2\n"
- "ldr q6, [x15, #0x0]\n"
+ "175:" // Height 4: Multiply loop: Main loop skip
+ "cbz x26, 177f\n"
+ "176:" // Height 4: Multiply loop: Odd block loop
+ "ldr h0, [x25], #0x2\n"
+ "sub x26, x26, #0x1\n"
+ "ldr h1, [x24], #0x2\n"
+ "ldr h2, [x23], #0x2\n"
+ "ldr h3, [x22], #0x2\n"
+ "ldr q6, [x10, #0x0]\n"
"fmla v8.8h, v6.8h, v0.h[0]\n"
- "ldr q7, [x15, #0x10]\n"
+ "ldr q7, [x10, #0x10]\n"
"fmla v12.8h, v6.8h, v1.h[0]\n"
- "sub x11, x11, #0x1\n"
"fmla v16.8h, v6.8h, v2.h[0]\n"
"fmla v20.8h, v6.8h, v3.h[0]\n"
- "ldr q6, [x15, #0x20]\n"
+ "ldr q6, [x10, #0x20]\n"
"fmla v9.8h, v7.8h, v0.h[0]\n"
"fmla v13.8h, v7.8h, v1.h[0]\n"
"fmla v17.8h, v7.8h, v2.h[0]\n"
"fmla v21.8h, v7.8h, v3.h[0]\n"
- "ldr q7, [x15, #0x30]\n"
+ "ldr q7, [x10, #0x30]\n"
+ "add x10, x10, #0x40\n"
"fmla v10.8h, v6.8h, v0.h[0]\n"
- "add x15, x15, #0x40\n"
"fmla v14.8h, v6.8h, v1.h[0]\n"
"fmla v18.8h, v6.8h, v2.h[0]\n"
"fmla v22.8h, v6.8h, v3.h[0]\n"
@@ -2665,17 +2629,21 @@ void a64_hybrid_fp16_mla_6x32 (
"fmla v15.8h, v7.8h, v1.h[0]\n"
"fmla v19.8h, v7.8h, v2.h[0]\n"
"fmla v23.8h, v7.8h, v3.h[0]\n"
- "cbnz x11, 180b\n"
- "181:" // Height 4: Multiply loop: No odd multiplies
+ "cbnz x26, 176b\n"
+ "177:" // Height 4: Multiply loop: No odd multiplies
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x12, x12, #0x1\n"
- "cmp x12, x19\n"
- "bne 174b\n"
- "prfm pstl1keep, [x13, #0x0]\n"
- "prfm pstl1keep, [x9, #0x0]\n"
- "prfm pstl1keep, [x27, #0x0]\n"
- "prfm pstl1keep, [x25, #0x0]\n"
- "tbz %x[flags], #1, 182f\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 170b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "prfm pstl1keep, [x28, #0x0]\n"
+ "add x24, x28, x19, LSL #1\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "add x23, x24, x19, LSL #1\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "add x22, x23, x19, LSL #1\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "tbz %x[flags], #1, 178f\n"
"add x19, %x[args_ptr], %[offset_min]\n"
"ld1r { v1.8h }, [x19]\n"
"add x19, %x[args_ptr], %[offset_max]\n"
@@ -2712,258 +2680,237 @@ void a64_hybrid_fp16_mla_6x32 (
"fmax v21.8h, v21.8h, v1.8h\n"
"fmax v22.8h, v22.8h, v1.8h\n"
"fmax v23.8h, v23.8h, v1.8h\n"
- "182:" // Height 4: No activation
- "cmp x16, #0x20\n"
- "bge 199f\n"
- "tbz x16, #4, 190f\n"
- "st1 { v8.8h }, [x13], #0x10\n"
- "st1 { v9.8h }, [x13], #0x10\n"
- "st1 { v12.8h }, [x9], #0x10\n"
- "st1 { v13.8h }, [x9], #0x10\n"
- "st1 { v16.8h }, [x27], #0x10\n"
- "st1 { v17.8h }, [x27], #0x10\n"
- "st1 { v20.8h }, [x25], #0x10\n"
- "st1 { v21.8h }, [x25], #0x10\n"
- "tbz x16, #3, 186f\n"
- "st1 { v10.8h }, [x13], #0x10\n"
- "st1 { v14.8h }, [x9], #0x10\n"
- "st1 { v18.8h }, [x27], #0x10\n"
- "st1 { v22.8h }, [x25], #0x10\n"
- "tbz x16, #2, 184f\n"
- "str d11, [x13], #0x8\n"
- "str d15, [x9], #0x8\n"
- "str d19, [x27], #0x8\n"
- "str d23, [x25], #0x8\n"
- "tbz x16, #1, 183f\n"
- "st1 { v11.s }[2], [x13], #0x4\n"
- "st1 { v15.s }[2], [x9], #0x4\n"
- "st1 { v19.s }[2], [x27], #0x4\n"
- "st1 { v23.s }[2], [x25], #0x4\n"
- "tbz x16, #0, 198f\n"
- "st1 { v11.h }[6], [x13]\n"
- "st1 { v15.h }[6], [x9]\n"
- "st1 { v19.h }[6], [x27]\n"
- "st1 { v23.h }[6], [x25]\n"
- "b 198f\n"
- "183:" // Height 4: Partial direct writeback: partial_1_28
- "tbz x16, #0, 198f\n"
- "st1 { v11.h }[4], [x13]\n"
- "st1 { v15.h }[4], [x9]\n"
- "st1 { v19.h }[4], [x27]\n"
- "st1 { v23.h }[4], [x25]\n"
- "b 198f\n"
- "184:" // Height 4: Partial direct writeback: partial_2_24
- "tbz x16, #1, 185f\n"
- "str s11, [x13], #0x4\n"
- "str s15, [x9], #0x4\n"
- "str s19, [x27], #0x4\n"
- "str s23, [x25], #0x4\n"
- "tbz x16, #0, 198f\n"
- "st1 { v11.h }[2], [x13]\n"
- "st1 { v15.h }[2], [x9]\n"
- "st1 { v19.h }[2], [x27]\n"
- "st1 { v23.h }[2], [x25]\n"
- "b 198f\n"
- "185:" // Height 4: Partial direct writeback: partial_1_24
- "tbz x16, #0, 198f\n"
- "str h11, [x13, #0x0]\n"
- "str h15, [x9, #0x0]\n"
- "str h19, [x27, #0x0]\n"
- "str h23, [x25, #0x0]\n"
- "b 198f\n"
- "186:" // Height 4: Partial direct writeback: partial_4_16
- "tbz x16, #2, 188f\n"
- "str d10, [x13], #0x8\n"
- "str d14, [x9], #0x8\n"
- "str d18, [x27], #0x8\n"
- "str d22, [x25], #0x8\n"
- "tbz x16, #1, 187f\n"
- "st1 { v10.s }[2], [x13], #0x4\n"
- "st1 { v14.s }[2], [x9], #0x4\n"
- "st1 { v18.s }[2], [x27], #0x4\n"
- "st1 { v22.s }[2], [x25], #0x4\n"
- "tbz x16, #0, 198f\n"
- "st1 { v10.h }[6], [x13]\n"
- "st1 { v14.h }[6], [x9]\n"
- "st1 { v18.h }[6], [x27]\n"
- "st1 { v22.h }[6], [x25]\n"
- "b 198f\n"
- "187:" // Height 4: Partial direct writeback: partial_1_20
- "tbz x16, #0, 198f\n"
- "st1 { v10.h }[4], [x13]\n"
- "st1 { v14.h }[4], [x9]\n"
- "st1 { v18.h }[4], [x27]\n"
- "st1 { v22.h }[4], [x25]\n"
- "b 198f\n"
- "188:" // Height 4: Partial direct writeback: partial_2_16
- "tbz x16, #1, 189f\n"
- "str s10, [x13], #0x4\n"
- "str s14, [x9], #0x4\n"
- "str s18, [x27], #0x4\n"
- "str s22, [x25], #0x4\n"
- "tbz x16, #0, 198f\n"
- "st1 { v10.h }[2], [x13]\n"
- "st1 { v14.h }[2], [x9]\n"
- "st1 { v18.h }[2], [x27]\n"
- "st1 { v22.h }[2], [x25]\n"
- "b 198f\n"
- "189:" // Height 4: Partial direct writeback: partial_1_16
- "tbz x16, #0, 198f\n"
- "str h10, [x13, #0x0]\n"
- "str h14, [x9, #0x0]\n"
- "str h18, [x27, #0x0]\n"
- "str h22, [x25, #0x0]\n"
- "b 198f\n"
- "190:" // Height 4: Partial direct writeback: partial_8_0
- "tbz x16, #3, 194f\n"
- "st1 { v8.8h }, [x13], #0x10\n"
- "st1 { v12.8h }, [x9], #0x10\n"
- "st1 { v16.8h }, [x27], #0x10\n"
- "st1 { v20.8h }, [x25], #0x10\n"
- "tbz x16, #2, 192f\n"
- "str d9, [x13], #0x8\n"
- "str d13, [x9], #0x8\n"
- "str d17, [x27], #0x8\n"
- "str d21, [x25], #0x8\n"
- "tbz x16, #1, 191f\n"
- "st1 { v9.s }[2], [x13], #0x4\n"
- "st1 { v13.s }[2], [x9], #0x4\n"
- "st1 { v17.s }[2], [x27], #0x4\n"
- "st1 { v21.s }[2], [x25], #0x4\n"
- "tbz x16, #0, 198f\n"
- "st1 { v9.h }[6], [x13]\n"
- "st1 { v13.h }[6], [x9]\n"
- "st1 { v17.h }[6], [x27]\n"
- "st1 { v21.h }[6], [x25]\n"
- "b 198f\n"
- "191:" // Height 4: Partial direct writeback: partial_1_12
- "tbz x16, #0, 198f\n"
- "st1 { v9.h }[4], [x13]\n"
- "st1 { v13.h }[4], [x9]\n"
- "st1 { v17.h }[4], [x27]\n"
- "st1 { v21.h }[4], [x25]\n"
- "b 198f\n"
- "192:" // Height 4: Partial direct writeback: partial_2_8
- "tbz x16, #1, 193f\n"
- "str s9, [x13], #0x4\n"
- "str s13, [x9], #0x4\n"
- "str s17, [x27], #0x4\n"
- "str s21, [x25], #0x4\n"
- "tbz x16, #0, 198f\n"
- "st1 { v9.h }[2], [x13]\n"
- "st1 { v13.h }[2], [x9]\n"
- "st1 { v17.h }[2], [x27]\n"
- "st1 { v21.h }[2], [x25]\n"
- "b 198f\n"
- "193:" // Height 4: Partial direct writeback: partial_1_8
- "tbz x16, #0, 198f\n"
- "str h9, [x13, #0x0]\n"
- "str h13, [x9, #0x0]\n"
- "str h17, [x27, #0x0]\n"
- "str h21, [x25, #0x0]\n"
- "b 198f\n"
- "194:" // Height 4: Partial direct writeback: partial_4_0
- "tbz x16, #2, 196f\n"
- "str d8, [x13], #0x8\n"
- "str d12, [x9], #0x8\n"
- "str d16, [x27], #0x8\n"
- "str d20, [x25], #0x8\n"
- "tbz x16, #1, 195f\n"
- "st1 { v8.s }[2], [x13], #0x4\n"
- "st1 { v12.s }[2], [x9], #0x4\n"
- "st1 { v16.s }[2], [x27], #0x4\n"
- "st1 { v20.s }[2], [x25], #0x4\n"
- "tbz x16, #0, 198f\n"
- "st1 { v8.h }[6], [x13]\n"
- "st1 { v12.h }[6], [x9]\n"
- "st1 { v16.h }[6], [x27]\n"
- "st1 { v20.h }[6], [x25]\n"
- "b 198f\n"
- "195:" // Height 4: Partial direct writeback: partial_1_4
- "tbz x16, #0, 198f\n"
- "st1 { v8.h }[4], [x13]\n"
- "st1 { v12.h }[4], [x9]\n"
- "st1 { v16.h }[4], [x27]\n"
- "st1 { v20.h }[4], [x25]\n"
- "b 198f\n"
- "196:" // Height 4: Partial direct writeback: partial_2_0
- "tbz x16, #1, 197f\n"
- "str s8, [x13], #0x4\n"
- "str s12, [x9], #0x4\n"
- "str s16, [x27], #0x4\n"
- "str s20, [x25], #0x4\n"
- "tbz x16, #0, 198f\n"
- "st1 { v8.h }[2], [x13]\n"
- "st1 { v12.h }[2], [x9]\n"
- "st1 { v16.h }[2], [x27]\n"
- "st1 { v20.h }[2], [x25]\n"
- "b 198f\n"
- "197:" // Height 4: Partial direct writeback: partial_1_0
- "str h8, [x13, #0x0]\n"
- "str h12, [x9, #0x0]\n"
- "str h16, [x27, #0x0]\n"
- "str h20, [x25, #0x0]\n"
- "198:" // Height 4: Partial direct writeback: Done
- "b 200f\n"
- "199:" // Height 4: Full writeback
- "str q8, [x13, #0x0]\n"
- "str q9, [x13, #0x10]\n"
- "str q10, [x13, #0x20]\n"
- "str q11, [x13, #0x30]\n"
- "str q12, [x9, #0x0]\n"
- "str q13, [x9, #0x10]\n"
- "str q14, [x9, #0x20]\n"
- "str q15, [x9, #0x30]\n"
- "str q16, [x27, #0x0]\n"
- "str q17, [x27, #0x10]\n"
- "str q18, [x27, #0x20]\n"
- "str q19, [x27, #0x30]\n"
- "str q20, [x25, #0x0]\n"
- "str q21, [x25, #0x10]\n"
- "str q22, [x25, #0x20]\n"
- "str q23, [x25, #0x30]\n"
- "add x13, x13, #0x40\n"
- "add x9, x9, #0x40\n"
- "add x27, x27, #0x40\n"
- "add x25, x25, #0x40\n"
- "200:" // Height 4: Writeback done
- "subs x16, x16, #0x20\n"
- "bgt 153b\n"
- "b 302f\n"
- "201:" // Height 5
- "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x14, %x[bias]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "tbz %x[flags], #2, 202f\n"
- "ldr x13, [%x[output_ptr], #0x0]\n"
- "add x13, x13, x19, LSL #1\n"
- "ldr x9, [%x[output_ptr], #0x8]\n"
- "ldr x27, [%x[output_ptr], #0x10]\n"
- "add x9, x9, x19, LSL #1\n"
- "ldr x25, [%x[output_ptr], #0x18]\n"
- "ldr x23, [%x[output_ptr], #0x20]\n"
- "add x27, x27, x19, LSL #1\n"
- "add x25, x25, x19, LSL #1\n"
- "add x23, x23, x19, LSL #1\n"
- "b 203f\n"
- "202:" // Height 5: setup direct output
- "mov x13, %x[output_ptr]\n"
- "add x9, x13, x19, LSL #1\n"
- "add x27, x9, x19, LSL #1\n"
- "add x25, x27, x19, LSL #1\n"
- "add x23, x25, x19, LSL #1\n"
- "203:" // Height 5: Column loop
- "cbz x14, 204f\n"
- "ldr q8, [x14, #0x0]\n"
+ "178:" // Height 4: No activation
+ "cmp x11, #0x20\n"
+ "bge 195f\n"
+ "tbz x11, #4, 186f\n"
+ "st1 { v8.8h }, [x28], #0x10\n"
+ "st1 { v9.8h }, [x28], #0x10\n"
+ "st1 { v12.8h }, [x24], #0x10\n"
+ "st1 { v13.8h }, [x24], #0x10\n"
+ "st1 { v16.8h }, [x23], #0x10\n"
+ "st1 { v17.8h }, [x23], #0x10\n"
+ "st1 { v20.8h }, [x22], #0x10\n"
+ "st1 { v21.8h }, [x22], #0x10\n"
+ "tbz x11, #3, 182f\n"
+ "st1 { v10.8h }, [x28], #0x10\n"
+ "st1 { v14.8h }, [x24], #0x10\n"
+ "st1 { v18.8h }, [x23], #0x10\n"
+ "st1 { v22.8h }, [x22], #0x10\n"
+ "tbz x11, #2, 180f\n"
+ "str d11, [x28], #0x8\n"
+ "str d15, [x24], #0x8\n"
+ "str d19, [x23], #0x8\n"
+ "str d23, [x22], #0x8\n"
+ "tbz x11, #1, 179f\n"
+ "st1 { v11.s }[2], [x28], #0x4\n"
+ "st1 { v15.s }[2], [x24], #0x4\n"
+ "st1 { v19.s }[2], [x23], #0x4\n"
+ "st1 { v23.s }[2], [x22], #0x4\n"
+ "tbz x11, #0, 194f\n"
+ "st1 { v11.h }[6], [x28]\n"
+ "st1 { v15.h }[6], [x24]\n"
+ "st1 { v19.h }[6], [x23]\n"
+ "st1 { v23.h }[6], [x22]\n"
+ "b 194f\n"
+ "179:" // Height 4: Partial direct writeback: partial_1_28
+ "tbz x11, #0, 194f\n"
+ "st1 { v11.h }[4], [x28]\n"
+ "st1 { v15.h }[4], [x24]\n"
+ "st1 { v19.h }[4], [x23]\n"
+ "st1 { v23.h }[4], [x22]\n"
+ "b 194f\n"
+ "180:" // Height 4: Partial direct writeback: partial_2_24
+ "tbz x11, #1, 181f\n"
+ "str s11, [x28], #0x4\n"
+ "str s15, [x24], #0x4\n"
+ "str s19, [x23], #0x4\n"
+ "str s23, [x22], #0x4\n"
+ "tbz x11, #0, 194f\n"
+ "st1 { v11.h }[2], [x28]\n"
+ "st1 { v15.h }[2], [x24]\n"
+ "st1 { v19.h }[2], [x23]\n"
+ "st1 { v23.h }[2], [x22]\n"
+ "b 194f\n"
+ "181:" // Height 4: Partial direct writeback: partial_1_24
+ "tbz x11, #0, 194f\n"
+ "str h11, [x28, #0x0]\n"
+ "str h15, [x24, #0x0]\n"
+ "str h19, [x23, #0x0]\n"
+ "str h23, [x22, #0x0]\n"
+ "b 194f\n"
+ "182:" // Height 4: Partial direct writeback: partial_4_16
+ "tbz x11, #2, 184f\n"
+ "str d10, [x28], #0x8\n"
+ "str d14, [x24], #0x8\n"
+ "str d18, [x23], #0x8\n"
+ "str d22, [x22], #0x8\n"
+ "tbz x11, #1, 183f\n"
+ "st1 { v10.s }[2], [x28], #0x4\n"
+ "st1 { v14.s }[2], [x24], #0x4\n"
+ "st1 { v18.s }[2], [x23], #0x4\n"
+ "st1 { v22.s }[2], [x22], #0x4\n"
+ "tbz x11, #0, 194f\n"
+ "st1 { v10.h }[6], [x28]\n"
+ "st1 { v14.h }[6], [x24]\n"
+ "st1 { v18.h }[6], [x23]\n"
+ "st1 { v22.h }[6], [x22]\n"
+ "b 194f\n"
+ "183:" // Height 4: Partial direct writeback: partial_1_20
+ "tbz x11, #0, 194f\n"
+ "st1 { v10.h }[4], [x28]\n"
+ "st1 { v14.h }[4], [x24]\n"
+ "st1 { v18.h }[4], [x23]\n"
+ "st1 { v22.h }[4], [x22]\n"
+ "b 194f\n"
+ "184:" // Height 4: Partial direct writeback: partial_2_16
+ "tbz x11, #1, 185f\n"
+ "str s10, [x28], #0x4\n"
+ "str s14, [x24], #0x4\n"
+ "str s18, [x23], #0x4\n"
+ "str s22, [x22], #0x4\n"
+ "tbz x11, #0, 194f\n"
+ "st1 { v10.h }[2], [x28]\n"
+ "st1 { v14.h }[2], [x24]\n"
+ "st1 { v18.h }[2], [x23]\n"
+ "st1 { v22.h }[2], [x22]\n"
+ "b 194f\n"
+ "185:" // Height 4: Partial direct writeback: partial_1_16
+ "tbz x11, #0, 194f\n"
+ "str h10, [x28, #0x0]\n"
+ "str h14, [x24, #0x0]\n"
+ "str h18, [x23, #0x0]\n"
+ "str h22, [x22, #0x0]\n"
+ "b 194f\n"
+ "186:" // Height 4: Partial direct writeback: partial_8_0
+ "tbz x11, #3, 190f\n"
+ "st1 { v8.8h }, [x28], #0x10\n"
+ "st1 { v12.8h }, [x24], #0x10\n"
+ "st1 { v16.8h }, [x23], #0x10\n"
+ "st1 { v20.8h }, [x22], #0x10\n"
+ "tbz x11, #2, 188f\n"
+ "str d9, [x28], #0x8\n"
+ "str d13, [x24], #0x8\n"
+ "str d17, [x23], #0x8\n"
+ "str d21, [x22], #0x8\n"
+ "tbz x11, #1, 187f\n"
+ "st1 { v9.s }[2], [x28], #0x4\n"
+ "st1 { v13.s }[2], [x24], #0x4\n"
+ "st1 { v17.s }[2], [x23], #0x4\n"
+ "st1 { v21.s }[2], [x22], #0x4\n"
+ "tbz x11, #0, 194f\n"
+ "st1 { v9.h }[6], [x28]\n"
+ "st1 { v13.h }[6], [x24]\n"
+ "st1 { v17.h }[6], [x23]\n"
+ "st1 { v21.h }[6], [x22]\n"
+ "b 194f\n"
+ "187:" // Height 4: Partial direct writeback: partial_1_12
+ "tbz x11, #0, 194f\n"
+ "st1 { v9.h }[4], [x28]\n"
+ "st1 { v13.h }[4], [x24]\n"
+ "st1 { v17.h }[4], [x23]\n"
+ "st1 { v21.h }[4], [x22]\n"
+ "b 194f\n"
+ "188:" // Height 4: Partial direct writeback: partial_2_8
+ "tbz x11, #1, 189f\n"
+ "str s9, [x28], #0x4\n"
+ "str s13, [x24], #0x4\n"
+ "str s17, [x23], #0x4\n"
+ "str s21, [x22], #0x4\n"
+ "tbz x11, #0, 194f\n"
+ "st1 { v9.h }[2], [x28]\n"
+ "st1 { v13.h }[2], [x24]\n"
+ "st1 { v17.h }[2], [x23]\n"
+ "st1 { v21.h }[2], [x22]\n"
+ "b 194f\n"
+ "189:" // Height 4: Partial direct writeback: partial_1_8
+ "tbz x11, #0, 194f\n"
+ "str h9, [x28, #0x0]\n"
+ "str h13, [x24, #0x0]\n"
+ "str h17, [x23, #0x0]\n"
+ "str h21, [x22, #0x0]\n"
+ "b 194f\n"
+ "190:" // Height 4: Partial direct writeback: partial_4_0
+ "tbz x11, #2, 192f\n"
+ "str d8, [x28], #0x8\n"
+ "str d12, [x24], #0x8\n"
+ "str d16, [x23], #0x8\n"
+ "str d20, [x22], #0x8\n"
+ "tbz x11, #1, 191f\n"
+ "st1 { v8.s }[2], [x28], #0x4\n"
+ "st1 { v12.s }[2], [x24], #0x4\n"
+ "st1 { v16.s }[2], [x23], #0x4\n"
+ "st1 { v20.s }[2], [x22], #0x4\n"
+ "tbz x11, #0, 194f\n"
+ "st1 { v8.h }[6], [x28]\n"
+ "st1 { v12.h }[6], [x24]\n"
+ "st1 { v16.h }[6], [x23]\n"
+ "st1 { v20.h }[6], [x22]\n"
+ "b 194f\n"
+ "191:" // Height 4: Partial direct writeback: partial_1_4
+ "tbz x11, #0, 194f\n"
+ "st1 { v8.h }[4], [x28]\n"
+ "st1 { v12.h }[4], [x24]\n"
+ "st1 { v16.h }[4], [x23]\n"
+ "st1 { v20.h }[4], [x22]\n"
+ "b 194f\n"
+ "192:" // Height 4: Partial direct writeback: partial_2_0
+ "tbz x11, #1, 193f\n"
+ "str s8, [x28], #0x4\n"
+ "str s12, [x24], #0x4\n"
+ "str s16, [x23], #0x4\n"
+ "str s20, [x22], #0x4\n"
+ "tbz x11, #0, 194f\n"
+ "st1 { v8.h }[2], [x28]\n"
+ "st1 { v12.h }[2], [x24]\n"
+ "st1 { v16.h }[2], [x23]\n"
+ "st1 { v20.h }[2], [x22]\n"
+ "b 194f\n"
+ "193:" // Height 4: Partial direct writeback: partial_1_0
+ "str h8, [x28, #0x0]\n"
+ "str h12, [x24, #0x0]\n"
+ "str h16, [x23, #0x0]\n"
+ "str h20, [x22, #0x0]\n"
+ "194:" // Height 4: Partial direct writeback: Done
+ "b 196f\n"
+ "195:" // Height 4: Full writeback
+ "str q8, [x28, #0x0]\n"
+ "str q9, [x28, #0x10]\n"
+ "str q10, [x28, #0x20]\n"
+ "str q11, [x28, #0x30]\n"
+ "add x28, x28, #0x40\n"
+ "str q12, [x24, #0x0]\n"
+ "str q13, [x24, #0x10]\n"
+ "str q14, [x24, #0x20]\n"
+ "str q15, [x24, #0x30]\n"
+ "str q16, [x23, #0x0]\n"
+ "str q17, [x23, #0x10]\n"
+ "str q18, [x23, #0x20]\n"
+ "str q19, [x23, #0x30]\n"
+ "str q20, [x22, #0x0]\n"
+ "str q21, [x22, #0x10]\n"
+ "str q22, [x22, #0x20]\n"
+ "str q23, [x22, #0x30]\n"
+ "196:" // Height 4: Writeback done
+ "subs x11, x11, #0x20\n"
+ "bgt 149b\n"
+ "b 296f\n"
+ "197:" // Height 5
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x9, %x[bias]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "198:" // Height 5: Column loop
+ "cbz x9, 199f\n"
+ "ldr q8, [x9, #0x0]\n"
"mov v12.16b, v8.16b\n"
- "ldr q9, [x14, #0x10]\n"
+ "ldr q9, [x9, #0x10]\n"
"mov v16.16b, v8.16b\n"
- "ldr q10, [x14, #0x20]\n"
+ "ldr q10, [x9, #0x20]\n"
"mov v20.16b, v8.16b\n"
- "ldr q11, [x14, #0x30]\n"
+ "ldr q11, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
"mov v24.16b, v8.16b\n"
- "add x14, x14, #0x40\n"
"mov v13.16b, v9.16b\n"
"mov v17.16b, v9.16b\n"
"mov v14.16b, v10.16b\n"
@@ -2976,277 +2923,278 @@ void a64_hybrid_fp16_mla_6x32 (
"mov v25.16b, v9.16b\n"
"mov v26.16b, v10.16b\n"
"mov v27.16b, v11.16b\n"
- "b 223f\n"
- "204:" // Height 5: no bias
- "tbz %x[flags], #0, 222f\n"
- "cmp x16, #0x20\n"
- "bge 221f\n"
- "tbz x16, #4, 212f\n"
- "ld1 { v8.8h }, [x13], #0x10\n"
- "ld1 { v12.8h }, [x9], #0x10\n"
- "ld1 { v16.8h }, [x27], #0x10\n"
- "ld1 { v20.8h }, [x25], #0x10\n"
- "ld1 { v24.8h }, [x23], #0x10\n"
- "ld1 { v9.8h }, [x13], #0x10\n"
- "ld1 { v13.8h }, [x9], #0x10\n"
- "ld1 { v17.8h }, [x27], #0x10\n"
- "ld1 { v21.8h }, [x25], #0x10\n"
- "ld1 { v25.8h }, [x23], #0x10\n"
- "tbz x16, #3, 208f\n"
- "ld1 { v10.8h }, [x13], #0x10\n"
- "ld1 { v14.8h }, [x9], #0x10\n"
- "ld1 { v18.8h }, [x27], #0x10\n"
- "ld1 { v22.8h }, [x25], #0x10\n"
- "ld1 { v26.8h }, [x23], #0x10\n"
- "tbz x16, #2, 206f\n"
- "ldr d11, [x13], #0x8\n"
- "ldr d15, [x9], #0x8\n"
- "ldr d19, [x27], #0x8\n"
- "ldr d23, [x25], #0x8\n"
- "ldr d27, [x23], #0x8\n"
- "tbz x16, #1, 205f\n"
- "ld1 { v11.s }[2], [x13], #0x4\n"
- "ld1 { v15.s }[2], [x9], #0x4\n"
- "ld1 { v19.s }[2], [x27], #0x4\n"
- "ld1 { v23.s }[2], [x25], #0x4\n"
- "ld1 { v27.s }[2], [x23], #0x4\n"
+ "b 218f\n"
+ "199:" // Height 5: no bias
+ "tbz %x[flags], #0, 217f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x11, #0x20\n"
+ "add x24, x28, x19, LSL #1\n"
+ "add x23, x24, x19, LSL #1\n"
+ "add x22, x23, x19, LSL #1\n"
+ "add x21, x22, x19, LSL #1\n"
+ "bge 216f\n"
+ "tbz x11, #4, 207f\n"
+ "ld1 { v8.8h }, [x28], #0x10\n"
+ "ld1 { v12.8h }, [x24], #0x10\n"
+ "ld1 { v16.8h }, [x23], #0x10\n"
+ "ld1 { v20.8h }, [x22], #0x10\n"
+ "ld1 { v24.8h }, [x21], #0x10\n"
+ "ld1 { v9.8h }, [x28], #0x10\n"
+ "ld1 { v13.8h }, [x24], #0x10\n"
+ "ld1 { v17.8h }, [x23], #0x10\n"
+ "ld1 { v21.8h }, [x22], #0x10\n"
+ "ld1 { v25.8h }, [x21], #0x10\n"
+ "tbz x11, #3, 203f\n"
+ "ld1 { v10.8h }, [x28], #0x10\n"
+ "ld1 { v14.8h }, [x24], #0x10\n"
+ "ld1 { v18.8h }, [x23], #0x10\n"
+ "ld1 { v22.8h }, [x22], #0x10\n"
+ "ld1 { v26.8h }, [x21], #0x10\n"
+ "tbz x11, #2, 201f\n"
+ "ldr d11, [x28], #0x8\n"
+ "ldr d15, [x24], #0x8\n"
+ "ldr d19, [x23], #0x8\n"
+ "ldr d23, [x22], #0x8\n"
+ "ldr d27, [x21], #0x8\n"
+ "tbz x11, #1, 200f\n"
+ "ld1 { v11.s }[2], [x28], #0x4\n"
"mov x19, #0x3c\n"
- "tbz x16, #0, 220f\n"
- "ld1 { v11.h }[6], [x13]\n"
- "ld1 { v15.h }[6], [x9]\n"
- "ld1 { v19.h }[6], [x27]\n"
- "ld1 { v23.h }[6], [x25]\n"
- "ld1 { v27.h }[6], [x23]\n"
- "b 220f\n"
- "205:" // Height 5: Partial accumulate: partial_1_28
+ "ld1 { v15.s }[2], [x24], #0x4\n"
+ "ld1 { v19.s }[2], [x23], #0x4\n"
+ "ld1 { v23.s }[2], [x22], #0x4\n"
+ "ld1 { v27.s }[2], [x21], #0x4\n"
+ "tbz x11, #0, 215f\n"
+ "ld1 { v11.h }[6], [x28]\n"
+ "ld1 { v15.h }[6], [x24]\n"
+ "ld1 { v19.h }[6], [x23]\n"
+ "ld1 { v23.h }[6], [x22]\n"
+ "ld1 { v27.h }[6], [x21]\n"
+ "b 215f\n"
+ "200:" // Height 5: Partial accumulate: partial_1_28
"mov x19, #0x38\n"
- "tbz x16, #0, 220f\n"
- "ld1 { v11.h }[4], [x13]\n"
- "ld1 { v15.h }[4], [x9]\n"
- "ld1 { v19.h }[4], [x27]\n"
- "ld1 { v23.h }[4], [x25]\n"
- "ld1 { v27.h }[4], [x23]\n"
- "b 220f\n"
- "206:" // Height 5: Partial accumulate: partial_2_24
- "tbz x16, #1, 207f\n"
- "ldr s11, [x13], #0x4\n"
- "ldr s15, [x9], #0x4\n"
- "ldr s19, [x27], #0x4\n"
- "ldr s23, [x25], #0x4\n"
- "ldr s27, [x23], #0x4\n"
+ "tbz x11, #0, 215f\n"
+ "ld1 { v11.h }[4], [x28]\n"
+ "ld1 { v15.h }[4], [x24]\n"
+ "ld1 { v19.h }[4], [x23]\n"
+ "ld1 { v23.h }[4], [x22]\n"
+ "ld1 { v27.h }[4], [x21]\n"
+ "b 215f\n"
+ "201:" // Height 5: Partial accumulate: partial_2_24
+ "tbz x11, #1, 202f\n"
+ "ldr s11, [x28], #0x4\n"
+ "ldr s15, [x24], #0x4\n"
"mov x19, #0x34\n"
- "tbz x16, #0, 220f\n"
- "ld1 { v11.h }[2], [x13]\n"
- "ld1 { v15.h }[2], [x9]\n"
- "ld1 { v19.h }[2], [x27]\n"
- "ld1 { v23.h }[2], [x25]\n"
- "ld1 { v27.h }[2], [x23]\n"
- "b 220f\n"
- "207:" // Height 5: Partial accumulate: partial_1_24
+ "ldr s19, [x23], #0x4\n"
+ "ldr s23, [x22], #0x4\n"
+ "ldr s27, [x21], #0x4\n"
+ "tbz x11, #0, 215f\n"
+ "ld1 { v11.h }[2], [x28]\n"
+ "ld1 { v15.h }[2], [x24]\n"
+ "ld1 { v19.h }[2], [x23]\n"
+ "ld1 { v23.h }[2], [x22]\n"
+ "ld1 { v27.h }[2], [x21]\n"
+ "b 215f\n"
+ "202:" // Height 5: Partial accumulate: partial_1_24
"mov x19, #0x30\n"
- "tbz x16, #0, 220f\n"
- "ldr h11, [x13, #0x0]\n"
- "ldr h15, [x9, #0x0]\n"
- "ldr h19, [x27, #0x0]\n"
- "ldr h23, [x25, #0x0]\n"
- "ldr h27, [x23, #0x0]\n"
- "b 220f\n"
- "208:" // Height 5: Partial accumulate: partial_4_16
- "tbz x16, #2, 210f\n"
- "ldr d10, [x13], #0x8\n"
- "ldr d14, [x9], #0x8\n"
- "ldr d18, [x27], #0x8\n"
- "ldr d22, [x25], #0x8\n"
- "ldr d26, [x23], #0x8\n"
- "tbz x16, #1, 209f\n"
- "ld1 { v10.s }[2], [x13], #0x4\n"
- "ld1 { v14.s }[2], [x9], #0x4\n"
- "ld1 { v18.s }[2], [x27], #0x4\n"
- "ld1 { v22.s }[2], [x25], #0x4\n"
- "ld1 { v26.s }[2], [x23], #0x4\n"
+ "tbz x11, #0, 215f\n"
+ "ldr h11, [x28, #0x0]\n"
+ "ldr h15, [x24, #0x0]\n"
+ "ldr h19, [x23, #0x0]\n"
+ "ldr h23, [x22, #0x0]\n"
+ "ldr h27, [x21, #0x0]\n"
+ "b 215f\n"
+ "203:" // Height 5: Partial accumulate: partial_4_16
+ "tbz x11, #2, 205f\n"
+ "ldr d10, [x28], #0x8\n"
+ "ldr d14, [x24], #0x8\n"
+ "ldr d18, [x23], #0x8\n"
+ "ldr d22, [x22], #0x8\n"
+ "ldr d26, [x21], #0x8\n"
+ "tbz x11, #1, 204f\n"
+ "ld1 { v10.s }[2], [x28], #0x4\n"
"mov x19, #0x2c\n"
- "tbz x16, #0, 220f\n"
- "ld1 { v10.h }[6], [x13]\n"
- "ld1 { v14.h }[6], [x9]\n"
- "ld1 { v18.h }[6], [x27]\n"
- "ld1 { v22.h }[6], [x25]\n"
- "ld1 { v26.h }[6], [x23]\n"
- "b 220f\n"
- "209:" // Height 5: Partial accumulate: partial_1_20
+ "ld1 { v14.s }[2], [x24], #0x4\n"
+ "ld1 { v18.s }[2], [x23], #0x4\n"
+ "ld1 { v22.s }[2], [x22], #0x4\n"
+ "ld1 { v26.s }[2], [x21], #0x4\n"
+ "tbz x11, #0, 215f\n"
+ "ld1 { v10.h }[6], [x28]\n"
+ "ld1 { v14.h }[6], [x24]\n"
+ "ld1 { v18.h }[6], [x23]\n"
+ "ld1 { v22.h }[6], [x22]\n"
+ "ld1 { v26.h }[6], [x21]\n"
+ "b 215f\n"
+ "204:" // Height 5: Partial accumulate: partial_1_20
"mov x19, #0x28\n"
- "tbz x16, #0, 220f\n"
- "ld1 { v10.h }[4], [x13]\n"
- "ld1 { v14.h }[4], [x9]\n"
- "ld1 { v18.h }[4], [x27]\n"
- "ld1 { v22.h }[4], [x25]\n"
- "ld1 { v26.h }[4], [x23]\n"
- "b 220f\n"
- "210:" // Height 5: Partial accumulate: partial_2_16
- "tbz x16, #1, 211f\n"
- "ldr s10, [x13], #0x4\n"
- "ldr s14, [x9], #0x4\n"
- "ldr s18, [x27], #0x4\n"
- "ldr s22, [x25], #0x4\n"
- "ldr s26, [x23], #0x4\n"
+ "tbz x11, #0, 215f\n"
+ "ld1 { v10.h }[4], [x28]\n"
+ "ld1 { v14.h }[4], [x24]\n"
+ "ld1 { v18.h }[4], [x23]\n"
+ "ld1 { v22.h }[4], [x22]\n"
+ "ld1 { v26.h }[4], [x21]\n"
+ "b 215f\n"
+ "205:" // Height 5: Partial accumulate: partial_2_16
+ "tbz x11, #1, 206f\n"
+ "ldr s10, [x28], #0x4\n"
+ "ldr s14, [x24], #0x4\n"
"mov x19, #0x24\n"
- "tbz x16, #0, 220f\n"
- "ld1 { v10.h }[2], [x13]\n"
- "ld1 { v14.h }[2], [x9]\n"
- "ld1 { v18.h }[2], [x27]\n"
- "ld1 { v22.h }[2], [x25]\n"
- "ld1 { v26.h }[2], [x23]\n"
- "b 220f\n"
- "211:" // Height 5: Partial accumulate: partial_1_16
+ "ldr s18, [x23], #0x4\n"
+ "ldr s22, [x22], #0x4\n"
+ "ldr s26, [x21], #0x4\n"
+ "tbz x11, #0, 215f\n"
+ "ld1 { v10.h }[2], [x28]\n"
+ "ld1 { v14.h }[2], [x24]\n"
+ "ld1 { v18.h }[2], [x23]\n"
+ "ld1 { v22.h }[2], [x22]\n"
+ "ld1 { v26.h }[2], [x21]\n"
+ "b 215f\n"
+ "206:" // Height 5: Partial accumulate: partial_1_16
"mov x19, #0x20\n"
- "tbz x16, #0, 220f\n"
- "ldr h10, [x13, #0x0]\n"
- "ldr h14, [x9, #0x0]\n"
- "ldr h18, [x27, #0x0]\n"
- "ldr h22, [x25, #0x0]\n"
- "ldr h26, [x23, #0x0]\n"
- "b 220f\n"
- "212:" // Height 5: Partial accumulate: partial_8_0
- "tbz x16, #3, 216f\n"
- "ld1 { v8.8h }, [x13], #0x10\n"
- "ld1 { v12.8h }, [x9], #0x10\n"
- "ld1 { v16.8h }, [x27], #0x10\n"
- "ld1 { v20.8h }, [x25], #0x10\n"
- "ld1 { v24.8h }, [x23], #0x10\n"
- "tbz x16, #2, 214f\n"
- "ldr d9, [x13], #0x8\n"
- "ldr d13, [x9], #0x8\n"
- "ldr d17, [x27], #0x8\n"
- "ldr d21, [x25], #0x8\n"
- "ldr d25, [x23], #0x8\n"
- "tbz x16, #1, 213f\n"
- "ld1 { v9.s }[2], [x13], #0x4\n"
- "ld1 { v13.s }[2], [x9], #0x4\n"
- "ld1 { v17.s }[2], [x27], #0x4\n"
- "ld1 { v21.s }[2], [x25], #0x4\n"
- "ld1 { v25.s }[2], [x23], #0x4\n"
+ "tbz x11, #0, 215f\n"
+ "ldr h10, [x28, #0x0]\n"
+ "ldr h14, [x24, #0x0]\n"
+ "ldr h18, [x23, #0x0]\n"
+ "ldr h22, [x22, #0x0]\n"
+ "ldr h26, [x21, #0x0]\n"
+ "b 215f\n"
+ "207:" // Height 5: Partial accumulate: partial_8_0
+ "tbz x11, #3, 211f\n"
+ "ld1 { v8.8h }, [x28], #0x10\n"
+ "ld1 { v12.8h }, [x24], #0x10\n"
+ "ld1 { v16.8h }, [x23], #0x10\n"
+ "ld1 { v20.8h }, [x22], #0x10\n"
+ "ld1 { v24.8h }, [x21], #0x10\n"
+ "tbz x11, #2, 209f\n"
+ "ldr d9, [x28], #0x8\n"
+ "ldr d13, [x24], #0x8\n"
+ "ldr d17, [x23], #0x8\n"
+ "ldr d21, [x22], #0x8\n"
+ "ldr d25, [x21], #0x8\n"
+ "tbz x11, #1, 208f\n"
+ "ld1 { v9.s }[2], [x28], #0x4\n"
"mov x19, #0x1c\n"
- "tbz x16, #0, 220f\n"
- "ld1 { v9.h }[6], [x13]\n"
- "ld1 { v13.h }[6], [x9]\n"
- "ld1 { v17.h }[6], [x27]\n"
- "ld1 { v21.h }[6], [x25]\n"
- "ld1 { v25.h }[6], [x23]\n"
- "b 220f\n"
- "213:" // Height 5: Partial accumulate: partial_1_12
+ "ld1 { v13.s }[2], [x24], #0x4\n"
+ "ld1 { v17.s }[2], [x23], #0x4\n"
+ "ld1 { v21.s }[2], [x22], #0x4\n"
+ "ld1 { v25.s }[2], [x21], #0x4\n"
+ "tbz x11, #0, 215f\n"
+ "ld1 { v9.h }[6], [x28]\n"
+ "ld1 { v13.h }[6], [x24]\n"
+ "ld1 { v17.h }[6], [x23]\n"
+ "ld1 { v21.h }[6], [x22]\n"
+ "ld1 { v25.h }[6], [x21]\n"
+ "b 215f\n"
+ "208:" // Height 5: Partial accumulate: partial_1_12
"mov x19, #0x18\n"
- "tbz x16, #0, 220f\n"
- "ld1 { v9.h }[4], [x13]\n"
- "ld1 { v13.h }[4], [x9]\n"
- "ld1 { v17.h }[4], [x27]\n"
- "ld1 { v21.h }[4], [x25]\n"
- "ld1 { v25.h }[4], [x23]\n"
- "b 220f\n"
- "214:" // Height 5: Partial accumulate: partial_2_8
- "tbz x16, #1, 215f\n"
- "ldr s9, [x13], #0x4\n"
- "ldr s13, [x9], #0x4\n"
- "ldr s17, [x27], #0x4\n"
- "ldr s21, [x25], #0x4\n"
- "ldr s25, [x23], #0x4\n"
+ "tbz x11, #0, 215f\n"
+ "ld1 { v9.h }[4], [x28]\n"
+ "ld1 { v13.h }[4], [x24]\n"
+ "ld1 { v17.h }[4], [x23]\n"
+ "ld1 { v21.h }[4], [x22]\n"
+ "ld1 { v25.h }[4], [x21]\n"
+ "b 215f\n"
+ "209:" // Height 5: Partial accumulate: partial_2_8
+ "tbz x11, #1, 210f\n"
+ "ldr s9, [x28], #0x4\n"
+ "ldr s13, [x24], #0x4\n"
"mov x19, #0x14\n"
- "tbz x16, #0, 220f\n"
- "ld1 { v9.h }[2], [x13]\n"
- "ld1 { v13.h }[2], [x9]\n"
- "ld1 { v17.h }[2], [x27]\n"
- "ld1 { v21.h }[2], [x25]\n"
- "ld1 { v25.h }[2], [x23]\n"
- "b 220f\n"
- "215:" // Height 5: Partial accumulate: partial_1_8
+ "ldr s17, [x23], #0x4\n"
+ "ldr s21, [x22], #0x4\n"
+ "ldr s25, [x21], #0x4\n"
+ "tbz x11, #0, 215f\n"
+ "ld1 { v9.h }[2], [x28]\n"
+ "ld1 { v13.h }[2], [x24]\n"
+ "ld1 { v17.h }[2], [x23]\n"
+ "ld1 { v21.h }[2], [x22]\n"
+ "ld1 { v25.h }[2], [x21]\n"
+ "b 215f\n"
+ "210:" // Height 5: Partial accumulate: partial_1_8
"mov x19, #0x10\n"
- "tbz x16, #0, 220f\n"
- "ldr h9, [x13, #0x0]\n"
- "ldr h13, [x9, #0x0]\n"
- "ldr h17, [x27, #0x0]\n"
- "ldr h21, [x25, #0x0]\n"
- "ldr h25, [x23, #0x0]\n"
- "b 220f\n"
- "216:" // Height 5: Partial accumulate: partial_4_0
- "tbz x16, #2, 218f\n"
- "ldr d8, [x13], #0x8\n"
- "ldr d12, [x9], #0x8\n"
- "ldr d16, [x27], #0x8\n"
- "ldr d20, [x25], #0x8\n"
- "ldr d24, [x23], #0x8\n"
- "tbz x16, #1, 217f\n"
- "ld1 { v8.s }[2], [x13], #0x4\n"
- "ld1 { v12.s }[2], [x9], #0x4\n"
- "ld1 { v16.s }[2], [x27], #0x4\n"
- "ld1 { v20.s }[2], [x25], #0x4\n"
- "ld1 { v24.s }[2], [x23], #0x4\n"
+ "tbz x11, #0, 215f\n"
+ "ldr h9, [x28, #0x0]\n"
+ "ldr h13, [x24, #0x0]\n"
+ "ldr h17, [x23, #0x0]\n"
+ "ldr h21, [x22, #0x0]\n"
+ "ldr h25, [x21, #0x0]\n"
+ "b 215f\n"
+ "211:" // Height 5: Partial accumulate: partial_4_0
+ "tbz x11, #2, 213f\n"
+ "ldr d8, [x28], #0x8\n"
+ "ldr d12, [x24], #0x8\n"
+ "ldr d16, [x23], #0x8\n"
+ "ldr d20, [x22], #0x8\n"
+ "ldr d24, [x21], #0x8\n"
+ "tbz x11, #1, 212f\n"
+ "ld1 { v8.s }[2], [x28], #0x4\n"
"mov x19, #0xc\n"
- "tbz x16, #0, 220f\n"
- "ld1 { v8.h }[6], [x13]\n"
- "ld1 { v12.h }[6], [x9]\n"
- "ld1 { v16.h }[6], [x27]\n"
- "ld1 { v20.h }[6], [x25]\n"
- "ld1 { v24.h }[6], [x23]\n"
- "b 220f\n"
- "217:" // Height 5: Partial accumulate: partial_1_4
+ "ld1 { v12.s }[2], [x24], #0x4\n"
+ "ld1 { v16.s }[2], [x23], #0x4\n"
+ "ld1 { v20.s }[2], [x22], #0x4\n"
+ "ld1 { v24.s }[2], [x21], #0x4\n"
+ "tbz x11, #0, 215f\n"
+ "ld1 { v8.h }[6], [x28]\n"
+ "ld1 { v12.h }[6], [x24]\n"
+ "ld1 { v16.h }[6], [x23]\n"
+ "ld1 { v20.h }[6], [x22]\n"
+ "ld1 { v24.h }[6], [x21]\n"
+ "b 215f\n"
+ "212:" // Height 5: Partial accumulate: partial_1_4
"mov x19, #0x8\n"
- "tbz x16, #0, 220f\n"
- "ld1 { v8.h }[4], [x13]\n"
- "ld1 { v12.h }[4], [x9]\n"
- "ld1 { v16.h }[4], [x27]\n"
- "ld1 { v20.h }[4], [x25]\n"
- "ld1 { v24.h }[4], [x23]\n"
- "b 220f\n"
- "218:" // Height 5: Partial accumulate: partial_2_0
- "tbz x16, #1, 219f\n"
- "ldr s8, [x13], #0x4\n"
- "ldr s12, [x9], #0x4\n"
- "ldr s16, [x27], #0x4\n"
- "ldr s20, [x25], #0x4\n"
- "ldr s24, [x23], #0x4\n"
+ "tbz x11, #0, 215f\n"
+ "ld1 { v8.h }[4], [x28]\n"
+ "ld1 { v12.h }[4], [x24]\n"
+ "ld1 { v16.h }[4], [x23]\n"
+ "ld1 { v20.h }[4], [x22]\n"
+ "ld1 { v24.h }[4], [x21]\n"
+ "b 215f\n"
+ "213:" // Height 5: Partial accumulate: partial_2_0
+ "tbz x11, #1, 214f\n"
+ "ldr s8, [x28], #0x4\n"
+ "ldr s12, [x24], #0x4\n"
"mov x19, #0x4\n"
- "tbz x16, #0, 220f\n"
- "ld1 { v8.h }[2], [x13]\n"
- "ld1 { v12.h }[2], [x9]\n"
- "ld1 { v16.h }[2], [x27]\n"
- "ld1 { v20.h }[2], [x25]\n"
- "ld1 { v24.h }[2], [x23]\n"
- "b 220f\n"
- "219:" // Height 5: Partial accumulate: partial_1_0
+ "ldr s16, [x23], #0x4\n"
+ "ldr s20, [x22], #0x4\n"
+ "ldr s24, [x21], #0x4\n"
+ "tbz x11, #0, 215f\n"
+ "ld1 { v8.h }[2], [x28]\n"
+ "ld1 { v12.h }[2], [x24]\n"
+ "ld1 { v16.h }[2], [x23]\n"
+ "ld1 { v20.h }[2], [x22]\n"
+ "ld1 { v24.h }[2], [x21]\n"
+ "b 215f\n"
+ "214:" // Height 5: Partial accumulate: partial_1_0
+ "ldr h8, [x28, #0x0]\n"
"mov x19, #0x0\n"
- "ldr h8, [x13, #0x0]\n"
- "ldr h12, [x9, #0x0]\n"
- "ldr h16, [x27, #0x0]\n"
- "ldr h20, [x25, #0x0]\n"
- "ldr h24, [x23, #0x0]\n"
- "220:" // Height 5: Partial accumulate: Done
- "sub x13, x13, x19\n"
- "sub x9, x9, x19\n"
- "sub x27, x27, x19\n"
- "sub x25, x25, x19\n"
- "sub x23, x23, x19\n"
- "b 223f\n"
- "221:" // Height 5: full accumulate
- "ldr q8, [x13, #0x0]\n"
- "ldr q9, [x13, #0x10]\n"
- "ldr q10, [x13, #0x20]\n"
- "ldr q11, [x13, #0x30]\n"
- "ldr q12, [x9, #0x0]\n"
- "ldr q13, [x9, #0x10]\n"
- "ldr q14, [x9, #0x20]\n"
- "ldr q15, [x9, #0x30]\n"
- "ldr q16, [x27, #0x0]\n"
- "ldr q17, [x27, #0x10]\n"
- "ldr q18, [x27, #0x20]\n"
- "ldr q19, [x27, #0x30]\n"
- "ldr q20, [x25, #0x0]\n"
- "ldr q21, [x25, #0x10]\n"
- "ldr q22, [x25, #0x20]\n"
- "ldr q23, [x25, #0x30]\n"
- "ldr q24, [x23, #0x0]\n"
- "ldr q25, [x23, #0x10]\n"
- "ldr q26, [x23, #0x20]\n"
- "ldr q27, [x23, #0x30]\n"
- "b 223f\n"
- "222:" // Height 5: no accumulate
+ "ldr h12, [x24, #0x0]\n"
+ "ldr h16, [x23, #0x0]\n"
+ "ldr h20, [x22, #0x0]\n"
+ "ldr h24, [x21, #0x0]\n"
+ "215:" // Height 5: Partial accumulate: Done
+ "sub x28, x28, x19\n"
+ "b 218f\n"
+ "216:" // Height 5: full accumulate
+ "ldr q8, [x28, #0x0]\n"
+ "ldr q9, [x28, #0x10]\n"
+ "ldr q10, [x28, #0x20]\n"
+ "ldr q11, [x28, #0x30]\n"
+ "ldr q12, [x24, #0x0]\n"
+ "ldr q13, [x24, #0x10]\n"
+ "ldr q14, [x24, #0x20]\n"
+ "ldr q15, [x24, #0x30]\n"
+ "ldr q16, [x23, #0x0]\n"
+ "ldr q17, [x23, #0x10]\n"
+ "ldr q18, [x23, #0x20]\n"
+ "ldr q19, [x23, #0x30]\n"
+ "ldr q20, [x22, #0x0]\n"
+ "ldr q21, [x22, #0x10]\n"
+ "ldr q22, [x22, #0x20]\n"
+ "ldr q23, [x22, #0x30]\n"
+ "ldr q24, [x21, #0x0]\n"
+ "ldr q25, [x21, #0x10]\n"
+ "ldr q26, [x21, #0x20]\n"
+ "ldr q27, [x21, #0x30]\n"
+ "b 218f\n"
+ "217:" // Height 5: no accumulate
"movi v8.16b, #0x0\n"
"movi v9.16b, #0x0\n"
"movi v10.16b, #0x0\n"
@@ -3267,452 +3215,452 @@ void a64_hybrid_fp16_mla_6x32 (
"movi v25.16b, #0x0\n"
"movi v26.16b, #0x0\n"
"movi v27.16b, #0x0\n"
- "223:" // Height 5: setup done
- "mov x12, #0x0\n"
- "224:" // Height 5: String loop
+ "218:" // Height 5: setup done
+ "mov x27, #0x0\n"
+ "219:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w11, [x20, x12, LSL #0x2]\n"
- "tbz %x[flags], #3, 225f\n"
- "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 220f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x10, [x20, #0x0]\n"
- "ldr x28, [x20, #0x8]\n"
- "ldr x26, [x20, #0x10]\n"
- "ldr x24, [x20, #0x18]\n"
- "ldr x22, [x20, #0x20]\n"
- "cbnz x12, 226f\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "ldr x22, [x20, #0x18]\n"
+ "ldr x21, [x20, #0x20]\n"
+ "cbnz x27, 221f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x10, x10, x19, LSL #1\n"
- "add x28, x28, x19, LSL #1\n"
- "add x26, x26, x19, LSL #1\n"
+ "add x25, x25, x19, LSL #1\n"
"add x24, x24, x19, LSL #1\n"
+ "add x23, x23, x19, LSL #1\n"
"add x22, x22, x19, LSL #1\n"
- "b 226f\n"
- "225:" // Height 5: setup direct input
- "mov x10, %x[input_ptr]\n"
- "add x28, x10, x19, LSL #1\n"
- "add x26, x28, x19, LSL #1\n"
- "add x24, x26, x19, LSL #1\n"
- "add x22, x24, x19, LSL #1\n"
- "226:" // Height 5: input setup done
- "cmp x11, #0x8\n"
- "blt 229f\n"
- "cmp x11, #0x10\n"
- "blt 228f\n"
- "227:" // Height 5: Multiply loop: Main loop head
- "ldr q0, [x10, #0x0]\n"
- "ldr q1, [x28, #0x0]\n"
- "ldr q2, [x26, #0x0]\n"
- "ldr q3, [x24, #0x0]\n"
- "ldr q4, [x22, #0x0]\n"
- "ldr q6, [x15, #0x0]\n"
+ "add x21, x21, x19, LSL #1\n"
+ "b 221f\n"
+ "220:" // Height 5: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19, LSL #1\n"
+ "add x23, x24, x19, LSL #1\n"
+ "add x22, x23, x19, LSL #1\n"
+ "add x21, x22, x19, LSL #1\n"
+ "221:" // Height 5: input setup done
+ "cmp x26, #0x8\n"
+ "blt 224f\n"
+ "ldr q0, [x25, #0x0]\n"
+ "ldr q1, [x24, #0x0]\n"
+ "cmp x26, #0x10\n"
+ "ldr q2, [x23, #0x0]\n"
+ "ldr q3, [x22, #0x0]\n"
+ "ldr q4, [x21, #0x0]\n"
+ "ldr q6, [x10, #0x0]\n"
+ "blt 223f\n"
+ "222:" // Height 5: Multiply loop: Main loop head
"fmla v8.8h, v6.8h, v0.h[0]\n"
- "ldr q7, [x15, #0x10]\n"
+ "ldr q7, [x10, #0x10]\n"
+ "add x25, x25, #0x10\n"
"fmla v12.8h, v6.8h, v1.h[0]\n"
- "add x10, x10, #0x10\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x24, x24, #0x10\n"
"fmla v16.8h, v6.8h, v2.h[0]\n"
- "add x28, x28, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "add x23, x23, #0x10\n"
"fmla v20.8h, v6.8h, v3.h[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "add x26, x26, #0x10\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "add x22, x22, #0x10\n"
"fmla v24.8h, v6.8h, v4.h[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "ldr q6, [x15, #0x20]\n"
+ "ldr q6, [x10, #0x20]\n"
+ "add x21, x21, #0x10\n"
"fmla v9.8h, v7.8h, v0.h[0]\n"
- "add x24, x24, #0x10\n"
- "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "sub x26, x26, #0x8\n"
"fmla v13.8h, v7.8h, v1.h[0]\n"
- "add x22, x22, #0x10\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
+ "cmp x26, #0x10\n"
"fmla v17.8h, v7.8h, v2.h[0]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "sub x11, x11, #0x8\n"
"fmla v21.8h, v7.8h, v3.h[0]\n"
- "cmp x11, #0x10\n"
"fmla v25.8h, v7.8h, v4.h[0]\n"
- "ldr q7, [x15, #0x30]\n"
+ "ldr q7, [x10, #0x30]\n"
"fmla v10.8h, v6.8h, v0.h[0]\n"
"fmla v14.8h, v6.8h, v1.h[0]\n"
"fmla v18.8h, v6.8h, v2.h[0]\n"
"fmla v22.8h, v6.8h, v3.h[0]\n"
"fmla v26.8h, v6.8h, v4.h[0]\n"
- "ldr q6, [x15, #0x40]\n"
+ "ldr q6, [x10, #0x40]\n"
"fmla v11.8h, v7.8h, v0.h[0]\n"
"fmla v15.8h, v7.8h, v1.h[0]\n"
"fmla v19.8h, v7.8h, v2.h[0]\n"
"fmla v23.8h, v7.8h, v3.h[0]\n"
"fmla v27.8h, v7.8h, v4.h[0]\n"
- "ldr q7, [x15, #0x50]\n"
+ "ldr q7, [x10, #0x50]\n"
"fmla v8.8h, v6.8h, v0.h[1]\n"
"fmla v12.8h, v6.8h, v1.h[1]\n"
"fmla v16.8h, v6.8h, v2.h[1]\n"
"fmla v20.8h, v6.8h, v3.h[1]\n"
"fmla v24.8h, v6.8h, v4.h[1]\n"
- "ldr q6, [x15, #0x60]\n"
+ "ldr q6, [x10, #0x60]\n"
"fmla v9.8h, v7.8h, v0.h[1]\n"
"fmla v13.8h, v7.8h, v1.h[1]\n"
"fmla v17.8h, v7.8h, v2.h[1]\n"
"fmla v21.8h, v7.8h, v3.h[1]\n"
"fmla v25.8h, v7.8h, v4.h[1]\n"
- "ldr q7, [x15, #0x70]\n"
+ "ldr q7, [x10, #0x70]\n"
"fmla v10.8h, v6.8h, v0.h[1]\n"
"fmla v14.8h, v6.8h, v1.h[1]\n"
"fmla v18.8h, v6.8h, v2.h[1]\n"
"fmla v22.8h, v6.8h, v3.h[1]\n"
"fmla v26.8h, v6.8h, v4.h[1]\n"
- "ldr q6, [x15, #0x80]\n"
+ "ldr q6, [x10, #0x80]\n"
"fmla v11.8h, v7.8h, v0.h[1]\n"
"fmla v15.8h, v7.8h, v1.h[1]\n"
"fmla v19.8h, v7.8h, v2.h[1]\n"
"fmla v23.8h, v7.8h, v3.h[1]\n"
"fmla v27.8h, v7.8h, v4.h[1]\n"
- "ldr q7, [x15, #0x90]\n"
+ "ldr q7, [x10, #0x90]\n"
"fmla v8.8h, v6.8h, v0.h[2]\n"
"fmla v12.8h, v6.8h, v1.h[2]\n"
"fmla v16.8h, v6.8h, v2.h[2]\n"
"fmla v20.8h, v6.8h, v3.h[2]\n"
"fmla v24.8h, v6.8h, v4.h[2]\n"
- "ldr q6, [x15, #0xa0]\n"
+ "ldr q6, [x10, #0xa0]\n"
"fmla v9.8h, v7.8h, v0.h[2]\n"
"fmla v13.8h, v7.8h, v1.h[2]\n"
"fmla v17.8h, v7.8h, v2.h[2]\n"
"fmla v21.8h, v7.8h, v3.h[2]\n"
"fmla v25.8h, v7.8h, v4.h[2]\n"
- "ldr q7, [x15, #0xb0]\n"
+ "ldr q7, [x10, #0xb0]\n"
"fmla v10.8h, v6.8h, v0.h[2]\n"
"fmla v14.8h, v6.8h, v1.h[2]\n"
"fmla v18.8h, v6.8h, v2.h[2]\n"
"fmla v22.8h, v6.8h, v3.h[2]\n"
"fmla v26.8h, v6.8h, v4.h[2]\n"
- "ldr q6, [x15, #0xc0]\n"
+ "ldr q6, [x10, #0xc0]\n"
"fmla v11.8h, v7.8h, v0.h[2]\n"
"fmla v15.8h, v7.8h, v1.h[2]\n"
"fmla v19.8h, v7.8h, v2.h[2]\n"
"fmla v23.8h, v7.8h, v3.h[2]\n"
"fmla v27.8h, v7.8h, v4.h[2]\n"
- "ldr q7, [x15, #0xd0]\n"
+ "ldr q7, [x10, #0xd0]\n"
"fmla v8.8h, v6.8h, v0.h[3]\n"
"fmla v12.8h, v6.8h, v1.h[3]\n"
"fmla v16.8h, v6.8h, v2.h[3]\n"
"fmla v20.8h, v6.8h, v3.h[3]\n"
"fmla v24.8h, v6.8h, v4.h[3]\n"
- "ldr q6, [x15, #0xe0]\n"
+ "ldr q6, [x10, #0xe0]\n"
"fmla v9.8h, v7.8h, v0.h[3]\n"
"fmla v13.8h, v7.8h, v1.h[3]\n"
"fmla v17.8h, v7.8h, v2.h[3]\n"
"fmla v21.8h, v7.8h, v3.h[3]\n"
"fmla v25.8h, v7.8h, v4.h[3]\n"
- "ldr q7, [x15, #0xf0]\n"
+ "ldr q7, [x10, #0xf0]\n"
"fmla v10.8h, v6.8h, v0.h[3]\n"
"fmla v14.8h, v6.8h, v1.h[3]\n"
"fmla v18.8h, v6.8h, v2.h[3]\n"
"fmla v22.8h, v6.8h, v3.h[3]\n"
"fmla v26.8h, v6.8h, v4.h[3]\n"
- "ldr q6, [x15, #0x100]\n"
+ "ldr q6, [x10, #0x100]\n"
"fmla v11.8h, v7.8h, v0.h[3]\n"
"fmla v15.8h, v7.8h, v1.h[3]\n"
"fmla v19.8h, v7.8h, v2.h[3]\n"
"fmla v23.8h, v7.8h, v3.h[3]\n"
"fmla v27.8h, v7.8h, v4.h[3]\n"
- "ldr q7, [x15, #0x110]\n"
+ "ldr q7, [x10, #0x110]\n"
"fmla v8.8h, v6.8h, v0.h[4]\n"
"fmla v12.8h, v6.8h, v1.h[4]\n"
"fmla v16.8h, v6.8h, v2.h[4]\n"
"fmla v20.8h, v6.8h, v3.h[4]\n"
"fmla v24.8h, v6.8h, v4.h[4]\n"
- "ldr q6, [x15, #0x120]\n"
+ "ldr q6, [x10, #0x120]\n"
"fmla v9.8h, v7.8h, v0.h[4]\n"
"fmla v13.8h, v7.8h, v1.h[4]\n"
"fmla v17.8h, v7.8h, v2.h[4]\n"
"fmla v21.8h, v7.8h, v3.h[4]\n"
"fmla v25.8h, v7.8h, v4.h[4]\n"
- "ldr q7, [x15, #0x130]\n"
+ "ldr q7, [x10, #0x130]\n"
"fmla v10.8h, v6.8h, v0.h[4]\n"
"fmla v14.8h, v6.8h, v1.h[4]\n"
"fmla v18.8h, v6.8h, v2.h[4]\n"
"fmla v22.8h, v6.8h, v3.h[4]\n"
"fmla v26.8h, v6.8h, v4.h[4]\n"
- "ldr q6, [x15, #0x140]\n"
+ "ldr q6, [x10, #0x140]\n"
"fmla v11.8h, v7.8h, v0.h[4]\n"
"fmla v15.8h, v7.8h, v1.h[4]\n"
"fmla v19.8h, v7.8h, v2.h[4]\n"
"fmla v23.8h, v7.8h, v3.h[4]\n"
"fmla v27.8h, v7.8h, v4.h[4]\n"
- "ldr q7, [x15, #0x150]\n"
+ "ldr q7, [x10, #0x150]\n"
"fmla v8.8h, v6.8h, v0.h[5]\n"
"fmla v12.8h, v6.8h, v1.h[5]\n"
"fmla v16.8h, v6.8h, v2.h[5]\n"
"fmla v20.8h, v6.8h, v3.h[5]\n"
"fmla v24.8h, v6.8h, v4.h[5]\n"
- "ldr q6, [x15, #0x160]\n"
+ "ldr q6, [x10, #0x160]\n"
"fmla v9.8h, v7.8h, v0.h[5]\n"
"fmla v13.8h, v7.8h, v1.h[5]\n"
"fmla v17.8h, v7.8h, v2.h[5]\n"
"fmla v21.8h, v7.8h, v3.h[5]\n"
"fmla v25.8h, v7.8h, v4.h[5]\n"
- "ldr q7, [x15, #0x170]\n"
+ "ldr q7, [x10, #0x170]\n"
"fmla v10.8h, v6.8h, v0.h[5]\n"
"fmla v14.8h, v6.8h, v1.h[5]\n"
"fmla v18.8h, v6.8h, v2.h[5]\n"
"fmla v22.8h, v6.8h, v3.h[5]\n"
"fmla v26.8h, v6.8h, v4.h[5]\n"
- "ldr q6, [x15, #0x180]\n"
+ "ldr q6, [x10, #0x180]\n"
"fmla v11.8h, v7.8h, v0.h[5]\n"
"fmla v15.8h, v7.8h, v1.h[5]\n"
"fmla v19.8h, v7.8h, v2.h[5]\n"
"fmla v23.8h, v7.8h, v3.h[5]\n"
"fmla v27.8h, v7.8h, v4.h[5]\n"
- "ldr q7, [x15, #0x190]\n"
+ "ldr q7, [x10, #0x190]\n"
"fmla v8.8h, v6.8h, v0.h[6]\n"
"fmla v12.8h, v6.8h, v1.h[6]\n"
"fmla v16.8h, v6.8h, v2.h[6]\n"
"fmla v20.8h, v6.8h, v3.h[6]\n"
"fmla v24.8h, v6.8h, v4.h[6]\n"
- "ldr q6, [x15, #0x1a0]\n"
+ "ldr q6, [x10, #0x1a0]\n"
"fmla v9.8h, v7.8h, v0.h[6]\n"
"fmla v13.8h, v7.8h, v1.h[6]\n"
"fmla v17.8h, v7.8h, v2.h[6]\n"
"fmla v21.8h, v7.8h, v3.h[6]\n"
"fmla v25.8h, v7.8h, v4.h[6]\n"
- "ldr q7, [x15, #0x1b0]\n"
+ "ldr q7, [x10, #0x1b0]\n"
"fmla v10.8h, v6.8h, v0.h[6]\n"
"fmla v14.8h, v6.8h, v1.h[6]\n"
"fmla v18.8h, v6.8h, v2.h[6]\n"
"fmla v22.8h, v6.8h, v3.h[6]\n"
"fmla v26.8h, v6.8h, v4.h[6]\n"
- "ldr q6, [x15, #0x1c0]\n"
+ "ldr q6, [x10, #0x1c0]\n"
"fmla v11.8h, v7.8h, v0.h[6]\n"
"fmla v15.8h, v7.8h, v1.h[6]\n"
"fmla v19.8h, v7.8h, v2.h[6]\n"
"fmla v23.8h, v7.8h, v3.h[6]\n"
"fmla v27.8h, v7.8h, v4.h[6]\n"
- "ldr q7, [x15, #0x1d0]\n"
+ "ldr q7, [x10, #0x1d0]\n"
"fmla v8.8h, v6.8h, v0.h[7]\n"
"fmla v12.8h, v6.8h, v1.h[7]\n"
"fmla v16.8h, v6.8h, v2.h[7]\n"
"fmla v20.8h, v6.8h, v3.h[7]\n"
"fmla v24.8h, v6.8h, v4.h[7]\n"
- "ldr q6, [x15, #0x1e0]\n"
+ "ldr q6, [x10, #0x1e0]\n"
"fmla v9.8h, v7.8h, v0.h[7]\n"
"fmla v13.8h, v7.8h, v1.h[7]\n"
"fmla v17.8h, v7.8h, v2.h[7]\n"
"fmla v21.8h, v7.8h, v3.h[7]\n"
"fmla v25.8h, v7.8h, v4.h[7]\n"
- "ldr q7, [x15, #0x1f0]\n"
+ "ldr q7, [x10, #0x1f0]\n"
+ "add x10, x10, #0x200\n"
"fmla v10.8h, v6.8h, v0.h[7]\n"
- "add x15, x15, #0x200\n"
"fmla v14.8h, v6.8h, v1.h[7]\n"
"fmla v18.8h, v6.8h, v2.h[7]\n"
"fmla v22.8h, v6.8h, v3.h[7]\n"
"fmla v26.8h, v6.8h, v4.h[7]\n"
+ "ldr q6, [x10, #0x0]\n"
"fmla v11.8h, v7.8h, v0.h[7]\n"
+ "ldr q0, [x25, #0x0]\n"
"fmla v15.8h, v7.8h, v1.h[7]\n"
+ "ldr q1, [x24, #0x0]\n"
"fmla v19.8h, v7.8h, v2.h[7]\n"
+ "ldr q2, [x23, #0x0]\n"
"fmla v23.8h, v7.8h, v3.h[7]\n"
+ "ldr q3, [x22, #0x0]\n"
"fmla v27.8h, v7.8h, v4.h[7]\n"
- "bge 227b\n"
- "228:" // Height 5: Multiply loop: Single iteration only
- "sub x11, x11, #0x8\n"
- "ldr q0, [x10, #0x0]\n"
- "ldr q1, [x28, #0x0]\n"
- "ldr q2, [x26, #0x0]\n"
- "ldr q3, [x24, #0x0]\n"
- "ldr q4, [x22, #0x0]\n"
- "ldr q6, [x15, #0x0]\n"
+ "ldr q4, [x21, #0x0]\n"
+ "bge 222b\n"
+ "223:" // Height 5: Multiply loop: Single iteration only
"fmla v8.8h, v6.8h, v0.h[0]\n"
- "ldr q7, [x15, #0x10]\n"
+ "ldr q7, [x10, #0x10]\n"
+ "sub x26, x26, #0x8\n"
"fmla v12.8h, v6.8h, v1.h[0]\n"
- "add x10, x10, #0x10\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "add x25, x25, #0x10\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
"fmla v16.8h, v6.8h, v2.h[0]\n"
- "add x28, x28, #0x10\n"
+ "add x24, x24, #0x10\n"
"fmla v20.8h, v6.8h, v3.h[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "add x26, x26, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "add x23, x23, #0x10\n"
"fmla v24.8h, v6.8h, v4.h[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "ldr q6, [x15, #0x20]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "add x22, x22, #0x10\n"
"fmla v9.8h, v7.8h, v0.h[0]\n"
- "add x24, x24, #0x10\n"
- "prfm pldl1keep, [x24, #0x80]\n"
+ "ldr q6, [x10, #0x20]\n"
+ "add x21, x21, #0x10\n"
"fmla v13.8h, v7.8h, v1.h[0]\n"
- "add x22, x22, #0x10\n"
- "fmla v17.8h, v7.8h, v2.h[0]\n"
"prfm pldl1keep, [x22, #0x80]\n"
+ "fmla v17.8h, v7.8h, v2.h[0]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
"fmla v21.8h, v7.8h, v3.h[0]\n"
"fmla v25.8h, v7.8h, v4.h[0]\n"
- "ldr q7, [x15, #0x30]\n"
+ "ldr q7, [x10, #0x30]\n"
"fmla v10.8h, v6.8h, v0.h[0]\n"
"fmla v14.8h, v6.8h, v1.h[0]\n"
"fmla v18.8h, v6.8h, v2.h[0]\n"
"fmla v22.8h, v6.8h, v3.h[0]\n"
"fmla v26.8h, v6.8h, v4.h[0]\n"
- "ldr q6, [x15, #0x40]\n"
+ "ldr q6, [x10, #0x40]\n"
"fmla v11.8h, v7.8h, v0.h[0]\n"
"fmla v15.8h, v7.8h, v1.h[0]\n"
"fmla v19.8h, v7.8h, v2.h[0]\n"
"fmla v23.8h, v7.8h, v3.h[0]\n"
"fmla v27.8h, v7.8h, v4.h[0]\n"
- "ldr q7, [x15, #0x50]\n"
+ "ldr q7, [x10, #0x50]\n"
"fmla v8.8h, v6.8h, v0.h[1]\n"
"fmla v12.8h, v6.8h, v1.h[1]\n"
"fmla v16.8h, v6.8h, v2.h[1]\n"
"fmla v20.8h, v6.8h, v3.h[1]\n"
"fmla v24.8h, v6.8h, v4.h[1]\n"
- "ldr q6, [x15, #0x60]\n"
+ "ldr q6, [x10, #0x60]\n"
"fmla v9.8h, v7.8h, v0.h[1]\n"
"fmla v13.8h, v7.8h, v1.h[1]\n"
"fmla v17.8h, v7.8h, v2.h[1]\n"
"fmla v21.8h, v7.8h, v3.h[1]\n"
"fmla v25.8h, v7.8h, v4.h[1]\n"
- "ldr q7, [x15, #0x70]\n"
+ "ldr q7, [x10, #0x70]\n"
"fmla v10.8h, v6.8h, v0.h[1]\n"
"fmla v14.8h, v6.8h, v1.h[1]\n"
"fmla v18.8h, v6.8h, v2.h[1]\n"
"fmla v22.8h, v6.8h, v3.h[1]\n"
"fmla v26.8h, v6.8h, v4.h[1]\n"
- "ldr q6, [x15, #0x80]\n"
+ "ldr q6, [x10, #0x80]\n"
"fmla v11.8h, v7.8h, v0.h[1]\n"
"fmla v15.8h, v7.8h, v1.h[1]\n"
"fmla v19.8h, v7.8h, v2.h[1]\n"
"fmla v23.8h, v7.8h, v3.h[1]\n"
"fmla v27.8h, v7.8h, v4.h[1]\n"
- "ldr q7, [x15, #0x90]\n"
+ "ldr q7, [x10, #0x90]\n"
"fmla v8.8h, v6.8h, v0.h[2]\n"
"fmla v12.8h, v6.8h, v1.h[2]\n"
"fmla v16.8h, v6.8h, v2.h[2]\n"
"fmla v20.8h, v6.8h, v3.h[2]\n"
"fmla v24.8h, v6.8h, v4.h[2]\n"
- "ldr q6, [x15, #0xa0]\n"
+ "ldr q6, [x10, #0xa0]\n"
"fmla v9.8h, v7.8h, v0.h[2]\n"
"fmla v13.8h, v7.8h, v1.h[2]\n"
"fmla v17.8h, v7.8h, v2.h[2]\n"
"fmla v21.8h, v7.8h, v3.h[2]\n"
"fmla v25.8h, v7.8h, v4.h[2]\n"
- "ldr q7, [x15, #0xb0]\n"
+ "ldr q7, [x10, #0xb0]\n"
"fmla v10.8h, v6.8h, v0.h[2]\n"
"fmla v14.8h, v6.8h, v1.h[2]\n"
"fmla v18.8h, v6.8h, v2.h[2]\n"
"fmla v22.8h, v6.8h, v3.h[2]\n"
"fmla v26.8h, v6.8h, v4.h[2]\n"
- "ldr q6, [x15, #0xc0]\n"
+ "ldr q6, [x10, #0xc0]\n"
"fmla v11.8h, v7.8h, v0.h[2]\n"
"fmla v15.8h, v7.8h, v1.h[2]\n"
"fmla v19.8h, v7.8h, v2.h[2]\n"
"fmla v23.8h, v7.8h, v3.h[2]\n"
"fmla v27.8h, v7.8h, v4.h[2]\n"
- "ldr q7, [x15, #0xd0]\n"
+ "ldr q7, [x10, #0xd0]\n"
"fmla v8.8h, v6.8h, v0.h[3]\n"
"fmla v12.8h, v6.8h, v1.h[3]\n"
"fmla v16.8h, v6.8h, v2.h[3]\n"
"fmla v20.8h, v6.8h, v3.h[3]\n"
"fmla v24.8h, v6.8h, v4.h[3]\n"
- "ldr q6, [x15, #0xe0]\n"
+ "ldr q6, [x10, #0xe0]\n"
"fmla v9.8h, v7.8h, v0.h[3]\n"
"fmla v13.8h, v7.8h, v1.h[3]\n"
"fmla v17.8h, v7.8h, v2.h[3]\n"
"fmla v21.8h, v7.8h, v3.h[3]\n"
"fmla v25.8h, v7.8h, v4.h[3]\n"
- "ldr q7, [x15, #0xf0]\n"
+ "ldr q7, [x10, #0xf0]\n"
"fmla v10.8h, v6.8h, v0.h[3]\n"
"fmla v14.8h, v6.8h, v1.h[3]\n"
"fmla v18.8h, v6.8h, v2.h[3]\n"
"fmla v22.8h, v6.8h, v3.h[3]\n"
"fmla v26.8h, v6.8h, v4.h[3]\n"
- "ldr q6, [x15, #0x100]\n"
+ "ldr q6, [x10, #0x100]\n"
"fmla v11.8h, v7.8h, v0.h[3]\n"
"fmla v15.8h, v7.8h, v1.h[3]\n"
"fmla v19.8h, v7.8h, v2.h[3]\n"
"fmla v23.8h, v7.8h, v3.h[3]\n"
"fmla v27.8h, v7.8h, v4.h[3]\n"
- "ldr q7, [x15, #0x110]\n"
+ "ldr q7, [x10, #0x110]\n"
"fmla v8.8h, v6.8h, v0.h[4]\n"
"fmla v12.8h, v6.8h, v1.h[4]\n"
"fmla v16.8h, v6.8h, v2.h[4]\n"
"fmla v20.8h, v6.8h, v3.h[4]\n"
"fmla v24.8h, v6.8h, v4.h[4]\n"
- "ldr q6, [x15, #0x120]\n"
+ "ldr q6, [x10, #0x120]\n"
"fmla v9.8h, v7.8h, v0.h[4]\n"
"fmla v13.8h, v7.8h, v1.h[4]\n"
"fmla v17.8h, v7.8h, v2.h[4]\n"
"fmla v21.8h, v7.8h, v3.h[4]\n"
"fmla v25.8h, v7.8h, v4.h[4]\n"
- "ldr q7, [x15, #0x130]\n"
+ "ldr q7, [x10, #0x130]\n"
"fmla v10.8h, v6.8h, v0.h[4]\n"
"fmla v14.8h, v6.8h, v1.h[4]\n"
"fmla v18.8h, v6.8h, v2.h[4]\n"
"fmla v22.8h, v6.8h, v3.h[4]\n"
"fmla v26.8h, v6.8h, v4.h[4]\n"
- "ldr q6, [x15, #0x140]\n"
+ "ldr q6, [x10, #0x140]\n"
"fmla v11.8h, v7.8h, v0.h[4]\n"
"fmla v15.8h, v7.8h, v1.h[4]\n"
"fmla v19.8h, v7.8h, v2.h[4]\n"
"fmla v23.8h, v7.8h, v3.h[4]\n"
"fmla v27.8h, v7.8h, v4.h[4]\n"
- "ldr q7, [x15, #0x150]\n"
+ "ldr q7, [x10, #0x150]\n"
"fmla v8.8h, v6.8h, v0.h[5]\n"
"fmla v12.8h, v6.8h, v1.h[5]\n"
"fmla v16.8h, v6.8h, v2.h[5]\n"
"fmla v20.8h, v6.8h, v3.h[5]\n"
"fmla v24.8h, v6.8h, v4.h[5]\n"
- "ldr q6, [x15, #0x160]\n"
+ "ldr q6, [x10, #0x160]\n"
"fmla v9.8h, v7.8h, v0.h[5]\n"
"fmla v13.8h, v7.8h, v1.h[5]\n"
"fmla v17.8h, v7.8h, v2.h[5]\n"
"fmla v21.8h, v7.8h, v3.h[5]\n"
"fmla v25.8h, v7.8h, v4.h[5]\n"
- "ldr q7, [x15, #0x170]\n"
+ "ldr q7, [x10, #0x170]\n"
"fmla v10.8h, v6.8h, v0.h[5]\n"
"fmla v14.8h, v6.8h, v1.h[5]\n"
"fmla v18.8h, v6.8h, v2.h[5]\n"
"fmla v22.8h, v6.8h, v3.h[5]\n"
"fmla v26.8h, v6.8h, v4.h[5]\n"
- "ldr q6, [x15, #0x180]\n"
+ "ldr q6, [x10, #0x180]\n"
"fmla v11.8h, v7.8h, v0.h[5]\n"
"fmla v15.8h, v7.8h, v1.h[5]\n"
"fmla v19.8h, v7.8h, v2.h[5]\n"
"fmla v23.8h, v7.8h, v3.h[5]\n"
"fmla v27.8h, v7.8h, v4.h[5]\n"
- "ldr q7, [x15, #0x190]\n"
+ "ldr q7, [x10, #0x190]\n"
"fmla v8.8h, v6.8h, v0.h[6]\n"
"fmla v12.8h, v6.8h, v1.h[6]\n"
"fmla v16.8h, v6.8h, v2.h[6]\n"
"fmla v20.8h, v6.8h, v3.h[6]\n"
"fmla v24.8h, v6.8h, v4.h[6]\n"
- "ldr q6, [x15, #0x1a0]\n"
+ "ldr q6, [x10, #0x1a0]\n"
"fmla v9.8h, v7.8h, v0.h[6]\n"
"fmla v13.8h, v7.8h, v1.h[6]\n"
"fmla v17.8h, v7.8h, v2.h[6]\n"
"fmla v21.8h, v7.8h, v3.h[6]\n"
"fmla v25.8h, v7.8h, v4.h[6]\n"
- "ldr q7, [x15, #0x1b0]\n"
+ "ldr q7, [x10, #0x1b0]\n"
"fmla v10.8h, v6.8h, v0.h[6]\n"
"fmla v14.8h, v6.8h, v1.h[6]\n"
"fmla v18.8h, v6.8h, v2.h[6]\n"
"fmla v22.8h, v6.8h, v3.h[6]\n"
"fmla v26.8h, v6.8h, v4.h[6]\n"
- "ldr q6, [x15, #0x1c0]\n"
+ "ldr q6, [x10, #0x1c0]\n"
"fmla v11.8h, v7.8h, v0.h[6]\n"
"fmla v15.8h, v7.8h, v1.h[6]\n"
"fmla v19.8h, v7.8h, v2.h[6]\n"
"fmla v23.8h, v7.8h, v3.h[6]\n"
"fmla v27.8h, v7.8h, v4.h[6]\n"
- "ldr q7, [x15, #0x1d0]\n"
+ "ldr q7, [x10, #0x1d0]\n"
"fmla v8.8h, v6.8h, v0.h[7]\n"
"fmla v12.8h, v6.8h, v1.h[7]\n"
"fmla v16.8h, v6.8h, v2.h[7]\n"
"fmla v20.8h, v6.8h, v3.h[7]\n"
"fmla v24.8h, v6.8h, v4.h[7]\n"
- "ldr q6, [x15, #0x1e0]\n"
+ "ldr q6, [x10, #0x1e0]\n"
"fmla v9.8h, v7.8h, v0.h[7]\n"
"fmla v13.8h, v7.8h, v1.h[7]\n"
"fmla v17.8h, v7.8h, v2.h[7]\n"
"fmla v21.8h, v7.8h, v3.h[7]\n"
"fmla v25.8h, v7.8h, v4.h[7]\n"
- "ldr q7, [x15, #0x1f0]\n"
+ "ldr q7, [x10, #0x1f0]\n"
+ "add x10, x10, #0x200\n"
"fmla v10.8h, v6.8h, v0.h[7]\n"
- "add x15, x15, #0x200\n"
"fmla v14.8h, v6.8h, v1.h[7]\n"
"fmla v18.8h, v6.8h, v2.h[7]\n"
"fmla v22.8h, v6.8h, v3.h[7]\n"
@@ -3722,31 +3670,31 @@ void a64_hybrid_fp16_mla_6x32 (
"fmla v19.8h, v7.8h, v2.h[7]\n"
"fmla v23.8h, v7.8h, v3.h[7]\n"
"fmla v27.8h, v7.8h, v4.h[7]\n"
- "229:" // Height 5: Multiply loop: Main loop skip
- "cbz x11, 231f\n"
- "230:" // Height 5: Multiply loop: Odd block loop
- "ldr h0, [x10], #0x2\n"
- "ldr h1, [x28], #0x2\n"
- "ldr h2, [x26], #0x2\n"
- "ldr h3, [x24], #0x2\n"
- "ldr h4, [x22], #0x2\n"
- "ldr q6, [x15, #0x0]\n"
+ "224:" // Height 5: Multiply loop: Main loop skip
+ "cbz x26, 226f\n"
+ "225:" // Height 5: Multiply loop: Odd block loop
+ "ldr h0, [x25], #0x2\n"
+ "sub x26, x26, #0x1\n"
+ "ldr h1, [x24], #0x2\n"
+ "ldr h2, [x23], #0x2\n"
+ "ldr h3, [x22], #0x2\n"
+ "ldr h4, [x21], #0x2\n"
+ "ldr q6, [x10, #0x0]\n"
"fmla v8.8h, v6.8h, v0.h[0]\n"
- "ldr q7, [x15, #0x10]\n"
+ "ldr q7, [x10, #0x10]\n"
"fmla v12.8h, v6.8h, v1.h[0]\n"
- "sub x11, x11, #0x1\n"
"fmla v16.8h, v6.8h, v2.h[0]\n"
"fmla v20.8h, v6.8h, v3.h[0]\n"
"fmla v24.8h, v6.8h, v4.h[0]\n"
- "ldr q6, [x15, #0x20]\n"
+ "ldr q6, [x10, #0x20]\n"
"fmla v9.8h, v7.8h, v0.h[0]\n"
"fmla v13.8h, v7.8h, v1.h[0]\n"
"fmla v17.8h, v7.8h, v2.h[0]\n"
"fmla v21.8h, v7.8h, v3.h[0]\n"
"fmla v25.8h, v7.8h, v4.h[0]\n"
- "ldr q7, [x15, #0x30]\n"
+ "ldr q7, [x10, #0x30]\n"
+ "add x10, x10, #0x40\n"
"fmla v10.8h, v6.8h, v0.h[0]\n"
- "add x15, x15, #0x40\n"
"fmla v14.8h, v6.8h, v1.h[0]\n"
"fmla v18.8h, v6.8h, v2.h[0]\n"
"fmla v22.8h, v6.8h, v3.h[0]\n"
@@ -3756,18 +3704,23 @@ void a64_hybrid_fp16_mla_6x32 (
"fmla v19.8h, v7.8h, v2.h[0]\n"
"fmla v23.8h, v7.8h, v3.h[0]\n"
"fmla v27.8h, v7.8h, v4.h[0]\n"
- "cbnz x11, 230b\n"
- "231:" // Height 5: Multiply loop: No odd multiplies
+ "cbnz x26, 225b\n"
+ "226:" // Height 5: Multiply loop: No odd multiplies
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x12, x12, #0x1\n"
- "cmp x12, x19\n"
- "bne 224b\n"
- "prfm pstl1keep, [x13, #0x0]\n"
- "prfm pstl1keep, [x9, #0x0]\n"
- "prfm pstl1keep, [x27, #0x0]\n"
- "prfm pstl1keep, [x25, #0x0]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 219b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "prfm pstl1keep, [x28, #0x0]\n"
+ "add x24, x28, x19, LSL #1\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "add x23, x24, x19, LSL #1\n"
"prfm pstl1keep, [x23, #0x0]\n"
- "tbz %x[flags], #1, 232f\n"
+ "add x22, x23, x19, LSL #1\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "add x21, x22, x19, LSL #1\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "tbz %x[flags], #1, 227f\n"
"add x19, %x[args_ptr], %[offset_min]\n"
"ld1r { v1.8h }, [x19]\n"
"add x19, %x[args_ptr], %[offset_max]\n"
@@ -3812,305 +3765,281 @@ void a64_hybrid_fp16_mla_6x32 (
"fmax v26.8h, v26.8h, v1.8h\n"
"fmin v27.8h, v27.8h, v0.8h\n"
"fmax v27.8h, v27.8h, v1.8h\n"
- "232:" // Height 5: No activation
- "cmp x16, #0x20\n"
- "bge 249f\n"
- "tbz x16, #4, 240f\n"
- "st1 { v8.8h }, [x13], #0x10\n"
- "st1 { v9.8h }, [x13], #0x10\n"
- "st1 { v12.8h }, [x9], #0x10\n"
- "st1 { v13.8h }, [x9], #0x10\n"
- "st1 { v16.8h }, [x27], #0x10\n"
- "st1 { v17.8h }, [x27], #0x10\n"
- "st1 { v20.8h }, [x25], #0x10\n"
- "st1 { v21.8h }, [x25], #0x10\n"
- "st1 { v24.8h }, [x23], #0x10\n"
- "st1 { v25.8h }, [x23], #0x10\n"
- "tbz x16, #3, 236f\n"
- "st1 { v10.8h }, [x13], #0x10\n"
- "st1 { v14.8h }, [x9], #0x10\n"
- "st1 { v18.8h }, [x27], #0x10\n"
- "st1 { v22.8h }, [x25], #0x10\n"
- "st1 { v26.8h }, [x23], #0x10\n"
- "tbz x16, #2, 234f\n"
- "str d11, [x13], #0x8\n"
- "str d15, [x9], #0x8\n"
- "str d19, [x27], #0x8\n"
- "str d23, [x25], #0x8\n"
- "str d27, [x23], #0x8\n"
- "tbz x16, #1, 233f\n"
- "st1 { v11.s }[2], [x13], #0x4\n"
- "st1 { v15.s }[2], [x9], #0x4\n"
- "st1 { v19.s }[2], [x27], #0x4\n"
- "st1 { v23.s }[2], [x25], #0x4\n"
- "st1 { v27.s }[2], [x23], #0x4\n"
- "tbz x16, #0, 248f\n"
- "st1 { v11.h }[6], [x13]\n"
- "st1 { v15.h }[6], [x9]\n"
- "st1 { v19.h }[6], [x27]\n"
- "st1 { v23.h }[6], [x25]\n"
- "st1 { v27.h }[6], [x23]\n"
- "b 248f\n"
- "233:" // Height 5: Partial direct writeback: partial_1_28
- "tbz x16, #0, 248f\n"
- "st1 { v11.h }[4], [x13]\n"
- "st1 { v15.h }[4], [x9]\n"
- "st1 { v19.h }[4], [x27]\n"
- "st1 { v23.h }[4], [x25]\n"
- "st1 { v27.h }[4], [x23]\n"
- "b 248f\n"
- "234:" // Height 5: Partial direct writeback: partial_2_24
- "tbz x16, #1, 235f\n"
- "str s11, [x13], #0x4\n"
- "str s15, [x9], #0x4\n"
- "str s19, [x27], #0x4\n"
- "str s23, [x25], #0x4\n"
- "str s27, [x23], #0x4\n"
- "tbz x16, #0, 248f\n"
- "st1 { v11.h }[2], [x13]\n"
- "st1 { v15.h }[2], [x9]\n"
- "st1 { v19.h }[2], [x27]\n"
- "st1 { v23.h }[2], [x25]\n"
- "st1 { v27.h }[2], [x23]\n"
- "b 248f\n"
- "235:" // Height 5: Partial direct writeback: partial_1_24
- "tbz x16, #0, 248f\n"
- "str h11, [x13, #0x0]\n"
- "str h15, [x9, #0x0]\n"
- "str h19, [x27, #0x0]\n"
- "str h23, [x25, #0x0]\n"
- "str h27, [x23, #0x0]\n"
- "b 248f\n"
- "236:" // Height 5: Partial direct writeback: partial_4_16
- "tbz x16, #2, 238f\n"
- "str d10, [x13], #0x8\n"
- "str d14, [x9], #0x8\n"
- "str d18, [x27], #0x8\n"
- "str d22, [x25], #0x8\n"
- "str d26, [x23], #0x8\n"
- "tbz x16, #1, 237f\n"
- "st1 { v10.s }[2], [x13], #0x4\n"
- "st1 { v14.s }[2], [x9], #0x4\n"
- "st1 { v18.s }[2], [x27], #0x4\n"
- "st1 { v22.s }[2], [x25], #0x4\n"
- "st1 { v26.s }[2], [x23], #0x4\n"
- "tbz x16, #0, 248f\n"
- "st1 { v10.h }[6], [x13]\n"
- "st1 { v14.h }[6], [x9]\n"
- "st1 { v18.h }[6], [x27]\n"
- "st1 { v22.h }[6], [x25]\n"
- "st1 { v26.h }[6], [x23]\n"
- "b 248f\n"
- "237:" // Height 5: Partial direct writeback: partial_1_20
- "tbz x16, #0, 248f\n"
- "st1 { v10.h }[4], [x13]\n"
- "st1 { v14.h }[4], [x9]\n"
- "st1 { v18.h }[4], [x27]\n"
- "st1 { v22.h }[4], [x25]\n"
- "st1 { v26.h }[4], [x23]\n"
- "b 248f\n"
- "238:" // Height 5: Partial direct writeback: partial_2_16
- "tbz x16, #1, 239f\n"
- "str s10, [x13], #0x4\n"
- "str s14, [x9], #0x4\n"
- "str s18, [x27], #0x4\n"
- "str s22, [x25], #0x4\n"
- "str s26, [x23], #0x4\n"
- "tbz x16, #0, 248f\n"
- "st1 { v10.h }[2], [x13]\n"
- "st1 { v14.h }[2], [x9]\n"
- "st1 { v18.h }[2], [x27]\n"
- "st1 { v22.h }[2], [x25]\n"
- "st1 { v26.h }[2], [x23]\n"
- "b 248f\n"
- "239:" // Height 5: Partial direct writeback: partial_1_16
- "tbz x16, #0, 248f\n"
- "str h10, [x13, #0x0]\n"
- "str h14, [x9, #0x0]\n"
- "str h18, [x27, #0x0]\n"
- "str h22, [x25, #0x0]\n"
- "str h26, [x23, #0x0]\n"
- "b 248f\n"
- "240:" // Height 5: Partial direct writeback: partial_8_0
- "tbz x16, #3, 244f\n"
- "st1 { v8.8h }, [x13], #0x10\n"
- "st1 { v12.8h }, [x9], #0x10\n"
- "st1 { v16.8h }, [x27], #0x10\n"
- "st1 { v20.8h }, [x25], #0x10\n"
- "st1 { v24.8h }, [x23], #0x10\n"
- "tbz x16, #2, 242f\n"
- "str d9, [x13], #0x8\n"
- "str d13, [x9], #0x8\n"
- "str d17, [x27], #0x8\n"
- "str d21, [x25], #0x8\n"
- "str d25, [x23], #0x8\n"
- "tbz x16, #1, 241f\n"
- "st1 { v9.s }[2], [x13], #0x4\n"
- "st1 { v13.s }[2], [x9], #0x4\n"
- "st1 { v17.s }[2], [x27], #0x4\n"
- "st1 { v21.s }[2], [x25], #0x4\n"
- "st1 { v25.s }[2], [x23], #0x4\n"
- "tbz x16, #0, 248f\n"
- "st1 { v9.h }[6], [x13]\n"
- "st1 { v13.h }[6], [x9]\n"
- "st1 { v17.h }[6], [x27]\n"
- "st1 { v21.h }[6], [x25]\n"
- "st1 { v25.h }[6], [x23]\n"
- "b 248f\n"
- "241:" // Height 5: Partial direct writeback: partial_1_12
- "tbz x16, #0, 248f\n"
- "st1 { v9.h }[4], [x13]\n"
- "st1 { v13.h }[4], [x9]\n"
- "st1 { v17.h }[4], [x27]\n"
- "st1 { v21.h }[4], [x25]\n"
- "st1 { v25.h }[4], [x23]\n"
- "b 248f\n"
- "242:" // Height 5: Partial direct writeback: partial_2_8
- "tbz x16, #1, 243f\n"
- "str s9, [x13], #0x4\n"
- "str s13, [x9], #0x4\n"
- "str s17, [x27], #0x4\n"
- "str s21, [x25], #0x4\n"
- "str s25, [x23], #0x4\n"
- "tbz x16, #0, 248f\n"
- "st1 { v9.h }[2], [x13]\n"
- "st1 { v13.h }[2], [x9]\n"
- "st1 { v17.h }[2], [x27]\n"
- "st1 { v21.h }[2], [x25]\n"
- "st1 { v25.h }[2], [x23]\n"
- "b 248f\n"
- "243:" // Height 5: Partial direct writeback: partial_1_8
- "tbz x16, #0, 248f\n"
- "str h9, [x13, #0x0]\n"
- "str h13, [x9, #0x0]\n"
- "str h17, [x27, #0x0]\n"
- "str h21, [x25, #0x0]\n"
- "str h25, [x23, #0x0]\n"
- "b 248f\n"
- "244:" // Height 5: Partial direct writeback: partial_4_0
- "tbz x16, #2, 246f\n"
- "str d8, [x13], #0x8\n"
- "str d12, [x9], #0x8\n"
- "str d16, [x27], #0x8\n"
- "str d20, [x25], #0x8\n"
- "str d24, [x23], #0x8\n"
- "tbz x16, #1, 245f\n"
- "st1 { v8.s }[2], [x13], #0x4\n"
- "st1 { v12.s }[2], [x9], #0x4\n"
- "st1 { v16.s }[2], [x27], #0x4\n"
- "st1 { v20.s }[2], [x25], #0x4\n"
- "st1 { v24.s }[2], [x23], #0x4\n"
- "tbz x16, #0, 248f\n"
- "st1 { v8.h }[6], [x13]\n"
- "st1 { v12.h }[6], [x9]\n"
- "st1 { v16.h }[6], [x27]\n"
- "st1 { v20.h }[6], [x25]\n"
- "st1 { v24.h }[6], [x23]\n"
- "b 248f\n"
- "245:" // Height 5: Partial direct writeback: partial_1_4
- "tbz x16, #0, 248f\n"
- "st1 { v8.h }[4], [x13]\n"
- "st1 { v12.h }[4], [x9]\n"
- "st1 { v16.h }[4], [x27]\n"
- "st1 { v20.h }[4], [x25]\n"
- "st1 { v24.h }[4], [x23]\n"
- "b 248f\n"
- "246:" // Height 5: Partial direct writeback: partial_2_0
- "tbz x16, #1, 247f\n"
- "str s8, [x13], #0x4\n"
- "str s12, [x9], #0x4\n"
- "str s16, [x27], #0x4\n"
- "str s20, [x25], #0x4\n"
- "str s24, [x23], #0x4\n"
- "tbz x16, #0, 248f\n"
- "st1 { v8.h }[2], [x13]\n"
- "st1 { v12.h }[2], [x9]\n"
- "st1 { v16.h }[2], [x27]\n"
- "st1 { v20.h }[2], [x25]\n"
- "st1 { v24.h }[2], [x23]\n"
- "b 248f\n"
- "247:" // Height 5: Partial direct writeback: partial_1_0
- "str h8, [x13, #0x0]\n"
- "str h12, [x9, #0x0]\n"
- "str h16, [x27, #0x0]\n"
- "str h20, [x25, #0x0]\n"
- "str h24, [x23, #0x0]\n"
- "248:" // Height 5: Partial direct writeback: Done
- "b 250f\n"
- "249:" // Height 5: Full writeback
- "str q8, [x13, #0x0]\n"
- "str q9, [x13, #0x10]\n"
- "str q10, [x13, #0x20]\n"
- "str q11, [x13, #0x30]\n"
- "str q12, [x9, #0x0]\n"
- "str q13, [x9, #0x10]\n"
- "str q14, [x9, #0x20]\n"
- "str q15, [x9, #0x30]\n"
- "str q16, [x27, #0x0]\n"
- "str q17, [x27, #0x10]\n"
- "str q18, [x27, #0x20]\n"
- "str q19, [x27, #0x30]\n"
- "str q20, [x25, #0x0]\n"
- "str q21, [x25, #0x10]\n"
- "str q22, [x25, #0x20]\n"
- "str q23, [x25, #0x30]\n"
- "str q24, [x23, #0x0]\n"
- "str q25, [x23, #0x10]\n"
- "str q26, [x23, #0x20]\n"
- "str q27, [x23, #0x30]\n"
- "add x13, x13, #0x40\n"
- "add x9, x9, #0x40\n"
- "add x27, x27, #0x40\n"
- "add x25, x25, #0x40\n"
- "add x23, x23, #0x40\n"
- "250:" // Height 5: Writeback done
- "subs x16, x16, #0x20\n"
- "bgt 203b\n"
- "b 302f\n"
- "251:" // Height 6
- "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x14, %x[bias]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "tbz %x[flags], #2, 252f\n"
- "ldr x13, [%x[output_ptr], #0x0]\n"
- "add x13, x13, x19, LSL #1\n"
- "ldr x9, [%x[output_ptr], #0x8]\n"
- "ldr x27, [%x[output_ptr], #0x10]\n"
- "add x9, x9, x19, LSL #1\n"
- "ldr x25, [%x[output_ptr], #0x18]\n"
- "ldr x23, [%x[output_ptr], #0x20]\n"
- "add x27, x27, x19, LSL #1\n"
- "ldr x21, [%x[output_ptr], #0x28]\n"
- "add %x[output_ptr], %x[output_ptr], #0x30\n"
- "add x25, x25, x19, LSL #1\n"
- "add x23, x23, x19, LSL #1\n"
- "add x21, x21, x19, LSL #1\n"
- "b 253f\n"
- "252:" // Height 6: setup direct output
- "mov x13, %x[output_ptr]\n"
- "add x9, x13, x19, LSL #1\n"
- "add x27, x9, x19, LSL #1\n"
- "add x25, x27, x19, LSL #1\n"
- "add x23, x25, x19, LSL #1\n"
- "add x21, x23, x19, LSL #1\n"
- "add %x[output_ptr], x21, x19, LSL #1\n"
- "253:" // Height 6: Column loop
- "cbz x14, 254f\n"
- "ldr q8, [x14, #0x0]\n"
+ "227:" // Height 5: No activation
+ "cmp x11, #0x20\n"
+ "bge 244f\n"
+ "tbz x11, #4, 235f\n"
+ "st1 { v8.8h }, [x28], #0x10\n"
+ "st1 { v9.8h }, [x28], #0x10\n"
+ "st1 { v12.8h }, [x24], #0x10\n"
+ "st1 { v13.8h }, [x24], #0x10\n"
+ "st1 { v16.8h }, [x23], #0x10\n"
+ "st1 { v17.8h }, [x23], #0x10\n"
+ "st1 { v20.8h }, [x22], #0x10\n"
+ "st1 { v21.8h }, [x22], #0x10\n"
+ "st1 { v24.8h }, [x21], #0x10\n"
+ "st1 { v25.8h }, [x21], #0x10\n"
+ "tbz x11, #3, 231f\n"
+ "st1 { v10.8h }, [x28], #0x10\n"
+ "st1 { v14.8h }, [x24], #0x10\n"
+ "st1 { v18.8h }, [x23], #0x10\n"
+ "st1 { v22.8h }, [x22], #0x10\n"
+ "st1 { v26.8h }, [x21], #0x10\n"
+ "tbz x11, #2, 229f\n"
+ "str d11, [x28], #0x8\n"
+ "str d15, [x24], #0x8\n"
+ "str d19, [x23], #0x8\n"
+ "str d23, [x22], #0x8\n"
+ "str d27, [x21], #0x8\n"
+ "tbz x11, #1, 228f\n"
+ "st1 { v11.s }[2], [x28], #0x4\n"
+ "st1 { v15.s }[2], [x24], #0x4\n"
+ "st1 { v19.s }[2], [x23], #0x4\n"
+ "st1 { v23.s }[2], [x22], #0x4\n"
+ "st1 { v27.s }[2], [x21], #0x4\n"
+ "tbz x11, #0, 243f\n"
+ "st1 { v11.h }[6], [x28]\n"
+ "st1 { v15.h }[6], [x24]\n"
+ "st1 { v19.h }[6], [x23]\n"
+ "st1 { v23.h }[6], [x22]\n"
+ "st1 { v27.h }[6], [x21]\n"
+ "b 243f\n"
+ "228:" // Height 5: Partial direct writeback: partial_1_28
+ "tbz x11, #0, 243f\n"
+ "st1 { v11.h }[4], [x28]\n"
+ "st1 { v15.h }[4], [x24]\n"
+ "st1 { v19.h }[4], [x23]\n"
+ "st1 { v23.h }[4], [x22]\n"
+ "st1 { v27.h }[4], [x21]\n"
+ "b 243f\n"
+ "229:" // Height 5: Partial direct writeback: partial_2_24
+ "tbz x11, #1, 230f\n"
+ "str s11, [x28], #0x4\n"
+ "str s15, [x24], #0x4\n"
+ "str s19, [x23], #0x4\n"
+ "str s23, [x22], #0x4\n"
+ "str s27, [x21], #0x4\n"
+ "tbz x11, #0, 243f\n"
+ "st1 { v11.h }[2], [x28]\n"
+ "st1 { v15.h }[2], [x24]\n"
+ "st1 { v19.h }[2], [x23]\n"
+ "st1 { v23.h }[2], [x22]\n"
+ "st1 { v27.h }[2], [x21]\n"
+ "b 243f\n"
+ "230:" // Height 5: Partial direct writeback: partial_1_24
+ "tbz x11, #0, 243f\n"
+ "str h11, [x28, #0x0]\n"
+ "str h15, [x24, #0x0]\n"
+ "str h19, [x23, #0x0]\n"
+ "str h23, [x22, #0x0]\n"
+ "str h27, [x21, #0x0]\n"
+ "b 243f\n"
+ "231:" // Height 5: Partial direct writeback: partial_4_16
+ "tbz x11, #2, 233f\n"
+ "str d10, [x28], #0x8\n"
+ "str d14, [x24], #0x8\n"
+ "str d18, [x23], #0x8\n"
+ "str d22, [x22], #0x8\n"
+ "str d26, [x21], #0x8\n"
+ "tbz x11, #1, 232f\n"
+ "st1 { v10.s }[2], [x28], #0x4\n"
+ "st1 { v14.s }[2], [x24], #0x4\n"
+ "st1 { v18.s }[2], [x23], #0x4\n"
+ "st1 { v22.s }[2], [x22], #0x4\n"
+ "st1 { v26.s }[2], [x21], #0x4\n"
+ "tbz x11, #0, 243f\n"
+ "st1 { v10.h }[6], [x28]\n"
+ "st1 { v14.h }[6], [x24]\n"
+ "st1 { v18.h }[6], [x23]\n"
+ "st1 { v22.h }[6], [x22]\n"
+ "st1 { v26.h }[6], [x21]\n"
+ "b 243f\n"
+ "232:" // Height 5: Partial direct writeback: partial_1_20
+ "tbz x11, #0, 243f\n"
+ "st1 { v10.h }[4], [x28]\n"
+ "st1 { v14.h }[4], [x24]\n"
+ "st1 { v18.h }[4], [x23]\n"
+ "st1 { v22.h }[4], [x22]\n"
+ "st1 { v26.h }[4], [x21]\n"
+ "b 243f\n"
+ "233:" // Height 5: Partial direct writeback: partial_2_16
+ "tbz x11, #1, 234f\n"
+ "str s10, [x28], #0x4\n"
+ "str s14, [x24], #0x4\n"
+ "str s18, [x23], #0x4\n"
+ "str s22, [x22], #0x4\n"
+ "str s26, [x21], #0x4\n"
+ "tbz x11, #0, 243f\n"
+ "st1 { v10.h }[2], [x28]\n"
+ "st1 { v14.h }[2], [x24]\n"
+ "st1 { v18.h }[2], [x23]\n"
+ "st1 { v22.h }[2], [x22]\n"
+ "st1 { v26.h }[2], [x21]\n"
+ "b 243f\n"
+ "234:" // Height 5: Partial direct writeback: partial_1_16
+ "tbz x11, #0, 243f\n"
+ "str h10, [x28, #0x0]\n"
+ "str h14, [x24, #0x0]\n"
+ "str h18, [x23, #0x0]\n"
+ "str h22, [x22, #0x0]\n"
+ "str h26, [x21, #0x0]\n"
+ "b 243f\n"
+ "235:" // Height 5: Partial direct writeback: partial_8_0
+ "tbz x11, #3, 239f\n"
+ "st1 { v8.8h }, [x28], #0x10\n"
+ "st1 { v12.8h }, [x24], #0x10\n"
+ "st1 { v16.8h }, [x23], #0x10\n"
+ "st1 { v20.8h }, [x22], #0x10\n"
+ "st1 { v24.8h }, [x21], #0x10\n"
+ "tbz x11, #2, 237f\n"
+ "str d9, [x28], #0x8\n"
+ "str d13, [x24], #0x8\n"
+ "str d17, [x23], #0x8\n"
+ "str d21, [x22], #0x8\n"
+ "str d25, [x21], #0x8\n"
+ "tbz x11, #1, 236f\n"
+ "st1 { v9.s }[2], [x28], #0x4\n"
+ "st1 { v13.s }[2], [x24], #0x4\n"
+ "st1 { v17.s }[2], [x23], #0x4\n"
+ "st1 { v21.s }[2], [x22], #0x4\n"
+ "st1 { v25.s }[2], [x21], #0x4\n"
+ "tbz x11, #0, 243f\n"
+ "st1 { v9.h }[6], [x28]\n"
+ "st1 { v13.h }[6], [x24]\n"
+ "st1 { v17.h }[6], [x23]\n"
+ "st1 { v21.h }[6], [x22]\n"
+ "st1 { v25.h }[6], [x21]\n"
+ "b 243f\n"
+ "236:" // Height 5: Partial direct writeback: partial_1_12
+ "tbz x11, #0, 243f\n"
+ "st1 { v9.h }[4], [x28]\n"
+ "st1 { v13.h }[4], [x24]\n"
+ "st1 { v17.h }[4], [x23]\n"
+ "st1 { v21.h }[4], [x22]\n"
+ "st1 { v25.h }[4], [x21]\n"
+ "b 243f\n"
+ "237:" // Height 5: Partial direct writeback: partial_2_8
+ "tbz x11, #1, 238f\n"
+ "str s9, [x28], #0x4\n"
+ "str s13, [x24], #0x4\n"
+ "str s17, [x23], #0x4\n"
+ "str s21, [x22], #0x4\n"
+ "str s25, [x21], #0x4\n"
+ "tbz x11, #0, 243f\n"
+ "st1 { v9.h }[2], [x28]\n"
+ "st1 { v13.h }[2], [x24]\n"
+ "st1 { v17.h }[2], [x23]\n"
+ "st1 { v21.h }[2], [x22]\n"
+ "st1 { v25.h }[2], [x21]\n"
+ "b 243f\n"
+ "238:" // Height 5: Partial direct writeback: partial_1_8
+ "tbz x11, #0, 243f\n"
+ "str h9, [x28, #0x0]\n"
+ "str h13, [x24, #0x0]\n"
+ "str h17, [x23, #0x0]\n"
+ "str h21, [x22, #0x0]\n"
+ "str h25, [x21, #0x0]\n"
+ "b 243f\n"
+ "239:" // Height 5: Partial direct writeback: partial_4_0
+ "tbz x11, #2, 241f\n"
+ "str d8, [x28], #0x8\n"
+ "str d12, [x24], #0x8\n"
+ "str d16, [x23], #0x8\n"
+ "str d20, [x22], #0x8\n"
+ "str d24, [x21], #0x8\n"
+ "tbz x11, #1, 240f\n"
+ "st1 { v8.s }[2], [x28], #0x4\n"
+ "st1 { v12.s }[2], [x24], #0x4\n"
+ "st1 { v16.s }[2], [x23], #0x4\n"
+ "st1 { v20.s }[2], [x22], #0x4\n"
+ "st1 { v24.s }[2], [x21], #0x4\n"
+ "tbz x11, #0, 243f\n"
+ "st1 { v8.h }[6], [x28]\n"
+ "st1 { v12.h }[6], [x24]\n"
+ "st1 { v16.h }[6], [x23]\n"
+ "st1 { v20.h }[6], [x22]\n"
+ "st1 { v24.h }[6], [x21]\n"
+ "b 243f\n"
+ "240:" // Height 5: Partial direct writeback: partial_1_4
+ "tbz x11, #0, 243f\n"
+ "st1 { v8.h }[4], [x28]\n"
+ "st1 { v12.h }[4], [x24]\n"
+ "st1 { v16.h }[4], [x23]\n"
+ "st1 { v20.h }[4], [x22]\n"
+ "st1 { v24.h }[4], [x21]\n"
+ "b 243f\n"
+ "241:" // Height 5: Partial direct writeback: partial_2_0
+ "tbz x11, #1, 242f\n"
+ "str s8, [x28], #0x4\n"
+ "str s12, [x24], #0x4\n"
+ "str s16, [x23], #0x4\n"
+ "str s20, [x22], #0x4\n"
+ "str s24, [x21], #0x4\n"
+ "tbz x11, #0, 243f\n"
+ "st1 { v8.h }[2], [x28]\n"
+ "st1 { v12.h }[2], [x24]\n"
+ "st1 { v16.h }[2], [x23]\n"
+ "st1 { v20.h }[2], [x22]\n"
+ "st1 { v24.h }[2], [x21]\n"
+ "b 243f\n"
+ "242:" // Height 5: Partial direct writeback: partial_1_0
+ "str h8, [x28, #0x0]\n"
+ "str h12, [x24, #0x0]\n"
+ "str h16, [x23, #0x0]\n"
+ "str h20, [x22, #0x0]\n"
+ "str h24, [x21, #0x0]\n"
+ "243:" // Height 5: Partial direct writeback: Done
+ "b 245f\n"
+ "244:" // Height 5: Full writeback
+ "str q8, [x28, #0x0]\n"
+ "str q9, [x28, #0x10]\n"
+ "str q10, [x28, #0x20]\n"
+ "str q11, [x28, #0x30]\n"
+ "add x28, x28, #0x40\n"
+ "str q12, [x24, #0x0]\n"
+ "str q13, [x24, #0x10]\n"
+ "str q14, [x24, #0x20]\n"
+ "str q15, [x24, #0x30]\n"
+ "str q16, [x23, #0x0]\n"
+ "str q17, [x23, #0x10]\n"
+ "str q18, [x23, #0x20]\n"
+ "str q19, [x23, #0x30]\n"
+ "str q20, [x22, #0x0]\n"
+ "str q21, [x22, #0x10]\n"
+ "str q22, [x22, #0x20]\n"
+ "str q23, [x22, #0x30]\n"
+ "str q24, [x21, #0x0]\n"
+ "str q25, [x21, #0x10]\n"
+ "str q26, [x21, #0x20]\n"
+ "str q27, [x21, #0x30]\n"
+ "245:" // Height 5: Writeback done
+ "subs x11, x11, #0x20\n"
+ "bgt 198b\n"
+ "b 296f\n"
+ "246:" // Height 6
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x9, %x[bias]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov x19, #0xc\n"
+ "madd %x[output_ptr], x20, x19, %x[output_ptr]\n"
+ "247:" // Height 6: Column loop
+ "cbz x9, 248f\n"
+ "ldr q8, [x9, #0x0]\n"
"mov v12.16b, v8.16b\n"
- "ldr q9, [x14, #0x10]\n"
+ "ldr q9, [x9, #0x10]\n"
"mov v16.16b, v8.16b\n"
- "ldr q10, [x14, #0x20]\n"
+ "ldr q10, [x9, #0x20]\n"
"mov v20.16b, v8.16b\n"
- "ldr q11, [x14, #0x30]\n"
+ "ldr q11, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
"mov v24.16b, v8.16b\n"
- "add x14, x14, #0x40\n"
"mov v28.16b, v8.16b\n"
"mov v13.16b, v9.16b\n"
- "mov v17.16b, v9.16b\n"
"mov v14.16b, v10.16b\n"
"mov v15.16b, v11.16b\n"
+ "mov v17.16b, v9.16b\n"
"mov v18.16b, v10.16b\n"
"mov v19.16b, v11.16b\n"
"mov v21.16b, v9.16b\n"
@@ -4122,314 +4051,315 @@ void a64_hybrid_fp16_mla_6x32 (
"mov v29.16b, v9.16b\n"
"mov v30.16b, v10.16b\n"
"mov v31.16b, v11.16b\n"
- "b 273f\n"
- "254:" // Height 6: no bias
- "tbz %x[flags], #0, 272f\n"
- "cmp x16, #0x20\n"
- "bge 271f\n"
- "tbz x16, #4, 262f\n"
- "ld1 { v8.8h }, [x13], #0x10\n"
- "ld1 { v12.8h }, [x9], #0x10\n"
- "ld1 { v16.8h }, [x27], #0x10\n"
- "ld1 { v20.8h }, [x25], #0x10\n"
- "ld1 { v24.8h }, [x23], #0x10\n"
- "ld1 { v28.8h }, [x21], #0x10\n"
- "ld1 { v9.8h }, [x13], #0x10\n"
- "ld1 { v13.8h }, [x9], #0x10\n"
- "ld1 { v17.8h }, [x27], #0x10\n"
- "ld1 { v21.8h }, [x25], #0x10\n"
- "ld1 { v25.8h }, [x23], #0x10\n"
- "ld1 { v29.8h }, [x21], #0x10\n"
- "tbz x16, #3, 258f\n"
- "ld1 { v10.8h }, [x13], #0x10\n"
- "ld1 { v14.8h }, [x9], #0x10\n"
- "ld1 { v18.8h }, [x27], #0x10\n"
- "ld1 { v22.8h }, [x25], #0x10\n"
- "ld1 { v26.8h }, [x23], #0x10\n"
- "ld1 { v30.8h }, [x21], #0x10\n"
- "tbz x16, #2, 256f\n"
- "ldr d11, [x13], #0x8\n"
- "ldr d15, [x9], #0x8\n"
- "ldr d19, [x27], #0x8\n"
- "ldr d23, [x25], #0x8\n"
- "ldr d27, [x23], #0x8\n"
- "ldr d31, [x21], #0x8\n"
- "tbz x16, #1, 255f\n"
- "ld1 { v11.s }[2], [x13], #0x4\n"
- "ld1 { v15.s }[2], [x9], #0x4\n"
- "ld1 { v19.s }[2], [x27], #0x4\n"
- "ld1 { v23.s }[2], [x25], #0x4\n"
- "ld1 { v27.s }[2], [x23], #0x4\n"
- "ld1 { v31.s }[2], [x21], #0x4\n"
+ "b 267f\n"
+ "248:" // Height 6: no bias
+ "tbz %x[flags], #0, 266f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x11, #0x20\n"
+ "add x24, x28, x19, LSL #1\n"
+ "add x23, x24, x19, LSL #1\n"
+ "add x22, x23, x19, LSL #1\n"
+ "add x21, x22, x19, LSL #1\n"
+ "add x20, x21, x19, LSL #1\n"
+ "bge 265f\n"
+ "tbz x11, #4, 256f\n"
+ "ld1 { v8.8h }, [x28], #0x10\n"
+ "ld1 { v12.8h }, [x24], #0x10\n"
+ "ld1 { v16.8h }, [x23], #0x10\n"
+ "ld1 { v20.8h }, [x22], #0x10\n"
+ "ld1 { v24.8h }, [x21], #0x10\n"
+ "ld1 { v28.8h }, [x20], #0x10\n"
+ "ld1 { v9.8h }, [x28], #0x10\n"
+ "ld1 { v13.8h }, [x24], #0x10\n"
+ "ld1 { v17.8h }, [x23], #0x10\n"
+ "ld1 { v21.8h }, [x22], #0x10\n"
+ "ld1 { v25.8h }, [x21], #0x10\n"
+ "ld1 { v29.8h }, [x20], #0x10\n"
+ "tbz x11, #3, 252f\n"
+ "ld1 { v10.8h }, [x28], #0x10\n"
+ "ld1 { v14.8h }, [x24], #0x10\n"
+ "ld1 { v18.8h }, [x23], #0x10\n"
+ "ld1 { v22.8h }, [x22], #0x10\n"
+ "ld1 { v26.8h }, [x21], #0x10\n"
+ "ld1 { v30.8h }, [x20], #0x10\n"
+ "tbz x11, #2, 250f\n"
+ "ldr d11, [x28], #0x8\n"
+ "ldr d15, [x24], #0x8\n"
+ "ldr d19, [x23], #0x8\n"
+ "ldr d23, [x22], #0x8\n"
+ "ldr d27, [x21], #0x8\n"
+ "ldr d31, [x20], #0x8\n"
+ "tbz x11, #1, 249f\n"
+ "ld1 { v11.s }[2], [x28], #0x4\n"
"mov x19, #0x3c\n"
- "tbz x16, #0, 270f\n"
- "ld1 { v11.h }[6], [x13]\n"
- "ld1 { v15.h }[6], [x9]\n"
- "ld1 { v19.h }[6], [x27]\n"
- "ld1 { v23.h }[6], [x25]\n"
- "ld1 { v27.h }[6], [x23]\n"
- "ld1 { v31.h }[6], [x21]\n"
- "b 270f\n"
- "255:" // Height 6: Partial accumulate: partial_1_28
+ "ld1 { v15.s }[2], [x24], #0x4\n"
+ "ld1 { v19.s }[2], [x23], #0x4\n"
+ "ld1 { v23.s }[2], [x22], #0x4\n"
+ "ld1 { v27.s }[2], [x21], #0x4\n"
+ "ld1 { v31.s }[2], [x20], #0x4\n"
+ "tbz x11, #0, 264f\n"
+ "ld1 { v11.h }[6], [x28]\n"
+ "ld1 { v15.h }[6], [x24]\n"
+ "ld1 { v19.h }[6], [x23]\n"
+ "ld1 { v23.h }[6], [x22]\n"
+ "ld1 { v27.h }[6], [x21]\n"
+ "ld1 { v31.h }[6], [x20]\n"
+ "b 264f\n"
+ "249:" // Height 6: Partial accumulate: partial_1_28
"mov x19, #0x38\n"
- "tbz x16, #0, 270f\n"
- "ld1 { v11.h }[4], [x13]\n"
- "ld1 { v15.h }[4], [x9]\n"
- "ld1 { v19.h }[4], [x27]\n"
- "ld1 { v23.h }[4], [x25]\n"
- "ld1 { v27.h }[4], [x23]\n"
- "ld1 { v31.h }[4], [x21]\n"
- "b 270f\n"
- "256:" // Height 6: Partial accumulate: partial_2_24
- "tbz x16, #1, 257f\n"
- "ldr s11, [x13], #0x4\n"
- "ldr s15, [x9], #0x4\n"
- "ldr s19, [x27], #0x4\n"
- "ldr s23, [x25], #0x4\n"
- "ldr s27, [x23], #0x4\n"
- "ldr s31, [x21], #0x4\n"
+ "tbz x11, #0, 264f\n"
+ "ld1 { v11.h }[4], [x28]\n"
+ "ld1 { v15.h }[4], [x24]\n"
+ "ld1 { v19.h }[4], [x23]\n"
+ "ld1 { v23.h }[4], [x22]\n"
+ "ld1 { v27.h }[4], [x21]\n"
+ "ld1 { v31.h }[4], [x20]\n"
+ "b 264f\n"
+ "250:" // Height 6: Partial accumulate: partial_2_24
+ "tbz x11, #1, 251f\n"
+ "ldr s11, [x28], #0x4\n"
+ "ldr s15, [x24], #0x4\n"
"mov x19, #0x34\n"
- "tbz x16, #0, 270f\n"
- "ld1 { v11.h }[2], [x13]\n"
- "ld1 { v15.h }[2], [x9]\n"
- "ld1 { v19.h }[2], [x27]\n"
- "ld1 { v23.h }[2], [x25]\n"
- "ld1 { v27.h }[2], [x23]\n"
- "ld1 { v31.h }[2], [x21]\n"
- "b 270f\n"
- "257:" // Height 6: Partial accumulate: partial_1_24
+ "ldr s19, [x23], #0x4\n"
+ "ldr s23, [x22], #0x4\n"
+ "ldr s27, [x21], #0x4\n"
+ "ldr s31, [x20], #0x4\n"
+ "tbz x11, #0, 264f\n"
+ "ld1 { v11.h }[2], [x28]\n"
+ "ld1 { v15.h }[2], [x24]\n"
+ "ld1 { v19.h }[2], [x23]\n"
+ "ld1 { v23.h }[2], [x22]\n"
+ "ld1 { v27.h }[2], [x21]\n"
+ "ld1 { v31.h }[2], [x20]\n"
+ "b 264f\n"
+ "251:" // Height 6: Partial accumulate: partial_1_24
"mov x19, #0x30\n"
- "tbz x16, #0, 270f\n"
- "ldr h11, [x13, #0x0]\n"
- "ldr h15, [x9, #0x0]\n"
- "ldr h19, [x27, #0x0]\n"
- "ldr h23, [x25, #0x0]\n"
- "ldr h27, [x23, #0x0]\n"
- "ldr h31, [x21, #0x0]\n"
- "b 270f\n"
- "258:" // Height 6: Partial accumulate: partial_4_16
- "tbz x16, #2, 260f\n"
- "ldr d10, [x13], #0x8\n"
- "ldr d14, [x9], #0x8\n"
- "ldr d18, [x27], #0x8\n"
- "ldr d22, [x25], #0x8\n"
- "ldr d26, [x23], #0x8\n"
- "ldr d30, [x21], #0x8\n"
- "tbz x16, #1, 259f\n"
- "ld1 { v10.s }[2], [x13], #0x4\n"
- "ld1 { v14.s }[2], [x9], #0x4\n"
- "ld1 { v18.s }[2], [x27], #0x4\n"
- "ld1 { v22.s }[2], [x25], #0x4\n"
- "ld1 { v26.s }[2], [x23], #0x4\n"
- "ld1 { v30.s }[2], [x21], #0x4\n"
+ "tbz x11, #0, 264f\n"
+ "ldr h11, [x28, #0x0]\n"
+ "ldr h15, [x24, #0x0]\n"
+ "ldr h19, [x23, #0x0]\n"
+ "ldr h23, [x22, #0x0]\n"
+ "ldr h27, [x21, #0x0]\n"
+ "ldr h31, [x20, #0x0]\n"
+ "b 264f\n"
+ "252:" // Height 6: Partial accumulate: partial_4_16
+ "tbz x11, #2, 254f\n"
+ "ldr d10, [x28], #0x8\n"
+ "ldr d14, [x24], #0x8\n"
+ "ldr d18, [x23], #0x8\n"
+ "ldr d22, [x22], #0x8\n"
+ "ldr d26, [x21], #0x8\n"
+ "ldr d30, [x20], #0x8\n"
+ "tbz x11, #1, 253f\n"
+ "ld1 { v10.s }[2], [x28], #0x4\n"
"mov x19, #0x2c\n"
- "tbz x16, #0, 270f\n"
- "ld1 { v10.h }[6], [x13]\n"
- "ld1 { v14.h }[6], [x9]\n"
- "ld1 { v18.h }[6], [x27]\n"
- "ld1 { v22.h }[6], [x25]\n"
- "ld1 { v26.h }[6], [x23]\n"
- "ld1 { v30.h }[6], [x21]\n"
- "b 270f\n"
- "259:" // Height 6: Partial accumulate: partial_1_20
+ "ld1 { v14.s }[2], [x24], #0x4\n"
+ "ld1 { v18.s }[2], [x23], #0x4\n"
+ "ld1 { v22.s }[2], [x22], #0x4\n"
+ "ld1 { v26.s }[2], [x21], #0x4\n"
+ "ld1 { v30.s }[2], [x20], #0x4\n"
+ "tbz x11, #0, 264f\n"
+ "ld1 { v10.h }[6], [x28]\n"
+ "ld1 { v14.h }[6], [x24]\n"
+ "ld1 { v18.h }[6], [x23]\n"
+ "ld1 { v22.h }[6], [x22]\n"
+ "ld1 { v26.h }[6], [x21]\n"
+ "ld1 { v30.h }[6], [x20]\n"
+ "b 264f\n"
+ "253:" // Height 6: Partial accumulate: partial_1_20
"mov x19, #0x28\n"
- "tbz x16, #0, 270f\n"
- "ld1 { v10.h }[4], [x13]\n"
- "ld1 { v14.h }[4], [x9]\n"
- "ld1 { v18.h }[4], [x27]\n"
- "ld1 { v22.h }[4], [x25]\n"
- "ld1 { v26.h }[4], [x23]\n"
- "ld1 { v30.h }[4], [x21]\n"
- "b 270f\n"
- "260:" // Height 6: Partial accumulate: partial_2_16
- "tbz x16, #1, 261f\n"
- "ldr s10, [x13], #0x4\n"
- "ldr s14, [x9], #0x4\n"
- "ldr s18, [x27], #0x4\n"
- "ldr s22, [x25], #0x4\n"
- "ldr s26, [x23], #0x4\n"
- "ldr s30, [x21], #0x4\n"
+ "tbz x11, #0, 264f\n"
+ "ld1 { v10.h }[4], [x28]\n"
+ "ld1 { v14.h }[4], [x24]\n"
+ "ld1 { v18.h }[4], [x23]\n"
+ "ld1 { v22.h }[4], [x22]\n"
+ "ld1 { v26.h }[4], [x21]\n"
+ "ld1 { v30.h }[4], [x20]\n"
+ "b 264f\n"
+ "254:" // Height 6: Partial accumulate: partial_2_16
+ "tbz x11, #1, 255f\n"
+ "ldr s10, [x28], #0x4\n"
+ "ldr s14, [x24], #0x4\n"
"mov x19, #0x24\n"
- "tbz x16, #0, 270f\n"
- "ld1 { v10.h }[2], [x13]\n"
- "ld1 { v14.h }[2], [x9]\n"
- "ld1 { v18.h }[2], [x27]\n"
- "ld1 { v22.h }[2], [x25]\n"
- "ld1 { v26.h }[2], [x23]\n"
- "ld1 { v30.h }[2], [x21]\n"
- "b 270f\n"
- "261:" // Height 6: Partial accumulate: partial_1_16
+ "ldr s18, [x23], #0x4\n"
+ "ldr s22, [x22], #0x4\n"
+ "ldr s26, [x21], #0x4\n"
+ "ldr s30, [x20], #0x4\n"
+ "tbz x11, #0, 264f\n"
+ "ld1 { v10.h }[2], [x28]\n"
+ "ld1 { v14.h }[2], [x24]\n"
+ "ld1 { v18.h }[2], [x23]\n"
+ "ld1 { v22.h }[2], [x22]\n"
+ "ld1 { v26.h }[2], [x21]\n"
+ "ld1 { v30.h }[2], [x20]\n"
+ "b 264f\n"
+ "255:" // Height 6: Partial accumulate: partial_1_16
"mov x19, #0x20\n"
- "tbz x16, #0, 270f\n"
- "ldr h10, [x13, #0x0]\n"
- "ldr h14, [x9, #0x0]\n"
- "ldr h18, [x27, #0x0]\n"
- "ldr h22, [x25, #0x0]\n"
- "ldr h26, [x23, #0x0]\n"
- "ldr h30, [x21, #0x0]\n"
- "b 270f\n"
- "262:" // Height 6: Partial accumulate: partial_8_0
- "tbz x16, #3, 266f\n"
- "ld1 { v8.8h }, [x13], #0x10\n"
- "ld1 { v12.8h }, [x9], #0x10\n"
- "ld1 { v16.8h }, [x27], #0x10\n"
- "ld1 { v20.8h }, [x25], #0x10\n"
- "ld1 { v24.8h }, [x23], #0x10\n"
- "ld1 { v28.8h }, [x21], #0x10\n"
- "tbz x16, #2, 264f\n"
- "ldr d9, [x13], #0x8\n"
- "ldr d13, [x9], #0x8\n"
- "ldr d17, [x27], #0x8\n"
- "ldr d21, [x25], #0x8\n"
- "ldr d25, [x23], #0x8\n"
- "ldr d29, [x21], #0x8\n"
- "tbz x16, #1, 263f\n"
- "ld1 { v9.s }[2], [x13], #0x4\n"
- "ld1 { v13.s }[2], [x9], #0x4\n"
- "ld1 { v17.s }[2], [x27], #0x4\n"
- "ld1 { v21.s }[2], [x25], #0x4\n"
- "ld1 { v25.s }[2], [x23], #0x4\n"
- "ld1 { v29.s }[2], [x21], #0x4\n"
+ "tbz x11, #0, 264f\n"
+ "ldr h10, [x28, #0x0]\n"
+ "ldr h14, [x24, #0x0]\n"
+ "ldr h18, [x23, #0x0]\n"
+ "ldr h22, [x22, #0x0]\n"
+ "ldr h26, [x21, #0x0]\n"
+ "ldr h30, [x20, #0x0]\n"
+ "b 264f\n"
+ "256:" // Height 6: Partial accumulate: partial_8_0
+ "tbz x11, #3, 260f\n"
+ "ld1 { v8.8h }, [x28], #0x10\n"
+ "ld1 { v12.8h }, [x24], #0x10\n"
+ "ld1 { v16.8h }, [x23], #0x10\n"
+ "ld1 { v20.8h }, [x22], #0x10\n"
+ "ld1 { v24.8h }, [x21], #0x10\n"
+ "ld1 { v28.8h }, [x20], #0x10\n"
+ "tbz x11, #2, 258f\n"
+ "ldr d9, [x28], #0x8\n"
+ "ldr d13, [x24], #0x8\n"
+ "ldr d17, [x23], #0x8\n"
+ "ldr d21, [x22], #0x8\n"
+ "ldr d25, [x21], #0x8\n"
+ "ldr d29, [x20], #0x8\n"
+ "tbz x11, #1, 257f\n"
+ "ld1 { v9.s }[2], [x28], #0x4\n"
"mov x19, #0x1c\n"
- "tbz x16, #0, 270f\n"
- "ld1 { v9.h }[6], [x13]\n"
- "ld1 { v13.h }[6], [x9]\n"
- "ld1 { v17.h }[6], [x27]\n"
- "ld1 { v21.h }[6], [x25]\n"
- "ld1 { v25.h }[6], [x23]\n"
- "ld1 { v29.h }[6], [x21]\n"
- "b 270f\n"
- "263:" // Height 6: Partial accumulate: partial_1_12
+ "ld1 { v13.s }[2], [x24], #0x4\n"
+ "ld1 { v17.s }[2], [x23], #0x4\n"
+ "ld1 { v21.s }[2], [x22], #0x4\n"
+ "ld1 { v25.s }[2], [x21], #0x4\n"
+ "ld1 { v29.s }[2], [x20], #0x4\n"
+ "tbz x11, #0, 264f\n"
+ "ld1 { v9.h }[6], [x28]\n"
+ "ld1 { v13.h }[6], [x24]\n"
+ "ld1 { v17.h }[6], [x23]\n"
+ "ld1 { v21.h }[6], [x22]\n"
+ "ld1 { v25.h }[6], [x21]\n"
+ "ld1 { v29.h }[6], [x20]\n"
+ "b 264f\n"
+ "257:" // Height 6: Partial accumulate: partial_1_12
"mov x19, #0x18\n"
- "tbz x16, #0, 270f\n"
- "ld1 { v9.h }[4], [x13]\n"
- "ld1 { v13.h }[4], [x9]\n"
- "ld1 { v17.h }[4], [x27]\n"
- "ld1 { v21.h }[4], [x25]\n"
- "ld1 { v25.h }[4], [x23]\n"
- "ld1 { v29.h }[4], [x21]\n"
- "b 270f\n"
- "264:" // Height 6: Partial accumulate: partial_2_8
- "tbz x16, #1, 265f\n"
- "ldr s9, [x13], #0x4\n"
- "ldr s13, [x9], #0x4\n"
- "ldr s17, [x27], #0x4\n"
- "ldr s21, [x25], #0x4\n"
- "ldr s25, [x23], #0x4\n"
- "ldr s29, [x21], #0x4\n"
+ "tbz x11, #0, 264f\n"
+ "ld1 { v9.h }[4], [x28]\n"
+ "ld1 { v13.h }[4], [x24]\n"
+ "ld1 { v17.h }[4], [x23]\n"
+ "ld1 { v21.h }[4], [x22]\n"
+ "ld1 { v25.h }[4], [x21]\n"
+ "ld1 { v29.h }[4], [x20]\n"
+ "b 264f\n"
+ "258:" // Height 6: Partial accumulate: partial_2_8
+ "tbz x11, #1, 259f\n"
+ "ldr s9, [x28], #0x4\n"
+ "ldr s13, [x24], #0x4\n"
"mov x19, #0x14\n"
- "tbz x16, #0, 270f\n"
- "ld1 { v9.h }[2], [x13]\n"
- "ld1 { v13.h }[2], [x9]\n"
- "ld1 { v17.h }[2], [x27]\n"
- "ld1 { v21.h }[2], [x25]\n"
- "ld1 { v25.h }[2], [x23]\n"
- "ld1 { v29.h }[2], [x21]\n"
- "b 270f\n"
- "265:" // Height 6: Partial accumulate: partial_1_8
+ "ldr s17, [x23], #0x4\n"
+ "ldr s21, [x22], #0x4\n"
+ "ldr s25, [x21], #0x4\n"
+ "ldr s29, [x20], #0x4\n"
+ "tbz x11, #0, 264f\n"
+ "ld1 { v9.h }[2], [x28]\n"
+ "ld1 { v13.h }[2], [x24]\n"
+ "ld1 { v17.h }[2], [x23]\n"
+ "ld1 { v21.h }[2], [x22]\n"
+ "ld1 { v25.h }[2], [x21]\n"
+ "ld1 { v29.h }[2], [x20]\n"
+ "b 264f\n"
+ "259:" // Height 6: Partial accumulate: partial_1_8
"mov x19, #0x10\n"
- "tbz x16, #0, 270f\n"
- "ldr h9, [x13, #0x0]\n"
- "ldr h13, [x9, #0x0]\n"
- "ldr h17, [x27, #0x0]\n"
- "ldr h21, [x25, #0x0]\n"
- "ldr h25, [x23, #0x0]\n"
- "ldr h29, [x21, #0x0]\n"
- "b 270f\n"
- "266:" // Height 6: Partial accumulate: partial_4_0
- "tbz x16, #2, 268f\n"
- "ldr d8, [x13], #0x8\n"
- "ldr d12, [x9], #0x8\n"
- "ldr d16, [x27], #0x8\n"
- "ldr d20, [x25], #0x8\n"
- "ldr d24, [x23], #0x8\n"
- "ldr d28, [x21], #0x8\n"
- "tbz x16, #1, 267f\n"
- "ld1 { v8.s }[2], [x13], #0x4\n"
- "ld1 { v12.s }[2], [x9], #0x4\n"
- "ld1 { v16.s }[2], [x27], #0x4\n"
- "ld1 { v20.s }[2], [x25], #0x4\n"
- "ld1 { v24.s }[2], [x23], #0x4\n"
- "ld1 { v28.s }[2], [x21], #0x4\n"
+ "tbz x11, #0, 264f\n"
+ "ldr h9, [x28, #0x0]\n"
+ "ldr h13, [x24, #0x0]\n"
+ "ldr h17, [x23, #0x0]\n"
+ "ldr h21, [x22, #0x0]\n"
+ "ldr h25, [x21, #0x0]\n"
+ "ldr h29, [x20, #0x0]\n"
+ "b 264f\n"
+ "260:" // Height 6: Partial accumulate: partial_4_0
+ "tbz x11, #2, 262f\n"
+ "ldr d8, [x28], #0x8\n"
+ "ldr d12, [x24], #0x8\n"
+ "ldr d16, [x23], #0x8\n"
+ "ldr d20, [x22], #0x8\n"
+ "ldr d24, [x21], #0x8\n"
+ "ldr d28, [x20], #0x8\n"
+ "tbz x11, #1, 261f\n"
+ "ld1 { v8.s }[2], [x28], #0x4\n"
"mov x19, #0xc\n"
- "tbz x16, #0, 270f\n"
- "ld1 { v8.h }[6], [x13]\n"
- "ld1 { v12.h }[6], [x9]\n"
- "ld1 { v16.h }[6], [x27]\n"
- "ld1 { v20.h }[6], [x25]\n"
- "ld1 { v24.h }[6], [x23]\n"
- "ld1 { v28.h }[6], [x21]\n"
- "b 270f\n"
- "267:" // Height 6: Partial accumulate: partial_1_4
+ "ld1 { v12.s }[2], [x24], #0x4\n"
+ "ld1 { v16.s }[2], [x23], #0x4\n"
+ "ld1 { v20.s }[2], [x22], #0x4\n"
+ "ld1 { v24.s }[2], [x21], #0x4\n"
+ "ld1 { v28.s }[2], [x20], #0x4\n"
+ "tbz x11, #0, 264f\n"
+ "ld1 { v8.h }[6], [x28]\n"
+ "ld1 { v12.h }[6], [x24]\n"
+ "ld1 { v16.h }[6], [x23]\n"
+ "ld1 { v20.h }[6], [x22]\n"
+ "ld1 { v24.h }[6], [x21]\n"
+ "ld1 { v28.h }[6], [x20]\n"
+ "b 264f\n"
+ "261:" // Height 6: Partial accumulate: partial_1_4
"mov x19, #0x8\n"
- "tbz x16, #0, 270f\n"
- "ld1 { v8.h }[4], [x13]\n"
- "ld1 { v12.h }[4], [x9]\n"
- "ld1 { v16.h }[4], [x27]\n"
- "ld1 { v20.h }[4], [x25]\n"
- "ld1 { v24.h }[4], [x23]\n"
- "ld1 { v28.h }[4], [x21]\n"
- "b 270f\n"
- "268:" // Height 6: Partial accumulate: partial_2_0
- "tbz x16, #1, 269f\n"
- "ldr s8, [x13], #0x4\n"
- "ldr s12, [x9], #0x4\n"
- "ldr s16, [x27], #0x4\n"
- "ldr s20, [x25], #0x4\n"
- "ldr s24, [x23], #0x4\n"
- "ldr s28, [x21], #0x4\n"
+ "tbz x11, #0, 264f\n"
+ "ld1 { v8.h }[4], [x28]\n"
+ "ld1 { v12.h }[4], [x24]\n"
+ "ld1 { v16.h }[4], [x23]\n"
+ "ld1 { v20.h }[4], [x22]\n"
+ "ld1 { v24.h }[4], [x21]\n"
+ "ld1 { v28.h }[4], [x20]\n"
+ "b 264f\n"
+ "262:" // Height 6: Partial accumulate: partial_2_0
+ "tbz x11, #1, 263f\n"
+ "ldr s8, [x28], #0x4\n"
+ "ldr s12, [x24], #0x4\n"
"mov x19, #0x4\n"
- "tbz x16, #0, 270f\n"
- "ld1 { v8.h }[2], [x13]\n"
- "ld1 { v12.h }[2], [x9]\n"
- "ld1 { v16.h }[2], [x27]\n"
- "ld1 { v20.h }[2], [x25]\n"
- "ld1 { v24.h }[2], [x23]\n"
- "ld1 { v28.h }[2], [x21]\n"
- "b 270f\n"
- "269:" // Height 6: Partial accumulate: partial_1_0
+ "ldr s16, [x23], #0x4\n"
+ "ldr s20, [x22], #0x4\n"
+ "ldr s24, [x21], #0x4\n"
+ "ldr s28, [x20], #0x4\n"
+ "tbz x11, #0, 264f\n"
+ "ld1 { v8.h }[2], [x28]\n"
+ "ld1 { v12.h }[2], [x24]\n"
+ "ld1 { v16.h }[2], [x23]\n"
+ "ld1 { v20.h }[2], [x22]\n"
+ "ld1 { v24.h }[2], [x21]\n"
+ "ld1 { v28.h }[2], [x20]\n"
+ "b 264f\n"
+ "263:" // Height 6: Partial accumulate: partial_1_0
+ "ldr h8, [x28, #0x0]\n"
"mov x19, #0x0\n"
- "ldr h8, [x13, #0x0]\n"
- "ldr h12, [x9, #0x0]\n"
- "ldr h16, [x27, #0x0]\n"
- "ldr h20, [x25, #0x0]\n"
- "ldr h24, [x23, #0x0]\n"
- "ldr h28, [x21, #0x0]\n"
- "270:" // Height 6: Partial accumulate: Done
- "sub x13, x13, x19\n"
- "sub x9, x9, x19\n"
- "sub x27, x27, x19\n"
- "sub x25, x25, x19\n"
- "sub x23, x23, x19\n"
- "sub x21, x21, x19\n"
- "b 273f\n"
- "271:" // Height 6: full accumulate
- "ldr q8, [x13, #0x0]\n"
- "ldr q9, [x13, #0x10]\n"
- "ldr q10, [x13, #0x20]\n"
- "ldr q11, [x13, #0x30]\n"
- "ldr q12, [x9, #0x0]\n"
- "ldr q13, [x9, #0x10]\n"
- "ldr q14, [x9, #0x20]\n"
- "ldr q15, [x9, #0x30]\n"
- "ldr q16, [x27, #0x0]\n"
- "ldr q17, [x27, #0x10]\n"
- "ldr q18, [x27, #0x20]\n"
- "ldr q19, [x27, #0x30]\n"
- "ldr q20, [x25, #0x0]\n"
- "ldr q21, [x25, #0x10]\n"
- "ldr q22, [x25, #0x20]\n"
- "ldr q23, [x25, #0x30]\n"
- "ldr q24, [x23, #0x0]\n"
- "ldr q25, [x23, #0x10]\n"
- "ldr q26, [x23, #0x20]\n"
- "ldr q27, [x23, #0x30]\n"
- "ldr q28, [x21, #0x0]\n"
- "ldr q29, [x21, #0x10]\n"
- "ldr q30, [x21, #0x20]\n"
- "ldr q31, [x21, #0x30]\n"
- "b 273f\n"
- "272:" // Height 6: no accumulate
+ "ldr h12, [x24, #0x0]\n"
+ "ldr h16, [x23, #0x0]\n"
+ "ldr h20, [x22, #0x0]\n"
+ "ldr h24, [x21, #0x0]\n"
+ "ldr h28, [x20, #0x0]\n"
+ "264:" // Height 6: Partial accumulate: Done
+ "sub x28, x28, x19\n"
+ "b 267f\n"
+ "265:" // Height 6: full accumulate
+ "ldr q8, [x28, #0x0]\n"
+ "ldr q9, [x28, #0x10]\n"
+ "ldr q10, [x28, #0x20]\n"
+ "ldr q11, [x28, #0x30]\n"
+ "ldr q12, [x24, #0x0]\n"
+ "ldr q13, [x24, #0x10]\n"
+ "ldr q14, [x24, #0x20]\n"
+ "ldr q15, [x24, #0x30]\n"
+ "ldr q16, [x23, #0x0]\n"
+ "ldr q17, [x23, #0x10]\n"
+ "ldr q18, [x23, #0x20]\n"
+ "ldr q19, [x23, #0x30]\n"
+ "ldr q20, [x22, #0x0]\n"
+ "ldr q21, [x22, #0x10]\n"
+ "ldr q22, [x22, #0x20]\n"
+ "ldr q23, [x22, #0x30]\n"
+ "ldr q24, [x21, #0x0]\n"
+ "ldr q25, [x21, #0x10]\n"
+ "ldr q26, [x21, #0x20]\n"
+ "ldr q27, [x21, #0x30]\n"
+ "ldr q28, [x20, #0x0]\n"
+ "ldr q29, [x20, #0x10]\n"
+ "ldr q30, [x20, #0x20]\n"
+ "ldr q31, [x20, #0x30]\n"
+ "b 267f\n"
+ "266:" // Height 6: no accumulate
"movi v8.16b, #0x0\n"
"movi v9.16b, #0x0\n"
"movi v10.16b, #0x0\n"
@@ -4454,523 +4384,523 @@ void a64_hybrid_fp16_mla_6x32 (
"movi v29.16b, #0x0\n"
"movi v30.16b, #0x0\n"
"movi v31.16b, #0x0\n"
- "273:" // Height 6: setup done
- "mov x12, #0x0\n"
- "274:" // Height 6: String loop
+ "267:" // Height 6: setup done
+ "mov x27, #0x0\n"
+ "268:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w11, [x20, x12, LSL #0x2]\n"
- "tbz %x[flags], #3, 275f\n"
- "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 269f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x10, [x20, #0x0]\n"
- "ldr x28, [x20, #0x8]\n"
- "ldr x26, [x20, #0x10]\n"
- "ldr x24, [x20, #0x18]\n"
- "ldr x22, [x20, #0x20]\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "ldr x22, [x20, #0x18]\n"
+ "ldr x21, [x20, #0x20]\n"
"ldr x20, [x20, #0x28]\n"
- "cbnz x12, 276f\n"
+ "cbnz x27, 270f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x10, x10, x19, LSL #1\n"
- "add x28, x28, x19, LSL #1\n"
- "add x26, x26, x19, LSL #1\n"
+ "add x25, x25, x19, LSL #1\n"
"add x24, x24, x19, LSL #1\n"
+ "add x23, x23, x19, LSL #1\n"
"add x22, x22, x19, LSL #1\n"
+ "add x21, x21, x19, LSL #1\n"
"add x20, x20, x19, LSL #1\n"
- "b 276f\n"
- "275:" // Height 6: setup direct input
- "mov x10, %x[input_ptr]\n"
- "add x28, x10, x19, LSL #1\n"
- "add x26, x28, x19, LSL #1\n"
- "add x24, x26, x19, LSL #1\n"
- "add x22, x24, x19, LSL #1\n"
- "add x20, x22, x19, LSL #1\n"
- "276:" // Height 6: input setup done
- "cmp x11, #0x8\n"
- "blt 279f\n"
- "cmp x11, #0x10\n"
- "blt 278f\n"
- "277:" // Height 6: Multiply loop: Main loop head
- "ldr q0, [x10, #0x0]\n"
- "ldr q1, [x28, #0x0]\n"
- "ldr q2, [x26, #0x0]\n"
- "ldr q3, [x24, #0x0]\n"
- "ldr q4, [x22, #0x0]\n"
+ "b 270f\n"
+ "269:" // Height 6: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19, LSL #1\n"
+ "add x23, x24, x19, LSL #1\n"
+ "add x22, x23, x19, LSL #1\n"
+ "add x21, x22, x19, LSL #1\n"
+ "add x20, x21, x19, LSL #1\n"
+ "270:" // Height 6: input setup done
+ "cmp x26, #0x8\n"
+ "blt 273f\n"
+ "ldr q0, [x25, #0x0]\n"
+ "ldr q1, [x24, #0x0]\n"
+ "cmp x26, #0x10\n"
+ "ldr q2, [x23, #0x0]\n"
+ "ldr q3, [x22, #0x0]\n"
+ "ldr q4, [x21, #0x0]\n"
"ldr q5, [x20, #0x0]\n"
- "ldr q6, [x15, #0x0]\n"
+ "ldr q6, [x10, #0x0]\n"
+ "blt 272f\n"
+ "271:" // Height 6: Multiply loop: Main loop head
"fmla v8.8h, v6.8h, v0.h[0]\n"
- "ldr q7, [x15, #0x10]\n"
+ "ldr q7, [x10, #0x10]\n"
+ "add x25, x25, #0x10\n"
"fmla v12.8h, v6.8h, v1.h[0]\n"
- "add x10, x10, #0x10\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x24, x24, #0x10\n"
"fmla v16.8h, v6.8h, v2.h[0]\n"
- "add x28, x28, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "add x23, x23, #0x10\n"
"fmla v20.8h, v6.8h, v3.h[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "add x26, x26, #0x10\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "add x22, x22, #0x10\n"
"fmla v24.8h, v6.8h, v4.h[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "add x24, x24, #0x10\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "add x21, x21, #0x10\n"
"fmla v28.8h, v6.8h, v5.h[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "ldr q6, [x15, #0x20]\n"
+ "ldr q6, [x10, #0x20]\n"
+ "add x20, x20, #0x10\n"
"fmla v9.8h, v7.8h, v0.h[0]\n"
- "add x22, x22, #0x10\n"
- "prfm pldl1keep, [x22, #0x80]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
+ "sub x26, x26, #0x8\n"
"fmla v13.8h, v7.8h, v1.h[0]\n"
- "add x20, x20, #0x10\n"
- "fmla v17.8h, v7.8h, v2.h[0]\n"
"prfm pldl1keep, [x20, #0x80]\n"
- "sub x11, x11, #0x8\n"
+ "cmp x26, #0x10\n"
+ "fmla v17.8h, v7.8h, v2.h[0]\n"
"fmla v21.8h, v7.8h, v3.h[0]\n"
- "cmp x11, #0x10\n"
"fmla v25.8h, v7.8h, v4.h[0]\n"
"fmla v29.8h, v7.8h, v5.h[0]\n"
- "ldr q7, [x15, #0x30]\n"
+ "ldr q7, [x10, #0x30]\n"
"fmla v10.8h, v6.8h, v0.h[0]\n"
"fmla v14.8h, v6.8h, v1.h[0]\n"
"fmla v18.8h, v6.8h, v2.h[0]\n"
"fmla v22.8h, v6.8h, v3.h[0]\n"
"fmla v26.8h, v6.8h, v4.h[0]\n"
"fmla v30.8h, v6.8h, v5.h[0]\n"
- "ldr q6, [x15, #0x40]\n"
+ "ldr q6, [x10, #0x40]\n"
"fmla v11.8h, v7.8h, v0.h[0]\n"
"fmla v15.8h, v7.8h, v1.h[0]\n"
"fmla v19.8h, v7.8h, v2.h[0]\n"
"fmla v23.8h, v7.8h, v3.h[0]\n"
"fmla v27.8h, v7.8h, v4.h[0]\n"
"fmla v31.8h, v7.8h, v5.h[0]\n"
- "ldr q7, [x15, #0x50]\n"
+ "ldr q7, [x10, #0x50]\n"
"fmla v8.8h, v6.8h, v0.h[1]\n"
"fmla v12.8h, v6.8h, v1.h[1]\n"
"fmla v16.8h, v6.8h, v2.h[1]\n"
"fmla v20.8h, v6.8h, v3.h[1]\n"
"fmla v24.8h, v6.8h, v4.h[1]\n"
"fmla v28.8h, v6.8h, v5.h[1]\n"
- "ldr q6, [x15, #0x60]\n"
+ "ldr q6, [x10, #0x60]\n"
"fmla v9.8h, v7.8h, v0.h[1]\n"
"fmla v13.8h, v7.8h, v1.h[1]\n"
"fmla v17.8h, v7.8h, v2.h[1]\n"
"fmla v21.8h, v7.8h, v3.h[1]\n"
"fmla v25.8h, v7.8h, v4.h[1]\n"
"fmla v29.8h, v7.8h, v5.h[1]\n"
- "ldr q7, [x15, #0x70]\n"
+ "ldr q7, [x10, #0x70]\n"
"fmla v10.8h, v6.8h, v0.h[1]\n"
"fmla v14.8h, v6.8h, v1.h[1]\n"
"fmla v18.8h, v6.8h, v2.h[1]\n"
"fmla v22.8h, v6.8h, v3.h[1]\n"
"fmla v26.8h, v6.8h, v4.h[1]\n"
"fmla v30.8h, v6.8h, v5.h[1]\n"
- "ldr q6, [x15, #0x80]\n"
+ "ldr q6, [x10, #0x80]\n"
"fmla v11.8h, v7.8h, v0.h[1]\n"
"fmla v15.8h, v7.8h, v1.h[1]\n"
"fmla v19.8h, v7.8h, v2.h[1]\n"
"fmla v23.8h, v7.8h, v3.h[1]\n"
"fmla v27.8h, v7.8h, v4.h[1]\n"
"fmla v31.8h, v7.8h, v5.h[1]\n"
- "ldr q7, [x15, #0x90]\n"
+ "ldr q7, [x10, #0x90]\n"
"fmla v8.8h, v6.8h, v0.h[2]\n"
"fmla v12.8h, v6.8h, v1.h[2]\n"
"fmla v16.8h, v6.8h, v2.h[2]\n"
"fmla v20.8h, v6.8h, v3.h[2]\n"
"fmla v24.8h, v6.8h, v4.h[2]\n"
"fmla v28.8h, v6.8h, v5.h[2]\n"
- "ldr q6, [x15, #0xa0]\n"
+ "ldr q6, [x10, #0xa0]\n"
"fmla v9.8h, v7.8h, v0.h[2]\n"
"fmla v13.8h, v7.8h, v1.h[2]\n"
"fmla v17.8h, v7.8h, v2.h[2]\n"
"fmla v21.8h, v7.8h, v3.h[2]\n"
"fmla v25.8h, v7.8h, v4.h[2]\n"
"fmla v29.8h, v7.8h, v5.h[2]\n"
- "ldr q7, [x15, #0xb0]\n"
+ "ldr q7, [x10, #0xb0]\n"
"fmla v10.8h, v6.8h, v0.h[2]\n"
"fmla v14.8h, v6.8h, v1.h[2]\n"
"fmla v18.8h, v6.8h, v2.h[2]\n"
"fmla v22.8h, v6.8h, v3.h[2]\n"
"fmla v26.8h, v6.8h, v4.h[2]\n"
"fmla v30.8h, v6.8h, v5.h[2]\n"
- "ldr q6, [x15, #0xc0]\n"
+ "ldr q6, [x10, #0xc0]\n"
"fmla v11.8h, v7.8h, v0.h[2]\n"
"fmla v15.8h, v7.8h, v1.h[2]\n"
"fmla v19.8h, v7.8h, v2.h[2]\n"
"fmla v23.8h, v7.8h, v3.h[2]\n"
"fmla v27.8h, v7.8h, v4.h[2]\n"
"fmla v31.8h, v7.8h, v5.h[2]\n"
- "ldr q7, [x15, #0xd0]\n"
+ "ldr q7, [x10, #0xd0]\n"
"fmla v8.8h, v6.8h, v0.h[3]\n"
"fmla v12.8h, v6.8h, v1.h[3]\n"
"fmla v16.8h, v6.8h, v2.h[3]\n"
"fmla v20.8h, v6.8h, v3.h[3]\n"
"fmla v24.8h, v6.8h, v4.h[3]\n"
"fmla v28.8h, v6.8h, v5.h[3]\n"
- "ldr q6, [x15, #0xe0]\n"
+ "ldr q6, [x10, #0xe0]\n"
"fmla v9.8h, v7.8h, v0.h[3]\n"
"fmla v13.8h, v7.8h, v1.h[3]\n"
"fmla v17.8h, v7.8h, v2.h[3]\n"
"fmla v21.8h, v7.8h, v3.h[3]\n"
"fmla v25.8h, v7.8h, v4.h[3]\n"
"fmla v29.8h, v7.8h, v5.h[3]\n"
- "ldr q7, [x15, #0xf0]\n"
+ "ldr q7, [x10, #0xf0]\n"
"fmla v10.8h, v6.8h, v0.h[3]\n"
"fmla v14.8h, v6.8h, v1.h[3]\n"
"fmla v18.8h, v6.8h, v2.h[3]\n"
"fmla v22.8h, v6.8h, v3.h[3]\n"
"fmla v26.8h, v6.8h, v4.h[3]\n"
"fmla v30.8h, v6.8h, v5.h[3]\n"
- "ldr q6, [x15, #0x100]\n"
+ "ldr q6, [x10, #0x100]\n"
"fmla v11.8h, v7.8h, v0.h[3]\n"
"fmla v15.8h, v7.8h, v1.h[3]\n"
"fmla v19.8h, v7.8h, v2.h[3]\n"
"fmla v23.8h, v7.8h, v3.h[3]\n"
"fmla v27.8h, v7.8h, v4.h[3]\n"
"fmla v31.8h, v7.8h, v5.h[3]\n"
- "ldr q7, [x15, #0x110]\n"
+ "ldr q7, [x10, #0x110]\n"
"fmla v8.8h, v6.8h, v0.h[4]\n"
"fmla v12.8h, v6.8h, v1.h[4]\n"
"fmla v16.8h, v6.8h, v2.h[4]\n"
"fmla v20.8h, v6.8h, v3.h[4]\n"
"fmla v24.8h, v6.8h, v4.h[4]\n"
"fmla v28.8h, v6.8h, v5.h[4]\n"
- "ldr q6, [x15, #0x120]\n"
+ "ldr q6, [x10, #0x120]\n"
"fmla v9.8h, v7.8h, v0.h[4]\n"
"fmla v13.8h, v7.8h, v1.h[4]\n"
"fmla v17.8h, v7.8h, v2.h[4]\n"
"fmla v21.8h, v7.8h, v3.h[4]\n"
"fmla v25.8h, v7.8h, v4.h[4]\n"
"fmla v29.8h, v7.8h, v5.h[4]\n"
- "ldr q7, [x15, #0x130]\n"
+ "ldr q7, [x10, #0x130]\n"
"fmla v10.8h, v6.8h, v0.h[4]\n"
"fmla v14.8h, v6.8h, v1.h[4]\n"
"fmla v18.8h, v6.8h, v2.h[4]\n"
"fmla v22.8h, v6.8h, v3.h[4]\n"
"fmla v26.8h, v6.8h, v4.h[4]\n"
"fmla v30.8h, v6.8h, v5.h[4]\n"
- "ldr q6, [x15, #0x140]\n"
+ "ldr q6, [x10, #0x140]\n"
"fmla v11.8h, v7.8h, v0.h[4]\n"
"fmla v15.8h, v7.8h, v1.h[4]\n"
"fmla v19.8h, v7.8h, v2.h[4]\n"
"fmla v23.8h, v7.8h, v3.h[4]\n"
"fmla v27.8h, v7.8h, v4.h[4]\n"
"fmla v31.8h, v7.8h, v5.h[4]\n"
- "ldr q7, [x15, #0x150]\n"
+ "ldr q7, [x10, #0x150]\n"
"fmla v8.8h, v6.8h, v0.h[5]\n"
"fmla v12.8h, v6.8h, v1.h[5]\n"
"fmla v16.8h, v6.8h, v2.h[5]\n"
"fmla v20.8h, v6.8h, v3.h[5]\n"
"fmla v24.8h, v6.8h, v4.h[5]\n"
"fmla v28.8h, v6.8h, v5.h[5]\n"
- "ldr q6, [x15, #0x160]\n"
+ "ldr q6, [x10, #0x160]\n"
"fmla v9.8h, v7.8h, v0.h[5]\n"
"fmla v13.8h, v7.8h, v1.h[5]\n"
"fmla v17.8h, v7.8h, v2.h[5]\n"
"fmla v21.8h, v7.8h, v3.h[5]\n"
"fmla v25.8h, v7.8h, v4.h[5]\n"
"fmla v29.8h, v7.8h, v5.h[5]\n"
- "ldr q7, [x15, #0x170]\n"
+ "ldr q7, [x10, #0x170]\n"
"fmla v10.8h, v6.8h, v0.h[5]\n"
"fmla v14.8h, v6.8h, v1.h[5]\n"
"fmla v18.8h, v6.8h, v2.h[5]\n"
"fmla v22.8h, v6.8h, v3.h[5]\n"
"fmla v26.8h, v6.8h, v4.h[5]\n"
"fmla v30.8h, v6.8h, v5.h[5]\n"
- "ldr q6, [x15, #0x180]\n"
+ "ldr q6, [x10, #0x180]\n"
"fmla v11.8h, v7.8h, v0.h[5]\n"
"fmla v15.8h, v7.8h, v1.h[5]\n"
"fmla v19.8h, v7.8h, v2.h[5]\n"
"fmla v23.8h, v7.8h, v3.h[5]\n"
"fmla v27.8h, v7.8h, v4.h[5]\n"
"fmla v31.8h, v7.8h, v5.h[5]\n"
- "ldr q7, [x15, #0x190]\n"
+ "ldr q7, [x10, #0x190]\n"
"fmla v8.8h, v6.8h, v0.h[6]\n"
"fmla v12.8h, v6.8h, v1.h[6]\n"
"fmla v16.8h, v6.8h, v2.h[6]\n"
"fmla v20.8h, v6.8h, v3.h[6]\n"
"fmla v24.8h, v6.8h, v4.h[6]\n"
"fmla v28.8h, v6.8h, v5.h[6]\n"
- "ldr q6, [x15, #0x1a0]\n"
+ "ldr q6, [x10, #0x1a0]\n"
"fmla v9.8h, v7.8h, v0.h[6]\n"
"fmla v13.8h, v7.8h, v1.h[6]\n"
"fmla v17.8h, v7.8h, v2.h[6]\n"
"fmla v21.8h, v7.8h, v3.h[6]\n"
"fmla v25.8h, v7.8h, v4.h[6]\n"
"fmla v29.8h, v7.8h, v5.h[6]\n"
- "ldr q7, [x15, #0x1b0]\n"
+ "ldr q7, [x10, #0x1b0]\n"
"fmla v10.8h, v6.8h, v0.h[6]\n"
"fmla v14.8h, v6.8h, v1.h[6]\n"
"fmla v18.8h, v6.8h, v2.h[6]\n"
"fmla v22.8h, v6.8h, v3.h[6]\n"
"fmla v26.8h, v6.8h, v4.h[6]\n"
"fmla v30.8h, v6.8h, v5.h[6]\n"
- "ldr q6, [x15, #0x1c0]\n"
+ "ldr q6, [x10, #0x1c0]\n"
"fmla v11.8h, v7.8h, v0.h[6]\n"
"fmla v15.8h, v7.8h, v1.h[6]\n"
"fmla v19.8h, v7.8h, v2.h[6]\n"
"fmla v23.8h, v7.8h, v3.h[6]\n"
"fmla v27.8h, v7.8h, v4.h[6]\n"
"fmla v31.8h, v7.8h, v5.h[6]\n"
- "ldr q7, [x15, #0x1d0]\n"
+ "ldr q7, [x10, #0x1d0]\n"
"fmla v8.8h, v6.8h, v0.h[7]\n"
"fmla v12.8h, v6.8h, v1.h[7]\n"
"fmla v16.8h, v6.8h, v2.h[7]\n"
"fmla v20.8h, v6.8h, v3.h[7]\n"
"fmla v24.8h, v6.8h, v4.h[7]\n"
"fmla v28.8h, v6.8h, v5.h[7]\n"
- "ldr q6, [x15, #0x1e0]\n"
+ "ldr q6, [x10, #0x1e0]\n"
"fmla v9.8h, v7.8h, v0.h[7]\n"
"fmla v13.8h, v7.8h, v1.h[7]\n"
"fmla v17.8h, v7.8h, v2.h[7]\n"
"fmla v21.8h, v7.8h, v3.h[7]\n"
"fmla v25.8h, v7.8h, v4.h[7]\n"
"fmla v29.8h, v7.8h, v5.h[7]\n"
- "ldr q7, [x15, #0x1f0]\n"
+ "ldr q7, [x10, #0x1f0]\n"
+ "add x10, x10, #0x200\n"
"fmla v10.8h, v6.8h, v0.h[7]\n"
- "add x15, x15, #0x200\n"
"fmla v14.8h, v6.8h, v1.h[7]\n"
"fmla v18.8h, v6.8h, v2.h[7]\n"
"fmla v22.8h, v6.8h, v3.h[7]\n"
"fmla v26.8h, v6.8h, v4.h[7]\n"
"fmla v30.8h, v6.8h, v5.h[7]\n"
+ "ldr q6, [x10, #0x0]\n"
"fmla v11.8h, v7.8h, v0.h[7]\n"
+ "ldr q0, [x25, #0x0]\n"
"fmla v15.8h, v7.8h, v1.h[7]\n"
+ "ldr q1, [x24, #0x0]\n"
"fmla v19.8h, v7.8h, v2.h[7]\n"
+ "ldr q2, [x23, #0x0]\n"
"fmla v23.8h, v7.8h, v3.h[7]\n"
+ "ldr q3, [x22, #0x0]\n"
"fmla v27.8h, v7.8h, v4.h[7]\n"
+ "ldr q4, [x21, #0x0]\n"
"fmla v31.8h, v7.8h, v5.h[7]\n"
- "bge 277b\n"
- "278:" // Height 6: Multiply loop: Single iteration only
- "sub x11, x11, #0x8\n"
- "ldr q0, [x10, #0x0]\n"
- "ldr q1, [x28, #0x0]\n"
- "ldr q2, [x26, #0x0]\n"
- "ldr q3, [x24, #0x0]\n"
- "ldr q4, [x22, #0x0]\n"
"ldr q5, [x20, #0x0]\n"
- "ldr q6, [x15, #0x0]\n"
+ "bge 271b\n"
+ "272:" // Height 6: Multiply loop: Single iteration only
"fmla v8.8h, v6.8h, v0.h[0]\n"
- "ldr q7, [x15, #0x10]\n"
+ "ldr q7, [x10, #0x10]\n"
+ "sub x26, x26, #0x8\n"
"fmla v12.8h, v6.8h, v1.h[0]\n"
- "add x10, x10, #0x10\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "add x25, x25, #0x10\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
"fmla v16.8h, v6.8h, v2.h[0]\n"
- "add x28, x28, #0x10\n"
- "fmla v20.8h, v6.8h, v3.h[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "add x26, x26, #0x10\n"
- "fmla v24.8h, v6.8h, v4.h[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
"add x24, x24, #0x10\n"
- "fmla v28.8h, v6.8h, v5.h[0]\n"
+ "fmla v20.8h, v6.8h, v3.h[0]\n"
"prfm pldl1keep, [x24, #0x80]\n"
- "ldr q6, [x15, #0x20]\n"
- "fmla v9.8h, v7.8h, v0.h[0]\n"
+ "add x23, x23, #0x10\n"
+ "fmla v24.8h, v6.8h, v4.h[0]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
"add x22, x22, #0x10\n"
+ "fmla v28.8h, v6.8h, v5.h[0]\n"
"prfm pldl1keep, [x22, #0x80]\n"
- "fmla v13.8h, v7.8h, v1.h[0]\n"
+ "add x21, x21, #0x10\n"
+ "fmla v9.8h, v7.8h, v0.h[0]\n"
+ "ldr q6, [x10, #0x20]\n"
"add x20, x20, #0x10\n"
+ "fmla v13.8h, v7.8h, v1.h[0]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
"fmla v17.8h, v7.8h, v2.h[0]\n"
"prfm pldl1keep, [x20, #0x80]\n"
"fmla v21.8h, v7.8h, v3.h[0]\n"
"fmla v25.8h, v7.8h, v4.h[0]\n"
"fmla v29.8h, v7.8h, v5.h[0]\n"
- "ldr q7, [x15, #0x30]\n"
+ "ldr q7, [x10, #0x30]\n"
"fmla v10.8h, v6.8h, v0.h[0]\n"
"fmla v14.8h, v6.8h, v1.h[0]\n"
"fmla v18.8h, v6.8h, v2.h[0]\n"
"fmla v22.8h, v6.8h, v3.h[0]\n"
"fmla v26.8h, v6.8h, v4.h[0]\n"
"fmla v30.8h, v6.8h, v5.h[0]\n"
- "ldr q6, [x15, #0x40]\n"
+ "ldr q6, [x10, #0x40]\n"
"fmla v11.8h, v7.8h, v0.h[0]\n"
"fmla v15.8h, v7.8h, v1.h[0]\n"
"fmla v19.8h, v7.8h, v2.h[0]\n"
"fmla v23.8h, v7.8h, v3.h[0]\n"
"fmla v27.8h, v7.8h, v4.h[0]\n"
"fmla v31.8h, v7.8h, v5.h[0]\n"
- "ldr q7, [x15, #0x50]\n"
+ "ldr q7, [x10, #0x50]\n"
"fmla v8.8h, v6.8h, v0.h[1]\n"
"fmla v12.8h, v6.8h, v1.h[1]\n"
"fmla v16.8h, v6.8h, v2.h[1]\n"
"fmla v20.8h, v6.8h, v3.h[1]\n"
"fmla v24.8h, v6.8h, v4.h[1]\n"
"fmla v28.8h, v6.8h, v5.h[1]\n"
- "ldr q6, [x15, #0x60]\n"
+ "ldr q6, [x10, #0x60]\n"
"fmla v9.8h, v7.8h, v0.h[1]\n"
"fmla v13.8h, v7.8h, v1.h[1]\n"
"fmla v17.8h, v7.8h, v2.h[1]\n"
"fmla v21.8h, v7.8h, v3.h[1]\n"
"fmla v25.8h, v7.8h, v4.h[1]\n"
"fmla v29.8h, v7.8h, v5.h[1]\n"
- "ldr q7, [x15, #0x70]\n"
+ "ldr q7, [x10, #0x70]\n"
"fmla v10.8h, v6.8h, v0.h[1]\n"
"fmla v14.8h, v6.8h, v1.h[1]\n"
"fmla v18.8h, v6.8h, v2.h[1]\n"
"fmla v22.8h, v6.8h, v3.h[1]\n"
"fmla v26.8h, v6.8h, v4.h[1]\n"
"fmla v30.8h, v6.8h, v5.h[1]\n"
- "ldr q6, [x15, #0x80]\n"
+ "ldr q6, [x10, #0x80]\n"
"fmla v11.8h, v7.8h, v0.h[1]\n"
"fmla v15.8h, v7.8h, v1.h[1]\n"
"fmla v19.8h, v7.8h, v2.h[1]\n"
"fmla v23.8h, v7.8h, v3.h[1]\n"
"fmla v27.8h, v7.8h, v4.h[1]\n"
"fmla v31.8h, v7.8h, v5.h[1]\n"
- "ldr q7, [x15, #0x90]\n"
+ "ldr q7, [x10, #0x90]\n"
"fmla v8.8h, v6.8h, v0.h[2]\n"
"fmla v12.8h, v6.8h, v1.h[2]\n"
"fmla v16.8h, v6.8h, v2.h[2]\n"
"fmla v20.8h, v6.8h, v3.h[2]\n"
"fmla v24.8h, v6.8h, v4.h[2]\n"
"fmla v28.8h, v6.8h, v5.h[2]\n"
- "ldr q6, [x15, #0xa0]\n"
+ "ldr q6, [x10, #0xa0]\n"
"fmla v9.8h, v7.8h, v0.h[2]\n"
"fmla v13.8h, v7.8h, v1.h[2]\n"
"fmla v17.8h, v7.8h, v2.h[2]\n"
"fmla v21.8h, v7.8h, v3.h[2]\n"
"fmla v25.8h, v7.8h, v4.h[2]\n"
"fmla v29.8h, v7.8h, v5.h[2]\n"
- "ldr q7, [x15, #0xb0]\n"
+ "ldr q7, [x10, #0xb0]\n"
"fmla v10.8h, v6.8h, v0.h[2]\n"
"fmla v14.8h, v6.8h, v1.h[2]\n"
"fmla v18.8h, v6.8h, v2.h[2]\n"
"fmla v22.8h, v6.8h, v3.h[2]\n"
"fmla v26.8h, v6.8h, v4.h[2]\n"
"fmla v30.8h, v6.8h, v5.h[2]\n"
- "ldr q6, [x15, #0xc0]\n"
+ "ldr q6, [x10, #0xc0]\n"
"fmla v11.8h, v7.8h, v0.h[2]\n"
"fmla v15.8h, v7.8h, v1.h[2]\n"
"fmla v19.8h, v7.8h, v2.h[2]\n"
"fmla v23.8h, v7.8h, v3.h[2]\n"
"fmla v27.8h, v7.8h, v4.h[2]\n"
"fmla v31.8h, v7.8h, v5.h[2]\n"
- "ldr q7, [x15, #0xd0]\n"
+ "ldr q7, [x10, #0xd0]\n"
"fmla v8.8h, v6.8h, v0.h[3]\n"
"fmla v12.8h, v6.8h, v1.h[3]\n"
"fmla v16.8h, v6.8h, v2.h[3]\n"
"fmla v20.8h, v6.8h, v3.h[3]\n"
"fmla v24.8h, v6.8h, v4.h[3]\n"
"fmla v28.8h, v6.8h, v5.h[3]\n"
- "ldr q6, [x15, #0xe0]\n"
+ "ldr q6, [x10, #0xe0]\n"
"fmla v9.8h, v7.8h, v0.h[3]\n"
"fmla v13.8h, v7.8h, v1.h[3]\n"
"fmla v17.8h, v7.8h, v2.h[3]\n"
"fmla v21.8h, v7.8h, v3.h[3]\n"
"fmla v25.8h, v7.8h, v4.h[3]\n"
"fmla v29.8h, v7.8h, v5.h[3]\n"
- "ldr q7, [x15, #0xf0]\n"
+ "ldr q7, [x10, #0xf0]\n"
"fmla v10.8h, v6.8h, v0.h[3]\n"
"fmla v14.8h, v6.8h, v1.h[3]\n"
"fmla v18.8h, v6.8h, v2.h[3]\n"
"fmla v22.8h, v6.8h, v3.h[3]\n"
"fmla v26.8h, v6.8h, v4.h[3]\n"
"fmla v30.8h, v6.8h, v5.h[3]\n"
- "ldr q6, [x15, #0x100]\n"
+ "ldr q6, [x10, #0x100]\n"
"fmla v11.8h, v7.8h, v0.h[3]\n"
"fmla v15.8h, v7.8h, v1.h[3]\n"
"fmla v19.8h, v7.8h, v2.h[3]\n"
"fmla v23.8h, v7.8h, v3.h[3]\n"
"fmla v27.8h, v7.8h, v4.h[3]\n"
"fmla v31.8h, v7.8h, v5.h[3]\n"
- "ldr q7, [x15, #0x110]\n"
+ "ldr q7, [x10, #0x110]\n"
"fmla v8.8h, v6.8h, v0.h[4]\n"
"fmla v12.8h, v6.8h, v1.h[4]\n"
"fmla v16.8h, v6.8h, v2.h[4]\n"
"fmla v20.8h, v6.8h, v3.h[4]\n"
"fmla v24.8h, v6.8h, v4.h[4]\n"
"fmla v28.8h, v6.8h, v5.h[4]\n"
- "ldr q6, [x15, #0x120]\n"
+ "ldr q6, [x10, #0x120]\n"
"fmla v9.8h, v7.8h, v0.h[4]\n"
"fmla v13.8h, v7.8h, v1.h[4]\n"
"fmla v17.8h, v7.8h, v2.h[4]\n"
"fmla v21.8h, v7.8h, v3.h[4]\n"
"fmla v25.8h, v7.8h, v4.h[4]\n"
"fmla v29.8h, v7.8h, v5.h[4]\n"
- "ldr q7, [x15, #0x130]\n"
+ "ldr q7, [x10, #0x130]\n"
"fmla v10.8h, v6.8h, v0.h[4]\n"
"fmla v14.8h, v6.8h, v1.h[4]\n"
"fmla v18.8h, v6.8h, v2.h[4]\n"
"fmla v22.8h, v6.8h, v3.h[4]\n"
"fmla v26.8h, v6.8h, v4.h[4]\n"
"fmla v30.8h, v6.8h, v5.h[4]\n"
- "ldr q6, [x15, #0x140]\n"
+ "ldr q6, [x10, #0x140]\n"
"fmla v11.8h, v7.8h, v0.h[4]\n"
"fmla v15.8h, v7.8h, v1.h[4]\n"
"fmla v19.8h, v7.8h, v2.h[4]\n"
"fmla v23.8h, v7.8h, v3.h[4]\n"
"fmla v27.8h, v7.8h, v4.h[4]\n"
"fmla v31.8h, v7.8h, v5.h[4]\n"
- "ldr q7, [x15, #0x150]\n"
+ "ldr q7, [x10, #0x150]\n"
"fmla v8.8h, v6.8h, v0.h[5]\n"
"fmla v12.8h, v6.8h, v1.h[5]\n"
"fmla v16.8h, v6.8h, v2.h[5]\n"
"fmla v20.8h, v6.8h, v3.h[5]\n"
"fmla v24.8h, v6.8h, v4.h[5]\n"
"fmla v28.8h, v6.8h, v5.h[5]\n"
- "ldr q6, [x15, #0x160]\n"
+ "ldr q6, [x10, #0x160]\n"
"fmla v9.8h, v7.8h, v0.h[5]\n"
"fmla v13.8h, v7.8h, v1.h[5]\n"
"fmla v17.8h, v7.8h, v2.h[5]\n"
"fmla v21.8h, v7.8h, v3.h[5]\n"
"fmla v25.8h, v7.8h, v4.h[5]\n"
"fmla v29.8h, v7.8h, v5.h[5]\n"
- "ldr q7, [x15, #0x170]\n"
+ "ldr q7, [x10, #0x170]\n"
"fmla v10.8h, v6.8h, v0.h[5]\n"
"fmla v14.8h, v6.8h, v1.h[5]\n"
"fmla v18.8h, v6.8h, v2.h[5]\n"
"fmla v22.8h, v6.8h, v3.h[5]\n"
"fmla v26.8h, v6.8h, v4.h[5]\n"
"fmla v30.8h, v6.8h, v5.h[5]\n"
- "ldr q6, [x15, #0x180]\n"
+ "ldr q6, [x10, #0x180]\n"
"fmla v11.8h, v7.8h, v0.h[5]\n"
"fmla v15.8h, v7.8h, v1.h[5]\n"
"fmla v19.8h, v7.8h, v2.h[5]\n"
"fmla v23.8h, v7.8h, v3.h[5]\n"
"fmla v27.8h, v7.8h, v4.h[5]\n"
"fmla v31.8h, v7.8h, v5.h[5]\n"
- "ldr q7, [x15, #0x190]\n"
+ "ldr q7, [x10, #0x190]\n"
"fmla v8.8h, v6.8h, v0.h[6]\n"
"fmla v12.8h, v6.8h, v1.h[6]\n"
"fmla v16.8h, v6.8h, v2.h[6]\n"
"fmla v20.8h, v6.8h, v3.h[6]\n"
"fmla v24.8h, v6.8h, v4.h[6]\n"
"fmla v28.8h, v6.8h, v5.h[6]\n"
- "ldr q6, [x15, #0x1a0]\n"
+ "ldr q6, [x10, #0x1a0]\n"
"fmla v9.8h, v7.8h, v0.h[6]\n"
"fmla v13.8h, v7.8h, v1.h[6]\n"
"fmla v17.8h, v7.8h, v2.h[6]\n"
"fmla v21.8h, v7.8h, v3.h[6]\n"
"fmla v25.8h, v7.8h, v4.h[6]\n"
"fmla v29.8h, v7.8h, v5.h[6]\n"
- "ldr q7, [x15, #0x1b0]\n"
+ "ldr q7, [x10, #0x1b0]\n"
"fmla v10.8h, v6.8h, v0.h[6]\n"
"fmla v14.8h, v6.8h, v1.h[6]\n"
"fmla v18.8h, v6.8h, v2.h[6]\n"
"fmla v22.8h, v6.8h, v3.h[6]\n"
"fmla v26.8h, v6.8h, v4.h[6]\n"
"fmla v30.8h, v6.8h, v5.h[6]\n"
- "ldr q6, [x15, #0x1c0]\n"
+ "ldr q6, [x10, #0x1c0]\n"
"fmla v11.8h, v7.8h, v0.h[6]\n"
"fmla v15.8h, v7.8h, v1.h[6]\n"
"fmla v19.8h, v7.8h, v2.h[6]\n"
"fmla v23.8h, v7.8h, v3.h[6]\n"
"fmla v27.8h, v7.8h, v4.h[6]\n"
"fmla v31.8h, v7.8h, v5.h[6]\n"
- "ldr q7, [x15, #0x1d0]\n"
+ "ldr q7, [x10, #0x1d0]\n"
"fmla v8.8h, v6.8h, v0.h[7]\n"
"fmla v12.8h, v6.8h, v1.h[7]\n"
"fmla v16.8h, v6.8h, v2.h[7]\n"
"fmla v20.8h, v6.8h, v3.h[7]\n"
"fmla v24.8h, v6.8h, v4.h[7]\n"
"fmla v28.8h, v6.8h, v5.h[7]\n"
- "ldr q6, [x15, #0x1e0]\n"
+ "ldr q6, [x10, #0x1e0]\n"
"fmla v9.8h, v7.8h, v0.h[7]\n"
"fmla v13.8h, v7.8h, v1.h[7]\n"
"fmla v17.8h, v7.8h, v2.h[7]\n"
"fmla v21.8h, v7.8h, v3.h[7]\n"
"fmla v25.8h, v7.8h, v4.h[7]\n"
"fmla v29.8h, v7.8h, v5.h[7]\n"
- "ldr q7, [x15, #0x1f0]\n"
+ "ldr q7, [x10, #0x1f0]\n"
+ "add x10, x10, #0x200\n"
"fmla v10.8h, v6.8h, v0.h[7]\n"
- "add x15, x15, #0x200\n"
"fmla v14.8h, v6.8h, v1.h[7]\n"
"fmla v18.8h, v6.8h, v2.h[7]\n"
"fmla v22.8h, v6.8h, v3.h[7]\n"
@@ -4982,34 +4912,34 @@ void a64_hybrid_fp16_mla_6x32 (
"fmla v23.8h, v7.8h, v3.h[7]\n"
"fmla v27.8h, v7.8h, v4.h[7]\n"
"fmla v31.8h, v7.8h, v5.h[7]\n"
- "279:" // Height 6: Multiply loop: Main loop skip
- "cbz x11, 281f\n"
- "280:" // Height 6: Multiply loop: Odd block loop
- "ldr h0, [x10], #0x2\n"
- "ldr h1, [x28], #0x2\n"
- "ldr h2, [x26], #0x2\n"
- "ldr h3, [x24], #0x2\n"
- "ldr h4, [x22], #0x2\n"
+ "273:" // Height 6: Multiply loop: Main loop skip
+ "cbz x26, 275f\n"
+ "274:" // Height 6: Multiply loop: Odd block loop
+ "ldr h0, [x25], #0x2\n"
+ "sub x26, x26, #0x1\n"
+ "ldr h1, [x24], #0x2\n"
+ "ldr h2, [x23], #0x2\n"
+ "ldr h3, [x22], #0x2\n"
+ "ldr h4, [x21], #0x2\n"
"ldr h5, [x20], #0x2\n"
- "ldr q6, [x15, #0x0]\n"
+ "ldr q6, [x10, #0x0]\n"
"fmla v8.8h, v6.8h, v0.h[0]\n"
- "ldr q7, [x15, #0x10]\n"
+ "ldr q7, [x10, #0x10]\n"
"fmla v12.8h, v6.8h, v1.h[0]\n"
- "sub x11, x11, #0x1\n"
"fmla v16.8h, v6.8h, v2.h[0]\n"
"fmla v20.8h, v6.8h, v3.h[0]\n"
"fmla v24.8h, v6.8h, v4.h[0]\n"
"fmla v28.8h, v6.8h, v5.h[0]\n"
- "ldr q6, [x15, #0x20]\n"
+ "ldr q6, [x10, #0x20]\n"
"fmla v9.8h, v7.8h, v0.h[0]\n"
"fmla v13.8h, v7.8h, v1.h[0]\n"
"fmla v17.8h, v7.8h, v2.h[0]\n"
"fmla v21.8h, v7.8h, v3.h[0]\n"
"fmla v25.8h, v7.8h, v4.h[0]\n"
"fmla v29.8h, v7.8h, v5.h[0]\n"
- "ldr q7, [x15, #0x30]\n"
+ "ldr q7, [x10, #0x30]\n"
+ "add x10, x10, #0x40\n"
"fmla v10.8h, v6.8h, v0.h[0]\n"
- "add x15, x15, #0x40\n"
"fmla v14.8h, v6.8h, v1.h[0]\n"
"fmla v18.8h, v6.8h, v2.h[0]\n"
"fmla v22.8h, v6.8h, v3.h[0]\n"
@@ -5021,19 +4951,25 @@ void a64_hybrid_fp16_mla_6x32 (
"fmla v23.8h, v7.8h, v3.h[0]\n"
"fmla v27.8h, v7.8h, v4.h[0]\n"
"fmla v31.8h, v7.8h, v5.h[0]\n"
- "cbnz x11, 280b\n"
- "281:" // Height 6: Multiply loop: No odd multiplies
+ "cbnz x26, 274b\n"
+ "275:" // Height 6: Multiply loop: No odd multiplies
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x12, x12, #0x1\n"
- "cmp x12, x19\n"
- "bne 274b\n"
- "prfm pstl1keep, [x13, #0x0]\n"
- "prfm pstl1keep, [x9, #0x0]\n"
- "prfm pstl1keep, [x27, #0x0]\n"
- "prfm pstl1keep, [x25, #0x0]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 268b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "prfm pstl1keep, [x28, #0x0]\n"
+ "add x24, x28, x19, LSL #1\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "add x23, x24, x19, LSL #1\n"
"prfm pstl1keep, [x23, #0x0]\n"
+ "add x22, x23, x19, LSL #1\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "add x21, x22, x19, LSL #1\n"
"prfm pstl1keep, [x21, #0x0]\n"
- "tbz %x[flags], #1, 282f\n"
+ "add x20, x21, x19, LSL #1\n"
+ "prfm pstl1keep, [x20, #0x0]\n"
+ "tbz %x[flags], #1, 276f\n"
"add x19, %x[args_ptr], %[offset_min]\n"
"ld1r { v1.8h }, [x19]\n"
"add x19, %x[args_ptr], %[offset_max]\n"
@@ -5086,313 +5022,308 @@ void a64_hybrid_fp16_mla_6x32 (
"fmin v31.8h, v31.8h, v0.8h\n"
"fmax v30.8h, v30.8h, v1.8h\n"
"fmax v31.8h, v31.8h, v1.8h\n"
- "282:" // Height 6: No activation
- "cmp x16, #0x20\n"
- "bge 299f\n"
- "tbz x16, #4, 290f\n"
- "st1 { v8.8h }, [x13], #0x10\n"
- "st1 { v9.8h }, [x13], #0x10\n"
- "st1 { v12.8h }, [x9], #0x10\n"
- "st1 { v13.8h }, [x9], #0x10\n"
- "st1 { v16.8h }, [x27], #0x10\n"
- "st1 { v17.8h }, [x27], #0x10\n"
- "st1 { v20.8h }, [x25], #0x10\n"
- "st1 { v21.8h }, [x25], #0x10\n"
- "st1 { v24.8h }, [x23], #0x10\n"
- "st1 { v25.8h }, [x23], #0x10\n"
- "st1 { v28.8h }, [x21], #0x10\n"
- "st1 { v29.8h }, [x21], #0x10\n"
- "tbz x16, #3, 286f\n"
- "st1 { v10.8h }, [x13], #0x10\n"
- "st1 { v14.8h }, [x9], #0x10\n"
- "st1 { v18.8h }, [x27], #0x10\n"
- "st1 { v22.8h }, [x25], #0x10\n"
- "st1 { v26.8h }, [x23], #0x10\n"
- "st1 { v30.8h }, [x21], #0x10\n"
- "tbz x16, #2, 284f\n"
- "str d11, [x13], #0x8\n"
- "str d15, [x9], #0x8\n"
- "str d19, [x27], #0x8\n"
- "str d23, [x25], #0x8\n"
- "str d27, [x23], #0x8\n"
- "str d31, [x21], #0x8\n"
- "tbz x16, #1, 283f\n"
- "st1 { v11.s }[2], [x13], #0x4\n"
- "st1 { v15.s }[2], [x9], #0x4\n"
- "st1 { v19.s }[2], [x27], #0x4\n"
- "st1 { v23.s }[2], [x25], #0x4\n"
- "st1 { v27.s }[2], [x23], #0x4\n"
- "st1 { v31.s }[2], [x21], #0x4\n"
- "tbz x16, #0, 298f\n"
- "st1 { v11.h }[6], [x13]\n"
- "st1 { v15.h }[6], [x9]\n"
- "st1 { v19.h }[6], [x27]\n"
- "st1 { v23.h }[6], [x25]\n"
- "st1 { v27.h }[6], [x23]\n"
- "st1 { v31.h }[6], [x21]\n"
- "b 298f\n"
- "283:" // Height 6: Partial direct writeback: partial_1_28
- "tbz x16, #0, 298f\n"
- "st1 { v11.h }[4], [x13]\n"
- "st1 { v15.h }[4], [x9]\n"
- "st1 { v19.h }[4], [x27]\n"
- "st1 { v23.h }[4], [x25]\n"
- "st1 { v27.h }[4], [x23]\n"
- "st1 { v31.h }[4], [x21]\n"
- "b 298f\n"
- "284:" // Height 6: Partial direct writeback: partial_2_24
- "tbz x16, #1, 285f\n"
- "str s11, [x13], #0x4\n"
- "str s15, [x9], #0x4\n"
- "str s19, [x27], #0x4\n"
- "str s23, [x25], #0x4\n"
- "str s27, [x23], #0x4\n"
- "str s31, [x21], #0x4\n"
- "tbz x16, #0, 298f\n"
- "st1 { v11.h }[2], [x13]\n"
- "st1 { v15.h }[2], [x9]\n"
- "st1 { v19.h }[2], [x27]\n"
- "st1 { v23.h }[2], [x25]\n"
- "st1 { v27.h }[2], [x23]\n"
- "st1 { v31.h }[2], [x21]\n"
- "b 298f\n"
- "285:" // Height 6: Partial direct writeback: partial_1_24
- "tbz x16, #0, 298f\n"
- "str h11, [x13, #0x0]\n"
- "str h15, [x9, #0x0]\n"
- "str h19, [x27, #0x0]\n"
- "str h23, [x25, #0x0]\n"
- "str h27, [x23, #0x0]\n"
- "str h31, [x21, #0x0]\n"
- "b 298f\n"
- "286:" // Height 6: Partial direct writeback: partial_4_16
- "tbz x16, #2, 288f\n"
- "str d10, [x13], #0x8\n"
- "str d14, [x9], #0x8\n"
- "str d18, [x27], #0x8\n"
- "str d22, [x25], #0x8\n"
- "str d26, [x23], #0x8\n"
- "str d30, [x21], #0x8\n"
- "tbz x16, #1, 287f\n"
- "st1 { v10.s }[2], [x13], #0x4\n"
- "st1 { v14.s }[2], [x9], #0x4\n"
- "st1 { v18.s }[2], [x27], #0x4\n"
- "st1 { v22.s }[2], [x25], #0x4\n"
- "st1 { v26.s }[2], [x23], #0x4\n"
- "st1 { v30.s }[2], [x21], #0x4\n"
- "tbz x16, #0, 298f\n"
- "st1 { v10.h }[6], [x13]\n"
- "st1 { v14.h }[6], [x9]\n"
- "st1 { v18.h }[6], [x27]\n"
- "st1 { v22.h }[6], [x25]\n"
- "st1 { v26.h }[6], [x23]\n"
- "st1 { v30.h }[6], [x21]\n"
- "b 298f\n"
- "287:" // Height 6: Partial direct writeback: partial_1_20
- "tbz x16, #0, 298f\n"
- "st1 { v10.h }[4], [x13]\n"
- "st1 { v14.h }[4], [x9]\n"
- "st1 { v18.h }[4], [x27]\n"
- "st1 { v22.h }[4], [x25]\n"
- "st1 { v26.h }[4], [x23]\n"
- "st1 { v30.h }[4], [x21]\n"
- "b 298f\n"
- "288:" // Height 6: Partial direct writeback: partial_2_16
- "tbz x16, #1, 289f\n"
- "str s10, [x13], #0x4\n"
- "str s14, [x9], #0x4\n"
- "str s18, [x27], #0x4\n"
- "str s22, [x25], #0x4\n"
- "str s26, [x23], #0x4\n"
- "str s30, [x21], #0x4\n"
- "tbz x16, #0, 298f\n"
- "st1 { v10.h }[2], [x13]\n"
- "st1 { v14.h }[2], [x9]\n"
- "st1 { v18.h }[2], [x27]\n"
- "st1 { v22.h }[2], [x25]\n"
- "st1 { v26.h }[2], [x23]\n"
- "st1 { v30.h }[2], [x21]\n"
- "b 298f\n"
- "289:" // Height 6: Partial direct writeback: partial_1_16
- "tbz x16, #0, 298f\n"
- "str h10, [x13, #0x0]\n"
- "str h14, [x9, #0x0]\n"
- "str h18, [x27, #0x0]\n"
- "str h22, [x25, #0x0]\n"
- "str h26, [x23, #0x0]\n"
- "str h30, [x21, #0x0]\n"
- "b 298f\n"
- "290:" // Height 6: Partial direct writeback: partial_8_0
- "tbz x16, #3, 294f\n"
- "st1 { v8.8h }, [x13], #0x10\n"
- "st1 { v12.8h }, [x9], #0x10\n"
- "st1 { v16.8h }, [x27], #0x10\n"
- "st1 { v20.8h }, [x25], #0x10\n"
- "st1 { v24.8h }, [x23], #0x10\n"
- "st1 { v28.8h }, [x21], #0x10\n"
- "tbz x16, #2, 292f\n"
- "str d9, [x13], #0x8\n"
- "str d13, [x9], #0x8\n"
- "str d17, [x27], #0x8\n"
- "str d21, [x25], #0x8\n"
- "str d25, [x23], #0x8\n"
- "str d29, [x21], #0x8\n"
- "tbz x16, #1, 291f\n"
- "st1 { v9.s }[2], [x13], #0x4\n"
- "st1 { v13.s }[2], [x9], #0x4\n"
- "st1 { v17.s }[2], [x27], #0x4\n"
- "st1 { v21.s }[2], [x25], #0x4\n"
- "st1 { v25.s }[2], [x23], #0x4\n"
- "st1 { v29.s }[2], [x21], #0x4\n"
- "tbz x16, #0, 298f\n"
- "st1 { v9.h }[6], [x13]\n"
- "st1 { v13.h }[6], [x9]\n"
- "st1 { v17.h }[6], [x27]\n"
- "st1 { v21.h }[6], [x25]\n"
- "st1 { v25.h }[6], [x23]\n"
- "st1 { v29.h }[6], [x21]\n"
- "b 298f\n"
- "291:" // Height 6: Partial direct writeback: partial_1_12
- "tbz x16, #0, 298f\n"
- "st1 { v9.h }[4], [x13]\n"
- "st1 { v13.h }[4], [x9]\n"
- "st1 { v17.h }[4], [x27]\n"
- "st1 { v21.h }[4], [x25]\n"
- "st1 { v25.h }[4], [x23]\n"
- "st1 { v29.h }[4], [x21]\n"
- "b 298f\n"
- "292:" // Height 6: Partial direct writeback: partial_2_8
- "tbz x16, #1, 293f\n"
- "str s9, [x13], #0x4\n"
- "str s13, [x9], #0x4\n"
- "str s17, [x27], #0x4\n"
- "str s21, [x25], #0x4\n"
- "str s25, [x23], #0x4\n"
- "str s29, [x21], #0x4\n"
- "tbz x16, #0, 298f\n"
- "st1 { v9.h }[2], [x13]\n"
- "st1 { v13.h }[2], [x9]\n"
- "st1 { v17.h }[2], [x27]\n"
- "st1 { v21.h }[2], [x25]\n"
- "st1 { v25.h }[2], [x23]\n"
- "st1 { v29.h }[2], [x21]\n"
- "b 298f\n"
- "293:" // Height 6: Partial direct writeback: partial_1_8
- "tbz x16, #0, 298f\n"
- "str h9, [x13, #0x0]\n"
- "str h13, [x9, #0x0]\n"
- "str h17, [x27, #0x0]\n"
- "str h21, [x25, #0x0]\n"
- "str h25, [x23, #0x0]\n"
- "str h29, [x21, #0x0]\n"
- "b 298f\n"
- "294:" // Height 6: Partial direct writeback: partial_4_0
- "tbz x16, #2, 296f\n"
- "str d8, [x13], #0x8\n"
- "str d12, [x9], #0x8\n"
- "str d16, [x27], #0x8\n"
- "str d20, [x25], #0x8\n"
- "str d24, [x23], #0x8\n"
- "str d28, [x21], #0x8\n"
- "tbz x16, #1, 295f\n"
- "st1 { v8.s }[2], [x13], #0x4\n"
- "st1 { v12.s }[2], [x9], #0x4\n"
- "st1 { v16.s }[2], [x27], #0x4\n"
- "st1 { v20.s }[2], [x25], #0x4\n"
- "st1 { v24.s }[2], [x23], #0x4\n"
- "st1 { v28.s }[2], [x21], #0x4\n"
- "tbz x16, #0, 298f\n"
- "st1 { v8.h }[6], [x13]\n"
- "st1 { v12.h }[6], [x9]\n"
- "st1 { v16.h }[6], [x27]\n"
- "st1 { v20.h }[6], [x25]\n"
- "st1 { v24.h }[6], [x23]\n"
- "st1 { v28.h }[6], [x21]\n"
- "b 298f\n"
- "295:" // Height 6: Partial direct writeback: partial_1_4
- "tbz x16, #0, 298f\n"
- "st1 { v8.h }[4], [x13]\n"
- "st1 { v12.h }[4], [x9]\n"
- "st1 { v16.h }[4], [x27]\n"
- "st1 { v20.h }[4], [x25]\n"
- "st1 { v24.h }[4], [x23]\n"
- "st1 { v28.h }[4], [x21]\n"
- "b 298f\n"
- "296:" // Height 6: Partial direct writeback: partial_2_0
- "tbz x16, #1, 297f\n"
- "str s8, [x13], #0x4\n"
- "str s12, [x9], #0x4\n"
- "str s16, [x27], #0x4\n"
- "str s20, [x25], #0x4\n"
- "str s24, [x23], #0x4\n"
- "str s28, [x21], #0x4\n"
- "tbz x16, #0, 298f\n"
- "st1 { v8.h }[2], [x13]\n"
- "st1 { v12.h }[2], [x9]\n"
- "st1 { v16.h }[2], [x27]\n"
- "st1 { v20.h }[2], [x25]\n"
- "st1 { v24.h }[2], [x23]\n"
- "st1 { v28.h }[2], [x21]\n"
- "b 298f\n"
- "297:" // Height 6: Partial direct writeback: partial_1_0
- "str h8, [x13, #0x0]\n"
- "str h12, [x9, #0x0]\n"
- "str h16, [x27, #0x0]\n"
- "str h20, [x25, #0x0]\n"
- "str h24, [x23, #0x0]\n"
- "str h28, [x21, #0x0]\n"
- "298:" // Height 6: Partial direct writeback: Done
- "b 300f\n"
- "299:" // Height 6: Full writeback
- "str q8, [x13, #0x0]\n"
- "str q9, [x13, #0x10]\n"
- "str q10, [x13, #0x20]\n"
- "str q11, [x13, #0x30]\n"
- "str q12, [x9, #0x0]\n"
- "str q13, [x9, #0x10]\n"
- "str q14, [x9, #0x20]\n"
- "str q15, [x9, #0x30]\n"
- "str q16, [x27, #0x0]\n"
- "str q17, [x27, #0x10]\n"
- "str q18, [x27, #0x20]\n"
- "str q19, [x27, #0x30]\n"
- "str q20, [x25, #0x0]\n"
- "str q21, [x25, #0x10]\n"
- "str q22, [x25, #0x20]\n"
- "str q23, [x25, #0x30]\n"
- "str q24, [x23, #0x0]\n"
- "str q25, [x23, #0x10]\n"
- "str q26, [x23, #0x20]\n"
- "str q27, [x23, #0x30]\n"
- "str q28, [x21, #0x0]\n"
- "str q29, [x21, #0x10]\n"
- "str q30, [x21, #0x20]\n"
- "str q31, [x21, #0x30]\n"
- "add x13, x13, #0x40\n"
- "add x9, x9, #0x40\n"
- "add x27, x27, #0x40\n"
- "add x25, x25, #0x40\n"
- "add x23, x23, #0x40\n"
- "add x21, x21, #0x40\n"
- "300:" // Height 6: Writeback done
- "subs x16, x16, #0x20\n"
- "bgt 253b\n"
+ "276:" // Height 6: No activation
+ "cmp x11, #0x20\n"
+ "bge 293f\n"
+ "tbz x11, #4, 284f\n"
+ "st1 { v8.8h }, [x28], #0x10\n"
+ "st1 { v9.8h }, [x28], #0x10\n"
+ "st1 { v12.8h }, [x24], #0x10\n"
+ "st1 { v13.8h }, [x24], #0x10\n"
+ "st1 { v16.8h }, [x23], #0x10\n"
+ "st1 { v17.8h }, [x23], #0x10\n"
+ "st1 { v20.8h }, [x22], #0x10\n"
+ "st1 { v21.8h }, [x22], #0x10\n"
+ "st1 { v24.8h }, [x21], #0x10\n"
+ "st1 { v25.8h }, [x21], #0x10\n"
+ "st1 { v28.8h }, [x20], #0x10\n"
+ "st1 { v29.8h }, [x20], #0x10\n"
+ "tbz x11, #3, 280f\n"
+ "st1 { v10.8h }, [x28], #0x10\n"
+ "st1 { v14.8h }, [x24], #0x10\n"
+ "st1 { v18.8h }, [x23], #0x10\n"
+ "st1 { v22.8h }, [x22], #0x10\n"
+ "st1 { v26.8h }, [x21], #0x10\n"
+ "st1 { v30.8h }, [x20], #0x10\n"
+ "tbz x11, #2, 278f\n"
+ "str d11, [x28], #0x8\n"
+ "str d15, [x24], #0x8\n"
+ "str d19, [x23], #0x8\n"
+ "str d23, [x22], #0x8\n"
+ "str d27, [x21], #0x8\n"
+ "str d31, [x20], #0x8\n"
+ "tbz x11, #1, 277f\n"
+ "st1 { v11.s }[2], [x28], #0x4\n"
+ "st1 { v15.s }[2], [x24], #0x4\n"
+ "st1 { v19.s }[2], [x23], #0x4\n"
+ "st1 { v23.s }[2], [x22], #0x4\n"
+ "st1 { v27.s }[2], [x21], #0x4\n"
+ "st1 { v31.s }[2], [x20], #0x4\n"
+ "tbz x11, #0, 292f\n"
+ "st1 { v11.h }[6], [x28]\n"
+ "st1 { v15.h }[6], [x24]\n"
+ "st1 { v19.h }[6], [x23]\n"
+ "st1 { v23.h }[6], [x22]\n"
+ "st1 { v27.h }[6], [x21]\n"
+ "st1 { v31.h }[6], [x20]\n"
+ "b 292f\n"
+ "277:" // Height 6: Partial direct writeback: partial_1_28
+ "tbz x11, #0, 292f\n"
+ "st1 { v11.h }[4], [x28]\n"
+ "st1 { v15.h }[4], [x24]\n"
+ "st1 { v19.h }[4], [x23]\n"
+ "st1 { v23.h }[4], [x22]\n"
+ "st1 { v27.h }[4], [x21]\n"
+ "st1 { v31.h }[4], [x20]\n"
+ "b 292f\n"
+ "278:" // Height 6: Partial direct writeback: partial_2_24
+ "tbz x11, #1, 279f\n"
+ "str s11, [x28], #0x4\n"
+ "str s15, [x24], #0x4\n"
+ "str s19, [x23], #0x4\n"
+ "str s23, [x22], #0x4\n"
+ "str s27, [x21], #0x4\n"
+ "str s31, [x20], #0x4\n"
+ "tbz x11, #0, 292f\n"
+ "st1 { v11.h }[2], [x28]\n"
+ "st1 { v15.h }[2], [x24]\n"
+ "st1 { v19.h }[2], [x23]\n"
+ "st1 { v23.h }[2], [x22]\n"
+ "st1 { v27.h }[2], [x21]\n"
+ "st1 { v31.h }[2], [x20]\n"
+ "b 292f\n"
+ "279:" // Height 6: Partial direct writeback: partial_1_24
+ "tbz x11, #0, 292f\n"
+ "str h11, [x28, #0x0]\n"
+ "str h15, [x24, #0x0]\n"
+ "str h19, [x23, #0x0]\n"
+ "str h23, [x22, #0x0]\n"
+ "str h27, [x21, #0x0]\n"
+ "str h31, [x20, #0x0]\n"
+ "b 292f\n"
+ "280:" // Height 6: Partial direct writeback: partial_4_16
+ "tbz x11, #2, 282f\n"
+ "str d10, [x28], #0x8\n"
+ "str d14, [x24], #0x8\n"
+ "str d18, [x23], #0x8\n"
+ "str d22, [x22], #0x8\n"
+ "str d26, [x21], #0x8\n"
+ "str d30, [x20], #0x8\n"
+ "tbz x11, #1, 281f\n"
+ "st1 { v10.s }[2], [x28], #0x4\n"
+ "st1 { v14.s }[2], [x24], #0x4\n"
+ "st1 { v18.s }[2], [x23], #0x4\n"
+ "st1 { v22.s }[2], [x22], #0x4\n"
+ "st1 { v26.s }[2], [x21], #0x4\n"
+ "st1 { v30.s }[2], [x20], #0x4\n"
+ "tbz x11, #0, 292f\n"
+ "st1 { v10.h }[6], [x28]\n"
+ "st1 { v14.h }[6], [x24]\n"
+ "st1 { v18.h }[6], [x23]\n"
+ "st1 { v22.h }[6], [x22]\n"
+ "st1 { v26.h }[6], [x21]\n"
+ "st1 { v30.h }[6], [x20]\n"
+ "b 292f\n"
+ "281:" // Height 6: Partial direct writeback: partial_1_20
+ "tbz x11, #0, 292f\n"
+ "st1 { v10.h }[4], [x28]\n"
+ "st1 { v14.h }[4], [x24]\n"
+ "st1 { v18.h }[4], [x23]\n"
+ "st1 { v22.h }[4], [x22]\n"
+ "st1 { v26.h }[4], [x21]\n"
+ "st1 { v30.h }[4], [x20]\n"
+ "b 292f\n"
+ "282:" // Height 6: Partial direct writeback: partial_2_16
+ "tbz x11, #1, 283f\n"
+ "str s10, [x28], #0x4\n"
+ "str s14, [x24], #0x4\n"
+ "str s18, [x23], #0x4\n"
+ "str s22, [x22], #0x4\n"
+ "str s26, [x21], #0x4\n"
+ "str s30, [x20], #0x4\n"
+ "tbz x11, #0, 292f\n"
+ "st1 { v10.h }[2], [x28]\n"
+ "st1 { v14.h }[2], [x24]\n"
+ "st1 { v18.h }[2], [x23]\n"
+ "st1 { v22.h }[2], [x22]\n"
+ "st1 { v26.h }[2], [x21]\n"
+ "st1 { v30.h }[2], [x20]\n"
+ "b 292f\n"
+ "283:" // Height 6: Partial direct writeback: partial_1_16
+ "tbz x11, #0, 292f\n"
+ "str h10, [x28, #0x0]\n"
+ "str h14, [x24, #0x0]\n"
+ "str h18, [x23, #0x0]\n"
+ "str h22, [x22, #0x0]\n"
+ "str h26, [x21, #0x0]\n"
+ "str h30, [x20, #0x0]\n"
+ "b 292f\n"
+ "284:" // Height 6: Partial direct writeback: partial_8_0
+ "tbz x11, #3, 288f\n"
+ "st1 { v8.8h }, [x28], #0x10\n"
+ "st1 { v12.8h }, [x24], #0x10\n"
+ "st1 { v16.8h }, [x23], #0x10\n"
+ "st1 { v20.8h }, [x22], #0x10\n"
+ "st1 { v24.8h }, [x21], #0x10\n"
+ "st1 { v28.8h }, [x20], #0x10\n"
+ "tbz x11, #2, 286f\n"
+ "str d9, [x28], #0x8\n"
+ "str d13, [x24], #0x8\n"
+ "str d17, [x23], #0x8\n"
+ "str d21, [x22], #0x8\n"
+ "str d25, [x21], #0x8\n"
+ "str d29, [x20], #0x8\n"
+ "tbz x11, #1, 285f\n"
+ "st1 { v9.s }[2], [x28], #0x4\n"
+ "st1 { v13.s }[2], [x24], #0x4\n"
+ "st1 { v17.s }[2], [x23], #0x4\n"
+ "st1 { v21.s }[2], [x22], #0x4\n"
+ "st1 { v25.s }[2], [x21], #0x4\n"
+ "st1 { v29.s }[2], [x20], #0x4\n"
+ "tbz x11, #0, 292f\n"
+ "st1 { v9.h }[6], [x28]\n"
+ "st1 { v13.h }[6], [x24]\n"
+ "st1 { v17.h }[6], [x23]\n"
+ "st1 { v21.h }[6], [x22]\n"
+ "st1 { v25.h }[6], [x21]\n"
+ "st1 { v29.h }[6], [x20]\n"
+ "b 292f\n"
+ "285:" // Height 6: Partial direct writeback: partial_1_12
+ "tbz x11, #0, 292f\n"
+ "st1 { v9.h }[4], [x28]\n"
+ "st1 { v13.h }[4], [x24]\n"
+ "st1 { v17.h }[4], [x23]\n"
+ "st1 { v21.h }[4], [x22]\n"
+ "st1 { v25.h }[4], [x21]\n"
+ "st1 { v29.h }[4], [x20]\n"
+ "b 292f\n"
+ "286:" // Height 6: Partial direct writeback: partial_2_8
+ "tbz x11, #1, 287f\n"
+ "str s9, [x28], #0x4\n"
+ "str s13, [x24], #0x4\n"
+ "str s17, [x23], #0x4\n"
+ "str s21, [x22], #0x4\n"
+ "str s25, [x21], #0x4\n"
+ "str s29, [x20], #0x4\n"
+ "tbz x11, #0, 292f\n"
+ "st1 { v9.h }[2], [x28]\n"
+ "st1 { v13.h }[2], [x24]\n"
+ "st1 { v17.h }[2], [x23]\n"
+ "st1 { v21.h }[2], [x22]\n"
+ "st1 { v25.h }[2], [x21]\n"
+ "st1 { v29.h }[2], [x20]\n"
+ "b 292f\n"
+ "287:" // Height 6: Partial direct writeback: partial_1_8
+ "tbz x11, #0, 292f\n"
+ "str h9, [x28, #0x0]\n"
+ "str h13, [x24, #0x0]\n"
+ "str h17, [x23, #0x0]\n"
+ "str h21, [x22, #0x0]\n"
+ "str h25, [x21, #0x0]\n"
+ "str h29, [x20, #0x0]\n"
+ "b 292f\n"
+ "288:" // Height 6: Partial direct writeback: partial_4_0
+ "tbz x11, #2, 290f\n"
+ "str d8, [x28], #0x8\n"
+ "str d12, [x24], #0x8\n"
+ "str d16, [x23], #0x8\n"
+ "str d20, [x22], #0x8\n"
+ "str d24, [x21], #0x8\n"
+ "str d28, [x20], #0x8\n"
+ "tbz x11, #1, 289f\n"
+ "st1 { v8.s }[2], [x28], #0x4\n"
+ "st1 { v12.s }[2], [x24], #0x4\n"
+ "st1 { v16.s }[2], [x23], #0x4\n"
+ "st1 { v20.s }[2], [x22], #0x4\n"
+ "st1 { v24.s }[2], [x21], #0x4\n"
+ "st1 { v28.s }[2], [x20], #0x4\n"
+ "tbz x11, #0, 292f\n"
+ "st1 { v8.h }[6], [x28]\n"
+ "st1 { v12.h }[6], [x24]\n"
+ "st1 { v16.h }[6], [x23]\n"
+ "st1 { v20.h }[6], [x22]\n"
+ "st1 { v24.h }[6], [x21]\n"
+ "st1 { v28.h }[6], [x20]\n"
+ "b 292f\n"
+ "289:" // Height 6: Partial direct writeback: partial_1_4
+ "tbz x11, #0, 292f\n"
+ "st1 { v8.h }[4], [x28]\n"
+ "st1 { v12.h }[4], [x24]\n"
+ "st1 { v16.h }[4], [x23]\n"
+ "st1 { v20.h }[4], [x22]\n"
+ "st1 { v24.h }[4], [x21]\n"
+ "st1 { v28.h }[4], [x20]\n"
+ "b 292f\n"
+ "290:" // Height 6: Partial direct writeback: partial_2_0
+ "tbz x11, #1, 291f\n"
+ "str s8, [x28], #0x4\n"
+ "str s12, [x24], #0x4\n"
+ "str s16, [x23], #0x4\n"
+ "str s20, [x22], #0x4\n"
+ "str s24, [x21], #0x4\n"
+ "str s28, [x20], #0x4\n"
+ "tbz x11, #0, 292f\n"
+ "st1 { v8.h }[2], [x28]\n"
+ "st1 { v12.h }[2], [x24]\n"
+ "st1 { v16.h }[2], [x23]\n"
+ "st1 { v20.h }[2], [x22]\n"
+ "st1 { v24.h }[2], [x21]\n"
+ "st1 { v28.h }[2], [x20]\n"
+ "b 292f\n"
+ "291:" // Height 6: Partial direct writeback: partial_1_0
+ "str h8, [x28, #0x0]\n"
+ "str h12, [x24, #0x0]\n"
+ "str h16, [x23, #0x0]\n"
+ "str h20, [x22, #0x0]\n"
+ "str h24, [x21, #0x0]\n"
+ "str h28, [x20, #0x0]\n"
+ "292:" // Height 6: Partial direct writeback: Done
+ "b 294f\n"
+ "293:" // Height 6: Full writeback
+ "str q8, [x28, #0x0]\n"
+ "str q9, [x28, #0x10]\n"
+ "str q10, [x28, #0x20]\n"
+ "str q11, [x28, #0x30]\n"
+ "add x28, x28, #0x40\n"
+ "str q12, [x24, #0x0]\n"
+ "str q13, [x24, #0x10]\n"
+ "str q14, [x24, #0x20]\n"
+ "str q15, [x24, #0x30]\n"
+ "str q16, [x23, #0x0]\n"
+ "str q17, [x23, #0x10]\n"
+ "str q18, [x23, #0x20]\n"
+ "str q19, [x23, #0x30]\n"
+ "str q20, [x22, #0x0]\n"
+ "str q21, [x22, #0x10]\n"
+ "str q22, [x22, #0x20]\n"
+ "str q23, [x22, #0x30]\n"
+ "str q24, [x21, #0x0]\n"
+ "str q25, [x21, #0x10]\n"
+ "str q26, [x21, #0x20]\n"
+ "str q27, [x21, #0x30]\n"
+ "str q28, [x20, #0x0]\n"
+ "str q29, [x20, #0x10]\n"
+ "str q30, [x20, #0x20]\n"
+ "str q31, [x20, #0x30]\n"
+ "294:" // Height 6: Writeback done
+ "subs x11, x11, #0x20\n"
+ "bgt 247b\n"
"subs %x[M], %x[M], #0x6\n"
- "beq 302f\n"
+ "beq 296f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "tbz %x[flags], #3, 301f\n"
+ "tbz %x[flags], #3, 295f\n"
"add x20, x20, #0x6\n"
"str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
"b 1b\n"
- "301:" // Update direct input
+ "295:" // Update direct input
"mov x19, #0xc\n"
"madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
"b 1b\n"
- "302:" // Exit
+ "296:" // Exit
: [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16.hpp
index b4c1ba988f..7f83e617c5 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16.hpp
@@ -37,9 +37,9 @@
namespace arm_gemm
{
-
// Actual kernel implementations
void a64_hybrid_fp32_mla_6x16( ARGLIST );
+void a64_hybrid_fp32_mla_6x16_a55( ARGLIST );
class cls_a64_hybrid_fp32_mla_6x16
{
@@ -72,10 +72,11 @@ public:
StdTransformsFixed<operand_type, result_type, 6, 16, 1> transforms = {};
- static PerformanceParameters get_performance_parameters(const CPUInfo *ci) {
+ static PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
switch (ci->get_cpu_model()) {
case CPUModel::A55r1:
- return { 2.287 };
+ return { 3.04 };
case CPUModel::A53:
return { 1.43 };
case CPUModel::A73:
@@ -87,9 +88,16 @@ public:
// Default to the generic kernel
kern_type kernel=a64_hybrid_fp32_mla_6x16;
-
- cls_a64_hybrid_fp32_mla_6x16(const CPUInfo *)
+ cls_a64_hybrid_fp32_mla_6x16(const CPUInfo *ci) // Picks the kernel implementation from the CPU model reported by ci.
{
+ switch(ci->get_cpu_model()) {
+ default: // Any other model: leave 'kernel' at its in-class default (the generic kernel).
+ break;
+ case CPUModel::A55r1:
+ case CPUModel::A53: // A53 falls into the same branch as A55r1 and gets the _a55 variant.
+ kernel=a64_hybrid_fp32_mla_6x16_a55;
+ break;
+ }
}
};
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/a55.cpp
new file mode 100644
index 0000000000..184cfaf95c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/a55.cpp
@@ -0,0 +1,3595 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+#include <limits>
+
+namespace arm_gemm {
+
+void a64_hybrid_fp32_mla_6x16_a55 (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<float> A_arg,
+ size_t M, size_t N, const float *B_ptr, IndirectOutputArg<float> output_arg,
+ const float *bias, Activation act, bool accumulate
+)
+{
+ struct KernelArgs {
+ float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
+ float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const float *B_ptr = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ if (accumulate) {
+ flags |= 0x1;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ switch(act.type) {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ ka.maxval = static_cast<float>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ ka.minval = 0;
+ flags |= 0x2;
+ break;
+ }
+ __asm__ __volatile__(
+
+ "1:" // Row loop
+ "cmp %x[M], #0x6\n"
+ "bge 166f\n"
+ "cmp %x[M], #0x4\n"
+ "bgt 133f\n"
+ "beq 100f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 67f\n"
+ "beq 34f\n"
+ "ldr x8, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x17, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x16, %x[bias]\n"
+ "mov x15, %x[output_ptr]\n"
+ "2:" // Height 1: Column loop
+ "cbz x16, 3f\n"
+ "ldr q8, [x16, #0x0]\n"
+ "ldr q9, [x16, #0x10]\n"
+ "ldr q10, [x16, #0x20]\n"
+ "ldr q11, [x16, #0x30]\n"
+ "add x16, x16, #0x40\n"
+ "b 14f\n"
+ "3:" // Height 1: no bias
+ "tbz %x[flags], #0, 13f\n"
+ "cmp x8, #0x10\n"
+ "bge 12f\n"
+ "tbz x8, #3, 7f\n"
+ "ld1 { v8.4s }, [x15], #0x10\n"
+ "ld1 { v9.4s }, [x15], #0x10\n"
+ "tbz x8, #2, 5f\n"
+ "ld1 { v10.4s }, [x15], #0x10\n"
+ "tbz x8, #1, 4f\n"
+ "mov x19, #0x38\n"
+ "ldr d11, [x15], #0x8\n"
+ "tbz x8, #0, 11f\n"
+ "ld1 { v11.s }[2], [x15]\n"
+ "b 11f\n"
+ "4:" // Height 1: Partial accumulate: partial_1_12
+ "mov x19, #0x30\n"
+ "tbz x8, #0, 11f\n"
+ "ldr s11, [x15, #0x0]\n"
+ "b 11f\n"
+ "5:" // Height 1: Partial accumulate: partial_2_8
+ "tbz x8, #1, 6f\n"
+ "ldr d10, [x15], #0x8\n"
+ "mov x19, #0x28\n"
+ "tbz x8, #0, 11f\n"
+ "ld1 { v10.s }[2], [x15]\n"
+ "b 11f\n"
+ "6:" // Height 1: Partial accumulate: partial_1_8
+ "mov x19, #0x20\n"
+ "tbz x8, #0, 11f\n"
+ "ldr s10, [x15, #0x0]\n"
+ "b 11f\n"
+ "7:" // Height 1: Partial accumulate: partial_4_0
+ "tbz x8, #2, 9f\n"
+ "ld1 { v8.4s }, [x15], #0x10\n"
+ "tbz x8, #1, 8f\n"
+ "mov x19, #0x18\n"
+ "ldr d9, [x15], #0x8\n"
+ "tbz x8, #0, 11f\n"
+ "ld1 { v9.s }[2], [x15]\n"
+ "b 11f\n"
+ "8:" // Height 1: Partial accumulate: partial_1_4
+ "mov x19, #0x10\n"
+ "tbz x8, #0, 11f\n"
+ "ldr s9, [x15, #0x0]\n"
+ "b 11f\n"
+ "9:" // Height 1: Partial accumulate: partial_2_0
+ "tbz x8, #1, 10f\n"
+ "ldr d8, [x15], #0x8\n"
+ "mov x19, #0x8\n"
+ "tbz x8, #0, 11f\n"
+ "ld1 { v8.s }[2], [x15]\n"
+ "b 11f\n"
+ "10:" // Height 1: Partial accumulate: partial_1_0
+ "ldr s8, [x15, #0x0]\n"
+ "mov x19, #0x0\n"
+ "11:" // Height 1: Partial accumulate: Done
+ "sub x15, x15, x19\n"
+ "b 14f\n"
+ "12:" // Height 1: full accumulate
+ "ldr q8, [x15, #0x0]\n"
+ "ldr q9, [x15, #0x10]\n"
+ "ldr q10, [x15, #0x20]\n"
+ "ldr q11, [x15, #0x30]\n"
+ "b 14f\n"
+ "13:" // Height 1: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "14:" // Height 1: setup done
+ "mov x14, #0x0\n"
+ "15:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w13, [x20, x14, LSL #0x2]\n"
+ "tbz %x[flags], #3, 16f\n"
+ "ldr x20, [%x[input_ptr], x14, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x12, [x20, #0x0]\n"
+ "cbnz x14, 17f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x12, x12, x19, LSL #2\n"
+ "b 17f\n"
+ "16:" // Height 1: setup direct input
+ "mov x12, %x[input_ptr]\n"
+ "17:" // Height 1: input setup done
+ "cmp x13, #0x4\n"
+ "blt 20f\n"
+ "ldr q0, [x12, #0x0]\n"
+ "ldr q6, [x17, #0x0]\n"
+ "cmp x13, #0x8\n"
+ "blt 19f\n"
+ "18:" // Height 1: Multiply loop: Main loop head
+ "fmla v8.4s, v6.4s, v0.s[0]\n"
+ "ldr d7, [x17, #0x10]\n"
+ "ldr x11, [x17, #0x18]\n"
+ "add x12, x12, #0x10\n"
+ "ldr d6, [x17, #0x20]\n"
+ "sub x13, x13, #0x4\n"
+ "ldr x10, [x17, #0x28]\n"
+ "cmp x13, #0x8\n"
+ "mov v7.d[1], x11\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "ldr x11, [x17, #0x38]\n"
+ "fmla v9.4s, v7.4s, v0.s[0]\n"
+ "mov v6.d[1], x10\n"
+ "ldr d7, [x17, #0x30]\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "ldr d6, [x17, #0x40]\n"
+ "ldr x10, [x17, #0x48]\n"
+ "mov v7.d[1], x11\n"
+ "ldr x11, [x17, #0x58]\n"
+ "ldr x9, [x12, #0x8]\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "mov v6.d[1], x10\n"
+ "ldr d7, [x17, #0x50]\n"
+ "fmla v8.4s, v6.4s, v0.s[1]\n"
+ "ldr d6, [x17, #0x60]\n"
+ "ldr x10, [x17, #0x68]\n"
+ "mov v7.d[1], x11\n"
+ "ldr x11, [x17, #0x78]\n"
+ "fmla v9.4s, v7.4s, v0.s[1]\n"
+ "mov v6.d[1], x10\n"
+ "ldr d7, [x17, #0x70]\n"
+ "fmla v10.4s, v6.4s, v0.s[1]\n"
+ "ldr d6, [x17, #0x80]\n"
+ "ldr x10, [x17, #0x88]\n"
+ "mov v7.d[1], x11\n"
+ "ldr x11, [x17, #0x98]\n"
+ "fmla v11.4s, v7.4s, v0.s[1]\n"
+ "mov v6.d[1], x10\n"
+ "ldr d7, [x17, #0x90]\n"
+ "fmla v8.4s, v6.4s, v0.s[2]\n"
+ "ldr d6, [x17, #0xa0]\n"
+ "ldr x10, [x17, #0xa8]\n"
+ "mov v7.d[1], x11\n"
+ "ldr x11, [x17, #0xb8]\n"
+ "fmla v9.4s, v7.4s, v0.s[2]\n"
+ "mov v6.d[1], x10\n"
+ "ldr d7, [x17, #0xb0]\n"
+ "fmla v10.4s, v6.4s, v0.s[2]\n"
+ "ldr d6, [x17, #0xc0]\n"
+ "ldr x10, [x17, #0xc8]\n"
+ "mov v7.d[1], x11\n"
+ "ldr x11, [x17, #0xd8]\n"
+ "fmla v11.4s, v7.4s, v0.s[2]\n"
+ "mov v6.d[1], x10\n"
+ "ldr d7, [x17, #0xd0]\n"
+ "fmla v8.4s, v6.4s, v0.s[3]\n"
+ "ldr d6, [x17, #0xe0]\n"
+ "ldr x10, [x17, #0xe8]\n"
+ "mov v7.d[1], x11\n"
+ "ldr x11, [x17, #0xf8]\n"
+ "fmla v9.4s, v7.4s, v0.s[3]\n"
+ "mov v6.d[1], x10\n"
+ "ldr d7, [x17, #0xf0]\n"
+ "add x17, x17, #0x100\n"
+ "fmla v10.4s, v6.4s, v0.s[3]\n"
+ "ldr d6, [x17, #0x0]\n"
+ "ldr x10, [x17, #0x8]\n"
+ "mov v7.d[1], x11\n"
+ "fmla v11.4s, v7.4s, v0.s[3]\n"
+ "mov v6.d[1], x10\n"
+ "ldr d0, [x12, #0x0]\n"
+ "mov v0.d[1], x9\n"
+ "bge 18b\n"
+ "19:" // Height 1: Multiply loop: Single iteration only
+ "fmla v8.4s, v6.4s, v0.s[0]\n"
+ "ldr q7, [x17, #0x10]\n"
+ "ldr q6, [x17, #0x20]\n"
+ "sub x13, x13, #0x4\n"
+ "add x12, x12, #0x10\n"
+ "fmla v9.4s, v7.4s, v0.s[0]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "ldr q7, [x17, #0x30]\n"
+ "ldr q6, [x17, #0x40]\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "ldr q7, [x17, #0x50]\n"
+ "fmla v8.4s, v6.4s, v0.s[1]\n"
+ "ldr q6, [x17, #0x60]\n"
+ "fmla v9.4s, v7.4s, v0.s[1]\n"
+ "ldr q7, [x17, #0x70]\n"
+ "fmla v10.4s, v6.4s, v0.s[1]\n"
+ "ldr q6, [x17, #0x80]\n"
+ "fmla v11.4s, v7.4s, v0.s[1]\n"
+ "ldr q7, [x17, #0x90]\n"
+ "fmla v8.4s, v6.4s, v0.s[2]\n"
+ "ldr q6, [x17, #0xa0]\n"
+ "fmla v9.4s, v7.4s, v0.s[2]\n"
+ "ldr q7, [x17, #0xb0]\n"
+ "fmla v10.4s, v6.4s, v0.s[2]\n"
+ "ldr q6, [x17, #0xc0]\n"
+ "fmla v11.4s, v7.4s, v0.s[2]\n"
+ "ldr q7, [x17, #0xd0]\n"
+ "fmla v8.4s, v6.4s, v0.s[3]\n"
+ "ldr q6, [x17, #0xe0]\n"
+ "fmla v9.4s, v7.4s, v0.s[3]\n"
+ "ldr q7, [x17, #0xf0]\n"
+ "fmla v10.4s, v6.4s, v0.s[3]\n"
+ "add x17, x17, #0x100\n"
+ "fmla v11.4s, v7.4s, v0.s[3]\n"
+ "20:" // Height 1: Multiply loop: Main loop skip
+ "cbz x13, 22f\n"
+ "21:" // Height 1: Multiply loop: Odd block loop
+ "ldr s0, [x12], #0x4\n"
+ "sub x13, x13, #0x1\n"
+ "ldr q6, [x17, #0x0]\n"
+ "ldr q7, [x17, #0x10]\n"
+ "fmla v8.4s, v6.4s, v0.s[0]\n"
+ "ldr q6, [x17, #0x20]\n"
+ "fmla v9.4s, v7.4s, v0.s[0]\n"
+ "ldr q7, [x17, #0x30]\n"
+ "add x17, x17, #0x40\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "cbnz x13, 21b\n"
+ "22:" // Height 1: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x14, x14, #0x1\n"
+ "cmp x14, x19\n"
+ "bne 15b\n"
+ "prfm pstl1keep, [x15, #0x0]\n"
+ "tbz %x[flags], #1, 23f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v1.4s }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v0.4s }, [x19]\n"
+ "fmin v8.4s, v8.4s, v0.4s\n"
+ "fmin v9.4s, v9.4s, v0.4s\n"
+ "fmin v10.4s, v10.4s, v0.4s\n"
+ "fmin v11.4s, v11.4s, v0.4s\n"
+ "fmax v8.4s, v8.4s, v1.4s\n"
+ "fmax v9.4s, v9.4s, v1.4s\n"
+ "fmax v10.4s, v10.4s, v1.4s\n"
+ "fmax v11.4s, v11.4s, v1.4s\n"
+ "23:" // Height 1: No activation
+ "cmp x8, #0x10\n"
+ "bge 32f\n"
+ "tbz x8, #3, 27f\n"
+ "st1 { v8.4s }, [x15], #0x10\n"
+ "st1 { v9.4s }, [x15], #0x10\n"
+ "tbz x8, #2, 25f\n"
+ "st1 { v10.4s }, [x15], #0x10\n"
+ "tbz x8, #1, 24f\n"
+ "str d11, [x15], #0x8\n"
+ "tbz x8, #0, 31f\n"
+ "st1 { v11.s }[2], [x15]\n"
+ "b 31f\n"
+ "24:" // Height 1: Partial direct writeback: partial_1_12
+ "tbz x8, #0, 31f\n"
+ "str s11, [x15, #0x0]\n"
+ "b 31f\n"
+ "25:" // Height 1: Partial direct writeback: partial_2_8
+ "tbz x8, #1, 26f\n"
+ "str d10, [x15], #0x8\n"
+ "tbz x8, #0, 31f\n"
+ "st1 { v10.s }[2], [x15]\n"
+ "b 31f\n"
+ "26:" // Height 1: Partial direct writeback: partial_1_8
+ "tbz x8, #0, 31f\n"
+ "str s10, [x15, #0x0]\n"
+ "b 31f\n"
+ "27:" // Height 1: Partial direct writeback: partial_4_0
+ "tbz x8, #2, 29f\n"
+ "st1 { v8.4s }, [x15], #0x10\n"
+ "tbz x8, #1, 28f\n"
+ "str d9, [x15], #0x8\n"
+ "tbz x8, #0, 31f\n"
+ "st1 { v9.s }[2], [x15]\n"
+ "b 31f\n"
+ "28:" // Height 1: Partial direct writeback: partial_1_4
+ "tbz x8, #0, 31f\n"
+ "str s9, [x15, #0x0]\n"
+ "b 31f\n"
+ "29:" // Height 1: Partial direct writeback: partial_2_0
+ "tbz x8, #1, 30f\n"
+ "str d8, [x15], #0x8\n"
+ "tbz x8, #0, 31f\n"
+ "st1 { v8.s }[2], [x15]\n"
+ "b 31f\n"
+ "30:" // Height 1: Partial direct writeback: partial_1_0
+ "str s8, [x15, #0x0]\n"
+ "31:" // Height 1: Partial direct writeback: Done
+ "b 33f\n"
+ "32:" // Height 1: Full writeback
+ "str q8, [x15, #0x0]\n"
+ "str q9, [x15, #0x10]\n"
+ "str q10, [x15, #0x20]\n"
+ "str q11, [x15, #0x30]\n"
+ "add x15, x15, #0x40\n"
+ "33:" // Height 1: Writeback done
+ "subs x8, x8, #0x10\n"
+ "bgt 2b\n"
+ "b 200f\n"
+ "34:" // Height 2
+ "ldr x8, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x16, %x[bias]\n"
+ "ldr x17, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x15, %x[output_ptr]\n"
+ "35:" // Height 2: Column loop
+ "cbz x16, 36f\n"
+ "ldr q8, [x16, #0x0]\n"
+ "ldr q9, [x16, #0x10]\n"
+ "ldr q10, [x16, #0x20]\n"
+ "mov v12.16b, v8.16b\n"
+ "ldr q11, [x16, #0x30]\n"
+ "mov v13.16b, v9.16b\n"
+ "add x16, x16, #0x40\n"
+ "mov v14.16b, v10.16b\n"
+ "mov v15.16b, v11.16b\n"
+ "b 47f\n"
+ "36:" // Height 2: no bias
+ "tbz %x[flags], #0, 46f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x8, #0x10\n"
+ "add x25, x15, x19, LSL #2\n"
+ "bge 45f\n"
+ "tbz x8, #3, 40f\n"
+ "ld1 { v8.4s }, [x15], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v9.4s }, [x15], #0x10\n"
+ "ld1 { v13.4s }, [x25], #0x10\n"
+ "tbz x8, #2, 38f\n"
+ "ld1 { v10.4s }, [x15], #0x10\n"
+ "ld1 { v14.4s }, [x25], #0x10\n"
+ "tbz x8, #1, 37f\n"
+ "mov x19, #0x38\n"
+ "ldr d11, [x15], #0x8\n"
+ "ldr d15, [x25], #0x8\n"
+ "tbz x8, #0, 44f\n"
+ "ld1 { v11.s }[2], [x15]\n"
+ "ld1 { v15.s }[2], [x25]\n"
+ "b 44f\n"
+ "37:" // Height 2: Partial accumulate: partial_1_12
+ "mov x19, #0x30\n"
+ "tbz x8, #0, 44f\n"
+ "ldr s11, [x15, #0x0]\n"
+ "ldr s15, [x25, #0x0]\n"
+ "b 44f\n"
+ "38:" // Height 2: Partial accumulate: partial_2_8
+ "tbz x8, #1, 39f\n"
+ "ldr d10, [x15], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
+ "mov x19, #0x28\n"
+ "tbz x8, #0, 44f\n"
+ "ld1 { v10.s }[2], [x15]\n"
+ "ld1 { v14.s }[2], [x25]\n"
+ "b 44f\n"
+ "39:" // Height 2: Partial accumulate: partial_1_8
+ "mov x19, #0x20\n"
+ "tbz x8, #0, 44f\n"
+ "ldr s10, [x15, #0x0]\n"
+ "ldr s14, [x25, #0x0]\n"
+ "b 44f\n"
+ "40:" // Height 2: Partial accumulate: partial_4_0
+ "tbz x8, #2, 42f\n"
+ "ld1 { v8.4s }, [x15], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "tbz x8, #1, 41f\n"
+ "mov x19, #0x18\n"
+ "ldr d9, [x15], #0x8\n"
+ "ldr d13, [x25], #0x8\n"
+ "tbz x8, #0, 44f\n"
+ "ld1 { v9.s }[2], [x15]\n"
+ "ld1 { v13.s }[2], [x25]\n"
+ "b 44f\n"
+ "41:" // Height 2: Partial accumulate: partial_1_4
+ "mov x19, #0x10\n"
+ "tbz x8, #0, 44f\n"
+ "ldr s9, [x15, #0x0]\n"
+ "ldr s13, [x25, #0x0]\n"
+ "b 44f\n"
+ "42:" // Height 2: Partial accumulate: partial_2_0
+ "tbz x8, #1, 43f\n"
+ "ldr d8, [x15], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
+ "mov x19, #0x8\n"
+ "tbz x8, #0, 44f\n"
+ "ld1 { v8.s }[2], [x15]\n"
+ "ld1 { v12.s }[2], [x25]\n"
+ "b 44f\n"
+ "43:" // Height 2: Partial accumulate: partial_1_0
+ "ldr s8, [x15, #0x0]\n"
+ "mov x19, #0x0\n"
+ "ldr s12, [x25, #0x0]\n"
+ "44:" // Height 2: Partial accumulate: Done
+ "sub x15, x15, x19\n"
+ "b 47f\n"
+ "45:" // Height 2: full accumulate
+ "ldr q8, [x15, #0x0]\n"
+ "ldr q9, [x15, #0x10]\n"
+ "ldr q10, [x15, #0x20]\n"
+ "ldr q11, [x15, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
+ "b 47f\n"
+ "46:" // Height 2: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "47:" // Height 2: setup done
+ "mov x14, #0x0\n"
+ "48:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w13, [x20, x14, LSL #0x2]\n"
+ "tbz %x[flags], #3, 49f\n"
+ "ldr x20, [%x[input_ptr], x14, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x12, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "cbnz x14, 50f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x12, x12, x19, LSL #2\n"
+ "add x28, x28, x19, LSL #2\n"
+ "b 50f\n"
+ "49:" // Height 2: setup direct input
+ "mov x12, %x[input_ptr]\n"
+ "add x28, x12, x19, LSL #2\n"
+ "50:" // Height 2: input setup done
+ "cmp x13, #0x4\n"
+ "blt 53f\n"
+ "ldr q0, [x12, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "cmp x13, #0x8\n"
+ "ldr q6, [x17, #0x0]\n"
+ "blt 52f\n"
+ "51:" // Height 2: Multiply loop: Main loop head
+ "fmla v8.4s, v6.4s, v0.s[0]\n"
+ "ldr d7, [x17, #0x10]\n"
+ "fmla v12.4s, v6.4s, v1.s[0]\n"
+ "ldr x11, [x17, #0x18]\n"
+ "ldr d6, [x17, #0x20]\n"
+ "add x12, x12, #0x10\n"
+ "ldr x10, [x17, #0x28]\n"
+ "add x28, x28, #0x10\n"
+ "mov v7.d[1], x11\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "sub x13, x13, #0x4\n"
+ "fmla v9.4s, v7.4s, v0.s[0]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v13.4s, v7.4s, v1.s[0]\n"
+ "ldr d7, [x17, #0x30]\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "ldr x11, [x17, #0x38]\n"
+ "fmla v14.4s, v6.4s, v1.s[0]\n"
+ "ldr d6, [x17, #0x40]\n"
+ "ldr x10, [x17, #0x48]\n"
+ "cmp x13, #0x8\n"
+ "mov v7.d[1], x11\n"
+ "ldr x11, [x17, #0x58]\n"
+ "ldr x9, [x12, #0x8]\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v15.4s, v7.4s, v1.s[0]\n"
+ "ldr d7, [x17, #0x50]\n"
+ "fmla v8.4s, v6.4s, v0.s[1]\n"
+ "ldr x10, [x17, #0x68]\n"
+ "fmla v12.4s, v6.4s, v1.s[1]\n"
+ "ldr d6, [x17, #0x60]\n"
+ "mov v7.d[1], x11\n"
+ "ldr x11, [x17, #0x78]\n"
+ "ldr x27, [x28, #0x8]\n"
+ "fmla v9.4s, v7.4s, v0.s[1]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v13.4s, v7.4s, v1.s[1]\n"
+ "ldr d7, [x17, #0x70]\n"
+ "fmla v10.4s, v6.4s, v0.s[1]\n"
+ "ldr x10, [x17, #0x88]\n"
+ "fmla v14.4s, v6.4s, v1.s[1]\n"
+ "ldr d6, [x17, #0x80]\n"
+ "mov v7.d[1], x11\n"
+ "ldr x11, [x17, #0x98]\n"
+ "fmla v11.4s, v7.4s, v0.s[1]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v15.4s, v7.4s, v1.s[1]\n"
+ "ldr d7, [x17, #0x90]\n"
+ "fmla v8.4s, v6.4s, v0.s[2]\n"
+ "ldr x10, [x17, #0xa8]\n"
+ "fmla v12.4s, v6.4s, v1.s[2]\n"
+ "ldr d6, [x17, #0xa0]\n"
+ "mov v7.d[1], x11\n"
+ "ldr x11, [x17, #0xb8]\n"
+ "fmla v9.4s, v7.4s, v0.s[2]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v13.4s, v7.4s, v1.s[2]\n"
+ "ldr d7, [x17, #0xb0]\n"
+ "fmla v10.4s, v6.4s, v0.s[2]\n"
+ "ldr x10, [x17, #0xc8]\n"
+ "fmla v14.4s, v6.4s, v1.s[2]\n"
+ "ldr d6, [x17, #0xc0]\n"
+ "mov v7.d[1], x11\n"
+ "ldr x11, [x17, #0xd8]\n"
+ "fmla v11.4s, v7.4s, v0.s[2]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v15.4s, v7.4s, v1.s[2]\n"
+ "ldr d7, [x17, #0xd0]\n"
+ "fmla v8.4s, v6.4s, v0.s[3]\n"
+ "ldr x10, [x17, #0xe8]\n"
+ "fmla v12.4s, v6.4s, v1.s[3]\n"
+ "ldr d6, [x17, #0xe0]\n"
+ "mov v7.d[1], x11\n"
+ "ldr x11, [x17, #0xf8]\n"
+ "fmla v9.4s, v7.4s, v0.s[3]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v13.4s, v7.4s, v1.s[3]\n"
+ "ldr d7, [x17, #0xf0]\n"
+ "fmla v10.4s, v6.4s, v0.s[3]\n"
+ "add x17, x17, #0x100\n"
+ "fmla v14.4s, v6.4s, v1.s[3]\n"
+ "ldr d6, [x17, #0x0]\n"
+ "mov v7.d[1], x11\n"
+ "ldr x10, [x17, #0x8]\n"
+ "fmla v11.4s, v7.4s, v0.s[3]\n"
+ "ldr d0, [x12, #0x0]\n"
+ "fmla v15.4s, v7.4s, v1.s[3]\n"
+ "mov v6.d[1], x10\n"
+ "ldr d1, [x28, #0x0]\n"
+ "mov v0.d[1], x9\n"
+ "mov v1.d[1], x27\n"
+ "bge 51b\n"
+ "52:" // Height 2: Multiply loop: Single iteration only
+ "fmla v8.4s, v6.4s, v0.s[0]\n"
+ "ldr q7, [x17, #0x10]\n"
+ "fmla v12.4s, v6.4s, v1.s[0]\n"
+ "ldr q6, [x17, #0x20]\n"
+ "sub x13, x13, #0x4\n"
+ "add x12, x12, #0x10\n"
+ "fmla v9.4s, v7.4s, v0.s[0]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "fmla v13.4s, v7.4s, v1.s[0]\n"
+ "ldr q7, [x17, #0x30]\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v14.4s, v6.4s, v1.s[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "ldr q6, [x17, #0x40]\n"
+ "fmla v15.4s, v7.4s, v1.s[0]\n"
+ "ldr q7, [x17, #0x50]\n"
+ "fmla v8.4s, v6.4s, v0.s[1]\n"
+ "fmla v12.4s, v6.4s, v1.s[1]\n"
+ "ldr q6, [x17, #0x60]\n"
+ "fmla v9.4s, v7.4s, v0.s[1]\n"
+ "fmla v13.4s, v7.4s, v1.s[1]\n"
+ "ldr q7, [x17, #0x70]\n"
+ "fmla v10.4s, v6.4s, v0.s[1]\n"
+ "fmla v14.4s, v6.4s, v1.s[1]\n"
+ "ldr q6, [x17, #0x80]\n"
+ "fmla v11.4s, v7.4s, v0.s[1]\n"
+ "fmla v15.4s, v7.4s, v1.s[1]\n"
+ "ldr q7, [x17, #0x90]\n"
+ "fmla v8.4s, v6.4s, v0.s[2]\n"
+ "fmla v12.4s, v6.4s, v1.s[2]\n"
+ "ldr q6, [x17, #0xa0]\n"
+ "fmla v9.4s, v7.4s, v0.s[2]\n"
+ "fmla v13.4s, v7.4s, v1.s[2]\n"
+ "ldr q7, [x17, #0xb0]\n"
+ "fmla v10.4s, v6.4s, v0.s[2]\n"
+ "fmla v14.4s, v6.4s, v1.s[2]\n"
+ "ldr q6, [x17, #0xc0]\n"
+ "fmla v11.4s, v7.4s, v0.s[2]\n"
+ "fmla v15.4s, v7.4s, v1.s[2]\n"
+ "ldr q7, [x17, #0xd0]\n"
+ "fmla v8.4s, v6.4s, v0.s[3]\n"
+ "fmla v12.4s, v6.4s, v1.s[3]\n"
+ "ldr q6, [x17, #0xe0]\n"
+ "fmla v9.4s, v7.4s, v0.s[3]\n"
+ "fmla v13.4s, v7.4s, v1.s[3]\n"
+ "ldr q7, [x17, #0xf0]\n"
+ "fmla v10.4s, v6.4s, v0.s[3]\n"
+ "add x17, x17, #0x100\n"
+ "fmla v14.4s, v6.4s, v1.s[3]\n"
+ "fmla v11.4s, v7.4s, v0.s[3]\n"
+ "fmla v15.4s, v7.4s, v1.s[3]\n"
+ "53:" // Height 2: Multiply loop: Main loop skip
+ "cbz x13, 55f\n"
+ "54:" // Height 2: Multiply loop: Odd block loop
+ "ldr s0, [x12], #0x4\n"
+ "sub x13, x13, #0x1\n"
+ "ldr s1, [x28], #0x4\n"
+ "ldr q6, [x17, #0x0]\n"
+ "ldr q7, [x17, #0x10]\n"
+ "fmla v8.4s, v6.4s, v0.s[0]\n"
+ "fmla v12.4s, v6.4s, v1.s[0]\n"
+ "ldr q6, [x17, #0x20]\n"
+ "fmla v9.4s, v7.4s, v0.s[0]\n"
+ "fmla v13.4s, v7.4s, v1.s[0]\n"
+ "ldr q7, [x17, #0x30]\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "add x17, x17, #0x40\n"
+ "fmla v14.4s, v6.4s, v1.s[0]\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "fmla v15.4s, v7.4s, v1.s[0]\n"
+ "cbnz x13, 54b\n"
+ "55:" // Height 2: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x14, x14, #0x1\n"
+ "cmp x14, x19\n"
+ "bne 48b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "prfm pstl1keep, [x15, #0x0]\n"
+ "add x25, x15, x19, LSL #2\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "tbz %x[flags], #1, 56f\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v1.4s }, [x20]\n"
+ "ld1r { v0.4s }, [x19]\n"
+ "fmin v8.4s, v8.4s, v0.4s\n"
+ "fmin v9.4s, v9.4s, v0.4s\n"
+ "fmin v10.4s, v10.4s, v0.4s\n"
+ "fmin v11.4s, v11.4s, v0.4s\n"
+ "fmin v12.4s, v12.4s, v0.4s\n"
+ "fmin v13.4s, v13.4s, v0.4s\n"
+ "fmin v14.4s, v14.4s, v0.4s\n"
+ "fmin v15.4s, v15.4s, v0.4s\n"
+ "fmax v8.4s, v8.4s, v1.4s\n"
+ "fmax v9.4s, v9.4s, v1.4s\n"
+ "fmax v10.4s, v10.4s, v1.4s\n"
+ "fmax v11.4s, v11.4s, v1.4s\n"
+ "fmax v12.4s, v12.4s, v1.4s\n"
+ "fmax v13.4s, v13.4s, v1.4s\n"
+ "fmax v14.4s, v14.4s, v1.4s\n"
+ "fmax v15.4s, v15.4s, v1.4s\n"
+ "56:" // Height 2: No activation
+ "cmp x8, #0x10\n"
+ "bge 65f\n"
+ "tbz x8, #3, 60f\n"
+ "st1 { v8.4s }, [x15], #0x10\n"
+ "st1 { v9.4s }, [x15], #0x10\n"
+ "st1 { v12.4s }, [x25], #0x10\n"
+ "st1 { v13.4s }, [x25], #0x10\n"
+ "tbz x8, #2, 58f\n"
+ "st1 { v10.4s }, [x15], #0x10\n"
+ "st1 { v14.4s }, [x25], #0x10\n"
+ "tbz x8, #1, 57f\n"
+ "str d11, [x15], #0x8\n"
+ "str d15, [x25], #0x8\n"
+ "tbz x8, #0, 64f\n"
+ "st1 { v11.s }[2], [x15]\n"
+ "st1 { v15.s }[2], [x25]\n"
+ "b 64f\n"
+ "57:" // Height 2: Partial direct writeback: partial_1_12
+ "tbz x8, #0, 64f\n"
+ "str s11, [x15, #0x0]\n"
+ "str s15, [x25, #0x0]\n"
+ "b 64f\n"
+ "58:" // Height 2: Partial direct writeback: partial_2_8
+ "tbz x8, #1, 59f\n"
+ "str d10, [x15], #0x8\n"
+ "str d14, [x25], #0x8\n"
+ "tbz x8, #0, 64f\n"
+ "st1 { v10.s }[2], [x15]\n"
+ "st1 { v14.s }[2], [x25]\n"
+ "b 64f\n"
+ "59:" // Height 2: Partial direct writeback: partial_1_8
+ "tbz x8, #0, 64f\n"
+ "str s10, [x15, #0x0]\n"
+ "str s14, [x25, #0x0]\n"
+ "b 64f\n"
+ "60:" // Height 2: Partial direct writeback: partial_4_0
+ "tbz x8, #2, 62f\n"
+ "st1 { v8.4s }, [x15], #0x10\n"
+ "st1 { v12.4s }, [x25], #0x10\n"
+ "tbz x8, #1, 61f\n"
+ "str d9, [x15], #0x8\n"
+ "str d13, [x25], #0x8\n"
+ "tbz x8, #0, 64f\n"
+ "st1 { v9.s }[2], [x15]\n"
+ "st1 { v13.s }[2], [x25]\n"
+ "b 64f\n"
+ "61:" // Height 2: Partial direct writeback: partial_1_4
+ "tbz x8, #0, 64f\n"
+ "str s9, [x15, #0x0]\n"
+ "str s13, [x25, #0x0]\n"
+ "b 64f\n"
+ "62:" // Height 2: Partial direct writeback: partial_2_0
+ "tbz x8, #1, 63f\n"
+ "str d8, [x15], #0x8\n"
+ "str d12, [x25], #0x8\n"
+ "tbz x8, #0, 64f\n"
+ "st1 { v8.s }[2], [x15]\n"
+ "st1 { v12.s }[2], [x25]\n"
+ "b 64f\n"
+ "63:" // Height 2: Partial direct writeback: partial_1_0
+ "str s8, [x15, #0x0]\n"
+ "str s12, [x25, #0x0]\n"
+ "64:" // Height 2: Partial direct writeback: Done
+ "b 66f\n"
+ "65:" // Height 2: Full writeback
+ "str q8, [x15, #0x0]\n"
+ "str q9, [x15, #0x10]\n"
+ "str q10, [x15, #0x20]\n"
+ "str q11, [x15, #0x30]\n"
+ "add x15, x15, #0x40\n"
+ "str q12, [x25, #0x0]\n"
+ "str q13, [x25, #0x10]\n"
+ "str q14, [x25, #0x20]\n"
+ "str q15, [x25, #0x30]\n"
+ "66:" // Height 2: Writeback done
+ "subs x8, x8, #0x10\n"
+ "bgt 35b\n"
+ "b 200f\n"
+ "67:" // Height 3: three-row path; accumulators v8-v11 / v12-v15 / v16-v19 are fed from input pointers x12 / x28 / x26
+ "ldr x8, [%x[args_ptr], %[offsetof_N]]\n" // x8 = N field of args; reduced by 16 at the end of each column-loop pass
+ "mov x16, %x[bias]\n" // x16 = bias pointer (tested for null at 68)
+ "ldr x17, [%x[args_ptr], %[offsetof_B_ptr]]\n" // x17 = B_ptr field of args; only ever advanced in this section
+ "mov x15, %x[output_ptr]\n" // x15 = output pointer for the first row
+ "68:" // Height 3: Column loop
+ "cbz x16, 69f\n" // null bias pointer -> no-bias initialisation
+ "ldr q8, [x16, #0x0]\n" // load 16 bias floats into v8-v11 and copy them into the other two row groups
+ "ldr q9, [x16, #0x10]\n"
+ "ldr q10, [x16, #0x20]\n"
+ "mov v12.16b, v8.16b\n"
+ "mov v16.16b, v8.16b\n"
+ "mov v13.16b, v9.16b\n"
+ "mov v17.16b, v9.16b\n"
+ "mov v14.16b, v10.16b\n"
+ "mov v18.16b, v10.16b\n"
+ "ldr q11, [x16, #0x30]\n"
+ "add x16, x16, #0x40\n" // step the bias pointer past the 64 bytes just consumed
+ "mov v15.16b, v11.16b\n"
+ "mov v19.16b, v11.16b\n"
+ "b 80f\n"
+ "69:" // Height 3: no bias
+ "tbz %x[flags], #0, 79f\n" // flags bit 0 clear -> start from zero; set -> load the existing output as the starting value
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x8, #0x10\n"
+ "add x25, x15, x19, LSL #2\n" // x25 = second-row output pointer = x15 + output_offset * 4
+ "add x24, x25, x19, LSL #2\n" // x24 = third-row output pointer = x25 + output_offset * 4
+ "bge 78f\n" // x8 >= 16: load a full 16-column block
+ "tbz x8, #3, 73f\n" // partial block: bits 3..0 of x8 select 8 / 4 / 2 / 1 element loads
+ "ld1 { v8.4s }, [x15], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v16.4s }, [x24], #0x10\n"
+ "ld1 { v9.4s }, [x15], #0x10\n"
+ "ld1 { v13.4s }, [x25], #0x10\n"
+ "ld1 { v17.4s }, [x24], #0x10\n"
+ "tbz x8, #2, 71f\n"
+ "ld1 { v10.4s }, [x15], #0x10\n"
+ "ld1 { v14.4s }, [x25], #0x10\n"
+ "ld1 { v18.4s }, [x24], #0x10\n"
+ "tbz x8, #1, 70f\n"
+ "ldr d11, [x15], #0x8\n"
+ "mov x19, #0x38\n" // x19 = number of bytes x15 has been post-incremented on this path; used to rewind x15 at 77
+ "ldr d15, [x25], #0x8\n"
+ "ldr d19, [x24], #0x8\n"
+ "tbz x8, #0, 77f\n"
+ "ld1 { v11.s }[2], [x15]\n"
+ "ld1 { v15.s }[2], [x25]\n"
+ "ld1 { v19.s }[2], [x24]\n"
+ "b 77f\n"
+ "70:" // Height 3: Partial accumulate: partial_1_12
+ "mov x19, #0x30\n"
+ "tbz x8, #0, 77f\n"
+ "ldr s11, [x15, #0x0]\n"
+ "ldr s15, [x25, #0x0]\n"
+ "ldr s19, [x24, #0x0]\n"
+ "b 77f\n"
+ "71:" // Height 3: Partial accumulate: partial_2_8
+ "tbz x8, #1, 72f\n"
+ "ldr d10, [x15], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
+ "mov x19, #0x28\n"
+ "ldr d18, [x24], #0x8\n"
+ "tbz x8, #0, 77f\n"
+ "ld1 { v10.s }[2], [x15]\n"
+ "ld1 { v14.s }[2], [x25]\n"
+ "ld1 { v18.s }[2], [x24]\n"
+ "b 77f\n"
+ "72:" // Height 3: Partial accumulate: partial_1_8
+ "mov x19, #0x20\n"
+ "tbz x8, #0, 77f\n"
+ "ldr s10, [x15, #0x0]\n"
+ "ldr s14, [x25, #0x0]\n"
+ "ldr s18, [x24, #0x0]\n"
+ "b 77f\n"
+ "73:" // Height 3: Partial accumulate: partial_4_0
+ "tbz x8, #2, 75f\n"
+ "ld1 { v8.4s }, [x15], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v16.4s }, [x24], #0x10\n"
+ "tbz x8, #1, 74f\n"
+ "ldr d9, [x15], #0x8\n"
+ "mov x19, #0x18\n"
+ "ldr d13, [x25], #0x8\n"
+ "ldr d17, [x24], #0x8\n"
+ "tbz x8, #0, 77f\n"
+ "ld1 { v9.s }[2], [x15]\n"
+ "ld1 { v13.s }[2], [x25]\n"
+ "ld1 { v17.s }[2], [x24]\n"
+ "b 77f\n"
+ "74:" // Height 3: Partial accumulate: partial_1_4
+ "mov x19, #0x10\n"
+ "tbz x8, #0, 77f\n"
+ "ldr s9, [x15, #0x0]\n"
+ "ldr s13, [x25, #0x0]\n"
+ "ldr s17, [x24, #0x0]\n"
+ "b 77f\n"
+ "75:" // Height 3: Partial accumulate: partial_2_0
+ "tbz x8, #1, 76f\n"
+ "ldr d8, [x15], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
+ "mov x19, #0x8\n"
+ "ldr d16, [x24], #0x8\n"
+ "tbz x8, #0, 77f\n"
+ "ld1 { v8.s }[2], [x15]\n"
+ "ld1 { v12.s }[2], [x25]\n"
+ "ld1 { v16.s }[2], [x24]\n"
+ "b 77f\n"
+ "76:" // Height 3: Partial accumulate: partial_1_0
+ "ldr s8, [x15, #0x0]\n" // reached with bits 3..1 of x8 clear; the single element is loaded without testing bit 0
+ "mov x19, #0x0\n"
+ "ldr s12, [x25, #0x0]\n"
+ "ldr s16, [x24, #0x0]\n"
+ "77:" // Height 3: Partial accumulate: Done
+ "sub x15, x15, x19\n" // rewind x15 to the start of the block; x25/x24 are recomputed from x15 before writeback
+ "b 80f\n"
+ "78:" // Height 3: full accumulate
+ "ldr q8, [x15, #0x0]\n"
+ "ldr q9, [x15, #0x10]\n"
+ "ldr q10, [x15, #0x20]\n"
+ "ldr q11, [x15, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
+ "ldr q16, [x24, #0x0]\n"
+ "ldr q17, [x24, #0x10]\n"
+ "ldr q18, [x24, #0x20]\n"
+ "ldr q19, [x24, #0x30]\n"
+ "b 80f\n"
+ "79:" // Height 3: no accumulate
+ "movi v8.16b, #0x0\n" // zero all twelve accumulators
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "80:" // Height 3: setup done
+ "mov x14, #0x0\n" // x14 = string index, counted up to num_strings at 88
+ "81:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w13, [x20, x14, LSL #0x2]\n" // w13 = string_lengths[x14] (32-bit entries); counted down by the multiply loops below
+ "tbz %x[flags], #3, 82f\n" // flags bit 3 clear -> input_ptr is used directly; set -> input_ptr is a table of pointers
+ "ldr x20, [%x[input_ptr], x14, LSL #0x3]\n" // x20 = input_ptr[x14] (8-byte entries)
+ "add x20, x20, x19, LSL #3\n" // skip input_offset 8-byte entries
+ "ldr x12, [x20, #0x0]\n" // three consecutive row pointers
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "cbnz x14, 83f\n" // the initial column offset is applied for string 0 only
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x12, x12, x19, LSL #2\n"
+ "add x28, x28, x19, LSL #2\n"
+ "add x26, x26, x19, LSL #2\n"
+ "b 83f\n"
+ "82:" // Height 3: setup direct input
+ "mov x12, %x[input_ptr]\n"
+ "add x28, x12, x19, LSL #2\n" // each following row is input_offset * 4 bytes after the previous one
+ "add x26, x28, x19, LSL #2\n"
+ "83:" // Height 3: input setup done
+ "cmp x13, #0x4\n"
+ "blt 86f\n" // fewer than 4 elements in this string: go straight to the one-at-a-time loop
+ "ldr q0, [x12, #0x0]\n" // preload 4 input floats per row (v0-v2) and the first weight vector (v6)
+ "ldr q1, [x28, #0x0]\n"
+ "cmp x13, #0x8\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q6, [x17, #0x0]\n"
+ "blt 85f\n" // fewer than 8 elements: only the non-pipelined single iteration is run
+ "84:" // Height 3: Multiply loop: Main loop head
+ "fmla v8.4s, v6.4s, v0.s[0]\n"
+ "ldr d7, [x17, #0x10]\n" // each 128-bit weight vector is built from a 64-bit ldr d plus ldr x / mov .d[1] (NOTE(review): presumably the A55 scheduling named in the commit message)
+ "fmla v12.4s, v6.4s, v1.s[0]\n"
+ "ldr x11, [x17, #0x18]\n"
+ "fmla v16.4s, v6.4s, v2.s[0]\n"
+ "ldr d6, [x17, #0x20]\n"
+ "ldr x10, [x17, #0x28]\n"
+ "add x12, x12, #0x10\n"
+ "mov v7.d[1], x11\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "ldr x11, [x17, #0x38]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v9.4s, v7.4s, v0.s[0]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v13.4s, v7.4s, v1.s[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "fmla v17.4s, v7.4s, v2.s[0]\n"
+ "ldr d7, [x17, #0x30]\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "ldr x10, [x17, #0x48]\n"
+ "fmla v14.4s, v6.4s, v1.s[0]\n"
+ "ldr x9, [x12, #0x8]\n" // x12 was already advanced: x9 = upper 64 bits of the NEXT first-row input vector
+ "fmla v18.4s, v6.4s, v2.s[0]\n"
+ "mov v7.d[1], x11\n"
+ "ldr d6, [x17, #0x40]\n"
+ "add x26, x26, #0x10\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "fmla v15.4s, v7.4s, v1.s[0]\n"
+ "ldr x11, [x17, #0x58]\n"
+ "fmla v19.4s, v7.4s, v2.s[0]\n"
+ "mov v6.d[1], x10\n"
+ "ldr d7, [x17, #0x50]\n"
+ "sub x13, x13, #0x4\n"
+ "fmla v8.4s, v6.4s, v0.s[1]\n"
+ "ldr x10, [x17, #0x68]\n"
+ "fmla v12.4s, v6.4s, v1.s[1]\n"
+ "ldr x27, [x28, #0x8]\n" // upper 64 bits of the next second-row input vector
+ "fmla v16.4s, v6.4s, v2.s[1]\n"
+ "mov v7.d[1], x11\n"
+ "ldr d6, [x17, #0x60]\n"
+ "cmp x13, #0x8\n" // flags set here are consumed by the bge at the bottom of the loop; nothing in between writes flags
+ "fmla v9.4s, v7.4s, v0.s[1]\n"
+ "ldr x11, [x17, #0x78]\n"
+ "fmla v13.4s, v7.4s, v1.s[1]\n"
+ "ldr x25, [x26, #0x8]\n" // upper 64 bits of the next third-row input vector (x25 is scratch here)
+ "fmla v17.4s, v7.4s, v2.s[1]\n"
+ "mov v6.d[1], x10\n"
+ "ldr d7, [x17, #0x70]\n"
+ "fmla v10.4s, v6.4s, v0.s[1]\n"
+ "ldr x10, [x17, #0x88]\n"
+ "fmla v14.4s, v6.4s, v1.s[1]\n"
+ "fmla v18.4s, v6.4s, v2.s[1]\n"
+ "mov v7.d[1], x11\n"
+ "ldr d6, [x17, #0x80]\n"
+ "fmla v11.4s, v7.4s, v0.s[1]\n"
+ "ldr x11, [x17, #0x98]\n"
+ "fmla v15.4s, v7.4s, v1.s[1]\n"
+ "fmla v19.4s, v7.4s, v2.s[1]\n"
+ "mov v6.d[1], x10\n"
+ "ldr d7, [x17, #0x90]\n"
+ "fmla v8.4s, v6.4s, v0.s[2]\n"
+ "ldr x10, [x17, #0xa8]\n"
+ "fmla v12.4s, v6.4s, v1.s[2]\n"
+ "fmla v16.4s, v6.4s, v2.s[2]\n"
+ "mov v7.d[1], x11\n"
+ "ldr d6, [x17, #0xa0]\n"
+ "fmla v9.4s, v7.4s, v0.s[2]\n"
+ "ldr x11, [x17, #0xb8]\n"
+ "fmla v13.4s, v7.4s, v1.s[2]\n"
+ "fmla v17.4s, v7.4s, v2.s[2]\n"
+ "mov v6.d[1], x10\n"
+ "ldr d7, [x17, #0xb0]\n"
+ "fmla v10.4s, v6.4s, v0.s[2]\n"
+ "ldr x10, [x17, #0xc8]\n"
+ "fmla v14.4s, v6.4s, v1.s[2]\n"
+ "fmla v18.4s, v6.4s, v2.s[2]\n"
+ "mov v7.d[1], x11\n"
+ "ldr d6, [x17, #0xc0]\n"
+ "fmla v11.4s, v7.4s, v0.s[2]\n"
+ "ldr x11, [x17, #0xd8]\n"
+ "fmla v15.4s, v7.4s, v1.s[2]\n"
+ "fmla v19.4s, v7.4s, v2.s[2]\n"
+ "mov v6.d[1], x10\n"
+ "ldr d7, [x17, #0xd0]\n"
+ "fmla v8.4s, v6.4s, v0.s[3]\n"
+ "ldr x10, [x17, #0xe8]\n"
+ "fmla v12.4s, v6.4s, v1.s[3]\n"
+ "fmla v16.4s, v6.4s, v2.s[3]\n"
+ "mov v7.d[1], x11\n"
+ "ldr d6, [x17, #0xe0]\n"
+ "fmla v9.4s, v7.4s, v0.s[3]\n"
+ "ldr x11, [x17, #0xf8]\n"
+ "fmla v13.4s, v7.4s, v1.s[3]\n"
+ "fmla v17.4s, v7.4s, v2.s[3]\n"
+ "mov v6.d[1], x10\n"
+ "ldr d7, [x17, #0xf0]\n"
+ "add x17, x17, #0x100\n" // 16 weight vectors (256 bytes) are consumed per main-loop iteration
+ "fmla v10.4s, v6.4s, v0.s[3]\n"
+ "ldr x10, [x17, #0x8]\n" // from here on x17-relative loads fetch the first weight vector of the next iteration
+ "fmla v14.4s, v6.4s, v1.s[3]\n"
+ "fmla v18.4s, v6.4s, v2.s[3]\n"
+ "mov v7.d[1], x11\n"
+ "ldr d6, [x17, #0x0]\n"
+ "fmla v11.4s, v7.4s, v0.s[3]\n"
+ "ldr d0, [x12, #0x0]\n" // v0 is reloaded only after its last use above; upper half is filled from x9 below
+ "fmla v15.4s, v7.4s, v1.s[3]\n"
+ "ldr d1, [x28, #0x0]\n"
+ "fmla v19.4s, v7.4s, v2.s[3]\n"
+ "mov v6.d[1], x10\n"
+ "mov v0.d[1], x9\n"
+ "ldr d2, [x26, #0x0]\n"
+ "mov v1.d[1], x27\n"
+ "mov v2.d[1], x25\n"
+ "bge 84b\n" // repeat while at least 8 elements remained at the cmp above
+ "85:" // Height 3: Multiply loop: Single iteration only
+ "fmla v8.4s, v6.4s, v0.s[0]\n" // same 4-element step as the main loop, using whole q loads and without preloading a following block
+ "ldr q7, [x17, #0x10]\n"
+ "fmla v12.4s, v6.4s, v1.s[0]\n"
+ "sub x13, x13, #0x4\n"
+ "fmla v16.4s, v6.4s, v2.s[0]\n"
+ "ldr q6, [x17, #0x20]\n"
+ "fmla v9.4s, v7.4s, v0.s[0]\n"
+ "add x12, x12, #0x10\n"
+ "fmla v13.4s, v7.4s, v1.s[0]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "fmla v17.4s, v7.4s, v2.s[0]\n"
+ "ldr q7, [x17, #0x30]\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v14.4s, v6.4s, v1.s[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "fmla v18.4s, v6.4s, v2.s[0]\n"
+ "ldr q6, [x17, #0x40]\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "add x26, x26, #0x10\n"
+ "fmla v15.4s, v7.4s, v1.s[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "fmla v19.4s, v7.4s, v2.s[0]\n"
+ "ldr q7, [x17, #0x50]\n"
+ "fmla v8.4s, v6.4s, v0.s[1]\n"
+ "fmla v12.4s, v6.4s, v1.s[1]\n"
+ "fmla v16.4s, v6.4s, v2.s[1]\n"
+ "ldr q6, [x17, #0x60]\n"
+ "fmla v9.4s, v7.4s, v0.s[1]\n"
+ "fmla v13.4s, v7.4s, v1.s[1]\n"
+ "fmla v17.4s, v7.4s, v2.s[1]\n"
+ "ldr q7, [x17, #0x70]\n"
+ "fmla v10.4s, v6.4s, v0.s[1]\n"
+ "fmla v14.4s, v6.4s, v1.s[1]\n"
+ "fmla v18.4s, v6.4s, v2.s[1]\n"
+ "ldr q6, [x17, #0x80]\n"
+ "fmla v11.4s, v7.4s, v0.s[1]\n"
+ "fmla v15.4s, v7.4s, v1.s[1]\n"
+ "fmla v19.4s, v7.4s, v2.s[1]\n"
+ "ldr q7, [x17, #0x90]\n"
+ "fmla v8.4s, v6.4s, v0.s[2]\n"
+ "fmla v12.4s, v6.4s, v1.s[2]\n"
+ "fmla v16.4s, v6.4s, v2.s[2]\n"
+ "ldr q6, [x17, #0xa0]\n"
+ "fmla v9.4s, v7.4s, v0.s[2]\n"
+ "fmla v13.4s, v7.4s, v1.s[2]\n"
+ "fmla v17.4s, v7.4s, v2.s[2]\n"
+ "ldr q7, [x17, #0xb0]\n"
+ "fmla v10.4s, v6.4s, v0.s[2]\n"
+ "fmla v14.4s, v6.4s, v1.s[2]\n"
+ "fmla v18.4s, v6.4s, v2.s[2]\n"
+ "ldr q6, [x17, #0xc0]\n"
+ "fmla v11.4s, v7.4s, v0.s[2]\n"
+ "fmla v15.4s, v7.4s, v1.s[2]\n"
+ "fmla v19.4s, v7.4s, v2.s[2]\n"
+ "ldr q7, [x17, #0xd0]\n"
+ "fmla v8.4s, v6.4s, v0.s[3]\n"
+ "fmla v12.4s, v6.4s, v1.s[3]\n"
+ "fmla v16.4s, v6.4s, v2.s[3]\n"
+ "ldr q6, [x17, #0xe0]\n"
+ "fmla v9.4s, v7.4s, v0.s[3]\n"
+ "fmla v13.4s, v7.4s, v1.s[3]\n"
+ "fmla v17.4s, v7.4s, v2.s[3]\n"
+ "ldr q7, [x17, #0xf0]\n"
+ "fmla v10.4s, v6.4s, v0.s[3]\n"
+ "add x17, x17, #0x100\n"
+ "fmla v14.4s, v6.4s, v1.s[3]\n"
+ "fmla v18.4s, v6.4s, v2.s[3]\n"
+ "fmla v11.4s, v7.4s, v0.s[3]\n"
+ "fmla v15.4s, v7.4s, v1.s[3]\n"
+ "fmla v19.4s, v7.4s, v2.s[3]\n"
+ "86:" // Height 3: Multiply loop: Main loop skip
+ "cbz x13, 88f\n" // nothing left over in this string
+ "87:" // Height 3: Multiply loop: Odd block loop
+ "ldr s0, [x12], #0x4\n" // one input float per row per pass, against four weight vectors (64 bytes)
+ "sub x13, x13, #0x1\n"
+ "ldr s1, [x28], #0x4\n"
+ "ldr s2, [x26], #0x4\n"
+ "ldr q6, [x17, #0x0]\n"
+ "ldr q7, [x17, #0x10]\n"
+ "fmla v8.4s, v6.4s, v0.s[0]\n"
+ "fmla v12.4s, v6.4s, v1.s[0]\n"
+ "fmla v16.4s, v6.4s, v2.s[0]\n"
+ "ldr q6, [x17, #0x20]\n"
+ "fmla v9.4s, v7.4s, v0.s[0]\n"
+ "fmla v13.4s, v7.4s, v1.s[0]\n"
+ "fmla v17.4s, v7.4s, v2.s[0]\n"
+ "ldr q7, [x17, #0x30]\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "add x17, x17, #0x40\n"
+ "fmla v14.4s, v6.4s, v1.s[0]\n"
+ "fmla v18.4s, v6.4s, v2.s[0]\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "fmla v15.4s, v7.4s, v1.s[0]\n"
+ "fmla v19.4s, v7.4s, v2.s[0]\n"
+ "cbnz x13, 87b\n"
+ "88:" // Height 3: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x14, x14, #0x1\n"
+ "cmp x14, x19\n"
+ "bne 81b\n" // next string until x14 == num_strings
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "prfm pstl1keep, [x15, #0x0]\n"
+ "add x25, x15, x19, LSL #2\n" // rebuild the row output pointers (x25 was used as scratch in the main loop)
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "add x24, x25, x19, LSL #2\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "tbz %x[flags], #1, 89f\n" // flags bit 1 clear -> skip the min/max clamp
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v1.4s }, [x20]\n" // v1 = value at args + offset_min, broadcast to all lanes (lower bound)
+ "ld1r { v0.4s }, [x19]\n" // v0 = value at args + offset_max, broadcast to all lanes (upper bound)
+ "fmin v8.4s, v8.4s, v0.4s\n"
+ "fmin v9.4s, v9.4s, v0.4s\n"
+ "fmin v10.4s, v10.4s, v0.4s\n"
+ "fmin v11.4s, v11.4s, v0.4s\n"
+ "fmin v12.4s, v12.4s, v0.4s\n"
+ "fmin v13.4s, v13.4s, v0.4s\n"
+ "fmin v14.4s, v14.4s, v0.4s\n"
+ "fmin v15.4s, v15.4s, v0.4s\n"
+ "fmin v16.4s, v16.4s, v0.4s\n"
+ "fmin v17.4s, v17.4s, v0.4s\n"
+ "fmax v8.4s, v8.4s, v1.4s\n"
+ "fmax v9.4s, v9.4s, v1.4s\n"
+ "fmax v10.4s, v10.4s, v1.4s\n"
+ "fmax v11.4s, v11.4s, v1.4s\n"
+ "fmax v12.4s, v12.4s, v1.4s\n"
+ "fmax v13.4s, v13.4s, v1.4s\n"
+ "fmax v14.4s, v14.4s, v1.4s\n"
+ "fmax v15.4s, v15.4s, v1.4s\n"
+ "fmax v16.4s, v16.4s, v1.4s\n"
+ "fmax v17.4s, v17.4s, v1.4s\n"
+ "fmin v18.4s, v18.4s, v0.4s\n"
+ "fmin v19.4s, v19.4s, v0.4s\n"
+ "fmax v18.4s, v18.4s, v1.4s\n"
+ "fmax v19.4s, v19.4s, v1.4s\n"
+ "89:" // Height 3: No activation
+ "cmp x8, #0x10\n"
+ "bge 98f\n" // x8 >= 16: store a full 16-column block
+ "tbz x8, #3, 93f\n" // partial block: same bit-by-bit 8 / 4 / 2 / 1 walk as the partial accumulate load
+ "st1 { v8.4s }, [x15], #0x10\n"
+ "st1 { v9.4s }, [x15], #0x10\n"
+ "st1 { v12.4s }, [x25], #0x10\n"
+ "st1 { v13.4s }, [x25], #0x10\n"
+ "st1 { v16.4s }, [x24], #0x10\n"
+ "st1 { v17.4s }, [x24], #0x10\n"
+ "tbz x8, #2, 91f\n"
+ "st1 { v10.4s }, [x15], #0x10\n"
+ "st1 { v14.4s }, [x25], #0x10\n"
+ "st1 { v18.4s }, [x24], #0x10\n"
+ "tbz x8, #1, 90f\n"
+ "str d11, [x15], #0x8\n"
+ "str d15, [x25], #0x8\n"
+ "str d19, [x24], #0x8\n"
+ "tbz x8, #0, 97f\n"
+ "st1 { v11.s }[2], [x15]\n"
+ "st1 { v15.s }[2], [x25]\n"
+ "st1 { v19.s }[2], [x24]\n"
+ "b 97f\n"
+ "90:" // Height 3: Partial direct writeback: partial_1_12
+ "tbz x8, #0, 97f\n"
+ "str s11, [x15, #0x0]\n"
+ "str s15, [x25, #0x0]\n"
+ "str s19, [x24, #0x0]\n"
+ "b 97f\n"
+ "91:" // Height 3: Partial direct writeback: partial_2_8
+ "tbz x8, #1, 92f\n"
+ "str d10, [x15], #0x8\n"
+ "str d14, [x25], #0x8\n"
+ "str d18, [x24], #0x8\n"
+ "tbz x8, #0, 97f\n"
+ "st1 { v10.s }[2], [x15]\n"
+ "st1 { v14.s }[2], [x25]\n"
+ "st1 { v18.s }[2], [x24]\n"
+ "b 97f\n"
+ "92:" // Height 3: Partial direct writeback: partial_1_8
+ "tbz x8, #0, 97f\n"
+ "str s10, [x15, #0x0]\n"
+ "str s14, [x25, #0x0]\n"
+ "str s18, [x24, #0x0]\n"
+ "b 97f\n"
+ "93:" // Height 3: Partial direct writeback: partial_4_0
+ "tbz x8, #2, 95f\n"
+ "st1 { v8.4s }, [x15], #0x10\n"
+ "st1 { v12.4s }, [x25], #0x10\n"
+ "st1 { v16.4s }, [x24], #0x10\n"
+ "tbz x8, #1, 94f\n"
+ "str d9, [x15], #0x8\n"
+ "str d13, [x25], #0x8\n"
+ "str d17, [x24], #0x8\n"
+ "tbz x8, #0, 97f\n"
+ "st1 { v9.s }[2], [x15]\n"
+ "st1 { v13.s }[2], [x25]\n"
+ "st1 { v17.s }[2], [x24]\n"
+ "b 97f\n"
+ "94:" // Height 3: Partial direct writeback: partial_1_4
+ "tbz x8, #0, 97f\n"
+ "str s9, [x15, #0x0]\n"
+ "str s13, [x25, #0x0]\n"
+ "str s17, [x24, #0x0]\n"
+ "b 97f\n"
+ "95:" // Height 3: Partial direct writeback: partial_2_0
+ "tbz x8, #1, 96f\n"
+ "str d8, [x15], #0x8\n"
+ "str d12, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
+ "tbz x8, #0, 97f\n"
+ "st1 { v8.s }[2], [x15]\n"
+ "st1 { v12.s }[2], [x25]\n"
+ "st1 { v16.s }[2], [x24]\n"
+ "b 97f\n"
+ "96:" // Height 3: Partial direct writeback: partial_1_0
+ "str s8, [x15, #0x0]\n" // bits 3..1 of x8 are clear here; bit 0 is not tested (NOTE(review): assumes x8 is non-zero on entry - confirm)
+ "str s12, [x25, #0x0]\n"
+ "str s16, [x24, #0x0]\n"
+ "97:" // Height 3: Partial direct writeback: Done
+ "b 99f\n"
+ "98:" // Height 3: Full writeback
+ "str q8, [x15, #0x0]\n"
+ "str q9, [x15, #0x10]\n"
+ "str q10, [x15, #0x20]\n"
+ "str q11, [x15, #0x30]\n"
+ "add x15, x15, #0x40\n" // only x15 is stepped to the next column block; x25/x24 are rebuilt from it on the next pass
+ "str q12, [x25, #0x0]\n"
+ "str q13, [x25, #0x10]\n"
+ "str q14, [x25, #0x20]\n"
+ "str q15, [x25, #0x30]\n"
+ "str q16, [x24, #0x0]\n"
+ "str q17, [x24, #0x10]\n"
+ "str q18, [x24, #0x20]\n"
+ "str q19, [x24, #0x30]\n"
+ "99:" // Height 3: Writeback done
+ "subs x8, x8, #0x10\n" // 16 columns handled; a partial block always leaves x8 <= 0 so the loop ends
+ "bgt 68b\n"
+ "b 200f\n" // label 200 is outside this excerpt (NOTE(review): presumably the common exit - confirm)
+ "100:" // Height 4
+ "ldr x8, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x16, %x[bias]\n"
+ "ldr x17, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x15, %x[output_ptr]\n"
+ "101:" // Height 4: Column loop
+ "cbz x16, 102f\n"
+ "ldr q8, [x16, #0x0]\n"
+ "ldr q9, [x16, #0x10]\n"
+ "ldr q10, [x16, #0x20]\n"
+ "mov v12.16b, v8.16b\n"
+ "mov v16.16b, v8.16b\n"
+ "mov v13.16b, v9.16b\n"
+ "mov v17.16b, v9.16b\n"
+ "mov v14.16b, v10.16b\n"
+ "mov v18.16b, v10.16b\n"
+ "mov v20.16b, v8.16b\n"
+ "mov v21.16b, v9.16b\n"
+ "mov v22.16b, v10.16b\n"
+ "ldr q11, [x16, #0x30]\n"
+ "add x16, x16, #0x40\n"
+ "mov v15.16b, v11.16b\n"
+ "mov v19.16b, v11.16b\n"
+ "mov v23.16b, v11.16b\n"
+ "b 113f\n"
+ "102:" // Height 4: no bias
+ "tbz %x[flags], #0, 112f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x8, #0x10\n"
+ "add x25, x15, x19, LSL #2\n"
+ "add x24, x25, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "bge 111f\n"
+ "tbz x8, #3, 106f\n"
+ "ld1 { v8.4s }, [x15], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v16.4s }, [x24], #0x10\n"
+ "ld1 { v9.4s }, [x15], #0x10\n"
+ "ld1 { v13.4s }, [x25], #0x10\n"
+ "ld1 { v17.4s }, [x24], #0x10\n"
+ "ld1 { v20.4s }, [x23], #0x10\n"
+ "ld1 { v21.4s }, [x23], #0x10\n"
+ "tbz x8, #2, 104f\n"
+ "ld1 { v10.4s }, [x15], #0x10\n"
+ "ld1 { v14.4s }, [x25], #0x10\n"
+ "ld1 { v18.4s }, [x24], #0x10\n"
+ "ld1 { v22.4s }, [x23], #0x10\n"
+ "tbz x8, #1, 103f\n"
+ "ldr d11, [x15], #0x8\n"
+ "mov x19, #0x38\n"
+ "ldr d15, [x25], #0x8\n"
+ "ldr d19, [x24], #0x8\n"
+ "ldr d23, [x23], #0x8\n"
+ "tbz x8, #0, 110f\n"
+ "ld1 { v11.s }[2], [x15]\n"
+ "ld1 { v15.s }[2], [x25]\n"
+ "ld1 { v19.s }[2], [x24]\n"
+ "ld1 { v23.s }[2], [x23]\n"
+ "b 110f\n"
+ "103:" // Height 4: Partial accumulate: partial_1_12
+ "mov x19, #0x30\n"
+ "tbz x8, #0, 110f\n"
+ "ldr s11, [x15, #0x0]\n"
+ "ldr s15, [x25, #0x0]\n"
+ "ldr s19, [x24, #0x0]\n"
+ "ldr s23, [x23, #0x0]\n"
+ "b 110f\n"
+ "104:" // Height 4: Partial accumulate: partial_2_8
+ "tbz x8, #1, 105f\n"
+ "ldr d10, [x15], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
+ "mov x19, #0x28\n"
+ "ldr d18, [x24], #0x8\n"
+ "ldr d22, [x23], #0x8\n"
+ "tbz x8, #0, 110f\n"
+ "ld1 { v10.s }[2], [x15]\n"
+ "ld1 { v14.s }[2], [x25]\n"
+ "ld1 { v18.s }[2], [x24]\n"
+ "ld1 { v22.s }[2], [x23]\n"
+ "b 110f\n"
+ "105:" // Height 4: Partial accumulate: partial_1_8
+ "mov x19, #0x20\n"
+ "tbz x8, #0, 110f\n"
+ "ldr s10, [x15, #0x0]\n"
+ "ldr s14, [x25, #0x0]\n"
+ "ldr s18, [x24, #0x0]\n"
+ "ldr s22, [x23, #0x0]\n"
+ "b 110f\n"
+ "106:" // Height 4: Partial accumulate: partial_4_0
+ "tbz x8, #2, 108f\n"
+ "ld1 { v8.4s }, [x15], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v16.4s }, [x24], #0x10\n"
+ "ld1 { v20.4s }, [x23], #0x10\n"
+ "tbz x8, #1, 107f\n"
+ "ldr d9, [x15], #0x8\n"
+ "mov x19, #0x18\n"
+ "ldr d13, [x25], #0x8\n"
+ "ldr d17, [x24], #0x8\n"
+ "ldr d21, [x23], #0x8\n"
+ "tbz x8, #0, 110f\n"
+ "ld1 { v9.s }[2], [x15]\n"
+ "ld1 { v13.s }[2], [x25]\n"
+ "ld1 { v17.s }[2], [x24]\n"
+ "ld1 { v21.s }[2], [x23]\n"
+ "b 110f\n"
+ "107:" // Height 4: Partial accumulate: partial_1_4
+ "mov x19, #0x10\n"
+ "tbz x8, #0, 110f\n"
+ "ldr s9, [x15, #0x0]\n"
+ "ldr s13, [x25, #0x0]\n"
+ "ldr s17, [x24, #0x0]\n"
+ "ldr s21, [x23, #0x0]\n"
+ "b 110f\n"
+ "108:" // Height 4: Partial accumulate: partial_2_0
+ "tbz x8, #1, 109f\n"
+ "ldr d8, [x15], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
+ "mov x19, #0x8\n"
+ "ldr d16, [x24], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
+ "tbz x8, #0, 110f\n"
+ "ld1 { v8.s }[2], [x15]\n"
+ "ld1 { v12.s }[2], [x25]\n"
+ "ld1 { v16.s }[2], [x24]\n"
+ "ld1 { v20.s }[2], [x23]\n"
+ "b 110f\n"
+ "109:" // Height 4: Partial accumulate: partial_1_0
+ "ldr s8, [x15, #0x0]\n"
+ "mov x19, #0x0\n"
+ "ldr s12, [x25, #0x0]\n"
+ "ldr s16, [x24, #0x0]\n"
+ "ldr s20, [x23, #0x0]\n"
+ "110:" // Height 4: Partial accumulate: Done
+ "sub x15, x15, x19\n"
+ "b 113f\n"
+ "111:" // Height 4: full accumulate
+ "ldr q8, [x15, #0x0]\n"
+ "ldr q9, [x15, #0x10]\n"
+ "ldr q10, [x15, #0x20]\n"
+ "ldr q11, [x15, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
+ "ldr q16, [x24, #0x0]\n"
+ "ldr q17, [x24, #0x10]\n"
+ "ldr q18, [x24, #0x20]\n"
+ "ldr q19, [x24, #0x30]\n"
+ "ldr q20, [x23, #0x0]\n"
+ "ldr q21, [x23, #0x10]\n"
+ "ldr q22, [x23, #0x20]\n"
+ "ldr q23, [x23, #0x30]\n"
+ "b 113f\n"
+ "112:" // Height 4: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "movi v20.16b, #0x0\n"
+ "movi v21.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
+ "113:" // Height 4: setup done
+ "mov x14, #0x0\n"
+ "114:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w13, [x20, x14, LSL #0x2]\n"
+ "tbz %x[flags], #3, 115f\n"
+ "ldr x20, [%x[input_ptr], x14, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x12, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "cbnz x14, 116f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x12, x12, x19, LSL #2\n"
+ "add x28, x28, x19, LSL #2\n"
+ "add x26, x26, x19, LSL #2\n"
+ "add x24, x24, x19, LSL #2\n"
+ "b 116f\n"
+ "115:" // Height 4: setup direct input
+ "mov x12, %x[input_ptr]\n"
+ "add x28, x12, x19, LSL #2\n"
+ "add x26, x28, x19, LSL #2\n"
+ "add x24, x26, x19, LSL #2\n"
+ "116:" // Height 4: input setup done
+ "cmp x13, #0x4\n"
+ "blt 119f\n"
+ "ldr q0, [x12, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "cmp x13, #0x8\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q6, [x17, #0x0]\n"
+ "blt 118f\n"
+ "117:" // Height 4: Multiply loop: Main loop head
+ "fmla v8.4s, v6.4s, v0.s[0]\n"
+ "ldr d7, [x17, #0x10]\n"
+ "fmla v12.4s, v6.4s, v1.s[0]\n"
+ "ldr x11, [x17, #0x18]\n"
+ "fmla v16.4s, v6.4s, v2.s[0]\n"
+ "ldr x10, [x17, #0x28]\n"
+ "fmla v20.4s, v6.4s, v3.s[0]\n"
+ "ldr d6, [x17, #0x20]\n"
+ "mov v7.d[1], x11\n"
+ "ldr x11, [x17, #0x38]\n"
+ "add x12, x12, #0x10\n"
+ "add x28, x28, #0x10\n"
+ "fmla v9.4s, v7.4s, v0.s[0]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v13.4s, v7.4s, v1.s[0]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "fmla v17.4s, v7.4s, v2.s[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "fmla v21.4s, v7.4s, v3.s[0]\n"
+ "ldr d7, [x17, #0x30]\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "ldr x10, [x17, #0x48]\n"
+ "fmla v14.4s, v6.4s, v1.s[0]\n"
+ "ldr x9, [x12, #0x8]\n"
+ "fmla v18.4s, v6.4s, v2.s[0]\n"
+ "mov v7.d[1], x11\n"
+ "fmla v22.4s, v6.4s, v3.s[0]\n"
+ "ldr d6, [x17, #0x40]\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "ldr x11, [x17, #0x58]\n"
+ "fmla v15.4s, v7.4s, v1.s[0]\n"
+ "ldr x27, [x28, #0x8]\n"
+ "fmla v19.4s, v7.4s, v2.s[0]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v23.4s, v7.4s, v3.s[0]\n"
+ "ldr d7, [x17, #0x50]\n"
+ "fmla v8.4s, v6.4s, v0.s[1]\n"
+ "ldr x10, [x17, #0x68]\n"
+ "fmla v12.4s, v6.4s, v1.s[1]\n"
+ "add x26, x26, #0x10\n"
+ "fmla v16.4s, v6.4s, v2.s[1]\n"
+ "mov v7.d[1], x11\n"
+ "fmla v20.4s, v6.4s, v3.s[1]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "fmla v9.4s, v7.4s, v0.s[1]\n"
+ "ldr d6, [x17, #0x60]\n"
+ "fmla v13.4s, v7.4s, v1.s[1]\n"
+ "ldr x11, [x17, #0x78]\n"
+ "fmla v17.4s, v7.4s, v2.s[1]\n"
+ "ldr x25, [x26, #0x8]\n"
+ "fmla v21.4s, v7.4s, v3.s[1]\n"
+ "mov v6.d[1], x10\n"
+ "ldr d7, [x17, #0x70]\n"
+ "add x24, x24, #0x10\n"
+ "fmla v10.4s, v6.4s, v0.s[1]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "fmla v14.4s, v6.4s, v1.s[1]\n"
+ "ldr x10, [x17, #0x88]\n"
+ "fmla v18.4s, v6.4s, v2.s[1]\n"
+ "mov v7.d[1], x11\n"
+ "fmla v22.4s, v6.4s, v3.s[1]\n"
+ "ldr d6, [x17, #0x80]\n"
+ "fmla v11.4s, v7.4s, v0.s[1]\n"
+ "ldr x11, [x17, #0x98]\n"
+ "fmla v15.4s, v7.4s, v1.s[1]\n"
+ "ldr x23, [x24, #0x8]\n"
+ "fmla v19.4s, v7.4s, v2.s[1]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v23.4s, v7.4s, v3.s[1]\n"
+ "ldr d7, [x17, #0x90]\n"
+ "fmla v8.4s, v6.4s, v0.s[2]\n"
+ "ldr x10, [x17, #0xa8]\n"
+ "fmla v12.4s, v6.4s, v1.s[2]\n"
+ "sub x13, x13, #0x4\n"
+ "fmla v16.4s, v6.4s, v2.s[2]\n"
+ "mov v7.d[1], x11\n"
+ "fmla v20.4s, v6.4s, v3.s[2]\n"
+ "ldr d6, [x17, #0xa0]\n"
+ "fmla v9.4s, v7.4s, v0.s[2]\n"
+ "ldr x11, [x17, #0xb8]\n"
+ "fmla v13.4s, v7.4s, v1.s[2]\n"
+ "cmp x13, #0x8\n"
+ "fmla v17.4s, v7.4s, v2.s[2]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v21.4s, v7.4s, v3.s[2]\n"
+ "ldr d7, [x17, #0xb0]\n"
+ "fmla v10.4s, v6.4s, v0.s[2]\n"
+ "ldr x10, [x17, #0xc8]\n"
+ "fmla v14.4s, v6.4s, v1.s[2]\n"
+ "fmla v18.4s, v6.4s, v2.s[2]\n"
+ "mov v7.d[1], x11\n"
+ "fmla v22.4s, v6.4s, v3.s[2]\n"
+ "ldr d6, [x17, #0xc0]\n"
+ "fmla v11.4s, v7.4s, v0.s[2]\n"
+ "ldr x11, [x17, #0xd8]\n"
+ "fmla v15.4s, v7.4s, v1.s[2]\n"
+ "fmla v19.4s, v7.4s, v2.s[2]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v23.4s, v7.4s, v3.s[2]\n"
+ "ldr d7, [x17, #0xd0]\n"
+ "fmla v8.4s, v6.4s, v0.s[3]\n"
+ "ldr x10, [x17, #0xe8]\n"
+ "fmla v12.4s, v6.4s, v1.s[3]\n"
+ "fmla v16.4s, v6.4s, v2.s[3]\n"
+ "mov v7.d[1], x11\n"
+ "fmla v20.4s, v6.4s, v3.s[3]\n"
+ "ldr d6, [x17, #0xe0]\n"
+ "fmla v9.4s, v7.4s, v0.s[3]\n"
+ "ldr x11, [x17, #0xf8]\n"
+ "fmla v13.4s, v7.4s, v1.s[3]\n"
+ "fmla v17.4s, v7.4s, v2.s[3]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v21.4s, v7.4s, v3.s[3]\n"
+ "ldr d7, [x17, #0xf0]\n"
+ "fmla v10.4s, v6.4s, v0.s[3]\n"
+ "add x17, x17, #0x100\n"
+ "fmla v14.4s, v6.4s, v1.s[3]\n"
+ "ldr x10, [x17, #0x8]\n"
+ "fmla v18.4s, v6.4s, v2.s[3]\n"
+ "mov v7.d[1], x11\n"
+ "fmla v22.4s, v6.4s, v3.s[3]\n"
+ "ldr d6, [x17, #0x0]\n"
+ "fmla v11.4s, v7.4s, v0.s[3]\n"
+ "ldr d0, [x12, #0x0]\n"
+ "fmla v15.4s, v7.4s, v1.s[3]\n"
+ "ldr d1, [x28, #0x0]\n"
+ "fmla v19.4s, v7.4s, v2.s[3]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v23.4s, v7.4s, v3.s[3]\n"
+ "mov v0.d[1], x9\n"
+ "mov v1.d[1], x27\n"
+ "ldr d2, [x26, #0x0]\n"
+ "ldr d3, [x24, #0x0]\n"
+ "mov v2.d[1], x25\n"
+ "mov v3.d[1], x23\n"
+ "bge 117b\n"
+ "118:" // Height 4: Multiply loop: Single iteration only
+ "fmla v8.4s, v6.4s, v0.s[0]\n"
+ "ldr q7, [x17, #0x10]\n"
+ "fmla v12.4s, v6.4s, v1.s[0]\n"
+ "sub x13, x13, #0x4\n"
+ "fmla v16.4s, v6.4s, v2.s[0]\n"
+ "add x12, x12, #0x10\n"
+ "fmla v20.4s, v6.4s, v3.s[0]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "fmla v9.4s, v7.4s, v0.s[0]\n"
+ "ldr q6, [x17, #0x20]\n"
+ "fmla v13.4s, v7.4s, v1.s[0]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v17.4s, v7.4s, v2.s[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "fmla v21.4s, v7.4s, v3.s[0]\n"
+ "ldr q7, [x17, #0x30]\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "add x26, x26, #0x10\n"
+ "fmla v14.4s, v6.4s, v1.s[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "fmla v18.4s, v6.4s, v2.s[0]\n"
+ "add x24, x24, #0x10\n"
+ "fmla v22.4s, v6.4s, v3.s[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "ldr q6, [x17, #0x40]\n"
+ "fmla v15.4s, v7.4s, v1.s[0]\n"
+ "fmla v19.4s, v7.4s, v2.s[0]\n"
+ "fmla v23.4s, v7.4s, v3.s[0]\n"
+ "ldr q7, [x17, #0x50]\n"
+ "fmla v8.4s, v6.4s, v0.s[1]\n"
+ "fmla v12.4s, v6.4s, v1.s[1]\n"
+ "fmla v16.4s, v6.4s, v2.s[1]\n"
+ "fmla v20.4s, v6.4s, v3.s[1]\n"
+ "ldr q6, [x17, #0x60]\n"
+ "fmla v9.4s, v7.4s, v0.s[1]\n"
+ "fmla v13.4s, v7.4s, v1.s[1]\n"
+ "fmla v17.4s, v7.4s, v2.s[1]\n"
+ "fmla v21.4s, v7.4s, v3.s[1]\n"
+ "ldr q7, [x17, #0x70]\n"
+ "fmla v10.4s, v6.4s, v0.s[1]\n"
+ "fmla v14.4s, v6.4s, v1.s[1]\n"
+ "fmla v18.4s, v6.4s, v2.s[1]\n"
+ "fmla v22.4s, v6.4s, v3.s[1]\n"
+ "ldr q6, [x17, #0x80]\n"
+ "fmla v11.4s, v7.4s, v0.s[1]\n"
+ "fmla v15.4s, v7.4s, v1.s[1]\n"
+ "fmla v19.4s, v7.4s, v2.s[1]\n"
+ "fmla v23.4s, v7.4s, v3.s[1]\n"
+ "ldr q7, [x17, #0x90]\n"
+ "fmla v8.4s, v6.4s, v0.s[2]\n"
+ "fmla v12.4s, v6.4s, v1.s[2]\n"
+ "fmla v16.4s, v6.4s, v2.s[2]\n"
+ "fmla v20.4s, v6.4s, v3.s[2]\n"
+ "ldr q6, [x17, #0xa0]\n"
+ "fmla v9.4s, v7.4s, v0.s[2]\n"
+ "fmla v13.4s, v7.4s, v1.s[2]\n"
+ "fmla v17.4s, v7.4s, v2.s[2]\n"
+ "fmla v21.4s, v7.4s, v3.s[2]\n"
+ "ldr q7, [x17, #0xb0]\n"
+ "fmla v10.4s, v6.4s, v0.s[2]\n"
+ "fmla v14.4s, v6.4s, v1.s[2]\n"
+ "fmla v18.4s, v6.4s, v2.s[2]\n"
+ "fmla v22.4s, v6.4s, v3.s[2]\n"
+ "ldr q6, [x17, #0xc0]\n"
+ "fmla v11.4s, v7.4s, v0.s[2]\n"
+ "fmla v15.4s, v7.4s, v1.s[2]\n"
+ "fmla v19.4s, v7.4s, v2.s[2]\n"
+ "fmla v23.4s, v7.4s, v3.s[2]\n"
+ "ldr q7, [x17, #0xd0]\n"
+ "fmla v8.4s, v6.4s, v0.s[3]\n"
+ "fmla v12.4s, v6.4s, v1.s[3]\n"
+ "fmla v16.4s, v6.4s, v2.s[3]\n"
+ "fmla v20.4s, v6.4s, v3.s[3]\n"
+ "ldr q6, [x17, #0xe0]\n"
+ "fmla v9.4s, v7.4s, v0.s[3]\n"
+ "fmla v13.4s, v7.4s, v1.s[3]\n"
+ "fmla v17.4s, v7.4s, v2.s[3]\n"
+ "fmla v21.4s, v7.4s, v3.s[3]\n"
+ "ldr q7, [x17, #0xf0]\n"
+ "fmla v10.4s, v6.4s, v0.s[3]\n"
+ "add x17, x17, #0x100\n"
+ "fmla v14.4s, v6.4s, v1.s[3]\n"
+ "fmla v18.4s, v6.4s, v2.s[3]\n"
+ "fmla v22.4s, v6.4s, v3.s[3]\n"
+ "fmla v11.4s, v7.4s, v0.s[3]\n"
+ "fmla v15.4s, v7.4s, v1.s[3]\n"
+ "fmla v19.4s, v7.4s, v2.s[3]\n"
+ "fmla v23.4s, v7.4s, v3.s[3]\n"
+ "119:" // Height 4: Multiply loop: Main loop skip
+ "cbz x13, 121f\n"
+ "120:" // Height 4: Multiply loop: Odd block loop
+ "ldr s0, [x12], #0x4\n"
+ "sub x13, x13, #0x1\n"
+ "ldr s1, [x28], #0x4\n"
+ "ldr s2, [x26], #0x4\n"
+ "ldr s3, [x24], #0x4\n"
+ "ldr q6, [x17, #0x0]\n"
+ "ldr q7, [x17, #0x10]\n"
+ "fmla v8.4s, v6.4s, v0.s[0]\n"
+ "fmla v12.4s, v6.4s, v1.s[0]\n"
+ "fmla v16.4s, v6.4s, v2.s[0]\n"
+ "fmla v20.4s, v6.4s, v3.s[0]\n"
+ "ldr q6, [x17, #0x20]\n"
+ "fmla v9.4s, v7.4s, v0.s[0]\n"
+ "fmla v13.4s, v7.4s, v1.s[0]\n"
+ "fmla v17.4s, v7.4s, v2.s[0]\n"
+ "fmla v21.4s, v7.4s, v3.s[0]\n"
+ "ldr q7, [x17, #0x30]\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "add x17, x17, #0x40\n"
+ "fmla v14.4s, v6.4s, v1.s[0]\n"
+ "fmla v18.4s, v6.4s, v2.s[0]\n"
+ "fmla v22.4s, v6.4s, v3.s[0]\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "fmla v15.4s, v7.4s, v1.s[0]\n"
+ "fmla v19.4s, v7.4s, v2.s[0]\n"
+ "fmla v23.4s, v7.4s, v3.s[0]\n"
+ "cbnz x13, 120b\n"
+ "121:" // Height 4: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x14, x14, #0x1\n"
+ "cmp x14, x19\n"
+ "bne 114b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "prfm pstl1keep, [x15, #0x0]\n"
+ "add x25, x15, x19, LSL #2\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "add x24, x25, x19, LSL #2\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "add x23, x24, x19, LSL #2\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "tbz %x[flags], #1, 122f\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v1.4s }, [x20]\n"
+ "ld1r { v0.4s }, [x19]\n"
+ "fmin v8.4s, v8.4s, v0.4s\n"
+ "fmin v9.4s, v9.4s, v0.4s\n"
+ "fmin v10.4s, v10.4s, v0.4s\n"
+ "fmin v11.4s, v11.4s, v0.4s\n"
+ "fmin v12.4s, v12.4s, v0.4s\n"
+ "fmin v13.4s, v13.4s, v0.4s\n"
+ "fmin v14.4s, v14.4s, v0.4s\n"
+ "fmin v15.4s, v15.4s, v0.4s\n"
+ "fmin v16.4s, v16.4s, v0.4s\n"
+ "fmin v17.4s, v17.4s, v0.4s\n"
+ "fmax v8.4s, v8.4s, v1.4s\n"
+ "fmax v9.4s, v9.4s, v1.4s\n"
+ "fmax v10.4s, v10.4s, v1.4s\n"
+ "fmax v11.4s, v11.4s, v1.4s\n"
+ "fmax v12.4s, v12.4s, v1.4s\n"
+ "fmax v13.4s, v13.4s, v1.4s\n"
+ "fmax v14.4s, v14.4s, v1.4s\n"
+ "fmax v15.4s, v15.4s, v1.4s\n"
+ "fmax v16.4s, v16.4s, v1.4s\n"
+ "fmax v17.4s, v17.4s, v1.4s\n"
+ "fmin v18.4s, v18.4s, v0.4s\n"
+ "fmin v19.4s, v19.4s, v0.4s\n"
+ "fmin v20.4s, v20.4s, v0.4s\n"
+ "fmin v21.4s, v21.4s, v0.4s\n"
+ "fmin v22.4s, v22.4s, v0.4s\n"
+ "fmin v23.4s, v23.4s, v0.4s\n"
+ "fmax v18.4s, v18.4s, v1.4s\n"
+ "fmax v19.4s, v19.4s, v1.4s\n"
+ "fmax v20.4s, v20.4s, v1.4s\n"
+ "fmax v21.4s, v21.4s, v1.4s\n"
+ "fmax v22.4s, v22.4s, v1.4s\n"
+ "fmax v23.4s, v23.4s, v1.4s\n"
+ "122:" // Height 4: No activation
+ "cmp x8, #0x10\n"
+ "bge 131f\n"
+ "tbz x8, #3, 126f\n"
+ "st1 { v8.4s }, [x15], #0x10\n"
+ "st1 { v9.4s }, [x15], #0x10\n"
+ "st1 { v12.4s }, [x25], #0x10\n"
+ "st1 { v13.4s }, [x25], #0x10\n"
+ "st1 { v16.4s }, [x24], #0x10\n"
+ "st1 { v17.4s }, [x24], #0x10\n"
+ "st1 { v20.4s }, [x23], #0x10\n"
+ "st1 { v21.4s }, [x23], #0x10\n"
+ "tbz x8, #2, 124f\n"
+ "st1 { v10.4s }, [x15], #0x10\n"
+ "st1 { v14.4s }, [x25], #0x10\n"
+ "st1 { v18.4s }, [x24], #0x10\n"
+ "st1 { v22.4s }, [x23], #0x10\n"
+ "tbz x8, #1, 123f\n"
+ "str d11, [x15], #0x8\n"
+ "str d15, [x25], #0x8\n"
+ "str d19, [x24], #0x8\n"
+ "str d23, [x23], #0x8\n"
+ "tbz x8, #0, 130f\n"
+ "st1 { v11.s }[2], [x15]\n"
+ "st1 { v15.s }[2], [x25]\n"
+ "st1 { v19.s }[2], [x24]\n"
+ "st1 { v23.s }[2], [x23]\n"
+ "b 130f\n"
+ "123:" // Height 4: Partial direct writeback: partial_1_12
+ "tbz x8, #0, 130f\n"
+ "str s11, [x15, #0x0]\n"
+ "str s15, [x25, #0x0]\n"
+ "str s19, [x24, #0x0]\n"
+ "str s23, [x23, #0x0]\n"
+ "b 130f\n"
+ "124:" // Height 4: Partial direct writeback: partial_2_8
+ "tbz x8, #1, 125f\n"
+ "str d10, [x15], #0x8\n"
+ "str d14, [x25], #0x8\n"
+ "str d18, [x24], #0x8\n"
+ "str d22, [x23], #0x8\n"
+ "tbz x8, #0, 130f\n"
+ "st1 { v10.s }[2], [x15]\n"
+ "st1 { v14.s }[2], [x25]\n"
+ "st1 { v18.s }[2], [x24]\n"
+ "st1 { v22.s }[2], [x23]\n"
+ "b 130f\n"
+ "125:" // Height 4: Partial direct writeback: partial_1_8
+ "tbz x8, #0, 130f\n"
+ "str s10, [x15, #0x0]\n"
+ "str s14, [x25, #0x0]\n"
+ "str s18, [x24, #0x0]\n"
+ "str s22, [x23, #0x0]\n"
+ "b 130f\n"
+ "126:" // Height 4: Partial direct writeback: partial_4_0
+ "tbz x8, #2, 128f\n"
+ "st1 { v8.4s }, [x15], #0x10\n"
+ "st1 { v12.4s }, [x25], #0x10\n"
+ "st1 { v16.4s }, [x24], #0x10\n"
+ "st1 { v20.4s }, [x23], #0x10\n"
+ "tbz x8, #1, 127f\n"
+ "str d9, [x15], #0x8\n"
+ "str d13, [x25], #0x8\n"
+ "str d17, [x24], #0x8\n"
+ "str d21, [x23], #0x8\n"
+ "tbz x8, #0, 130f\n"
+ "st1 { v9.s }[2], [x15]\n"
+ "st1 { v13.s }[2], [x25]\n"
+ "st1 { v17.s }[2], [x24]\n"
+ "st1 { v21.s }[2], [x23]\n"
+ "b 130f\n"
+ "127:" // Height 4: Partial direct writeback: partial_1_4
+ "tbz x8, #0, 130f\n"
+ "str s9, [x15, #0x0]\n"
+ "str s13, [x25, #0x0]\n"
+ "str s17, [x24, #0x0]\n"
+ "str s21, [x23, #0x0]\n"
+ "b 130f\n"
+ "128:" // Height 4: Partial direct writeback: partial_2_0
+ "tbz x8, #1, 129f\n"
+ "str d8, [x15], #0x8\n"
+ "str d12, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
+ "str d20, [x23], #0x8\n"
+ "tbz x8, #0, 130f\n"
+ "st1 { v8.s }[2], [x15]\n"
+ "st1 { v12.s }[2], [x25]\n"
+ "st1 { v16.s }[2], [x24]\n"
+ "st1 { v20.s }[2], [x23]\n"
+ "b 130f\n"
+ "129:" // Height 4: Partial direct writeback: partial_1_0
+ "str s8, [x15, #0x0]\n"
+ "str s12, [x25, #0x0]\n"
+ "str s16, [x24, #0x0]\n"
+ "str s20, [x23, #0x0]\n"
+ "130:" // Height 4: Partial direct writeback: Done
+ "b 132f\n"
+ "131:" // Height 4: Full writeback
+ "str q8, [x15, #0x0]\n"
+ "str q9, [x15, #0x10]\n"
+ "str q10, [x15, #0x20]\n"
+ "str q11, [x15, #0x30]\n"
+ "add x15, x15, #0x40\n"
+ "str q12, [x25, #0x0]\n"
+ "str q13, [x25, #0x10]\n"
+ "str q14, [x25, #0x20]\n"
+ "str q15, [x25, #0x30]\n"
+ "str q16, [x24, #0x0]\n"
+ "str q17, [x24, #0x10]\n"
+ "str q18, [x24, #0x20]\n"
+ "str q19, [x24, #0x30]\n"
+ "str q20, [x23, #0x0]\n"
+ "str q21, [x23, #0x10]\n"
+ "str q22, [x23, #0x20]\n"
+ "str q23, [x23, #0x30]\n"
+ "132:" // Height 4: Writeback done
+ "subs x8, x8, #0x10\n"
+ "bgt 101b\n"
+ "b 200f\n"
+ "133:" // Height 5: entry for the five-row case; accumulators are v8-v11 (row 0), v12-v15, v16-v19, v20-v23 and v24-v27 (row 4)
+ "ldr x8, [%x[args_ptr], %[offsetof_N]]\n" // x8 = column count read from the args block; reduced by 16 after every column-loop pass
+ "mov x16, %x[bias]\n" // x16 = bias pointer; zero selects the no-bias path (cbz below)
+ "ldr x17, [%x[args_ptr], %[offsetof_B_ptr]]\n" // x17 = B operand pointer read from the args block; advanced by the multiply loops
+ "mov x15, %x[output_ptr]\n" // x15 = output pointer for row 0
+ "134:" // Height 5: Column loop - each pass handles one block of up to 16 fp32 columns for all five rows
+ "cbz x16, 135f\n" // no bias pointer: go decide between loading existing output and zero-init
+ "ldr q8, [x16, #0x0]\n" // row 0 accumulators start from the bias values of this column block
+ "ldr q9, [x16, #0x10]\n"
+ "ldr q10, [x16, #0x20]\n"
+ "mov v12.16b, v8.16b\n" // rows 1-4 start from the same bias values as row 0
+ "mov v16.16b, v8.16b\n"
+ "mov v13.16b, v9.16b\n"
+ "mov v17.16b, v9.16b\n"
+ "mov v14.16b, v10.16b\n"
+ "mov v18.16b, v10.16b\n"
+ "mov v20.16b, v8.16b\n"
+ "mov v21.16b, v9.16b\n"
+ "mov v22.16b, v10.16b\n"
+ "mov v24.16b, v8.16b\n"
+ "mov v25.16b, v9.16b\n"
+ "mov v26.16b, v10.16b\n"
+ "ldr q11, [x16, #0x30]\n" // last four bias values of this column block
+ "add x16, x16, #0x40\n" // step the bias pointer past the 0x40 bytes just consumed
+ "mov v15.16b, v11.16b\n"
+ "mov v19.16b, v11.16b\n"
+ "mov v23.16b, v11.16b\n"
+ "mov v27.16b, v11.16b\n"
+ "b 146f\n" // accumulators initialised; continue at "setup done"
+ "135:" // Height 5: no bias - either load the existing output into the accumulators or zero them
+ "tbz %x[flags], #0, 145f\n" // flags bit 0 clear: nothing to accumulate onto, zero the accumulators instead
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" // x19 = output_offset from the args block, used as the row-to-row distance in 4-byte elements (LSL #2)
+ "cmp x8, #0x10\n"
+ "add x25, x15, x19, LSL #2\n" // x25, x24, x23, x22 = output pointers for rows 1-4
+ "add x24, x25, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "bge 144f\n" // 16 or more columns remain: load a full block
+ "tbz x8, #3, 139f\n" // x8 < 16 here, so each bit of x8 selects an 8/4/2/1-column piece to load; bit 3 clear means fewer than 8
+ "ld1 { v8.4s }, [x15], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v16.4s }, [x24], #0x10\n"
+ "ld1 { v9.4s }, [x15], #0x10\n"
+ "ld1 { v13.4s }, [x25], #0x10\n"
+ "ld1 { v17.4s }, [x24], #0x10\n"
+ "ld1 { v20.4s }, [x23], #0x10\n"
+ "ld1 { v24.4s }, [x22], #0x10\n"
+ "ld1 { v21.4s }, [x23], #0x10\n"
+ "ld1 { v25.4s }, [x22], #0x10\n"
+ "tbz x8, #2, 137f\n"
+ "ld1 { v10.4s }, [x15], #0x10\n"
+ "ld1 { v14.4s }, [x25], #0x10\n"
+ "ld1 { v18.4s }, [x24], #0x10\n"
+ "ld1 { v22.4s }, [x23], #0x10\n"
+ "ld1 { v26.4s }, [x22], #0x10\n"
+ "tbz x8, #1, 136f\n"
+ "ldr d11, [x15], #0x8\n"
+ "mov x19, #0x38\n" // x19 is reused from here on: total bytes x15 has been post-incremented on this path (undone at label 143)
+ "ldr d15, [x25], #0x8\n"
+ "ldr d19, [x24], #0x8\n"
+ "ldr d23, [x23], #0x8\n"
+ "ldr d27, [x22], #0x8\n"
+ "tbz x8, #0, 143f\n"
+ "ld1 { v11.s }[2], [x15]\n" // final odd column goes into lane 2, after the pair loaded into lanes 0-1
+ "ld1 { v15.s }[2], [x25]\n"
+ "ld1 { v19.s }[2], [x24]\n"
+ "ld1 { v23.s }[2], [x23]\n"
+ "ld1 { v27.s }[2], [x22]\n"
+ "b 143f\n"
+ "136:" // Height 5: Partial accumulate: partial_1_12 (12 columns loaded so far, at most one more)
+ "mov x19, #0x30\n"
+ "tbz x8, #0, 143f\n"
+ "ldr s11, [x15, #0x0]\n"
+ "ldr s15, [x25, #0x0]\n"
+ "ldr s19, [x24, #0x0]\n"
+ "ldr s23, [x23, #0x0]\n"
+ "ldr s27, [x22, #0x0]\n"
+ "b 143f\n"
+ "137:" // Height 5: Partial accumulate: partial_2_8 (8 columns loaded so far)
+ "tbz x8, #1, 138f\n"
+ "ldr d10, [x15], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
+ "mov x19, #0x28\n"
+ "ldr d18, [x24], #0x8\n"
+ "ldr d22, [x23], #0x8\n"
+ "ldr d26, [x22], #0x8\n"
+ "tbz x8, #0, 143f\n"
+ "ld1 { v10.s }[2], [x15]\n"
+ "ld1 { v14.s }[2], [x25]\n"
+ "ld1 { v18.s }[2], [x24]\n"
+ "ld1 { v22.s }[2], [x23]\n"
+ "ld1 { v26.s }[2], [x22]\n"
+ "b 143f\n"
+ "138:" // Height 5: Partial accumulate: partial_1_8
+ "mov x19, #0x20\n"
+ "tbz x8, #0, 143f\n"
+ "ldr s10, [x15, #0x0]\n"
+ "ldr s14, [x25, #0x0]\n"
+ "ldr s18, [x24, #0x0]\n"
+ "ldr s22, [x23, #0x0]\n"
+ "ldr s26, [x22, #0x0]\n"
+ "b 143f\n"
+ "139:" // Height 5: Partial accumulate: partial_4_0 (fewer than 8 columns remain)
+ "tbz x8, #2, 141f\n"
+ "ld1 { v8.4s }, [x15], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v16.4s }, [x24], #0x10\n"
+ "ld1 { v20.4s }, [x23], #0x10\n"
+ "ld1 { v24.4s }, [x22], #0x10\n"
+ "tbz x8, #1, 140f\n"
+ "ldr d9, [x15], #0x8\n"
+ "mov x19, #0x18\n"
+ "ldr d13, [x25], #0x8\n"
+ "ldr d17, [x24], #0x8\n"
+ "ldr d21, [x23], #0x8\n"
+ "ldr d25, [x22], #0x8\n"
+ "tbz x8, #0, 143f\n"
+ "ld1 { v9.s }[2], [x15]\n"
+ "ld1 { v13.s }[2], [x25]\n"
+ "ld1 { v17.s }[2], [x24]\n"
+ "ld1 { v21.s }[2], [x23]\n"
+ "ld1 { v25.s }[2], [x22]\n"
+ "b 143f\n"
+ "140:" // Height 5: Partial accumulate: partial_1_4
+ "mov x19, #0x10\n"
+ "tbz x8, #0, 143f\n"
+ "ldr s9, [x15, #0x0]\n"
+ "ldr s13, [x25, #0x0]\n"
+ "ldr s17, [x24, #0x0]\n"
+ "ldr s21, [x23, #0x0]\n"
+ "ldr s25, [x22, #0x0]\n"
+ "b 143f\n"
+ "141:" // Height 5: Partial accumulate: partial_2_0 (fewer than 4 columns remain)
+ "tbz x8, #1, 142f\n"
+ "ldr d8, [x15], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
+ "mov x19, #0x8\n"
+ "ldr d16, [x24], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
+ "ldr d24, [x22], #0x8\n"
+ "tbz x8, #0, 143f\n"
+ "ld1 { v8.s }[2], [x15]\n"
+ "ld1 { v12.s }[2], [x25]\n"
+ "ld1 { v16.s }[2], [x24]\n"
+ "ld1 { v20.s }[2], [x23]\n"
+ "ld1 { v24.s }[2], [x22]\n"
+ "b 143f\n"
+ "142:" // Height 5: Partial accumulate: partial_1_0 (bits 3, 2 and 1 of x8 were all clear; no bit-0 test is made on this path)
+ "ldr s8, [x15, #0x0]\n"
+ "mov x19, #0x0\n"
+ "ldr s12, [x25, #0x0]\n"
+ "ldr s16, [x24, #0x0]\n"
+ "ldr s20, [x23, #0x0]\n"
+ "ldr s24, [x22, #0x0]\n"
+ "143:" // Height 5: Partial accumulate: Done
+ "sub x15, x15, x19\n" // rewind x15 to the start of the column block; x22-x25 are not rewound here (they are recomputed from x15 before writeback)
+ "b 146f\n"
+ "144:" // Height 5: full accumulate - load a complete 16-column block of existing output for each of the five rows
+ "ldr q8, [x15, #0x0]\n" // row 0 (x15)
+ "ldr q9, [x15, #0x10]\n"
+ "ldr q10, [x15, #0x20]\n"
+ "ldr q11, [x15, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n" // row 1 (x25)
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
+ "ldr q16, [x24, #0x0]\n" // row 2 (x24)
+ "ldr q17, [x24, #0x10]\n"
+ "ldr q18, [x24, #0x20]\n"
+ "ldr q19, [x24, #0x30]\n"
+ "ldr q20, [x23, #0x0]\n" // row 3 (x23)
+ "ldr q21, [x23, #0x10]\n"
+ "ldr q22, [x23, #0x20]\n"
+ "ldr q23, [x23, #0x30]\n"
+ "ldr q24, [x22, #0x0]\n" // row 4 (x22)
+ "ldr q25, [x22, #0x10]\n"
+ "ldr q26, [x22, #0x20]\n"
+ "ldr q27, [x22, #0x30]\n"
+ "b 146f\n"
+ "145:" // Height 5: no accumulate - clear all twenty accumulators (v8-v27)
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "movi v20.16b, #0x0\n"
+ "movi v21.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v26.16b, #0x0\n"
+ "movi v27.16b, #0x0\n"
+ "146:" // Height 5: setup done - accumulators v8-v27 hold their starting values
+ "mov x14, #0x0\n" // x14 = index of the current input string
+ "147:" // Height 5: String loop - one pass per input string, all accumulating into the same v8-v27
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w13, [x20, x14, LSL #0x2]\n" // x13 = length of string x14 (32-bit table entry); counted down by the multiply loops
+ "tbz %x[flags], #3, 148f\n" // flags bit 3 clear: input_ptr addresses the data directly
+ "ldr x20, [%x[input_ptr], x14, LSL #0x3]\n" // indirect case: input_ptr is indexed as a table of 8-byte pointers, one per string
+ "add x20, x20, x19, LSL #3\n" // skip input_offset 8-byte entries within that string's table
+ "ldr x12, [x20, #0x0]\n" // x12, x28, x26, x24, x22 = input pointers for rows 0-4
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "cbnz x14, 149f\n" // the initial column offset below is applied to the first string only
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x12, x12, x19, LSL #2\n" // advance every row pointer by input_initial_col 4-byte elements
+ "add x28, x28, x19, LSL #2\n"
+ "add x26, x26, x19, LSL #2\n"
+ "add x24, x24, x19, LSL #2\n"
+ "add x22, x22, x19, LSL #2\n"
+ "b 149f\n"
+ "148:" // Height 5: setup direct input - rows are input_offset (x19) 4-byte elements apart starting at input_ptr
+ "mov x12, %x[input_ptr]\n"
+ "add x28, x12, x19, LSL #2\n"
+ "add x26, x28, x19, LSL #2\n"
+ "add x24, x26, x19, LSL #2\n"
+ "add x22, x24, x19, LSL #2\n"
+ "149:" // Height 5: input setup done
+ "cmp x13, #0x4\n"
+ "blt 152f\n" // fewer than 4 elements in this string: only the one-at-a-time odd loop runs
+ "ldr q0, [x12, #0x0]\n" // preload 4 input elements per row into v0-v4
+ "ldr q1, [x28, #0x0]\n"
+ "cmp x13, #0x8\n" // flags are consumed by the blt below; the loads in between do not alter them
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q4, [x22, #0x0]\n"
+ "ldr q6, [x17, #0x0]\n" // preload the first B vector
+ "blt 151f\n" // fewer than 8 elements: go straight to the single, non-pipelined iteration
+ "150:" // Height 5: Multiply loop: Main loop head - each pass uses lanes 0-3 of v0-v4 against 16 B vectors (0x100 bytes) and reloads the next A/B data. NOTE(review): the split 64-bit load pattern looks like the A55-tuned variant named in the commit title - confirm against the file name
+ "fmla v8.4s, v6.4s, v0.s[0]\n"
+ "ldr d7, [x17, #0x10]\n" // B vectors are fetched as two 64-bit halves: ldr d (low), ldr x (high), then mov vN.d[1], xM
+ "fmla v12.4s, v6.4s, v1.s[0]\n"
+ "ldr x11, [x17, #0x18]\n"
+ "fmla v16.4s, v6.4s, v2.s[0]\n"
+ "ldr x10, [x17, #0x28]\n"
+ "fmla v20.4s, v6.4s, v3.s[0]\n"
+ "add x12, x12, #0x10\n" // row 0 input pointer moves on to the next 4 elements
+ "fmla v24.4s, v6.4s, v4.s[0]\n"
+ "mov v7.d[1], x11\n"
+ "prfm pldl1keep, [x12, #0x80]\n" // prefetch row 0 input 0x80 bytes ahead
+ "add x28, x28, #0x10\n"
+ "fmla v9.4s, v7.4s, v0.s[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "fmla v13.4s, v7.4s, v1.s[0]\n"
+ "ldr d6, [x17, #0x20]\n"
+ "fmla v17.4s, v7.4s, v2.s[0]\n"
+ "ldr x11, [x17, #0x38]\n"
+ "fmla v21.4s, v7.4s, v3.s[0]\n"
+ "ldr x9, [x12, #0x8]\n" // x9 = high half of the NEXT row 0 input vector (x12 already advanced); merged into v0 at the loop bottom
+ "fmla v25.4s, v7.4s, v4.s[0]\n"
+ "mov v6.d[1], x10\n"
+ "ldr d7, [x17, #0x30]\n"
+ "add x26, x26, #0x10\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "fmla v14.4s, v6.4s, v1.s[0]\n"
+ "ldr x10, [x17, #0x48]\n"
+ "fmla v18.4s, v6.4s, v2.s[0]\n"
+ "mov v7.d[1], x11\n"
+ "fmla v22.4s, v6.4s, v3.s[0]\n"
+ "ldr x11, [x17, #0x58]\n"
+ "fmla v26.4s, v6.4s, v4.s[0]\n"
+ "ldr d6, [x17, #0x40]\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "ldr x27, [x28, #0x8]\n" // x27, x25, x23, x21 = high halves of the next input vectors for rows 1-4
+ "fmla v15.4s, v7.4s, v1.s[0]\n"
+ "ldr x25, [x26, #0x8]\n"
+ "fmla v19.4s, v7.4s, v2.s[0]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v23.4s, v7.4s, v3.s[0]\n"
+ "ldr x10, [x17, #0x68]\n"
+ "fmla v27.4s, v7.4s, v4.s[0]\n"
+ "ldr d7, [x17, #0x50]\n"
+ "fmla v8.4s, v6.4s, v0.s[1]\n" // second input element (lane 1) starts here
+ "add x24, x24, #0x10\n"
+ "fmla v12.4s, v6.4s, v1.s[1]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "fmla v16.4s, v6.4s, v2.s[1]\n"
+ "mov v7.d[1], x11\n"
+ "fmla v20.4s, v6.4s, v3.s[1]\n"
+ "ldr x11, [x17, #0x78]\n"
+ "fmla v24.4s, v6.4s, v4.s[1]\n"
+ "ldr d6, [x17, #0x60]\n"
+ "fmla v9.4s, v7.4s, v0.s[1]\n"
+ "ldr x23, [x24, #0x8]\n"
+ "fmla v13.4s, v7.4s, v1.s[1]\n"
+ "add x22, x22, #0x10\n"
+ "fmla v17.4s, v7.4s, v2.s[1]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v21.4s, v7.4s, v3.s[1]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "fmla v25.4s, v7.4s, v4.s[1]\n"
+ "ldr d7, [x17, #0x70]\n"
+ "fmla v10.4s, v6.4s, v0.s[1]\n"
+ "ldr x10, [x17, #0x88]\n"
+ "fmla v14.4s, v6.4s, v1.s[1]\n"
+ "ldr x21, [x22, #0x8]\n"
+ "fmla v18.4s, v6.4s, v2.s[1]\n"
+ "mov v7.d[1], x11\n"
+ "fmla v22.4s, v6.4s, v3.s[1]\n"
+ "ldr x11, [x17, #0x98]\n"
+ "fmla v26.4s, v6.4s, v4.s[1]\n"
+ "ldr d6, [x17, #0x80]\n"
+ "fmla v11.4s, v7.4s, v0.s[1]\n"
+ "sub x13, x13, #0x4\n" // four elements of the string are consumed per pass
+ "fmla v15.4s, v7.4s, v1.s[1]\n"
+ "cmp x13, #0x8\n" // flags are consumed by the bge at the loop bottom; nothing in between sets flags
+ "fmla v19.4s, v7.4s, v2.s[1]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v23.4s, v7.4s, v3.s[1]\n"
+ "ldr x10, [x17, #0xa8]\n"
+ "fmla v27.4s, v7.4s, v4.s[1]\n"
+ "ldr d7, [x17, #0x90]\n"
+ "fmla v8.4s, v6.4s, v0.s[2]\n" // third input element (lane 2) starts here
+ "fmla v12.4s, v6.4s, v1.s[2]\n"
+ "fmla v16.4s, v6.4s, v2.s[2]\n"
+ "mov v7.d[1], x11\n"
+ "fmla v20.4s, v6.4s, v3.s[2]\n"
+ "ldr x11, [x17, #0xb8]\n"
+ "fmla v24.4s, v6.4s, v4.s[2]\n"
+ "ldr d6, [x17, #0xa0]\n"
+ "fmla v9.4s, v7.4s, v0.s[2]\n"
+ "fmla v13.4s, v7.4s, v1.s[2]\n"
+ "fmla v17.4s, v7.4s, v2.s[2]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v21.4s, v7.4s, v3.s[2]\n"
+ "ldr x10, [x17, #0xc8]\n"
+ "fmla v25.4s, v7.4s, v4.s[2]\n"
+ "ldr d7, [x17, #0xb0]\n"
+ "fmla v10.4s, v6.4s, v0.s[2]\n"
+ "fmla v14.4s, v6.4s, v1.s[2]\n"
+ "fmla v18.4s, v6.4s, v2.s[2]\n"
+ "mov v7.d[1], x11\n"
+ "fmla v22.4s, v6.4s, v3.s[2]\n"
+ "ldr x11, [x17, #0xd8]\n"
+ "fmla v26.4s, v6.4s, v4.s[2]\n"
+ "ldr d6, [x17, #0xc0]\n"
+ "fmla v11.4s, v7.4s, v0.s[2]\n"
+ "fmla v15.4s, v7.4s, v1.s[2]\n"
+ "fmla v19.4s, v7.4s, v2.s[2]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v23.4s, v7.4s, v3.s[2]\n"
+ "ldr x10, [x17, #0xe8]\n"
+ "fmla v27.4s, v7.4s, v4.s[2]\n"
+ "ldr d7, [x17, #0xd0]\n"
+ "fmla v8.4s, v6.4s, v0.s[3]\n" // fourth input element (lane 3) starts here
+ "fmla v12.4s, v6.4s, v1.s[3]\n"
+ "fmla v16.4s, v6.4s, v2.s[3]\n"
+ "mov v7.d[1], x11\n"
+ "fmla v20.4s, v6.4s, v3.s[3]\n"
+ "ldr x11, [x17, #0xf8]\n"
+ "fmla v24.4s, v6.4s, v4.s[3]\n"
+ "ldr d6, [x17, #0xe0]\n"
+ "fmla v9.4s, v7.4s, v0.s[3]\n"
+ "fmla v13.4s, v7.4s, v1.s[3]\n"
+ "fmla v17.4s, v7.4s, v2.s[3]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v21.4s, v7.4s, v3.s[3]\n"
+ "fmla v25.4s, v7.4s, v4.s[3]\n"
+ "ldr d7, [x17, #0xf0]\n"
+ "fmla v10.4s, v6.4s, v0.s[3]\n"
+ "add x17, x17, #0x100\n" // B pointer moves past the 16 vectors used by this pass
+ "fmla v14.4s, v6.4s, v1.s[3]\n"
+ "ldr x10, [x17, #0x8]\n" // start fetching the first B vector of the next pass (high half)
+ "fmla v18.4s, v6.4s, v2.s[3]\n"
+ "mov v7.d[1], x11\n"
+ "fmla v22.4s, v6.4s, v3.s[3]\n"
+ "fmla v26.4s, v6.4s, v4.s[3]\n"
+ "ldr d6, [x17, #0x0]\n"
+ "fmla v11.4s, v7.4s, v0.s[3]\n" // last use of the old v0; it is reloaded on the next line
+ "ldr d0, [x12, #0x0]\n"
+ "fmla v15.4s, v7.4s, v1.s[3]\n"
+ "ldr d1, [x28, #0x0]\n"
+ "fmla v19.4s, v7.4s, v2.s[3]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v23.4s, v7.4s, v3.s[3]\n"
+ "mov v0.d[1], x9\n" // complete the next input vectors from the high halves fetched earlier in the pass
+ "fmla v27.4s, v7.4s, v4.s[3]\n"
+ "mov v1.d[1], x27\n"
+ "ldr d2, [x26, #0x0]\n"
+ "ldr d3, [x24, #0x0]\n"
+ "ldr d4, [x22, #0x0]\n"
+ "mov v2.d[1], x25\n"
+ "mov v3.d[1], x23\n"
+ "mov v4.d[1], x21\n"
+ "bge 150b\n" // repeat while 8 or more elements remain; otherwise fall through with v0-v4 and v6 already loaded
+ "151:" // Height 5: Multiply loop: Single iteration only - consumes the 4 elements already held in v0-v4 (and the B vector in v6) without preloading any further input
+ "fmla v8.4s, v6.4s, v0.s[0]\n"
+ "ldr q7, [x17, #0x10]\n" // B vectors are loaded with plain 128-bit loads here
+ "fmla v12.4s, v6.4s, v1.s[0]\n"
+ "sub x13, x13, #0x4\n" // x13 now holds the leftover element count handled by the odd loop
+ "fmla v16.4s, v6.4s, v2.s[0]\n"
+ "add x12, x12, #0x10\n" // input pointers step past the 4 elements being consumed
+ "fmla v20.4s, v6.4s, v3.s[0]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "fmla v24.4s, v6.4s, v4.s[0]\n"
+ "ldr q6, [x17, #0x20]\n"
+ "fmla v9.4s, v7.4s, v0.s[0]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v13.4s, v7.4s, v1.s[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "fmla v17.4s, v7.4s, v2.s[0]\n"
+ "add x26, x26, #0x10\n"
+ "fmla v21.4s, v7.4s, v3.s[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "fmla v25.4s, v7.4s, v4.s[0]\n"
+ "ldr q7, [x17, #0x30]\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "add x24, x24, #0x10\n"
+ "fmla v14.4s, v6.4s, v1.s[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "fmla v18.4s, v6.4s, v2.s[0]\n"
+ "add x22, x22, #0x10\n"
+ "fmla v22.4s, v6.4s, v3.s[0]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "fmla v26.4s, v6.4s, v4.s[0]\n"
+ "ldr q6, [x17, #0x40]\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "fmla v15.4s, v7.4s, v1.s[0]\n"
+ "fmla v19.4s, v7.4s, v2.s[0]\n"
+ "fmla v23.4s, v7.4s, v3.s[0]\n"
+ "fmla v27.4s, v7.4s, v4.s[0]\n"
+ "ldr q7, [x17, #0x50]\n"
+ "fmla v8.4s, v6.4s, v0.s[1]\n" // second input element (lane 1)
+ "fmla v12.4s, v6.4s, v1.s[1]\n"
+ "fmla v16.4s, v6.4s, v2.s[1]\n"
+ "fmla v20.4s, v6.4s, v3.s[1]\n"
+ "fmla v24.4s, v6.4s, v4.s[1]\n"
+ "ldr q6, [x17, #0x60]\n"
+ "fmla v9.4s, v7.4s, v0.s[1]\n"
+ "fmla v13.4s, v7.4s, v1.s[1]\n"
+ "fmla v17.4s, v7.4s, v2.s[1]\n"
+ "fmla v21.4s, v7.4s, v3.s[1]\n"
+ "fmla v25.4s, v7.4s, v4.s[1]\n"
+ "ldr q7, [x17, #0x70]\n"
+ "fmla v10.4s, v6.4s, v0.s[1]\n"
+ "fmla v14.4s, v6.4s, v1.s[1]\n"
+ "fmla v18.4s, v6.4s, v2.s[1]\n"
+ "fmla v22.4s, v6.4s, v3.s[1]\n"
+ "fmla v26.4s, v6.4s, v4.s[1]\n"
+ "ldr q6, [x17, #0x80]\n"
+ "fmla v11.4s, v7.4s, v0.s[1]\n"
+ "fmla v15.4s, v7.4s, v1.s[1]\n"
+ "fmla v19.4s, v7.4s, v2.s[1]\n"
+ "fmla v23.4s, v7.4s, v3.s[1]\n"
+ "fmla v27.4s, v7.4s, v4.s[1]\n"
+ "ldr q7, [x17, #0x90]\n"
+ "fmla v8.4s, v6.4s, v0.s[2]\n" // third input element (lane 2)
+ "fmla v12.4s, v6.4s, v1.s[2]\n"
+ "fmla v16.4s, v6.4s, v2.s[2]\n"
+ "fmla v20.4s, v6.4s, v3.s[2]\n"
+ "fmla v24.4s, v6.4s, v4.s[2]\n"
+ "ldr q6, [x17, #0xa0]\n"
+ "fmla v9.4s, v7.4s, v0.s[2]\n"
+ "fmla v13.4s, v7.4s, v1.s[2]\n"
+ "fmla v17.4s, v7.4s, v2.s[2]\n"
+ "fmla v21.4s, v7.4s, v3.s[2]\n"
+ "fmla v25.4s, v7.4s, v4.s[2]\n"
+ "ldr q7, [x17, #0xb0]\n"
+ "fmla v10.4s, v6.4s, v0.s[2]\n"
+ "fmla v14.4s, v6.4s, v1.s[2]\n"
+ "fmla v18.4s, v6.4s, v2.s[2]\n"
+ "fmla v22.4s, v6.4s, v3.s[2]\n"
+ "fmla v26.4s, v6.4s, v4.s[2]\n"
+ "ldr q6, [x17, #0xc0]\n"
+ "fmla v11.4s, v7.4s, v0.s[2]\n"
+ "fmla v15.4s, v7.4s, v1.s[2]\n"
+ "fmla v19.4s, v7.4s, v2.s[2]\n"
+ "fmla v23.4s, v7.4s, v3.s[2]\n"
+ "fmla v27.4s, v7.4s, v4.s[2]\n"
+ "ldr q7, [x17, #0xd0]\n"
+ "fmla v8.4s, v6.4s, v0.s[3]\n" // fourth input element (lane 3)
+ "fmla v12.4s, v6.4s, v1.s[3]\n"
+ "fmla v16.4s, v6.4s, v2.s[3]\n"
+ "fmla v20.4s, v6.4s, v3.s[3]\n"
+ "fmla v24.4s, v6.4s, v4.s[3]\n"
+ "ldr q6, [x17, #0xe0]\n"
+ "fmla v9.4s, v7.4s, v0.s[3]\n"
+ "fmla v13.4s, v7.4s, v1.s[3]\n"
+ "fmla v17.4s, v7.4s, v2.s[3]\n"
+ "fmla v21.4s, v7.4s, v3.s[3]\n"
+ "fmla v25.4s, v7.4s, v4.s[3]\n"
+ "ldr q7, [x17, #0xf0]\n"
+ "fmla v10.4s, v6.4s, v0.s[3]\n"
+ "add x17, x17, #0x100\n" // B pointer moves past the 16 vectors used by this iteration
+ "fmla v14.4s, v6.4s, v1.s[3]\n"
+ "fmla v18.4s, v6.4s, v2.s[3]\n"
+ "fmla v22.4s, v6.4s, v3.s[3]\n"
+ "fmla v26.4s, v6.4s, v4.s[3]\n"
+ "fmla v11.4s, v7.4s, v0.s[3]\n"
+ "fmla v15.4s, v7.4s, v1.s[3]\n"
+ "fmla v19.4s, v7.4s, v2.s[3]\n"
+ "fmla v23.4s, v7.4s, v3.s[3]\n"
+ "fmla v27.4s, v7.4s, v4.s[3]\n"
+ "152:" // Height 5: Multiply loop: Main loop skip
+ "cbz x13, 154f\n" // no leftover elements in this string
+ "153:" // Height 5: Multiply loop: Odd block loop - one input element per row and 4 B vectors (0x40 bytes) per pass
+ "ldr s0, [x12], #0x4\n" // load one fp32 per row, post-incrementing each input pointer by 4 bytes
+ "sub x13, x13, #0x1\n"
+ "ldr s1, [x28], #0x4\n"
+ "ldr s2, [x26], #0x4\n"
+ "ldr s3, [x24], #0x4\n"
+ "ldr s4, [x22], #0x4\n"
+ "ldr q6, [x17, #0x0]\n"
+ "ldr q7, [x17, #0x10]\n"
+ "fmla v8.4s, v6.4s, v0.s[0]\n" // only lane 0 of v0-v4 is used in this loop
+ "fmla v12.4s, v6.4s, v1.s[0]\n"
+ "fmla v16.4s, v6.4s, v2.s[0]\n"
+ "fmla v20.4s, v6.4s, v3.s[0]\n"
+ "fmla v24.4s, v6.4s, v4.s[0]\n"
+ "ldr q6, [x17, #0x20]\n"
+ "fmla v9.4s, v7.4s, v0.s[0]\n"
+ "fmla v13.4s, v7.4s, v1.s[0]\n"
+ "fmla v17.4s, v7.4s, v2.s[0]\n"
+ "fmla v21.4s, v7.4s, v3.s[0]\n"
+ "fmla v25.4s, v7.4s, v4.s[0]\n"
+ "ldr q7, [x17, #0x30]\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "add x17, x17, #0x40\n" // B pointer moves past the 4 vectors used by this pass
+ "fmla v14.4s, v6.4s, v1.s[0]\n"
+ "fmla v18.4s, v6.4s, v2.s[0]\n"
+ "fmla v22.4s, v6.4s, v3.s[0]\n"
+ "fmla v26.4s, v6.4s, v4.s[0]\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "fmla v15.4s, v7.4s, v1.s[0]\n"
+ "fmla v19.4s, v7.4s, v2.s[0]\n"
+ "fmla v23.4s, v7.4s, v3.s[0]\n"
+ "fmla v27.4s, v7.4s, v4.s[0]\n"
+ "cbnz x13, 153b\n" // repeat until the string is exhausted
+ "154:" // Height 5: Multiply loop: No odd multiplies - current string finished
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x14, x14, #0x1\n"
+ "cmp x14, x19\n"
+ "bne 147b\n" // more strings to process: accumulate the next one into the same registers
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "prfm pstl1keep, [x15, #0x0]\n" // prefetch-for-store each output row about to be written
+ "add x25, x15, x19, LSL #2\n" // recompute output pointers for rows 1-4 (x22-x25 were overwritten during the multiply phase)
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "add x24, x25, x19, LSL #2\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "add x23, x24, x19, LSL #2\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "add x22, x23, x19, LSL #2\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "tbz %x[flags], #1, 155f\n" // flags bit 1 clear: skip the min/max clamp
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v1.4s }, [x20]\n" // v1 = lower bound replicated to all four lanes
+ "ld1r { v0.4s }, [x19]\n" // v0 = upper bound replicated to all four lanes
+ "fmin v8.4s, v8.4s, v0.4s\n" // every accumulator gets fmin against the upper bound, then fmax against the lower bound
+ "fmin v9.4s, v9.4s, v0.4s\n"
+ "fmin v10.4s, v10.4s, v0.4s\n"
+ "fmin v11.4s, v11.4s, v0.4s\n"
+ "fmin v12.4s, v12.4s, v0.4s\n"
+ "fmin v13.4s, v13.4s, v0.4s\n"
+ "fmin v14.4s, v14.4s, v0.4s\n"
+ "fmin v15.4s, v15.4s, v0.4s\n"
+ "fmin v16.4s, v16.4s, v0.4s\n"
+ "fmin v17.4s, v17.4s, v0.4s\n"
+ "fmax v8.4s, v8.4s, v1.4s\n"
+ "fmax v9.4s, v9.4s, v1.4s\n"
+ "fmax v10.4s, v10.4s, v1.4s\n"
+ "fmax v11.4s, v11.4s, v1.4s\n"
+ "fmax v12.4s, v12.4s, v1.4s\n"
+ "fmax v13.4s, v13.4s, v1.4s\n"
+ "fmax v14.4s, v14.4s, v1.4s\n"
+ "fmax v15.4s, v15.4s, v1.4s\n"
+ "fmax v16.4s, v16.4s, v1.4s\n"
+ "fmax v17.4s, v17.4s, v1.4s\n"
+ "fmin v18.4s, v18.4s, v0.4s\n"
+ "fmin v19.4s, v19.4s, v0.4s\n"
+ "fmin v20.4s, v20.4s, v0.4s\n"
+ "fmin v21.4s, v21.4s, v0.4s\n"
+ "fmin v22.4s, v22.4s, v0.4s\n"
+ "fmin v23.4s, v23.4s, v0.4s\n"
+ "fmin v24.4s, v24.4s, v0.4s\n"
+ "fmin v25.4s, v25.4s, v0.4s\n"
+ "fmin v26.4s, v26.4s, v0.4s\n"
+ "fmin v27.4s, v27.4s, v0.4s\n"
+ "fmax v18.4s, v18.4s, v1.4s\n"
+ "fmax v19.4s, v19.4s, v1.4s\n"
+ "fmax v20.4s, v20.4s, v1.4s\n"
+ "fmax v21.4s, v21.4s, v1.4s\n"
+ "fmax v22.4s, v22.4s, v1.4s\n"
+ "fmax v23.4s, v23.4s, v1.4s\n"
+ "fmax v24.4s, v24.4s, v1.4s\n"
+ "fmax v25.4s, v25.4s, v1.4s\n"
+ "fmax v26.4s, v26.4s, v1.4s\n"
+ "fmax v27.4s, v27.4s, v1.4s\n"
+ "155:" // Height 5: No activation - choose between full and partial writeback
+ "cmp x8, #0x10\n"
+ "bge 164f\n" // 16 or more columns remain: store the whole block
+ "tbz x8, #3, 159f\n" // x8 < 16 here, so each bit of x8 selects an 8/4/2/1-column piece to store
+ "st1 { v8.4s }, [x15], #0x10\n"
+ "st1 { v9.4s }, [x15], #0x10\n"
+ "st1 { v12.4s }, [x25], #0x10\n"
+ "st1 { v13.4s }, [x25], #0x10\n"
+ "st1 { v16.4s }, [x24], #0x10\n"
+ "st1 { v17.4s }, [x24], #0x10\n"
+ "st1 { v20.4s }, [x23], #0x10\n"
+ "st1 { v21.4s }, [x23], #0x10\n"
+ "st1 { v24.4s }, [x22], #0x10\n"
+ "st1 { v25.4s }, [x22], #0x10\n"
+ "tbz x8, #2, 157f\n"
+ "st1 { v10.4s }, [x15], #0x10\n"
+ "st1 { v14.4s }, [x25], #0x10\n"
+ "st1 { v18.4s }, [x24], #0x10\n"
+ "st1 { v22.4s }, [x23], #0x10\n"
+ "st1 { v26.4s }, [x22], #0x10\n"
+ "tbz x8, #1, 156f\n"
+ "str d11, [x15], #0x8\n"
+ "str d15, [x25], #0x8\n"
+ "str d19, [x24], #0x8\n"
+ "str d23, [x23], #0x8\n"
+ "str d27, [x22], #0x8\n"
+ "tbz x8, #0, 163f\n"
+ "st1 { v11.s }[2], [x15]\n" // final odd column comes from lane 2, after the pair stored from lanes 0-1
+ "st1 { v15.s }[2], [x25]\n"
+ "st1 { v19.s }[2], [x24]\n"
+ "st1 { v23.s }[2], [x23]\n"
+ "st1 { v27.s }[2], [x22]\n"
+ "b 163f\n"
+ "156:" // Height 5: Partial direct writeback: partial_1_12 (12 columns stored so far, at most one more)
+ "tbz x8, #0, 163f\n"
+ "str s11, [x15, #0x0]\n"
+ "str s15, [x25, #0x0]\n"
+ "str s19, [x24, #0x0]\n"
+ "str s23, [x23, #0x0]\n"
+ "str s27, [x22, #0x0]\n"
+ "b 163f\n"
+ "157:" // Height 5: Partial direct writeback: partial_2_8 (8 columns stored so far)
+ "tbz x8, #1, 158f\n"
+ "str d10, [x15], #0x8\n"
+ "str d14, [x25], #0x8\n"
+ "str d18, [x24], #0x8\n"
+ "str d22, [x23], #0x8\n"
+ "str d26, [x22], #0x8\n"
+ "tbz x8, #0, 163f\n"
+ "st1 { v10.s }[2], [x15]\n"
+ "st1 { v14.s }[2], [x25]\n"
+ "st1 { v18.s }[2], [x24]\n"
+ "st1 { v22.s }[2], [x23]\n"
+ "st1 { v26.s }[2], [x22]\n"
+ "b 163f\n"
+ "158:" // Height 5: Partial direct writeback: partial_1_8
+ "tbz x8, #0, 163f\n"
+ "str s10, [x15, #0x0]\n"
+ "str s14, [x25, #0x0]\n"
+ "str s18, [x24, #0x0]\n"
+ "str s22, [x23, #0x0]\n"
+ "str s26, [x22, #0x0]\n"
+ "b 163f\n"
+ "159:" // Height 5: Partial direct writeback: partial_4_0 (fewer than 8 columns remain)
+ "tbz x8, #2, 161f\n"
+ "st1 { v8.4s }, [x15], #0x10\n"
+ "st1 { v12.4s }, [x25], #0x10\n"
+ "st1 { v16.4s }, [x24], #0x10\n"
+ "st1 { v20.4s }, [x23], #0x10\n"
+ "st1 { v24.4s }, [x22], #0x10\n"
+ "tbz x8, #1, 160f\n"
+ "str d9, [x15], #0x8\n"
+ "str d13, [x25], #0x8\n"
+ "str d17, [x24], #0x8\n"
+ "str d21, [x23], #0x8\n"
+ "str d25, [x22], #0x8\n"
+ "tbz x8, #0, 163f\n"
+ "st1 { v9.s }[2], [x15]\n"
+ "st1 { v13.s }[2], [x25]\n"
+ "st1 { v17.s }[2], [x24]\n"
+ "st1 { v21.s }[2], [x23]\n"
+ "st1 { v25.s }[2], [x22]\n"
+ "b 163f\n"
+ "160:" // Height 5: Partial direct writeback: partial_1_4
+ "tbz x8, #0, 163f\n"
+ "str s9, [x15, #0x0]\n"
+ "str s13, [x25, #0x0]\n"
+ "str s17, [x24, #0x0]\n"
+ "str s21, [x23, #0x0]\n"
+ "str s25, [x22, #0x0]\n"
+ "b 163f\n"
+ "161:" // Height 5: Partial direct writeback: partial_2_0 (fewer than 4 columns remain)
+ "tbz x8, #1, 162f\n"
+ "str d8, [x15], #0x8\n"
+ "str d12, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
+ "str d20, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
+ "tbz x8, #0, 163f\n"
+ "st1 { v8.s }[2], [x15]\n"
+ "st1 { v12.s }[2], [x25]\n"
+ "st1 { v16.s }[2], [x24]\n"
+ "st1 { v20.s }[2], [x23]\n"
+ "st1 { v24.s }[2], [x22]\n"
+ "b 163f\n"
+ "162:" // Height 5: Partial direct writeback: partial_1_0 (bits 3, 2 and 1 of x8 were all clear; no bit-0 test is made on this path)
+ "str s8, [x15, #0x0]\n"
+ "str s12, [x25, #0x0]\n"
+ "str s16, [x24, #0x0]\n"
+ "str s20, [x23, #0x0]\n"
+ "str s24, [x22, #0x0]\n"
+ "163:" // Height 5: Partial direct writeback: Done
+ "b 165f\n"
+ "164:" // Height 5: Full writeback - store a complete 16-column block for each of the five rows
+ "str q8, [x15, #0x0]\n" // row 0 (x15)
+ "str q9, [x15, #0x10]\n"
+ "str q10, [x15, #0x20]\n"
+ "str q11, [x15, #0x30]\n"
+ "add x15, x15, #0x40\n" // only x15 is advanced to the next column block; x22-x25 are rebuilt from it on the next pass
+ "str q12, [x25, #0x0]\n" // row 1 (x25)
+ "str q13, [x25, #0x10]\n"
+ "str q14, [x25, #0x20]\n"
+ "str q15, [x25, #0x30]\n"
+ "str q16, [x24, #0x0]\n" // row 2 (x24)
+ "str q17, [x24, #0x10]\n"
+ "str q18, [x24, #0x20]\n"
+ "str q19, [x24, #0x30]\n"
+ "str q20, [x23, #0x0]\n" // row 3 (x23)
+ "str q21, [x23, #0x10]\n"
+ "str q22, [x23, #0x20]\n"
+ "str q23, [x23, #0x30]\n"
+ "str q24, [x22, #0x0]\n" // row 4 (x22)
+ "str q25, [x22, #0x10]\n"
+ "str q26, [x22, #0x20]\n"
+ "str q27, [x22, #0x30]\n"
+ "165:" // Height 5: Writeback done
+ "subs x8, x8, #0x10\n" // 16 columns accounted for; flags feed the bgt below
+ "bgt 134b\n" // columns remain: run the column loop again
+ "b 200f\n" // label 200 is outside this excerpt - presumably the common exit; confirm
+ "166:" // Height 6
+ "ldr x8, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x16, %x[bias]\n"
+ "ldr x17, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x15, %x[output_ptr]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov x19, #0x18\n"
+ "madd %x[output_ptr], x20, x19, %x[output_ptr]\n"
+ "167:" // Height 6: Column loop
+ "cbz x16, 168f\n"
+ "ldr q8, [x16, #0x0]\n"
+ "ldr q9, [x16, #0x10]\n"
+ "ldr q10, [x16, #0x20]\n"
+ "mov v12.16b, v8.16b\n"
+ "mov v16.16b, v8.16b\n"
+ "mov v13.16b, v9.16b\n"
+ "mov v17.16b, v9.16b\n"
+ "mov v14.16b, v10.16b\n"
+ "mov v18.16b, v10.16b\n"
+ "mov v20.16b, v8.16b\n"
+ "mov v21.16b, v9.16b\n"
+ "mov v22.16b, v10.16b\n"
+ "mov v24.16b, v8.16b\n"
+ "mov v25.16b, v9.16b\n"
+ "mov v26.16b, v10.16b\n"
+ "mov v28.16b, v8.16b\n"
+ "mov v29.16b, v9.16b\n"
+ "mov v30.16b, v10.16b\n"
+ "ldr q11, [x16, #0x30]\n"
+ "add x16, x16, #0x40\n"
+ "mov v15.16b, v11.16b\n"
+ "mov v19.16b, v11.16b\n"
+ "mov v23.16b, v11.16b\n"
+ "mov v27.16b, v11.16b\n"
+ "mov v31.16b, v11.16b\n"
+ "b 179f\n"
+ "168:" // Height 6: no bias
+ "tbz %x[flags], #0, 178f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x8, #0x10\n"
+ "add x25, x15, x19, LSL #2\n"
+ "add x24, x25, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "add x21, x22, x19, LSL #2\n"
+ "bge 177f\n"
+ "tbz x8, #3, 172f\n"
+ "ld1 { v8.4s }, [x15], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v16.4s }, [x24], #0x10\n"
+ "ld1 { v9.4s }, [x15], #0x10\n"
+ "ld1 { v13.4s }, [x25], #0x10\n"
+ "ld1 { v17.4s }, [x24], #0x10\n"
+ "ld1 { v20.4s }, [x23], #0x10\n"
+ "ld1 { v24.4s }, [x22], #0x10\n"
+ "ld1 { v28.4s }, [x21], #0x10\n"
+ "ld1 { v21.4s }, [x23], #0x10\n"
+ "ld1 { v25.4s }, [x22], #0x10\n"
+ "ld1 { v29.4s }, [x21], #0x10\n"
+ "tbz x8, #2, 170f\n"
+ "ld1 { v10.4s }, [x15], #0x10\n"
+ "ld1 { v14.4s }, [x25], #0x10\n"
+ "ld1 { v18.4s }, [x24], #0x10\n"
+ "ld1 { v22.4s }, [x23], #0x10\n"
+ "ld1 { v26.4s }, [x22], #0x10\n"
+ "ld1 { v30.4s }, [x21], #0x10\n"
+ "tbz x8, #1, 169f\n"
+ "ldr d11, [x15], #0x8\n"
+ "mov x19, #0x38\n"
+ "ldr d15, [x25], #0x8\n"
+ "ldr d19, [x24], #0x8\n"
+ "ldr d23, [x23], #0x8\n"
+ "ldr d27, [x22], #0x8\n"
+ "ldr d31, [x21], #0x8\n"
+ "tbz x8, #0, 176f\n"
+ "ld1 { v11.s }[2], [x15]\n"
+ "ld1 { v15.s }[2], [x25]\n"
+ "ld1 { v19.s }[2], [x24]\n"
+ "ld1 { v23.s }[2], [x23]\n"
+ "ld1 { v27.s }[2], [x22]\n"
+ "ld1 { v31.s }[2], [x21]\n"
+ "b 176f\n"
+ "169:" // Height 6: Partial accumulate: partial_1_12
+ "mov x19, #0x30\n"
+ "tbz x8, #0, 176f\n"
+ "ldr s11, [x15, #0x0]\n"
+ "ldr s15, [x25, #0x0]\n"
+ "ldr s19, [x24, #0x0]\n"
+ "ldr s23, [x23, #0x0]\n"
+ "ldr s27, [x22, #0x0]\n"
+ "ldr s31, [x21, #0x0]\n"
+ "b 176f\n"
+ "170:" // Height 6: Partial accumulate: partial_2_8
+ "tbz x8, #1, 171f\n"
+ "ldr d10, [x15], #0x8\n"
+ "ldr d14, [x25], #0x8\n"
+ "mov x19, #0x28\n"
+ "ldr d18, [x24], #0x8\n"
+ "ldr d22, [x23], #0x8\n"
+ "ldr d26, [x22], #0x8\n"
+ "ldr d30, [x21], #0x8\n"
+ "tbz x8, #0, 176f\n"
+ "ld1 { v10.s }[2], [x15]\n"
+ "ld1 { v14.s }[2], [x25]\n"
+ "ld1 { v18.s }[2], [x24]\n"
+ "ld1 { v22.s }[2], [x23]\n"
+ "ld1 { v26.s }[2], [x22]\n"
+ "ld1 { v30.s }[2], [x21]\n"
+ "b 176f\n"
+ "171:" // Height 6: Partial accumulate: partial_1_8
+ "mov x19, #0x20\n"
+ "tbz x8, #0, 176f\n"
+ "ldr s10, [x15, #0x0]\n"
+ "ldr s14, [x25, #0x0]\n"
+ "ldr s18, [x24, #0x0]\n"
+ "ldr s22, [x23, #0x0]\n"
+ "ldr s26, [x22, #0x0]\n"
+ "ldr s30, [x21, #0x0]\n"
+ "b 176f\n"
+ "172:" // Height 6: Partial accumulate: partial_4_0
+ "tbz x8, #2, 174f\n"
+ "ld1 { v8.4s }, [x15], #0x10\n"
+ "ld1 { v12.4s }, [x25], #0x10\n"
+ "ld1 { v16.4s }, [x24], #0x10\n"
+ "ld1 { v20.4s }, [x23], #0x10\n"
+ "ld1 { v24.4s }, [x22], #0x10\n"
+ "ld1 { v28.4s }, [x21], #0x10\n"
+ "tbz x8, #1, 173f\n"
+ "ldr d9, [x15], #0x8\n"
+ "mov x19, #0x18\n"
+ "ldr d13, [x25], #0x8\n"
+ "ldr d17, [x24], #0x8\n"
+ "ldr d21, [x23], #0x8\n"
+ "ldr d25, [x22], #0x8\n"
+ "ldr d29, [x21], #0x8\n"
+ "tbz x8, #0, 176f\n"
+ "ld1 { v9.s }[2], [x15]\n"
+ "ld1 { v13.s }[2], [x25]\n"
+ "ld1 { v17.s }[2], [x24]\n"
+ "ld1 { v21.s }[2], [x23]\n"
+ "ld1 { v25.s }[2], [x22]\n"
+ "ld1 { v29.s }[2], [x21]\n"
+ "b 176f\n"
+ "173:" // Height 6: Partial accumulate: partial_1_4
+ "mov x19, #0x10\n"
+ "tbz x8, #0, 176f\n"
+ "ldr s9, [x15, #0x0]\n"
+ "ldr s13, [x25, #0x0]\n"
+ "ldr s17, [x24, #0x0]\n"
+ "ldr s21, [x23, #0x0]\n"
+ "ldr s25, [x22, #0x0]\n"
+ "ldr s29, [x21, #0x0]\n"
+ "b 176f\n"
+ "174:" // Height 6: Partial accumulate: partial_2_0
+ "tbz x8, #1, 175f\n"
+ "ldr d8, [x15], #0x8\n"
+ "ldr d12, [x25], #0x8\n"
+ "mov x19, #0x8\n"
+ "ldr d16, [x24], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
+ "ldr d24, [x22], #0x8\n"
+ "ldr d28, [x21], #0x8\n"
+ "tbz x8, #0, 176f\n"
+ "ld1 { v8.s }[2], [x15]\n"
+ "ld1 { v12.s }[2], [x25]\n"
+ "ld1 { v16.s }[2], [x24]\n"
+ "ld1 { v20.s }[2], [x23]\n"
+ "ld1 { v24.s }[2], [x22]\n"
+ "ld1 { v28.s }[2], [x21]\n"
+ "b 176f\n"
+ "175:" // Height 6: Partial accumulate: partial_1_0
+ "ldr s8, [x15, #0x0]\n"
+ "mov x19, #0x0\n"
+ "ldr s12, [x25, #0x0]\n"
+ "ldr s16, [x24, #0x0]\n"
+ "ldr s20, [x23, #0x0]\n"
+ "ldr s24, [x22, #0x0]\n"
+ "ldr s28, [x21, #0x0]\n"
+ "176:" // Height 6: Partial accumulate: Done
+ "sub x15, x15, x19\n"
+ "b 179f\n"
+ "177:" // Height 6: full accumulate
+ "ldr q8, [x15, #0x0]\n"
+ "ldr q9, [x15, #0x10]\n"
+ "ldr q10, [x15, #0x20]\n"
+ "ldr q11, [x15, #0x30]\n"
+ "ldr q12, [x25, #0x0]\n"
+ "ldr q13, [x25, #0x10]\n"
+ "ldr q14, [x25, #0x20]\n"
+ "ldr q15, [x25, #0x30]\n"
+ "ldr q16, [x24, #0x0]\n"
+ "ldr q17, [x24, #0x10]\n"
+ "ldr q18, [x24, #0x20]\n"
+ "ldr q19, [x24, #0x30]\n"
+ "ldr q20, [x23, #0x0]\n"
+ "ldr q21, [x23, #0x10]\n"
+ "ldr q22, [x23, #0x20]\n"
+ "ldr q23, [x23, #0x30]\n"
+ "ldr q24, [x22, #0x0]\n"
+ "ldr q25, [x22, #0x10]\n"
+ "ldr q26, [x22, #0x20]\n"
+ "ldr q27, [x22, #0x30]\n"
+ "ldr q28, [x21, #0x0]\n"
+ "ldr q29, [x21, #0x10]\n"
+ "ldr q30, [x21, #0x20]\n"
+ "ldr q31, [x21, #0x30]\n"
+ "b 179f\n"
+ "178:" // Height 6: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "movi v20.16b, #0x0\n"
+ "movi v21.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v26.16b, #0x0\n"
+ "movi v27.16b, #0x0\n"
+ "movi v28.16b, #0x0\n"
+ "movi v29.16b, #0x0\n"
+ "movi v30.16b, #0x0\n"
+ "movi v31.16b, #0x0\n"
+ "179:" // Height 6: setup done
+ "mov x14, #0x0\n"
+ "180:" // Height 6: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w13, [x20, x14, LSL #0x2]\n"
+ "tbz %x[flags], #3, 181f\n"
+ "ldr x20, [%x[input_ptr], x14, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x12, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x20, [x20, #0x28]\n"
+ "cbnz x14, 182f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x12, x12, x19, LSL #2\n"
+ "add x28, x28, x19, LSL #2\n"
+ "add x26, x26, x19, LSL #2\n"
+ "add x24, x24, x19, LSL #2\n"
+ "add x22, x22, x19, LSL #2\n"
+ "add x20, x20, x19, LSL #2\n"
+ "b 182f\n"
+ "181:" // Height 6: setup direct input
+ "mov x12, %x[input_ptr]\n"
+ "add x28, x12, x19, LSL #2\n"
+ "add x26, x28, x19, LSL #2\n"
+ "add x24, x26, x19, LSL #2\n"
+ "add x22, x24, x19, LSL #2\n"
+ "add x20, x22, x19, LSL #2\n"
+ "182:" // Height 6: input setup done
+ "cmp x13, #0x4\n"
+ "blt 185f\n"
+ "ldr q0, [x12, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "cmp x13, #0x8\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q4, [x22, #0x0]\n"
+ "ldr q5, [x20, #0x0]\n"
+ "ldr q6, [x17, #0x0]\n"
+ "blt 184f\n"
+ "183:" // Height 6: Multiply loop: Main loop head
+ "fmla v8.4s, v6.4s, v0.s[0]\n"
+ "ldr d7, [x17, #0x10]\n"
+ "fmla v12.4s, v6.4s, v1.s[0]\n"
+ "ldr x11, [x17, #0x18]\n"
+ "fmla v16.4s, v6.4s, v2.s[0]\n"
+ "ldr x10, [x17, #0x28]\n"
+ "fmla v20.4s, v6.4s, v3.s[0]\n"
+ "add x12, x12, #0x10\n"
+ "fmla v24.4s, v6.4s, v4.s[0]\n"
+ "mov v7.d[1], x11\n"
+ "fmla v28.4s, v6.4s, v5.s[0]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "fmla v9.4s, v7.4s, v0.s[0]\n"
+ "ldr d6, [x17, #0x20]\n"
+ "fmla v13.4s, v7.4s, v1.s[0]\n"
+ "ldr x11, [x17, #0x38]\n"
+ "fmla v17.4s, v7.4s, v2.s[0]\n"
+ "ldr x9, [x12, #0x8]\n"
+ "fmla v21.4s, v7.4s, v3.s[0]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v25.4s, v7.4s, v4.s[0]\n"
+ "ldr x10, [x17, #0x48]\n"
+ "fmla v29.4s, v7.4s, v5.s[0]\n"
+ "ldr d7, [x17, #0x30]\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v14.4s, v6.4s, v1.s[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "fmla v18.4s, v6.4s, v2.s[0]\n"
+ "mov v7.d[1], x11\n"
+ "fmla v22.4s, v6.4s, v3.s[0]\n"
+ "ldr x11, [x17, #0x58]\n"
+ "fmla v26.4s, v6.4s, v4.s[0]\n"
+ "ldr x27, [x28, #0x8]\n"
+ "fmla v30.4s, v6.4s, v5.s[0]\n"
+ "ldr d6, [x17, #0x40]\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "add x26, x26, #0x10\n"
+ "fmla v15.4s, v7.4s, v1.s[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "fmla v19.4s, v7.4s, v2.s[0]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v23.4s, v7.4s, v3.s[0]\n"
+ "ldr x10, [x17, #0x68]\n"
+ "fmla v27.4s, v7.4s, v4.s[0]\n"
+ "ldr x25, [x26, #0x8]\n"
+ "fmla v31.4s, v7.4s, v5.s[0]\n"
+ "ldr d7, [x17, #0x50]\n"
+ "fmla v8.4s, v6.4s, v0.s[1]\n"
+ "add x24, x24, #0x10\n"
+ "fmla v12.4s, v6.4s, v1.s[1]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "fmla v16.4s, v6.4s, v2.s[1]\n"
+ "mov v7.d[1], x11\n"
+ "fmla v20.4s, v6.4s, v3.s[1]\n"
+ "ldr x11, [x17, #0x78]\n"
+ "fmla v24.4s, v6.4s, v4.s[1]\n"
+ "ldr x23, [x24, #0x8]\n"
+ "fmla v28.4s, v6.4s, v5.s[1]\n"
+ "ldr d6, [x17, #0x60]\n"
+ "fmla v9.4s, v7.4s, v0.s[1]\n"
+ "add x22, x22, #0x10\n"
+ "fmla v13.4s, v7.4s, v1.s[1]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "fmla v17.4s, v7.4s, v2.s[1]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v21.4s, v7.4s, v3.s[1]\n"
+ "ldr x10, [x17, #0x88]\n"
+ "fmla v25.4s, v7.4s, v4.s[1]\n"
+ "ldr x21, [x22, #0x8]\n"
+ "fmla v29.4s, v7.4s, v5.s[1]\n"
+ "ldr d7, [x17, #0x70]\n"
+ "fmla v10.4s, v6.4s, v0.s[1]\n"
+ "add x20, x20, #0x10\n"
+ "fmla v14.4s, v6.4s, v1.s[1]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "fmla v18.4s, v6.4s, v2.s[1]\n"
+ "mov v7.d[1], x11\n"
+ "fmla v22.4s, v6.4s, v3.s[1]\n"
+ "ldr x11, [x17, #0x98]\n"
+ "fmla v26.4s, v6.4s, v4.s[1]\n"
+ "ldr x19, [x20, #0x8]\n"
+ "fmla v30.4s, v6.4s, v5.s[1]\n"
+ "ldr d6, [x17, #0x80]\n"
+ "fmla v11.4s, v7.4s, v0.s[1]\n"
+ "sub x13, x13, #0x4\n"
+ "fmla v15.4s, v7.4s, v1.s[1]\n"
+ "cmp x13, #0x8\n"
+ "fmla v19.4s, v7.4s, v2.s[1]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v23.4s, v7.4s, v3.s[1]\n"
+ "ldr x10, [x17, #0xa8]\n"
+ "fmla v27.4s, v7.4s, v4.s[1]\n"
+ "fmla v31.4s, v7.4s, v5.s[1]\n"
+ "ldr d7, [x17, #0x90]\n"
+ "fmla v8.4s, v6.4s, v0.s[2]\n"
+ "fmla v12.4s, v6.4s, v1.s[2]\n"
+ "fmla v16.4s, v6.4s, v2.s[2]\n"
+ "mov v7.d[1], x11\n"
+ "fmla v20.4s, v6.4s, v3.s[2]\n"
+ "ldr x11, [x17, #0xb8]\n"
+ "fmla v24.4s, v6.4s, v4.s[2]\n"
+ "fmla v28.4s, v6.4s, v5.s[2]\n"
+ "ldr d6, [x17, #0xa0]\n"
+ "fmla v9.4s, v7.4s, v0.s[2]\n"
+ "fmla v13.4s, v7.4s, v1.s[2]\n"
+ "fmla v17.4s, v7.4s, v2.s[2]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v21.4s, v7.4s, v3.s[2]\n"
+ "ldr x10, [x17, #0xc8]\n"
+ "fmla v25.4s, v7.4s, v4.s[2]\n"
+ "fmla v29.4s, v7.4s, v5.s[2]\n"
+ "ldr d7, [x17, #0xb0]\n"
+ "fmla v10.4s, v6.4s, v0.s[2]\n"
+ "fmla v14.4s, v6.4s, v1.s[2]\n"
+ "fmla v18.4s, v6.4s, v2.s[2]\n"
+ "mov v7.d[1], x11\n"
+ "fmla v22.4s, v6.4s, v3.s[2]\n"
+ "ldr x11, [x17, #0xd8]\n"
+ "fmla v26.4s, v6.4s, v4.s[2]\n"
+ "fmla v30.4s, v6.4s, v5.s[2]\n"
+ "ldr d6, [x17, #0xc0]\n"
+ "fmla v11.4s, v7.4s, v0.s[2]\n"
+ "fmla v15.4s, v7.4s, v1.s[2]\n"
+ "fmla v19.4s, v7.4s, v2.s[2]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v23.4s, v7.4s, v3.s[2]\n"
+ "ldr x10, [x17, #0xe8]\n"
+ "fmla v27.4s, v7.4s, v4.s[2]\n"
+ "fmla v31.4s, v7.4s, v5.s[2]\n"
+ "ldr d7, [x17, #0xd0]\n"
+ "fmla v8.4s, v6.4s, v0.s[3]\n"
+ "fmla v12.4s, v6.4s, v1.s[3]\n"
+ "fmla v16.4s, v6.4s, v2.s[3]\n"
+ "mov v7.d[1], x11\n"
+ "fmla v20.4s, v6.4s, v3.s[3]\n"
+ "ldr x11, [x17, #0xf8]\n"
+ "fmla v24.4s, v6.4s, v4.s[3]\n"
+ "fmla v28.4s, v6.4s, v5.s[3]\n"
+ "ldr d6, [x17, #0xe0]\n"
+ "fmla v9.4s, v7.4s, v0.s[3]\n"
+ "fmla v13.4s, v7.4s, v1.s[3]\n"
+ "fmla v17.4s, v7.4s, v2.s[3]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v21.4s, v7.4s, v3.s[3]\n"
+ "fmla v25.4s, v7.4s, v4.s[3]\n"
+ "fmla v29.4s, v7.4s, v5.s[3]\n"
+ "ldr d7, [x17, #0xf0]\n"
+ "fmla v10.4s, v6.4s, v0.s[3]\n"
+ "add x17, x17, #0x100\n"
+ "fmla v14.4s, v6.4s, v1.s[3]\n"
+ "ldr x10, [x17, #0x8]\n"
+ "fmla v18.4s, v6.4s, v2.s[3]\n"
+ "mov v7.d[1], x11\n"
+ "fmla v22.4s, v6.4s, v3.s[3]\n"
+ "fmla v26.4s, v6.4s, v4.s[3]\n"
+ "fmla v30.4s, v6.4s, v5.s[3]\n"
+ "ldr d6, [x17, #0x0]\n"
+ "fmla v11.4s, v7.4s, v0.s[3]\n"
+ "ldr d0, [x12, #0x0]\n"
+ "fmla v15.4s, v7.4s, v1.s[3]\n"
+ "ldr d1, [x28, #0x0]\n"
+ "fmla v19.4s, v7.4s, v2.s[3]\n"
+ "mov v6.d[1], x10\n"
+ "fmla v23.4s, v7.4s, v3.s[3]\n"
+ "mov v0.d[1], x9\n"
+ "fmla v27.4s, v7.4s, v4.s[3]\n"
+ "mov v1.d[1], x27\n"
+ "fmla v31.4s, v7.4s, v5.s[3]\n"
+ "ldr d2, [x26, #0x0]\n"
+ "ldr d3, [x24, #0x0]\n"
+ "ldr d4, [x22, #0x0]\n"
+ "mov v2.d[1], x25\n"
+ "ldr d5, [x20, #0x0]\n"
+ "mov v3.d[1], x23\n"
+ "mov v4.d[1], x21\n"
+ "mov v5.d[1], x19\n"
+ "bge 183b\n"
+ "184:" // Height 6: Multiply loop: Single iteration only
+ "fmla v8.4s, v6.4s, v0.s[0]\n"
+ "ldr q7, [x17, #0x10]\n"
+ "fmla v12.4s, v6.4s, v1.s[0]\n"
+ "sub x13, x13, #0x4\n"
+ "fmla v16.4s, v6.4s, v2.s[0]\n"
+ "add x12, x12, #0x10\n"
+ "fmla v20.4s, v6.4s, v3.s[0]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "fmla v24.4s, v6.4s, v4.s[0]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v28.4s, v6.4s, v5.s[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "fmla v9.4s, v7.4s, v0.s[0]\n"
+ "ldr q6, [x17, #0x20]\n"
+ "fmla v13.4s, v7.4s, v1.s[0]\n"
+ "add x26, x26, #0x10\n"
+ "fmla v17.4s, v7.4s, v2.s[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "fmla v21.4s, v7.4s, v3.s[0]\n"
+ "add x24, x24, #0x10\n"
+ "fmla v25.4s, v7.4s, v4.s[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "fmla v29.4s, v7.4s, v5.s[0]\n"
+ "ldr q7, [x17, #0x30]\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "add x22, x22, #0x10\n"
+ "fmla v14.4s, v6.4s, v1.s[0]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "fmla v18.4s, v6.4s, v2.s[0]\n"
+ "add x20, x20, #0x10\n"
+ "fmla v22.4s, v6.4s, v3.s[0]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "fmla v26.4s, v6.4s, v4.s[0]\n"
+ "fmla v30.4s, v6.4s, v5.s[0]\n"
+ "ldr q6, [x17, #0x40]\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "fmla v15.4s, v7.4s, v1.s[0]\n"
+ "fmla v19.4s, v7.4s, v2.s[0]\n"
+ "fmla v23.4s, v7.4s, v3.s[0]\n"
+ "fmla v27.4s, v7.4s, v4.s[0]\n"
+ "fmla v31.4s, v7.4s, v5.s[0]\n"
+ "ldr q7, [x17, #0x50]\n"
+ "fmla v8.4s, v6.4s, v0.s[1]\n"
+ "fmla v12.4s, v6.4s, v1.s[1]\n"
+ "fmla v16.4s, v6.4s, v2.s[1]\n"
+ "fmla v20.4s, v6.4s, v3.s[1]\n"
+ "fmla v24.4s, v6.4s, v4.s[1]\n"
+ "fmla v28.4s, v6.4s, v5.s[1]\n"
+ "ldr q6, [x17, #0x60]\n"
+ "fmla v9.4s, v7.4s, v0.s[1]\n"
+ "fmla v13.4s, v7.4s, v1.s[1]\n"
+ "fmla v17.4s, v7.4s, v2.s[1]\n"
+ "fmla v21.4s, v7.4s, v3.s[1]\n"
+ "fmla v25.4s, v7.4s, v4.s[1]\n"
+ "fmla v29.4s, v7.4s, v5.s[1]\n"
+ "ldr q7, [x17, #0x70]\n"
+ "fmla v10.4s, v6.4s, v0.s[1]\n"
+ "fmla v14.4s, v6.4s, v1.s[1]\n"
+ "fmla v18.4s, v6.4s, v2.s[1]\n"
+ "fmla v22.4s, v6.4s, v3.s[1]\n"
+ "fmla v26.4s, v6.4s, v4.s[1]\n"
+ "fmla v30.4s, v6.4s, v5.s[1]\n"
+ "ldr q6, [x17, #0x80]\n"
+ "fmla v11.4s, v7.4s, v0.s[1]\n"
+ "fmla v15.4s, v7.4s, v1.s[1]\n"
+ "fmla v19.4s, v7.4s, v2.s[1]\n"
+ "fmla v23.4s, v7.4s, v3.s[1]\n"
+ "fmla v27.4s, v7.4s, v4.s[1]\n"
+ "fmla v31.4s, v7.4s, v5.s[1]\n"
+ "ldr q7, [x17, #0x90]\n"
+ "fmla v8.4s, v6.4s, v0.s[2]\n"
+ "fmla v12.4s, v6.4s, v1.s[2]\n"
+ "fmla v16.4s, v6.4s, v2.s[2]\n"
+ "fmla v20.4s, v6.4s, v3.s[2]\n"
+ "fmla v24.4s, v6.4s, v4.s[2]\n"
+ "fmla v28.4s, v6.4s, v5.s[2]\n"
+ "ldr q6, [x17, #0xa0]\n"
+ "fmla v9.4s, v7.4s, v0.s[2]\n"
+ "fmla v13.4s, v7.4s, v1.s[2]\n"
+ "fmla v17.4s, v7.4s, v2.s[2]\n"
+ "fmla v21.4s, v7.4s, v3.s[2]\n"
+ "fmla v25.4s, v7.4s, v4.s[2]\n"
+ "fmla v29.4s, v7.4s, v5.s[2]\n"
+ "ldr q7, [x17, #0xb0]\n"
+ "fmla v10.4s, v6.4s, v0.s[2]\n"
+ "fmla v14.4s, v6.4s, v1.s[2]\n"
+ "fmla v18.4s, v6.4s, v2.s[2]\n"
+ "fmla v22.4s, v6.4s, v3.s[2]\n"
+ "fmla v26.4s, v6.4s, v4.s[2]\n"
+ "fmla v30.4s, v6.4s, v5.s[2]\n"
+ "ldr q6, [x17, #0xc0]\n"
+ "fmla v11.4s, v7.4s, v0.s[2]\n"
+ "fmla v15.4s, v7.4s, v1.s[2]\n"
+ "fmla v19.4s, v7.4s, v2.s[2]\n"
+ "fmla v23.4s, v7.4s, v3.s[2]\n"
+ "fmla v27.4s, v7.4s, v4.s[2]\n"
+ "fmla v31.4s, v7.4s, v5.s[2]\n"
+ "ldr q7, [x17, #0xd0]\n"
+ "fmla v8.4s, v6.4s, v0.s[3]\n"
+ "fmla v12.4s, v6.4s, v1.s[3]\n"
+ "fmla v16.4s, v6.4s, v2.s[3]\n"
+ "fmla v20.4s, v6.4s, v3.s[3]\n"
+ "fmla v24.4s, v6.4s, v4.s[3]\n"
+ "fmla v28.4s, v6.4s, v5.s[3]\n"
+ "ldr q6, [x17, #0xe0]\n"
+ "fmla v9.4s, v7.4s, v0.s[3]\n"
+ "fmla v13.4s, v7.4s, v1.s[3]\n"
+ "fmla v17.4s, v7.4s, v2.s[3]\n"
+ "fmla v21.4s, v7.4s, v3.s[3]\n"
+ "fmla v25.4s, v7.4s, v4.s[3]\n"
+ "fmla v29.4s, v7.4s, v5.s[3]\n"
+ "ldr q7, [x17, #0xf0]\n"
+ "fmla v10.4s, v6.4s, v0.s[3]\n"
+ "add x17, x17, #0x100\n"
+ "fmla v14.4s, v6.4s, v1.s[3]\n"
+ "fmla v18.4s, v6.4s, v2.s[3]\n"
+ "fmla v22.4s, v6.4s, v3.s[3]\n"
+ "fmla v26.4s, v6.4s, v4.s[3]\n"
+ "fmla v30.4s, v6.4s, v5.s[3]\n"
+ "fmla v11.4s, v7.4s, v0.s[3]\n"
+ "fmla v15.4s, v7.4s, v1.s[3]\n"
+ "fmla v19.4s, v7.4s, v2.s[3]\n"
+ "fmla v23.4s, v7.4s, v3.s[3]\n"
+ "fmla v27.4s, v7.4s, v4.s[3]\n"
+ "fmla v31.4s, v7.4s, v5.s[3]\n"
+ "185:" // Height 6: Multiply loop: Main loop skip
+ "cbz x13, 187f\n"
+ "186:" // Height 6: Multiply loop: Odd block loop
+ "ldr s0, [x12], #0x4\n"
+ "sub x13, x13, #0x1\n"
+ "ldr s1, [x28], #0x4\n"
+ "ldr s2, [x26], #0x4\n"
+ "ldr s3, [x24], #0x4\n"
+ "ldr s4, [x22], #0x4\n"
+ "ldr s5, [x20], #0x4\n"
+ "ldr q6, [x17, #0x0]\n"
+ "ldr q7, [x17, #0x10]\n"
+ "fmla v8.4s, v6.4s, v0.s[0]\n"
+ "fmla v12.4s, v6.4s, v1.s[0]\n"
+ "fmla v16.4s, v6.4s, v2.s[0]\n"
+ "fmla v20.4s, v6.4s, v3.s[0]\n"
+ "fmla v24.4s, v6.4s, v4.s[0]\n"
+ "fmla v28.4s, v6.4s, v5.s[0]\n"
+ "ldr q6, [x17, #0x20]\n"
+ "fmla v9.4s, v7.4s, v0.s[0]\n"
+ "fmla v13.4s, v7.4s, v1.s[0]\n"
+ "fmla v17.4s, v7.4s, v2.s[0]\n"
+ "fmla v21.4s, v7.4s, v3.s[0]\n"
+ "fmla v25.4s, v7.4s, v4.s[0]\n"
+ "fmla v29.4s, v7.4s, v5.s[0]\n"
+ "ldr q7, [x17, #0x30]\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "add x17, x17, #0x40\n"
+ "fmla v14.4s, v6.4s, v1.s[0]\n"
+ "fmla v18.4s, v6.4s, v2.s[0]\n"
+ "fmla v22.4s, v6.4s, v3.s[0]\n"
+ "fmla v26.4s, v6.4s, v4.s[0]\n"
+ "fmla v30.4s, v6.4s, v5.s[0]\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "fmla v15.4s, v7.4s, v1.s[0]\n"
+ "fmla v19.4s, v7.4s, v2.s[0]\n"
+ "fmla v23.4s, v7.4s, v3.s[0]\n"
+ "fmla v27.4s, v7.4s, v4.s[0]\n"
+ "fmla v31.4s, v7.4s, v5.s[0]\n"
+ "cbnz x13, 186b\n"
+ "187:" // Height 6: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x14, x14, #0x1\n"
+ "cmp x14, x19\n"
+ "bne 180b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "prfm pstl1keep, [x15, #0x0]\n"
+ "add x25, x15, x19, LSL #2\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "add x24, x25, x19, LSL #2\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "add x23, x24, x19, LSL #2\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "add x22, x23, x19, LSL #2\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "add x21, x22, x19, LSL #2\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "tbz %x[flags], #1, 188f\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v1.4s }, [x20]\n"
+ "ld1r { v0.4s }, [x19]\n"
+ "fmin v8.4s, v8.4s, v0.4s\n"
+ "fmin v9.4s, v9.4s, v0.4s\n"
+ "fmin v10.4s, v10.4s, v0.4s\n"
+ "fmin v11.4s, v11.4s, v0.4s\n"
+ "fmin v12.4s, v12.4s, v0.4s\n"
+ "fmin v13.4s, v13.4s, v0.4s\n"
+ "fmin v14.4s, v14.4s, v0.4s\n"
+ "fmin v15.4s, v15.4s, v0.4s\n"
+ "fmin v16.4s, v16.4s, v0.4s\n"
+ "fmin v17.4s, v17.4s, v0.4s\n"
+ "fmax v8.4s, v8.4s, v1.4s\n"
+ "fmax v9.4s, v9.4s, v1.4s\n"
+ "fmax v10.4s, v10.4s, v1.4s\n"
+ "fmax v11.4s, v11.4s, v1.4s\n"
+ "fmax v12.4s, v12.4s, v1.4s\n"
+ "fmax v13.4s, v13.4s, v1.4s\n"
+ "fmax v14.4s, v14.4s, v1.4s\n"
+ "fmax v15.4s, v15.4s, v1.4s\n"
+ "fmax v16.4s, v16.4s, v1.4s\n"
+ "fmax v17.4s, v17.4s, v1.4s\n"
+ "fmin v18.4s, v18.4s, v0.4s\n"
+ "fmin v19.4s, v19.4s, v0.4s\n"
+ "fmin v20.4s, v20.4s, v0.4s\n"
+ "fmin v21.4s, v21.4s, v0.4s\n"
+ "fmin v22.4s, v22.4s, v0.4s\n"
+ "fmin v23.4s, v23.4s, v0.4s\n"
+ "fmin v24.4s, v24.4s, v0.4s\n"
+ "fmin v25.4s, v25.4s, v0.4s\n"
+ "fmin v26.4s, v26.4s, v0.4s\n"
+ "fmin v27.4s, v27.4s, v0.4s\n"
+ "fmax v18.4s, v18.4s, v1.4s\n"
+ "fmax v19.4s, v19.4s, v1.4s\n"
+ "fmax v20.4s, v20.4s, v1.4s\n"
+ "fmax v21.4s, v21.4s, v1.4s\n"
+ "fmax v22.4s, v22.4s, v1.4s\n"
+ "fmax v23.4s, v23.4s, v1.4s\n"
+ "fmax v24.4s, v24.4s, v1.4s\n"
+ "fmax v25.4s, v25.4s, v1.4s\n"
+ "fmax v26.4s, v26.4s, v1.4s\n"
+ "fmax v27.4s, v27.4s, v1.4s\n"
+ "fmin v28.4s, v28.4s, v0.4s\n"
+ "fmin v29.4s, v29.4s, v0.4s\n"
+ "fmin v30.4s, v30.4s, v0.4s\n"
+ "fmin v31.4s, v31.4s, v0.4s\n"
+ "fmax v28.4s, v28.4s, v1.4s\n"
+ "fmax v29.4s, v29.4s, v1.4s\n"
+ "fmax v30.4s, v30.4s, v1.4s\n"
+ "fmax v31.4s, v31.4s, v1.4s\n"
+ "188:" // Height 6: No activation
+ "cmp x8, #0x10\n"
+ "bge 197f\n"
+ "tbz x8, #3, 192f\n"
+ "st1 { v8.4s }, [x15], #0x10\n"
+ "st1 { v9.4s }, [x15], #0x10\n"
+ "st1 { v12.4s }, [x25], #0x10\n"
+ "st1 { v13.4s }, [x25], #0x10\n"
+ "st1 { v16.4s }, [x24], #0x10\n"
+ "st1 { v17.4s }, [x24], #0x10\n"
+ "st1 { v20.4s }, [x23], #0x10\n"
+ "st1 { v21.4s }, [x23], #0x10\n"
+ "st1 { v24.4s }, [x22], #0x10\n"
+ "st1 { v25.4s }, [x22], #0x10\n"
+ "st1 { v28.4s }, [x21], #0x10\n"
+ "st1 { v29.4s }, [x21], #0x10\n"
+ "tbz x8, #2, 190f\n"
+ "st1 { v10.4s }, [x15], #0x10\n"
+ "st1 { v14.4s }, [x25], #0x10\n"
+ "st1 { v18.4s }, [x24], #0x10\n"
+ "st1 { v22.4s }, [x23], #0x10\n"
+ "st1 { v26.4s }, [x22], #0x10\n"
+ "st1 { v30.4s }, [x21], #0x10\n"
+ "tbz x8, #1, 189f\n"
+ "str d11, [x15], #0x8\n"
+ "str d15, [x25], #0x8\n"
+ "str d19, [x24], #0x8\n"
+ "str d23, [x23], #0x8\n"
+ "str d27, [x22], #0x8\n"
+ "str d31, [x21], #0x8\n"
+ "tbz x8, #0, 196f\n"
+ "st1 { v11.s }[2], [x15]\n"
+ "st1 { v15.s }[2], [x25]\n"
+ "st1 { v19.s }[2], [x24]\n"
+ "st1 { v23.s }[2], [x23]\n"
+ "st1 { v27.s }[2], [x22]\n"
+ "st1 { v31.s }[2], [x21]\n"
+ "b 196f\n"
+ "189:" // Height 6: Partial direct writeback: partial_1_12
+ "tbz x8, #0, 196f\n"
+ "str s11, [x15, #0x0]\n"
+ "str s15, [x25, #0x0]\n"
+ "str s19, [x24, #0x0]\n"
+ "str s23, [x23, #0x0]\n"
+ "str s27, [x22, #0x0]\n"
+ "str s31, [x21, #0x0]\n"
+ "b 196f\n"
+ "190:" // Height 6: Partial direct writeback: partial_2_8
+ "tbz x8, #1, 191f\n"
+ "str d10, [x15], #0x8\n"
+ "str d14, [x25], #0x8\n"
+ "str d18, [x24], #0x8\n"
+ "str d22, [x23], #0x8\n"
+ "str d26, [x22], #0x8\n"
+ "str d30, [x21], #0x8\n"
+ "tbz x8, #0, 196f\n"
+ "st1 { v10.s }[2], [x15]\n"
+ "st1 { v14.s }[2], [x25]\n"
+ "st1 { v18.s }[2], [x24]\n"
+ "st1 { v22.s }[2], [x23]\n"
+ "st1 { v26.s }[2], [x22]\n"
+ "st1 { v30.s }[2], [x21]\n"
+ "b 196f\n"
+ "191:" // Height 6: Partial direct writeback: partial_1_8
+ "tbz x8, #0, 196f\n"
+ "str s10, [x15, #0x0]\n"
+ "str s14, [x25, #0x0]\n"
+ "str s18, [x24, #0x0]\n"
+ "str s22, [x23, #0x0]\n"
+ "str s26, [x22, #0x0]\n"
+ "str s30, [x21, #0x0]\n"
+ "b 196f\n"
+ "192:" // Height 6: Partial direct writeback: partial_4_0
+ "tbz x8, #2, 194f\n"
+ "st1 { v8.4s }, [x15], #0x10\n"
+ "st1 { v12.4s }, [x25], #0x10\n"
+ "st1 { v16.4s }, [x24], #0x10\n"
+ "st1 { v20.4s }, [x23], #0x10\n"
+ "st1 { v24.4s }, [x22], #0x10\n"
+ "st1 { v28.4s }, [x21], #0x10\n"
+ "tbz x8, #1, 193f\n"
+ "str d9, [x15], #0x8\n"
+ "str d13, [x25], #0x8\n"
+ "str d17, [x24], #0x8\n"
+ "str d21, [x23], #0x8\n"
+ "str d25, [x22], #0x8\n"
+ "str d29, [x21], #0x8\n"
+ "tbz x8, #0, 196f\n"
+ "st1 { v9.s }[2], [x15]\n"
+ "st1 { v13.s }[2], [x25]\n"
+ "st1 { v17.s }[2], [x24]\n"
+ "st1 { v21.s }[2], [x23]\n"
+ "st1 { v25.s }[2], [x22]\n"
+ "st1 { v29.s }[2], [x21]\n"
+ "b 196f\n"
+ "193:" // Height 6: Partial direct writeback: partial_1_4
+ "tbz x8, #0, 196f\n"
+ "str s9, [x15, #0x0]\n"
+ "str s13, [x25, #0x0]\n"
+ "str s17, [x24, #0x0]\n"
+ "str s21, [x23, #0x0]\n"
+ "str s25, [x22, #0x0]\n"
+ "str s29, [x21, #0x0]\n"
+ "b 196f\n"
+ "194:" // Height 6: Partial direct writeback: partial_2_0
+ "tbz x8, #1, 195f\n"
+ "str d8, [x15], #0x8\n"
+ "str d12, [x25], #0x8\n"
+ "str d16, [x24], #0x8\n"
+ "str d20, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
+ "str d28, [x21], #0x8\n"
+ "tbz x8, #0, 196f\n"
+ "st1 { v8.s }[2], [x15]\n"
+ "st1 { v12.s }[2], [x25]\n"
+ "st1 { v16.s }[2], [x24]\n"
+ "st1 { v20.s }[2], [x23]\n"
+ "st1 { v24.s }[2], [x22]\n"
+ "st1 { v28.s }[2], [x21]\n"
+ "b 196f\n"
+ "195:" // Height 6: Partial direct writeback: partial_1_0
+ "str s8, [x15, #0x0]\n"
+ "str s12, [x25, #0x0]\n"
+ "str s16, [x24, #0x0]\n"
+ "str s20, [x23, #0x0]\n"
+ "str s24, [x22, #0x0]\n"
+ "str s28, [x21, #0x0]\n"
+ "196:" // Height 6: Partial direct writeback: Done
+ "b 198f\n"
+ "197:" // Height 6: Full writeback
+ "str q8, [x15, #0x0]\n"
+ "str q9, [x15, #0x10]\n"
+ "str q10, [x15, #0x20]\n"
+ "str q11, [x15, #0x30]\n"
+ "add x15, x15, #0x40\n"
+ "str q12, [x25, #0x0]\n"
+ "str q13, [x25, #0x10]\n"
+ "str q14, [x25, #0x20]\n"
+ "str q15, [x25, #0x30]\n"
+ "str q16, [x24, #0x0]\n"
+ "str q17, [x24, #0x10]\n"
+ "str q18, [x24, #0x20]\n"
+ "str q19, [x24, #0x30]\n"
+ "str q20, [x23, #0x0]\n"
+ "str q21, [x23, #0x10]\n"
+ "str q22, [x23, #0x20]\n"
+ "str q23, [x23, #0x30]\n"
+ "str q24, [x22, #0x0]\n"
+ "str q25, [x22, #0x10]\n"
+ "str q26, [x22, #0x20]\n"
+ "str q27, [x22, #0x30]\n"
+ "str q28, [x21, #0x0]\n"
+ "str q29, [x21, #0x10]\n"
+ "str q30, [x21, #0x20]\n"
+ "str q31, [x21, #0x30]\n"
+ "198:" // Height 6: Writeback done
+ "subs x8, x8, #0x10\n"
+ "bgt 167b\n"
+ "subs %x[M], %x[M], #0x6\n"
+ "beq 200f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 199f\n"
+ "add x20, x20, #0x6\n"
+ "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "199:" // Update direct input
+ "mov x19, #0x18\n"
+ "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "b 1b\n"
+ "200:" // Exit
+
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/generic.cpp
index a645954edd..f5504b44d4 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/generic.cpp
@@ -95,222 +95,216 @@ void a64_hybrid_fp32_mla_6x16 (
"1:" // Row loop
"cmp %x[M], #0x6\n"
- "bge 171f\n"
+ "bge 166f\n"
"cmp %x[M], #0x4\n"
- "bgt 137f\n"
- "beq 103f\n"
+ "bgt 133f\n"
+ "beq 100f\n"
"cmp %x[M], #0x2\n"
- "bgt 69f\n"
- "beq 35f\n"
- "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x14, %x[bias]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "tbz %x[flags], #2, 2f\n"
- "ldr x13, [%x[output_ptr], #0x0]\n"
- "add x13, x13, x19, LSL #2\n"
- "b 3f\n"
- "2:" // Height 1: setup direct output
- "mov x13, %x[output_ptr]\n"
- "3:" // Height 1: Column loop
- "cbz x14, 4f\n"
- "ldr q8, [x14, #0x0]\n"
- "ldr q9, [x14, #0x10]\n"
- "ldr q10, [x14, #0x20]\n"
- "ldr q11, [x14, #0x30]\n"
- "add x14, x14, #0x40\n"
- "b 15f\n"
- "4:" // Height 1: no bias
- "tbz %x[flags], #0, 14f\n"
- "cmp x16, #0x10\n"
- "bge 13f\n"
- "tbz x16, #3, 8f\n"
- "ld1 { v8.4s }, [x13], #0x10\n"
- "ld1 { v9.4s }, [x13], #0x10\n"
- "tbz x16, #2, 6f\n"
- "ld1 { v10.4s }, [x13], #0x10\n"
- "tbz x16, #1, 5f\n"
+ "bgt 67f\n"
+ "beq 34f\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[bias]\n"
+ "mov x28, %x[output_ptr]\n"
+ "2:" // Height 1: Column loop
+ "cbz x9, 3f\n"
+ "ldr q8, [x9, #0x0]\n"
+ "ldr q9, [x9, #0x10]\n"
+ "ldr q10, [x9, #0x20]\n"
+ "ldr q11, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
+ "b 14f\n"
+ "3:" // Height 1: no bias
+ "tbz %x[flags], #0, 13f\n"
+ "cmp x11, #0x10\n"
+ "bge 12f\n"
+ "tbz x11, #3, 7f\n"
+ "ld1 { v8.4s }, [x28], #0x10\n"
+ "ld1 { v9.4s }, [x28], #0x10\n"
+ "tbz x11, #2, 5f\n"
+ "ld1 { v10.4s }, [x28], #0x10\n"
+ "tbz x11, #1, 4f\n"
"mov x19, #0x38\n"
- "ldr d11, [x13], #0x8\n"
- "tbz x16, #0, 12f\n"
- "ld1 { v11.s }[2], [x13]\n"
- "b 12f\n"
- "5:" // Height 1: Partial accumulate: partial_1_12
+ "ldr d11, [x28], #0x8\n"
+ "tbz x11, #0, 11f\n"
+ "ld1 { v11.s }[2], [x28]\n"
+ "b 11f\n"
+ "4:" // Height 1: Partial accumulate: partial_1_12
"mov x19, #0x30\n"
- "tbz x16, #0, 12f\n"
- "ldr s11, [x13, #0x0]\n"
- "b 12f\n"
- "6:" // Height 1: Partial accumulate: partial_2_8
- "tbz x16, #1, 7f\n"
- "ldr d10, [x13], #0x8\n"
+ "tbz x11, #0, 11f\n"
+ "ldr s11, [x28, #0x0]\n"
+ "b 11f\n"
+ "5:" // Height 1: Partial accumulate: partial_2_8
+ "tbz x11, #1, 6f\n"
+ "ldr d10, [x28], #0x8\n"
"mov x19, #0x28\n"
- "tbz x16, #0, 12f\n"
- "ld1 { v10.s }[2], [x13]\n"
- "b 12f\n"
- "7:" // Height 1: Partial accumulate: partial_1_8
+ "tbz x11, #0, 11f\n"
+ "ld1 { v10.s }[2], [x28]\n"
+ "b 11f\n"
+ "6:" // Height 1: Partial accumulate: partial_1_8
"mov x19, #0x20\n"
- "tbz x16, #0, 12f\n"
- "ldr s10, [x13, #0x0]\n"
- "b 12f\n"
- "8:" // Height 1: Partial accumulate: partial_4_0
- "tbz x16, #2, 10f\n"
- "ld1 { v8.4s }, [x13], #0x10\n"
- "tbz x16, #1, 9f\n"
+ "tbz x11, #0, 11f\n"
+ "ldr s10, [x28, #0x0]\n"
+ "b 11f\n"
+ "7:" // Height 1: Partial accumulate: partial_4_0
+ "tbz x11, #2, 9f\n"
+ "ld1 { v8.4s }, [x28], #0x10\n"
+ "tbz x11, #1, 8f\n"
+ "ldr d9, [x28], #0x8\n"
"mov x19, #0x18\n"
- "ldr d9, [x13], #0x8\n"
- "tbz x16, #0, 12f\n"
- "ld1 { v9.s }[2], [x13]\n"
- "b 12f\n"
- "9:" // Height 1: Partial accumulate: partial_1_4
+ "tbz x11, #0, 11f\n"
+ "ld1 { v9.s }[2], [x28]\n"
+ "b 11f\n"
+ "8:" // Height 1: Partial accumulate: partial_1_4
"mov x19, #0x10\n"
- "tbz x16, #0, 12f\n"
- "ldr s9, [x13, #0x0]\n"
- "b 12f\n"
- "10:" // Height 1: Partial accumulate: partial_2_0
- "tbz x16, #1, 11f\n"
- "ldr d8, [x13], #0x8\n"
+ "tbz x11, #0, 11f\n"
+ "ldr s9, [x28, #0x0]\n"
+ "b 11f\n"
+ "9:" // Height 1: Partial accumulate: partial_2_0
+ "tbz x11, #1, 10f\n"
+ "ldr d8, [x28], #0x8\n"
"mov x19, #0x8\n"
- "tbz x16, #0, 12f\n"
- "ld1 { v8.s }[2], [x13]\n"
- "b 12f\n"
- "11:" // Height 1: Partial accumulate: partial_1_0
+ "tbz x11, #0, 11f\n"
+ "ld1 { v8.s }[2], [x28]\n"
+ "b 11f\n"
+ "10:" // Height 1: Partial accumulate: partial_1_0
+ "ldr s8, [x28, #0x0]\n"
"mov x19, #0x0\n"
- "ldr s8, [x13, #0x0]\n"
- "12:" // Height 1: Partial accumulate: Done
- "sub x13, x13, x19\n"
- "b 15f\n"
- "13:" // Height 1: full accumulate
- "ldr q8, [x13, #0x0]\n"
- "ldr q9, [x13, #0x10]\n"
- "ldr q10, [x13, #0x20]\n"
- "ldr q11, [x13, #0x30]\n"
- "b 15f\n"
- "14:" // Height 1: no accumulate
+ "11:" // Height 1: Partial accumulate: Done
+ "sub x28, x28, x19\n"
+ "b 14f\n"
+ "12:" // Height 1: full accumulate
+ "ldr q8, [x28, #0x0]\n"
+ "ldr q9, [x28, #0x10]\n"
+ "ldr q10, [x28, #0x20]\n"
+ "ldr q11, [x28, #0x30]\n"
+ "b 14f\n"
+ "13:" // Height 1: no accumulate
"movi v8.16b, #0x0\n"
"movi v9.16b, #0x0\n"
"movi v10.16b, #0x0\n"
"movi v11.16b, #0x0\n"
- "15:" // Height 1: setup done
- "mov x12, #0x0\n"
- "16:" // Height 1: String loop
+ "14:" // Height 1: setup done
+ "mov x27, #0x0\n"
+ "15:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w11, [x20, x12, LSL #0x2]\n"
- "tbz %x[flags], #3, 17f\n"
- "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 16f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x10, [x20, #0x0]\n"
- "cbnz x12, 18f\n"
+ "ldr x25, [x20, #0x0]\n"
+ "cbnz x27, 17f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x10, x10, x19, LSL #2\n"
- "b 18f\n"
- "17:" // Height 1: setup direct input
- "mov x10, %x[input_ptr]\n"
- "18:" // Height 1: input setup done
- "cmp x11, #0x4\n"
- "blt 21f\n"
- "cmp x11, #0x8\n"
+ "add x25, x25, x19, LSL #2\n"
+ "b 17f\n"
+ "16:" // Height 1: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "17:" // Height 1: input setup done
+ "cmp x26, #0x4\n"
"blt 20f\n"
- "19:" // Height 1: Multiply loop: Main loop head
- "ldr q0, [x10, #0x0]\n"
- "ldr q6, [x15, #0x0]\n"
+ "ldr q0, [x25, #0x0]\n"
+ "ldr q6, [x10, #0x0]\n"
+ "cmp x26, #0x8\n"
+ "blt 19f\n"
+ "18:" // Height 1: Multiply loop: Main loop head
"fmla v8.4s, v6.4s, v0.s[0]\n"
- "ldr q7, [x15, #0x10]\n"
- "ldr q6, [x15, #0x20]\n"
+ "ldr q7, [x10, #0x10]\n"
+ "add x25, x25, #0x10\n"
"fmla v9.4s, v7.4s, v0.s[0]\n"
- "ldr q7, [x15, #0x30]\n"
+ "ldr q6, [x10, #0x20]\n"
+ "sub x26, x26, #0x4\n"
"fmla v10.4s, v6.4s, v0.s[0]\n"
- "ldr q6, [x15, #0x40]\n"
- "add x10, x10, #0x10\n"
+ "ldr q7, [x10, #0x30]\n"
+ "cmp x26, #0x8\n"
"fmla v11.4s, v7.4s, v0.s[0]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
- "ldr q7, [x15, #0x50]\n"
+ "ldr q6, [x10, #0x40]\n"
+ "ldr q7, [x10, #0x50]\n"
"fmla v8.4s, v6.4s, v0.s[1]\n"
- "ldr q6, [x15, #0x60]\n"
- "sub x11, x11, #0x4\n"
+ "ldr q6, [x10, #0x60]\n"
"fmla v9.4s, v7.4s, v0.s[1]\n"
- "ldr q7, [x15, #0x70]\n"
- "cmp x11, #0x8\n"
+ "ldr q7, [x10, #0x70]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
"fmla v10.4s, v6.4s, v0.s[1]\n"
- "ldr q6, [x15, #0x80]\n"
+ "ldr q6, [x10, #0x80]\n"
"fmla v11.4s, v7.4s, v0.s[1]\n"
- "ldr q7, [x15, #0x90]\n"
+ "ldr q7, [x10, #0x90]\n"
"fmla v8.4s, v6.4s, v0.s[2]\n"
- "ldr q6, [x15, #0xa0]\n"
+ "ldr q6, [x10, #0xa0]\n"
"fmla v9.4s, v7.4s, v0.s[2]\n"
- "ldr q7, [x15, #0xb0]\n"
+ "ldr q7, [x10, #0xb0]\n"
"fmla v10.4s, v6.4s, v0.s[2]\n"
- "ldr q6, [x15, #0xc0]\n"
+ "ldr q6, [x10, #0xc0]\n"
"fmla v11.4s, v7.4s, v0.s[2]\n"
- "ldr q7, [x15, #0xd0]\n"
+ "ldr q7, [x10, #0xd0]\n"
"fmla v8.4s, v6.4s, v0.s[3]\n"
- "ldr q6, [x15, #0xe0]\n"
+ "ldr q6, [x10, #0xe0]\n"
"fmla v9.4s, v7.4s, v0.s[3]\n"
- "ldr q7, [x15, #0xf0]\n"
- "add x15, x15, #0x100\n"
+ "ldr q7, [x10, #0xf0]\n"
+ "add x10, x10, #0x100\n"
"fmla v10.4s, v6.4s, v0.s[3]\n"
+ "ldr q6, [x10, #0x0]\n"
"fmla v11.4s, v7.4s, v0.s[3]\n"
- "bge 19b\n"
- "20:" // Height 1: Multiply loop: Single iteration only
- "sub x11, x11, #0x4\n"
- "ldr q0, [x10, #0x0]\n"
- "ldr q6, [x15, #0x0]\n"
+ "ldr q0, [x25, #0x0]\n"
+ "bge 18b\n"
+ "19:" // Height 1: Multiply loop: Single iteration only
"fmla v8.4s, v6.4s, v0.s[0]\n"
- "ldr q7, [x15, #0x10]\n"
- "ldr q6, [x15, #0x20]\n"
+ "ldr q7, [x10, #0x10]\n"
+ "sub x26, x26, #0x4\n"
"fmla v9.4s, v7.4s, v0.s[0]\n"
- "ldr q7, [x15, #0x30]\n"
+ "ldr q6, [x10, #0x20]\n"
+ "add x25, x25, #0x10\n"
"fmla v10.4s, v6.4s, v0.s[0]\n"
- "ldr q6, [x15, #0x40]\n"
- "add x10, x10, #0x10\n"
+ "ldr q7, [x10, #0x30]\n"
+ "ldr q6, [x10, #0x40]\n"
"fmla v11.4s, v7.4s, v0.s[0]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
- "ldr q7, [x15, #0x50]\n"
+ "ldr q7, [x10, #0x50]\n"
"fmla v8.4s, v6.4s, v0.s[1]\n"
- "ldr q6, [x15, #0x60]\n"
+ "ldr q6, [x10, #0x60]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
"fmla v9.4s, v7.4s, v0.s[1]\n"
- "ldr q7, [x15, #0x70]\n"
+ "ldr q7, [x10, #0x70]\n"
"fmla v10.4s, v6.4s, v0.s[1]\n"
- "ldr q6, [x15, #0x80]\n"
+ "ldr q6, [x10, #0x80]\n"
"fmla v11.4s, v7.4s, v0.s[1]\n"
- "ldr q7, [x15, #0x90]\n"
+ "ldr q7, [x10, #0x90]\n"
"fmla v8.4s, v6.4s, v0.s[2]\n"
- "ldr q6, [x15, #0xa0]\n"
+ "ldr q6, [x10, #0xa0]\n"
"fmla v9.4s, v7.4s, v0.s[2]\n"
- "ldr q7, [x15, #0xb0]\n"
+ "ldr q7, [x10, #0xb0]\n"
"fmla v10.4s, v6.4s, v0.s[2]\n"
- "ldr q6, [x15, #0xc0]\n"
+ "ldr q6, [x10, #0xc0]\n"
"fmla v11.4s, v7.4s, v0.s[2]\n"
- "ldr q7, [x15, #0xd0]\n"
+ "ldr q7, [x10, #0xd0]\n"
"fmla v8.4s, v6.4s, v0.s[3]\n"
- "ldr q6, [x15, #0xe0]\n"
+ "ldr q6, [x10, #0xe0]\n"
"fmla v9.4s, v7.4s, v0.s[3]\n"
- "ldr q7, [x15, #0xf0]\n"
- "add x15, x15, #0x100\n"
+ "ldr q7, [x10, #0xf0]\n"
+ "add x10, x10, #0x100\n"
"fmla v10.4s, v6.4s, v0.s[3]\n"
"fmla v11.4s, v7.4s, v0.s[3]\n"
- "21:" // Height 1: Multiply loop: Main loop skip
- "cbz x11, 23f\n"
- "22:" // Height 1: Multiply loop: Odd block loop
- "ldr s0, [x10], #0x4\n"
- "ldr q6, [x15, #0x0]\n"
+ "20:" // Height 1: Multiply loop: Main loop skip
+ "cbz x26, 22f\n"
+ "21:" // Height 1: Multiply loop: Odd block loop
+ "ldr s0, [x25], #0x4\n"
+ "sub x26, x26, #0x1\n"
+ "ldr q6, [x10, #0x0]\n"
"fmla v8.4s, v6.4s, v0.s[0]\n"
- "ldr q7, [x15, #0x10]\n"
- "ldr q6, [x15, #0x20]\n"
+ "ldr q7, [x10, #0x10]\n"
+ "ldr q6, [x10, #0x20]\n"
"fmla v9.4s, v7.4s, v0.s[0]\n"
- "ldr q7, [x15, #0x30]\n"
+ "ldr q7, [x10, #0x30]\n"
+ "add x10, x10, #0x40\n"
"fmla v10.4s, v6.4s, v0.s[0]\n"
- "sub x11, x11, #0x1\n"
- "add x15, x15, #0x40\n"
"fmla v11.4s, v7.4s, v0.s[0]\n"
- "cbnz x11, 22b\n"
- "23:" // Height 1: Multiply loop: No odd multiplies
+ "cbnz x26, 21b\n"
+ "22:" // Height 1: Multiply loop: No odd multiplies
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x12, x12, #0x1\n"
- "cmp x12, x19\n"
- "bne 16b\n"
- "prfm pstl1keep, [x13, #0x0]\n"
- "tbz %x[flags], #1, 24f\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 15b\n"
+ "prfm pstl1keep, [x28, #0x0]\n"
+ "tbz %x[flags], #1, 23f\n"
"add x19, %x[args_ptr], %[offset_min]\n"
"ld1r { v1.4s }, [x19]\n"
"add x19, %x[args_ptr], %[offset_max]\n"
@@ -323,178 +317,170 @@ void a64_hybrid_fp32_mla_6x16 (
"fmax v9.4s, v9.4s, v1.4s\n"
"fmax v10.4s, v10.4s, v1.4s\n"
"fmax v11.4s, v11.4s, v1.4s\n"
- "24:" // Height 1: No activation
- "cmp x16, #0x10\n"
- "bge 33f\n"
- "tbz x16, #3, 28f\n"
- "st1 { v8.4s }, [x13], #0x10\n"
- "st1 { v9.4s }, [x13], #0x10\n"
- "tbz x16, #2, 26f\n"
- "st1 { v10.4s }, [x13], #0x10\n"
- "tbz x16, #1, 25f\n"
- "str d11, [x13], #0x8\n"
- "tbz x16, #0, 32f\n"
- "st1 { v11.s }[2], [x13]\n"
- "b 32f\n"
- "25:" // Height 1: Partial direct writeback: partial_1_12
- "tbz x16, #0, 32f\n"
- "str s11, [x13, #0x0]\n"
- "b 32f\n"
- "26:" // Height 1: Partial direct writeback: partial_2_8
- "tbz x16, #1, 27f\n"
- "str d10, [x13], #0x8\n"
- "tbz x16, #0, 32f\n"
- "st1 { v10.s }[2], [x13]\n"
- "b 32f\n"
- "27:" // Height 1: Partial direct writeback: partial_1_8
- "tbz x16, #0, 32f\n"
- "str s10, [x13, #0x0]\n"
- "b 32f\n"
- "28:" // Height 1: Partial direct writeback: partial_4_0
- "tbz x16, #2, 30f\n"
- "st1 { v8.4s }, [x13], #0x10\n"
- "tbz x16, #1, 29f\n"
- "str d9, [x13], #0x8\n"
- "tbz x16, #0, 32f\n"
- "st1 { v9.s }[2], [x13]\n"
- "b 32f\n"
- "29:" // Height 1: Partial direct writeback: partial_1_4
- "tbz x16, #0, 32f\n"
- "str s9, [x13, #0x0]\n"
- "b 32f\n"
- "30:" // Height 1: Partial direct writeback: partial_2_0
- "tbz x16, #1, 31f\n"
- "str d8, [x13], #0x8\n"
- "tbz x16, #0, 32f\n"
- "st1 { v8.s }[2], [x13]\n"
- "b 32f\n"
- "31:" // Height 1: Partial direct writeback: partial_1_0
- "str s8, [x13, #0x0]\n"
- "32:" // Height 1: Partial direct writeback: Done
- "b 34f\n"
- "33:" // Height 1: Full writeback
- "str q8, [x13, #0x0]\n"
- "str q9, [x13, #0x10]\n"
- "str q10, [x13, #0x20]\n"
- "str q11, [x13, #0x30]\n"
- "add x13, x13, #0x40\n"
- "34:" // Height 1: Writeback done
- "subs x16, x16, #0x10\n"
- "bgt 3b\n"
- "b 206f\n"
- "35:" // Height 2
- "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x14, %x[bias]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "tbz %x[flags], #2, 36f\n"
- "ldr x13, [%x[output_ptr], #0x0]\n"
- "add x13, x13, x19, LSL #2\n"
- "ldr x9, [%x[output_ptr], #0x8]\n"
- "add x9, x9, x19, LSL #2\n"
- "b 37f\n"
- "36:" // Height 2: setup direct output
- "mov x13, %x[output_ptr]\n"
- "add x9, x13, x19, LSL #2\n"
- "37:" // Height 2: Column loop
- "cbz x14, 38f\n"
- "ldr q8, [x14, #0x0]\n"
+ "23:" // Height 1: No activation
+ "cmp x11, #0x10\n"
+ "bge 32f\n"
+ "tbz x11, #3, 27f\n"
+ "st1 { v8.4s }, [x28], #0x10\n"
+ "st1 { v9.4s }, [x28], #0x10\n"
+ "tbz x11, #2, 25f\n"
+ "st1 { v10.4s }, [x28], #0x10\n"
+ "tbz x11, #1, 24f\n"
+ "str d11, [x28], #0x8\n"
+ "tbz x11, #0, 31f\n"
+ "st1 { v11.s }[2], [x28]\n"
+ "b 31f\n"
+ "24:" // Height 1: Partial direct writeback: partial_1_12
+ "tbz x11, #0, 31f\n"
+ "str s11, [x28, #0x0]\n"
+ "b 31f\n"
+ "25:" // Height 1: Partial direct writeback: partial_2_8
+ "tbz x11, #1, 26f\n"
+ "str d10, [x28], #0x8\n"
+ "tbz x11, #0, 31f\n"
+ "st1 { v10.s }[2], [x28]\n"
+ "b 31f\n"
+ "26:" // Height 1: Partial direct writeback: partial_1_8
+ "tbz x11, #0, 31f\n"
+ "str s10, [x28, #0x0]\n"
+ "b 31f\n"
+ "27:" // Height 1: Partial direct writeback: partial_4_0
+ "tbz x11, #2, 29f\n"
+ "st1 { v8.4s }, [x28], #0x10\n"
+ "tbz x11, #1, 28f\n"
+ "str d9, [x28], #0x8\n"
+ "tbz x11, #0, 31f\n"
+ "st1 { v9.s }[2], [x28]\n"
+ "b 31f\n"
+ "28:" // Height 1: Partial direct writeback: partial_1_4
+ "tbz x11, #0, 31f\n"
+ "str s9, [x28, #0x0]\n"
+ "b 31f\n"
+ "29:" // Height 1: Partial direct writeback: partial_2_0
+ "tbz x11, #1, 30f\n"
+ "str d8, [x28], #0x8\n"
+ "tbz x11, #0, 31f\n"
+ "st1 { v8.s }[2], [x28]\n"
+ "b 31f\n"
+ "30:" // Height 1: Partial direct writeback: partial_1_0
+ "str s8, [x28, #0x0]\n"
+ "31:" // Height 1: Partial direct writeback: Done
+ "b 33f\n"
+ "32:" // Height 1: Full writeback
+ "str q8, [x28, #0x0]\n"
+ "str q9, [x28, #0x10]\n"
+ "str q10, [x28, #0x20]\n"
+ "str q11, [x28, #0x30]\n"
+ "add x28, x28, #0x40\n"
+ "33:" // Height 1: Writeback done
+ "subs x11, x11, #0x10\n"
+ "bgt 2b\n"
+ "b 200f\n"
+ "34:" // Height 2
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x9, %x[bias]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "35:" // Height 2: Column loop
+ "cbz x9, 36f\n"
+ "ldr q8, [x9, #0x0]\n"
"mov v12.16b, v8.16b\n"
- "ldr q9, [x14, #0x10]\n"
- "ldr q10, [x14, #0x20]\n"
+ "ldr q9, [x9, #0x10]\n"
+ "ldr q10, [x9, #0x20]\n"
"mov v13.16b, v9.16b\n"
- "ldr q11, [x14, #0x30]\n"
+ "ldr q11, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
"mov v14.16b, v10.16b\n"
- "add x14, x14, #0x40\n"
"mov v15.16b, v11.16b\n"
- "b 49f\n"
- "38:" // Height 2: no bias
- "tbz %x[flags], #0, 48f\n"
- "cmp x16, #0x10\n"
- "bge 47f\n"
- "tbz x16, #3, 42f\n"
- "ld1 { v8.4s }, [x13], #0x10\n"
- "ld1 { v12.4s }, [x9], #0x10\n"
- "ld1 { v9.4s }, [x13], #0x10\n"
- "ld1 { v13.4s }, [x9], #0x10\n"
- "tbz x16, #2, 40f\n"
- "ld1 { v10.4s }, [x13], #0x10\n"
- "ld1 { v14.4s }, [x9], #0x10\n"
- "tbz x16, #1, 39f\n"
+ "b 47f\n"
+ "36:" // Height 2: no bias
+ "tbz %x[flags], #0, 46f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x11, #0x10\n"
+ "add x24, x28, x19, LSL #2\n"
+ "bge 45f\n"
+ "tbz x11, #3, 40f\n"
+ "ld1 { v8.4s }, [x28], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v9.4s }, [x28], #0x10\n"
+ "ld1 { v13.4s }, [x24], #0x10\n"
+ "tbz x11, #2, 38f\n"
+ "ld1 { v10.4s }, [x28], #0x10\n"
+ "ld1 { v14.4s }, [x24], #0x10\n"
+ "tbz x11, #1, 37f\n"
"mov x19, #0x38\n"
- "ldr d11, [x13], #0x8\n"
- "ldr d15, [x9], #0x8\n"
- "tbz x16, #0, 46f\n"
- "ld1 { v11.s }[2], [x13]\n"
- "ld1 { v15.s }[2], [x9]\n"
- "b 46f\n"
- "39:" // Height 2: Partial accumulate: partial_1_12
+ "ldr d11, [x28], #0x8\n"
+ "ldr d15, [x24], #0x8\n"
+ "tbz x11, #0, 44f\n"
+ "ld1 { v11.s }[2], [x28]\n"
+ "ld1 { v15.s }[2], [x24]\n"
+ "b 44f\n"
+ "37:" // Height 2: Partial accumulate: partial_1_12
"mov x19, #0x30\n"
- "tbz x16, #0, 46f\n"
- "ldr s11, [x13, #0x0]\n"
- "ldr s15, [x9, #0x0]\n"
- "b 46f\n"
- "40:" // Height 2: Partial accumulate: partial_2_8
- "tbz x16, #1, 41f\n"
- "ldr d10, [x13], #0x8\n"
- "ldr d14, [x9], #0x8\n"
+ "tbz x11, #0, 44f\n"
+ "ldr s11, [x28, #0x0]\n"
+ "ldr s15, [x24, #0x0]\n"
+ "b 44f\n"
+ "38:" // Height 2: Partial accumulate: partial_2_8
+ "tbz x11, #1, 39f\n"
+ "ldr d10, [x28], #0x8\n"
+ "ldr d14, [x24], #0x8\n"
"mov x19, #0x28\n"
- "tbz x16, #0, 46f\n"
- "ld1 { v10.s }[2], [x13]\n"
- "ld1 { v14.s }[2], [x9]\n"
- "b 46f\n"
- "41:" // Height 2: Partial accumulate: partial_1_8
+ "tbz x11, #0, 44f\n"
+ "ld1 { v10.s }[2], [x28]\n"
+ "ld1 { v14.s }[2], [x24]\n"
+ "b 44f\n"
+ "39:" // Height 2: Partial accumulate: partial_1_8
"mov x19, #0x20\n"
- "tbz x16, #0, 46f\n"
- "ldr s10, [x13, #0x0]\n"
- "ldr s14, [x9, #0x0]\n"
- "b 46f\n"
- "42:" // Height 2: Partial accumulate: partial_4_0
- "tbz x16, #2, 44f\n"
- "ld1 { v8.4s }, [x13], #0x10\n"
- "ld1 { v12.4s }, [x9], #0x10\n"
- "tbz x16, #1, 43f\n"
+ "tbz x11, #0, 44f\n"
+ "ldr s10, [x28, #0x0]\n"
+ "ldr s14, [x24, #0x0]\n"
+ "b 44f\n"
+ "40:" // Height 2: Partial accumulate: partial_4_0
+ "tbz x11, #2, 42f\n"
+ "ld1 { v8.4s }, [x28], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "tbz x11, #1, 41f\n"
"mov x19, #0x18\n"
- "ldr d9, [x13], #0x8\n"
- "ldr d13, [x9], #0x8\n"
- "tbz x16, #0, 46f\n"
- "ld1 { v9.s }[2], [x13]\n"
- "ld1 { v13.s }[2], [x9]\n"
- "b 46f\n"
- "43:" // Height 2: Partial accumulate: partial_1_4
+ "ldr d9, [x28], #0x8\n"
+ "ldr d13, [x24], #0x8\n"
+ "tbz x11, #0, 44f\n"
+ "ld1 { v9.s }[2], [x28]\n"
+ "ld1 { v13.s }[2], [x24]\n"
+ "b 44f\n"
+ "41:" // Height 2: Partial accumulate: partial_1_4
"mov x19, #0x10\n"
- "tbz x16, #0, 46f\n"
- "ldr s9, [x13, #0x0]\n"
- "ldr s13, [x9, #0x0]\n"
- "b 46f\n"
- "44:" // Height 2: Partial accumulate: partial_2_0
- "tbz x16, #1, 45f\n"
- "ldr d8, [x13], #0x8\n"
- "ldr d12, [x9], #0x8\n"
+ "tbz x11, #0, 44f\n"
+ "ldr s9, [x28, #0x0]\n"
+ "ldr s13, [x24, #0x0]\n"
+ "b 44f\n"
+ "42:" // Height 2: Partial accumulate: partial_2_0
+ "tbz x11, #1, 43f\n"
+ "ldr d8, [x28], #0x8\n"
+ "ldr d12, [x24], #0x8\n"
"mov x19, #0x8\n"
- "tbz x16, #0, 46f\n"
- "ld1 { v8.s }[2], [x13]\n"
- "ld1 { v12.s }[2], [x9]\n"
- "b 46f\n"
- "45:" // Height 2: Partial accumulate: partial_1_0
+ "tbz x11, #0, 44f\n"
+ "ld1 { v8.s }[2], [x28]\n"
+ "ld1 { v12.s }[2], [x24]\n"
+ "b 44f\n"
+ "43:" // Height 2: Partial accumulate: partial_1_0
+ "ldr s8, [x28, #0x0]\n"
"mov x19, #0x0\n"
- "ldr s8, [x13, #0x0]\n"
- "ldr s12, [x9, #0x0]\n"
- "46:" // Height 2: Partial accumulate: Done
- "sub x13, x13, x19\n"
- "sub x9, x9, x19\n"
- "b 49f\n"
- "47:" // Height 2: full accumulate
- "ldr q8, [x13, #0x0]\n"
- "ldr q9, [x13, #0x10]\n"
- "ldr q10, [x13, #0x20]\n"
- "ldr q11, [x13, #0x30]\n"
- "ldr q12, [x9, #0x0]\n"
- "ldr q13, [x9, #0x10]\n"
- "ldr q14, [x9, #0x20]\n"
- "ldr q15, [x9, #0x30]\n"
- "b 49f\n"
- "48:" // Height 2: no accumulate
+ "ldr s12, [x24, #0x0]\n"
+ "44:" // Height 2: Partial accumulate: Done
+ "sub x28, x28, x19\n"
+ "b 47f\n"
+ "45:" // Height 2: full accumulate
+ "ldr q8, [x28, #0x0]\n"
+ "ldr q9, [x28, #0x10]\n"
+ "ldr q10, [x28, #0x20]\n"
+ "ldr q11, [x28, #0x30]\n"
+ "ldr q12, [x24, #0x0]\n"
+ "ldr q13, [x24, #0x10]\n"
+ "ldr q14, [x24, #0x20]\n"
+ "ldr q15, [x24, #0x30]\n"
+ "b 47f\n"
+ "46:" // Height 2: no accumulate
"movi v8.16b, #0x0\n"
"movi v9.16b, #0x0\n"
"movi v10.16b, #0x0\n"
@@ -503,174 +489,176 @@ void a64_hybrid_fp32_mla_6x16 (
"movi v13.16b, #0x0\n"
"movi v14.16b, #0x0\n"
"movi v15.16b, #0x0\n"
- "49:" // Height 2: setup done
- "mov x12, #0x0\n"
- "50:" // Height 2: String loop
+ "47:" // Height 2: setup done
+ "mov x27, #0x0\n"
+ "48:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w11, [x20, x12, LSL #0x2]\n"
- "tbz %x[flags], #3, 51f\n"
- "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 49f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x10, [x20, #0x0]\n"
- "ldr x28, [x20, #0x8]\n"
- "cbnz x12, 52f\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "cbnz x27, 50f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x10, x10, x19, LSL #2\n"
- "add x28, x28, x19, LSL #2\n"
- "b 52f\n"
- "51:" // Height 2: setup direct input
- "mov x10, %x[input_ptr]\n"
- "add x28, x10, x19, LSL #2\n"
- "52:" // Height 2: input setup done
- "cmp x11, #0x4\n"
- "blt 55f\n"
- "cmp x11, #0x8\n"
- "blt 54f\n"
- "53:" // Height 2: Multiply loop: Main loop head
- "ldr q0, [x10, #0x0]\n"
- "ldr q1, [x28, #0x0]\n"
- "ldr q6, [x15, #0x0]\n"
+ "add x25, x25, x19, LSL #2\n"
+ "add x24, x24, x19, LSL #2\n"
+ "b 50f\n"
+ "49:" // Height 2: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19, LSL #2\n"
+ "50:" // Height 2: input setup done
+ "cmp x26, #0x4\n"
+ "blt 53f\n"
+ "ldr q0, [x25, #0x0]\n"
+ "ldr q1, [x24, #0x0]\n"
+ "cmp x26, #0x8\n"
+ "ldr q6, [x10, #0x0]\n"
+ "blt 52f\n"
+ "51:" // Height 2: Multiply loop: Main loop head
"fmla v8.4s, v6.4s, v0.s[0]\n"
- "ldr q7, [x15, #0x10]\n"
+ "ldr q7, [x10, #0x10]\n"
+ "add x25, x25, #0x10\n"
"fmla v12.4s, v6.4s, v1.s[0]\n"
- "ldr q6, [x15, #0x20]\n"
- "add x10, x10, #0x10\n"
+ "ldr q6, [x10, #0x20]\n"
+ "add x24, x24, #0x10\n"
"fmla v9.4s, v7.4s, v0.s[0]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
- "add x28, x28, #0x10\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "sub x26, x26, #0x4\n"
"fmla v13.4s, v7.4s, v1.s[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "ldr q7, [x15, #0x30]\n"
+ "ldr q7, [x10, #0x30]\n"
+ "cmp x26, #0x8\n"
"fmla v10.4s, v6.4s, v0.s[0]\n"
- "sub x11, x11, #0x4\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
"fmla v14.4s, v6.4s, v1.s[0]\n"
- "ldr q6, [x15, #0x40]\n"
- "cmp x11, #0x8\n"
+ "ldr q6, [x10, #0x40]\n"
"fmla v11.4s, v7.4s, v0.s[0]\n"
"fmla v15.4s, v7.4s, v1.s[0]\n"
- "ldr q7, [x15, #0x50]\n"
+ "ldr q7, [x10, #0x50]\n"
"fmla v8.4s, v6.4s, v0.s[1]\n"
"fmla v12.4s, v6.4s, v1.s[1]\n"
- "ldr q6, [x15, #0x60]\n"
+ "ldr q6, [x10, #0x60]\n"
"fmla v9.4s, v7.4s, v0.s[1]\n"
"fmla v13.4s, v7.4s, v1.s[1]\n"
- "ldr q7, [x15, #0x70]\n"
+ "ldr q7, [x10, #0x70]\n"
"fmla v10.4s, v6.4s, v0.s[1]\n"
"fmla v14.4s, v6.4s, v1.s[1]\n"
- "ldr q6, [x15, #0x80]\n"
+ "ldr q6, [x10, #0x80]\n"
"fmla v11.4s, v7.4s, v0.s[1]\n"
"fmla v15.4s, v7.4s, v1.s[1]\n"
- "ldr q7, [x15, #0x90]\n"
+ "ldr q7, [x10, #0x90]\n"
"fmla v8.4s, v6.4s, v0.s[2]\n"
"fmla v12.4s, v6.4s, v1.s[2]\n"
- "ldr q6, [x15, #0xa0]\n"
+ "ldr q6, [x10, #0xa0]\n"
"fmla v9.4s, v7.4s, v0.s[2]\n"
"fmla v13.4s, v7.4s, v1.s[2]\n"
- "ldr q7, [x15, #0xb0]\n"
+ "ldr q7, [x10, #0xb0]\n"
"fmla v10.4s, v6.4s, v0.s[2]\n"
"fmla v14.4s, v6.4s, v1.s[2]\n"
- "ldr q6, [x15, #0xc0]\n"
+ "ldr q6, [x10, #0xc0]\n"
"fmla v11.4s, v7.4s, v0.s[2]\n"
"fmla v15.4s, v7.4s, v1.s[2]\n"
- "ldr q7, [x15, #0xd0]\n"
+ "ldr q7, [x10, #0xd0]\n"
"fmla v8.4s, v6.4s, v0.s[3]\n"
"fmla v12.4s, v6.4s, v1.s[3]\n"
- "ldr q6, [x15, #0xe0]\n"
+ "ldr q6, [x10, #0xe0]\n"
"fmla v9.4s, v7.4s, v0.s[3]\n"
"fmla v13.4s, v7.4s, v1.s[3]\n"
- "ldr q7, [x15, #0xf0]\n"
- "add x15, x15, #0x100\n"
+ "ldr q7, [x10, #0xf0]\n"
+ "add x10, x10, #0x100\n"
"fmla v10.4s, v6.4s, v0.s[3]\n"
"fmla v14.4s, v6.4s, v1.s[3]\n"
+ "ldr q6, [x10, #0x0]\n"
"fmla v11.4s, v7.4s, v0.s[3]\n"
+ "ldr q0, [x25, #0x0]\n"
"fmla v15.4s, v7.4s, v1.s[3]\n"
- "bge 53b\n"
- "54:" // Height 2: Multiply loop: Single iteration only
- "sub x11, x11, #0x4\n"
- "ldr q0, [x10, #0x0]\n"
- "ldr q1, [x28, #0x0]\n"
- "ldr q6, [x15, #0x0]\n"
+ "ldr q1, [x24, #0x0]\n"
+ "bge 51b\n"
+ "52:" // Height 2: Multiply loop: Single iteration only
"fmla v8.4s, v6.4s, v0.s[0]\n"
- "ldr q7, [x15, #0x10]\n"
+ "ldr q7, [x10, #0x10]\n"
+ "sub x26, x26, #0x4\n"
"fmla v12.4s, v6.4s, v1.s[0]\n"
- "ldr q6, [x15, #0x20]\n"
- "add x10, x10, #0x10\n"
+ "ldr q6, [x10, #0x20]\n"
+ "add x25, x25, #0x10\n"
"fmla v9.4s, v7.4s, v0.s[0]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
- "add x28, x28, #0x10\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x24, x24, #0x10\n"
"fmla v13.4s, v7.4s, v1.s[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "ldr q7, [x15, #0x30]\n"
+ "ldr q7, [x10, #0x30]\n"
"fmla v10.4s, v6.4s, v0.s[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
"fmla v14.4s, v6.4s, v1.s[0]\n"
- "ldr q6, [x15, #0x40]\n"
+ "ldr q6, [x10, #0x40]\n"
"fmla v11.4s, v7.4s, v0.s[0]\n"
"fmla v15.4s, v7.4s, v1.s[0]\n"
- "ldr q7, [x15, #0x50]\n"
+ "ldr q7, [x10, #0x50]\n"
"fmla v8.4s, v6.4s, v0.s[1]\n"
"fmla v12.4s, v6.4s, v1.s[1]\n"
- "ldr q6, [x15, #0x60]\n"
+ "ldr q6, [x10, #0x60]\n"
"fmla v9.4s, v7.4s, v0.s[1]\n"
"fmla v13.4s, v7.4s, v1.s[1]\n"
- "ldr q7, [x15, #0x70]\n"
+ "ldr q7, [x10, #0x70]\n"
"fmla v10.4s, v6.4s, v0.s[1]\n"
"fmla v14.4s, v6.4s, v1.s[1]\n"
- "ldr q6, [x15, #0x80]\n"
+ "ldr q6, [x10, #0x80]\n"
"fmla v11.4s, v7.4s, v0.s[1]\n"
"fmla v15.4s, v7.4s, v1.s[1]\n"
- "ldr q7, [x15, #0x90]\n"
+ "ldr q7, [x10, #0x90]\n"
"fmla v8.4s, v6.4s, v0.s[2]\n"
"fmla v12.4s, v6.4s, v1.s[2]\n"
- "ldr q6, [x15, #0xa0]\n"
+ "ldr q6, [x10, #0xa0]\n"
"fmla v9.4s, v7.4s, v0.s[2]\n"
"fmla v13.4s, v7.4s, v1.s[2]\n"
- "ldr q7, [x15, #0xb0]\n"
+ "ldr q7, [x10, #0xb0]\n"
"fmla v10.4s, v6.4s, v0.s[2]\n"
"fmla v14.4s, v6.4s, v1.s[2]\n"
- "ldr q6, [x15, #0xc0]\n"
+ "ldr q6, [x10, #0xc0]\n"
"fmla v11.4s, v7.4s, v0.s[2]\n"
"fmla v15.4s, v7.4s, v1.s[2]\n"
- "ldr q7, [x15, #0xd0]\n"
+ "ldr q7, [x10, #0xd0]\n"
"fmla v8.4s, v6.4s, v0.s[3]\n"
"fmla v12.4s, v6.4s, v1.s[3]\n"
- "ldr q6, [x15, #0xe0]\n"
+ "ldr q6, [x10, #0xe0]\n"
"fmla v9.4s, v7.4s, v0.s[3]\n"
"fmla v13.4s, v7.4s, v1.s[3]\n"
- "ldr q7, [x15, #0xf0]\n"
- "add x15, x15, #0x100\n"
+ "ldr q7, [x10, #0xf0]\n"
+ "add x10, x10, #0x100\n"
"fmla v10.4s, v6.4s, v0.s[3]\n"
"fmla v14.4s, v6.4s, v1.s[3]\n"
"fmla v11.4s, v7.4s, v0.s[3]\n"
"fmla v15.4s, v7.4s, v1.s[3]\n"
- "55:" // Height 2: Multiply loop: Main loop skip
- "cbz x11, 57f\n"
- "56:" // Height 2: Multiply loop: Odd block loop
- "ldr s0, [x10], #0x4\n"
- "ldr s1, [x28], #0x4\n"
- "ldr q6, [x15, #0x0]\n"
+ "53:" // Height 2: Multiply loop: Main loop skip
+ "cbz x26, 55f\n"
+ "54:" // Height 2: Multiply loop: Odd block loop
+ "ldr s0, [x25], #0x4\n"
+ "sub x26, x26, #0x1\n"
+ "ldr s1, [x24], #0x4\n"
+ "ldr q6, [x10, #0x0]\n"
"fmla v8.4s, v6.4s, v0.s[0]\n"
- "ldr q7, [x15, #0x10]\n"
+ "ldr q7, [x10, #0x10]\n"
"fmla v12.4s, v6.4s, v1.s[0]\n"
- "ldr q6, [x15, #0x20]\n"
- "sub x11, x11, #0x1\n"
+ "ldr q6, [x10, #0x20]\n"
"fmla v9.4s, v7.4s, v0.s[0]\n"
"fmla v13.4s, v7.4s, v1.s[0]\n"
- "ldr q7, [x15, #0x30]\n"
+ "ldr q7, [x10, #0x30]\n"
+ "add x10, x10, #0x40\n"
"fmla v10.4s, v6.4s, v0.s[0]\n"
- "add x15, x15, #0x40\n"
"fmla v14.4s, v6.4s, v1.s[0]\n"
"fmla v11.4s, v7.4s, v0.s[0]\n"
"fmla v15.4s, v7.4s, v1.s[0]\n"
- "cbnz x11, 56b\n"
- "57:" // Height 2: Multiply loop: No odd multiplies
+ "cbnz x26, 54b\n"
+ "55:" // Height 2: Multiply loop: No odd multiplies
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x12, x12, #0x1\n"
- "cmp x12, x19\n"
- "bne 50b\n"
- "prfm pstl1keep, [x13, #0x0]\n"
- "prfm pstl1keep, [x9, #0x0]\n"
- "tbz %x[flags], #1, 58f\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 48b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "prfm pstl1keep, [x28, #0x0]\n"
+ "add x24, x28, x19, LSL #2\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "tbz %x[flags], #1, 56f\n"
"add x19, %x[args_ptr], %[offset_min]\n"
"ld1r { v1.4s }, [x19]\n"
"add x19, %x[args_ptr], %[offset_max]\n"
@@ -691,227 +679,215 @@ void a64_hybrid_fp32_mla_6x16 (
"fmax v14.4s, v14.4s, v1.4s\n"
"fmin v15.4s, v15.4s, v0.4s\n"
"fmax v15.4s, v15.4s, v1.4s\n"
- "58:" // Height 2: No activation
- "cmp x16, #0x10\n"
- "bge 67f\n"
- "tbz x16, #3, 62f\n"
- "st1 { v8.4s }, [x13], #0x10\n"
- "st1 { v9.4s }, [x13], #0x10\n"
- "st1 { v12.4s }, [x9], #0x10\n"
- "st1 { v13.4s }, [x9], #0x10\n"
- "tbz x16, #2, 60f\n"
- "st1 { v10.4s }, [x13], #0x10\n"
- "st1 { v14.4s }, [x9], #0x10\n"
- "tbz x16, #1, 59f\n"
- "str d11, [x13], #0x8\n"
- "str d15, [x9], #0x8\n"
- "tbz x16, #0, 66f\n"
- "st1 { v11.s }[2], [x13]\n"
- "st1 { v15.s }[2], [x9]\n"
- "b 66f\n"
- "59:" // Height 2: Partial direct writeback: partial_1_12
- "tbz x16, #0, 66f\n"
- "str s11, [x13, #0x0]\n"
- "str s15, [x9, #0x0]\n"
- "b 66f\n"
- "60:" // Height 2: Partial direct writeback: partial_2_8
- "tbz x16, #1, 61f\n"
- "str d10, [x13], #0x8\n"
- "str d14, [x9], #0x8\n"
- "tbz x16, #0, 66f\n"
- "st1 { v10.s }[2], [x13]\n"
- "st1 { v14.s }[2], [x9]\n"
- "b 66f\n"
- "61:" // Height 2: Partial direct writeback: partial_1_8
- "tbz x16, #0, 66f\n"
- "str s10, [x13, #0x0]\n"
- "str s14, [x9, #0x0]\n"
- "b 66f\n"
- "62:" // Height 2: Partial direct writeback: partial_4_0
- "tbz x16, #2, 64f\n"
- "st1 { v8.4s }, [x13], #0x10\n"
- "st1 { v12.4s }, [x9], #0x10\n"
- "tbz x16, #1, 63f\n"
- "str d9, [x13], #0x8\n"
- "str d13, [x9], #0x8\n"
- "tbz x16, #0, 66f\n"
- "st1 { v9.s }[2], [x13]\n"
- "st1 { v13.s }[2], [x9]\n"
- "b 66f\n"
- "63:" // Height 2: Partial direct writeback: partial_1_4
- "tbz x16, #0, 66f\n"
- "str s9, [x13, #0x0]\n"
- "str s13, [x9, #0x0]\n"
+ "56:" // Height 2: No activation
+ "cmp x11, #0x10\n"
+ "bge 65f\n"
+ "tbz x11, #3, 60f\n"
+ "st1 { v8.4s }, [x28], #0x10\n"
+ "st1 { v9.4s }, [x28], #0x10\n"
+ "st1 { v12.4s }, [x24], #0x10\n"
+ "st1 { v13.4s }, [x24], #0x10\n"
+ "tbz x11, #2, 58f\n"
+ "st1 { v10.4s }, [x28], #0x10\n"
+ "st1 { v14.4s }, [x24], #0x10\n"
+ "tbz x11, #1, 57f\n"
+ "str d11, [x28], #0x8\n"
+ "str d15, [x24], #0x8\n"
+ "tbz x11, #0, 64f\n"
+ "st1 { v11.s }[2], [x28]\n"
+ "st1 { v15.s }[2], [x24]\n"
+ "b 64f\n"
+ "57:" // Height 2: Partial direct writeback: partial_1_12
+ "tbz x11, #0, 64f\n"
+ "str s11, [x28, #0x0]\n"
+ "str s15, [x24, #0x0]\n"
+ "b 64f\n"
+ "58:" // Height 2: Partial direct writeback: partial_2_8
+ "tbz x11, #1, 59f\n"
+ "str d10, [x28], #0x8\n"
+ "str d14, [x24], #0x8\n"
+ "tbz x11, #0, 64f\n"
+ "st1 { v10.s }[2], [x28]\n"
+ "st1 { v14.s }[2], [x24]\n"
+ "b 64f\n"
+ "59:" // Height 2: Partial direct writeback: partial_1_8
+ "tbz x11, #0, 64f\n"
+ "str s10, [x28, #0x0]\n"
+ "str s14, [x24, #0x0]\n"
+ "b 64f\n"
+ "60:" // Height 2: Partial direct writeback: partial_4_0
+ "tbz x11, #2, 62f\n"
+ "st1 { v8.4s }, [x28], #0x10\n"
+ "st1 { v12.4s }, [x24], #0x10\n"
+ "tbz x11, #1, 61f\n"
+ "str d9, [x28], #0x8\n"
+ "str d13, [x24], #0x8\n"
+ "tbz x11, #0, 64f\n"
+ "st1 { v9.s }[2], [x28]\n"
+ "st1 { v13.s }[2], [x24]\n"
+ "b 64f\n"
+ "61:" // Height 2: Partial direct writeback: partial_1_4
+ "tbz x11, #0, 64f\n"
+ "str s9, [x28, #0x0]\n"
+ "str s13, [x24, #0x0]\n"
+ "b 64f\n"
+ "62:" // Height 2: Partial direct writeback: partial_2_0
+ "tbz x11, #1, 63f\n"
+ "str d8, [x28], #0x8\n"
+ "str d12, [x24], #0x8\n"
+ "tbz x11, #0, 64f\n"
+ "st1 { v8.s }[2], [x28]\n"
+ "st1 { v12.s }[2], [x24]\n"
+ "b 64f\n"
+ "63:" // Height 2: Partial direct writeback: partial_1_0
+ "str s8, [x28, #0x0]\n"
+ "str s12, [x24, #0x0]\n"
+ "64:" // Height 2: Partial direct writeback: Done
"b 66f\n"
- "64:" // Height 2: Partial direct writeback: partial_2_0
- "tbz x16, #1, 65f\n"
- "str d8, [x13], #0x8\n"
- "str d12, [x9], #0x8\n"
- "tbz x16, #0, 66f\n"
- "st1 { v8.s }[2], [x13]\n"
- "st1 { v12.s }[2], [x9]\n"
- "b 66f\n"
- "65:" // Height 2: Partial direct writeback: partial_1_0
- "str s8, [x13, #0x0]\n"
- "str s12, [x9, #0x0]\n"
- "66:" // Height 2: Partial direct writeback: Done
- "b 68f\n"
- "67:" // Height 2: Full writeback
- "str q8, [x13, #0x0]\n"
- "str q9, [x13, #0x10]\n"
- "str q10, [x13, #0x20]\n"
- "str q11, [x13, #0x30]\n"
- "str q12, [x9, #0x0]\n"
- "str q13, [x9, #0x10]\n"
- "str q14, [x9, #0x20]\n"
- "str q15, [x9, #0x30]\n"
- "add x13, x13, #0x40\n"
- "add x9, x9, #0x40\n"
- "68:" // Height 2: Writeback done
- "subs x16, x16, #0x10\n"
- "bgt 37b\n"
- "b 206f\n"
- "69:" // Height 3
- "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x14, %x[bias]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "tbz %x[flags], #2, 70f\n"
- "ldr x13, [%x[output_ptr], #0x0]\n"
- "add x13, x13, x19, LSL #2\n"
- "ldr x9, [%x[output_ptr], #0x8]\n"
- "ldr x27, [%x[output_ptr], #0x10]\n"
- "add x9, x9, x19, LSL #2\n"
- "add x27, x27, x19, LSL #2\n"
- "b 71f\n"
- "70:" // Height 3: setup direct output
- "mov x13, %x[output_ptr]\n"
- "add x9, x13, x19, LSL #2\n"
- "add x27, x9, x19, LSL #2\n"
- "71:" // Height 3: Column loop
- "cbz x14, 72f\n"
- "ldr q8, [x14, #0x0]\n"
+ "65:" // Height 2: Full writeback
+ "str q8, [x28, #0x0]\n"
+ "str q9, [x28, #0x10]\n"
+ "str q10, [x28, #0x20]\n"
+ "str q11, [x28, #0x30]\n"
+ "add x28, x28, #0x40\n"
+ "str q12, [x24, #0x0]\n"
+ "str q13, [x24, #0x10]\n"
+ "str q14, [x24, #0x20]\n"
+ "str q15, [x24, #0x30]\n"
+ "66:" // Height 2: Writeback done
+ "subs x11, x11, #0x10\n"
+ "bgt 35b\n"
+ "b 200f\n"
+ "67:" // Height 3
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x9, %x[bias]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "68:" // Height 3: Column loop
+ "cbz x9, 69f\n"
+ "ldr q8, [x9, #0x0]\n"
"mov v12.16b, v8.16b\n"
- "ldr q9, [x14, #0x10]\n"
+ "ldr q9, [x9, #0x10]\n"
"mov v16.16b, v8.16b\n"
- "ldr q10, [x14, #0x20]\n"
- "ldr q11, [x14, #0x30]\n"
+ "ldr q10, [x9, #0x20]\n"
+ "ldr q11, [x9, #0x30]\n"
"mov v13.16b, v9.16b\n"
- "add x14, x14, #0x40\n"
+ "add x9, x9, #0x40\n"
"mov v17.16b, v9.16b\n"
"mov v14.16b, v10.16b\n"
"mov v15.16b, v11.16b\n"
"mov v18.16b, v10.16b\n"
"mov v19.16b, v11.16b\n"
- "b 83f\n"
- "72:" // Height 3: no bias
- "tbz %x[flags], #0, 82f\n"
- "cmp x16, #0x10\n"
- "bge 81f\n"
- "tbz x16, #3, 76f\n"
- "ld1 { v8.4s }, [x13], #0x10\n"
- "ld1 { v12.4s }, [x9], #0x10\n"
- "ld1 { v16.4s }, [x27], #0x10\n"
- "ld1 { v9.4s }, [x13], #0x10\n"
- "ld1 { v13.4s }, [x9], #0x10\n"
- "ld1 { v17.4s }, [x27], #0x10\n"
- "tbz x16, #2, 74f\n"
- "ld1 { v10.4s }, [x13], #0x10\n"
- "ld1 { v14.4s }, [x9], #0x10\n"
- "ld1 { v18.4s }, [x27], #0x10\n"
- "tbz x16, #1, 73f\n"
- "mov x19, #0x38\n"
- "ldr d11, [x13], #0x8\n"
- "ldr d15, [x9], #0x8\n"
- "ldr d19, [x27], #0x8\n"
- "tbz x16, #0, 80f\n"
- "ld1 { v11.s }[2], [x13]\n"
- "ld1 { v15.s }[2], [x9]\n"
- "ld1 { v19.s }[2], [x27]\n"
"b 80f\n"
- "73:" // Height 3: Partial accumulate: partial_1_12
+ "69:" // Height 3: no bias
+ "tbz %x[flags], #0, 79f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x11, #0x10\n"
+ "add x24, x28, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "bge 78f\n"
+ "tbz x11, #3, 73f\n"
+ "ld1 { v8.4s }, [x28], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "ld1 { v9.4s }, [x28], #0x10\n"
+ "ld1 { v13.4s }, [x24], #0x10\n"
+ "ld1 { v17.4s }, [x23], #0x10\n"
+ "tbz x11, #2, 71f\n"
+ "ld1 { v10.4s }, [x28], #0x10\n"
+ "ld1 { v14.4s }, [x24], #0x10\n"
+ "ld1 { v18.4s }, [x23], #0x10\n"
+ "tbz x11, #1, 70f\n"
+ "mov x19, #0x38\n"
+ "ldr d11, [x28], #0x8\n"
+ "ldr d15, [x24], #0x8\n"
+ "ldr d19, [x23], #0x8\n"
+ "tbz x11, #0, 77f\n"
+ "ld1 { v11.s }[2], [x28]\n"
+ "ld1 { v15.s }[2], [x24]\n"
+ "ld1 { v19.s }[2], [x23]\n"
+ "b 77f\n"
+ "70:" // Height 3: Partial accumulate: partial_1_12
"mov x19, #0x30\n"
- "tbz x16, #0, 80f\n"
- "ldr s11, [x13, #0x0]\n"
- "ldr s15, [x9, #0x0]\n"
- "ldr s19, [x27, #0x0]\n"
- "b 80f\n"
- "74:" // Height 3: Partial accumulate: partial_2_8
- "tbz x16, #1, 75f\n"
- "ldr d10, [x13], #0x8\n"
- "ldr d14, [x9], #0x8\n"
- "ldr d18, [x27], #0x8\n"
+ "tbz x11, #0, 77f\n"
+ "ldr s11, [x28, #0x0]\n"
+ "ldr s15, [x24, #0x0]\n"
+ "ldr s19, [x23, #0x0]\n"
+ "b 77f\n"
+ "71:" // Height 3: Partial accumulate: partial_2_8
+ "tbz x11, #1, 72f\n"
+ "ldr d10, [x28], #0x8\n"
+ "ldr d14, [x24], #0x8\n"
"mov x19, #0x28\n"
- "tbz x16, #0, 80f\n"
- "ld1 { v10.s }[2], [x13]\n"
- "ld1 { v14.s }[2], [x9]\n"
- "ld1 { v18.s }[2], [x27]\n"
- "b 80f\n"
- "75:" // Height 3: Partial accumulate: partial_1_8
+ "ldr d18, [x23], #0x8\n"
+ "tbz x11, #0, 77f\n"
+ "ld1 { v10.s }[2], [x28]\n"
+ "ld1 { v14.s }[2], [x24]\n"
+ "ld1 { v18.s }[2], [x23]\n"
+ "b 77f\n"
+ "72:" // Height 3: Partial accumulate: partial_1_8
"mov x19, #0x20\n"
- "tbz x16, #0, 80f\n"
- "ldr s10, [x13, #0x0]\n"
- "ldr s14, [x9, #0x0]\n"
- "ldr s18, [x27, #0x0]\n"
- "b 80f\n"
- "76:" // Height 3: Partial accumulate: partial_4_0
- "tbz x16, #2, 78f\n"
- "ld1 { v8.4s }, [x13], #0x10\n"
- "ld1 { v12.4s }, [x9], #0x10\n"
- "ld1 { v16.4s }, [x27], #0x10\n"
- "tbz x16, #1, 77f\n"
+ "tbz x11, #0, 77f\n"
+ "ldr s10, [x28, #0x0]\n"
+ "ldr s14, [x24, #0x0]\n"
+ "ldr s18, [x23, #0x0]\n"
+ "b 77f\n"
+ "73:" // Height 3: Partial accumulate: partial_4_0
+ "tbz x11, #2, 75f\n"
+ "ld1 { v8.4s }, [x28], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "tbz x11, #1, 74f\n"
"mov x19, #0x18\n"
- "ldr d9, [x13], #0x8\n"
- "ldr d13, [x9], #0x8\n"
- "ldr d17, [x27], #0x8\n"
- "tbz x16, #0, 80f\n"
- "ld1 { v9.s }[2], [x13]\n"
- "ld1 { v13.s }[2], [x9]\n"
- "ld1 { v17.s }[2], [x27]\n"
- "b 80f\n"
- "77:" // Height 3: Partial accumulate: partial_1_4
+ "ldr d9, [x28], #0x8\n"
+ "ldr d13, [x24], #0x8\n"
+ "ldr d17, [x23], #0x8\n"
+ "tbz x11, #0, 77f\n"
+ "ld1 { v9.s }[2], [x28]\n"
+ "ld1 { v13.s }[2], [x24]\n"
+ "ld1 { v17.s }[2], [x23]\n"
+ "b 77f\n"
+ "74:" // Height 3: Partial accumulate: partial_1_4
"mov x19, #0x10\n"
- "tbz x16, #0, 80f\n"
- "ldr s9, [x13, #0x0]\n"
- "ldr s13, [x9, #0x0]\n"
- "ldr s17, [x27, #0x0]\n"
- "b 80f\n"
- "78:" // Height 3: Partial accumulate: partial_2_0
- "tbz x16, #1, 79f\n"
- "ldr d8, [x13], #0x8\n"
- "ldr d12, [x9], #0x8\n"
- "ldr d16, [x27], #0x8\n"
+ "tbz x11, #0, 77f\n"
+ "ldr s9, [x28, #0x0]\n"
+ "ldr s13, [x24, #0x0]\n"
+ "ldr s17, [x23, #0x0]\n"
+ "b 77f\n"
+ "75:" // Height 3: Partial accumulate: partial_2_0
+ "tbz x11, #1, 76f\n"
+ "ldr d8, [x28], #0x8\n"
+ "ldr d12, [x24], #0x8\n"
"mov x19, #0x8\n"
- "tbz x16, #0, 80f\n"
- "ld1 { v8.s }[2], [x13]\n"
- "ld1 { v12.s }[2], [x9]\n"
- "ld1 { v16.s }[2], [x27]\n"
- "b 80f\n"
- "79:" // Height 3: Partial accumulate: partial_1_0
+ "ldr d16, [x23], #0x8\n"
+ "tbz x11, #0, 77f\n"
+ "ld1 { v8.s }[2], [x28]\n"
+ "ld1 { v12.s }[2], [x24]\n"
+ "ld1 { v16.s }[2], [x23]\n"
+ "b 77f\n"
+ "76:" // Height 3: Partial accumulate: partial_1_0
+ "ldr s8, [x28, #0x0]\n"
"mov x19, #0x0\n"
- "ldr s8, [x13, #0x0]\n"
- "ldr s12, [x9, #0x0]\n"
- "ldr s16, [x27, #0x0]\n"
- "80:" // Height 3: Partial accumulate: Done
- "sub x13, x13, x19\n"
- "sub x9, x9, x19\n"
- "sub x27, x27, x19\n"
- "b 83f\n"
- "81:" // Height 3: full accumulate
- "ldr q8, [x13, #0x0]\n"
- "ldr q9, [x13, #0x10]\n"
- "ldr q10, [x13, #0x20]\n"
- "ldr q11, [x13, #0x30]\n"
- "ldr q12, [x9, #0x0]\n"
- "ldr q13, [x9, #0x10]\n"
- "ldr q14, [x9, #0x20]\n"
- "ldr q15, [x9, #0x30]\n"
- "ldr q16, [x27, #0x0]\n"
- "ldr q17, [x27, #0x10]\n"
- "ldr q18, [x27, #0x20]\n"
- "ldr q19, [x27, #0x30]\n"
- "b 83f\n"
- "82:" // Height 3: no accumulate
+ "ldr s12, [x24, #0x0]\n"
+ "ldr s16, [x23, #0x0]\n"
+ "77:" // Height 3: Partial accumulate: Done
+ "sub x28, x28, x19\n"
+ "b 80f\n"
+ "78:" // Height 3: full accumulate
+ "ldr q8, [x28, #0x0]\n"
+ "ldr q9, [x28, #0x10]\n"
+ "ldr q10, [x28, #0x20]\n"
+ "ldr q11, [x28, #0x30]\n"
+ "ldr q12, [x24, #0x0]\n"
+ "ldr q13, [x24, #0x10]\n"
+ "ldr q14, [x24, #0x20]\n"
+ "ldr q15, [x24, #0x30]\n"
+ "ldr q16, [x23, #0x0]\n"
+ "ldr q17, [x23, #0x10]\n"
+ "ldr q18, [x23, #0x20]\n"
+ "ldr q19, [x23, #0x30]\n"
+ "b 80f\n"
+ "79:" // Height 3: no accumulate
"movi v8.16b, #0x0\n"
"movi v9.16b, #0x0\n"
"movi v10.16b, #0x0\n"
@@ -924,221 +900,224 @@ void a64_hybrid_fp32_mla_6x16 (
"movi v17.16b, #0x0\n"
"movi v18.16b, #0x0\n"
"movi v19.16b, #0x0\n"
- "83:" // Height 3: setup done
- "mov x12, #0x0\n"
- "84:" // Height 3: String loop
+ "80:" // Height 3: setup done
+ "mov x27, #0x0\n"
+ "81:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w11, [x20, x12, LSL #0x2]\n"
- "tbz %x[flags], #3, 85f\n"
- "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 82f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x10, [x20, #0x0]\n"
- "ldr x28, [x20, #0x8]\n"
- "ldr x26, [x20, #0x10]\n"
- "cbnz x12, 86f\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "cbnz x27, 83f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x10, x10, x19, LSL #2\n"
- "add x28, x28, x19, LSL #2\n"
- "add x26, x26, x19, LSL #2\n"
- "b 86f\n"
- "85:" // Height 3: setup direct input
- "mov x10, %x[input_ptr]\n"
- "add x28, x10, x19, LSL #2\n"
- "add x26, x28, x19, LSL #2\n"
- "86:" // Height 3: input setup done
- "cmp x11, #0x4\n"
- "blt 89f\n"
- "cmp x11, #0x8\n"
- "blt 88f\n"
- "87:" // Height 3: Multiply loop: Main loop head
- "ldr q0, [x10, #0x0]\n"
- "ldr q1, [x28, #0x0]\n"
- "ldr q2, [x26, #0x0]\n"
- "ldr q6, [x15, #0x0]\n"
+ "add x25, x25, x19, LSL #2\n"
+ "add x24, x24, x19, LSL #2\n"
+ "add x23, x23, x19, LSL #2\n"
+ "b 83f\n"
+ "82:" // Height 3: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "83:" // Height 3: input setup done
+ "cmp x26, #0x4\n"
+ "blt 86f\n"
+ "ldr q0, [x25, #0x0]\n"
+ "ldr q1, [x24, #0x0]\n"
+ "cmp x26, #0x8\n"
+ "ldr q2, [x23, #0x0]\n"
+ "ldr q6, [x10, #0x0]\n"
+ "blt 85f\n"
+ "84:" // Height 3: Multiply loop: Main loop head
"fmla v8.4s, v6.4s, v0.s[0]\n"
- "ldr q7, [x15, #0x10]\n"
+ "ldr q7, [x10, #0x10]\n"
+ "add x25, x25, #0x10\n"
"fmla v12.4s, v6.4s, v1.s[0]\n"
- "add x10, x10, #0x10\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x24, x24, #0x10\n"
"fmla v16.4s, v6.4s, v2.s[0]\n"
- "ldr q6, [x15, #0x20]\n"
+ "ldr q6, [x10, #0x20]\n"
+ "add x23, x23, #0x10\n"
"fmla v9.4s, v7.4s, v0.s[0]\n"
- "add x28, x28, #0x10\n"
- "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "sub x26, x26, #0x4\n"
"fmla v13.4s, v7.4s, v1.s[0]\n"
- "add x26, x26, #0x10\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "cmp x26, #0x8\n"
"fmla v17.4s, v7.4s, v2.s[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "ldr q7, [x15, #0x30]\n"
+ "ldr q7, [x10, #0x30]\n"
"fmla v10.4s, v6.4s, v0.s[0]\n"
- "sub x11, x11, #0x4\n"
"fmla v14.4s, v6.4s, v1.s[0]\n"
- "cmp x11, #0x8\n"
"fmla v18.4s, v6.4s, v2.s[0]\n"
- "ldr q6, [x15, #0x40]\n"
+ "ldr q6, [x10, #0x40]\n"
"fmla v11.4s, v7.4s, v0.s[0]\n"
"fmla v15.4s, v7.4s, v1.s[0]\n"
"fmla v19.4s, v7.4s, v2.s[0]\n"
- "ldr q7, [x15, #0x50]\n"
+ "ldr q7, [x10, #0x50]\n"
"fmla v8.4s, v6.4s, v0.s[1]\n"
"fmla v12.4s, v6.4s, v1.s[1]\n"
"fmla v16.4s, v6.4s, v2.s[1]\n"
- "ldr q6, [x15, #0x60]\n"
+ "ldr q6, [x10, #0x60]\n"
"fmla v9.4s, v7.4s, v0.s[1]\n"
"fmla v13.4s, v7.4s, v1.s[1]\n"
"fmla v17.4s, v7.4s, v2.s[1]\n"
- "ldr q7, [x15, #0x70]\n"
+ "ldr q7, [x10, #0x70]\n"
"fmla v10.4s, v6.4s, v0.s[1]\n"
"fmla v14.4s, v6.4s, v1.s[1]\n"
"fmla v18.4s, v6.4s, v2.s[1]\n"
- "ldr q6, [x15, #0x80]\n"
+ "ldr q6, [x10, #0x80]\n"
"fmla v11.4s, v7.4s, v0.s[1]\n"
"fmla v15.4s, v7.4s, v1.s[1]\n"
"fmla v19.4s, v7.4s, v2.s[1]\n"
- "ldr q7, [x15, #0x90]\n"
+ "ldr q7, [x10, #0x90]\n"
"fmla v8.4s, v6.4s, v0.s[2]\n"
"fmla v12.4s, v6.4s, v1.s[2]\n"
"fmla v16.4s, v6.4s, v2.s[2]\n"
- "ldr q6, [x15, #0xa0]\n"
+ "ldr q6, [x10, #0xa0]\n"
"fmla v9.4s, v7.4s, v0.s[2]\n"
"fmla v13.4s, v7.4s, v1.s[2]\n"
"fmla v17.4s, v7.4s, v2.s[2]\n"
- "ldr q7, [x15, #0xb0]\n"
+ "ldr q7, [x10, #0xb0]\n"
"fmla v10.4s, v6.4s, v0.s[2]\n"
"fmla v14.4s, v6.4s, v1.s[2]\n"
"fmla v18.4s, v6.4s, v2.s[2]\n"
- "ldr q6, [x15, #0xc0]\n"
+ "ldr q6, [x10, #0xc0]\n"
"fmla v11.4s, v7.4s, v0.s[2]\n"
"fmla v15.4s, v7.4s, v1.s[2]\n"
"fmla v19.4s, v7.4s, v2.s[2]\n"
- "ldr q7, [x15, #0xd0]\n"
+ "ldr q7, [x10, #0xd0]\n"
"fmla v8.4s, v6.4s, v0.s[3]\n"
"fmla v12.4s, v6.4s, v1.s[3]\n"
"fmla v16.4s, v6.4s, v2.s[3]\n"
- "ldr q6, [x15, #0xe0]\n"
+ "ldr q6, [x10, #0xe0]\n"
"fmla v9.4s, v7.4s, v0.s[3]\n"
"fmla v13.4s, v7.4s, v1.s[3]\n"
"fmla v17.4s, v7.4s, v2.s[3]\n"
- "ldr q7, [x15, #0xf0]\n"
- "add x15, x15, #0x100\n"
+ "ldr q7, [x10, #0xf0]\n"
+ "add x10, x10, #0x100\n"
"fmla v10.4s, v6.4s, v0.s[3]\n"
"fmla v14.4s, v6.4s, v1.s[3]\n"
"fmla v18.4s, v6.4s, v2.s[3]\n"
+ "ldr q6, [x10, #0x0]\n"
"fmla v11.4s, v7.4s, v0.s[3]\n"
+ "ldr q0, [x25, #0x0]\n"
"fmla v15.4s, v7.4s, v1.s[3]\n"
+ "ldr q1, [x24, #0x0]\n"
"fmla v19.4s, v7.4s, v2.s[3]\n"
- "bge 87b\n"
- "88:" // Height 3: Multiply loop: Single iteration only
- "sub x11, x11, #0x4\n"
- "ldr q0, [x10, #0x0]\n"
- "ldr q1, [x28, #0x0]\n"
- "ldr q2, [x26, #0x0]\n"
- "ldr q6, [x15, #0x0]\n"
+ "ldr q2, [x23, #0x0]\n"
+ "bge 84b\n"
+ "85:" // Height 3: Multiply loop: Single iteration only
"fmla v8.4s, v6.4s, v0.s[0]\n"
- "ldr q7, [x15, #0x10]\n"
+ "ldr q7, [x10, #0x10]\n"
+ "sub x26, x26, #0x4\n"
"fmla v12.4s, v6.4s, v1.s[0]\n"
- "add x10, x10, #0x10\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "add x25, x25, #0x10\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
"fmla v16.4s, v6.4s, v2.s[0]\n"
- "ldr q6, [x15, #0x20]\n"
+ "add x24, x24, #0x10\n"
"fmla v9.4s, v7.4s, v0.s[0]\n"
- "add x28, x28, #0x10\n"
- "prfm pldl1keep, [x28, #0x80]\n"
+ "ldr q6, [x10, #0x20]\n"
+ "add x23, x23, #0x10\n"
"fmla v13.4s, v7.4s, v1.s[0]\n"
- "add x26, x26, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
"fmla v17.4s, v7.4s, v2.s[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "ldr q7, [x15, #0x30]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "ldr q7, [x10, #0x30]\n"
"fmla v10.4s, v6.4s, v0.s[0]\n"
"fmla v14.4s, v6.4s, v1.s[0]\n"
"fmla v18.4s, v6.4s, v2.s[0]\n"
- "ldr q6, [x15, #0x40]\n"
+ "ldr q6, [x10, #0x40]\n"
"fmla v11.4s, v7.4s, v0.s[0]\n"
"fmla v15.4s, v7.4s, v1.s[0]\n"
"fmla v19.4s, v7.4s, v2.s[0]\n"
- "ldr q7, [x15, #0x50]\n"
+ "ldr q7, [x10, #0x50]\n"
"fmla v8.4s, v6.4s, v0.s[1]\n"
"fmla v12.4s, v6.4s, v1.s[1]\n"
"fmla v16.4s, v6.4s, v2.s[1]\n"
- "ldr q6, [x15, #0x60]\n"
+ "ldr q6, [x10, #0x60]\n"
"fmla v9.4s, v7.4s, v0.s[1]\n"
"fmla v13.4s, v7.4s, v1.s[1]\n"
"fmla v17.4s, v7.4s, v2.s[1]\n"
- "ldr q7, [x15, #0x70]\n"
+ "ldr q7, [x10, #0x70]\n"
"fmla v10.4s, v6.4s, v0.s[1]\n"
"fmla v14.4s, v6.4s, v1.s[1]\n"
"fmla v18.4s, v6.4s, v2.s[1]\n"
- "ldr q6, [x15, #0x80]\n"
+ "ldr q6, [x10, #0x80]\n"
"fmla v11.4s, v7.4s, v0.s[1]\n"
"fmla v15.4s, v7.4s, v1.s[1]\n"
"fmla v19.4s, v7.4s, v2.s[1]\n"
- "ldr q7, [x15, #0x90]\n"
+ "ldr q7, [x10, #0x90]\n"
"fmla v8.4s, v6.4s, v0.s[2]\n"
"fmla v12.4s, v6.4s, v1.s[2]\n"
"fmla v16.4s, v6.4s, v2.s[2]\n"
- "ldr q6, [x15, #0xa0]\n"
+ "ldr q6, [x10, #0xa0]\n"
"fmla v9.4s, v7.4s, v0.s[2]\n"
"fmla v13.4s, v7.4s, v1.s[2]\n"
"fmla v17.4s, v7.4s, v2.s[2]\n"
- "ldr q7, [x15, #0xb0]\n"
+ "ldr q7, [x10, #0xb0]\n"
"fmla v10.4s, v6.4s, v0.s[2]\n"
"fmla v14.4s, v6.4s, v1.s[2]\n"
"fmla v18.4s, v6.4s, v2.s[2]\n"
- "ldr q6, [x15, #0xc0]\n"
+ "ldr q6, [x10, #0xc0]\n"
"fmla v11.4s, v7.4s, v0.s[2]\n"
"fmla v15.4s, v7.4s, v1.s[2]\n"
"fmla v19.4s, v7.4s, v2.s[2]\n"
- "ldr q7, [x15, #0xd0]\n"
+ "ldr q7, [x10, #0xd0]\n"
"fmla v8.4s, v6.4s, v0.s[3]\n"
"fmla v12.4s, v6.4s, v1.s[3]\n"
"fmla v16.4s, v6.4s, v2.s[3]\n"
- "ldr q6, [x15, #0xe0]\n"
+ "ldr q6, [x10, #0xe0]\n"
"fmla v9.4s, v7.4s, v0.s[3]\n"
"fmla v13.4s, v7.4s, v1.s[3]\n"
"fmla v17.4s, v7.4s, v2.s[3]\n"
- "ldr q7, [x15, #0xf0]\n"
- "add x15, x15, #0x100\n"
+ "ldr q7, [x10, #0xf0]\n"
+ "add x10, x10, #0x100\n"
"fmla v10.4s, v6.4s, v0.s[3]\n"
"fmla v14.4s, v6.4s, v1.s[3]\n"
"fmla v18.4s, v6.4s, v2.s[3]\n"
"fmla v11.4s, v7.4s, v0.s[3]\n"
"fmla v15.4s, v7.4s, v1.s[3]\n"
"fmla v19.4s, v7.4s, v2.s[3]\n"
- "89:" // Height 3: Multiply loop: Main loop skip
- "cbz x11, 91f\n"
- "90:" // Height 3: Multiply loop: Odd block loop
- "ldr s0, [x10], #0x4\n"
- "ldr s1, [x28], #0x4\n"
- "ldr s2, [x26], #0x4\n"
- "ldr q6, [x15, #0x0]\n"
+ "86:" // Height 3: Multiply loop: Main loop skip
+ "cbz x26, 88f\n"
+ "87:" // Height 3: Multiply loop: Odd block loop
+ "ldr s0, [x25], #0x4\n"
+ "sub x26, x26, #0x1\n"
+ "ldr s1, [x24], #0x4\n"
+ "ldr s2, [x23], #0x4\n"
+ "ldr q6, [x10, #0x0]\n"
"fmla v8.4s, v6.4s, v0.s[0]\n"
- "ldr q7, [x15, #0x10]\n"
+ "ldr q7, [x10, #0x10]\n"
"fmla v12.4s, v6.4s, v1.s[0]\n"
- "sub x11, x11, #0x1\n"
"fmla v16.4s, v6.4s, v2.s[0]\n"
- "ldr q6, [x15, #0x20]\n"
+ "ldr q6, [x10, #0x20]\n"
"fmla v9.4s, v7.4s, v0.s[0]\n"
"fmla v13.4s, v7.4s, v1.s[0]\n"
"fmla v17.4s, v7.4s, v2.s[0]\n"
- "ldr q7, [x15, #0x30]\n"
+ "ldr q7, [x10, #0x30]\n"
+ "add x10, x10, #0x40\n"
"fmla v10.4s, v6.4s, v0.s[0]\n"
- "add x15, x15, #0x40\n"
"fmla v14.4s, v6.4s, v1.s[0]\n"
"fmla v18.4s, v6.4s, v2.s[0]\n"
"fmla v11.4s, v7.4s, v0.s[0]\n"
"fmla v15.4s, v7.4s, v1.s[0]\n"
"fmla v19.4s, v7.4s, v2.s[0]\n"
- "cbnz x11, 90b\n"
- "91:" // Height 3: Multiply loop: No odd multiplies
+ "cbnz x26, 87b\n"
+ "88:" // Height 3: Multiply loop: No odd multiplies
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x12, x12, #0x1\n"
- "cmp x12, x19\n"
- "bne 84b\n"
- "prfm pstl1keep, [x13, #0x0]\n"
- "prfm pstl1keep, [x9, #0x0]\n"
- "prfm pstl1keep, [x27, #0x0]\n"
- "tbz %x[flags], #1, 92f\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 81b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "prfm pstl1keep, [x28, #0x0]\n"
+ "add x24, x28, x19, LSL #2\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "add x23, x24, x19, LSL #2\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "tbz %x[flags], #1, 89f\n"
"add x19, %x[args_ptr], %[offset_min]\n"
"ld1r { v1.4s }, [x19]\n"
"add x19, %x[args_ptr], %[offset_max]\n"
@@ -1167,137 +1146,120 @@ void a64_hybrid_fp32_mla_6x16 (
"fmin v19.4s, v19.4s, v0.4s\n"
"fmax v18.4s, v18.4s, v1.4s\n"
"fmax v19.4s, v19.4s, v1.4s\n"
- "92:" // Height 3: No activation
- "cmp x16, #0x10\n"
- "bge 101f\n"
- "tbz x16, #3, 96f\n"
- "st1 { v8.4s }, [x13], #0x10\n"
- "st1 { v9.4s }, [x13], #0x10\n"
- "st1 { v12.4s }, [x9], #0x10\n"
- "st1 { v13.4s }, [x9], #0x10\n"
- "st1 { v16.4s }, [x27], #0x10\n"
- "st1 { v17.4s }, [x27], #0x10\n"
- "tbz x16, #2, 94f\n"
- "st1 { v10.4s }, [x13], #0x10\n"
- "st1 { v14.4s }, [x9], #0x10\n"
- "st1 { v18.4s }, [x27], #0x10\n"
- "tbz x16, #1, 93f\n"
- "str d11, [x13], #0x8\n"
- "str d15, [x9], #0x8\n"
- "str d19, [x27], #0x8\n"
- "tbz x16, #0, 100f\n"
- "st1 { v11.s }[2], [x13]\n"
- "st1 { v15.s }[2], [x9]\n"
- "st1 { v19.s }[2], [x27]\n"
- "b 100f\n"
- "93:" // Height 3: Partial direct writeback: partial_1_12
- "tbz x16, #0, 100f\n"
- "str s11, [x13, #0x0]\n"
- "str s15, [x9, #0x0]\n"
- "str s19, [x27, #0x0]\n"
- "b 100f\n"
- "94:" // Height 3: Partial direct writeback: partial_2_8
- "tbz x16, #1, 95f\n"
- "str d10, [x13], #0x8\n"
- "str d14, [x9], #0x8\n"
- "str d18, [x27], #0x8\n"
- "tbz x16, #0, 100f\n"
- "st1 { v10.s }[2], [x13]\n"
- "st1 { v14.s }[2], [x9]\n"
- "st1 { v18.s }[2], [x27]\n"
- "b 100f\n"
- "95:" // Height 3: Partial direct writeback: partial_1_8
- "tbz x16, #0, 100f\n"
- "str s10, [x13, #0x0]\n"
- "str s14, [x9, #0x0]\n"
- "str s18, [x27, #0x0]\n"
- "b 100f\n"
- "96:" // Height 3: Partial direct writeback: partial_4_0
- "tbz x16, #2, 98f\n"
- "st1 { v8.4s }, [x13], #0x10\n"
- "st1 { v12.4s }, [x9], #0x10\n"
- "st1 { v16.4s }, [x27], #0x10\n"
- "tbz x16, #1, 97f\n"
- "str d9, [x13], #0x8\n"
- "str d13, [x9], #0x8\n"
- "str d17, [x27], #0x8\n"
- "tbz x16, #0, 100f\n"
- "st1 { v9.s }[2], [x13]\n"
- "st1 { v13.s }[2], [x9]\n"
- "st1 { v17.s }[2], [x27]\n"
- "b 100f\n"
- "97:" // Height 3: Partial direct writeback: partial_1_4
- "tbz x16, #0, 100f\n"
- "str s9, [x13, #0x0]\n"
- "str s13, [x9, #0x0]\n"
- "str s17, [x27, #0x0]\n"
- "b 100f\n"
- "98:" // Height 3: Partial direct writeback: partial_2_0
- "tbz x16, #1, 99f\n"
- "str d8, [x13], #0x8\n"
- "str d12, [x9], #0x8\n"
- "str d16, [x27], #0x8\n"
- "tbz x16, #0, 100f\n"
- "st1 { v8.s }[2], [x13]\n"
- "st1 { v12.s }[2], [x9]\n"
- "st1 { v16.s }[2], [x27]\n"
- "b 100f\n"
- "99:" // Height 3: Partial direct writeback: partial_1_0
- "str s8, [x13, #0x0]\n"
- "str s12, [x9, #0x0]\n"
- "str s16, [x27, #0x0]\n"
- "100:" // Height 3: Partial direct writeback: Done
- "b 102f\n"
- "101:" // Height 3: Full writeback
- "str q8, [x13, #0x0]\n"
- "str q9, [x13, #0x10]\n"
- "str q10, [x13, #0x20]\n"
- "str q11, [x13, #0x30]\n"
- "str q12, [x9, #0x0]\n"
- "str q13, [x9, #0x10]\n"
- "str q14, [x9, #0x20]\n"
- "str q15, [x9, #0x30]\n"
- "str q16, [x27, #0x0]\n"
- "str q17, [x27, #0x10]\n"
- "str q18, [x27, #0x20]\n"
- "str q19, [x27, #0x30]\n"
- "add x13, x13, #0x40\n"
- "add x9, x9, #0x40\n"
- "add x27, x27, #0x40\n"
- "102:" // Height 3: Writeback done
- "subs x16, x16, #0x10\n"
- "bgt 71b\n"
- "b 206f\n"
- "103:" // Height 4
- "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x14, %x[bias]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "tbz %x[flags], #2, 104f\n"
- "ldr x13, [%x[output_ptr], #0x0]\n"
- "add x13, x13, x19, LSL #2\n"
- "ldr x9, [%x[output_ptr], #0x8]\n"
- "ldr x27, [%x[output_ptr], #0x10]\n"
- "add x9, x9, x19, LSL #2\n"
- "ldr x25, [%x[output_ptr], #0x18]\n"
- "add x27, x27, x19, LSL #2\n"
- "add x25, x25, x19, LSL #2\n"
- "b 105f\n"
- "104:" // Height 4: setup direct output
- "mov x13, %x[output_ptr]\n"
- "add x9, x13, x19, LSL #2\n"
- "add x27, x9, x19, LSL #2\n"
- "add x25, x27, x19, LSL #2\n"
- "105:" // Height 4: Column loop
- "cbz x14, 106f\n"
- "ldr q8, [x14, #0x0]\n"
+ "89:" // Height 3: No activation
+ "cmp x11, #0x10\n"
+ "bge 98f\n"
+ "tbz x11, #3, 93f\n"
+ "st1 { v8.4s }, [x28], #0x10\n"
+ "st1 { v9.4s }, [x28], #0x10\n"
+ "st1 { v12.4s }, [x24], #0x10\n"
+ "st1 { v13.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v17.4s }, [x23], #0x10\n"
+ "tbz x11, #2, 91f\n"
+ "st1 { v10.4s }, [x28], #0x10\n"
+ "st1 { v14.4s }, [x24], #0x10\n"
+ "st1 { v18.4s }, [x23], #0x10\n"
+ "tbz x11, #1, 90f\n"
+ "str d11, [x28], #0x8\n"
+ "str d15, [x24], #0x8\n"
+ "str d19, [x23], #0x8\n"
+ "tbz x11, #0, 97f\n"
+ "st1 { v11.s }[2], [x28]\n"
+ "st1 { v15.s }[2], [x24]\n"
+ "st1 { v19.s }[2], [x23]\n"
+ "b 97f\n"
+ "90:" // Height 3: Partial direct writeback: partial_1_12
+ "tbz x11, #0, 97f\n"
+ "str s11, [x28, #0x0]\n"
+ "str s15, [x24, #0x0]\n"
+ "str s19, [x23, #0x0]\n"
+ "b 97f\n"
+ "91:" // Height 3: Partial direct writeback: partial_2_8
+ "tbz x11, #1, 92f\n"
+ "str d10, [x28], #0x8\n"
+ "str d14, [x24], #0x8\n"
+ "str d18, [x23], #0x8\n"
+ "tbz x11, #0, 97f\n"
+ "st1 { v10.s }[2], [x28]\n"
+ "st1 { v14.s }[2], [x24]\n"
+ "st1 { v18.s }[2], [x23]\n"
+ "b 97f\n"
+ "92:" // Height 3: Partial direct writeback: partial_1_8
+ "tbz x11, #0, 97f\n"
+ "str s10, [x28, #0x0]\n"
+ "str s14, [x24, #0x0]\n"
+ "str s18, [x23, #0x0]\n"
+ "b 97f\n"
+ "93:" // Height 3: Partial direct writeback: partial_4_0
+ "tbz x11, #2, 95f\n"
+ "st1 { v8.4s }, [x28], #0x10\n"
+ "st1 { v12.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "tbz x11, #1, 94f\n"
+ "str d9, [x28], #0x8\n"
+ "str d13, [x24], #0x8\n"
+ "str d17, [x23], #0x8\n"
+ "tbz x11, #0, 97f\n"
+ "st1 { v9.s }[2], [x28]\n"
+ "st1 { v13.s }[2], [x24]\n"
+ "st1 { v17.s }[2], [x23]\n"
+ "b 97f\n"
+ "94:" // Height 3: Partial direct writeback: partial_1_4
+ "tbz x11, #0, 97f\n"
+ "str s9, [x28, #0x0]\n"
+ "str s13, [x24, #0x0]\n"
+ "str s17, [x23, #0x0]\n"
+ "b 97f\n"
+ "95:" // Height 3: Partial direct writeback: partial_2_0
+ "tbz x11, #1, 96f\n"
+ "str d8, [x28], #0x8\n"
+ "str d12, [x24], #0x8\n"
+ "str d16, [x23], #0x8\n"
+ "tbz x11, #0, 97f\n"
+ "st1 { v8.s }[2], [x28]\n"
+ "st1 { v12.s }[2], [x24]\n"
+ "st1 { v16.s }[2], [x23]\n"
+ "b 97f\n"
+ "96:" // Height 3: Partial direct writeback: partial_1_0
+ "str s8, [x28, #0x0]\n"
+ "str s12, [x24, #0x0]\n"
+ "str s16, [x23, #0x0]\n"
+ "97:" // Height 3: Partial direct writeback: Done
+ "b 99f\n"
+ "98:" // Height 3: Full writeback
+ "str q8, [x28, #0x0]\n"
+ "str q9, [x28, #0x10]\n"
+ "str q10, [x28, #0x20]\n"
+ "str q11, [x28, #0x30]\n"
+ "add x28, x28, #0x40\n"
+ "str q12, [x24, #0x0]\n"
+ "str q13, [x24, #0x10]\n"
+ "str q14, [x24, #0x20]\n"
+ "str q15, [x24, #0x30]\n"
+ "str q16, [x23, #0x0]\n"
+ "str q17, [x23, #0x10]\n"
+ "str q18, [x23, #0x20]\n"
+ "str q19, [x23, #0x30]\n"
+ "99:" // Height 3: Writeback done
+ "subs x11, x11, #0x10\n"
+ "bgt 68b\n"
+ "b 200f\n"
+ "100:" // Height 4
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x9, %x[bias]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "101:" // Height 4: Column loop
+ "cbz x9, 102f\n"
+ "ldr q8, [x9, #0x0]\n"
"mov v12.16b, v8.16b\n"
- "ldr q9, [x14, #0x10]\n"
+ "ldr q9, [x9, #0x10]\n"
"mov v16.16b, v8.16b\n"
- "ldr q10, [x14, #0x20]\n"
+ "ldr q10, [x9, #0x20]\n"
"mov v20.16b, v8.16b\n"
- "ldr q11, [x14, #0x30]\n"
- "add x14, x14, #0x40\n"
+ "ldr q11, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
"mov v13.16b, v9.16b\n"
"mov v17.16b, v9.16b\n"
"mov v14.16b, v10.16b\n"
@@ -1307,136 +1269,137 @@ void a64_hybrid_fp32_mla_6x16 (
"mov v21.16b, v9.16b\n"
"mov v22.16b, v10.16b\n"
"mov v23.16b, v11.16b\n"
- "b 117f\n"
- "106:" // Height 4: no bias
- "tbz %x[flags], #0, 116f\n"
- "cmp x16, #0x10\n"
- "bge 115f\n"
- "tbz x16, #3, 110f\n"
- "ld1 { v8.4s }, [x13], #0x10\n"
- "ld1 { v12.4s }, [x9], #0x10\n"
- "ld1 { v16.4s }, [x27], #0x10\n"
- "ld1 { v20.4s }, [x25], #0x10\n"
- "ld1 { v9.4s }, [x13], #0x10\n"
- "ld1 { v13.4s }, [x9], #0x10\n"
- "ld1 { v17.4s }, [x27], #0x10\n"
- "ld1 { v21.4s }, [x25], #0x10\n"
- "tbz x16, #2, 108f\n"
- "ld1 { v10.4s }, [x13], #0x10\n"
- "ld1 { v14.4s }, [x9], #0x10\n"
- "ld1 { v18.4s }, [x27], #0x10\n"
- "ld1 { v22.4s }, [x25], #0x10\n"
- "tbz x16, #1, 107f\n"
+ "b 113f\n"
+ "102:" // Height 4: no bias
+ "tbz %x[flags], #0, 112f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x11, #0x10\n"
+ "add x24, x28, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "bge 111f\n"
+ "tbz x11, #3, 106f\n"
+ "ld1 { v8.4s }, [x28], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "ld1 { v20.4s }, [x22], #0x10\n"
+ "ld1 { v9.4s }, [x28], #0x10\n"
+ "ld1 { v13.4s }, [x24], #0x10\n"
+ "ld1 { v17.4s }, [x23], #0x10\n"
+ "ld1 { v21.4s }, [x22], #0x10\n"
+ "tbz x11, #2, 104f\n"
+ "ld1 { v10.4s }, [x28], #0x10\n"
+ "ld1 { v14.4s }, [x24], #0x10\n"
+ "ld1 { v18.4s }, [x23], #0x10\n"
+ "ld1 { v22.4s }, [x22], #0x10\n"
+ "tbz x11, #1, 103f\n"
"mov x19, #0x38\n"
- "ldr d11, [x13], #0x8\n"
- "ldr d15, [x9], #0x8\n"
- "ldr d19, [x27], #0x8\n"
- "ldr d23, [x25], #0x8\n"
- "tbz x16, #0, 114f\n"
- "ld1 { v11.s }[2], [x13]\n"
- "ld1 { v15.s }[2], [x9]\n"
- "ld1 { v19.s }[2], [x27]\n"
- "ld1 { v23.s }[2], [x25]\n"
- "b 114f\n"
- "107:" // Height 4: Partial accumulate: partial_1_12
+ "ldr d11, [x28], #0x8\n"
+ "ldr d15, [x24], #0x8\n"
+ "ldr d19, [x23], #0x8\n"
+ "ldr d23, [x22], #0x8\n"
+ "tbz x11, #0, 110f\n"
+ "ld1 { v11.s }[2], [x28]\n"
+ "ld1 { v15.s }[2], [x24]\n"
+ "ld1 { v19.s }[2], [x23]\n"
+ "ld1 { v23.s }[2], [x22]\n"
+ "b 110f\n"
+ "103:" // Height 4: Partial accumulate: partial_1_12
"mov x19, #0x30\n"
- "tbz x16, #0, 114f\n"
- "ldr s11, [x13, #0x0]\n"
- "ldr s15, [x9, #0x0]\n"
- "ldr s19, [x27, #0x0]\n"
- "ldr s23, [x25, #0x0]\n"
- "b 114f\n"
- "108:" // Height 4: Partial accumulate: partial_2_8
- "tbz x16, #1, 109f\n"
- "ldr d10, [x13], #0x8\n"
- "ldr d14, [x9], #0x8\n"
- "ldr d18, [x27], #0x8\n"
- "ldr d22, [x25], #0x8\n"
+ "tbz x11, #0, 110f\n"
+ "ldr s11, [x28, #0x0]\n"
+ "ldr s15, [x24, #0x0]\n"
+ "ldr s19, [x23, #0x0]\n"
+ "ldr s23, [x22, #0x0]\n"
+ "b 110f\n"
+ "104:" // Height 4: Partial accumulate: partial_2_8
+ "tbz x11, #1, 105f\n"
+ "ldr d10, [x28], #0x8\n"
+ "ldr d14, [x24], #0x8\n"
"mov x19, #0x28\n"
- "tbz x16, #0, 114f\n"
- "ld1 { v10.s }[2], [x13]\n"
- "ld1 { v14.s }[2], [x9]\n"
- "ld1 { v18.s }[2], [x27]\n"
- "ld1 { v22.s }[2], [x25]\n"
- "b 114f\n"
- "109:" // Height 4: Partial accumulate: partial_1_8
+ "ldr d18, [x23], #0x8\n"
+ "ldr d22, [x22], #0x8\n"
+ "tbz x11, #0, 110f\n"
+ "ld1 { v10.s }[2], [x28]\n"
+ "ld1 { v14.s }[2], [x24]\n"
+ "ld1 { v18.s }[2], [x23]\n"
+ "ld1 { v22.s }[2], [x22]\n"
+ "b 110f\n"
+ "105:" // Height 4: Partial accumulate: partial_1_8
"mov x19, #0x20\n"
- "tbz x16, #0, 114f\n"
- "ldr s10, [x13, #0x0]\n"
- "ldr s14, [x9, #0x0]\n"
- "ldr s18, [x27, #0x0]\n"
- "ldr s22, [x25, #0x0]\n"
- "b 114f\n"
- "110:" // Height 4: Partial accumulate: partial_4_0
- "tbz x16, #2, 112f\n"
- "ld1 { v8.4s }, [x13], #0x10\n"
- "ld1 { v12.4s }, [x9], #0x10\n"
- "ld1 { v16.4s }, [x27], #0x10\n"
- "ld1 { v20.4s }, [x25], #0x10\n"
- "tbz x16, #1, 111f\n"
+ "tbz x11, #0, 110f\n"
+ "ldr s10, [x28, #0x0]\n"
+ "ldr s14, [x24, #0x0]\n"
+ "ldr s18, [x23, #0x0]\n"
+ "ldr s22, [x22, #0x0]\n"
+ "b 110f\n"
+ "106:" // Height 4: Partial accumulate: partial_4_0
+ "tbz x11, #2, 108f\n"
+ "ld1 { v8.4s }, [x28], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "ld1 { v20.4s }, [x22], #0x10\n"
+ "tbz x11, #1, 107f\n"
"mov x19, #0x18\n"
- "ldr d9, [x13], #0x8\n"
- "ldr d13, [x9], #0x8\n"
- "ldr d17, [x27], #0x8\n"
- "ldr d21, [x25], #0x8\n"
- "tbz x16, #0, 114f\n"
- "ld1 { v9.s }[2], [x13]\n"
- "ld1 { v13.s }[2], [x9]\n"
- "ld1 { v17.s }[2], [x27]\n"
- "ld1 { v21.s }[2], [x25]\n"
- "b 114f\n"
- "111:" // Height 4: Partial accumulate: partial_1_4
+ "ldr d9, [x28], #0x8\n"
+ "ldr d13, [x24], #0x8\n"
+ "ldr d17, [x23], #0x8\n"
+ "ldr d21, [x22], #0x8\n"
+ "tbz x11, #0, 110f\n"
+ "ld1 { v9.s }[2], [x28]\n"
+ "ld1 { v13.s }[2], [x24]\n"
+ "ld1 { v17.s }[2], [x23]\n"
+ "ld1 { v21.s }[2], [x22]\n"
+ "b 110f\n"
+ "107:" // Height 4: Partial accumulate: partial_1_4
"mov x19, #0x10\n"
- "tbz x16, #0, 114f\n"
- "ldr s9, [x13, #0x0]\n"
- "ldr s13, [x9, #0x0]\n"
- "ldr s17, [x27, #0x0]\n"
- "ldr s21, [x25, #0x0]\n"
- "b 114f\n"
- "112:" // Height 4: Partial accumulate: partial_2_0
- "tbz x16, #1, 113f\n"
- "ldr d8, [x13], #0x8\n"
- "ldr d12, [x9], #0x8\n"
- "ldr d16, [x27], #0x8\n"
- "ldr d20, [x25], #0x8\n"
+ "tbz x11, #0, 110f\n"
+ "ldr s9, [x28, #0x0]\n"
+ "ldr s13, [x24, #0x0]\n"
+ "ldr s17, [x23, #0x0]\n"
+ "ldr s21, [x22, #0x0]\n"
+ "b 110f\n"
+ "108:" // Height 4: Partial accumulate: partial_2_0
+ "tbz x11, #1, 109f\n"
+ "ldr d8, [x28], #0x8\n"
+ "ldr d12, [x24], #0x8\n"
"mov x19, #0x8\n"
- "tbz x16, #0, 114f\n"
- "ld1 { v8.s }[2], [x13]\n"
- "ld1 { v12.s }[2], [x9]\n"
- "ld1 { v16.s }[2], [x27]\n"
- "ld1 { v20.s }[2], [x25]\n"
- "b 114f\n"
- "113:" // Height 4: Partial accumulate: partial_1_0
+ "ldr d16, [x23], #0x8\n"
+ "ldr d20, [x22], #0x8\n"
+ "tbz x11, #0, 110f\n"
+ "ld1 { v8.s }[2], [x28]\n"
+ "ld1 { v12.s }[2], [x24]\n"
+ "ld1 { v16.s }[2], [x23]\n"
+ "ld1 { v20.s }[2], [x22]\n"
+ "b 110f\n"
+ "109:" // Height 4: Partial accumulate: partial_1_0
+ "ldr s8, [x28, #0x0]\n"
"mov x19, #0x0\n"
- "ldr s8, [x13, #0x0]\n"
- "ldr s12, [x9, #0x0]\n"
- "ldr s16, [x27, #0x0]\n"
- "ldr s20, [x25, #0x0]\n"
- "114:" // Height 4: Partial accumulate: Done
- "sub x13, x13, x19\n"
- "sub x9, x9, x19\n"
- "sub x27, x27, x19\n"
- "sub x25, x25, x19\n"
- "b 117f\n"
- "115:" // Height 4: full accumulate
- "ldr q8, [x13, #0x0]\n"
- "ldr q9, [x13, #0x10]\n"
- "ldr q10, [x13, #0x20]\n"
- "ldr q11, [x13, #0x30]\n"
- "ldr q12, [x9, #0x0]\n"
- "ldr q13, [x9, #0x10]\n"
- "ldr q14, [x9, #0x20]\n"
- "ldr q15, [x9, #0x30]\n"
- "ldr q16, [x27, #0x0]\n"
- "ldr q17, [x27, #0x10]\n"
- "ldr q18, [x27, #0x20]\n"
- "ldr q19, [x27, #0x30]\n"
- "ldr q20, [x25, #0x0]\n"
- "ldr q21, [x25, #0x10]\n"
- "ldr q22, [x25, #0x20]\n"
- "ldr q23, [x25, #0x30]\n"
- "b 117f\n"
- "116:" // Height 4: no accumulate
+ "ldr s12, [x24, #0x0]\n"
+ "ldr s16, [x23, #0x0]\n"
+ "ldr s20, [x22, #0x0]\n"
+ "110:" // Height 4: Partial accumulate: Done
+ "sub x28, x28, x19\n"
+ "b 113f\n"
+ "111:" // Height 4: full accumulate
+ "ldr q8, [x28, #0x0]\n"
+ "ldr q9, [x28, #0x10]\n"
+ "ldr q10, [x28, #0x20]\n"
+ "ldr q11, [x28, #0x30]\n"
+ "ldr q12, [x24, #0x0]\n"
+ "ldr q13, [x24, #0x10]\n"
+ "ldr q14, [x24, #0x20]\n"
+ "ldr q15, [x24, #0x30]\n"
+ "ldr q16, [x23, #0x0]\n"
+ "ldr q17, [x23, #0x10]\n"
+ "ldr q18, [x23, #0x20]\n"
+ "ldr q19, [x23, #0x30]\n"
+ "ldr q20, [x22, #0x0]\n"
+ "ldr q21, [x22, #0x10]\n"
+ "ldr q22, [x22, #0x20]\n"
+ "ldr q23, [x22, #0x30]\n"
+ "b 113f\n"
+ "112:" // Height 4: no accumulate
"movi v8.16b, #0x0\n"
"movi v9.16b, #0x0\n"
"movi v10.16b, #0x0\n"
@@ -1453,220 +1416,220 @@ void a64_hybrid_fp32_mla_6x16 (
"movi v21.16b, #0x0\n"
"movi v22.16b, #0x0\n"
"movi v23.16b, #0x0\n"
- "117:" // Height 4: setup done
- "mov x12, #0x0\n"
- "118:" // Height 4: String loop
+ "113:" // Height 4: setup done
+ "mov x27, #0x0\n"
+ "114:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w11, [x20, x12, LSL #0x2]\n"
- "tbz %x[flags], #3, 119f\n"
- "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 115f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x10, [x20, #0x0]\n"
- "ldr x28, [x20, #0x8]\n"
- "ldr x26, [x20, #0x10]\n"
- "ldr x24, [x20, #0x18]\n"
- "cbnz x12, 120f\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "ldr x22, [x20, #0x18]\n"
+ "cbnz x27, 116f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x10, x10, x19, LSL #2\n"
- "add x28, x28, x19, LSL #2\n"
- "add x26, x26, x19, LSL #2\n"
+ "add x25, x25, x19, LSL #2\n"
"add x24, x24, x19, LSL #2\n"
- "b 120f\n"
- "119:" // Height 4: setup direct input
- "mov x10, %x[input_ptr]\n"
- "add x28, x10, x19, LSL #2\n"
- "add x26, x28, x19, LSL #2\n"
- "add x24, x26, x19, LSL #2\n"
- "120:" // Height 4: input setup done
- "cmp x11, #0x4\n"
- "blt 123f\n"
- "cmp x11, #0x8\n"
- "blt 122f\n"
- "121:" // Height 4: Multiply loop: Main loop head
- "ldr q0, [x10, #0x0]\n"
- "ldr q1, [x28, #0x0]\n"
- "ldr q2, [x26, #0x0]\n"
- "ldr q3, [x24, #0x0]\n"
- "ldr q6, [x15, #0x0]\n"
+ "add x23, x23, x19, LSL #2\n"
+ "add x22, x22, x19, LSL #2\n"
+ "b 116f\n"
+ "115:" // Height 4: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "116:" // Height 4: input setup done
+ "cmp x26, #0x4\n"
+ "blt 119f\n"
+ "ldr q0, [x25, #0x0]\n"
+ "ldr q1, [x24, #0x0]\n"
+ "cmp x26, #0x8\n"
+ "ldr q2, [x23, #0x0]\n"
+ "ldr q3, [x22, #0x0]\n"
+ "ldr q6, [x10, #0x0]\n"
+ "blt 118f\n"
+ "117:" // Height 4: Multiply loop: Main loop head
"fmla v8.4s, v6.4s, v0.s[0]\n"
- "ldr q7, [x15, #0x10]\n"
+ "ldr q7, [x10, #0x10]\n"
+ "add x25, x25, #0x10\n"
"fmla v12.4s, v6.4s, v1.s[0]\n"
- "add x10, x10, #0x10\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x24, x24, #0x10\n"
"fmla v16.4s, v6.4s, v2.s[0]\n"
- "add x28, x28, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "add x23, x23, #0x10\n"
"fmla v20.4s, v6.4s, v3.s[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "ldr q6, [x15, #0x20]\n"
+ "ldr q6, [x10, #0x20]\n"
+ "add x22, x22, #0x10\n"
"fmla v9.4s, v7.4s, v0.s[0]\n"
- "add x26, x26, #0x10\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "sub x26, x26, #0x4\n"
"fmla v13.4s, v7.4s, v1.s[0]\n"
- "add x24, x24, #0x10\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "cmp x26, #0x8\n"
"fmla v17.4s, v7.4s, v2.s[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "sub x11, x11, #0x4\n"
"fmla v21.4s, v7.4s, v3.s[0]\n"
- "ldr q7, [x15, #0x30]\n"
+ "ldr q7, [x10, #0x30]\n"
"fmla v10.4s, v6.4s, v0.s[0]\n"
- "cmp x11, #0x8\n"
"fmla v14.4s, v6.4s, v1.s[0]\n"
"fmla v18.4s, v6.4s, v2.s[0]\n"
"fmla v22.4s, v6.4s, v3.s[0]\n"
- "ldr q6, [x15, #0x40]\n"
+ "ldr q6, [x10, #0x40]\n"
"fmla v11.4s, v7.4s, v0.s[0]\n"
"fmla v15.4s, v7.4s, v1.s[0]\n"
"fmla v19.4s, v7.4s, v2.s[0]\n"
"fmla v23.4s, v7.4s, v3.s[0]\n"
- "ldr q7, [x15, #0x50]\n"
+ "ldr q7, [x10, #0x50]\n"
"fmla v8.4s, v6.4s, v0.s[1]\n"
"fmla v12.4s, v6.4s, v1.s[1]\n"
"fmla v16.4s, v6.4s, v2.s[1]\n"
"fmla v20.4s, v6.4s, v3.s[1]\n"
- "ldr q6, [x15, #0x60]\n"
+ "ldr q6, [x10, #0x60]\n"
"fmla v9.4s, v7.4s, v0.s[1]\n"
"fmla v13.4s, v7.4s, v1.s[1]\n"
"fmla v17.4s, v7.4s, v2.s[1]\n"
"fmla v21.4s, v7.4s, v3.s[1]\n"
- "ldr q7, [x15, #0x70]\n"
+ "ldr q7, [x10, #0x70]\n"
"fmla v10.4s, v6.4s, v0.s[1]\n"
"fmla v14.4s, v6.4s, v1.s[1]\n"
"fmla v18.4s, v6.4s, v2.s[1]\n"
"fmla v22.4s, v6.4s, v3.s[1]\n"
- "ldr q6, [x15, #0x80]\n"
+ "ldr q6, [x10, #0x80]\n"
"fmla v11.4s, v7.4s, v0.s[1]\n"
"fmla v15.4s, v7.4s, v1.s[1]\n"
"fmla v19.4s, v7.4s, v2.s[1]\n"
"fmla v23.4s, v7.4s, v3.s[1]\n"
- "ldr q7, [x15, #0x90]\n"
+ "ldr q7, [x10, #0x90]\n"
"fmla v8.4s, v6.4s, v0.s[2]\n"
"fmla v12.4s, v6.4s, v1.s[2]\n"
"fmla v16.4s, v6.4s, v2.s[2]\n"
"fmla v20.4s, v6.4s, v3.s[2]\n"
- "ldr q6, [x15, #0xa0]\n"
+ "ldr q6, [x10, #0xa0]\n"
"fmla v9.4s, v7.4s, v0.s[2]\n"
"fmla v13.4s, v7.4s, v1.s[2]\n"
"fmla v17.4s, v7.4s, v2.s[2]\n"
"fmla v21.4s, v7.4s, v3.s[2]\n"
- "ldr q7, [x15, #0xb0]\n"
+ "ldr q7, [x10, #0xb0]\n"
"fmla v10.4s, v6.4s, v0.s[2]\n"
"fmla v14.4s, v6.4s, v1.s[2]\n"
"fmla v18.4s, v6.4s, v2.s[2]\n"
"fmla v22.4s, v6.4s, v3.s[2]\n"
- "ldr q6, [x15, #0xc0]\n"
+ "ldr q6, [x10, #0xc0]\n"
"fmla v11.4s, v7.4s, v0.s[2]\n"
"fmla v15.4s, v7.4s, v1.s[2]\n"
"fmla v19.4s, v7.4s, v2.s[2]\n"
"fmla v23.4s, v7.4s, v3.s[2]\n"
- "ldr q7, [x15, #0xd0]\n"
+ "ldr q7, [x10, #0xd0]\n"
"fmla v8.4s, v6.4s, v0.s[3]\n"
"fmla v12.4s, v6.4s, v1.s[3]\n"
"fmla v16.4s, v6.4s, v2.s[3]\n"
"fmla v20.4s, v6.4s, v3.s[3]\n"
- "ldr q6, [x15, #0xe0]\n"
+ "ldr q6, [x10, #0xe0]\n"
"fmla v9.4s, v7.4s, v0.s[3]\n"
"fmla v13.4s, v7.4s, v1.s[3]\n"
"fmla v17.4s, v7.4s, v2.s[3]\n"
"fmla v21.4s, v7.4s, v3.s[3]\n"
- "ldr q7, [x15, #0xf0]\n"
- "add x15, x15, #0x100\n"
+ "ldr q7, [x10, #0xf0]\n"
+ "add x10, x10, #0x100\n"
"fmla v10.4s, v6.4s, v0.s[3]\n"
"fmla v14.4s, v6.4s, v1.s[3]\n"
"fmla v18.4s, v6.4s, v2.s[3]\n"
"fmla v22.4s, v6.4s, v3.s[3]\n"
+ "ldr q6, [x10, #0x0]\n"
"fmla v11.4s, v7.4s, v0.s[3]\n"
+ "ldr q0, [x25, #0x0]\n"
"fmla v15.4s, v7.4s, v1.s[3]\n"
+ "ldr q1, [x24, #0x0]\n"
"fmla v19.4s, v7.4s, v2.s[3]\n"
+ "ldr q2, [x23, #0x0]\n"
"fmla v23.4s, v7.4s, v3.s[3]\n"
- "bge 121b\n"
- "122:" // Height 4: Multiply loop: Single iteration only
- "sub x11, x11, #0x4\n"
- "ldr q0, [x10, #0x0]\n"
- "ldr q1, [x28, #0x0]\n"
- "ldr q2, [x26, #0x0]\n"
- "ldr q3, [x24, #0x0]\n"
- "ldr q6, [x15, #0x0]\n"
+ "ldr q3, [x22, #0x0]\n"
+ "bge 117b\n"
+ "118:" // Height 4: Multiply loop: Single iteration only
"fmla v8.4s, v6.4s, v0.s[0]\n"
- "ldr q7, [x15, #0x10]\n"
+ "ldr q7, [x10, #0x10]\n"
+ "sub x26, x26, #0x4\n"
"fmla v12.4s, v6.4s, v1.s[0]\n"
- "add x10, x10, #0x10\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "add x25, x25, #0x10\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
"fmla v16.4s, v6.4s, v2.s[0]\n"
- "add x28, x28, #0x10\n"
+ "add x24, x24, #0x10\n"
"fmla v20.4s, v6.4s, v3.s[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "ldr q6, [x15, #0x20]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "add x23, x23, #0x10\n"
"fmla v9.4s, v7.4s, v0.s[0]\n"
- "add x26, x26, #0x10\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "ldr q6, [x10, #0x20]\n"
+ "add x22, x22, #0x10\n"
"fmla v13.4s, v7.4s, v1.s[0]\n"
- "add x24, x24, #0x10\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
"fmla v17.4s, v7.4s, v2.s[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
"fmla v21.4s, v7.4s, v3.s[0]\n"
- "ldr q7, [x15, #0x30]\n"
+ "ldr q7, [x10, #0x30]\n"
"fmla v10.4s, v6.4s, v0.s[0]\n"
"fmla v14.4s, v6.4s, v1.s[0]\n"
"fmla v18.4s, v6.4s, v2.s[0]\n"
"fmla v22.4s, v6.4s, v3.s[0]\n"
- "ldr q6, [x15, #0x40]\n"
+ "ldr q6, [x10, #0x40]\n"
"fmla v11.4s, v7.4s, v0.s[0]\n"
"fmla v15.4s, v7.4s, v1.s[0]\n"
"fmla v19.4s, v7.4s, v2.s[0]\n"
"fmla v23.4s, v7.4s, v3.s[0]\n"
- "ldr q7, [x15, #0x50]\n"
+ "ldr q7, [x10, #0x50]\n"
"fmla v8.4s, v6.4s, v0.s[1]\n"
"fmla v12.4s, v6.4s, v1.s[1]\n"
"fmla v16.4s, v6.4s, v2.s[1]\n"
"fmla v20.4s, v6.4s, v3.s[1]\n"
- "ldr q6, [x15, #0x60]\n"
+ "ldr q6, [x10, #0x60]\n"
"fmla v9.4s, v7.4s, v0.s[1]\n"
"fmla v13.4s, v7.4s, v1.s[1]\n"
"fmla v17.4s, v7.4s, v2.s[1]\n"
"fmla v21.4s, v7.4s, v3.s[1]\n"
- "ldr q7, [x15, #0x70]\n"
+ "ldr q7, [x10, #0x70]\n"
"fmla v10.4s, v6.4s, v0.s[1]\n"
"fmla v14.4s, v6.4s, v1.s[1]\n"
"fmla v18.4s, v6.4s, v2.s[1]\n"
"fmla v22.4s, v6.4s, v3.s[1]\n"
- "ldr q6, [x15, #0x80]\n"
+ "ldr q6, [x10, #0x80]\n"
"fmla v11.4s, v7.4s, v0.s[1]\n"
"fmla v15.4s, v7.4s, v1.s[1]\n"
"fmla v19.4s, v7.4s, v2.s[1]\n"
"fmla v23.4s, v7.4s, v3.s[1]\n"
- "ldr q7, [x15, #0x90]\n"
+ "ldr q7, [x10, #0x90]\n"
"fmla v8.4s, v6.4s, v0.s[2]\n"
"fmla v12.4s, v6.4s, v1.s[2]\n"
"fmla v16.4s, v6.4s, v2.s[2]\n"
"fmla v20.4s, v6.4s, v3.s[2]\n"
- "ldr q6, [x15, #0xa0]\n"
+ "ldr q6, [x10, #0xa0]\n"
"fmla v9.4s, v7.4s, v0.s[2]\n"
"fmla v13.4s, v7.4s, v1.s[2]\n"
"fmla v17.4s, v7.4s, v2.s[2]\n"
"fmla v21.4s, v7.4s, v3.s[2]\n"
- "ldr q7, [x15, #0xb0]\n"
+ "ldr q7, [x10, #0xb0]\n"
"fmla v10.4s, v6.4s, v0.s[2]\n"
"fmla v14.4s, v6.4s, v1.s[2]\n"
"fmla v18.4s, v6.4s, v2.s[2]\n"
"fmla v22.4s, v6.4s, v3.s[2]\n"
- "ldr q6, [x15, #0xc0]\n"
+ "ldr q6, [x10, #0xc0]\n"
"fmla v11.4s, v7.4s, v0.s[2]\n"
"fmla v15.4s, v7.4s, v1.s[2]\n"
"fmla v19.4s, v7.4s, v2.s[2]\n"
"fmla v23.4s, v7.4s, v3.s[2]\n"
- "ldr q7, [x15, #0xd0]\n"
+ "ldr q7, [x10, #0xd0]\n"
"fmla v8.4s, v6.4s, v0.s[3]\n"
"fmla v12.4s, v6.4s, v1.s[3]\n"
"fmla v16.4s, v6.4s, v2.s[3]\n"
"fmla v20.4s, v6.4s, v3.s[3]\n"
- "ldr q6, [x15, #0xe0]\n"
+ "ldr q6, [x10, #0xe0]\n"
"fmla v9.4s, v7.4s, v0.s[3]\n"
"fmla v13.4s, v7.4s, v1.s[3]\n"
"fmla v17.4s, v7.4s, v2.s[3]\n"
"fmla v21.4s, v7.4s, v3.s[3]\n"
- "ldr q7, [x15, #0xf0]\n"
- "add x15, x15, #0x100\n"
+ "ldr q7, [x10, #0xf0]\n"
+ "add x10, x10, #0x100\n"
"fmla v10.4s, v6.4s, v0.s[3]\n"
"fmla v14.4s, v6.4s, v1.s[3]\n"
"fmla v18.4s, v6.4s, v2.s[3]\n"
@@ -1675,28 +1638,28 @@ void a64_hybrid_fp32_mla_6x16 (
"fmla v15.4s, v7.4s, v1.s[3]\n"
"fmla v19.4s, v7.4s, v2.s[3]\n"
"fmla v23.4s, v7.4s, v3.s[3]\n"
- "123:" // Height 4: Multiply loop: Main loop skip
- "cbz x11, 125f\n"
- "124:" // Height 4: Multiply loop: Odd block loop
- "ldr s0, [x10], #0x4\n"
- "ldr s1, [x28], #0x4\n"
- "ldr s2, [x26], #0x4\n"
- "ldr s3, [x24], #0x4\n"
- "ldr q6, [x15, #0x0]\n"
+ "119:" // Height 4: Multiply loop: Main loop skip
+ "cbz x26, 121f\n"
+ "120:" // Height 4: Multiply loop: Odd block loop
+ "ldr s0, [x25], #0x4\n"
+ "sub x26, x26, #0x1\n"
+ "ldr s1, [x24], #0x4\n"
+ "ldr s2, [x23], #0x4\n"
+ "ldr s3, [x22], #0x4\n"
+ "ldr q6, [x10, #0x0]\n"
"fmla v8.4s, v6.4s, v0.s[0]\n"
- "ldr q7, [x15, #0x10]\n"
+ "ldr q7, [x10, #0x10]\n"
"fmla v12.4s, v6.4s, v1.s[0]\n"
- "sub x11, x11, #0x1\n"
"fmla v16.4s, v6.4s, v2.s[0]\n"
"fmla v20.4s, v6.4s, v3.s[0]\n"
- "ldr q6, [x15, #0x20]\n"
+ "ldr q6, [x10, #0x20]\n"
"fmla v9.4s, v7.4s, v0.s[0]\n"
"fmla v13.4s, v7.4s, v1.s[0]\n"
"fmla v17.4s, v7.4s, v2.s[0]\n"
"fmla v21.4s, v7.4s, v3.s[0]\n"
- "ldr q7, [x15, #0x30]\n"
+ "ldr q7, [x10, #0x30]\n"
+ "add x10, x10, #0x40\n"
"fmla v10.4s, v6.4s, v0.s[0]\n"
- "add x15, x15, #0x40\n"
"fmla v14.4s, v6.4s, v1.s[0]\n"
"fmla v18.4s, v6.4s, v2.s[0]\n"
"fmla v22.4s, v6.4s, v3.s[0]\n"
@@ -1704,17 +1667,21 @@ void a64_hybrid_fp32_mla_6x16 (
"fmla v15.4s, v7.4s, v1.s[0]\n"
"fmla v19.4s, v7.4s, v2.s[0]\n"
"fmla v23.4s, v7.4s, v3.s[0]\n"
- "cbnz x11, 124b\n"
- "125:" // Height 4: Multiply loop: No odd multiplies
+ "cbnz x26, 120b\n"
+ "121:" // Height 4: Multiply loop: No odd multiplies
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x12, x12, #0x1\n"
- "cmp x12, x19\n"
- "bne 118b\n"
- "prfm pstl1keep, [x13, #0x0]\n"
- "prfm pstl1keep, [x9, #0x0]\n"
- "prfm pstl1keep, [x27, #0x0]\n"
- "prfm pstl1keep, [x25, #0x0]\n"
- "tbz %x[flags], #1, 126f\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 114b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "prfm pstl1keep, [x28, #0x0]\n"
+ "add x24, x28, x19, LSL #2\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "add x23, x24, x19, LSL #2\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "add x22, x23, x19, LSL #2\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "tbz %x[flags], #1, 122f\n"
"add x19, %x[args_ptr], %[offset_min]\n"
"ld1r { v1.4s }, [x19]\n"
"add x19, %x[args_ptr], %[offset_max]\n"
@@ -1751,162 +1718,141 @@ void a64_hybrid_fp32_mla_6x16 (
"fmax v21.4s, v21.4s, v1.4s\n"
"fmax v22.4s, v22.4s, v1.4s\n"
"fmax v23.4s, v23.4s, v1.4s\n"
- "126:" // Height 4: No activation
- "cmp x16, #0x10\n"
- "bge 135f\n"
- "tbz x16, #3, 130f\n"
- "st1 { v8.4s }, [x13], #0x10\n"
- "st1 { v9.4s }, [x13], #0x10\n"
- "st1 { v12.4s }, [x9], #0x10\n"
- "st1 { v13.4s }, [x9], #0x10\n"
- "st1 { v16.4s }, [x27], #0x10\n"
- "st1 { v17.4s }, [x27], #0x10\n"
- "st1 { v20.4s }, [x25], #0x10\n"
- "st1 { v21.4s }, [x25], #0x10\n"
- "tbz x16, #2, 128f\n"
- "st1 { v10.4s }, [x13], #0x10\n"
- "st1 { v14.4s }, [x9], #0x10\n"
- "st1 { v18.4s }, [x27], #0x10\n"
- "st1 { v22.4s }, [x25], #0x10\n"
- "tbz x16, #1, 127f\n"
- "str d11, [x13], #0x8\n"
- "str d15, [x9], #0x8\n"
- "str d19, [x27], #0x8\n"
- "str d23, [x25], #0x8\n"
- "tbz x16, #0, 134f\n"
- "st1 { v11.s }[2], [x13]\n"
- "st1 { v15.s }[2], [x9]\n"
- "st1 { v19.s }[2], [x27]\n"
- "st1 { v23.s }[2], [x25]\n"
- "b 134f\n"
- "127:" // Height 4: Partial direct writeback: partial_1_12
- "tbz x16, #0, 134f\n"
- "str s11, [x13, #0x0]\n"
- "str s15, [x9, #0x0]\n"
- "str s19, [x27, #0x0]\n"
- "str s23, [x25, #0x0]\n"
- "b 134f\n"
- "128:" // Height 4: Partial direct writeback: partial_2_8
- "tbz x16, #1, 129f\n"
- "str d10, [x13], #0x8\n"
- "str d14, [x9], #0x8\n"
- "str d18, [x27], #0x8\n"
- "str d22, [x25], #0x8\n"
- "tbz x16, #0, 134f\n"
- "st1 { v10.s }[2], [x13]\n"
- "st1 { v14.s }[2], [x9]\n"
- "st1 { v18.s }[2], [x27]\n"
- "st1 { v22.s }[2], [x25]\n"
- "b 134f\n"
- "129:" // Height 4: Partial direct writeback: partial_1_8
- "tbz x16, #0, 134f\n"
- "str s10, [x13, #0x0]\n"
- "str s14, [x9, #0x0]\n"
- "str s18, [x27, #0x0]\n"
- "str s22, [x25, #0x0]\n"
- "b 134f\n"
- "130:" // Height 4: Partial direct writeback: partial_4_0
- "tbz x16, #2, 132f\n"
- "st1 { v8.4s }, [x13], #0x10\n"
- "st1 { v12.4s }, [x9], #0x10\n"
- "st1 { v16.4s }, [x27], #0x10\n"
- "st1 { v20.4s }, [x25], #0x10\n"
- "tbz x16, #1, 131f\n"
- "str d9, [x13], #0x8\n"
- "str d13, [x9], #0x8\n"
- "str d17, [x27], #0x8\n"
- "str d21, [x25], #0x8\n"
- "tbz x16, #0, 134f\n"
- "st1 { v9.s }[2], [x13]\n"
- "st1 { v13.s }[2], [x9]\n"
- "st1 { v17.s }[2], [x27]\n"
- "st1 { v21.s }[2], [x25]\n"
- "b 134f\n"
- "131:" // Height 4: Partial direct writeback: partial_1_4
- "tbz x16, #0, 134f\n"
- "str s9, [x13, #0x0]\n"
- "str s13, [x9, #0x0]\n"
- "str s17, [x27, #0x0]\n"
- "str s21, [x25, #0x0]\n"
- "b 134f\n"
- "132:" // Height 4: Partial direct writeback: partial_2_0
- "tbz x16, #1, 133f\n"
- "str d8, [x13], #0x8\n"
- "str d12, [x9], #0x8\n"
- "str d16, [x27], #0x8\n"
- "str d20, [x25], #0x8\n"
- "tbz x16, #0, 134f\n"
- "st1 { v8.s }[2], [x13]\n"
- "st1 { v12.s }[2], [x9]\n"
- "st1 { v16.s }[2], [x27]\n"
- "st1 { v20.s }[2], [x25]\n"
- "b 134f\n"
- "133:" // Height 4: Partial direct writeback: partial_1_0
- "str s8, [x13, #0x0]\n"
- "str s12, [x9, #0x0]\n"
- "str s16, [x27, #0x0]\n"
- "str s20, [x25, #0x0]\n"
- "134:" // Height 4: Partial direct writeback: Done
- "b 136f\n"
- "135:" // Height 4: Full writeback
- "str q8, [x13, #0x0]\n"
- "str q9, [x13, #0x10]\n"
- "str q10, [x13, #0x20]\n"
- "str q11, [x13, #0x30]\n"
- "str q12, [x9, #0x0]\n"
- "str q13, [x9, #0x10]\n"
- "str q14, [x9, #0x20]\n"
- "str q15, [x9, #0x30]\n"
- "str q16, [x27, #0x0]\n"
- "str q17, [x27, #0x10]\n"
- "str q18, [x27, #0x20]\n"
- "str q19, [x27, #0x30]\n"
- "str q20, [x25, #0x0]\n"
- "str q21, [x25, #0x10]\n"
- "str q22, [x25, #0x20]\n"
- "str q23, [x25, #0x30]\n"
- "add x13, x13, #0x40\n"
- "add x9, x9, #0x40\n"
- "add x27, x27, #0x40\n"
- "add x25, x25, #0x40\n"
- "136:" // Height 4: Writeback done
- "subs x16, x16, #0x10\n"
- "bgt 105b\n"
- "b 206f\n"
- "137:" // Height 5
- "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x14, %x[bias]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "tbz %x[flags], #2, 138f\n"
- "ldr x13, [%x[output_ptr], #0x0]\n"
- "add x13, x13, x19, LSL #2\n"
- "ldr x9, [%x[output_ptr], #0x8]\n"
- "ldr x27, [%x[output_ptr], #0x10]\n"
- "add x9, x9, x19, LSL #2\n"
- "ldr x25, [%x[output_ptr], #0x18]\n"
- "ldr x23, [%x[output_ptr], #0x20]\n"
- "add x27, x27, x19, LSL #2\n"
- "add x25, x25, x19, LSL #2\n"
- "add x23, x23, x19, LSL #2\n"
- "b 139f\n"
- "138:" // Height 5: setup direct output
- "mov x13, %x[output_ptr]\n"
- "add x9, x13, x19, LSL #2\n"
- "add x27, x9, x19, LSL #2\n"
- "add x25, x27, x19, LSL #2\n"
- "add x23, x25, x19, LSL #2\n"
- "139:" // Height 5: Column loop
- "cbz x14, 140f\n"
- "ldr q8, [x14, #0x0]\n"
+ "122:" // Height 4: No activation
+ "cmp x11, #0x10\n"
+ "bge 131f\n"
+ "tbz x11, #3, 126f\n"
+ "st1 { v8.4s }, [x28], #0x10\n"
+ "st1 { v9.4s }, [x28], #0x10\n"
+ "st1 { v12.4s }, [x24], #0x10\n"
+ "st1 { v13.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v17.4s }, [x23], #0x10\n"
+ "st1 { v20.4s }, [x22], #0x10\n"
+ "st1 { v21.4s }, [x22], #0x10\n"
+ "tbz x11, #2, 124f\n"
+ "st1 { v10.4s }, [x28], #0x10\n"
+ "st1 { v14.4s }, [x24], #0x10\n"
+ "st1 { v18.4s }, [x23], #0x10\n"
+ "st1 { v22.4s }, [x22], #0x10\n"
+ "tbz x11, #1, 123f\n"
+ "str d11, [x28], #0x8\n"
+ "str d15, [x24], #0x8\n"
+ "str d19, [x23], #0x8\n"
+ "str d23, [x22], #0x8\n"
+ "tbz x11, #0, 130f\n"
+ "st1 { v11.s }[2], [x28]\n"
+ "st1 { v15.s }[2], [x24]\n"
+ "st1 { v19.s }[2], [x23]\n"
+ "st1 { v23.s }[2], [x22]\n"
+ "b 130f\n"
+ "123:" // Height 4: Partial direct writeback: partial_1_12
+ "tbz x11, #0, 130f\n"
+ "str s11, [x28, #0x0]\n"
+ "str s15, [x24, #0x0]\n"
+ "str s19, [x23, #0x0]\n"
+ "str s23, [x22, #0x0]\n"
+ "b 130f\n"
+ "124:" // Height 4: Partial direct writeback: partial_2_8
+ "tbz x11, #1, 125f\n"
+ "str d10, [x28], #0x8\n"
+ "str d14, [x24], #0x8\n"
+ "str d18, [x23], #0x8\n"
+ "str d22, [x22], #0x8\n"
+ "tbz x11, #0, 130f\n"
+ "st1 { v10.s }[2], [x28]\n"
+ "st1 { v14.s }[2], [x24]\n"
+ "st1 { v18.s }[2], [x23]\n"
+ "st1 { v22.s }[2], [x22]\n"
+ "b 130f\n"
+ "125:" // Height 4: Partial direct writeback: partial_1_8
+ "tbz x11, #0, 130f\n"
+ "str s10, [x28, #0x0]\n"
+ "str s14, [x24, #0x0]\n"
+ "str s18, [x23, #0x0]\n"
+ "str s22, [x22, #0x0]\n"
+ "b 130f\n"
+ "126:" // Height 4: Partial direct writeback: partial_4_0
+ "tbz x11, #2, 128f\n"
+ "st1 { v8.4s }, [x28], #0x10\n"
+ "st1 { v12.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v20.4s }, [x22], #0x10\n"
+ "tbz x11, #1, 127f\n"
+ "str d9, [x28], #0x8\n"
+ "str d13, [x24], #0x8\n"
+ "str d17, [x23], #0x8\n"
+ "str d21, [x22], #0x8\n"
+ "tbz x11, #0, 130f\n"
+ "st1 { v9.s }[2], [x28]\n"
+ "st1 { v13.s }[2], [x24]\n"
+ "st1 { v17.s }[2], [x23]\n"
+ "st1 { v21.s }[2], [x22]\n"
+ "b 130f\n"
+ "127:" // Height 4: Partial direct writeback: partial_1_4
+ "tbz x11, #0, 130f\n"
+ "str s9, [x28, #0x0]\n"
+ "str s13, [x24, #0x0]\n"
+ "str s17, [x23, #0x0]\n"
+ "str s21, [x22, #0x0]\n"
+ "b 130f\n"
+ "128:" // Height 4: Partial direct writeback: partial_2_0
+ "tbz x11, #1, 129f\n"
+ "str d8, [x28], #0x8\n"
+ "str d12, [x24], #0x8\n"
+ "str d16, [x23], #0x8\n"
+ "str d20, [x22], #0x8\n"
+ "tbz x11, #0, 130f\n"
+ "st1 { v8.s }[2], [x28]\n"
+ "st1 { v12.s }[2], [x24]\n"
+ "st1 { v16.s }[2], [x23]\n"
+ "st1 { v20.s }[2], [x22]\n"
+ "b 130f\n"
+ "129:" // Height 4: Partial direct writeback: partial_1_0
+ "str s8, [x28, #0x0]\n"
+ "str s12, [x24, #0x0]\n"
+ "str s16, [x23, #0x0]\n"
+ "str s20, [x22, #0x0]\n"
+ "130:" // Height 4: Partial direct writeback: Done
+ "b 132f\n"
+ "131:" // Height 4: Full writeback
+ "str q8, [x28, #0x0]\n"
+ "str q9, [x28, #0x10]\n"
+ "str q10, [x28, #0x20]\n"
+ "str q11, [x28, #0x30]\n"
+ "add x28, x28, #0x40\n"
+ "str q12, [x24, #0x0]\n"
+ "str q13, [x24, #0x10]\n"
+ "str q14, [x24, #0x20]\n"
+ "str q15, [x24, #0x30]\n"
+ "str q16, [x23, #0x0]\n"
+ "str q17, [x23, #0x10]\n"
+ "str q18, [x23, #0x20]\n"
+ "str q19, [x23, #0x30]\n"
+ "str q20, [x22, #0x0]\n"
+ "str q21, [x22, #0x10]\n"
+ "str q22, [x22, #0x20]\n"
+ "str q23, [x22, #0x30]\n"
+ "132:" // Height 4: Writeback done
+ "subs x11, x11, #0x10\n"
+ "bgt 101b\n"
+ "b 200f\n"
+ "133:" // Height 5
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x9, %x[bias]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "134:" // Height 5: Column loop
+ "cbz x9, 135f\n"
+ "ldr q8, [x9, #0x0]\n"
"mov v12.16b, v8.16b\n"
- "ldr q9, [x14, #0x10]\n"
+ "ldr q9, [x9, #0x10]\n"
"mov v16.16b, v8.16b\n"
- "ldr q10, [x14, #0x20]\n"
+ "ldr q10, [x9, #0x20]\n"
"mov v20.16b, v8.16b\n"
- "ldr q11, [x14, #0x30]\n"
+ "ldr q11, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
"mov v24.16b, v8.16b\n"
- "add x14, x14, #0x40\n"
"mov v13.16b, v9.16b\n"
"mov v17.16b, v9.16b\n"
"mov v14.16b, v10.16b\n"
@@ -1919,157 +1865,158 @@ void a64_hybrid_fp32_mla_6x16 (
"mov v25.16b, v9.16b\n"
"mov v26.16b, v10.16b\n"
"mov v27.16b, v11.16b\n"
- "b 151f\n"
- "140:" // Height 5: no bias
- "tbz %x[flags], #0, 150f\n"
- "cmp x16, #0x10\n"
- "bge 149f\n"
- "tbz x16, #3, 144f\n"
- "ld1 { v8.4s }, [x13], #0x10\n"
- "ld1 { v12.4s }, [x9], #0x10\n"
- "ld1 { v16.4s }, [x27], #0x10\n"
- "ld1 { v20.4s }, [x25], #0x10\n"
- "ld1 { v24.4s }, [x23], #0x10\n"
- "ld1 { v9.4s }, [x13], #0x10\n"
- "ld1 { v13.4s }, [x9], #0x10\n"
- "ld1 { v17.4s }, [x27], #0x10\n"
- "ld1 { v21.4s }, [x25], #0x10\n"
- "ld1 { v25.4s }, [x23], #0x10\n"
- "tbz x16, #2, 142f\n"
- "ld1 { v10.4s }, [x13], #0x10\n"
- "ld1 { v14.4s }, [x9], #0x10\n"
- "ld1 { v18.4s }, [x27], #0x10\n"
- "ld1 { v22.4s }, [x25], #0x10\n"
- "ld1 { v26.4s }, [x23], #0x10\n"
- "tbz x16, #1, 141f\n"
+ "b 146f\n"
+ "135:" // Height 5: no bias
+ "tbz %x[flags], #0, 145f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x11, #0x10\n"
+ "add x24, x28, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "add x21, x22, x19, LSL #2\n"
+ "bge 144f\n"
+ "tbz x11, #3, 139f\n"
+ "ld1 { v8.4s }, [x28], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "ld1 { v20.4s }, [x22], #0x10\n"
+ "ld1 { v24.4s }, [x21], #0x10\n"
+ "ld1 { v9.4s }, [x28], #0x10\n"
+ "ld1 { v13.4s }, [x24], #0x10\n"
+ "ld1 { v17.4s }, [x23], #0x10\n"
+ "ld1 { v21.4s }, [x22], #0x10\n"
+ "ld1 { v25.4s }, [x21], #0x10\n"
+ "tbz x11, #2, 137f\n"
+ "ld1 { v10.4s }, [x28], #0x10\n"
+ "ld1 { v14.4s }, [x24], #0x10\n"
+ "ld1 { v18.4s }, [x23], #0x10\n"
+ "ld1 { v22.4s }, [x22], #0x10\n"
+ "ld1 { v26.4s }, [x21], #0x10\n"
+ "tbz x11, #1, 136f\n"
"mov x19, #0x38\n"
- "ldr d11, [x13], #0x8\n"
- "ldr d15, [x9], #0x8\n"
- "ldr d19, [x27], #0x8\n"
- "ldr d23, [x25], #0x8\n"
- "ldr d27, [x23], #0x8\n"
- "tbz x16, #0, 148f\n"
- "ld1 { v11.s }[2], [x13]\n"
- "ld1 { v15.s }[2], [x9]\n"
- "ld1 { v19.s }[2], [x27]\n"
- "ld1 { v23.s }[2], [x25]\n"
- "ld1 { v27.s }[2], [x23]\n"
- "b 148f\n"
- "141:" // Height 5: Partial accumulate: partial_1_12
+ "ldr d11, [x28], #0x8\n"
+ "ldr d15, [x24], #0x8\n"
+ "ldr d19, [x23], #0x8\n"
+ "ldr d23, [x22], #0x8\n"
+ "ldr d27, [x21], #0x8\n"
+ "tbz x11, #0, 143f\n"
+ "ld1 { v11.s }[2], [x28]\n"
+ "ld1 { v15.s }[2], [x24]\n"
+ "ld1 { v19.s }[2], [x23]\n"
+ "ld1 { v23.s }[2], [x22]\n"
+ "ld1 { v27.s }[2], [x21]\n"
+ "b 143f\n"
+ "136:" // Height 5: Partial accumulate: partial_1_12
"mov x19, #0x30\n"
- "tbz x16, #0, 148f\n"
- "ldr s11, [x13, #0x0]\n"
- "ldr s15, [x9, #0x0]\n"
- "ldr s19, [x27, #0x0]\n"
- "ldr s23, [x25, #0x0]\n"
- "ldr s27, [x23, #0x0]\n"
- "b 148f\n"
- "142:" // Height 5: Partial accumulate: partial_2_8
- "tbz x16, #1, 143f\n"
- "ldr d10, [x13], #0x8\n"
- "ldr d14, [x9], #0x8\n"
- "ldr d18, [x27], #0x8\n"
- "ldr d22, [x25], #0x8\n"
- "ldr d26, [x23], #0x8\n"
+ "tbz x11, #0, 143f\n"
+ "ldr s11, [x28, #0x0]\n"
+ "ldr s15, [x24, #0x0]\n"
+ "ldr s19, [x23, #0x0]\n"
+ "ldr s23, [x22, #0x0]\n"
+ "ldr s27, [x21, #0x0]\n"
+ "b 143f\n"
+ "137:" // Height 5: Partial accumulate: partial_2_8
+ "tbz x11, #1, 138f\n"
+ "ldr d10, [x28], #0x8\n"
+ "ldr d14, [x24], #0x8\n"
"mov x19, #0x28\n"
- "tbz x16, #0, 148f\n"
- "ld1 { v10.s }[2], [x13]\n"
- "ld1 { v14.s }[2], [x9]\n"
- "ld1 { v18.s }[2], [x27]\n"
- "ld1 { v22.s }[2], [x25]\n"
- "ld1 { v26.s }[2], [x23]\n"
- "b 148f\n"
- "143:" // Height 5: Partial accumulate: partial_1_8
+ "ldr d18, [x23], #0x8\n"
+ "ldr d22, [x22], #0x8\n"
+ "ldr d26, [x21], #0x8\n"
+ "tbz x11, #0, 143f\n"
+ "ld1 { v10.s }[2], [x28]\n"
+ "ld1 { v14.s }[2], [x24]\n"
+ "ld1 { v18.s }[2], [x23]\n"
+ "ld1 { v22.s }[2], [x22]\n"
+ "ld1 { v26.s }[2], [x21]\n"
+ "b 143f\n"
+ "138:" // Height 5: Partial accumulate: partial_1_8
"mov x19, #0x20\n"
- "tbz x16, #0, 148f\n"
- "ldr s10, [x13, #0x0]\n"
- "ldr s14, [x9, #0x0]\n"
- "ldr s18, [x27, #0x0]\n"
- "ldr s22, [x25, #0x0]\n"
- "ldr s26, [x23, #0x0]\n"
- "b 148f\n"
- "144:" // Height 5: Partial accumulate: partial_4_0
- "tbz x16, #2, 146f\n"
- "ld1 { v8.4s }, [x13], #0x10\n"
- "ld1 { v12.4s }, [x9], #0x10\n"
- "ld1 { v16.4s }, [x27], #0x10\n"
- "ld1 { v20.4s }, [x25], #0x10\n"
- "ld1 { v24.4s }, [x23], #0x10\n"
- "tbz x16, #1, 145f\n"
+ "tbz x11, #0, 143f\n"
+ "ldr s10, [x28, #0x0]\n"
+ "ldr s14, [x24, #0x0]\n"
+ "ldr s18, [x23, #0x0]\n"
+ "ldr s22, [x22, #0x0]\n"
+ "ldr s26, [x21, #0x0]\n"
+ "b 143f\n"
+ "139:" // Height 5: Partial accumulate: partial_4_0
+ "tbz x11, #2, 141f\n"
+ "ld1 { v8.4s }, [x28], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "ld1 { v20.4s }, [x22], #0x10\n"
+ "ld1 { v24.4s }, [x21], #0x10\n"
+ "tbz x11, #1, 140f\n"
"mov x19, #0x18\n"
- "ldr d9, [x13], #0x8\n"
- "ldr d13, [x9], #0x8\n"
- "ldr d17, [x27], #0x8\n"
- "ldr d21, [x25], #0x8\n"
- "ldr d25, [x23], #0x8\n"
- "tbz x16, #0, 148f\n"
- "ld1 { v9.s }[2], [x13]\n"
- "ld1 { v13.s }[2], [x9]\n"
- "ld1 { v17.s }[2], [x27]\n"
- "ld1 { v21.s }[2], [x25]\n"
- "ld1 { v25.s }[2], [x23]\n"
- "b 148f\n"
- "145:" // Height 5: Partial accumulate: partial_1_4
+ "ldr d9, [x28], #0x8\n"
+ "ldr d13, [x24], #0x8\n"
+ "ldr d17, [x23], #0x8\n"
+ "ldr d21, [x22], #0x8\n"
+ "ldr d25, [x21], #0x8\n"
+ "tbz x11, #0, 143f\n"
+ "ld1 { v9.s }[2], [x28]\n"
+ "ld1 { v13.s }[2], [x24]\n"
+ "ld1 { v17.s }[2], [x23]\n"
+ "ld1 { v21.s }[2], [x22]\n"
+ "ld1 { v25.s }[2], [x21]\n"
+ "b 143f\n"
+ "140:" // Height 5: Partial accumulate: partial_1_4
"mov x19, #0x10\n"
- "tbz x16, #0, 148f\n"
- "ldr s9, [x13, #0x0]\n"
- "ldr s13, [x9, #0x0]\n"
- "ldr s17, [x27, #0x0]\n"
- "ldr s21, [x25, #0x0]\n"
- "ldr s25, [x23, #0x0]\n"
- "b 148f\n"
- "146:" // Height 5: Partial accumulate: partial_2_0
- "tbz x16, #1, 147f\n"
- "ldr d8, [x13], #0x8\n"
- "ldr d12, [x9], #0x8\n"
- "ldr d16, [x27], #0x8\n"
- "ldr d20, [x25], #0x8\n"
- "ldr d24, [x23], #0x8\n"
+ "tbz x11, #0, 143f\n"
+ "ldr s9, [x28, #0x0]\n"
+ "ldr s13, [x24, #0x0]\n"
+ "ldr s17, [x23, #0x0]\n"
+ "ldr s21, [x22, #0x0]\n"
+ "ldr s25, [x21, #0x0]\n"
+ "b 143f\n"
+ "141:" // Height 5: Partial accumulate: partial_2_0
+ "tbz x11, #1, 142f\n"
+ "ldr d8, [x28], #0x8\n"
+ "ldr d12, [x24], #0x8\n"
"mov x19, #0x8\n"
- "tbz x16, #0, 148f\n"
- "ld1 { v8.s }[2], [x13]\n"
- "ld1 { v12.s }[2], [x9]\n"
- "ld1 { v16.s }[2], [x27]\n"
- "ld1 { v20.s }[2], [x25]\n"
- "ld1 { v24.s }[2], [x23]\n"
- "b 148f\n"
- "147:" // Height 5: Partial accumulate: partial_1_0
+ "ldr d16, [x23], #0x8\n"
+ "ldr d20, [x22], #0x8\n"
+ "ldr d24, [x21], #0x8\n"
+ "tbz x11, #0, 143f\n"
+ "ld1 { v8.s }[2], [x28]\n"
+ "ld1 { v12.s }[2], [x24]\n"
+ "ld1 { v16.s }[2], [x23]\n"
+ "ld1 { v20.s }[2], [x22]\n"
+ "ld1 { v24.s }[2], [x21]\n"
+ "b 143f\n"
+ "142:" // Height 5: Partial accumulate: partial_1_0
+ "ldr s8, [x28, #0x0]\n"
"mov x19, #0x0\n"
- "ldr s8, [x13, #0x0]\n"
- "ldr s12, [x9, #0x0]\n"
- "ldr s16, [x27, #0x0]\n"
- "ldr s20, [x25, #0x0]\n"
- "ldr s24, [x23, #0x0]\n"
- "148:" // Height 5: Partial accumulate: Done
- "sub x13, x13, x19\n"
- "sub x9, x9, x19\n"
- "sub x27, x27, x19\n"
- "sub x25, x25, x19\n"
- "sub x23, x23, x19\n"
- "b 151f\n"
- "149:" // Height 5: full accumulate
- "ldr q8, [x13, #0x0]\n"
- "ldr q9, [x13, #0x10]\n"
- "ldr q10, [x13, #0x20]\n"
- "ldr q11, [x13, #0x30]\n"
- "ldr q12, [x9, #0x0]\n"
- "ldr q13, [x9, #0x10]\n"
- "ldr q14, [x9, #0x20]\n"
- "ldr q15, [x9, #0x30]\n"
- "ldr q16, [x27, #0x0]\n"
- "ldr q17, [x27, #0x10]\n"
- "ldr q18, [x27, #0x20]\n"
- "ldr q19, [x27, #0x30]\n"
- "ldr q20, [x25, #0x0]\n"
- "ldr q21, [x25, #0x10]\n"
- "ldr q22, [x25, #0x20]\n"
- "ldr q23, [x25, #0x30]\n"
- "ldr q24, [x23, #0x0]\n"
- "ldr q25, [x23, #0x10]\n"
- "ldr q26, [x23, #0x20]\n"
- "ldr q27, [x23, #0x30]\n"
- "b 151f\n"
- "150:" // Height 5: no accumulate
+ "ldr s12, [x24, #0x0]\n"
+ "ldr s16, [x23, #0x0]\n"
+ "ldr s20, [x22, #0x0]\n"
+ "ldr s24, [x21, #0x0]\n"
+ "143:" // Height 5: Partial accumulate: Done
+ "sub x28, x28, x19\n"
+ "b 146f\n"
+ "144:" // Height 5: full accumulate
+ "ldr q8, [x28, #0x0]\n"
+ "ldr q9, [x28, #0x10]\n"
+ "ldr q10, [x28, #0x20]\n"
+ "ldr q11, [x28, #0x30]\n"
+ "ldr q12, [x24, #0x0]\n"
+ "ldr q13, [x24, #0x10]\n"
+ "ldr q14, [x24, #0x20]\n"
+ "ldr q15, [x24, #0x30]\n"
+ "ldr q16, [x23, #0x0]\n"
+ "ldr q17, [x23, #0x10]\n"
+ "ldr q18, [x23, #0x20]\n"
+ "ldr q19, [x23, #0x30]\n"
+ "ldr q20, [x22, #0x0]\n"
+ "ldr q21, [x22, #0x10]\n"
+ "ldr q22, [x22, #0x20]\n"
+ "ldr q23, [x22, #0x30]\n"
+ "ldr q24, [x21, #0x0]\n"
+ "ldr q25, [x21, #0x10]\n"
+ "ldr q26, [x21, #0x20]\n"
+ "ldr q27, [x21, #0x30]\n"
+ "b 146f\n"
+ "145:" // Height 5: no accumulate
"movi v8.16b, #0x0\n"
"movi v9.16b, #0x0\n"
"movi v10.16b, #0x0\n"
@@ -2090,260 +2037,260 @@ void a64_hybrid_fp32_mla_6x16 (
"movi v25.16b, #0x0\n"
"movi v26.16b, #0x0\n"
"movi v27.16b, #0x0\n"
- "151:" // Height 5: setup done
- "mov x12, #0x0\n"
- "152:" // Height 5: String loop
+ "146:" // Height 5: setup done
+ "mov x27, #0x0\n"
+ "147:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w11, [x20, x12, LSL #0x2]\n"
- "tbz %x[flags], #3, 153f\n"
- "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 148f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x10, [x20, #0x0]\n"
- "ldr x28, [x20, #0x8]\n"
- "ldr x26, [x20, #0x10]\n"
- "ldr x24, [x20, #0x18]\n"
- "ldr x22, [x20, #0x20]\n"
- "cbnz x12, 154f\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "ldr x22, [x20, #0x18]\n"
+ "ldr x21, [x20, #0x20]\n"
+ "cbnz x27, 149f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x10, x10, x19, LSL #2\n"
- "add x28, x28, x19, LSL #2\n"
- "add x26, x26, x19, LSL #2\n"
+ "add x25, x25, x19, LSL #2\n"
"add x24, x24, x19, LSL #2\n"
+ "add x23, x23, x19, LSL #2\n"
"add x22, x22, x19, LSL #2\n"
- "b 154f\n"
- "153:" // Height 5: setup direct input
- "mov x10, %x[input_ptr]\n"
- "add x28, x10, x19, LSL #2\n"
- "add x26, x28, x19, LSL #2\n"
- "add x24, x26, x19, LSL #2\n"
- "add x22, x24, x19, LSL #2\n"
- "154:" // Height 5: input setup done
- "cmp x11, #0x4\n"
- "blt 157f\n"
- "cmp x11, #0x8\n"
- "blt 156f\n"
- "155:" // Height 5: Multiply loop: Main loop head
- "ldr q0, [x10, #0x0]\n"
- "ldr q1, [x28, #0x0]\n"
- "ldr q2, [x26, #0x0]\n"
- "ldr q3, [x24, #0x0]\n"
- "ldr q4, [x22, #0x0]\n"
- "ldr q6, [x15, #0x0]\n"
+ "add x21, x21, x19, LSL #2\n"
+ "b 149f\n"
+ "148:" // Height 5: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "add x21, x22, x19, LSL #2\n"
+ "149:" // Height 5: input setup done
+ "cmp x26, #0x4\n"
+ "blt 152f\n"
+ "ldr q0, [x25, #0x0]\n"
+ "ldr q1, [x24, #0x0]\n"
+ "cmp x26, #0x8\n"
+ "ldr q2, [x23, #0x0]\n"
+ "ldr q3, [x22, #0x0]\n"
+ "ldr q4, [x21, #0x0]\n"
+ "ldr q6, [x10, #0x0]\n"
+ "blt 151f\n"
+ "150:" // Height 5: Multiply loop: Main loop head
"fmla v8.4s, v6.4s, v0.s[0]\n"
- "ldr q7, [x15, #0x10]\n"
+ "ldr q7, [x10, #0x10]\n"
+ "add x25, x25, #0x10\n"
"fmla v12.4s, v6.4s, v1.s[0]\n"
- "add x10, x10, #0x10\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x24, x24, #0x10\n"
"fmla v16.4s, v6.4s, v2.s[0]\n"
- "add x28, x28, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "add x23, x23, #0x10\n"
"fmla v20.4s, v6.4s, v3.s[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "add x26, x26, #0x10\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "add x22, x22, #0x10\n"
"fmla v24.4s, v6.4s, v4.s[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "ldr q6, [x15, #0x20]\n"
+ "ldr q6, [x10, #0x20]\n"
+ "add x21, x21, #0x10\n"
"fmla v9.4s, v7.4s, v0.s[0]\n"
- "add x24, x24, #0x10\n"
- "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "sub x26, x26, #0x4\n"
"fmla v13.4s, v7.4s, v1.s[0]\n"
- "add x22, x22, #0x10\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
+ "cmp x26, #0x8\n"
"fmla v17.4s, v7.4s, v2.s[0]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "sub x11, x11, #0x4\n"
"fmla v21.4s, v7.4s, v3.s[0]\n"
- "cmp x11, #0x8\n"
"fmla v25.4s, v7.4s, v4.s[0]\n"
- "ldr q7, [x15, #0x30]\n"
+ "ldr q7, [x10, #0x30]\n"
"fmla v10.4s, v6.4s, v0.s[0]\n"
"fmla v14.4s, v6.4s, v1.s[0]\n"
"fmla v18.4s, v6.4s, v2.s[0]\n"
"fmla v22.4s, v6.4s, v3.s[0]\n"
"fmla v26.4s, v6.4s, v4.s[0]\n"
- "ldr q6, [x15, #0x40]\n"
+ "ldr q6, [x10, #0x40]\n"
"fmla v11.4s, v7.4s, v0.s[0]\n"
"fmla v15.4s, v7.4s, v1.s[0]\n"
"fmla v19.4s, v7.4s, v2.s[0]\n"
"fmla v23.4s, v7.4s, v3.s[0]\n"
"fmla v27.4s, v7.4s, v4.s[0]\n"
- "ldr q7, [x15, #0x50]\n"
+ "ldr q7, [x10, #0x50]\n"
"fmla v8.4s, v6.4s, v0.s[1]\n"
"fmla v12.4s, v6.4s, v1.s[1]\n"
"fmla v16.4s, v6.4s, v2.s[1]\n"
"fmla v20.4s, v6.4s, v3.s[1]\n"
"fmla v24.4s, v6.4s, v4.s[1]\n"
- "ldr q6, [x15, #0x60]\n"
+ "ldr q6, [x10, #0x60]\n"
"fmla v9.4s, v7.4s, v0.s[1]\n"
"fmla v13.4s, v7.4s, v1.s[1]\n"
"fmla v17.4s, v7.4s, v2.s[1]\n"
"fmla v21.4s, v7.4s, v3.s[1]\n"
"fmla v25.4s, v7.4s, v4.s[1]\n"
- "ldr q7, [x15, #0x70]\n"
+ "ldr q7, [x10, #0x70]\n"
"fmla v10.4s, v6.4s, v0.s[1]\n"
"fmla v14.4s, v6.4s, v1.s[1]\n"
"fmla v18.4s, v6.4s, v2.s[1]\n"
"fmla v22.4s, v6.4s, v3.s[1]\n"
"fmla v26.4s, v6.4s, v4.s[1]\n"
- "ldr q6, [x15, #0x80]\n"
+ "ldr q6, [x10, #0x80]\n"
"fmla v11.4s, v7.4s, v0.s[1]\n"
"fmla v15.4s, v7.4s, v1.s[1]\n"
"fmla v19.4s, v7.4s, v2.s[1]\n"
"fmla v23.4s, v7.4s, v3.s[1]\n"
"fmla v27.4s, v7.4s, v4.s[1]\n"
- "ldr q7, [x15, #0x90]\n"
+ "ldr q7, [x10, #0x90]\n"
"fmla v8.4s, v6.4s, v0.s[2]\n"
"fmla v12.4s, v6.4s, v1.s[2]\n"
"fmla v16.4s, v6.4s, v2.s[2]\n"
"fmla v20.4s, v6.4s, v3.s[2]\n"
"fmla v24.4s, v6.4s, v4.s[2]\n"
- "ldr q6, [x15, #0xa0]\n"
+ "ldr q6, [x10, #0xa0]\n"
"fmla v9.4s, v7.4s, v0.s[2]\n"
"fmla v13.4s, v7.4s, v1.s[2]\n"
"fmla v17.4s, v7.4s, v2.s[2]\n"
"fmla v21.4s, v7.4s, v3.s[2]\n"
"fmla v25.4s, v7.4s, v4.s[2]\n"
- "ldr q7, [x15, #0xb0]\n"
+ "ldr q7, [x10, #0xb0]\n"
"fmla v10.4s, v6.4s, v0.s[2]\n"
"fmla v14.4s, v6.4s, v1.s[2]\n"
"fmla v18.4s, v6.4s, v2.s[2]\n"
"fmla v22.4s, v6.4s, v3.s[2]\n"
"fmla v26.4s, v6.4s, v4.s[2]\n"
- "ldr q6, [x15, #0xc0]\n"
+ "ldr q6, [x10, #0xc0]\n"
"fmla v11.4s, v7.4s, v0.s[2]\n"
"fmla v15.4s, v7.4s, v1.s[2]\n"
"fmla v19.4s, v7.4s, v2.s[2]\n"
"fmla v23.4s, v7.4s, v3.s[2]\n"
"fmla v27.4s, v7.4s, v4.s[2]\n"
- "ldr q7, [x15, #0xd0]\n"
+ "ldr q7, [x10, #0xd0]\n"
"fmla v8.4s, v6.4s, v0.s[3]\n"
"fmla v12.4s, v6.4s, v1.s[3]\n"
"fmla v16.4s, v6.4s, v2.s[3]\n"
"fmla v20.4s, v6.4s, v3.s[3]\n"
"fmla v24.4s, v6.4s, v4.s[3]\n"
- "ldr q6, [x15, #0xe0]\n"
+ "ldr q6, [x10, #0xe0]\n"
"fmla v9.4s, v7.4s, v0.s[3]\n"
"fmla v13.4s, v7.4s, v1.s[3]\n"
"fmla v17.4s, v7.4s, v2.s[3]\n"
"fmla v21.4s, v7.4s, v3.s[3]\n"
"fmla v25.4s, v7.4s, v4.s[3]\n"
- "ldr q7, [x15, #0xf0]\n"
+ "ldr q7, [x10, #0xf0]\n"
+ "add x10, x10, #0x100\n"
"fmla v10.4s, v6.4s, v0.s[3]\n"
- "add x15, x15, #0x100\n"
"fmla v14.4s, v6.4s, v1.s[3]\n"
"fmla v18.4s, v6.4s, v2.s[3]\n"
"fmla v22.4s, v6.4s, v3.s[3]\n"
"fmla v26.4s, v6.4s, v4.s[3]\n"
+ "ldr q6, [x10, #0x0]\n"
"fmla v11.4s, v7.4s, v0.s[3]\n"
+ "ldr q0, [x25, #0x0]\n"
"fmla v15.4s, v7.4s, v1.s[3]\n"
+ "ldr q1, [x24, #0x0]\n"
"fmla v19.4s, v7.4s, v2.s[3]\n"
+ "ldr q2, [x23, #0x0]\n"
"fmla v23.4s, v7.4s, v3.s[3]\n"
+ "ldr q3, [x22, #0x0]\n"
"fmla v27.4s, v7.4s, v4.s[3]\n"
- "bge 155b\n"
- "156:" // Height 5: Multiply loop: Single iteration only
- "sub x11, x11, #0x4\n"
- "ldr q0, [x10, #0x0]\n"
- "ldr q1, [x28, #0x0]\n"
- "ldr q2, [x26, #0x0]\n"
- "ldr q3, [x24, #0x0]\n"
- "ldr q4, [x22, #0x0]\n"
- "ldr q6, [x15, #0x0]\n"
+ "ldr q4, [x21, #0x0]\n"
+ "bge 150b\n"
+ "151:" // Height 5: Multiply loop: Single iteration only
"fmla v8.4s, v6.4s, v0.s[0]\n"
- "ldr q7, [x15, #0x10]\n"
+ "ldr q7, [x10, #0x10]\n"
+ "sub x26, x26, #0x4\n"
"fmla v12.4s, v6.4s, v1.s[0]\n"
- "add x10, x10, #0x10\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "add x25, x25, #0x10\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
"fmla v16.4s, v6.4s, v2.s[0]\n"
- "add x28, x28, #0x10\n"
+ "add x24, x24, #0x10\n"
"fmla v20.4s, v6.4s, v3.s[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "add x26, x26, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "add x23, x23, #0x10\n"
"fmla v24.4s, v6.4s, v4.s[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "ldr q6, [x15, #0x20]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "add x22, x22, #0x10\n"
"fmla v9.4s, v7.4s, v0.s[0]\n"
- "add x24, x24, #0x10\n"
- "prfm pldl1keep, [x24, #0x80]\n"
+ "ldr q6, [x10, #0x20]\n"
+ "add x21, x21, #0x10\n"
"fmla v13.4s, v7.4s, v1.s[0]\n"
- "add x22, x22, #0x10\n"
- "fmla v17.4s, v7.4s, v2.s[0]\n"
"prfm pldl1keep, [x22, #0x80]\n"
+ "fmla v17.4s, v7.4s, v2.s[0]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
"fmla v21.4s, v7.4s, v3.s[0]\n"
"fmla v25.4s, v7.4s, v4.s[0]\n"
- "ldr q7, [x15, #0x30]\n"
+ "ldr q7, [x10, #0x30]\n"
"fmla v10.4s, v6.4s, v0.s[0]\n"
"fmla v14.4s, v6.4s, v1.s[0]\n"
"fmla v18.4s, v6.4s, v2.s[0]\n"
"fmla v22.4s, v6.4s, v3.s[0]\n"
"fmla v26.4s, v6.4s, v4.s[0]\n"
- "ldr q6, [x15, #0x40]\n"
+ "ldr q6, [x10, #0x40]\n"
"fmla v11.4s, v7.4s, v0.s[0]\n"
"fmla v15.4s, v7.4s, v1.s[0]\n"
"fmla v19.4s, v7.4s, v2.s[0]\n"
"fmla v23.4s, v7.4s, v3.s[0]\n"
"fmla v27.4s, v7.4s, v4.s[0]\n"
- "ldr q7, [x15, #0x50]\n"
+ "ldr q7, [x10, #0x50]\n"
"fmla v8.4s, v6.4s, v0.s[1]\n"
"fmla v12.4s, v6.4s, v1.s[1]\n"
"fmla v16.4s, v6.4s, v2.s[1]\n"
"fmla v20.4s, v6.4s, v3.s[1]\n"
"fmla v24.4s, v6.4s, v4.s[1]\n"
- "ldr q6, [x15, #0x60]\n"
+ "ldr q6, [x10, #0x60]\n"
"fmla v9.4s, v7.4s, v0.s[1]\n"
"fmla v13.4s, v7.4s, v1.s[1]\n"
"fmla v17.4s, v7.4s, v2.s[1]\n"
"fmla v21.4s, v7.4s, v3.s[1]\n"
"fmla v25.4s, v7.4s, v4.s[1]\n"
- "ldr q7, [x15, #0x70]\n"
+ "ldr q7, [x10, #0x70]\n"
"fmla v10.4s, v6.4s, v0.s[1]\n"
"fmla v14.4s, v6.4s, v1.s[1]\n"
"fmla v18.4s, v6.4s, v2.s[1]\n"
"fmla v22.4s, v6.4s, v3.s[1]\n"
"fmla v26.4s, v6.4s, v4.s[1]\n"
- "ldr q6, [x15, #0x80]\n"
+ "ldr q6, [x10, #0x80]\n"
"fmla v11.4s, v7.4s, v0.s[1]\n"
"fmla v15.4s, v7.4s, v1.s[1]\n"
"fmla v19.4s, v7.4s, v2.s[1]\n"
"fmla v23.4s, v7.4s, v3.s[1]\n"
"fmla v27.4s, v7.4s, v4.s[1]\n"
- "ldr q7, [x15, #0x90]\n"
+ "ldr q7, [x10, #0x90]\n"
"fmla v8.4s, v6.4s, v0.s[2]\n"
"fmla v12.4s, v6.4s, v1.s[2]\n"
"fmla v16.4s, v6.4s, v2.s[2]\n"
"fmla v20.4s, v6.4s, v3.s[2]\n"
"fmla v24.4s, v6.4s, v4.s[2]\n"
- "ldr q6, [x15, #0xa0]\n"
+ "ldr q6, [x10, #0xa0]\n"
"fmla v9.4s, v7.4s, v0.s[2]\n"
"fmla v13.4s, v7.4s, v1.s[2]\n"
"fmla v17.4s, v7.4s, v2.s[2]\n"
"fmla v21.4s, v7.4s, v3.s[2]\n"
"fmla v25.4s, v7.4s, v4.s[2]\n"
- "ldr q7, [x15, #0xb0]\n"
+ "ldr q7, [x10, #0xb0]\n"
"fmla v10.4s, v6.4s, v0.s[2]\n"
"fmla v14.4s, v6.4s, v1.s[2]\n"
"fmla v18.4s, v6.4s, v2.s[2]\n"
"fmla v22.4s, v6.4s, v3.s[2]\n"
"fmla v26.4s, v6.4s, v4.s[2]\n"
- "ldr q6, [x15, #0xc0]\n"
+ "ldr q6, [x10, #0xc0]\n"
"fmla v11.4s, v7.4s, v0.s[2]\n"
"fmla v15.4s, v7.4s, v1.s[2]\n"
"fmla v19.4s, v7.4s, v2.s[2]\n"
"fmla v23.4s, v7.4s, v3.s[2]\n"
"fmla v27.4s, v7.4s, v4.s[2]\n"
- "ldr q7, [x15, #0xd0]\n"
+ "ldr q7, [x10, #0xd0]\n"
"fmla v8.4s, v6.4s, v0.s[3]\n"
"fmla v12.4s, v6.4s, v1.s[3]\n"
"fmla v16.4s, v6.4s, v2.s[3]\n"
"fmla v20.4s, v6.4s, v3.s[3]\n"
"fmla v24.4s, v6.4s, v4.s[3]\n"
- "ldr q6, [x15, #0xe0]\n"
+ "ldr q6, [x10, #0xe0]\n"
"fmla v9.4s, v7.4s, v0.s[3]\n"
"fmla v13.4s, v7.4s, v1.s[3]\n"
"fmla v17.4s, v7.4s, v2.s[3]\n"
"fmla v21.4s, v7.4s, v3.s[3]\n"
"fmla v25.4s, v7.4s, v4.s[3]\n"
- "ldr q7, [x15, #0xf0]\n"
+ "ldr q7, [x10, #0xf0]\n"
+ "add x10, x10, #0x100\n"
"fmla v10.4s, v6.4s, v0.s[3]\n"
- "add x15, x15, #0x100\n"
"fmla v14.4s, v6.4s, v1.s[3]\n"
"fmla v18.4s, v6.4s, v2.s[3]\n"
"fmla v22.4s, v6.4s, v3.s[3]\n"
@@ -2353,31 +2300,31 @@ void a64_hybrid_fp32_mla_6x16 (
"fmla v19.4s, v7.4s, v2.s[3]\n"
"fmla v23.4s, v7.4s, v3.s[3]\n"
"fmla v27.4s, v7.4s, v4.s[3]\n"
- "157:" // Height 5: Multiply loop: Main loop skip
- "cbz x11, 159f\n"
- "158:" // Height 5: Multiply loop: Odd block loop
- "ldr s0, [x10], #0x4\n"
- "ldr s1, [x28], #0x4\n"
- "ldr s2, [x26], #0x4\n"
- "ldr s3, [x24], #0x4\n"
- "ldr s4, [x22], #0x4\n"
- "ldr q6, [x15, #0x0]\n"
+ "152:" // Height 5: Multiply loop: Main loop skip
+ "cbz x26, 154f\n"
+ "153:" // Height 5: Multiply loop: Odd block loop
+ "ldr s0, [x25], #0x4\n"
+ "sub x26, x26, #0x1\n"
+ "ldr s1, [x24], #0x4\n"
+ "ldr s2, [x23], #0x4\n"
+ "ldr s3, [x22], #0x4\n"
+ "ldr s4, [x21], #0x4\n"
+ "ldr q6, [x10, #0x0]\n"
"fmla v8.4s, v6.4s, v0.s[0]\n"
- "ldr q7, [x15, #0x10]\n"
+ "ldr q7, [x10, #0x10]\n"
"fmla v12.4s, v6.4s, v1.s[0]\n"
- "sub x11, x11, #0x1\n"
"fmla v16.4s, v6.4s, v2.s[0]\n"
"fmla v20.4s, v6.4s, v3.s[0]\n"
"fmla v24.4s, v6.4s, v4.s[0]\n"
- "ldr q6, [x15, #0x20]\n"
+ "ldr q6, [x10, #0x20]\n"
"fmla v9.4s, v7.4s, v0.s[0]\n"
"fmla v13.4s, v7.4s, v1.s[0]\n"
"fmla v17.4s, v7.4s, v2.s[0]\n"
"fmla v21.4s, v7.4s, v3.s[0]\n"
"fmla v25.4s, v7.4s, v4.s[0]\n"
- "ldr q7, [x15, #0x30]\n"
+ "ldr q7, [x10, #0x30]\n"
+ "add x10, x10, #0x40\n"
"fmla v10.4s, v6.4s, v0.s[0]\n"
- "add x15, x15, #0x40\n"
"fmla v14.4s, v6.4s, v1.s[0]\n"
"fmla v18.4s, v6.4s, v2.s[0]\n"
"fmla v22.4s, v6.4s, v3.s[0]\n"
@@ -2387,18 +2334,23 @@ void a64_hybrid_fp32_mla_6x16 (
"fmla v19.4s, v7.4s, v2.s[0]\n"
"fmla v23.4s, v7.4s, v3.s[0]\n"
"fmla v27.4s, v7.4s, v4.s[0]\n"
- "cbnz x11, 158b\n"
- "159:" // Height 5: Multiply loop: No odd multiplies
+ "cbnz x26, 153b\n"
+ "154:" // Height 5: Multiply loop: No odd multiplies
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x12, x12, #0x1\n"
- "cmp x12, x19\n"
- "bne 152b\n"
- "prfm pstl1keep, [x13, #0x0]\n"
- "prfm pstl1keep, [x9, #0x0]\n"
- "prfm pstl1keep, [x27, #0x0]\n"
- "prfm pstl1keep, [x25, #0x0]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 147b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "prfm pstl1keep, [x28, #0x0]\n"
+ "add x24, x28, x19, LSL #2\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "add x23, x24, x19, LSL #2\n"
"prfm pstl1keep, [x23, #0x0]\n"
- "tbz %x[flags], #1, 160f\n"
+ "add x22, x23, x19, LSL #2\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "add x21, x22, x19, LSL #2\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "tbz %x[flags], #1, 155f\n"
"add x19, %x[args_ptr], %[offset_min]\n"
"ld1r { v1.4s }, [x19]\n"
"add x19, %x[args_ptr], %[offset_max]\n"
@@ -2443,193 +2395,169 @@ void a64_hybrid_fp32_mla_6x16 (
"fmax v26.4s, v26.4s, v1.4s\n"
"fmin v27.4s, v27.4s, v0.4s\n"
"fmax v27.4s, v27.4s, v1.4s\n"
- "160:" // Height 5: No activation
- "cmp x16, #0x10\n"
- "bge 169f\n"
- "tbz x16, #3, 164f\n"
- "st1 { v8.4s }, [x13], #0x10\n"
- "st1 { v9.4s }, [x13], #0x10\n"
- "st1 { v12.4s }, [x9], #0x10\n"
- "st1 { v13.4s }, [x9], #0x10\n"
- "st1 { v16.4s }, [x27], #0x10\n"
- "st1 { v17.4s }, [x27], #0x10\n"
- "st1 { v20.4s }, [x25], #0x10\n"
- "st1 { v21.4s }, [x25], #0x10\n"
- "st1 { v24.4s }, [x23], #0x10\n"
- "st1 { v25.4s }, [x23], #0x10\n"
- "tbz x16, #2, 162f\n"
- "st1 { v10.4s }, [x13], #0x10\n"
- "st1 { v14.4s }, [x9], #0x10\n"
- "st1 { v18.4s }, [x27], #0x10\n"
- "st1 { v22.4s }, [x25], #0x10\n"
- "st1 { v26.4s }, [x23], #0x10\n"
- "tbz x16, #1, 161f\n"
- "str d11, [x13], #0x8\n"
- "str d15, [x9], #0x8\n"
- "str d19, [x27], #0x8\n"
- "str d23, [x25], #0x8\n"
- "str d27, [x23], #0x8\n"
- "tbz x16, #0, 168f\n"
- "st1 { v11.s }[2], [x13]\n"
- "st1 { v15.s }[2], [x9]\n"
- "st1 { v19.s }[2], [x27]\n"
- "st1 { v23.s }[2], [x25]\n"
- "st1 { v27.s }[2], [x23]\n"
- "b 168f\n"
- "161:" // Height 5: Partial direct writeback: partial_1_12
- "tbz x16, #0, 168f\n"
- "str s11, [x13, #0x0]\n"
- "str s15, [x9, #0x0]\n"
- "str s19, [x27, #0x0]\n"
- "str s23, [x25, #0x0]\n"
- "str s27, [x23, #0x0]\n"
- "b 168f\n"
- "162:" // Height 5: Partial direct writeback: partial_2_8
- "tbz x16, #1, 163f\n"
- "str d10, [x13], #0x8\n"
- "str d14, [x9], #0x8\n"
- "str d18, [x27], #0x8\n"
- "str d22, [x25], #0x8\n"
- "str d26, [x23], #0x8\n"
- "tbz x16, #0, 168f\n"
- "st1 { v10.s }[2], [x13]\n"
- "st1 { v14.s }[2], [x9]\n"
- "st1 { v18.s }[2], [x27]\n"
- "st1 { v22.s }[2], [x25]\n"
- "st1 { v26.s }[2], [x23]\n"
- "b 168f\n"
- "163:" // Height 5: Partial direct writeback: partial_1_8
- "tbz x16, #0, 168f\n"
- "str s10, [x13, #0x0]\n"
- "str s14, [x9, #0x0]\n"
- "str s18, [x27, #0x0]\n"
- "str s22, [x25, #0x0]\n"
- "str s26, [x23, #0x0]\n"
- "b 168f\n"
- "164:" // Height 5: Partial direct writeback: partial_4_0
- "tbz x16, #2, 166f\n"
- "st1 { v8.4s }, [x13], #0x10\n"
- "st1 { v12.4s }, [x9], #0x10\n"
- "st1 { v16.4s }, [x27], #0x10\n"
- "st1 { v20.4s }, [x25], #0x10\n"
- "st1 { v24.4s }, [x23], #0x10\n"
- "tbz x16, #1, 165f\n"
- "str d9, [x13], #0x8\n"
- "str d13, [x9], #0x8\n"
- "str d17, [x27], #0x8\n"
- "str d21, [x25], #0x8\n"
- "str d25, [x23], #0x8\n"
- "tbz x16, #0, 168f\n"
- "st1 { v9.s }[2], [x13]\n"
- "st1 { v13.s }[2], [x9]\n"
- "st1 { v17.s }[2], [x27]\n"
- "st1 { v21.s }[2], [x25]\n"
- "st1 { v25.s }[2], [x23]\n"
- "b 168f\n"
- "165:" // Height 5: Partial direct writeback: partial_1_4
- "tbz x16, #0, 168f\n"
- "str s9, [x13, #0x0]\n"
- "str s13, [x9, #0x0]\n"
- "str s17, [x27, #0x0]\n"
- "str s21, [x25, #0x0]\n"
- "str s25, [x23, #0x0]\n"
- "b 168f\n"
- "166:" // Height 5: Partial direct writeback: partial_2_0
- "tbz x16, #1, 167f\n"
- "str d8, [x13], #0x8\n"
- "str d12, [x9], #0x8\n"
- "str d16, [x27], #0x8\n"
- "str d20, [x25], #0x8\n"
- "str d24, [x23], #0x8\n"
- "tbz x16, #0, 168f\n"
- "st1 { v8.s }[2], [x13]\n"
- "st1 { v12.s }[2], [x9]\n"
- "st1 { v16.s }[2], [x27]\n"
- "st1 { v20.s }[2], [x25]\n"
- "st1 { v24.s }[2], [x23]\n"
- "b 168f\n"
- "167:" // Height 5: Partial direct writeback: partial_1_0
- "str s8, [x13, #0x0]\n"
- "str s12, [x9, #0x0]\n"
- "str s16, [x27, #0x0]\n"
- "str s20, [x25, #0x0]\n"
- "str s24, [x23, #0x0]\n"
- "168:" // Height 5: Partial direct writeback: Done
- "b 170f\n"
- "169:" // Height 5: Full writeback
- "str q8, [x13, #0x0]\n"
- "str q9, [x13, #0x10]\n"
- "str q10, [x13, #0x20]\n"
- "str q11, [x13, #0x30]\n"
- "str q12, [x9, #0x0]\n"
- "str q13, [x9, #0x10]\n"
- "str q14, [x9, #0x20]\n"
- "str q15, [x9, #0x30]\n"
- "str q16, [x27, #0x0]\n"
- "str q17, [x27, #0x10]\n"
- "str q18, [x27, #0x20]\n"
- "str q19, [x27, #0x30]\n"
- "str q20, [x25, #0x0]\n"
- "str q21, [x25, #0x10]\n"
- "str q22, [x25, #0x20]\n"
- "str q23, [x25, #0x30]\n"
- "str q24, [x23, #0x0]\n"
- "str q25, [x23, #0x10]\n"
- "str q26, [x23, #0x20]\n"
- "str q27, [x23, #0x30]\n"
- "add x13, x13, #0x40\n"
- "add x9, x9, #0x40\n"
- "add x27, x27, #0x40\n"
- "add x25, x25, #0x40\n"
- "add x23, x23, #0x40\n"
- "170:" // Height 5: Writeback done
- "subs x16, x16, #0x10\n"
- "bgt 139b\n"
- "b 206f\n"
- "171:" // Height 6
- "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x14, %x[bias]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "tbz %x[flags], #2, 172f\n"
- "ldr x13, [%x[output_ptr], #0x0]\n"
- "add x13, x13, x19, LSL #2\n"
- "ldr x9, [%x[output_ptr], #0x8]\n"
- "ldr x27, [%x[output_ptr], #0x10]\n"
- "add x9, x9, x19, LSL #2\n"
- "ldr x25, [%x[output_ptr], #0x18]\n"
- "ldr x23, [%x[output_ptr], #0x20]\n"
- "add x27, x27, x19, LSL #2\n"
- "ldr x21, [%x[output_ptr], #0x28]\n"
- "add %x[output_ptr], %x[output_ptr], #0x30\n"
- "add x25, x25, x19, LSL #2\n"
- "add x23, x23, x19, LSL #2\n"
- "add x21, x21, x19, LSL #2\n"
- "b 173f\n"
- "172:" // Height 6: setup direct output
- "mov x13, %x[output_ptr]\n"
- "add x9, x13, x19, LSL #2\n"
- "add x27, x9, x19, LSL #2\n"
- "add x25, x27, x19, LSL #2\n"
- "add x23, x25, x19, LSL #2\n"
- "add x21, x23, x19, LSL #2\n"
- "add %x[output_ptr], x21, x19, LSL #2\n"
- "173:" // Height 6: Column loop
- "cbz x14, 174f\n"
- "ldr q8, [x14, #0x0]\n"
+ "155:" // Height 5: No activation
+ "cmp x11, #0x10\n"
+ "bge 164f\n"
+ "tbz x11, #3, 159f\n"
+ "st1 { v8.4s }, [x28], #0x10\n"
+ "st1 { v9.4s }, [x28], #0x10\n"
+ "st1 { v12.4s }, [x24], #0x10\n"
+ "st1 { v13.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v17.4s }, [x23], #0x10\n"
+ "st1 { v20.4s }, [x22], #0x10\n"
+ "st1 { v21.4s }, [x22], #0x10\n"
+ "st1 { v24.4s }, [x21], #0x10\n"
+ "st1 { v25.4s }, [x21], #0x10\n"
+ "tbz x11, #2, 157f\n"
+ "st1 { v10.4s }, [x28], #0x10\n"
+ "st1 { v14.4s }, [x24], #0x10\n"
+ "st1 { v18.4s }, [x23], #0x10\n"
+ "st1 { v22.4s }, [x22], #0x10\n"
+ "st1 { v26.4s }, [x21], #0x10\n"
+ "tbz x11, #1, 156f\n"
+ "str d11, [x28], #0x8\n"
+ "str d15, [x24], #0x8\n"
+ "str d19, [x23], #0x8\n"
+ "str d23, [x22], #0x8\n"
+ "str d27, [x21], #0x8\n"
+ "tbz x11, #0, 163f\n"
+ "st1 { v11.s }[2], [x28]\n"
+ "st1 { v15.s }[2], [x24]\n"
+ "st1 { v19.s }[2], [x23]\n"
+ "st1 { v23.s }[2], [x22]\n"
+ "st1 { v27.s }[2], [x21]\n"
+ "b 163f\n"
+ "156:" // Height 5: Partial direct writeback: partial_1_12
+ "tbz x11, #0, 163f\n"
+ "str s11, [x28, #0x0]\n"
+ "str s15, [x24, #0x0]\n"
+ "str s19, [x23, #0x0]\n"
+ "str s23, [x22, #0x0]\n"
+ "str s27, [x21, #0x0]\n"
+ "b 163f\n"
+ "157:" // Height 5: Partial direct writeback: partial_2_8
+ "tbz x11, #1, 158f\n"
+ "str d10, [x28], #0x8\n"
+ "str d14, [x24], #0x8\n"
+ "str d18, [x23], #0x8\n"
+ "str d22, [x22], #0x8\n"
+ "str d26, [x21], #0x8\n"
+ "tbz x11, #0, 163f\n"
+ "st1 { v10.s }[2], [x28]\n"
+ "st1 { v14.s }[2], [x24]\n"
+ "st1 { v18.s }[2], [x23]\n"
+ "st1 { v22.s }[2], [x22]\n"
+ "st1 { v26.s }[2], [x21]\n"
+ "b 163f\n"
+ "158:" // Height 5: Partial direct writeback: partial_1_8
+ "tbz x11, #0, 163f\n"
+ "str s10, [x28, #0x0]\n"
+ "str s14, [x24, #0x0]\n"
+ "str s18, [x23, #0x0]\n"
+ "str s22, [x22, #0x0]\n"
+ "str s26, [x21, #0x0]\n"
+ "b 163f\n"
+ "159:" // Height 5: Partial direct writeback: partial_4_0
+ "tbz x11, #2, 161f\n"
+ "st1 { v8.4s }, [x28], #0x10\n"
+ "st1 { v12.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v20.4s }, [x22], #0x10\n"
+ "st1 { v24.4s }, [x21], #0x10\n"
+ "tbz x11, #1, 160f\n"
+ "str d9, [x28], #0x8\n"
+ "str d13, [x24], #0x8\n"
+ "str d17, [x23], #0x8\n"
+ "str d21, [x22], #0x8\n"
+ "str d25, [x21], #0x8\n"
+ "tbz x11, #0, 163f\n"
+ "st1 { v9.s }[2], [x28]\n"
+ "st1 { v13.s }[2], [x24]\n"
+ "st1 { v17.s }[2], [x23]\n"
+ "st1 { v21.s }[2], [x22]\n"
+ "st1 { v25.s }[2], [x21]\n"
+ "b 163f\n"
+ "160:" // Height 5: Partial direct writeback: partial_1_4
+ "tbz x11, #0, 163f\n"
+ "str s9, [x28, #0x0]\n"
+ "str s13, [x24, #0x0]\n"
+ "str s17, [x23, #0x0]\n"
+ "str s21, [x22, #0x0]\n"
+ "str s25, [x21, #0x0]\n"
+ "b 163f\n"
+ "161:" // Height 5: Partial direct writeback: partial_2_0
+ "tbz x11, #1, 162f\n"
+ "str d8, [x28], #0x8\n"
+ "str d12, [x24], #0x8\n"
+ "str d16, [x23], #0x8\n"
+ "str d20, [x22], #0x8\n"
+ "str d24, [x21], #0x8\n"
+ "tbz x11, #0, 163f\n"
+ "st1 { v8.s }[2], [x28]\n"
+ "st1 { v12.s }[2], [x24]\n"
+ "st1 { v16.s }[2], [x23]\n"
+ "st1 { v20.s }[2], [x22]\n"
+ "st1 { v24.s }[2], [x21]\n"
+ "b 163f\n"
+ "162:" // Height 5: Partial direct writeback: partial_1_0
+ "str s8, [x28, #0x0]\n"
+ "str s12, [x24, #0x0]\n"
+ "str s16, [x23, #0x0]\n"
+ "str s20, [x22, #0x0]\n"
+ "str s24, [x21, #0x0]\n"
+ "163:" // Height 5: Partial direct writeback: Done
+ "b 165f\n"
+ "164:" // Height 5: Full writeback
+ "str q8, [x28, #0x0]\n"
+ "str q9, [x28, #0x10]\n"
+ "str q10, [x28, #0x20]\n"
+ "str q11, [x28, #0x30]\n"
+ "add x28, x28, #0x40\n"
+ "str q12, [x24, #0x0]\n"
+ "str q13, [x24, #0x10]\n"
+ "str q14, [x24, #0x20]\n"
+ "str q15, [x24, #0x30]\n"
+ "str q16, [x23, #0x0]\n"
+ "str q17, [x23, #0x10]\n"
+ "str q18, [x23, #0x20]\n"
+ "str q19, [x23, #0x30]\n"
+ "str q20, [x22, #0x0]\n"
+ "str q21, [x22, #0x10]\n"
+ "str q22, [x22, #0x20]\n"
+ "str q23, [x22, #0x30]\n"
+ "str q24, [x21, #0x0]\n"
+ "str q25, [x21, #0x10]\n"
+ "str q26, [x21, #0x20]\n"
+ "str q27, [x21, #0x30]\n"
+ "165:" // Height 5: Writeback done
+ "subs x11, x11, #0x10\n"
+ "bgt 134b\n"
+ "b 200f\n"
+ "166:" // Height 6
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x9, %x[bias]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov x19, #0x18\n"
+ "madd %x[output_ptr], x20, x19, %x[output_ptr]\n"
+ "167:" // Height 6: Column loop
+ "cbz x9, 168f\n"
+ "ldr q8, [x9, #0x0]\n"
"mov v12.16b, v8.16b\n"
- "ldr q9, [x14, #0x10]\n"
+ "ldr q9, [x9, #0x10]\n"
"mov v16.16b, v8.16b\n"
- "ldr q10, [x14, #0x20]\n"
+ "ldr q10, [x9, #0x20]\n"
"mov v20.16b, v8.16b\n"
- "ldr q11, [x14, #0x30]\n"
+ "ldr q11, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
"mov v24.16b, v8.16b\n"
- "add x14, x14, #0x40\n"
"mov v28.16b, v8.16b\n"
"mov v13.16b, v9.16b\n"
- "mov v17.16b, v9.16b\n"
"mov v14.16b, v10.16b\n"
"mov v15.16b, v11.16b\n"
+ "mov v17.16b, v9.16b\n"
"mov v18.16b, v10.16b\n"
"mov v19.16b, v11.16b\n"
"mov v21.16b, v9.16b\n"
@@ -2641,178 +2569,179 @@ void a64_hybrid_fp32_mla_6x16 (
"mov v29.16b, v9.16b\n"
"mov v30.16b, v10.16b\n"
"mov v31.16b, v11.16b\n"
- "b 185f\n"
- "174:" // Height 6: no bias
- "tbz %x[flags], #0, 184f\n"
- "cmp x16, #0x10\n"
- "bge 183f\n"
- "tbz x16, #3, 178f\n"
- "ld1 { v8.4s }, [x13], #0x10\n"
- "ld1 { v12.4s }, [x9], #0x10\n"
- "ld1 { v16.4s }, [x27], #0x10\n"
- "ld1 { v20.4s }, [x25], #0x10\n"
- "ld1 { v24.4s }, [x23], #0x10\n"
- "ld1 { v28.4s }, [x21], #0x10\n"
- "ld1 { v9.4s }, [x13], #0x10\n"
- "ld1 { v13.4s }, [x9], #0x10\n"
- "ld1 { v17.4s }, [x27], #0x10\n"
- "ld1 { v21.4s }, [x25], #0x10\n"
- "ld1 { v25.4s }, [x23], #0x10\n"
- "ld1 { v29.4s }, [x21], #0x10\n"
- "tbz x16, #2, 176f\n"
- "ld1 { v10.4s }, [x13], #0x10\n"
- "ld1 { v14.4s }, [x9], #0x10\n"
- "ld1 { v18.4s }, [x27], #0x10\n"
- "ld1 { v22.4s }, [x25], #0x10\n"
- "ld1 { v26.4s }, [x23], #0x10\n"
- "ld1 { v30.4s }, [x21], #0x10\n"
- "tbz x16, #1, 175f\n"
+ "b 179f\n"
+ "168:" // Height 6: no bias
+ "tbz %x[flags], #0, 178f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x11, #0x10\n"
+ "add x24, x28, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "add x21, x22, x19, LSL #2\n"
+ "add x20, x21, x19, LSL #2\n"
+ "bge 177f\n"
+ "tbz x11, #3, 172f\n"
+ "ld1 { v8.4s }, [x28], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "ld1 { v20.4s }, [x22], #0x10\n"
+ "ld1 { v24.4s }, [x21], #0x10\n"
+ "ld1 { v28.4s }, [x20], #0x10\n"
+ "ld1 { v9.4s }, [x28], #0x10\n"
+ "ld1 { v13.4s }, [x24], #0x10\n"
+ "ld1 { v17.4s }, [x23], #0x10\n"
+ "ld1 { v21.4s }, [x22], #0x10\n"
+ "ld1 { v25.4s }, [x21], #0x10\n"
+ "ld1 { v29.4s }, [x20], #0x10\n"
+ "tbz x11, #2, 170f\n"
+ "ld1 { v10.4s }, [x28], #0x10\n"
+ "ld1 { v14.4s }, [x24], #0x10\n"
+ "ld1 { v18.4s }, [x23], #0x10\n"
+ "ld1 { v22.4s }, [x22], #0x10\n"
+ "ld1 { v26.4s }, [x21], #0x10\n"
+ "ld1 { v30.4s }, [x20], #0x10\n"
+ "tbz x11, #1, 169f\n"
"mov x19, #0x38\n"
- "ldr d11, [x13], #0x8\n"
- "ldr d15, [x9], #0x8\n"
- "ldr d19, [x27], #0x8\n"
- "ldr d23, [x25], #0x8\n"
- "ldr d27, [x23], #0x8\n"
- "ldr d31, [x21], #0x8\n"
- "tbz x16, #0, 182f\n"
- "ld1 { v11.s }[2], [x13]\n"
- "ld1 { v15.s }[2], [x9]\n"
- "ld1 { v19.s }[2], [x27]\n"
- "ld1 { v23.s }[2], [x25]\n"
- "ld1 { v27.s }[2], [x23]\n"
- "ld1 { v31.s }[2], [x21]\n"
- "b 182f\n"
- "175:" // Height 6: Partial accumulate: partial_1_12
+ "ldr d11, [x28], #0x8\n"
+ "ldr d15, [x24], #0x8\n"
+ "ldr d19, [x23], #0x8\n"
+ "ldr d23, [x22], #0x8\n"
+ "ldr d27, [x21], #0x8\n"
+ "ldr d31, [x20], #0x8\n"
+ "tbz x11, #0, 176f\n"
+ "ld1 { v11.s }[2], [x28]\n"
+ "ld1 { v15.s }[2], [x24]\n"
+ "ld1 { v19.s }[2], [x23]\n"
+ "ld1 { v23.s }[2], [x22]\n"
+ "ld1 { v27.s }[2], [x21]\n"
+ "ld1 { v31.s }[2], [x20]\n"
+ "b 176f\n"
+ "169:" // Height 6: Partial accumulate: partial_1_12
"mov x19, #0x30\n"
- "tbz x16, #0, 182f\n"
- "ldr s11, [x13, #0x0]\n"
- "ldr s15, [x9, #0x0]\n"
- "ldr s19, [x27, #0x0]\n"
- "ldr s23, [x25, #0x0]\n"
- "ldr s27, [x23, #0x0]\n"
- "ldr s31, [x21, #0x0]\n"
- "b 182f\n"
- "176:" // Height 6: Partial accumulate: partial_2_8
- "tbz x16, #1, 177f\n"
- "ldr d10, [x13], #0x8\n"
- "ldr d14, [x9], #0x8\n"
- "ldr d18, [x27], #0x8\n"
- "ldr d22, [x25], #0x8\n"
- "ldr d26, [x23], #0x8\n"
- "ldr d30, [x21], #0x8\n"
+ "tbz x11, #0, 176f\n"
+ "ldr s11, [x28, #0x0]\n"
+ "ldr s15, [x24, #0x0]\n"
+ "ldr s19, [x23, #0x0]\n"
+ "ldr s23, [x22, #0x0]\n"
+ "ldr s27, [x21, #0x0]\n"
+ "ldr s31, [x20, #0x0]\n"
+ "b 176f\n"
+ "170:" // Height 6: Partial accumulate: partial_2_8
+ "tbz x11, #1, 171f\n"
+ "ldr d10, [x28], #0x8\n"
+ "ldr d14, [x24], #0x8\n"
"mov x19, #0x28\n"
- "tbz x16, #0, 182f\n"
- "ld1 { v10.s }[2], [x13]\n"
- "ld1 { v14.s }[2], [x9]\n"
- "ld1 { v18.s }[2], [x27]\n"
- "ld1 { v22.s }[2], [x25]\n"
- "ld1 { v26.s }[2], [x23]\n"
- "ld1 { v30.s }[2], [x21]\n"
- "b 182f\n"
- "177:" // Height 6: Partial accumulate: partial_1_8
+ "ldr d18, [x23], #0x8\n"
+ "ldr d22, [x22], #0x8\n"
+ "ldr d26, [x21], #0x8\n"
+ "ldr d30, [x20], #0x8\n"
+ "tbz x11, #0, 176f\n"
+ "ld1 { v10.s }[2], [x28]\n"
+ "ld1 { v14.s }[2], [x24]\n"
+ "ld1 { v18.s }[2], [x23]\n"
+ "ld1 { v22.s }[2], [x22]\n"
+ "ld1 { v26.s }[2], [x21]\n"
+ "ld1 { v30.s }[2], [x20]\n"
+ "b 176f\n"
+ "171:" // Height 6: Partial accumulate: partial_1_8
"mov x19, #0x20\n"
- "tbz x16, #0, 182f\n"
- "ldr s10, [x13, #0x0]\n"
- "ldr s14, [x9, #0x0]\n"
- "ldr s18, [x27, #0x0]\n"
- "ldr s22, [x25, #0x0]\n"
- "ldr s26, [x23, #0x0]\n"
- "ldr s30, [x21, #0x0]\n"
- "b 182f\n"
- "178:" // Height 6: Partial accumulate: partial_4_0
- "tbz x16, #2, 180f\n"
- "ld1 { v8.4s }, [x13], #0x10\n"
- "ld1 { v12.4s }, [x9], #0x10\n"
- "ld1 { v16.4s }, [x27], #0x10\n"
- "ld1 { v20.4s }, [x25], #0x10\n"
- "ld1 { v24.4s }, [x23], #0x10\n"
- "ld1 { v28.4s }, [x21], #0x10\n"
- "tbz x16, #1, 179f\n"
+ "tbz x11, #0, 176f\n"
+ "ldr s10, [x28, #0x0]\n"
+ "ldr s14, [x24, #0x0]\n"
+ "ldr s18, [x23, #0x0]\n"
+ "ldr s22, [x22, #0x0]\n"
+ "ldr s26, [x21, #0x0]\n"
+ "ldr s30, [x20, #0x0]\n"
+ "b 176f\n"
+ "172:" // Height 6: Partial accumulate: partial_4_0
+ "tbz x11, #2, 174f\n"
+ "ld1 { v8.4s }, [x28], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "ld1 { v20.4s }, [x22], #0x10\n"
+ "ld1 { v24.4s }, [x21], #0x10\n"
+ "ld1 { v28.4s }, [x20], #0x10\n"
+ "tbz x11, #1, 173f\n"
"mov x19, #0x18\n"
- "ldr d9, [x13], #0x8\n"
- "ldr d13, [x9], #0x8\n"
- "ldr d17, [x27], #0x8\n"
- "ldr d21, [x25], #0x8\n"
- "ldr d25, [x23], #0x8\n"
- "ldr d29, [x21], #0x8\n"
- "tbz x16, #0, 182f\n"
- "ld1 { v9.s }[2], [x13]\n"
- "ld1 { v13.s }[2], [x9]\n"
- "ld1 { v17.s }[2], [x27]\n"
- "ld1 { v21.s }[2], [x25]\n"
- "ld1 { v25.s }[2], [x23]\n"
- "ld1 { v29.s }[2], [x21]\n"
- "b 182f\n"
- "179:" // Height 6: Partial accumulate: partial_1_4
+ "ldr d9, [x28], #0x8\n"
+ "ldr d13, [x24], #0x8\n"
+ "ldr d17, [x23], #0x8\n"
+ "ldr d21, [x22], #0x8\n"
+ "ldr d25, [x21], #0x8\n"
+ "ldr d29, [x20], #0x8\n"
+ "tbz x11, #0, 176f\n"
+ "ld1 { v9.s }[2], [x28]\n"
+ "ld1 { v13.s }[2], [x24]\n"
+ "ld1 { v17.s }[2], [x23]\n"
+ "ld1 { v21.s }[2], [x22]\n"
+ "ld1 { v25.s }[2], [x21]\n"
+ "ld1 { v29.s }[2], [x20]\n"
+ "b 176f\n"
+ "173:" // Height 6: Partial accumulate: partial_1_4
"mov x19, #0x10\n"
- "tbz x16, #0, 182f\n"
- "ldr s9, [x13, #0x0]\n"
- "ldr s13, [x9, #0x0]\n"
- "ldr s17, [x27, #0x0]\n"
- "ldr s21, [x25, #0x0]\n"
- "ldr s25, [x23, #0x0]\n"
- "ldr s29, [x21, #0x0]\n"
- "b 182f\n"
- "180:" // Height 6: Partial accumulate: partial_2_0
- "tbz x16, #1, 181f\n"
- "ldr d8, [x13], #0x8\n"
- "ldr d12, [x9], #0x8\n"
- "ldr d16, [x27], #0x8\n"
- "ldr d20, [x25], #0x8\n"
- "ldr d24, [x23], #0x8\n"
- "ldr d28, [x21], #0x8\n"
+ "tbz x11, #0, 176f\n"
+ "ldr s9, [x28, #0x0]\n"
+ "ldr s13, [x24, #0x0]\n"
+ "ldr s17, [x23, #0x0]\n"
+ "ldr s21, [x22, #0x0]\n"
+ "ldr s25, [x21, #0x0]\n"
+ "ldr s29, [x20, #0x0]\n"
+ "b 176f\n"
+ "174:" // Height 6: Partial accumulate: partial_2_0
+ "tbz x11, #1, 175f\n"
+ "ldr d8, [x28], #0x8\n"
+ "ldr d12, [x24], #0x8\n"
"mov x19, #0x8\n"
- "tbz x16, #0, 182f\n"
- "ld1 { v8.s }[2], [x13]\n"
- "ld1 { v12.s }[2], [x9]\n"
- "ld1 { v16.s }[2], [x27]\n"
- "ld1 { v20.s }[2], [x25]\n"
- "ld1 { v24.s }[2], [x23]\n"
- "ld1 { v28.s }[2], [x21]\n"
- "b 182f\n"
- "181:" // Height 6: Partial accumulate: partial_1_0
+ "ldr d16, [x23], #0x8\n"
+ "ldr d20, [x22], #0x8\n"
+ "ldr d24, [x21], #0x8\n"
+ "ldr d28, [x20], #0x8\n"
+ "tbz x11, #0, 176f\n"
+ "ld1 { v8.s }[2], [x28]\n"
+ "ld1 { v12.s }[2], [x24]\n"
+ "ld1 { v16.s }[2], [x23]\n"
+ "ld1 { v20.s }[2], [x22]\n"
+ "ld1 { v24.s }[2], [x21]\n"
+ "ld1 { v28.s }[2], [x20]\n"
+ "b 176f\n"
+ "175:" // Height 6: Partial accumulate: partial_1_0
+ "ldr s8, [x28, #0x0]\n"
"mov x19, #0x0\n"
- "ldr s8, [x13, #0x0]\n"
- "ldr s12, [x9, #0x0]\n"
- "ldr s16, [x27, #0x0]\n"
- "ldr s20, [x25, #0x0]\n"
- "ldr s24, [x23, #0x0]\n"
- "ldr s28, [x21, #0x0]\n"
- "182:" // Height 6: Partial accumulate: Done
- "sub x13, x13, x19\n"
- "sub x9, x9, x19\n"
- "sub x27, x27, x19\n"
- "sub x25, x25, x19\n"
- "sub x23, x23, x19\n"
- "sub x21, x21, x19\n"
- "b 185f\n"
- "183:" // Height 6: full accumulate
- "ldr q8, [x13, #0x0]\n"
- "ldr q9, [x13, #0x10]\n"
- "ldr q10, [x13, #0x20]\n"
- "ldr q11, [x13, #0x30]\n"
- "ldr q12, [x9, #0x0]\n"
- "ldr q13, [x9, #0x10]\n"
- "ldr q14, [x9, #0x20]\n"
- "ldr q15, [x9, #0x30]\n"
- "ldr q16, [x27, #0x0]\n"
- "ldr q17, [x27, #0x10]\n"
- "ldr q18, [x27, #0x20]\n"
- "ldr q19, [x27, #0x30]\n"
- "ldr q20, [x25, #0x0]\n"
- "ldr q21, [x25, #0x10]\n"
- "ldr q22, [x25, #0x20]\n"
- "ldr q23, [x25, #0x30]\n"
- "ldr q24, [x23, #0x0]\n"
- "ldr q25, [x23, #0x10]\n"
- "ldr q26, [x23, #0x20]\n"
- "ldr q27, [x23, #0x30]\n"
- "ldr q28, [x21, #0x0]\n"
- "ldr q29, [x21, #0x10]\n"
- "ldr q30, [x21, #0x20]\n"
- "ldr q31, [x21, #0x30]\n"
- "b 185f\n"
- "184:" // Height 6: no accumulate
+ "ldr s12, [x24, #0x0]\n"
+ "ldr s16, [x23, #0x0]\n"
+ "ldr s20, [x22, #0x0]\n"
+ "ldr s24, [x21, #0x0]\n"
+ "ldr s28, [x20, #0x0]\n"
+ "176:" // Height 6: Partial accumulate: Done
+ "sub x28, x28, x19\n"
+ "b 179f\n"
+ "177:" // Height 6: full accumulate
+ "ldr q8, [x28, #0x0]\n"
+ "ldr q9, [x28, #0x10]\n"
+ "ldr q10, [x28, #0x20]\n"
+ "ldr q11, [x28, #0x30]\n"
+ "ldr q12, [x24, #0x0]\n"
+ "ldr q13, [x24, #0x10]\n"
+ "ldr q14, [x24, #0x20]\n"
+ "ldr q15, [x24, #0x30]\n"
+ "ldr q16, [x23, #0x0]\n"
+ "ldr q17, [x23, #0x10]\n"
+ "ldr q18, [x23, #0x20]\n"
+ "ldr q19, [x23, #0x30]\n"
+ "ldr q20, [x22, #0x0]\n"
+ "ldr q21, [x22, #0x10]\n"
+ "ldr q22, [x22, #0x20]\n"
+ "ldr q23, [x22, #0x30]\n"
+ "ldr q24, [x21, #0x0]\n"
+ "ldr q25, [x21, #0x10]\n"
+ "ldr q26, [x21, #0x20]\n"
+ "ldr q27, [x21, #0x30]\n"
+ "ldr q28, [x20, #0x0]\n"
+ "ldr q29, [x20, #0x10]\n"
+ "ldr q30, [x20, #0x20]\n"
+ "ldr q31, [x20, #0x30]\n"
+ "b 179f\n"
+ "178:" // Height 6: no accumulate
"movi v8.16b, #0x0\n"
"movi v9.16b, #0x0\n"
"movi v10.16b, #0x0\n"
@@ -2837,299 +2766,299 @@ void a64_hybrid_fp32_mla_6x16 (
"movi v29.16b, #0x0\n"
"movi v30.16b, #0x0\n"
"movi v31.16b, #0x0\n"
- "185:" // Height 6: setup done
- "mov x12, #0x0\n"
- "186:" // Height 6: String loop
+ "179:" // Height 6: setup done
+ "mov x27, #0x0\n"
+ "180:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w11, [x20, x12, LSL #0x2]\n"
- "tbz %x[flags], #3, 187f\n"
- "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 181f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x10, [x20, #0x0]\n"
- "ldr x28, [x20, #0x8]\n"
- "ldr x26, [x20, #0x10]\n"
- "ldr x24, [x20, #0x18]\n"
- "ldr x22, [x20, #0x20]\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "ldr x22, [x20, #0x18]\n"
+ "ldr x21, [x20, #0x20]\n"
"ldr x20, [x20, #0x28]\n"
- "cbnz x12, 188f\n"
+ "cbnz x27, 182f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x10, x10, x19, LSL #2\n"
- "add x28, x28, x19, LSL #2\n"
- "add x26, x26, x19, LSL #2\n"
+ "add x25, x25, x19, LSL #2\n"
"add x24, x24, x19, LSL #2\n"
+ "add x23, x23, x19, LSL #2\n"
"add x22, x22, x19, LSL #2\n"
+ "add x21, x21, x19, LSL #2\n"
"add x20, x20, x19, LSL #2\n"
- "b 188f\n"
- "187:" // Height 6: setup direct input
- "mov x10, %x[input_ptr]\n"
- "add x28, x10, x19, LSL #2\n"
- "add x26, x28, x19, LSL #2\n"
- "add x24, x26, x19, LSL #2\n"
- "add x22, x24, x19, LSL #2\n"
- "add x20, x22, x19, LSL #2\n"
- "188:" // Height 6: input setup done
- "cmp x11, #0x4\n"
- "blt 191f\n"
- "cmp x11, #0x8\n"
- "blt 190f\n"
- "189:" // Height 6: Multiply loop: Main loop head
- "ldr q0, [x10, #0x0]\n"
- "ldr q1, [x28, #0x0]\n"
- "ldr q2, [x26, #0x0]\n"
- "ldr q3, [x24, #0x0]\n"
- "ldr q4, [x22, #0x0]\n"
+ "b 182f\n"
+ "181:" // Height 6: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "add x21, x22, x19, LSL #2\n"
+ "add x20, x21, x19, LSL #2\n"
+ "182:" // Height 6: input setup done
+ "cmp x26, #0x4\n"
+ "blt 185f\n"
+ "ldr q0, [x25, #0x0]\n"
+ "ldr q1, [x24, #0x0]\n"
+ "cmp x26, #0x8\n"
+ "ldr q2, [x23, #0x0]\n"
+ "ldr q3, [x22, #0x0]\n"
+ "ldr q4, [x21, #0x0]\n"
"ldr q5, [x20, #0x0]\n"
- "ldr q6, [x15, #0x0]\n"
+ "ldr q6, [x10, #0x0]\n"
+ "blt 184f\n"
+ "183:" // Height 6: Multiply loop: Main loop head
"fmla v8.4s, v6.4s, v0.s[0]\n"
- "ldr q7, [x15, #0x10]\n"
+ "ldr q7, [x10, #0x10]\n"
+ "add x25, x25, #0x10\n"
"fmla v12.4s, v6.4s, v1.s[0]\n"
- "add x10, x10, #0x10\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x24, x24, #0x10\n"
"fmla v16.4s, v6.4s, v2.s[0]\n"
- "add x28, x28, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "add x23, x23, #0x10\n"
"fmla v20.4s, v6.4s, v3.s[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "add x26, x26, #0x10\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "add x22, x22, #0x10\n"
"fmla v24.4s, v6.4s, v4.s[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "add x24, x24, #0x10\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "add x21, x21, #0x10\n"
"fmla v28.4s, v6.4s, v5.s[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "ldr q6, [x15, #0x20]\n"
+ "ldr q6, [x10, #0x20]\n"
+ "add x20, x20, #0x10\n"
"fmla v9.4s, v7.4s, v0.s[0]\n"
- "add x22, x22, #0x10\n"
- "prfm pldl1keep, [x22, #0x80]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
+ "sub x26, x26, #0x4\n"
"fmla v13.4s, v7.4s, v1.s[0]\n"
- "add x20, x20, #0x10\n"
- "fmla v17.4s, v7.4s, v2.s[0]\n"
"prfm pldl1keep, [x20, #0x80]\n"
- "sub x11, x11, #0x4\n"
+ "cmp x26, #0x8\n"
+ "fmla v17.4s, v7.4s, v2.s[0]\n"
"fmla v21.4s, v7.4s, v3.s[0]\n"
- "cmp x11, #0x8\n"
"fmla v25.4s, v7.4s, v4.s[0]\n"
"fmla v29.4s, v7.4s, v5.s[0]\n"
- "ldr q7, [x15, #0x30]\n"
+ "ldr q7, [x10, #0x30]\n"
"fmla v10.4s, v6.4s, v0.s[0]\n"
"fmla v14.4s, v6.4s, v1.s[0]\n"
"fmla v18.4s, v6.4s, v2.s[0]\n"
"fmla v22.4s, v6.4s, v3.s[0]\n"
"fmla v26.4s, v6.4s, v4.s[0]\n"
"fmla v30.4s, v6.4s, v5.s[0]\n"
- "ldr q6, [x15, #0x40]\n"
+ "ldr q6, [x10, #0x40]\n"
"fmla v11.4s, v7.4s, v0.s[0]\n"
"fmla v15.4s, v7.4s, v1.s[0]\n"
"fmla v19.4s, v7.4s, v2.s[0]\n"
"fmla v23.4s, v7.4s, v3.s[0]\n"
"fmla v27.4s, v7.4s, v4.s[0]\n"
"fmla v31.4s, v7.4s, v5.s[0]\n"
- "ldr q7, [x15, #0x50]\n"
+ "ldr q7, [x10, #0x50]\n"
"fmla v8.4s, v6.4s, v0.s[1]\n"
"fmla v12.4s, v6.4s, v1.s[1]\n"
"fmla v16.4s, v6.4s, v2.s[1]\n"
"fmla v20.4s, v6.4s, v3.s[1]\n"
"fmla v24.4s, v6.4s, v4.s[1]\n"
"fmla v28.4s, v6.4s, v5.s[1]\n"
- "ldr q6, [x15, #0x60]\n"
+ "ldr q6, [x10, #0x60]\n"
"fmla v9.4s, v7.4s, v0.s[1]\n"
"fmla v13.4s, v7.4s, v1.s[1]\n"
"fmla v17.4s, v7.4s, v2.s[1]\n"
"fmla v21.4s, v7.4s, v3.s[1]\n"
"fmla v25.4s, v7.4s, v4.s[1]\n"
"fmla v29.4s, v7.4s, v5.s[1]\n"
- "ldr q7, [x15, #0x70]\n"
+ "ldr q7, [x10, #0x70]\n"
"fmla v10.4s, v6.4s, v0.s[1]\n"
"fmla v14.4s, v6.4s, v1.s[1]\n"
"fmla v18.4s, v6.4s, v2.s[1]\n"
"fmla v22.4s, v6.4s, v3.s[1]\n"
"fmla v26.4s, v6.4s, v4.s[1]\n"
"fmla v30.4s, v6.4s, v5.s[1]\n"
- "ldr q6, [x15, #0x80]\n"
+ "ldr q6, [x10, #0x80]\n"
"fmla v11.4s, v7.4s, v0.s[1]\n"
"fmla v15.4s, v7.4s, v1.s[1]\n"
"fmla v19.4s, v7.4s, v2.s[1]\n"
"fmla v23.4s, v7.4s, v3.s[1]\n"
"fmla v27.4s, v7.4s, v4.s[1]\n"
"fmla v31.4s, v7.4s, v5.s[1]\n"
- "ldr q7, [x15, #0x90]\n"
+ "ldr q7, [x10, #0x90]\n"
"fmla v8.4s, v6.4s, v0.s[2]\n"
"fmla v12.4s, v6.4s, v1.s[2]\n"
"fmla v16.4s, v6.4s, v2.s[2]\n"
"fmla v20.4s, v6.4s, v3.s[2]\n"
"fmla v24.4s, v6.4s, v4.s[2]\n"
"fmla v28.4s, v6.4s, v5.s[2]\n"
- "ldr q6, [x15, #0xa0]\n"
+ "ldr q6, [x10, #0xa0]\n"
"fmla v9.4s, v7.4s, v0.s[2]\n"
"fmla v13.4s, v7.4s, v1.s[2]\n"
"fmla v17.4s, v7.4s, v2.s[2]\n"
"fmla v21.4s, v7.4s, v3.s[2]\n"
"fmla v25.4s, v7.4s, v4.s[2]\n"
"fmla v29.4s, v7.4s, v5.s[2]\n"
- "ldr q7, [x15, #0xb0]\n"
+ "ldr q7, [x10, #0xb0]\n"
"fmla v10.4s, v6.4s, v0.s[2]\n"
"fmla v14.4s, v6.4s, v1.s[2]\n"
"fmla v18.4s, v6.4s, v2.s[2]\n"
"fmla v22.4s, v6.4s, v3.s[2]\n"
"fmla v26.4s, v6.4s, v4.s[2]\n"
"fmla v30.4s, v6.4s, v5.s[2]\n"
- "ldr q6, [x15, #0xc0]\n"
+ "ldr q6, [x10, #0xc0]\n"
"fmla v11.4s, v7.4s, v0.s[2]\n"
"fmla v15.4s, v7.4s, v1.s[2]\n"
"fmla v19.4s, v7.4s, v2.s[2]\n"
"fmla v23.4s, v7.4s, v3.s[2]\n"
"fmla v27.4s, v7.4s, v4.s[2]\n"
"fmla v31.4s, v7.4s, v5.s[2]\n"
- "ldr q7, [x15, #0xd0]\n"
+ "ldr q7, [x10, #0xd0]\n"
"fmla v8.4s, v6.4s, v0.s[3]\n"
"fmla v12.4s, v6.4s, v1.s[3]\n"
"fmla v16.4s, v6.4s, v2.s[3]\n"
"fmla v20.4s, v6.4s, v3.s[3]\n"
"fmla v24.4s, v6.4s, v4.s[3]\n"
"fmla v28.4s, v6.4s, v5.s[3]\n"
- "ldr q6, [x15, #0xe0]\n"
+ "ldr q6, [x10, #0xe0]\n"
"fmla v9.4s, v7.4s, v0.s[3]\n"
"fmla v13.4s, v7.4s, v1.s[3]\n"
"fmla v17.4s, v7.4s, v2.s[3]\n"
"fmla v21.4s, v7.4s, v3.s[3]\n"
"fmla v25.4s, v7.4s, v4.s[3]\n"
"fmla v29.4s, v7.4s, v5.s[3]\n"
- "ldr q7, [x15, #0xf0]\n"
+ "ldr q7, [x10, #0xf0]\n"
+ "add x10, x10, #0x100\n"
"fmla v10.4s, v6.4s, v0.s[3]\n"
- "add x15, x15, #0x100\n"
"fmla v14.4s, v6.4s, v1.s[3]\n"
"fmla v18.4s, v6.4s, v2.s[3]\n"
"fmla v22.4s, v6.4s, v3.s[3]\n"
"fmla v26.4s, v6.4s, v4.s[3]\n"
"fmla v30.4s, v6.4s, v5.s[3]\n"
+ "ldr q6, [x10, #0x0]\n"
"fmla v11.4s, v7.4s, v0.s[3]\n"
+ "ldr q0, [x25, #0x0]\n"
"fmla v15.4s, v7.4s, v1.s[3]\n"
+ "ldr q1, [x24, #0x0]\n"
"fmla v19.4s, v7.4s, v2.s[3]\n"
+ "ldr q2, [x23, #0x0]\n"
"fmla v23.4s, v7.4s, v3.s[3]\n"
+ "ldr q3, [x22, #0x0]\n"
"fmla v27.4s, v7.4s, v4.s[3]\n"
+ "ldr q4, [x21, #0x0]\n"
"fmla v31.4s, v7.4s, v5.s[3]\n"
- "bge 189b\n"
- "190:" // Height 6: Multiply loop: Single iteration only
- "sub x11, x11, #0x4\n"
- "ldr q0, [x10, #0x0]\n"
- "ldr q1, [x28, #0x0]\n"
- "ldr q2, [x26, #0x0]\n"
- "ldr q3, [x24, #0x0]\n"
- "ldr q4, [x22, #0x0]\n"
"ldr q5, [x20, #0x0]\n"
- "ldr q6, [x15, #0x0]\n"
+ "bge 183b\n"
+ "184:" // Height 6: Multiply loop: Single iteration only
"fmla v8.4s, v6.4s, v0.s[0]\n"
- "ldr q7, [x15, #0x10]\n"
+ "ldr q7, [x10, #0x10]\n"
+ "sub x26, x26, #0x4\n"
"fmla v12.4s, v6.4s, v1.s[0]\n"
- "add x10, x10, #0x10\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "add x25, x25, #0x10\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
"fmla v16.4s, v6.4s, v2.s[0]\n"
- "add x28, x28, #0x10\n"
- "fmla v20.4s, v6.4s, v3.s[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "add x26, x26, #0x10\n"
- "fmla v24.4s, v6.4s, v4.s[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
"add x24, x24, #0x10\n"
- "fmla v28.4s, v6.4s, v5.s[0]\n"
+ "fmla v20.4s, v6.4s, v3.s[0]\n"
"prfm pldl1keep, [x24, #0x80]\n"
- "ldr q6, [x15, #0x20]\n"
- "fmla v9.4s, v7.4s, v0.s[0]\n"
+ "add x23, x23, #0x10\n"
+ "fmla v24.4s, v6.4s, v4.s[0]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
"add x22, x22, #0x10\n"
+ "fmla v28.4s, v6.4s, v5.s[0]\n"
"prfm pldl1keep, [x22, #0x80]\n"
- "fmla v13.4s, v7.4s, v1.s[0]\n"
+ "add x21, x21, #0x10\n"
+ "fmla v9.4s, v7.4s, v0.s[0]\n"
+ "ldr q6, [x10, #0x20]\n"
"add x20, x20, #0x10\n"
+ "fmla v13.4s, v7.4s, v1.s[0]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
"fmla v17.4s, v7.4s, v2.s[0]\n"
"prfm pldl1keep, [x20, #0x80]\n"
"fmla v21.4s, v7.4s, v3.s[0]\n"
"fmla v25.4s, v7.4s, v4.s[0]\n"
"fmla v29.4s, v7.4s, v5.s[0]\n"
- "ldr q7, [x15, #0x30]\n"
+ "ldr q7, [x10, #0x30]\n"
"fmla v10.4s, v6.4s, v0.s[0]\n"
"fmla v14.4s, v6.4s, v1.s[0]\n"
"fmla v18.4s, v6.4s, v2.s[0]\n"
"fmla v22.4s, v6.4s, v3.s[0]\n"
"fmla v26.4s, v6.4s, v4.s[0]\n"
"fmla v30.4s, v6.4s, v5.s[0]\n"
- "ldr q6, [x15, #0x40]\n"
+ "ldr q6, [x10, #0x40]\n"
"fmla v11.4s, v7.4s, v0.s[0]\n"
"fmla v15.4s, v7.4s, v1.s[0]\n"
"fmla v19.4s, v7.4s, v2.s[0]\n"
"fmla v23.4s, v7.4s, v3.s[0]\n"
"fmla v27.4s, v7.4s, v4.s[0]\n"
"fmla v31.4s, v7.4s, v5.s[0]\n"
- "ldr q7, [x15, #0x50]\n"
+ "ldr q7, [x10, #0x50]\n"
"fmla v8.4s, v6.4s, v0.s[1]\n"
"fmla v12.4s, v6.4s, v1.s[1]\n"
"fmla v16.4s, v6.4s, v2.s[1]\n"
"fmla v20.4s, v6.4s, v3.s[1]\n"
"fmla v24.4s, v6.4s, v4.s[1]\n"
"fmla v28.4s, v6.4s, v5.s[1]\n"
- "ldr q6, [x15, #0x60]\n"
+ "ldr q6, [x10, #0x60]\n"
"fmla v9.4s, v7.4s, v0.s[1]\n"
"fmla v13.4s, v7.4s, v1.s[1]\n"
"fmla v17.4s, v7.4s, v2.s[1]\n"
"fmla v21.4s, v7.4s, v3.s[1]\n"
"fmla v25.4s, v7.4s, v4.s[1]\n"
"fmla v29.4s, v7.4s, v5.s[1]\n"
- "ldr q7, [x15, #0x70]\n"
+ "ldr q7, [x10, #0x70]\n"
"fmla v10.4s, v6.4s, v0.s[1]\n"
"fmla v14.4s, v6.4s, v1.s[1]\n"
"fmla v18.4s, v6.4s, v2.s[1]\n"
"fmla v22.4s, v6.4s, v3.s[1]\n"
"fmla v26.4s, v6.4s, v4.s[1]\n"
"fmla v30.4s, v6.4s, v5.s[1]\n"
- "ldr q6, [x15, #0x80]\n"
+ "ldr q6, [x10, #0x80]\n"
"fmla v11.4s, v7.4s, v0.s[1]\n"
"fmla v15.4s, v7.4s, v1.s[1]\n"
"fmla v19.4s, v7.4s, v2.s[1]\n"
"fmla v23.4s, v7.4s, v3.s[1]\n"
"fmla v27.4s, v7.4s, v4.s[1]\n"
"fmla v31.4s, v7.4s, v5.s[1]\n"
- "ldr q7, [x15, #0x90]\n"
+ "ldr q7, [x10, #0x90]\n"
"fmla v8.4s, v6.4s, v0.s[2]\n"
"fmla v12.4s, v6.4s, v1.s[2]\n"
"fmla v16.4s, v6.4s, v2.s[2]\n"
"fmla v20.4s, v6.4s, v3.s[2]\n"
"fmla v24.4s, v6.4s, v4.s[2]\n"
"fmla v28.4s, v6.4s, v5.s[2]\n"
- "ldr q6, [x15, #0xa0]\n"
+ "ldr q6, [x10, #0xa0]\n"
"fmla v9.4s, v7.4s, v0.s[2]\n"
"fmla v13.4s, v7.4s, v1.s[2]\n"
"fmla v17.4s, v7.4s, v2.s[2]\n"
"fmla v21.4s, v7.4s, v3.s[2]\n"
"fmla v25.4s, v7.4s, v4.s[2]\n"
"fmla v29.4s, v7.4s, v5.s[2]\n"
- "ldr q7, [x15, #0xb0]\n"
+ "ldr q7, [x10, #0xb0]\n"
"fmla v10.4s, v6.4s, v0.s[2]\n"
"fmla v14.4s, v6.4s, v1.s[2]\n"
"fmla v18.4s, v6.4s, v2.s[2]\n"
"fmla v22.4s, v6.4s, v3.s[2]\n"
"fmla v26.4s, v6.4s, v4.s[2]\n"
"fmla v30.4s, v6.4s, v5.s[2]\n"
- "ldr q6, [x15, #0xc0]\n"
+ "ldr q6, [x10, #0xc0]\n"
"fmla v11.4s, v7.4s, v0.s[2]\n"
"fmla v15.4s, v7.4s, v1.s[2]\n"
"fmla v19.4s, v7.4s, v2.s[2]\n"
"fmla v23.4s, v7.4s, v3.s[2]\n"
"fmla v27.4s, v7.4s, v4.s[2]\n"
"fmla v31.4s, v7.4s, v5.s[2]\n"
- "ldr q7, [x15, #0xd0]\n"
+ "ldr q7, [x10, #0xd0]\n"
"fmla v8.4s, v6.4s, v0.s[3]\n"
"fmla v12.4s, v6.4s, v1.s[3]\n"
"fmla v16.4s, v6.4s, v2.s[3]\n"
"fmla v20.4s, v6.4s, v3.s[3]\n"
"fmla v24.4s, v6.4s, v4.s[3]\n"
"fmla v28.4s, v6.4s, v5.s[3]\n"
- "ldr q6, [x15, #0xe0]\n"
+ "ldr q6, [x10, #0xe0]\n"
"fmla v9.4s, v7.4s, v0.s[3]\n"
"fmla v13.4s, v7.4s, v1.s[3]\n"
"fmla v17.4s, v7.4s, v2.s[3]\n"
"fmla v21.4s, v7.4s, v3.s[3]\n"
"fmla v25.4s, v7.4s, v4.s[3]\n"
"fmla v29.4s, v7.4s, v5.s[3]\n"
- "ldr q7, [x15, #0xf0]\n"
+ "ldr q7, [x10, #0xf0]\n"
+ "add x10, x10, #0x100\n"
"fmla v10.4s, v6.4s, v0.s[3]\n"
- "add x15, x15, #0x100\n"
"fmla v14.4s, v6.4s, v1.s[3]\n"
"fmla v18.4s, v6.4s, v2.s[3]\n"
"fmla v22.4s, v6.4s, v3.s[3]\n"
@@ -3141,34 +3070,34 @@ void a64_hybrid_fp32_mla_6x16 (
"fmla v23.4s, v7.4s, v3.s[3]\n"
"fmla v27.4s, v7.4s, v4.s[3]\n"
"fmla v31.4s, v7.4s, v5.s[3]\n"
- "191:" // Height 6: Multiply loop: Main loop skip
- "cbz x11, 193f\n"
- "192:" // Height 6: Multiply loop: Odd block loop
- "ldr s0, [x10], #0x4\n"
- "ldr s1, [x28], #0x4\n"
- "ldr s2, [x26], #0x4\n"
- "ldr s3, [x24], #0x4\n"
- "ldr s4, [x22], #0x4\n"
+ "185:" // Height 6: Multiply loop: Main loop skip
+ "cbz x26, 187f\n"
+ "186:" // Height 6: Multiply loop: Odd block loop
+ "ldr s0, [x25], #0x4\n"
+ "sub x26, x26, #0x1\n"
+ "ldr s1, [x24], #0x4\n"
+ "ldr s2, [x23], #0x4\n"
+ "ldr s3, [x22], #0x4\n"
+ "ldr s4, [x21], #0x4\n"
"ldr s5, [x20], #0x4\n"
- "ldr q6, [x15, #0x0]\n"
+ "ldr q6, [x10, #0x0]\n"
"fmla v8.4s, v6.4s, v0.s[0]\n"
- "ldr q7, [x15, #0x10]\n"
+ "ldr q7, [x10, #0x10]\n"
"fmla v12.4s, v6.4s, v1.s[0]\n"
- "sub x11, x11, #0x1\n"
"fmla v16.4s, v6.4s, v2.s[0]\n"
"fmla v20.4s, v6.4s, v3.s[0]\n"
"fmla v24.4s, v6.4s, v4.s[0]\n"
"fmla v28.4s, v6.4s, v5.s[0]\n"
- "ldr q6, [x15, #0x20]\n"
+ "ldr q6, [x10, #0x20]\n"
"fmla v9.4s, v7.4s, v0.s[0]\n"
"fmla v13.4s, v7.4s, v1.s[0]\n"
"fmla v17.4s, v7.4s, v2.s[0]\n"
"fmla v21.4s, v7.4s, v3.s[0]\n"
"fmla v25.4s, v7.4s, v4.s[0]\n"
"fmla v29.4s, v7.4s, v5.s[0]\n"
- "ldr q7, [x15, #0x30]\n"
+ "ldr q7, [x10, #0x30]\n"
+ "add x10, x10, #0x40\n"
"fmla v10.4s, v6.4s, v0.s[0]\n"
- "add x15, x15, #0x40\n"
"fmla v14.4s, v6.4s, v1.s[0]\n"
"fmla v18.4s, v6.4s, v2.s[0]\n"
"fmla v22.4s, v6.4s, v3.s[0]\n"
@@ -3180,19 +3109,25 @@ void a64_hybrid_fp32_mla_6x16 (
"fmla v23.4s, v7.4s, v3.s[0]\n"
"fmla v27.4s, v7.4s, v4.s[0]\n"
"fmla v31.4s, v7.4s, v5.s[0]\n"
- "cbnz x11, 192b\n"
- "193:" // Height 6: Multiply loop: No odd multiplies
+ "cbnz x26, 186b\n"
+ "187:" // Height 6: Multiply loop: No odd multiplies
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x12, x12, #0x1\n"
- "cmp x12, x19\n"
- "bne 186b\n"
- "prfm pstl1keep, [x13, #0x0]\n"
- "prfm pstl1keep, [x9, #0x0]\n"
- "prfm pstl1keep, [x27, #0x0]\n"
- "prfm pstl1keep, [x25, #0x0]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 180b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "prfm pstl1keep, [x28, #0x0]\n"
+ "add x24, x28, x19, LSL #2\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "add x23, x24, x19, LSL #2\n"
"prfm pstl1keep, [x23, #0x0]\n"
+ "add x22, x23, x19, LSL #2\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "add x21, x22, x19, LSL #2\n"
"prfm pstl1keep, [x21, #0x0]\n"
- "tbz %x[flags], #1, 194f\n"
+ "add x20, x21, x19, LSL #2\n"
+ "prfm pstl1keep, [x20, #0x0]\n"
+ "tbz %x[flags], #1, 188f\n"
"add x19, %x[args_ptr], %[offset_min]\n"
"ld1r { v1.4s }, [x19]\n"
"add x19, %x[args_ptr], %[offset_max]\n"
@@ -3245,185 +3180,180 @@ void a64_hybrid_fp32_mla_6x16 (
"fmin v31.4s, v31.4s, v0.4s\n"
"fmax v30.4s, v30.4s, v1.4s\n"
"fmax v31.4s, v31.4s, v1.4s\n"
- "194:" // Height 6: No activation
- "cmp x16, #0x10\n"
- "bge 203f\n"
- "tbz x16, #3, 198f\n"
- "st1 { v8.4s }, [x13], #0x10\n"
- "st1 { v9.4s }, [x13], #0x10\n"
- "st1 { v12.4s }, [x9], #0x10\n"
- "st1 { v13.4s }, [x9], #0x10\n"
- "st1 { v16.4s }, [x27], #0x10\n"
- "st1 { v17.4s }, [x27], #0x10\n"
- "st1 { v20.4s }, [x25], #0x10\n"
- "st1 { v21.4s }, [x25], #0x10\n"
- "st1 { v24.4s }, [x23], #0x10\n"
- "st1 { v25.4s }, [x23], #0x10\n"
- "st1 { v28.4s }, [x21], #0x10\n"
- "st1 { v29.4s }, [x21], #0x10\n"
- "tbz x16, #2, 196f\n"
- "st1 { v10.4s }, [x13], #0x10\n"
- "st1 { v14.4s }, [x9], #0x10\n"
- "st1 { v18.4s }, [x27], #0x10\n"
- "st1 { v22.4s }, [x25], #0x10\n"
- "st1 { v26.4s }, [x23], #0x10\n"
- "st1 { v30.4s }, [x21], #0x10\n"
- "tbz x16, #1, 195f\n"
- "str d11, [x13], #0x8\n"
- "str d15, [x9], #0x8\n"
- "str d19, [x27], #0x8\n"
- "str d23, [x25], #0x8\n"
- "str d27, [x23], #0x8\n"
- "str d31, [x21], #0x8\n"
- "tbz x16, #0, 202f\n"
- "st1 { v11.s }[2], [x13]\n"
- "st1 { v15.s }[2], [x9]\n"
- "st1 { v19.s }[2], [x27]\n"
- "st1 { v23.s }[2], [x25]\n"
- "st1 { v27.s }[2], [x23]\n"
- "st1 { v31.s }[2], [x21]\n"
- "b 202f\n"
- "195:" // Height 6: Partial direct writeback: partial_1_12
- "tbz x16, #0, 202f\n"
- "str s11, [x13, #0x0]\n"
- "str s15, [x9, #0x0]\n"
- "str s19, [x27, #0x0]\n"
- "str s23, [x25, #0x0]\n"
- "str s27, [x23, #0x0]\n"
- "str s31, [x21, #0x0]\n"
- "b 202f\n"
- "196:" // Height 6: Partial direct writeback: partial_2_8
- "tbz x16, #1, 197f\n"
- "str d10, [x13], #0x8\n"
- "str d14, [x9], #0x8\n"
- "str d18, [x27], #0x8\n"
- "str d22, [x25], #0x8\n"
- "str d26, [x23], #0x8\n"
- "str d30, [x21], #0x8\n"
- "tbz x16, #0, 202f\n"
- "st1 { v10.s }[2], [x13]\n"
- "st1 { v14.s }[2], [x9]\n"
- "st1 { v18.s }[2], [x27]\n"
- "st1 { v22.s }[2], [x25]\n"
- "st1 { v26.s }[2], [x23]\n"
- "st1 { v30.s }[2], [x21]\n"
- "b 202f\n"
- "197:" // Height 6: Partial direct writeback: partial_1_8
- "tbz x16, #0, 202f\n"
- "str s10, [x13, #0x0]\n"
- "str s14, [x9, #0x0]\n"
- "str s18, [x27, #0x0]\n"
- "str s22, [x25, #0x0]\n"
- "str s26, [x23, #0x0]\n"
- "str s30, [x21, #0x0]\n"
- "b 202f\n"
- "198:" // Height 6: Partial direct writeback: partial_4_0
- "tbz x16, #2, 200f\n"
- "st1 { v8.4s }, [x13], #0x10\n"
- "st1 { v12.4s }, [x9], #0x10\n"
- "st1 { v16.4s }, [x27], #0x10\n"
- "st1 { v20.4s }, [x25], #0x10\n"
- "st1 { v24.4s }, [x23], #0x10\n"
- "st1 { v28.4s }, [x21], #0x10\n"
- "tbz x16, #1, 199f\n"
- "str d9, [x13], #0x8\n"
- "str d13, [x9], #0x8\n"
- "str d17, [x27], #0x8\n"
- "str d21, [x25], #0x8\n"
- "str d25, [x23], #0x8\n"
- "str d29, [x21], #0x8\n"
- "tbz x16, #0, 202f\n"
- "st1 { v9.s }[2], [x13]\n"
- "st1 { v13.s }[2], [x9]\n"
- "st1 { v17.s }[2], [x27]\n"
- "st1 { v21.s }[2], [x25]\n"
- "st1 { v25.s }[2], [x23]\n"
- "st1 { v29.s }[2], [x21]\n"
- "b 202f\n"
- "199:" // Height 6: Partial direct writeback: partial_1_4
- "tbz x16, #0, 202f\n"
- "str s9, [x13, #0x0]\n"
- "str s13, [x9, #0x0]\n"
- "str s17, [x27, #0x0]\n"
- "str s21, [x25, #0x0]\n"
- "str s25, [x23, #0x0]\n"
- "str s29, [x21, #0x0]\n"
- "b 202f\n"
- "200:" // Height 6: Partial direct writeback: partial_2_0
- "tbz x16, #1, 201f\n"
- "str d8, [x13], #0x8\n"
- "str d12, [x9], #0x8\n"
- "str d16, [x27], #0x8\n"
- "str d20, [x25], #0x8\n"
- "str d24, [x23], #0x8\n"
- "str d28, [x21], #0x8\n"
- "tbz x16, #0, 202f\n"
- "st1 { v8.s }[2], [x13]\n"
- "st1 { v12.s }[2], [x9]\n"
- "st1 { v16.s }[2], [x27]\n"
- "st1 { v20.s }[2], [x25]\n"
- "st1 { v24.s }[2], [x23]\n"
- "st1 { v28.s }[2], [x21]\n"
- "b 202f\n"
- "201:" // Height 6: Partial direct writeback: partial_1_0
- "str s8, [x13, #0x0]\n"
- "str s12, [x9, #0x0]\n"
- "str s16, [x27, #0x0]\n"
- "str s20, [x25, #0x0]\n"
- "str s24, [x23, #0x0]\n"
- "str s28, [x21, #0x0]\n"
- "202:" // Height 6: Partial direct writeback: Done
- "b 204f\n"
- "203:" // Height 6: Full writeback
- "str q8, [x13, #0x0]\n"
- "str q9, [x13, #0x10]\n"
- "str q10, [x13, #0x20]\n"
- "str q11, [x13, #0x30]\n"
- "str q12, [x9, #0x0]\n"
- "str q13, [x9, #0x10]\n"
- "str q14, [x9, #0x20]\n"
- "str q15, [x9, #0x30]\n"
- "str q16, [x27, #0x0]\n"
- "str q17, [x27, #0x10]\n"
- "str q18, [x27, #0x20]\n"
- "str q19, [x27, #0x30]\n"
- "str q20, [x25, #0x0]\n"
- "str q21, [x25, #0x10]\n"
- "str q22, [x25, #0x20]\n"
- "str q23, [x25, #0x30]\n"
- "str q24, [x23, #0x0]\n"
- "str q25, [x23, #0x10]\n"
- "str q26, [x23, #0x20]\n"
- "str q27, [x23, #0x30]\n"
- "str q28, [x21, #0x0]\n"
- "str q29, [x21, #0x10]\n"
- "str q30, [x21, #0x20]\n"
- "str q31, [x21, #0x30]\n"
- "add x13, x13, #0x40\n"
- "add x9, x9, #0x40\n"
- "add x27, x27, #0x40\n"
- "add x25, x25, #0x40\n"
- "add x23, x23, #0x40\n"
- "add x21, x21, #0x40\n"
- "204:" // Height 6: Writeback done
- "subs x16, x16, #0x10\n"
- "bgt 173b\n"
+ "188:" // Height 6: No activation
+ "cmp x11, #0x10\n"
+ "bge 197f\n"
+ "tbz x11, #3, 192f\n"
+ "st1 { v8.4s }, [x28], #0x10\n"
+ "st1 { v9.4s }, [x28], #0x10\n"
+ "st1 { v12.4s }, [x24], #0x10\n"
+ "st1 { v13.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v17.4s }, [x23], #0x10\n"
+ "st1 { v20.4s }, [x22], #0x10\n"
+ "st1 { v21.4s }, [x22], #0x10\n"
+ "st1 { v24.4s }, [x21], #0x10\n"
+ "st1 { v25.4s }, [x21], #0x10\n"
+ "st1 { v28.4s }, [x20], #0x10\n"
+ "st1 { v29.4s }, [x20], #0x10\n"
+ "tbz x11, #2, 190f\n"
+ "st1 { v10.4s }, [x28], #0x10\n"
+ "st1 { v14.4s }, [x24], #0x10\n"
+ "st1 { v18.4s }, [x23], #0x10\n"
+ "st1 { v22.4s }, [x22], #0x10\n"
+ "st1 { v26.4s }, [x21], #0x10\n"
+ "st1 { v30.4s }, [x20], #0x10\n"
+ "tbz x11, #1, 189f\n"
+ "str d11, [x28], #0x8\n"
+ "str d15, [x24], #0x8\n"
+ "str d19, [x23], #0x8\n"
+ "str d23, [x22], #0x8\n"
+ "str d27, [x21], #0x8\n"
+ "str d31, [x20], #0x8\n"
+ "tbz x11, #0, 196f\n"
+ "st1 { v11.s }[2], [x28]\n"
+ "st1 { v15.s }[2], [x24]\n"
+ "st1 { v19.s }[2], [x23]\n"
+ "st1 { v23.s }[2], [x22]\n"
+ "st1 { v27.s }[2], [x21]\n"
+ "st1 { v31.s }[2], [x20]\n"
+ "b 196f\n"
+ "189:" // Height 6: Partial direct writeback: partial_1_12
+ "tbz x11, #0, 196f\n"
+ "str s11, [x28, #0x0]\n"
+ "str s15, [x24, #0x0]\n"
+ "str s19, [x23, #0x0]\n"
+ "str s23, [x22, #0x0]\n"
+ "str s27, [x21, #0x0]\n"
+ "str s31, [x20, #0x0]\n"
+ "b 196f\n"
+ "190:" // Height 6: Partial direct writeback: partial_2_8
+ "tbz x11, #1, 191f\n"
+ "str d10, [x28], #0x8\n"
+ "str d14, [x24], #0x8\n"
+ "str d18, [x23], #0x8\n"
+ "str d22, [x22], #0x8\n"
+ "str d26, [x21], #0x8\n"
+ "str d30, [x20], #0x8\n"
+ "tbz x11, #0, 196f\n"
+ "st1 { v10.s }[2], [x28]\n"
+ "st1 { v14.s }[2], [x24]\n"
+ "st1 { v18.s }[2], [x23]\n"
+ "st1 { v22.s }[2], [x22]\n"
+ "st1 { v26.s }[2], [x21]\n"
+ "st1 { v30.s }[2], [x20]\n"
+ "b 196f\n"
+ "191:" // Height 6: Partial direct writeback: partial_1_8
+ "tbz x11, #0, 196f\n"
+ "str s10, [x28, #0x0]\n"
+ "str s14, [x24, #0x0]\n"
+ "str s18, [x23, #0x0]\n"
+ "str s22, [x22, #0x0]\n"
+ "str s26, [x21, #0x0]\n"
+ "str s30, [x20, #0x0]\n"
+ "b 196f\n"
+ "192:" // Height 6: Partial direct writeback: partial_4_0
+ "tbz x11, #2, 194f\n"
+ "st1 { v8.4s }, [x28], #0x10\n"
+ "st1 { v12.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v20.4s }, [x22], #0x10\n"
+ "st1 { v24.4s }, [x21], #0x10\n"
+ "st1 { v28.4s }, [x20], #0x10\n"
+ "tbz x11, #1, 193f\n"
+ "str d9, [x28], #0x8\n"
+ "str d13, [x24], #0x8\n"
+ "str d17, [x23], #0x8\n"
+ "str d21, [x22], #0x8\n"
+ "str d25, [x21], #0x8\n"
+ "str d29, [x20], #0x8\n"
+ "tbz x11, #0, 196f\n"
+ "st1 { v9.s }[2], [x28]\n"
+ "st1 { v13.s }[2], [x24]\n"
+ "st1 { v17.s }[2], [x23]\n"
+ "st1 { v21.s }[2], [x22]\n"
+ "st1 { v25.s }[2], [x21]\n"
+ "st1 { v29.s }[2], [x20]\n"
+ "b 196f\n"
+ "193:" // Height 6: Partial direct writeback: partial_1_4
+ "tbz x11, #0, 196f\n"
+ "str s9, [x28, #0x0]\n"
+ "str s13, [x24, #0x0]\n"
+ "str s17, [x23, #0x0]\n"
+ "str s21, [x22, #0x0]\n"
+ "str s25, [x21, #0x0]\n"
+ "str s29, [x20, #0x0]\n"
+ "b 196f\n"
+ "194:" // Height 6: Partial direct writeback: partial_2_0
+ "tbz x11, #1, 195f\n"
+ "str d8, [x28], #0x8\n"
+ "str d12, [x24], #0x8\n"
+ "str d16, [x23], #0x8\n"
+ "str d20, [x22], #0x8\n"
+ "str d24, [x21], #0x8\n"
+ "str d28, [x20], #0x8\n"
+ "tbz x11, #0, 196f\n"
+ "st1 { v8.s }[2], [x28]\n"
+ "st1 { v12.s }[2], [x24]\n"
+ "st1 { v16.s }[2], [x23]\n"
+ "st1 { v20.s }[2], [x22]\n"
+ "st1 { v24.s }[2], [x21]\n"
+ "st1 { v28.s }[2], [x20]\n"
+ "b 196f\n"
+ "195:" // Height 6: Partial direct writeback: partial_1_0
+ "str s8, [x28, #0x0]\n"
+ "str s12, [x24, #0x0]\n"
+ "str s16, [x23, #0x0]\n"
+ "str s20, [x22, #0x0]\n"
+ "str s24, [x21, #0x0]\n"
+ "str s28, [x20, #0x0]\n"
+ "196:" // Height 6: Partial direct writeback: Done
+ "b 198f\n"
+ "197:" // Height 6: Full writeback
+ "str q8, [x28, #0x0]\n"
+ "str q9, [x28, #0x10]\n"
+ "str q10, [x28, #0x20]\n"
+ "str q11, [x28, #0x30]\n"
+ "add x28, x28, #0x40\n"
+ "str q12, [x24, #0x0]\n"
+ "str q13, [x24, #0x10]\n"
+ "str q14, [x24, #0x20]\n"
+ "str q15, [x24, #0x30]\n"
+ "str q16, [x23, #0x0]\n"
+ "str q17, [x23, #0x10]\n"
+ "str q18, [x23, #0x20]\n"
+ "str q19, [x23, #0x30]\n"
+ "str q20, [x22, #0x0]\n"
+ "str q21, [x22, #0x10]\n"
+ "str q22, [x22, #0x20]\n"
+ "str q23, [x22, #0x30]\n"
+ "str q24, [x21, #0x0]\n"
+ "str q25, [x21, #0x10]\n"
+ "str q26, [x21, #0x20]\n"
+ "str q27, [x21, #0x30]\n"
+ "str q28, [x20, #0x0]\n"
+ "str q29, [x20, #0x10]\n"
+ "str q30, [x20, #0x20]\n"
+ "str q31, [x20, #0x30]\n"
+ "198:" // Height 6: Writeback done
+ "subs x11, x11, #0x10\n"
+ "bgt 167b\n"
"subs %x[M], %x[M], #0x6\n"
- "beq 206f\n"
+ "beq 200f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "tbz %x[flags], #3, 205f\n"
+ "tbz %x[flags], #3, 199f\n"
"add x20, x20, #0x6\n"
"str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
"b 1b\n"
- "205:" // Update direct input
+ "199:" // Update direct input
"mov x19, #0x18\n"
"madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
"b 1b\n"
- "206:" // Exit
+ "200:" // Exit
: [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4.hpp
index 2b5cdae652..957754ad68 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4.hpp
@@ -36,9 +36,9 @@
namespace arm_gemm
{
-
// Actual kernel implementations
void a64_hybrid_fp32_mla_8x4( ARGLIST );
+void a64_hybrid_fp32_mla_8x4_a55( ARGLIST );
class cls_a64_hybrid_fp32_mla_8x4
{
@@ -73,9 +73,16 @@ public:
// Default to the generic kernel
kern_type kernel=a64_hybrid_fp32_mla_8x4;
-
- cls_a64_hybrid_fp32_mla_8x4(const CPUInfo *)
+ cls_a64_hybrid_fp32_mla_8x4(const CPUInfo *ci) // Constructor: chooses the kernel implementation from the CPU model reported by ci
+ {
+ switch(ci->get_cpu_model()) { // ci is dereferenced unconditionally, so callers must pass a non-null CPUInfo
+ default: // every other CPU model: 'kernel' is left untouched
+ break;
+ case CPUModel::A55r1: // falls through: A55r1 and A53 share one variant
+ case CPUModel::A53:
+ kernel=a64_hybrid_fp32_mla_8x4_a55; // switch to the _a55 implementation of this kernel
+ break;
+ }
+ }
};
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/a55.cpp
new file mode 100644
index 0000000000..99920002b2
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/a55.cpp
@@ -0,0 +1,2215 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+#include <limits>
+
+namespace arm_gemm {
+
+void a64_hybrid_fp32_mla_8x4_a55 (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<float> A_arg,
+ size_t M, size_t N, const float *B_ptr, IndirectOutputArg<float> output_arg,
+ const float *bias, Activation act, bool accumulate
+)
+{
+ struct KernelArgs { // Argument block for the inline asm below, which reads fields at %[args_ptr] + %[offsetof_*] (operand bindings lie past this excerpt)
+ float maxval = static_cast<float>(std::numeric_limits<float>::infinity()); // upper clamp bound; stays +inf unless BoundedReLU overwrites it with act.param1
+ float minval = - static_cast<float>(std::numeric_limits<float>::infinity()); // lower clamp bound; stays -inf unless ReLU/BoundedReLU set it to 0
+ unsigned int num_strings = {}; // copied from the num_strings parameter; the asm string loop compares its counter against it
+ const unsigned int *string_lengths = {}; // copied from the string_lengths parameter; the asm loads one 32-bit length per string
+ size_t N = {}; // copied from the N parameter; loaded as the column-loop counter, decremented by 4 per pass
+ const float *B_ptr = {}; // copied from the B_ptr parameter; base pointer for the B operand loads
+ size_t output_offset = {}; // output_arg.indirect.offset when indirect, otherwise output_arg.direct.stride
+ size_t input_initial_col = {}; // A_arg.indirect.start_col when indirect; stays zero for direct input
+ size_t input_offset = {}; // A_arg.indirect.start_row when indirect, otherwise A_arg.direct.stride
+ } ka; // single instance, filled in by the setup code that follows
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ if (accumulate) {
+ flags |= 0x1;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ switch(act.type) {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ ka.maxval = static_cast<float>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ ka.minval = 0;
+ flags |= 0x2;
+ break;
+ }
+ __asm__ __volatile__(
+
+ "1:" // Row loop
+ "cmp %x[M], #0x8\n"
+ "bge 148f\n"
+ "cmp %x[M], #0x6\n"
+ "bgt 127f\n"
+ "beq 106f\n"
+ "cmp %x[M], #0x4\n"
+ "bgt 85f\n"
+ "beq 64f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 43f\n"
+ "beq 22f\n"
+ "ldr x17, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x15, %x[bias]\n"
+ "mov x14, %x[output_ptr]\n"
+ "2:" // Height 1: Column loop
+ "cbz x15, 3f\n"
+ "ldr q24, [x15, #0x0]\n"
+ "add x15, x15, #0x10\n"
+ "b 8f\n"
+ "3:" // Height 1: no bias
+ "tbz %x[flags], #0, 7f\n"
+ "cmp x17, #0x4\n"
+ "bge 6f\n"
+ "tbz x17, #1, 4f\n"
+ "ldr d24, [x14], #0x8\n"
+ "mov x19, #0x8\n"
+ "tbz x17, #0, 5f\n"
+ "ld1 { v24.s }[2], [x14]\n"
+ "b 5f\n"
+ "4:" // Height 1: Partial accumulate: partial_1_0
+ "ldr s24, [x14, #0x0]\n"
+ "mov x19, #0x0\n"
+ "5:" // Height 1: Partial accumulate: Done
+ "sub x14, x14, x19\n"
+ "b 8f\n"
+ "6:" // Height 1: full accumulate
+ "ldr q24, [x14, #0x0]\n"
+ "b 8f\n"
+ "7:" // Height 1: no accumulate
+ "movi v24.16b, #0x0\n"
+ "8:" // Height 1: setup done
+ "mov x13, #0x0\n"
+ "9:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w12, [x20, x13, LSL #0x2]\n"
+ "tbz %x[flags], #3, 10f\n"
+ "ldr x20, [%x[input_ptr], x13, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x11, [x20, #0x0]\n"
+ "cbnz x13, 11f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x11, x11, x19, LSL #2\n"
+ "b 11f\n"
+ "10:" // Height 1: setup direct input
+ "mov x11, %x[input_ptr]\n"
+ "11:" // Height 1: input setup done
+ "cmp x12, #0x4\n"
+ "blt 14f\n"
+ "ldr q0, [x11, #0x0]\n"
+ "ldr q8, [x16, #0x0]\n"
+ "cmp x12, #0x8\n"
+ "blt 13f\n"
+ "12:" // Height 1: Multiply loop: Main loop head
+ "fmla v24.4s, v8.4s, v0.s[0]\n"
+ "ldr d9, [x16, #0x10]\n"
+ "ldr x19, [x16, #0x18]\n"
+ "add x11, x11, #0x10\n"
+ "ldr d10, [x16, #0x20]\n"
+ "sub x12, x12, #0x4\n"
+ "ldr x21, [x16, #0x28]\n"
+ "cmp x12, #0x8\n"
+ "mov v9.d[1], x19\n"
+ "ldr d11, [x16, #0x30]\n"
+ "ldr x19, [x16, #0x38]\n"
+ "add x16, x16, #0x40\n"
+ "fmla v24.4s, v9.4s, v0.s[1]\n"
+ "mov v10.d[1], x21\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
+ "mov v11.d[1], x19\n"
+ "ldr d8, [x16, #0x0]\n"
+ "ldr x26, [x16, #0x8]\n"
+ "fmla v24.4s, v10.4s, v0.s[2]\n"
+ "ldr x10, [x11, #0x8]\n"
+ "mov v8.d[1], x26\n"
+ "fmla v24.4s, v11.4s, v0.s[3]\n"
+ "ldr d0, [x11, #0x0]\n"
+ "mov v0.d[1], x10\n"
+ "bge 12b\n"
+ "13:" // Height 1: Multiply loop: Single iteration only
+ "fmla v24.4s, v8.4s, v0.s[0]\n"
+ "ldr q9, [x16, #0x10]\n"
+ "ldr q10, [x16, #0x20]\n"
+ "sub x12, x12, #0x4\n"
+ "ldr q11, [x16, #0x30]\n"
+ "add x11, x11, #0x10\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
+ "add x16, x16, #0x40\n"
+ "fmla v24.4s, v9.4s, v0.s[1]\n"
+ "fmla v24.4s, v10.4s, v0.s[2]\n"
+ "fmla v24.4s, v11.4s, v0.s[3]\n"
+ "14:" // Height 1: Multiply loop: Main loop skip
+ "cbz x12, 16f\n"
+ "15:" // Height 1: Multiply loop: Odd block loop
+ "ldr s0, [x11], #0x4\n"
+ "sub x12, x12, #0x1\n"
+ "ldr q12, [x16, #0x0]\n"
+ "add x16, x16, #0x10\n"
+ "fmla v24.4s, v12.4s, v0.s[0]\n"
+ "cbnz x12, 15b\n"
+ "16:" // Height 1: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x13, x13, #0x1\n"
+ "cmp x13, x19\n"
+ "bne 9b\n"
+ "prfm pstl1keep, [x14, #0x0]\n"
+ "tbz %x[flags], #1, 17f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v17.4s }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v16.4s }, [x19]\n"
+ "fmin v24.4s, v24.4s, v16.4s\n"
+ "fmax v24.4s, v24.4s, v17.4s\n"
+ "17:" // Height 1: No activation
+ "cmp x17, #0x4\n"
+ "bge 20f\n"
+ "tbz x17, #1, 18f\n"
+ "str d24, [x14], #0x8\n"
+ "tbz x17, #0, 19f\n"
+ "st1 { v24.s }[2], [x14]\n"
+ "b 19f\n"
+ "18:" // Height 1: Partial direct writeback: partial_1_0
+ "str s24, [x14, #0x0]\n"
+ "19:" // Height 1: Partial direct writeback: Done
+ "b 21f\n"
+ "20:" // Height 1: Full writeback
+ "str q24, [x14, #0x0]\n"
+ "add x14, x14, #0x10\n"
+ "21:" // Height 1: Writeback done
+ "subs x17, x17, #0x4\n"
+ "bgt 2b\n"
+ "b 170f\n"
+ "22:" // Height 2
+ "ldr x17, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x15, %x[bias]\n"
+ "ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x14, %x[output_ptr]\n"
+ "23:" // Height 2: Column loop
+ "cbz x15, 24f\n"
+ "ldr q24, [x15, #0x0]\n"
+ "add x15, x15, #0x10\n"
+ "mov v25.16b, v24.16b\n"
+ "b 29f\n"
+ "24:" // Height 2: no bias
+ "tbz %x[flags], #0, 28f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x17, #0x4\n"
+ "add x27, x14, x19, LSL #2\n"
+ "bge 27f\n"
+ "tbz x17, #1, 25f\n"
+ "ldr d24, [x14], #0x8\n"
+ "ldr d25, [x27], #0x8\n"
+ "mov x19, #0x8\n"
+ "tbz x17, #0, 26f\n"
+ "ld1 { v24.s }[2], [x14]\n"
+ "ld1 { v25.s }[2], [x27]\n"
+ "b 26f\n"
+ "25:" // Height 2: Partial accumulate: partial_1_0
+ "ldr s24, [x14, #0x0]\n"
+ "mov x19, #0x0\n"
+ "ldr s25, [x27, #0x0]\n"
+ "26:" // Height 2: Partial accumulate: Done
+ "sub x14, x14, x19\n"
+ "b 29f\n"
+ "27:" // Height 2: full accumulate
+ "ldr q24, [x14, #0x0]\n"
+ "ldr q25, [x27, #0x0]\n"
+ "b 29f\n"
+ "28:" // Height 2: no accumulate
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "29:" // Height 2: setup done
+ "mov x13, #0x0\n"
+ "30:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w12, [x20, x13, LSL #0x2]\n"
+ "tbz %x[flags], #3, 31f\n"
+ "ldr x20, [%x[input_ptr], x13, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x11, [x20, #0x0]\n"
+ "ldr x9, [x20, #0x8]\n"
+ "cbnz x13, 32f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x11, x11, x19, LSL #2\n"
+ "add x9, x9, x19, LSL #2\n"
+ "b 32f\n"
+ "31:" // Height 2: setup direct input
+ "mov x11, %x[input_ptr]\n"
+ "add x9, x11, x19, LSL #2\n"
+ "32:" // Height 2: input setup done
+ "cmp x12, #0x4\n"
+ "blt 35f\n"
+ "ldr q0, [x11, #0x0]\n"
+ "ldr q1, [x9, #0x0]\n"
+ "cmp x12, #0x8\n"
+ "ldr q8, [x16, #0x0]\n"
+ "blt 34f\n"
+ "33:" // Height 2: Multiply loop: Main loop head
+ "fmla v24.4s, v8.4s, v0.s[0]\n"
+ "ldr d9, [x16, #0x10]\n"
+ "fmla v25.4s, v8.4s, v1.s[0]\n"
+ "ldr x19, [x16, #0x18]\n"
+ "ldr d10, [x16, #0x20]\n"
+ "add x11, x11, #0x10\n"
+ "ldr x21, [x16, #0x28]\n"
+ "add x9, x9, #0x10\n"
+ "mov v9.d[1], x19\n"
+ "ldr d11, [x16, #0x30]\n"
+ "ldr x19, [x16, #0x38]\n"
+ "sub x12, x12, #0x4\n"
+ "fmla v24.4s, v9.4s, v0.s[1]\n"
+ "mov v10.d[1], x21\n"
+ "fmla v25.4s, v9.4s, v1.s[1]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
+ "mov v11.d[1], x19\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
+ "ldr x10, [x11, #0x8]\n"
+ "cmp x12, #0x8\n"
+ "fmla v24.4s, v10.4s, v0.s[2]\n"
+ "ldr x28, [x9, #0x8]\n"
+ "fmla v25.4s, v10.4s, v1.s[2]\n"
+ "add x16, x16, #0x40\n"
+ "ldr d8, [x16, #0x0]\n"
+ "ldr x26, [x16, #0x8]\n"
+ "fmla v24.4s, v11.4s, v0.s[3]\n"
+ "ldr d0, [x11, #0x0]\n"
+ "fmla v25.4s, v11.4s, v1.s[3]\n"
+ "ldr d1, [x9, #0x0]\n"
+ "mov v8.d[1], x26\n"
+ "mov v0.d[1], x10\n"
+ "mov v1.d[1], x28\n"
+ "bge 33b\n"
+ "34:" // Height 2: Multiply loop: Single iteration only
+ "fmla v24.4s, v8.4s, v0.s[0]\n"
+ "ldr q9, [x16, #0x10]\n"
+ "fmla v25.4s, v8.4s, v1.s[0]\n"
+ "ldr q10, [x16, #0x20]\n"
+ "ldr q11, [x16, #0x30]\n"
+ "sub x12, x12, #0x4\n"
+ "add x11, x11, #0x10\n"
+ "add x9, x9, #0x10\n"
+ "fmla v24.4s, v9.4s, v0.s[1]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
+ "fmla v25.4s, v9.4s, v1.s[1]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
+ "add x16, x16, #0x40\n"
+ "fmla v24.4s, v10.4s, v0.s[2]\n"
+ "fmla v25.4s, v10.4s, v1.s[2]\n"
+ "fmla v24.4s, v11.4s, v0.s[3]\n"
+ "fmla v25.4s, v11.4s, v1.s[3]\n"
+ "35:" // Height 2: Multiply loop: Main loop skip
+ "cbz x12, 37f\n"
+ "36:" // Height 2: Multiply loop: Odd block loop
+ "ldr s0, [x11], #0x4\n"
+ "sub x12, x12, #0x1\n"
+ "ldr s1, [x9], #0x4\n"
+ "ldr q12, [x16, #0x0]\n"
+ "add x16, x16, #0x10\n"
+ "fmla v24.4s, v12.4s, v0.s[0]\n"
+ "fmla v25.4s, v12.4s, v1.s[0]\n"
+ "cbnz x12, 36b\n"
+ "37:" // Height 2: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x13, x13, #0x1\n"
+ "cmp x13, x19\n"
+ "bne 30b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "prfm pstl1keep, [x14, #0x0]\n"
+ "add x27, x14, x19, LSL #2\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "tbz %x[flags], #1, 38f\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v17.4s }, [x20]\n"
+ "ld1r { v16.4s }, [x19]\n"
+ "fmin v24.4s, v24.4s, v16.4s\n"
+ "fmin v25.4s, v25.4s, v16.4s\n"
+ "fmax v24.4s, v24.4s, v17.4s\n"
+ "fmax v25.4s, v25.4s, v17.4s\n"
+ "38:" // Height 2: No activation
+ "cmp x17, #0x4\n"
+ "bge 41f\n"
+ "tbz x17, #1, 39f\n"
+ "str d24, [x14], #0x8\n"
+ "str d25, [x27], #0x8\n"
+ "tbz x17, #0, 40f\n"
+ "st1 { v24.s }[2], [x14]\n"
+ "st1 { v25.s }[2], [x27]\n"
+ "b 40f\n"
+ "39:" // Height 2: Partial direct writeback: partial_1_0
+ "str s24, [x14, #0x0]\n"
+ "str s25, [x27, #0x0]\n"
+ "40:" // Height 2: Partial direct writeback: Done
+ "b 42f\n"
+ "41:" // Height 2: Full writeback
+ "str q24, [x14, #0x0]\n"
+ "add x14, x14, #0x10\n"
+ "str q25, [x27, #0x0]\n"
+ "42:" // Height 2: Writeback done
+ "subs x17, x17, #0x4\n"
+ "bgt 23b\n"
+ "b 170f\n"
+ "43:" // Height 3
+ "ldr x17, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x15, %x[bias]\n"
+ "ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x14, %x[output_ptr]\n"
+ "44:" // Height 3: Column loop
+ "cbz x15, 45f\n"
+ "ldr q24, [x15, #0x0]\n"
+ "add x15, x15, #0x10\n"
+ "mov v25.16b, v24.16b\n"
+ "mov v26.16b, v24.16b\n"
+ "b 50f\n"
+ "45:" // Height 3: no bias
+ "tbz %x[flags], #0, 49f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x17, #0x4\n"
+ "add x27, x14, x19, LSL #2\n"
+ "add x26, x27, x19, LSL #2\n"
+ "bge 48f\n"
+ "tbz x17, #1, 46f\n"
+ "ldr d24, [x14], #0x8\n"
+ "ldr d25, [x27], #0x8\n"
+ "mov x19, #0x8\n"
+ "ldr d26, [x26], #0x8\n"
+ "tbz x17, #0, 47f\n"
+ "ld1 { v24.s }[2], [x14]\n"
+ "ld1 { v25.s }[2], [x27]\n"
+ "ld1 { v26.s }[2], [x26]\n"
+ "b 47f\n"
+ "46:" // Height 3: Partial accumulate: partial_1_0
+ "ldr s24, [x14, #0x0]\n"
+ "mov x19, #0x0\n"
+ "ldr s25, [x27, #0x0]\n"
+ "ldr s26, [x26, #0x0]\n"
+ "47:" // Height 3: Partial accumulate: Done
+ "sub x14, x14, x19\n"
+ "b 50f\n"
+ "48:" // Height 3: full accumulate
+ "ldr q24, [x14, #0x0]\n"
+ "ldr q25, [x27, #0x0]\n"
+ "ldr q26, [x26, #0x0]\n"
+ "b 50f\n"
+ "49:" // Height 3: no accumulate
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v26.16b, #0x0\n"
+ "50:" // Height 3: setup done
+ "mov x13, #0x0\n"
+ "51:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w12, [x20, x13, LSL #0x2]\n"
+ "tbz %x[flags], #3, 52f\n"
+ "ldr x20, [%x[input_ptr], x13, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x11, [x20, #0x0]\n"
+ "ldr x9, [x20, #0x8]\n"
+ "ldr x27, [x20, #0x10]\n"
+ "cbnz x13, 53f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x11, x11, x19, LSL #2\n"
+ "add x9, x9, x19, LSL #2\n"
+ "add x27, x27, x19, LSL #2\n"
+ "b 53f\n"
+ "52:" // Height 3: setup direct input
+ "mov x11, %x[input_ptr]\n"
+ "add x9, x11, x19, LSL #2\n"
+ "add x27, x9, x19, LSL #2\n"
+ "53:" // Height 3: input setup done
+ "cmp x12, #0x4\n"
+ "blt 56f\n"
+ "ldr q0, [x11, #0x0]\n"
+ "ldr q1, [x9, #0x0]\n"
+ "cmp x12, #0x8\n"
+ "ldr q2, [x27, #0x0]\n"
+ "ldr q8, [x16, #0x0]\n"
+ "blt 55f\n"
+ "54:" // Height 3: Multiply loop: Main loop head
+ "fmla v24.4s, v8.4s, v0.s[0]\n"
+ "ldr d9, [x16, #0x10]\n"
+ "fmla v25.4s, v8.4s, v1.s[0]\n"
+ "ldr x19, [x16, #0x18]\n"
+ "fmla v26.4s, v8.4s, v2.s[0]\n"
+ "ldr d10, [x16, #0x20]\n"
+ "ldr x21, [x16, #0x28]\n"
+ "add x11, x11, #0x10\n"
+ "mov v9.d[1], x19\n"
+ "ldr d11, [x16, #0x30]\n"
+ "ldr x19, [x16, #0x38]\n"
+ "add x9, x9, #0x10\n"
+ "fmla v24.4s, v9.4s, v0.s[1]\n"
+ "mov v10.d[1], x21\n"
+ "fmla v25.4s, v9.4s, v1.s[1]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
+ "fmla v26.4s, v9.4s, v2.s[1]\n"
+ "mov v11.d[1], x19\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
+ "add x27, x27, #0x10\n"
+ "fmla v24.4s, v10.4s, v0.s[2]\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
+ "fmla v25.4s, v10.4s, v1.s[2]\n"
+ "ldr x10, [x11, #0x8]\n"
+ "fmla v26.4s, v10.4s, v2.s[2]\n"
+ "ldr x28, [x9, #0x8]\n"
+ "ldr x26, [x27, #0x8]\n"
+ "sub x12, x12, #0x4\n"
+ "fmla v24.4s, v11.4s, v0.s[3]\n"
+ "ldr d0, [x11, #0x0]\n"
+ "fmla v25.4s, v11.4s, v1.s[3]\n"
+ "ldr d1, [x9, #0x0]\n"
+ "fmla v26.4s, v11.4s, v2.s[3]\n"
+ "ldr d2, [x27, #0x0]\n"
+ "mov v0.d[1], x10\n"
+ "cmp x12, #0x8\n"
+ "mov v1.d[1], x28\n"
+ "add x16, x16, #0x40\n"
+ "mov v2.d[1], x26\n"
+ "ldr d8, [x16, #0x0]\n"
+ "ldr x26, [x16, #0x8]\n"
+ "mov v8.d[1], x26\n"
+ "bge 54b\n"
+ "55:" // Height 3: Multiply loop: Single iteration only
+ "fmla v24.4s, v8.4s, v0.s[0]\n"
+ "ldr q9, [x16, #0x10]\n"
+ "fmla v25.4s, v8.4s, v1.s[0]\n"
+ "ldr q10, [x16, #0x20]\n"
+ "fmla v26.4s, v8.4s, v2.s[0]\n"
+ "ldr q11, [x16, #0x30]\n"
+ "sub x12, x12, #0x4\n"
+ "add x11, x11, #0x10\n"
+ "fmla v24.4s, v9.4s, v0.s[1]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
+ "fmla v25.4s, v9.4s, v1.s[1]\n"
+ "add x9, x9, #0x10\n"
+ "fmla v26.4s, v9.4s, v2.s[1]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
+ "add x27, x27, #0x10\n"
+ "add x16, x16, #0x40\n"
+ "fmla v24.4s, v10.4s, v0.s[2]\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
+ "fmla v25.4s, v10.4s, v1.s[2]\n"
+ "fmla v26.4s, v10.4s, v2.s[2]\n"
+ "fmla v24.4s, v11.4s, v0.s[3]\n"
+ "fmla v25.4s, v11.4s, v1.s[3]\n"
+ "fmla v26.4s, v11.4s, v2.s[3]\n"
+ "56:" // Height 3: Multiply loop: Main loop skip
+ "cbz x12, 58f\n"
+ "57:" // Height 3: Multiply loop: Odd block loop
+ "ldr s0, [x11], #0x4\n"
+ "sub x12, x12, #0x1\n"
+ "ldr s1, [x9], #0x4\n"
+ "ldr s2, [x27], #0x4\n"
+ "ldr q12, [x16, #0x0]\n"
+ "add x16, x16, #0x10\n"
+ "fmla v24.4s, v12.4s, v0.s[0]\n"
+ "fmla v25.4s, v12.4s, v1.s[0]\n"
+ "fmla v26.4s, v12.4s, v2.s[0]\n"
+ "cbnz x12, 57b\n"
+ "58:" // Height 3: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x13, x13, #0x1\n"
+ "cmp x13, x19\n"
+ "bne 51b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "prfm pstl1keep, [x14, #0x0]\n"
+ "add x27, x14, x19, LSL #2\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "add x26, x27, x19, LSL #2\n"
+ "prfm pstl1keep, [x26, #0x0]\n"
+ "tbz %x[flags], #1, 59f\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v17.4s }, [x20]\n"
+ "ld1r { v16.4s }, [x19]\n"
+ "fmin v24.4s, v24.4s, v16.4s\n"
+ "fmin v25.4s, v25.4s, v16.4s\n"
+ "fmin v26.4s, v26.4s, v16.4s\n"
+ "fmax v24.4s, v24.4s, v17.4s\n"
+ "fmax v25.4s, v25.4s, v17.4s\n"
+ "fmax v26.4s, v26.4s, v17.4s\n"
+ "59:" // Height 3: No activation
+ "cmp x17, #0x4\n"
+ "bge 62f\n"
+ "tbz x17, #1, 60f\n"
+ "str d24, [x14], #0x8\n"
+ "str d25, [x27], #0x8\n"
+ "str d26, [x26], #0x8\n"
+ "tbz x17, #0, 61f\n"
+ "st1 { v24.s }[2], [x14]\n"
+ "st1 { v25.s }[2], [x27]\n"
+ "st1 { v26.s }[2], [x26]\n"
+ "b 61f\n"
+ "60:" // Height 3: Partial direct writeback: partial_1_0
+ "str s24, [x14, #0x0]\n"
+ "str s25, [x27, #0x0]\n"
+ "str s26, [x26, #0x0]\n"
+ "61:" // Height 3: Partial direct writeback: Done
+ "b 63f\n"
+ "62:" // Height 3: Full writeback
+ "str q24, [x14, #0x0]\n"
+ "add x14, x14, #0x10\n"
+ "str q25, [x27, #0x0]\n"
+ "str q26, [x26, #0x0]\n"
+ "63:" // Height 3: Writeback done
+ "subs x17, x17, #0x4\n"
+ "bgt 44b\n"
+ "b 170f\n"
+ "64:" // Height 4: four input rows -> four output rows (accumulators v24-v27), 4 fp32 columns per column-loop pass
+ "ldr x17, [%x[args_ptr], %[offsetof_N]]\n" // x17 = N field of the args struct; counts the output columns still to do (decremented by 4 per pass)
+ "mov x15, %x[bias]\n" // x15 = bias pointer; a null pointer selects the "no bias" path below
+ "ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n" // x16 = B_ptr field; only ever advanced, so B is consumed sequentially
+ "mov x14, %x[output_ptr]\n" // x14 = output pointer for row 0; other rows are derived from it with output_offset
+ "65:" // Height 4: Column loop
+ "cbz x15, 66f\n"
+ "ldr q24, [x15, #0x0]\n" // load 4 bias values for this column block ...
+ "add x15, x15, #0x10\n"
+ "mov v25.16b, v24.16b\n" // ... and seed every row accumulator with the same bias vector
+ "mov v26.16b, v24.16b\n"
+ "mov v27.16b, v24.16b\n"
+ "b 71f\n"
+ "66:" // Height 4: no bias
+ "tbz %x[flags], #0, 70f\n" // flags bit 0 clear: start from zero instead of the existing output
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" // x19 = output row stride in elements (scaled by 4 bytes via LSL #2 below)
+ "cmp x17, #0x4\n"
+ "add x27, x14, x19, LSL #2\n" // x27/x26/x25 = output rows 1/2/3
+ "add x26, x27, x19, LSL #2\n"
+ "add x25, x26, x19, LSL #2\n"
+ "bge 69f\n" // 4 or more columns left: load full vectors
+ "tbz x17, #1, 67f\n" // bit 1 of the remaining count clear: fewer than 2 columns left
+ "ldr d24, [x14], #0x8\n" // load the first 2 columns of each row (post-incrementing the row pointers)
+ "ldr d25, [x27], #0x8\n"
+ "mov x19, #0x8\n" // remember how far x14 was post-incremented so it can be rewound at label 68
+ "ldr d26, [x26], #0x8\n"
+ "ldr d27, [x25], #0x8\n"
+ "tbz x17, #0, 68f\n" // bit 0 clear: exactly 2 columns, done
+ "ld1 { v24.s }[2], [x14]\n" // third column goes into lane 2
+ "ld1 { v25.s }[2], [x27]\n"
+ "ld1 { v26.s }[2], [x26]\n"
+ "ld1 { v27.s }[2], [x25]\n"
+ "b 68f\n"
+ "67:" // Height 4: Partial accumulate: partial_1_0
+ "ldr s24, [x14, #0x0]\n" // single-column load per row
+ "mov x19, #0x0\n" // x14 was not moved on this path, so nothing to rewind
+ "ldr s25, [x27, #0x0]\n"
+ "ldr s26, [x26, #0x0]\n"
+ "ldr s27, [x25, #0x0]\n"
+ "68:" // Height 4: Partial accumulate: Done
+ "sub x14, x14, x19\n" // restore x14 to the start of this column block (rows 1-3 are recomputed before writeback)
+ "b 71f\n"
+ "69:" // Height 4: full accumulate
+ "ldr q24, [x14, #0x0]\n"
+ "ldr q25, [x27, #0x0]\n"
+ "ldr q26, [x26, #0x0]\n"
+ "ldr q27, [x25, #0x0]\n"
+ "b 71f\n"
+ "70:" // Height 4: no accumulate
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v26.16b, #0x0\n"
+ "movi v27.16b, #0x0\n"
+ "71:" // Height 4: setup done
+ "mov x13, #0x0\n" // x13 = index of the current input string
+ "72:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" // x19 = input_offset field (row index or row stride, depending on the path below)
+ "ldr w12, [x20, x13, LSL #0x2]\n" // x12 = string_lengths[x13] (32-bit entries): K values left in this string
+ "tbz %x[flags], #3, 73f\n" // flags bit 3 clear: input is addressed directly rather than through pointer tables
+ "ldr x20, [%x[input_ptr], x13, LSL #0x3]\n" // indirect: input_ptr[x13] is a table of row pointers for this string
+ "add x20, x20, x19, LSL #3\n" // skip input_offset 8-byte entries in that table
+ "ldr x11, [x20, #0x0]\n" // x11/x9/x27/x25 = input pointers for rows 0/1/2/3
+ "ldr x9, [x20, #0x8]\n"
+ "ldr x27, [x20, #0x10]\n"
+ "ldr x25, [x20, #0x18]\n"
+ "cbnz x13, 74f\n" // the initial column offset is applied to the first string only
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x11, x11, x19, LSL #2\n" // advance each row pointer by input_initial_col 4-byte elements
+ "add x9, x9, x19, LSL #2\n"
+ "add x27, x27, x19, LSL #2\n"
+ "add x25, x25, x19, LSL #2\n"
+ "b 74f\n"
+ "73:" // Height 4: setup direct input
+ "mov x11, %x[input_ptr]\n" // direct: row 0 starts at input_ptr, later rows are input_offset*4 bytes apart
+ "add x9, x11, x19, LSL #2\n"
+ "add x27, x9, x19, LSL #2\n"
+ "add x25, x27, x19, LSL #2\n"
+ "74:" // Height 4: input setup done
+ "cmp x12, #0x4\n"
+ "blt 77f\n" // fewer than 4 K values: only the one-at-a-time loop runs
+ "ldr q0, [x11, #0x0]\n" // preload 4 K values of each input row (v0-v3) ...
+ "ldr q1, [x9, #0x0]\n"
+ "cmp x12, #0x8\n" // flags from this compare are consumed by the blt below (the loads in between do not touch flags)
+ "ldr q2, [x27, #0x0]\n"
+ "ldr q3, [x25, #0x0]\n"
+ "ldr q8, [x16, #0x0]\n" // ... and the first B vector (4 columns for one K value)
+ "blt 76f\n" // fewer than 8 K values: go straight to the final 4-wide iteration
+ "75:" // Height 4: Multiply loop: Main loop head
+ "fmla v24.4s, v8.4s, v0.s[0]\n" // K+0: acc[row] += B vector * A[row][K+0]
+ "ldr d9, [x16, #0x10]\n" // next B vector is fetched as two 64-bit halves: low half straight into v9 ...
+ "fmla v25.4s, v8.4s, v1.s[0]\n"
+ "ldr x19, [x16, #0x18]\n" // ... high half through a general register
+ "fmla v26.4s, v8.4s, v2.s[0]\n"
+ "ldr d10, [x16, #0x20]\n"
+ "fmla v27.4s, v8.4s, v3.s[0]\n"
+ "ldr x21, [x16, #0x28]\n"
+ "mov v9.d[1], x19\n" // ... and inserted here. NOTE(review): the split loads look like scheduling for an in-order core (commit message mentions A55) - confirm
+ "ldr d11, [x16, #0x30]\n"
+ "ldr x19, [x16, #0x38]\n"
+ "add x11, x11, #0x10\n" // row 0 input pointer moves on by 4 floats
+ "fmla v24.4s, v9.4s, v0.s[1]\n" // K+1
+ "mov v10.d[1], x21\n"
+ "fmla v25.4s, v9.4s, v1.s[1]\n"
+ "prfm pldl1keep, [x11, #0x80]\n" // prefetch input 128 bytes ahead
+ "fmla v26.4s, v9.4s, v2.s[1]\n"
+ "mov v11.d[1], x19\n"
+ "fmla v27.4s, v9.4s, v3.s[1]\n"
+ "ldr x10, [x11, #0x8]\n" // high half of the next row-0 input vector
+ "fmla v24.4s, v10.4s, v0.s[2]\n" // K+2
+ "add x9, x9, #0x10\n"
+ "fmla v25.4s, v10.4s, v1.s[2]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
+ "fmla v26.4s, v10.4s, v2.s[2]\n"
+ "ldr x28, [x9, #0x8]\n"
+ "fmla v27.4s, v10.4s, v3.s[2]\n"
+ "add x27, x27, #0x10\n"
+ "fmla v24.4s, v11.4s, v0.s[3]\n" // K+3: last use of the current v0, so it can be reloaded right after
+ "prfm pldl1keep, [x27, #0x80]\n"
+ "fmla v25.4s, v11.4s, v1.s[3]\n"
+ "ldr d0, [x11, #0x0]\n" // start loading the next 4 K values of each row (low halves first)
+ "fmla v26.4s, v11.4s, v2.s[3]\n"
+ "ldr d1, [x9, #0x0]\n"
+ "fmla v27.4s, v11.4s, v3.s[3]\n"
+ "ldr d2, [x27, #0x0]\n"
+ "mov v0.d[1], x10\n"
+ "ldr x26, [x27, #0x8]\n"
+ "mov v1.d[1], x28\n"
+ "add x25, x25, #0x10\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "sub x12, x12, #0x4\n" // 4 K values consumed
+ "mov v2.d[1], x26\n"
+ "ldr d3, [x25, #0x0]\n"
+ "ldr x19, [x25, #0x8]\n"
+ "cmp x12, #0x8\n"
+ "add x16, x16, #0x40\n" // B pointer moves on by 4 vectors (64 bytes)
+ "ldr d8, [x16, #0x0]\n" // preload the first B vector of the next block
+ "mov v3.d[1], x19\n"
+ "ldr x26, [x16, #0x8]\n"
+ "mov v8.d[1], x26\n"
+ "bge 75b\n" // keep looping while at least 8 K values remain (the last full block is handled at label 76)
+ "76:" // Height 4: Multiply loop: Single iteration only
+ "fmla v24.4s, v8.4s, v0.s[0]\n" // final 4-wide block: same arithmetic, but nothing is preloaded for a following block
+ "ldr q9, [x16, #0x10]\n"
+ "fmla v25.4s, v8.4s, v1.s[0]\n"
+ "ldr q10, [x16, #0x20]\n"
+ "fmla v26.4s, v8.4s, v2.s[0]\n"
+ "ldr q11, [x16, #0x30]\n"
+ "fmla v27.4s, v8.4s, v3.s[0]\n"
+ "sub x12, x12, #0x4\n"
+ "fmla v24.4s, v9.4s, v0.s[1]\n"
+ "add x11, x11, #0x10\n"
+ "fmla v25.4s, v9.4s, v1.s[1]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
+ "fmla v26.4s, v9.4s, v2.s[1]\n"
+ "add x9, x9, #0x10\n"
+ "fmla v27.4s, v9.4s, v3.s[1]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
+ "fmla v24.4s, v10.4s, v0.s[2]\n"
+ "add x27, x27, #0x10\n"
+ "fmla v25.4s, v10.4s, v1.s[2]\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
+ "fmla v26.4s, v10.4s, v2.s[2]\n"
+ "add x25, x25, #0x10\n"
+ "fmla v27.4s, v10.4s, v3.s[2]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "fmla v24.4s, v11.4s, v0.s[3]\n"
+ "add x16, x16, #0x40\n"
+ "fmla v25.4s, v11.4s, v1.s[3]\n"
+ "fmla v26.4s, v11.4s, v2.s[3]\n"
+ "fmla v27.4s, v11.4s, v3.s[3]\n"
+ "77:" // Height 4: Multiply loop: Main loop skip
+ "cbz x12, 79f\n"
+ "78:" // Height 4: Multiply loop: Odd block loop
+ "ldr s0, [x11], #0x4\n" // leftover K values are handled one at a time: one input element per row ...
+ "sub x12, x12, #0x1\n"
+ "ldr s1, [x9], #0x4\n"
+ "ldr s2, [x27], #0x4\n"
+ "ldr s3, [x25], #0x4\n"
+ "ldr q12, [x16, #0x0]\n" // ... against one 4-column B vector
+ "add x16, x16, #0x10\n"
+ "fmla v24.4s, v12.4s, v0.s[0]\n"
+ "fmla v25.4s, v12.4s, v1.s[0]\n"
+ "fmla v26.4s, v12.4s, v2.s[0]\n"
+ "fmla v27.4s, v12.4s, v3.s[0]\n"
+ "cbnz x12, 78b\n"
+ "79:" // Height 4: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x13, x13, #0x1\n"
+ "cmp x13, x19\n"
+ "bne 72b\n" // more strings to accumulate into the same output block
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "prfm pstl1keep, [x14, #0x0]\n" // prefetch-for-store each output row while its pointer is rebuilt
+ "add x27, x14, x19, LSL #2\n" // x27/x26/x25 were reused as input pointers / scratch above, so recompute output rows 1-3
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "add x26, x27, x19, LSL #2\n"
+ "prfm pstl1keep, [x26, #0x0]\n"
+ "add x25, x26, x19, LSL #2\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "tbz %x[flags], #1, 80f\n" // flags bit 1 clear: skip the min/max clamp
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v17.4s }, [x20]\n" // v17 = lower bound broadcast to all lanes
+ "ld1r { v16.4s }, [x19]\n" // v16 = upper bound broadcast to all lanes
+ "fmin v24.4s, v24.4s, v16.4s\n" // clamp from above first ...
+ "fmin v25.4s, v25.4s, v16.4s\n"
+ "fmin v26.4s, v26.4s, v16.4s\n"
+ "fmin v27.4s, v27.4s, v16.4s\n"
+ "fmax v24.4s, v24.4s, v17.4s\n" // ... then from below
+ "fmax v25.4s, v25.4s, v17.4s\n"
+ "fmax v26.4s, v26.4s, v17.4s\n"
+ "fmax v27.4s, v27.4s, v17.4s\n"
+ "80:" // Height 4: No activation
+ "cmp x17, #0x4\n"
+ "bge 83f\n" // 4 or more columns left: store full vectors
+ "tbz x17, #1, 81f\n" // bit 1 clear: fewer than 2 columns left
+ "str d24, [x14], #0x8\n" // store the first 2 columns of each row
+ "str d25, [x27], #0x8\n"
+ "str d26, [x26], #0x8\n"
+ "str d27, [x25], #0x8\n"
+ "tbz x17, #0, 82f\n" // bit 0 clear: exactly 2 columns, done
+ "st1 { v24.s }[2], [x14]\n" // third column comes from lane 2
+ "st1 { v25.s }[2], [x27]\n"
+ "st1 { v26.s }[2], [x26]\n"
+ "st1 { v27.s }[2], [x25]\n"
+ "b 82f\n"
+ "81:" // Height 4: Partial direct writeback: partial_1_0
+ "str s24, [x14, #0x0]\n"
+ "str s25, [x27, #0x0]\n"
+ "str s26, [x26, #0x0]\n"
+ "str s27, [x25, #0x0]\n"
+ "82:" // Height 4: Partial direct writeback: Done
+ "b 84f\n"
+ "83:" // Height 4: Full writeback
+ "str q24, [x14, #0x0]\n"
+ "add x14, x14, #0x10\n" // only the row-0 pointer is advanced; the other rows are re-derived from it on the next pass
+ "str q25, [x27, #0x0]\n"
+ "str q26, [x26, #0x0]\n"
+ "str q27, [x25, #0x0]\n"
+ "84:" // Height 4: Writeback done
+ "subs x17, x17, #0x4\n" // 4 columns done
+ "bgt 65b\n" // columns remain: next column block
+ "b 170f\n" // this height is finished; label 170 lies outside this section
+ "85:" // Height 5: five input rows -> five output rows (accumulators v24-v28), 4 fp32 columns per column-loop pass
+ "ldr x17, [%x[args_ptr], %[offsetof_N]]\n" // x17 = N field of the args struct; counts the output columns still to do (decremented by 4 per pass)
+ "mov x15, %x[bias]\n" // x15 = bias pointer; a null pointer selects the "no bias" path below
+ "ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n" // x16 = B_ptr field; only ever advanced, so B is consumed sequentially
+ "mov x14, %x[output_ptr]\n" // x14 = output pointer for row 0; other rows are derived from it with output_offset
+ "86:" // Height 5: Column loop
+ "cbz x15, 87f\n"
+ "ldr q24, [x15, #0x0]\n" // load 4 bias values for this column block ...
+ "add x15, x15, #0x10\n"
+ "mov v25.16b, v24.16b\n" // ... and seed every row accumulator with the same bias vector
+ "mov v26.16b, v24.16b\n"
+ "mov v27.16b, v24.16b\n"
+ "mov v28.16b, v24.16b\n"
+ "b 92f\n"
+ "87:" // Height 5: no bias
+ "tbz %x[flags], #0, 91f\n" // flags bit 0 clear: start from zero instead of the existing output
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" // x19 = output row stride in elements (scaled by 4 bytes via LSL #2 below)
+ "cmp x17, #0x4\n"
+ "add x27, x14, x19, LSL #2\n" // x27/x26/x25/x24 = output rows 1/2/3/4
+ "add x26, x27, x19, LSL #2\n"
+ "add x25, x26, x19, LSL #2\n"
+ "add x24, x25, x19, LSL #2\n"
+ "bge 90f\n" // 4 or more columns left: load full vectors
+ "tbz x17, #1, 88f\n" // bit 1 of the remaining count clear: fewer than 2 columns left
+ "ldr d24, [x14], #0x8\n" // load the first 2 columns of each row (post-incrementing the row pointers)
+ "ldr d25, [x27], #0x8\n"
+ "mov x19, #0x8\n" // remember how far x14 was post-incremented so it can be rewound at label 89
+ "ldr d26, [x26], #0x8\n"
+ "ldr d27, [x25], #0x8\n"
+ "ldr d28, [x24], #0x8\n"
+ "tbz x17, #0, 89f\n" // bit 0 clear: exactly 2 columns, done
+ "ld1 { v24.s }[2], [x14]\n" // third column goes into lane 2
+ "ld1 { v25.s }[2], [x27]\n"
+ "ld1 { v26.s }[2], [x26]\n"
+ "ld1 { v27.s }[2], [x25]\n"
+ "ld1 { v28.s }[2], [x24]\n"
+ "b 89f\n"
+ "88:" // Height 5: Partial accumulate: partial_1_0
+ "ldr s24, [x14, #0x0]\n" // single-column load per row
+ "mov x19, #0x0\n" // x14 was not moved on this path, so nothing to rewind
+ "ldr s25, [x27, #0x0]\n"
+ "ldr s26, [x26, #0x0]\n"
+ "ldr s27, [x25, #0x0]\n"
+ "ldr s28, [x24, #0x0]\n"
+ "89:" // Height 5: Partial accumulate: Done
+ "sub x14, x14, x19\n" // restore x14 to the start of this column block (rows 1-4 are recomputed before writeback)
+ "b 92f\n"
+ "90:" // Height 5: full accumulate
+ "ldr q24, [x14, #0x0]\n"
+ "ldr q25, [x27, #0x0]\n"
+ "ldr q26, [x26, #0x0]\n"
+ "ldr q27, [x25, #0x0]\n"
+ "ldr q28, [x24, #0x0]\n"
+ "b 92f\n"
+ "91:" // Height 5: no accumulate
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v26.16b, #0x0\n"
+ "movi v27.16b, #0x0\n"
+ "movi v28.16b, #0x0\n"
+ "92:" // Height 5: setup done
+ "mov x13, #0x0\n" // x13 = index of the current input string
+ "93:" // Height 5: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" // x19 = input_offset field (row index or row stride, depending on the path below)
+ "ldr w12, [x20, x13, LSL #0x2]\n" // x12 = string_lengths[x13] (32-bit entries): K values left in this string
+ "tbz %x[flags], #3, 94f\n" // flags bit 3 clear: input is addressed directly rather than through pointer tables
+ "ldr x20, [%x[input_ptr], x13, LSL #0x3]\n" // indirect: input_ptr[x13] is a table of row pointers for this string
+ "add x20, x20, x19, LSL #3\n" // skip input_offset 8-byte entries in that table
+ "ldr x11, [x20, #0x0]\n" // x11/x9/x27/x25/x24 = input pointers for rows 0/1/2/3/4
+ "ldr x9, [x20, #0x8]\n"
+ "ldr x27, [x20, #0x10]\n"
+ "ldr x25, [x20, #0x18]\n"
+ "ldr x24, [x20, #0x20]\n"
+ "cbnz x13, 95f\n" // the initial column offset is applied to the first string only
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x11, x11, x19, LSL #2\n" // advance each row pointer by input_initial_col 4-byte elements
+ "add x9, x9, x19, LSL #2\n"
+ "add x27, x27, x19, LSL #2\n"
+ "add x25, x25, x19, LSL #2\n"
+ "add x24, x24, x19, LSL #2\n"
+ "b 95f\n"
+ "94:" // Height 5: setup direct input
+ "mov x11, %x[input_ptr]\n" // direct: row 0 starts at input_ptr, later rows are input_offset*4 bytes apart
+ "add x9, x11, x19, LSL #2\n"
+ "add x27, x9, x19, LSL #2\n"
+ "add x25, x27, x19, LSL #2\n"
+ "add x24, x25, x19, LSL #2\n"
+ "95:" // Height 5: input setup done
+ "cmp x12, #0x4\n"
+ "blt 98f\n" // fewer than 4 K values: only the one-at-a-time loop runs
+ "ldr q0, [x11, #0x0]\n" // preload 4 K values of each input row (v0-v4) ...
+ "ldr q1, [x9, #0x0]\n"
+ "cmp x12, #0x8\n" // flags from this compare are consumed by the blt below (the loads in between do not touch flags)
+ "ldr q2, [x27, #0x0]\n"
+ "ldr q3, [x25, #0x0]\n"
+ "ldr q4, [x24, #0x0]\n"
+ "ldr q8, [x16, #0x0]\n" // ... and the first B vector (4 columns for one K value)
+ "blt 97f\n" // fewer than 8 K values: go straight to the final 4-wide iteration
+ "96:" // Height 5: Multiply loop: Main loop head
+ "fmla v24.4s, v8.4s, v0.s[0]\n" // K+0: acc[row] += B vector * A[row][K+0]
+ "ldr d9, [x16, #0x10]\n" // next B vector is fetched as two 64-bit halves: low half straight into v9 ...
+ "fmla v25.4s, v8.4s, v1.s[0]\n"
+ "ldr x19, [x16, #0x18]\n" // ... high half through a general register
+ "fmla v26.4s, v8.4s, v2.s[0]\n"
+ "ldr d10, [x16, #0x20]\n"
+ "fmla v27.4s, v8.4s, v3.s[0]\n"
+ "ldr x21, [x16, #0x28]\n"
+ "fmla v28.4s, v8.4s, v4.s[0]\n"
+ "mov v9.d[1], x19\n" // ... and inserted here. NOTE(review): the split loads look like scheduling for an in-order core (commit message mentions A55) - confirm
+ "ldr d11, [x16, #0x30]\n"
+ "add x11, x11, #0x10\n" // row 0 input pointer moves on by 4 floats
+ "fmla v24.4s, v9.4s, v0.s[1]\n" // K+1
+ "mov v10.d[1], x21\n"
+ "fmla v25.4s, v9.4s, v1.s[1]\n"
+ "ldr x19, [x16, #0x38]\n"
+ "fmla v26.4s, v9.4s, v2.s[1]\n"
+ "prfm pldl1keep, [x11, #0x80]\n" // prefetch input 128 bytes ahead
+ "fmla v27.4s, v9.4s, v3.s[1]\n"
+ "ldr x10, [x11, #0x8]\n" // high half of the next row-0 input vector
+ "fmla v28.4s, v9.4s, v4.s[1]\n"
+ "mov v11.d[1], x19\n"
+ "fmla v24.4s, v10.4s, v0.s[2]\n" // K+2
+ "add x9, x9, #0x10\n"
+ "fmla v25.4s, v10.4s, v1.s[2]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
+ "fmla v26.4s, v10.4s, v2.s[2]\n"
+ "ldr x28, [x9, #0x8]\n"
+ "fmla v27.4s, v10.4s, v3.s[2]\n"
+ "add x27, x27, #0x10\n"
+ "fmla v28.4s, v10.4s, v4.s[2]\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
+ "fmla v24.4s, v11.4s, v0.s[3]\n" // K+3: last use of the current v0, so it can be reloaded right after
+ "ldr d0, [x11, #0x0]\n" // start loading the next 4 K values of each row (low halves first)
+ "fmla v25.4s, v11.4s, v1.s[3]\n"
+ "ldr d1, [x9, #0x0]\n"
+ "fmla v26.4s, v11.4s, v2.s[3]\n"
+ "ldr d2, [x27, #0x0]\n"
+ "fmla v27.4s, v11.4s, v3.s[3]\n"
+ "mov v0.d[1], x10\n"
+ "fmla v28.4s, v11.4s, v4.s[3]\n"
+ "mov v1.d[1], x28\n"
+ "ldr x26, [x27, #0x8]\n"
+ "add x25, x25, #0x10\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x24, x24, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "sub x12, x12, #0x4\n" // 4 K values consumed
+ "mov v2.d[1], x26\n"
+ "ldr d3, [x25, #0x0]\n"
+ "ldr x19, [x25, #0x8]\n"
+ "cmp x12, #0x8\n"
+ "ldr d4, [x24, #0x0]\n"
+ "add x16, x16, #0x40\n" // B pointer moves on by 4 vectors (64 bytes)
+ "ldr x21, [x24, #0x8]\n"
+ "mov v3.d[1], x19\n"
+ "ldr d8, [x16, #0x0]\n" // preload the first B vector of the next block
+ "ldr x26, [x16, #0x8]\n"
+ "mov v4.d[1], x21\n"
+ "mov v8.d[1], x26\n"
+ "bge 96b\n" // keep looping while at least 8 K values remain (the last full block is handled at label 97)
+ "97:" // Height 5: Multiply loop: Single iteration only
+ "fmla v24.4s, v8.4s, v0.s[0]\n" // final 4-wide block: same arithmetic, but nothing is preloaded for a following block
+ "ldr q9, [x16, #0x10]\n"
+ "fmla v25.4s, v8.4s, v1.s[0]\n"
+ "ldr q10, [x16, #0x20]\n"
+ "fmla v26.4s, v8.4s, v2.s[0]\n"
+ "ldr q11, [x16, #0x30]\n"
+ "fmla v27.4s, v8.4s, v3.s[0]\n"
+ "sub x12, x12, #0x4\n"
+ "fmla v28.4s, v8.4s, v4.s[0]\n"
+ "add x11, x11, #0x10\n"
+ "fmla v24.4s, v9.4s, v0.s[1]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
+ "fmla v25.4s, v9.4s, v1.s[1]\n"
+ "add x9, x9, #0x10\n"
+ "fmla v26.4s, v9.4s, v2.s[1]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
+ "fmla v27.4s, v9.4s, v3.s[1]\n"
+ "add x27, x27, #0x10\n"
+ "fmla v28.4s, v9.4s, v4.s[1]\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
+ "fmla v24.4s, v10.4s, v0.s[2]\n"
+ "add x25, x25, #0x10\n"
+ "fmla v25.4s, v10.4s, v1.s[2]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "fmla v26.4s, v10.4s, v2.s[2]\n"
+ "add x24, x24, #0x10\n"
+ "fmla v27.4s, v10.4s, v3.s[2]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "fmla v28.4s, v10.4s, v4.s[2]\n"
+ "add x16, x16, #0x40\n"
+ "fmla v24.4s, v11.4s, v0.s[3]\n"
+ "fmla v25.4s, v11.4s, v1.s[3]\n"
+ "fmla v26.4s, v11.4s, v2.s[3]\n"
+ "fmla v27.4s, v11.4s, v3.s[3]\n"
+ "fmla v28.4s, v11.4s, v4.s[3]\n"
+ "98:" // Height 5: Multiply loop: Main loop skip
+ "cbz x12, 100f\n"
+ "99:" // Height 5: Multiply loop: Odd block loop
+ "ldr s0, [x11], #0x4\n" // leftover K values are handled one at a time: one input element per row ...
+ "sub x12, x12, #0x1\n"
+ "ldr s1, [x9], #0x4\n"
+ "ldr s2, [x27], #0x4\n"
+ "ldr s3, [x25], #0x4\n"
+ "ldr s4, [x24], #0x4\n"
+ "ldr q12, [x16, #0x0]\n" // ... against one 4-column B vector
+ "add x16, x16, #0x10\n"
+ "fmla v24.4s, v12.4s, v0.s[0]\n"
+ "fmla v25.4s, v12.4s, v1.s[0]\n"
+ "fmla v26.4s, v12.4s, v2.s[0]\n"
+ "fmla v27.4s, v12.4s, v3.s[0]\n"
+ "fmla v28.4s, v12.4s, v4.s[0]\n"
+ "cbnz x12, 99b\n"
+ "100:" // Height 5: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x13, x13, #0x1\n"
+ "cmp x13, x19\n"
+ "bne 93b\n" // more strings to accumulate into the same output block
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "prfm pstl1keep, [x14, #0x0]\n" // prefetch-for-store each output row while its pointer is rebuilt
+ "add x27, x14, x19, LSL #2\n" // x27/x26/x25/x24 were reused as input pointers / scratch above, so recompute output rows 1-4
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "add x26, x27, x19, LSL #2\n"
+ "prfm pstl1keep, [x26, #0x0]\n"
+ "add x25, x26, x19, LSL #2\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "add x24, x25, x19, LSL #2\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "tbz %x[flags], #1, 101f\n" // flags bit 1 clear: skip the min/max clamp
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v17.4s }, [x20]\n" // v17 = lower bound broadcast to all lanes
+ "ld1r { v16.4s }, [x19]\n" // v16 = upper bound broadcast to all lanes
+ "fmin v24.4s, v24.4s, v16.4s\n" // clamp from above first ...
+ "fmin v25.4s, v25.4s, v16.4s\n"
+ "fmin v26.4s, v26.4s, v16.4s\n"
+ "fmin v27.4s, v27.4s, v16.4s\n"
+ "fmin v28.4s, v28.4s, v16.4s\n"
+ "fmax v24.4s, v24.4s, v17.4s\n" // ... then from below
+ "fmax v25.4s, v25.4s, v17.4s\n"
+ "fmax v26.4s, v26.4s, v17.4s\n"
+ "fmax v27.4s, v27.4s, v17.4s\n"
+ "fmax v28.4s, v28.4s, v17.4s\n"
+ "101:" // Height 5: No activation
+ "cmp x17, #0x4\n"
+ "bge 104f\n" // 4 or more columns left: store full vectors
+ "tbz x17, #1, 102f\n" // bit 1 clear: fewer than 2 columns left
+ "str d24, [x14], #0x8\n" // store the first 2 columns of each row
+ "str d25, [x27], #0x8\n"
+ "str d26, [x26], #0x8\n"
+ "str d27, [x25], #0x8\n"
+ "str d28, [x24], #0x8\n"
+ "tbz x17, #0, 103f\n" // bit 0 clear: exactly 2 columns, done
+ "st1 { v24.s }[2], [x14]\n" // third column comes from lane 2
+ "st1 { v25.s }[2], [x27]\n"
+ "st1 { v26.s }[2], [x26]\n"
+ "st1 { v27.s }[2], [x25]\n"
+ "st1 { v28.s }[2], [x24]\n"
+ "b 103f\n"
+ "102:" // Height 5: Partial direct writeback: partial_1_0
+ "str s24, [x14, #0x0]\n"
+ "str s25, [x27, #0x0]\n"
+ "str s26, [x26, #0x0]\n"
+ "str s27, [x25, #0x0]\n"
+ "str s28, [x24, #0x0]\n"
+ "103:" // Height 5: Partial direct writeback: Done
+ "b 105f\n"
+ "104:" // Height 5: Full writeback
+ "str q24, [x14, #0x0]\n"
+ "add x14, x14, #0x10\n" // only the row-0 pointer is advanced; the other rows are re-derived from it on the next pass
+ "str q25, [x27, #0x0]\n"
+ "str q26, [x26, #0x0]\n"
+ "str q27, [x25, #0x0]\n"
+ "str q28, [x24, #0x0]\n"
+ "105:" // Height 5: Writeback done
+ "subs x17, x17, #0x4\n" // 4 columns done
+ "bgt 86b\n" // columns remain: next column block
+ "b 170f\n" // this height is finished; label 170 lies outside this section
+ "106:" // Height 6: six input rows -> six output rows (accumulators v24-v29), 4 fp32 columns per column-loop pass
+ "ldr x17, [%x[args_ptr], %[offsetof_N]]\n" // x17 = N field of the args struct; counts the output columns still to do (decremented by 4 per pass)
+ "mov x15, %x[bias]\n" // x15 = bias pointer; a null pointer selects the "no bias" path below
+ "ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n" // x16 = B_ptr field; only ever advanced, so B is consumed sequentially
+ "mov x14, %x[output_ptr]\n" // x14 = output pointer for row 0; other rows are derived from it with output_offset
+ "107:" // Height 6: Column loop
+ "cbz x15, 108f\n"
+ "ldr q24, [x15, #0x0]\n" // load 4 bias values for this column block ...
+ "add x15, x15, #0x10\n"
+ "mov v25.16b, v24.16b\n" // ... and seed every row accumulator with the same bias vector
+ "mov v26.16b, v24.16b\n"
+ "mov v27.16b, v24.16b\n"
+ "mov v28.16b, v24.16b\n"
+ "mov v29.16b, v24.16b\n"
+ "b 113f\n"
+ "108:" // Height 6: no bias
+ "tbz %x[flags], #0, 112f\n" // flags bit 0 clear: start from zero instead of the existing output
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" // x19 = output row stride in elements (scaled by 4 bytes via LSL #2 below)
+ "cmp x17, #0x4\n"
+ "add x27, x14, x19, LSL #2\n" // x27/x26/x25/x24/x23 = output rows 1/2/3/4/5
+ "add x26, x27, x19, LSL #2\n"
+ "add x25, x26, x19, LSL #2\n"
+ "add x24, x25, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "bge 111f\n" // 4 or more columns left: load full vectors
+ "tbz x17, #1, 109f\n" // bit 1 of the remaining count clear: fewer than 2 columns left
+ "ldr d24, [x14], #0x8\n" // load the first 2 columns of each row (post-incrementing the row pointers)
+ "ldr d25, [x27], #0x8\n"
+ "mov x19, #0x8\n" // remember how far x14 was post-incremented so it can be rewound at label 110
+ "ldr d26, [x26], #0x8\n"
+ "ldr d27, [x25], #0x8\n"
+ "ldr d28, [x24], #0x8\n"
+ "ldr d29, [x23], #0x8\n"
+ "tbz x17, #0, 110f\n" // bit 0 clear: exactly 2 columns, done
+ "ld1 { v24.s }[2], [x14]\n" // third column goes into lane 2
+ "ld1 { v25.s }[2], [x27]\n"
+ "ld1 { v26.s }[2], [x26]\n"
+ "ld1 { v27.s }[2], [x25]\n"
+ "ld1 { v28.s }[2], [x24]\n"
+ "ld1 { v29.s }[2], [x23]\n"
+ "b 110f\n"
+ "109:" // Height 6: Partial accumulate: partial_1_0
+ "ldr s24, [x14, #0x0]\n" // single-column load per row
+ "mov x19, #0x0\n" // x14 was not moved on this path, so nothing to rewind
+ "ldr s25, [x27, #0x0]\n"
+ "ldr s26, [x26, #0x0]\n"
+ "ldr s27, [x25, #0x0]\n"
+ "ldr s28, [x24, #0x0]\n"
+ "ldr s29, [x23, #0x0]\n"
+ "110:" // Height 6: Partial accumulate: Done
+ "sub x14, x14, x19\n" // restore x14 to the start of this column block (rows 1-5 are recomputed before writeback)
+ "b 113f\n"
+ "111:" // Height 6: full accumulate
+ "ldr q24, [x14, #0x0]\n"
+ "ldr q25, [x27, #0x0]\n"
+ "ldr q26, [x26, #0x0]\n"
+ "ldr q27, [x25, #0x0]\n"
+ "ldr q28, [x24, #0x0]\n"
+ "ldr q29, [x23, #0x0]\n"
+ "b 113f\n"
+ "112:" // Height 6: no accumulate
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v26.16b, #0x0\n"
+ "movi v27.16b, #0x0\n"
+ "movi v28.16b, #0x0\n"
+ "movi v29.16b, #0x0\n"
+ "113:" // Height 6: setup done
+ "mov x13, #0x0\n" // x13 = index of the current input string
+ "114:" // Height 6: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" // x19 = input_offset field (row index or row stride, depending on the path below)
+ "ldr w12, [x20, x13, LSL #0x2]\n" // x12 = string_lengths[x13] (32-bit entries): K values left in this string
+ "tbz %x[flags], #3, 115f\n" // flags bit 3 clear: input is addressed directly rather than through pointer tables
+ "ldr x20, [%x[input_ptr], x13, LSL #0x3]\n" // indirect: input_ptr[x13] is a table of row pointers for this string
+ "add x20, x20, x19, LSL #3\n" // skip input_offset 8-byte entries in that table
+ "ldr x11, [x20, #0x0]\n" // x11/x9/x27/x25/x24/x23 = input pointers for rows 0/1/2/3/4/5
+ "ldr x9, [x20, #0x8]\n"
+ "ldr x27, [x20, #0x10]\n"
+ "ldr x25, [x20, #0x18]\n"
+ "ldr x24, [x20, #0x20]\n"
+ "ldr x23, [x20, #0x28]\n"
+ "cbnz x13, 116f\n" // the initial column offset is applied to the first string only
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x11, x11, x19, LSL #2\n" // advance each row pointer by input_initial_col 4-byte elements
+ "add x9, x9, x19, LSL #2\n"
+ "add x27, x27, x19, LSL #2\n"
+ "add x25, x25, x19, LSL #2\n"
+ "add x24, x24, x19, LSL #2\n"
+ "add x23, x23, x19, LSL #2\n"
+ "b 116f\n"
+ "115:" // Height 6: setup direct input
+ "mov x11, %x[input_ptr]\n" // direct: row 0 starts at input_ptr, later rows are input_offset*4 bytes apart
+ "add x9, x11, x19, LSL #2\n"
+ "add x27, x9, x19, LSL #2\n"
+ "add x25, x27, x19, LSL #2\n"
+ "add x24, x25, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "116:" // Height 6: input setup done
+ "cmp x12, #0x4\n"
+ "blt 119f\n" // fewer than 4 K values: only the one-at-a-time loop runs
+ "ldr q0, [x11, #0x0]\n" // preload 4 K values of each input row (v0-v5) ...
+ "ldr q1, [x9, #0x0]\n"
+ "cmp x12, #0x8\n" // flags from this compare are consumed by the blt below (the loads in between do not touch flags)
+ "ldr q2, [x27, #0x0]\n"
+ "ldr q3, [x25, #0x0]\n"
+ "ldr q4, [x24, #0x0]\n"
+ "ldr q5, [x23, #0x0]\n"
+ "ldr q8, [x16, #0x0]\n" // ... and the first B vector (4 columns for one K value)
+ "blt 118f\n" // fewer than 8 K values: go straight to the final 4-wide iteration
+ "117:" // Height 6: Multiply loop: Main loop head
+ "fmla v24.4s, v8.4s, v0.s[0]\n" // K+0: acc[row] += B vector * A[row][K+0]
+ "ldr d9, [x16, #0x10]\n" // next B vector is fetched as two 64-bit halves: low half straight into v9 ...
+ "fmla v25.4s, v8.4s, v1.s[0]\n"
+ "ldr x19, [x16, #0x18]\n" // ... high half through a general register
+ "fmla v26.4s, v8.4s, v2.s[0]\n"
+ "ldr d10, [x16, #0x20]\n"
+ "fmla v27.4s, v8.4s, v3.s[0]\n"
+ "ldr x21, [x16, #0x28]\n"
+ "fmla v28.4s, v8.4s, v4.s[0]\n"
+ "mov v9.d[1], x19\n" // ... and inserted here. NOTE(review): the split loads look like scheduling for an in-order core (commit message mentions A55) - confirm
+ "fmla v29.4s, v8.4s, v5.s[0]\n"
+ "ldr d11, [x16, #0x30]\n"
+ "fmla v24.4s, v9.4s, v0.s[1]\n" // K+1
+ "mov v10.d[1], x21\n"
+ "fmla v25.4s, v9.4s, v1.s[1]\n"
+ "ldr x19, [x16, #0x38]\n"
+ "fmla v26.4s, v9.4s, v2.s[1]\n"
+ "add x11, x11, #0x10\n" // row 0 input pointer moves on by 4 floats
+ "fmla v27.4s, v9.4s, v3.s[1]\n"
+ "prfm pldl1keep, [x11, #0x80]\n" // prefetch input 128 bytes ahead
+ "fmla v28.4s, v9.4s, v4.s[1]\n"
+ "mov v11.d[1], x19\n"
+ "fmla v29.4s, v9.4s, v5.s[1]\n"
+ "ldr x10, [x11, #0x8]\n" // high half of the next row-0 input vector
+ "fmla v24.4s, v10.4s, v0.s[2]\n" // K+2
+ "add x9, x9, #0x10\n"
+ "fmla v25.4s, v10.4s, v1.s[2]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
+ "fmla v26.4s, v10.4s, v2.s[2]\n"
+ "ldr x28, [x9, #0x8]\n"
+ "fmla v27.4s, v10.4s, v3.s[2]\n"
+ "add x27, x27, #0x10\n"
+ "fmla v28.4s, v10.4s, v4.s[2]\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
+ "fmla v29.4s, v10.4s, v5.s[2]\n"
+ "ldr x26, [x27, #0x8]\n"
+ "fmla v24.4s, v11.4s, v0.s[3]\n" // K+3: last use of the current v0, so it can be reloaded right after
+ "ldr d0, [x11, #0x0]\n" // start loading the next 4 K values of each row (low halves first)
+ "fmla v25.4s, v11.4s, v1.s[3]\n"
+ "ldr d1, [x9, #0x0]\n"
+ "fmla v26.4s, v11.4s, v2.s[3]\n"
+ "ldr d2, [x27, #0x0]\n"
+ "fmla v27.4s, v11.4s, v3.s[3]\n"
+ "mov v0.d[1], x10\n"
+ "fmla v28.4s, v11.4s, v4.s[3]\n"
+ "mov v1.d[1], x28\n"
+ "fmla v29.4s, v11.4s, v5.s[3]\n"
+ "mov v2.d[1], x26\n"
+ "add x25, x25, #0x10\n"
+ "add x24, x24, #0x10\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x23, x23, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "sub x12, x12, #0x4\n" // 4 K values consumed
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "cmp x12, #0x8\n"
+ "ldr d3, [x25, #0x0]\n"
+ "add x16, x16, #0x40\n" // B pointer moves on by 4 vectors (64 bytes)
+ "ldr x19, [x25, #0x8]\n"
+ "ldr d4, [x24, #0x0]\n"
+ "ldr x21, [x24, #0x8]\n"
+ "mov v3.d[1], x19\n"
+ "ldr d5, [x23, #0x0]\n"
+ "ldr x19, [x23, #0x8]\n"
+ "mov v4.d[1], x21\n"
+ "ldr d8, [x16, #0x0]\n" // preload the first B vector of the next block
+ "ldr x26, [x16, #0x8]\n"
+ "mov v5.d[1], x19\n"
+ "mov v8.d[1], x26\n"
+ "bge 117b\n" // keep looping while at least 8 K values remain (the last full block is handled at label 118)
+ "118:" // Height 6: Multiply loop: Single iteration only
+ "fmla v24.4s, v8.4s, v0.s[0]\n" // final 4-wide block: same arithmetic, but nothing is preloaded for a following block
+ "ldr q9, [x16, #0x10]\n"
+ "fmla v25.4s, v8.4s, v1.s[0]\n"
+ "ldr q10, [x16, #0x20]\n"
+ "fmla v26.4s, v8.4s, v2.s[0]\n"
+ "ldr q11, [x16, #0x30]\n"
+ "fmla v27.4s, v8.4s, v3.s[0]\n"
+ "sub x12, x12, #0x4\n"
+ "fmla v28.4s, v8.4s, v4.s[0]\n"
+ "add x11, x11, #0x10\n"
+ "fmla v29.4s, v8.4s, v5.s[0]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
+ "fmla v24.4s, v9.4s, v0.s[1]\n"
+ "add x9, x9, #0x10\n"
+ "fmla v25.4s, v9.4s, v1.s[1]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
+ "fmla v26.4s, v9.4s, v2.s[1]\n"
+ "add x27, x27, #0x10\n"
+ "fmla v27.4s, v9.4s, v3.s[1]\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
+ "fmla v28.4s, v9.4s, v4.s[1]\n"
+ "add x25, x25, #0x10\n"
+ "fmla v29.4s, v9.4s, v5.s[1]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "fmla v24.4s, v10.4s, v0.s[2]\n"
+ "add x24, x24, #0x10\n"
+ "fmla v25.4s, v10.4s, v1.s[2]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "fmla v26.4s, v10.4s, v2.s[2]\n"
+ "add x23, x23, #0x10\n"
+ "fmla v27.4s, v10.4s, v3.s[2]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "fmla v28.4s, v10.4s, v4.s[2]\n"
+ "add x16, x16, #0x40\n"
+ "fmla v29.4s, v10.4s, v5.s[2]\n"
+ "fmla v24.4s, v11.4s, v0.s[3]\n"
+ "fmla v25.4s, v11.4s, v1.s[3]\n"
+ "fmla v26.4s, v11.4s, v2.s[3]\n"
+ "fmla v27.4s, v11.4s, v3.s[3]\n"
+ "fmla v28.4s, v11.4s, v4.s[3]\n"
+ "fmla v29.4s, v11.4s, v5.s[3]\n"
+ "119:" // Height 6: Multiply loop: Main loop skip
+ "cbz x12, 121f\n"
+ "120:" // Height 6: Multiply loop: Odd block loop
+ "ldr s0, [x11], #0x4\n" // leftover K values are handled one at a time: one input element per row ...
+ "sub x12, x12, #0x1\n"
+ "ldr s1, [x9], #0x4\n"
+ "ldr s2, [x27], #0x4\n"
+ "ldr s3, [x25], #0x4\n"
+ "ldr s4, [x24], #0x4\n"
+ "ldr s5, [x23], #0x4\n"
+ "ldr q12, [x16, #0x0]\n" // ... against one 4-column B vector
+ "add x16, x16, #0x10\n"
+ "fmla v24.4s, v12.4s, v0.s[0]\n"
+ "fmla v25.4s, v12.4s, v1.s[0]\n"
+ "fmla v26.4s, v12.4s, v2.s[0]\n"
+ "fmla v27.4s, v12.4s, v3.s[0]\n"
+ "fmla v28.4s, v12.4s, v4.s[0]\n"
+ "fmla v29.4s, v12.4s, v5.s[0]\n"
+ "cbnz x12, 120b\n"
+ "121:" // Height 6: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x13, x13, #0x1\n"
+ "cmp x13, x19\n"
+ "bne 114b\n" // more strings to accumulate into the same output block
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "prfm pstl1keep, [x14, #0x0]\n" // prefetch-for-store each output row while its pointer is rebuilt
+ "add x27, x14, x19, LSL #2\n" // x27/x26/x25/x24/x23 were reused as input pointers / scratch above, so recompute output rows 1-5
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "add x26, x27, x19, LSL #2\n"
+ "prfm pstl1keep, [x26, #0x0]\n"
+ "add x25, x26, x19, LSL #2\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "add x24, x25, x19, LSL #2\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "add x23, x24, x19, LSL #2\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "tbz %x[flags], #1, 122f\n" // flags bit 1 clear: skip the min/max clamp
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v17.4s }, [x20]\n" // v17 = lower bound broadcast to all lanes
+ "ld1r { v16.4s }, [x19]\n" // v16 = upper bound broadcast to all lanes
+ "fmin v24.4s, v24.4s, v16.4s\n" // clamp from above first ...
+ "fmin v25.4s, v25.4s, v16.4s\n"
+ "fmin v26.4s, v26.4s, v16.4s\n"
+ "fmin v27.4s, v27.4s, v16.4s\n"
+ "fmin v28.4s, v28.4s, v16.4s\n"
+ "fmin v29.4s, v29.4s, v16.4s\n"
+ "fmax v24.4s, v24.4s, v17.4s\n" // ... then from below
+ "fmax v25.4s, v25.4s, v17.4s\n"
+ "fmax v26.4s, v26.4s, v17.4s\n"
+ "fmax v27.4s, v27.4s, v17.4s\n"
+ "fmax v28.4s, v28.4s, v17.4s\n"
+ "fmax v29.4s, v29.4s, v17.4s\n"
+ "122:" // Height 6: No activation
+ "cmp x17, #0x4\n"
+ "bge 125f\n" // 4 or more columns left: store full vectors
+ "tbz x17, #1, 123f\n" // bit 1 clear: fewer than 2 columns left
+ "str d24, [x14], #0x8\n" // store the first 2 columns of each row
+ "str d25, [x27], #0x8\n"
+ "str d26, [x26], #0x8\n"
+ "str d27, [x25], #0x8\n"
+ "str d28, [x24], #0x8\n"
+ "str d29, [x23], #0x8\n"
+ "tbz x17, #0, 124f\n" // bit 0 clear: exactly 2 columns, done
+ "st1 { v24.s }[2], [x14]\n" // third column comes from lane 2
+ "st1 { v25.s }[2], [x27]\n"
+ "st1 { v26.s }[2], [x26]\n"
+ "st1 { v27.s }[2], [x25]\n"
+ "st1 { v28.s }[2], [x24]\n"
+ "st1 { v29.s }[2], [x23]\n"
+ "b 124f\n"
+ "123:" // Height 6: Partial direct writeback: partial_1_0
+ "str s24, [x14, #0x0]\n"
+ "str s25, [x27, #0x0]\n"
+ "str s26, [x26, #0x0]\n"
+ "str s27, [x25, #0x0]\n"
+ "str s28, [x24, #0x0]\n"
+ "str s29, [x23, #0x0]\n"
+ "124:" // Height 6: Partial direct writeback: Done
+ "b 126f\n"
+ "125:" // Height 6: Full writeback
+ "str q24, [x14, #0x0]\n"
+ "add x14, x14, #0x10\n" // only the row-0 pointer is advanced; the other rows are re-derived from it on the next pass
+ "str q25, [x27, #0x0]\n"
+ "str q26, [x26, #0x0]\n"
+ "str q27, [x25, #0x0]\n"
+ "str q28, [x24, #0x0]\n"
+ "str q29, [x23, #0x0]\n"
+ "126:" // Height 6: Writeback done
+ "subs x17, x17, #0x4\n" // 4 columns done
+ "bgt 107b\n" // columns remain: next column block
+ "b 170f\n" // this height is finished; label 170 lies outside this section
+ "127:" // Height 7
+ "ldr x17, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x15, %x[bias]\n"
+ "ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x14, %x[output_ptr]\n"
+ "128:" // Height 7: Column loop
+ "cbz x15, 129f\n"
+ "ldr q24, [x15, #0x0]\n"
+ "add x15, x15, #0x10\n"
+ "mov v25.16b, v24.16b\n"
+ "mov v26.16b, v24.16b\n"
+ "mov v27.16b, v24.16b\n"
+ "mov v28.16b, v24.16b\n"
+ "mov v29.16b, v24.16b\n"
+ "mov v30.16b, v24.16b\n"
+ "b 134f\n"
+ "129:" // Height 7: no bias
+ "tbz %x[flags], #0, 133f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x17, #0x4\n"
+ "add x27, x14, x19, LSL #2\n"
+ "add x26, x27, x19, LSL #2\n"
+ "add x25, x26, x19, LSL #2\n"
+ "add x24, x25, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "bge 132f\n"
+ "tbz x17, #1, 130f\n"
+ "ldr d24, [x14], #0x8\n"
+ "ldr d25, [x27], #0x8\n"
+ "mov x19, #0x8\n"
+ "ldr d26, [x26], #0x8\n"
+ "ldr d27, [x25], #0x8\n"
+ "ldr d28, [x24], #0x8\n"
+ "ldr d29, [x23], #0x8\n"
+ "ldr d30, [x22], #0x8\n"
+ "tbz x17, #0, 131f\n"
+ "ld1 { v24.s }[2], [x14]\n"
+ "ld1 { v25.s }[2], [x27]\n"
+ "ld1 { v26.s }[2], [x26]\n"
+ "ld1 { v27.s }[2], [x25]\n"
+ "ld1 { v28.s }[2], [x24]\n"
+ "ld1 { v29.s }[2], [x23]\n"
+ "ld1 { v30.s }[2], [x22]\n"
+ "b 131f\n"
+ "130:" // Height 7: Partial accumulate: partial_1_0
+ "ldr s24, [x14, #0x0]\n"
+ "mov x19, #0x0\n"
+ "ldr s25, [x27, #0x0]\n"
+ "ldr s26, [x26, #0x0]\n"
+ "ldr s27, [x25, #0x0]\n"
+ "ldr s28, [x24, #0x0]\n"
+ "ldr s29, [x23, #0x0]\n"
+ "ldr s30, [x22, #0x0]\n"
+ "131:" // Height 7: Partial accumulate: Done
+ "sub x14, x14, x19\n"
+ "b 134f\n"
+ "132:" // Height 7: full accumulate
+ "ldr q24, [x14, #0x0]\n"
+ "ldr q25, [x27, #0x0]\n"
+ "ldr q26, [x26, #0x0]\n"
+ "ldr q27, [x25, #0x0]\n"
+ "ldr q28, [x24, #0x0]\n"
+ "ldr q29, [x23, #0x0]\n"
+ "ldr q30, [x22, #0x0]\n"
+ "b 134f\n"
+ "133:" // Height 7: no accumulate
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v26.16b, #0x0\n"
+ "movi v27.16b, #0x0\n"
+ "movi v28.16b, #0x0\n"
+ "movi v29.16b, #0x0\n"
+ "movi v30.16b, #0x0\n"
+ "134:" // Height 7: setup done
+ "mov x13, #0x0\n"
+ "135:" // Height 7: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w12, [x20, x13, LSL #0x2]\n"
+ "tbz %x[flags], #3, 136f\n"
+ "ldr x20, [%x[input_ptr], x13, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x11, [x20, #0x0]\n"
+ "ldr x9, [x20, #0x8]\n"
+ "ldr x27, [x20, #0x10]\n"
+ "ldr x25, [x20, #0x18]\n"
+ "ldr x24, [x20, #0x20]\n"
+ "ldr x23, [x20, #0x28]\n"
+ "ldr x22, [x20, #0x30]\n"
+ "cbnz x13, 137f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x11, x11, x19, LSL #2\n"
+ "add x9, x9, x19, LSL #2\n"
+ "add x27, x27, x19, LSL #2\n"
+ "add x25, x25, x19, LSL #2\n"
+ "add x24, x24, x19, LSL #2\n"
+ "add x23, x23, x19, LSL #2\n"
+ "add x22, x22, x19, LSL #2\n"
+ "b 137f\n"
+ "136:" // Height 7: setup direct input
+ "mov x11, %x[input_ptr]\n"
+ "add x9, x11, x19, LSL #2\n"
+ "add x27, x9, x19, LSL #2\n"
+ "add x25, x27, x19, LSL #2\n"
+ "add x24, x25, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "137:" // Height 7: input setup done
+ "cmp x12, #0x4\n"
+ "blt 140f\n"
+ "ldr q0, [x11, #0x0]\n"
+ "ldr q1, [x9, #0x0]\n"
+ "cmp x12, #0x8\n"
+ "ldr q2, [x27, #0x0]\n"
+ "ldr q3, [x25, #0x0]\n"
+ "ldr q4, [x24, #0x0]\n"
+ "ldr q5, [x23, #0x0]\n"
+ "ldr q6, [x22, #0x0]\n"
+ "ldr q8, [x16, #0x0]\n"
+ "blt 139f\n"
+ "138:" // Height 7: Multiply loop: Main loop head
+ "fmla v24.4s, v8.4s, v0.s[0]\n"
+ "ldr d9, [x16, #0x10]\n"
+ "fmla v25.4s, v8.4s, v1.s[0]\n"
+ "ldr x19, [x16, #0x18]\n"
+ "fmla v26.4s, v8.4s, v2.s[0]\n"
+ "ldr d10, [x16, #0x20]\n"
+ "fmla v27.4s, v8.4s, v3.s[0]\n"
+ "ldr x21, [x16, #0x28]\n"
+ "fmla v28.4s, v8.4s, v4.s[0]\n"
+ "mov v9.d[1], x19\n"
+ "fmla v29.4s, v8.4s, v5.s[0]\n"
+ "ldr d11, [x16, #0x30]\n"
+ "fmla v30.4s, v8.4s, v6.s[0]\n"
+ "mov v10.d[1], x21\n"
+ "fmla v24.4s, v9.4s, v0.s[1]\n"
+ "ldr x19, [x16, #0x38]\n"
+ "fmla v25.4s, v9.4s, v1.s[1]\n"
+ "add x11, x11, #0x10\n"
+ "fmla v26.4s, v9.4s, v2.s[1]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
+ "fmla v27.4s, v9.4s, v3.s[1]\n"
+ "mov v11.d[1], x19\n"
+ "fmla v28.4s, v9.4s, v4.s[1]\n"
+ "ldr x10, [x11, #0x8]\n"
+ "fmla v29.4s, v9.4s, v5.s[1]\n"
+ "add x9, x9, #0x10\n"
+ "fmla v30.4s, v9.4s, v6.s[1]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
+ "fmla v24.4s, v10.4s, v0.s[2]\n"
+ "ldr x28, [x9, #0x8]\n"
+ "fmla v25.4s, v10.4s, v1.s[2]\n"
+ "add x27, x27, #0x10\n"
+ "fmla v26.4s, v10.4s, v2.s[2]\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
+ "fmla v27.4s, v10.4s, v3.s[2]\n"
+ "ldr x26, [x27, #0x8]\n"
+ "fmla v28.4s, v10.4s, v4.s[2]\n"
+ "add x25, x25, #0x10\n"
+ "fmla v29.4s, v10.4s, v5.s[2]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "fmla v30.4s, v10.4s, v6.s[2]\n"
+ "ldr x19, [x25, #0x8]\n"
+ "fmla v24.4s, v11.4s, v0.s[3]\n"
+ "ldr d0, [x11, #0x0]\n"
+ "fmla v25.4s, v11.4s, v1.s[3]\n"
+ "ldr d1, [x9, #0x0]\n"
+ "fmla v26.4s, v11.4s, v2.s[3]\n"
+ "ldr d2, [x27, #0x0]\n"
+ "fmla v27.4s, v11.4s, v3.s[3]\n"
+ "mov v0.d[1], x10\n"
+ "fmla v28.4s, v11.4s, v4.s[3]\n"
+ "mov v1.d[1], x28\n"
+ "fmla v29.4s, v11.4s, v5.s[3]\n"
+ "mov v2.d[1], x26\n"
+ "fmla v30.4s, v11.4s, v6.s[3]\n"
+ "ldr d3, [x25, #0x0]\n"
+ "add x24, x24, #0x10\n"
+ "add x23, x23, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "add x22, x22, #0x10\n"
+ "mov v3.d[1], x19\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "sub x12, x12, #0x4\n"
+ "ldr d4, [x24, #0x0]\n"
+ "cmp x12, #0x8\n"
+ "ldr x21, [x24, #0x8]\n"
+ "add x16, x16, #0x40\n"
+ "ldr d8, [x16, #0x0]\n"
+ "ldr x26, [x16, #0x8]\n"
+ "mov v4.d[1], x21\n"
+ "ldr d5, [x23, #0x0]\n"
+ "ldr x19, [x23, #0x8]\n"
+ "mov v8.d[1], x26\n"
+ "ldr d6, [x22, #0x0]\n"
+ "ldr x21, [x22, #0x8]\n"
+ "mov v5.d[1], x19\n"
+ "mov v6.d[1], x21\n"
+ "bge 138b\n"
+ "139:" // Height 7: Multiply loop: Single iteration only
+ "fmla v24.4s, v8.4s, v0.s[0]\n"
+ "ldr q9, [x16, #0x10]\n"
+ "fmla v25.4s, v8.4s, v1.s[0]\n"
+ "ldr q10, [x16, #0x20]\n"
+ "fmla v26.4s, v8.4s, v2.s[0]\n"
+ "ldr q11, [x16, #0x30]\n"
+ "fmla v27.4s, v8.4s, v3.s[0]\n"
+ "sub x12, x12, #0x4\n"
+ "fmla v28.4s, v8.4s, v4.s[0]\n"
+ "add x11, x11, #0x10\n"
+ "fmla v29.4s, v8.4s, v5.s[0]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
+ "fmla v30.4s, v8.4s, v6.s[0]\n"
+ "add x9, x9, #0x10\n"
+ "fmla v24.4s, v9.4s, v0.s[1]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
+ "fmla v25.4s, v9.4s, v1.s[1]\n"
+ "add x27, x27, #0x10\n"
+ "fmla v26.4s, v9.4s, v2.s[1]\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
+ "fmla v27.4s, v9.4s, v3.s[1]\n"
+ "add x25, x25, #0x10\n"
+ "fmla v28.4s, v9.4s, v4.s[1]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "fmla v29.4s, v9.4s, v5.s[1]\n"
+ "add x24, x24, #0x10\n"
+ "fmla v30.4s, v9.4s, v6.s[1]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "fmla v24.4s, v10.4s, v0.s[2]\n"
+ "add x23, x23, #0x10\n"
+ "fmla v25.4s, v10.4s, v1.s[2]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "fmla v26.4s, v10.4s, v2.s[2]\n"
+ "add x22, x22, #0x10\n"
+ "fmla v27.4s, v10.4s, v3.s[2]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "fmla v28.4s, v10.4s, v4.s[2]\n"
+ "add x16, x16, #0x40\n"
+ "fmla v29.4s, v10.4s, v5.s[2]\n"
+ "fmla v30.4s, v10.4s, v6.s[2]\n"
+ "fmla v24.4s, v11.4s, v0.s[3]\n"
+ "fmla v25.4s, v11.4s, v1.s[3]\n"
+ "fmla v26.4s, v11.4s, v2.s[3]\n"
+ "fmla v27.4s, v11.4s, v3.s[3]\n"
+ "fmla v28.4s, v11.4s, v4.s[3]\n"
+ "fmla v29.4s, v11.4s, v5.s[3]\n"
+ "fmla v30.4s, v11.4s, v6.s[3]\n"
+ "140:" // Height 7: Multiply loop: Main loop skip
+ "cbz x12, 142f\n"
+ "141:" // Height 7: Multiply loop: Odd block loop
+ "ldr s0, [x11], #0x4\n"
+ "sub x12, x12, #0x1\n"
+ "ldr s1, [x9], #0x4\n"
+ "ldr s2, [x27], #0x4\n"
+ "ldr s3, [x25], #0x4\n"
+ "ldr s4, [x24], #0x4\n"
+ "ldr s5, [x23], #0x4\n"
+ "ldr s6, [x22], #0x4\n"
+ "ldr q12, [x16, #0x0]\n"
+ "add x16, x16, #0x10\n"
+ "fmla v24.4s, v12.4s, v0.s[0]\n"
+ "fmla v25.4s, v12.4s, v1.s[0]\n"
+ "fmla v26.4s, v12.4s, v2.s[0]\n"
+ "fmla v27.4s, v12.4s, v3.s[0]\n"
+ "fmla v28.4s, v12.4s, v4.s[0]\n"
+ "fmla v29.4s, v12.4s, v5.s[0]\n"
+ "fmla v30.4s, v12.4s, v6.s[0]\n"
+ "cbnz x12, 141b\n"
+ "142:" // Height 7: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x13, x13, #0x1\n"
+ "cmp x13, x19\n"
+ "bne 135b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "prfm pstl1keep, [x14, #0x0]\n"
+ "add x27, x14, x19, LSL #2\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "add x26, x27, x19, LSL #2\n"
+ "prfm pstl1keep, [x26, #0x0]\n"
+ "add x25, x26, x19, LSL #2\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "add x24, x25, x19, LSL #2\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "add x23, x24, x19, LSL #2\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "add x22, x23, x19, LSL #2\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "tbz %x[flags], #1, 143f\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v17.4s }, [x20]\n"
+ "ld1r { v16.4s }, [x19]\n"
+ "fmin v24.4s, v24.4s, v16.4s\n"
+ "fmin v25.4s, v25.4s, v16.4s\n"
+ "fmin v26.4s, v26.4s, v16.4s\n"
+ "fmin v27.4s, v27.4s, v16.4s\n"
+ "fmin v28.4s, v28.4s, v16.4s\n"
+ "fmin v29.4s, v29.4s, v16.4s\n"
+ "fmin v30.4s, v30.4s, v16.4s\n"
+ "fmax v24.4s, v24.4s, v17.4s\n"
+ "fmax v25.4s, v25.4s, v17.4s\n"
+ "fmax v26.4s, v26.4s, v17.4s\n"
+ "fmax v27.4s, v27.4s, v17.4s\n"
+ "fmax v28.4s, v28.4s, v17.4s\n"
+ "fmax v29.4s, v29.4s, v17.4s\n"
+ "fmax v30.4s, v30.4s, v17.4s\n"
+ "143:" // Height 7: No activation
+ "cmp x17, #0x4\n"
+ "bge 146f\n"
+ "tbz x17, #1, 144f\n"
+ "str d24, [x14], #0x8\n"
+ "str d25, [x27], #0x8\n"
+ "str d26, [x26], #0x8\n"
+ "str d27, [x25], #0x8\n"
+ "str d28, [x24], #0x8\n"
+ "str d29, [x23], #0x8\n"
+ "str d30, [x22], #0x8\n"
+ "tbz x17, #0, 145f\n"
+ "st1 { v24.s }[2], [x14]\n"
+ "st1 { v25.s }[2], [x27]\n"
+ "st1 { v26.s }[2], [x26]\n"
+ "st1 { v27.s }[2], [x25]\n"
+ "st1 { v28.s }[2], [x24]\n"
+ "st1 { v29.s }[2], [x23]\n"
+ "st1 { v30.s }[2], [x22]\n"
+ "b 145f\n"
+ "144:" // Height 7: Partial direct writeback: partial_1_0
+ "str s24, [x14, #0x0]\n"
+ "str s25, [x27, #0x0]\n"
+ "str s26, [x26, #0x0]\n"
+ "str s27, [x25, #0x0]\n"
+ "str s28, [x24, #0x0]\n"
+ "str s29, [x23, #0x0]\n"
+ "str s30, [x22, #0x0]\n"
+ "145:" // Height 7: Partial direct writeback: Done
+ "b 147f\n"
+ "146:" // Height 7: Full writeback
+ "str q24, [x14, #0x0]\n"
+ "add x14, x14, #0x10\n"
+ "str q25, [x27, #0x0]\n"
+ "str q26, [x26, #0x0]\n"
+ "str q27, [x25, #0x0]\n"
+ "str q28, [x24, #0x0]\n"
+ "str q29, [x23, #0x0]\n"
+ "str q30, [x22, #0x0]\n"
+ "147:" // Height 7: Writeback done
+ "subs x17, x17, #0x4\n"
+ "bgt 128b\n"
+ "b 170f\n"
+ "148:" // Height 8
+ "ldr x17, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x15, %x[bias]\n"
+ "ldr x16, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x14, %x[output_ptr]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov x19, #0x20\n"
+ "madd %x[output_ptr], x20, x19, %x[output_ptr]\n"
+ "149:" // Height 8: Column loop
+ "cbz x15, 150f\n"
+ "ldr q24, [x15, #0x0]\n"
+ "add x15, x15, #0x10\n"
+ "mov v25.16b, v24.16b\n"
+ "mov v26.16b, v24.16b\n"
+ "mov v27.16b, v24.16b\n"
+ "mov v28.16b, v24.16b\n"
+ "mov v29.16b, v24.16b\n"
+ "mov v30.16b, v24.16b\n"
+ "mov v31.16b, v24.16b\n"
+ "b 155f\n"
+ "150:" // Height 8: no bias
+ "tbz %x[flags], #0, 154f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x17, #0x4\n"
+ "add x27, x14, x19, LSL #2\n"
+ "add x26, x27, x19, LSL #2\n"
+ "add x25, x26, x19, LSL #2\n"
+ "add x24, x25, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "add x21, x22, x19, LSL #2\n"
+ "bge 153f\n"
+ "tbz x17, #1, 151f\n"
+ "ldr d24, [x14], #0x8\n"
+ "ldr d25, [x27], #0x8\n"
+ "mov x19, #0x8\n"
+ "ldr d26, [x26], #0x8\n"
+ "ldr d27, [x25], #0x8\n"
+ "ldr d28, [x24], #0x8\n"
+ "ldr d29, [x23], #0x8\n"
+ "ldr d30, [x22], #0x8\n"
+ "ldr d31, [x21], #0x8\n"
+ "tbz x17, #0, 152f\n"
+ "ld1 { v24.s }[2], [x14]\n"
+ "ld1 { v25.s }[2], [x27]\n"
+ "ld1 { v26.s }[2], [x26]\n"
+ "ld1 { v27.s }[2], [x25]\n"
+ "ld1 { v28.s }[2], [x24]\n"
+ "ld1 { v29.s }[2], [x23]\n"
+ "ld1 { v30.s }[2], [x22]\n"
+ "ld1 { v31.s }[2], [x21]\n"
+ "b 152f\n"
+ "151:" // Height 8: Partial accumulate: partial_1_0
+ "ldr s24, [x14, #0x0]\n"
+ "mov x19, #0x0\n"
+ "ldr s25, [x27, #0x0]\n"
+ "ldr s26, [x26, #0x0]\n"
+ "ldr s27, [x25, #0x0]\n"
+ "ldr s28, [x24, #0x0]\n"
+ "ldr s29, [x23, #0x0]\n"
+ "ldr s30, [x22, #0x0]\n"
+ "ldr s31, [x21, #0x0]\n"
+ "152:" // Height 8: Partial accumulate: Done
+ "sub x14, x14, x19\n"
+ "b 155f\n"
+ "153:" // Height 8: full accumulate
+ "ldr q24, [x14, #0x0]\n"
+ "ldr q25, [x27, #0x0]\n"
+ "ldr q26, [x26, #0x0]\n"
+ "ldr q27, [x25, #0x0]\n"
+ "ldr q28, [x24, #0x0]\n"
+ "ldr q29, [x23, #0x0]\n"
+ "ldr q30, [x22, #0x0]\n"
+ "ldr q31, [x21, #0x0]\n"
+ "b 155f\n"
+ "154:" // Height 8: no accumulate
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v26.16b, #0x0\n"
+ "movi v27.16b, #0x0\n"
+ "movi v28.16b, #0x0\n"
+ "movi v29.16b, #0x0\n"
+ "movi v30.16b, #0x0\n"
+ "movi v31.16b, #0x0\n"
+ "155:" // Height 8: setup done
+ "mov x13, #0x0\n"
+ "156:" // Height 8: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w12, [x20, x13, LSL #0x2]\n"
+ "tbz %x[flags], #3, 157f\n"
+ "ldr x20, [%x[input_ptr], x13, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x11, [x20, #0x0]\n"
+ "ldr x9, [x20, #0x8]\n"
+ "ldr x27, [x20, #0x10]\n"
+ "ldr x25, [x20, #0x18]\n"
+ "ldr x24, [x20, #0x20]\n"
+ "ldr x23, [x20, #0x28]\n"
+ "ldr x22, [x20, #0x30]\n"
+ "ldr x20, [x20, #0x38]\n"
+ "cbnz x13, 158f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x11, x11, x19, LSL #2\n"
+ "add x9, x9, x19, LSL #2\n"
+ "add x27, x27, x19, LSL #2\n"
+ "add x25, x25, x19, LSL #2\n"
+ "add x24, x24, x19, LSL #2\n"
+ "add x23, x23, x19, LSL #2\n"
+ "add x22, x22, x19, LSL #2\n"
+ "add x20, x20, x19, LSL #2\n"
+ "b 158f\n"
+ "157:" // Height 8: setup direct input
+ "mov x11, %x[input_ptr]\n"
+ "add x9, x11, x19, LSL #2\n"
+ "add x27, x9, x19, LSL #2\n"
+ "add x25, x27, x19, LSL #2\n"
+ "add x24, x25, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "add x20, x22, x19, LSL #2\n"
+ "158:" // Height 8: input setup done
+ "cmp x12, #0x4\n"
+ "blt 161f\n"
+ "ldr q0, [x11, #0x0]\n"
+ "ldr q1, [x9, #0x0]\n"
+ "cmp x12, #0x8\n"
+ "ldr q2, [x27, #0x0]\n"
+ "ldr q3, [x25, #0x0]\n"
+ "ldr q4, [x24, #0x0]\n"
+ "ldr q5, [x23, #0x0]\n"
+ "ldr q6, [x22, #0x0]\n"
+ "ldr q7, [x20, #0x0]\n"
+ "ldr q8, [x16, #0x0]\n"
+ "blt 160f\n"
+ "159:" // Height 8: Multiply loop: Main loop head
+ "fmla v24.4s, v8.4s, v0.s[0]\n"
+ "ldr d9, [x16, #0x10]\n"
+ "fmla v25.4s, v8.4s, v1.s[0]\n"
+ "ldr x19, [x16, #0x18]\n"
+ "fmla v26.4s, v8.4s, v2.s[0]\n"
+ "ldr d10, [x16, #0x20]\n"
+ "fmla v27.4s, v8.4s, v3.s[0]\n"
+ "ldr x21, [x16, #0x28]\n"
+ "fmla v28.4s, v8.4s, v4.s[0]\n"
+ "mov v9.d[1], x19\n"
+ "fmla v29.4s, v8.4s, v5.s[0]\n"
+ "ldr d11, [x16, #0x30]\n"
+ "fmla v30.4s, v8.4s, v6.s[0]\n"
+ "mov v10.d[1], x21\n"
+ "fmla v31.4s, v8.4s, v7.s[0]\n"
+ "ldr x19, [x16, #0x38]\n"
+ "fmla v24.4s, v9.4s, v0.s[1]\n"
+ "add x11, x11, #0x10\n"
+ "fmla v25.4s, v9.4s, v1.s[1]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
+ "fmla v26.4s, v9.4s, v2.s[1]\n"
+ "mov v11.d[1], x19\n"
+ "fmla v27.4s, v9.4s, v3.s[1]\n"
+ "ldr x10, [x11, #0x8]\n"
+ "fmla v28.4s, v9.4s, v4.s[1]\n"
+ "add x9, x9, #0x10\n"
+ "fmla v29.4s, v9.4s, v5.s[1]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
+ "fmla v30.4s, v9.4s, v6.s[1]\n"
+ "ldr x28, [x9, #0x8]\n"
+ "fmla v31.4s, v9.4s, v7.s[1]\n"
+ "add x27, x27, #0x10\n"
+ "fmla v24.4s, v10.4s, v0.s[2]\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
+ "fmla v25.4s, v10.4s, v1.s[2]\n"
+ "ldr x26, [x27, #0x8]\n"
+ "fmla v26.4s, v10.4s, v2.s[2]\n"
+ "add x25, x25, #0x10\n"
+ "fmla v27.4s, v10.4s, v3.s[2]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "fmla v28.4s, v10.4s, v4.s[2]\n"
+ "ldr x19, [x25, #0x8]\n"
+ "fmla v29.4s, v10.4s, v5.s[2]\n"
+ "add x24, x24, #0x10\n"
+ "fmla v30.4s, v10.4s, v6.s[2]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "fmla v31.4s, v10.4s, v7.s[2]\n"
+ "ldr x21, [x24, #0x8]\n"
+ "fmla v24.4s, v11.4s, v0.s[3]\n"
+ "ldr d0, [x11, #0x0]\n"
+ "fmla v25.4s, v11.4s, v1.s[3]\n"
+ "ldr d1, [x9, #0x0]\n"
+ "fmla v26.4s, v11.4s, v2.s[3]\n"
+ "ldr d2, [x27, #0x0]\n"
+ "fmla v27.4s, v11.4s, v3.s[3]\n"
+ "mov v0.d[1], x10\n"
+ "fmla v28.4s, v11.4s, v4.s[3]\n"
+ "mov v1.d[1], x28\n"
+ "fmla v29.4s, v11.4s, v5.s[3]\n"
+ "mov v2.d[1], x26\n"
+ "fmla v30.4s, v11.4s, v6.s[3]\n"
+ "ldr d3, [x25, #0x0]\n"
+ "fmla v31.4s, v11.4s, v7.s[3]\n"
+ "ldr d4, [x24, #0x0]\n"
+ "add x23, x23, #0x10\n"
+ "add x22, x22, #0x10\n"
+ "mov v3.d[1], x19\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "mov v4.d[1], x21\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "ldr d5, [x23, #0x0]\n"
+ "add x20, x20, #0x10\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "sub x12, x12, #0x4\n"
+ "ldr x19, [x23, #0x8]\n"
+ "cmp x12, #0x8\n"
+ "ldr d6, [x22, #0x0]\n"
+ "add x16, x16, #0x40\n"
+ "ldr d8, [x16, #0x0]\n"
+ "mov v5.d[1], x19\n"
+ "ldr x26, [x16, #0x8]\n"
+ "ldr x21, [x22, #0x8]\n"
+ "ldr d7, [x20, #0x0]\n"
+ "mov v8.d[1], x26\n"
+ "ldr x19, [x20, #0x8]\n"
+ "mov v6.d[1], x21\n"
+ "mov v7.d[1], x19\n"
+ "bge 159b\n"
+ "160:" // Height 8: Multiply loop: Single iteration only
+ "fmla v24.4s, v8.4s, v0.s[0]\n"
+ "ldr q9, [x16, #0x10]\n"
+ "fmla v25.4s, v8.4s, v1.s[0]\n"
+ "ldr q10, [x16, #0x20]\n"
+ "fmla v26.4s, v8.4s, v2.s[0]\n"
+ "ldr q11, [x16, #0x30]\n"
+ "fmla v27.4s, v8.4s, v3.s[0]\n"
+ "sub x12, x12, #0x4\n"
+ "fmla v28.4s, v8.4s, v4.s[0]\n"
+ "add x11, x11, #0x10\n"
+ "fmla v29.4s, v8.4s, v5.s[0]\n"
+ "prfm pldl1keep, [x11, #0x80]\n"
+ "fmla v30.4s, v8.4s, v6.s[0]\n"
+ "add x9, x9, #0x10\n"
+ "fmla v31.4s, v8.4s, v7.s[0]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
+ "fmla v24.4s, v9.4s, v0.s[1]\n"
+ "add x27, x27, #0x10\n"
+ "fmla v25.4s, v9.4s, v1.s[1]\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
+ "fmla v26.4s, v9.4s, v2.s[1]\n"
+ "add x25, x25, #0x10\n"
+ "fmla v27.4s, v9.4s, v3.s[1]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "fmla v28.4s, v9.4s, v4.s[1]\n"
+ "add x24, x24, #0x10\n"
+ "fmla v29.4s, v9.4s, v5.s[1]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "fmla v30.4s, v9.4s, v6.s[1]\n"
+ "add x23, x23, #0x10\n"
+ "fmla v31.4s, v9.4s, v7.s[1]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "fmla v24.4s, v10.4s, v0.s[2]\n"
+ "add x22, x22, #0x10\n"
+ "fmla v25.4s, v10.4s, v1.s[2]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "fmla v26.4s, v10.4s, v2.s[2]\n"
+ "add x20, x20, #0x10\n"
+ "fmla v27.4s, v10.4s, v3.s[2]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "fmla v28.4s, v10.4s, v4.s[2]\n"
+ "add x16, x16, #0x40\n"
+ "fmla v29.4s, v10.4s, v5.s[2]\n"
+ "fmla v30.4s, v10.4s, v6.s[2]\n"
+ "fmla v31.4s, v10.4s, v7.s[2]\n"
+ "fmla v24.4s, v11.4s, v0.s[3]\n"
+ "fmla v25.4s, v11.4s, v1.s[3]\n"
+ "fmla v26.4s, v11.4s, v2.s[3]\n"
+ "fmla v27.4s, v11.4s, v3.s[3]\n"
+ "fmla v28.4s, v11.4s, v4.s[3]\n"
+ "fmla v29.4s, v11.4s, v5.s[3]\n"
+ "fmla v30.4s, v11.4s, v6.s[3]\n"
+ "fmla v31.4s, v11.4s, v7.s[3]\n"
+ "161:" // Height 8: Multiply loop: Main loop skip
+ "cbz x12, 163f\n"
+ "162:" // Height 8: Multiply loop: Odd block loop
+ "ldr s0, [x11], #0x4\n"
+ "sub x12, x12, #0x1\n"
+ "ldr s1, [x9], #0x4\n"
+ "ldr s2, [x27], #0x4\n"
+ "ldr s3, [x25], #0x4\n"
+ "ldr s4, [x24], #0x4\n"
+ "ldr s5, [x23], #0x4\n"
+ "ldr s6, [x22], #0x4\n"
+ "ldr s7, [x20], #0x4\n"
+ "ldr q12, [x16, #0x0]\n"
+ "add x16, x16, #0x10\n"
+ "fmla v24.4s, v12.4s, v0.s[0]\n"
+ "fmla v25.4s, v12.4s, v1.s[0]\n"
+ "fmla v26.4s, v12.4s, v2.s[0]\n"
+ "fmla v27.4s, v12.4s, v3.s[0]\n"
+ "fmla v28.4s, v12.4s, v4.s[0]\n"
+ "fmla v29.4s, v12.4s, v5.s[0]\n"
+ "fmla v30.4s, v12.4s, v6.s[0]\n"
+ "fmla v31.4s, v12.4s, v7.s[0]\n"
+ "cbnz x12, 162b\n"
+ "163:" // Height 8: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x13, x13, #0x1\n"
+ "cmp x13, x19\n"
+ "bne 156b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "prfm pstl1keep, [x14, #0x0]\n"
+ "add x27, x14, x19, LSL #2\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "add x26, x27, x19, LSL #2\n"
+ "prfm pstl1keep, [x26, #0x0]\n"
+ "add x25, x26, x19, LSL #2\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "add x24, x25, x19, LSL #2\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "add x23, x24, x19, LSL #2\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "add x22, x23, x19, LSL #2\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "add x21, x22, x19, LSL #2\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "tbz %x[flags], #1, 164f\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v17.4s }, [x20]\n"
+ "ld1r { v16.4s }, [x19]\n"
+ "fmin v24.4s, v24.4s, v16.4s\n"
+ "fmin v25.4s, v25.4s, v16.4s\n"
+ "fmin v26.4s, v26.4s, v16.4s\n"
+ "fmin v27.4s, v27.4s, v16.4s\n"
+ "fmin v28.4s, v28.4s, v16.4s\n"
+ "fmin v29.4s, v29.4s, v16.4s\n"
+ "fmin v30.4s, v30.4s, v16.4s\n"
+ "fmin v31.4s, v31.4s, v16.4s\n"
+ "fmax v24.4s, v24.4s, v17.4s\n"
+ "fmax v25.4s, v25.4s, v17.4s\n"
+ "fmax v26.4s, v26.4s, v17.4s\n"
+ "fmax v27.4s, v27.4s, v17.4s\n"
+ "fmax v28.4s, v28.4s, v17.4s\n"
+ "fmax v29.4s, v29.4s, v17.4s\n"
+ "fmax v30.4s, v30.4s, v17.4s\n"
+ "fmax v31.4s, v31.4s, v17.4s\n"
+ "164:" // Height 8: No activation
+ "cmp x17, #0x4\n"
+ "bge 167f\n"
+ "tbz x17, #1, 165f\n"
+ "str d24, [x14], #0x8\n"
+ "str d25, [x27], #0x8\n"
+ "str d26, [x26], #0x8\n"
+ "str d27, [x25], #0x8\n"
+ "str d28, [x24], #0x8\n"
+ "str d29, [x23], #0x8\n"
+ "str d30, [x22], #0x8\n"
+ "str d31, [x21], #0x8\n"
+ "tbz x17, #0, 166f\n"
+ "st1 { v24.s }[2], [x14]\n"
+ "st1 { v25.s }[2], [x27]\n"
+ "st1 { v26.s }[2], [x26]\n"
+ "st1 { v27.s }[2], [x25]\n"
+ "st1 { v28.s }[2], [x24]\n"
+ "st1 { v29.s }[2], [x23]\n"
+ "st1 { v30.s }[2], [x22]\n"
+ "st1 { v31.s }[2], [x21]\n"
+ "b 166f\n"
+ "165:" // Height 8: Partial direct writeback: partial_1_0
+ "str s24, [x14, #0x0]\n"
+ "str s25, [x27, #0x0]\n"
+ "str s26, [x26, #0x0]\n"
+ "str s27, [x25, #0x0]\n"
+ "str s28, [x24, #0x0]\n"
+ "str s29, [x23, #0x0]\n"
+ "str s30, [x22, #0x0]\n"
+ "str s31, [x21, #0x0]\n"
+ "166:" // Height 8: Partial direct writeback: Done
+ "b 168f\n"
+ "167:" // Height 8: Full writeback
+ "str q24, [x14, #0x0]\n"
+ "add x14, x14, #0x10\n"
+ "str q25, [x27, #0x0]\n"
+ "str q26, [x26, #0x0]\n"
+ "str q27, [x25, #0x0]\n"
+ "str q28, [x24, #0x0]\n"
+ "str q29, [x23, #0x0]\n"
+ "str q30, [x22, #0x0]\n"
+ "str q31, [x21, #0x0]\n"
+ "168:" // Height 8: Writeback done
+ "subs x17, x17, #0x4\n"
+ "bgt 149b\n"
+ "subs %x[M], %x[M], #0x8\n"
+ "beq 170f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 169f\n"
+ "add x20, x20, #0x8\n"
+ "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "169:" // Update direct input
+ "mov x19, #0x20\n"
+ "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "b 1b\n"
+ "170:" // Exit
+
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v16", "v17", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/generic.cpp
index 7f3fc898f5..9bed0213da 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/generic.cpp
@@ -95,278 +95,266 @@ void a64_hybrid_fp32_mla_8x4 (
"1:" // Row loop
"cmp %x[M], #0x8\n"
- "bge 155f\n"
+ "bge 148f\n"
"cmp %x[M], #0x6\n"
- "bgt 133f\n"
- "beq 111f\n"
+ "bgt 127f\n"
+ "beq 106f\n"
"cmp %x[M], #0x4\n"
- "bgt 89f\n"
- "beq 67f\n"
+ "bgt 85f\n"
+ "beq 64f\n"
"cmp %x[M], #0x2\n"
- "bgt 45f\n"
- "beq 23f\n"
- "ldr x6, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x8, %x[bias]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "tbz %x[flags], #2, 2f\n"
- "ldr x17, [%x[output_ptr], #0x0]\n"
- "add x17, x17, x19, LSL #2\n"
- "b 3f\n"
- "2:" // Height 1: setup direct output
- "mov x17, %x[output_ptr]\n"
- "3:" // Height 1: Column loop
- "cbz x8, 4f\n"
- "ldr q24, [x8, #0x0]\n"
- "add x8, x8, #0x10\n"
- "b 9f\n"
- "4:" // Height 1: no bias
- "tbz %x[flags], #0, 8f\n"
- "cmp x6, #0x4\n"
- "bge 7f\n"
- "tbz x6, #1, 5f\n"
- "ldr d24, [x17], #0x8\n"
+ "bgt 43f\n"
+ "beq 22f\n"
+ "ldr x13, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x11, %x[bias]\n"
+ "mov x10, %x[output_ptr]\n"
+ "2:" // Height 1: Column loop
+ "cbz x11, 3f\n"
+ "ldr q24, [x11, #0x0]\n"
+ "add x11, x11, #0x10\n"
+ "b 8f\n"
+ "3:" // Height 1: no bias
+ "tbz %x[flags], #0, 7f\n"
+ "cmp x13, #0x4\n"
+ "bge 6f\n"
+ "tbz x13, #1, 4f\n"
+ "ldr d24, [x10], #0x8\n"
"mov x19, #0x8\n"
- "tbz x6, #0, 6f\n"
- "ld1 { v24.s }[2], [x17]\n"
- "b 6f\n"
- "5:" // Height 1: Partial accumulate: partial_1_0
+ "tbz x13, #0, 5f\n"
+ "ld1 { v24.s }[2], [x10]\n"
+ "b 5f\n"
+ "4:" // Height 1: Partial accumulate: partial_1_0
+ "ldr s24, [x10, #0x0]\n"
"mov x19, #0x0\n"
- "ldr s24, [x17, #0x0]\n"
- "6:" // Height 1: Partial accumulate: Done
- "sub x17, x17, x19\n"
- "b 9f\n"
- "7:" // Height 1: full accumulate
- "ldr q24, [x17, #0x0]\n"
- "b 9f\n"
- "8:" // Height 1: no accumulate
+ "5:" // Height 1: Partial accumulate: Done
+ "sub x10, x10, x19\n"
+ "b 8f\n"
+ "6:" // Height 1: full accumulate
+ "ldr q24, [x10, #0x0]\n"
+ "b 8f\n"
+ "7:" // Height 1: no accumulate
"movi v24.16b, #0x0\n"
- "9:" // Height 1: setup done
- "mov x16, #0x0\n"
- "10:" // Height 1: String loop
+ "8:" // Height 1: setup done
+ "mov x9, #0x0\n"
+ "9:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w15, [x20, x16, LSL #0x2]\n"
- "tbz %x[flags], #3, 11f\n"
- "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n"
+ "ldr w28, [x20, x9, LSL #0x2]\n"
+ "tbz %x[flags], #3, 10f\n"
+ "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x14, [x20, #0x0]\n"
- "cbnz x16, 12f\n"
+ "ldr x27, [x20, #0x0]\n"
+ "cbnz x9, 11f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x14, x14, x19, LSL #2\n"
- "b 12f\n"
- "11:" // Height 1: setup direct input
- "mov x14, %x[input_ptr]\n"
- "12:" // Height 1: input setup done
- "cmp x15, #0x4\n"
- "blt 15f\n"
- "cmp x15, #0x8\n"
+ "add x27, x27, x19, LSL #2\n"
+ "b 11f\n"
+ "10:" // Height 1: setup direct input
+ "mov x27, %x[input_ptr]\n"
+ "11:" // Height 1: input setup done
+ "cmp x28, #0x4\n"
"blt 14f\n"
- "13:" // Height 1: Multiply loop: Main loop head
- "ldr q0, [x14, #0x0]\n"
- "ldr q8, [x7, #0x0]\n"
+ "ldr q0, [x27, #0x0]\n"
+ "ldr q8, [x12, #0x0]\n"
+ "cmp x28, #0x8\n"
+ "blt 13f\n"
+ "12:" // Height 1: Multiply loop: Main loop head
+ "fmla v24.4s, v8.4s, v0.s[0]\n"
+ "ldr q9, [x12, #0x10]\n"
+ "add x27, x27, #0x10\n"
+ "fmla v24.4s, v9.4s, v0.s[1]\n"
+ "ldr q10, [x12, #0x20]\n"
+ "sub x28, x28, #0x4\n"
+ "fmla v24.4s, v10.4s, v0.s[2]\n"
+ "ldr q11, [x12, #0x30]\n"
+ "cmp x28, #0x8\n"
+ "fmla v24.4s, v11.4s, v0.s[3]\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
+ "add x12, x12, #0x40\n"
+ "ldr q0, [x27, #0x0]\n"
+ "ldr q8, [x12, #0x0]\n"
+ "bge 12b\n"
+ "13:" // Height 1: Multiply loop: Single iteration only
"fmla v24.4s, v8.4s, v0.s[0]\n"
- "ldr q9, [x7, #0x10]\n"
- "ldr q10, [x7, #0x20]\n"
+ "ldr q9, [x12, #0x10]\n"
+ "sub x28, x28, #0x4\n"
"fmla v24.4s, v9.4s, v0.s[1]\n"
- "ldr q11, [x7, #0x30]\n"
- "add x14, x14, #0x10\n"
+ "ldr q10, [x12, #0x20]\n"
+ "add x27, x27, #0x10\n"
"fmla v24.4s, v10.4s, v0.s[2]\n"
- "prfm pldl1keep, [x14, #0x80]\n"
- "sub x15, x15, #0x4\n"
+ "ldr q11, [x12, #0x30]\n"
+ "add x12, x12, #0x40\n"
"fmla v24.4s, v11.4s, v0.s[3]\n"
- "cmp x15, #0x8\n"
- "add x7, x7, #0x40\n"
- "bge 13b\n"
- "14:" // Height 1: Multiply loop: Single iteration only
- "sub x15, x15, #0x4\n"
- "ldr q0, [x14, #0x0]\n"
- "ldr q12, [x7, #0x0]\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
+ "14:" // Height 1: Multiply loop: Main loop skip
+ "cbz x28, 16f\n"
+ "15:" // Height 1: Multiply loop: Odd block loop
+ "ldr s0, [x27], #0x4\n"
+ "sub x28, x28, #0x1\n"
+ "ldr q12, [x12, #0x0]\n"
"fmla v24.4s, v12.4s, v0.s[0]\n"
- "ldr q13, [x7, #0x10]\n"
- "ldr q14, [x7, #0x20]\n"
- "fmla v24.4s, v13.4s, v0.s[1]\n"
- "ldr q15, [x7, #0x30]\n"
- "add x14, x14, #0x10\n"
- "fmla v24.4s, v14.4s, v0.s[2]\n"
- "prfm pldl1keep, [x14, #0x80]\n"
- "add x7, x7, #0x40\n"
- "fmla v24.4s, v15.4s, v0.s[3]\n"
- "15:" // Height 1: Multiply loop: Main loop skip
- "cbz x15, 17f\n"
- "16:" // Height 1: Multiply loop: Odd block loop
- "ldr s0, [x14], #0x4\n"
- "ldr q16, [x7, #0x0]\n"
- "fmla v24.4s, v16.4s, v0.s[0]\n"
- "sub x15, x15, #0x1\n"
- "add x7, x7, #0x10\n"
- "cbnz x15, 16b\n"
- "17:" // Height 1: Multiply loop: No odd multiplies
+ "add x12, x12, #0x10\n"
+ "cbnz x28, 15b\n"
+ "16:" // Height 1: Multiply loop: No odd multiplies
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x16, x16, #0x1\n"
- "cmp x16, x19\n"
- "bne 10b\n"
- "prfm pstl1keep, [x17, #0x0]\n"
- "tbz %x[flags], #1, 18f\n"
+ "add x9, x9, #0x1\n"
+ "cmp x9, x19\n"
+ "bne 9b\n"
+ "prfm pstl1keep, [x10, #0x0]\n"
+ "tbz %x[flags], #1, 17f\n"
"add x19, %x[args_ptr], %[offset_min]\n"
"ld1r { v17.4s }, [x19]\n"
"add x19, %x[args_ptr], %[offset_max]\n"
"ld1r { v16.4s }, [x19]\n"
"fmin v24.4s, v24.4s, v16.4s\n"
"fmax v24.4s, v24.4s, v17.4s\n"
- "18:" // Height 1: No activation
- "cmp x6, #0x4\n"
- "bge 21f\n"
- "tbz x6, #1, 19f\n"
- "str d24, [x17], #0x8\n"
- "tbz x6, #0, 20f\n"
- "st1 { v24.s }[2], [x17]\n"
- "b 20f\n"
- "19:" // Height 1: Partial direct writeback: partial_1_0
- "str s24, [x17, #0x0]\n"
- "20:" // Height 1: Partial direct writeback: Done
- "b 22f\n"
- "21:" // Height 1: Full writeback
- "str q24, [x17, #0x0]\n"
- "add x17, x17, #0x10\n"
- "22:" // Height 1: Writeback done
- "subs x6, x6, #0x4\n"
- "bgt 3b\n"
- "b 178f\n"
- "23:" // Height 2
- "ldr x6, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x8, %x[bias]\n"
- "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "tbz %x[flags], #2, 24f\n"
- "ldr x17, [%x[output_ptr], #0x0]\n"
- "add x17, x17, x19, LSL #2\n"
- "ldr x13, [%x[output_ptr], #0x8]\n"
- "add x13, x13, x19, LSL #2\n"
- "b 25f\n"
- "24:" // Height 2: setup direct output
- "mov x17, %x[output_ptr]\n"
- "add x13, x17, x19, LSL #2\n"
- "25:" // Height 2: Column loop
- "cbz x8, 26f\n"
- "ldr q24, [x8, #0x0]\n"
+ "17:" // Height 1: No activation
+ "cmp x13, #0x4\n"
+ "bge 20f\n"
+ "tbz x13, #1, 18f\n"
+ "str d24, [x10], #0x8\n"
+ "tbz x13, #0, 19f\n"
+ "st1 { v24.s }[2], [x10]\n"
+ "b 19f\n"
+ "18:" // Height 1: Partial direct writeback: partial_1_0
+ "str s24, [x10, #0x0]\n"
+ "19:" // Height 1: Partial direct writeback: Done
+ "b 21f\n"
+ "20:" // Height 1: Full writeback
+ "str q24, [x10, #0x0]\n"
+ "add x10, x10, #0x10\n"
+ "21:" // Height 1: Writeback done
+ "subs x13, x13, #0x4\n"
+ "bgt 2b\n"
+ "b 170f\n"
+ "22:" // Height 2
+ "ldr x13, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x11, %x[bias]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x10, %x[output_ptr]\n"
+ "23:" // Height 2: Column loop
+ "cbz x11, 24f\n"
+ "ldr q24, [x11, #0x0]\n"
"mov v25.16b, v24.16b\n"
- "add x8, x8, #0x10\n"
- "b 31f\n"
- "26:" // Height 2: no bias
- "tbz %x[flags], #0, 30f\n"
- "cmp x6, #0x4\n"
- "bge 29f\n"
- "tbz x6, #1, 27f\n"
- "ldr d24, [x17], #0x8\n"
- "ldr d25, [x13], #0x8\n"
+ "add x11, x11, #0x10\n"
+ "b 29f\n"
+ "24:" // Height 2: no bias
+ "tbz %x[flags], #0, 28f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x13, #0x4\n"
+ "add x26, x10, x19, LSL #2\n"
+ "bge 27f\n"
+ "tbz x13, #1, 25f\n"
+ "ldr d24, [x10], #0x8\n"
+ "ldr d25, [x26], #0x8\n"
"mov x19, #0x8\n"
- "tbz x6, #0, 28f\n"
- "ld1 { v24.s }[2], [x17]\n"
- "ld1 { v25.s }[2], [x13]\n"
- "b 28f\n"
- "27:" // Height 2: Partial accumulate: partial_1_0
+ "tbz x13, #0, 26f\n"
+ "ld1 { v24.s }[2], [x10]\n"
+ "ld1 { v25.s }[2], [x26]\n"
+ "b 26f\n"
+ "25:" // Height 2: Partial accumulate: partial_1_0
+ "ldr s24, [x10, #0x0]\n"
"mov x19, #0x0\n"
- "ldr s24, [x17, #0x0]\n"
- "ldr s25, [x13, #0x0]\n"
- "28:" // Height 2: Partial accumulate: Done
- "sub x17, x17, x19\n"
- "sub x13, x13, x19\n"
- "b 31f\n"
- "29:" // Height 2: full accumulate
- "ldr q24, [x17, #0x0]\n"
- "ldr q25, [x13, #0x0]\n"
- "b 31f\n"
- "30:" // Height 2: no accumulate
+ "ldr s25, [x26, #0x0]\n"
+ "26:" // Height 2: Partial accumulate: Done
+ "sub x10, x10, x19\n"
+ "b 29f\n"
+ "27:" // Height 2: full accumulate
+ "ldr q24, [x10, #0x0]\n"
+ "ldr q25, [x26, #0x0]\n"
+ "b 29f\n"
+ "28:" // Height 2: no accumulate
"movi v24.16b, #0x0\n"
"movi v25.16b, #0x0\n"
- "31:" // Height 2: setup done
- "mov x16, #0x0\n"
- "32:" // Height 2: String loop
+ "29:" // Height 2: setup done
+ "mov x9, #0x0\n"
+ "30:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w15, [x20, x16, LSL #0x2]\n"
- "tbz %x[flags], #3, 33f\n"
- "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n"
+ "ldr w28, [x20, x9, LSL #0x2]\n"
+ "tbz %x[flags], #3, 31f\n"
+ "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x14, [x20, #0x0]\n"
- "ldr x12, [x20, #0x8]\n"
- "cbnz x16, 34f\n"
+ "ldr x27, [x20, #0x0]\n"
+ "ldr x26, [x20, #0x8]\n"
+ "cbnz x9, 32f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x14, x14, x19, LSL #2\n"
- "add x12, x12, x19, LSL #2\n"
- "b 34f\n"
- "33:" // Height 2: setup direct input
- "mov x14, %x[input_ptr]\n"
- "add x12, x14, x19, LSL #2\n"
- "34:" // Height 2: input setup done
- "cmp x15, #0x4\n"
- "blt 37f\n"
- "cmp x15, #0x8\n"
- "blt 36f\n"
- "35:" // Height 2: Multiply loop: Main loop head
- "ldr q0, [x14, #0x0]\n"
- "ldr q1, [x12, #0x0]\n"
- "ldr q8, [x7, #0x0]\n"
+ "add x27, x27, x19, LSL #2\n"
+ "add x26, x26, x19, LSL #2\n"
+ "b 32f\n"
+ "31:" // Height 2: setup direct input
+ "mov x27, %x[input_ptr]\n"
+ "add x26, x27, x19, LSL #2\n"
+ "32:" // Height 2: input setup done
+ "cmp x28, #0x4\n"
+ "blt 35f\n"
+ "ldr q0, [x27, #0x0]\n"
+ "ldr q1, [x26, #0x0]\n"
+ "cmp x28, #0x8\n"
+ "ldr q8, [x12, #0x0]\n"
+ "blt 34f\n"
+ "33:" // Height 2: Multiply loop: Main loop head
"fmla v24.4s, v8.4s, v0.s[0]\n"
- "ldr q9, [x7, #0x10]\n"
+ "ldr q9, [x12, #0x10]\n"
+ "add x27, x27, #0x10\n"
"fmla v25.4s, v8.4s, v1.s[0]\n"
- "ldr q10, [x7, #0x20]\n"
- "ldr q11, [x7, #0x30]\n"
+ "ldr q10, [x12, #0x20]\n"
+ "add x26, x26, #0x10\n"
"fmla v24.4s, v9.4s, v0.s[1]\n"
- "add x14, x14, #0x10\n"
- "prfm pldl1keep, [x14, #0x80]\n"
+ "ldr q11, [x12, #0x30]\n"
+ "sub x28, x28, #0x4\n"
"fmla v25.4s, v9.4s, v1.s[1]\n"
- "add x12, x12, #0x10\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
+ "cmp x28, #0x8\n"
+ "fmla v24.4s, v10.4s, v0.s[2]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x12, x12, #0x40\n"
+ "fmla v25.4s, v10.4s, v1.s[2]\n"
+ "ldr q8, [x12, #0x0]\n"
+ "fmla v24.4s, v11.4s, v0.s[3]\n"
+ "ldr q0, [x27, #0x0]\n"
+ "fmla v25.4s, v11.4s, v1.s[3]\n"
+ "ldr q1, [x26, #0x0]\n"
+ "bge 33b\n"
+ "34:" // Height 2: Multiply loop: Single iteration only
+ "fmla v24.4s, v8.4s, v0.s[0]\n"
+ "ldr q9, [x12, #0x10]\n"
+ "sub x28, x28, #0x4\n"
+ "fmla v25.4s, v8.4s, v1.s[0]\n"
+ "ldr q10, [x12, #0x20]\n"
+ "add x27, x27, #0x10\n"
+ "fmla v24.4s, v9.4s, v0.s[1]\n"
+ "ldr q11, [x12, #0x30]\n"
+ "add x26, x26, #0x10\n"
+ "fmla v25.4s, v9.4s, v1.s[1]\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
+ "add x12, x12, #0x40\n"
"fmla v24.4s, v10.4s, v0.s[2]\n"
- "prfm pldl1keep, [x12, #0x80]\n"
- "sub x15, x15, #0x4\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
"fmla v25.4s, v10.4s, v1.s[2]\n"
- "cmp x15, #0x8\n"
- "add x7, x7, #0x40\n"
"fmla v24.4s, v11.4s, v0.s[3]\n"
"fmla v25.4s, v11.4s, v1.s[3]\n"
- "bge 35b\n"
- "36:" // Height 2: Multiply loop: Single iteration only
- "sub x15, x15, #0x4\n"
- "ldr q0, [x14, #0x0]\n"
- "ldr q1, [x12, #0x0]\n"
- "ldr q12, [x7, #0x0]\n"
+ "35:" // Height 2: Multiply loop: Main loop skip
+ "cbz x28, 37f\n"
+ "36:" // Height 2: Multiply loop: Odd block loop
+ "ldr s0, [x27], #0x4\n"
+ "sub x28, x28, #0x1\n"
+ "ldr s1, [x26], #0x4\n"
+ "ldr q12, [x12, #0x0]\n"
"fmla v24.4s, v12.4s, v0.s[0]\n"
- "ldr q13, [x7, #0x10]\n"
- "fmla v25.4s, v12.4s, v1.s[0]\n"
- "ldr q14, [x7, #0x20]\n"
- "ldr q15, [x7, #0x30]\n"
- "fmla v24.4s, v13.4s, v0.s[1]\n"
- "add x14, x14, #0x10\n"
- "prfm pldl1keep, [x14, #0x80]\n"
- "fmla v25.4s, v13.4s, v1.s[1]\n"
"add x12, x12, #0x10\n"
- "fmla v24.4s, v14.4s, v0.s[2]\n"
- "prfm pldl1keep, [x12, #0x80]\n"
- "add x7, x7, #0x40\n"
- "fmla v25.4s, v14.4s, v1.s[2]\n"
- "fmla v24.4s, v15.4s, v0.s[3]\n"
- "fmla v25.4s, v15.4s, v1.s[3]\n"
- "37:" // Height 2: Multiply loop: Main loop skip
- "cbz x15, 39f\n"
- "38:" // Height 2: Multiply loop: Odd block loop
- "ldr s0, [x14], #0x4\n"
- "ldr s1, [x12], #0x4\n"
- "ldr q16, [x7, #0x0]\n"
- "fmla v24.4s, v16.4s, v0.s[0]\n"
- "sub x15, x15, #0x1\n"
- "fmla v25.4s, v16.4s, v1.s[0]\n"
- "add x7, x7, #0x10\n"
- "cbnz x15, 38b\n"
- "39:" // Height 2: Multiply loop: No odd multiplies
+ "fmla v25.4s, v12.4s, v1.s[0]\n"
+ "cbnz x28, 36b\n"
+ "37:" // Height 2: Multiply loop: No odd multiplies
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x16, x16, #0x1\n"
- "cmp x16, x19\n"
- "bne 32b\n"
- "prfm pstl1keep, [x17, #0x0]\n"
- "prfm pstl1keep, [x13, #0x0]\n"
- "tbz %x[flags], #1, 40f\n"
+ "add x9, x9, #0x1\n"
+ "cmp x9, x19\n"
+ "bne 30b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "prfm pstl1keep, [x10, #0x0]\n"
+ "add x26, x10, x19, LSL #2\n"
+ "prfm pstl1keep, [x26, #0x0]\n"
+ "tbz %x[flags], #1, 38f\n"
"add x19, %x[args_ptr], %[offset_min]\n"
"ld1r { v17.4s }, [x19]\n"
"add x19, %x[args_ptr], %[offset_max]\n"
@@ -375,194 +363,185 @@ void a64_hybrid_fp32_mla_8x4 (
"fmin v25.4s, v25.4s, v16.4s\n"
"fmax v24.4s, v24.4s, v17.4s\n"
"fmax v25.4s, v25.4s, v17.4s\n"
- "40:" // Height 2: No activation
- "cmp x6, #0x4\n"
- "bge 43f\n"
- "tbz x6, #1, 41f\n"
- "str d24, [x17], #0x8\n"
- "str d25, [x13], #0x8\n"
- "tbz x6, #0, 42f\n"
- "st1 { v24.s }[2], [x17]\n"
- "st1 { v25.s }[2], [x13]\n"
+ "38:" // Height 2: No activation
+ "cmp x13, #0x4\n"
+ "bge 41f\n"
+ "tbz x13, #1, 39f\n"
+ "str d24, [x10], #0x8\n"
+ "str d25, [x26], #0x8\n"
+ "tbz x13, #0, 40f\n"
+ "st1 { v24.s }[2], [x10]\n"
+ "st1 { v25.s }[2], [x26]\n"
+ "b 40f\n"
+ "39:" // Height 2: Partial direct writeback: partial_1_0
+ "str s24, [x10, #0x0]\n"
+ "str s25, [x26, #0x0]\n"
+ "40:" // Height 2: Partial direct writeback: Done
"b 42f\n"
- "41:" // Height 2: Partial direct writeback: partial_1_0
- "str s24, [x17, #0x0]\n"
- "str s25, [x13, #0x0]\n"
- "42:" // Height 2: Partial direct writeback: Done
- "b 44f\n"
- "43:" // Height 2: Full writeback
- "str q24, [x17, #0x0]\n"
- "str q25, [x13, #0x0]\n"
- "add x17, x17, #0x10\n"
- "add x13, x13, #0x10\n"
- "44:" // Height 2: Writeback done
- "subs x6, x6, #0x4\n"
- "bgt 25b\n"
- "b 178f\n"
- "45:" // Height 3
- "ldr x6, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x8, %x[bias]\n"
- "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "tbz %x[flags], #2, 46f\n"
- "ldr x17, [%x[output_ptr], #0x0]\n"
- "add x17, x17, x19, LSL #2\n"
- "ldr x13, [%x[output_ptr], #0x8]\n"
- "ldr x11, [%x[output_ptr], #0x10]\n"
- "add x13, x13, x19, LSL #2\n"
- "add x11, x11, x19, LSL #2\n"
- "b 47f\n"
- "46:" // Height 3: setup direct output
- "mov x17, %x[output_ptr]\n"
- "add x13, x17, x19, LSL #2\n"
- "add x11, x13, x19, LSL #2\n"
- "47:" // Height 3: Column loop
- "cbz x8, 48f\n"
- "ldr q24, [x8, #0x0]\n"
+ "41:" // Height 2: Full writeback
+ "str q24, [x10, #0x0]\n"
+ "add x10, x10, #0x10\n"
+ "str q25, [x26, #0x0]\n"
+ "42:" // Height 2: Writeback done
+ "subs x13, x13, #0x4\n"
+ "bgt 23b\n"
+ "b 170f\n"
+ "43:" // Height 3
+ "ldr x13, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x11, %x[bias]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x10, %x[output_ptr]\n"
+ "44:" // Height 3: Column loop
+ "cbz x11, 45f\n"
+ "ldr q24, [x11, #0x0]\n"
"mov v25.16b, v24.16b\n"
- "add x8, x8, #0x10\n"
+ "add x11, x11, #0x10\n"
"mov v26.16b, v24.16b\n"
- "b 53f\n"
- "48:" // Height 3: no bias
- "tbz %x[flags], #0, 52f\n"
- "cmp x6, #0x4\n"
- "bge 51f\n"
- "tbz x6, #1, 49f\n"
- "ldr d24, [x17], #0x8\n"
- "ldr d25, [x13], #0x8\n"
- "ldr d26, [x11], #0x8\n"
- "mov x19, #0x8\n"
- "tbz x6, #0, 50f\n"
- "ld1 { v24.s }[2], [x17]\n"
- "ld1 { v25.s }[2], [x13]\n"
- "ld1 { v26.s }[2], [x11]\n"
"b 50f\n"
- "49:" // Height 3: Partial accumulate: partial_1_0
+ "45:" // Height 3: no bias
+ "tbz %x[flags], #0, 49f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x13, #0x4\n"
+ "add x26, x10, x19, LSL #2\n"
+ "add x25, x26, x19, LSL #2\n"
+ "bge 48f\n"
+ "tbz x13, #1, 46f\n"
+ "ldr d24, [x10], #0x8\n"
+ "ldr d25, [x26], #0x8\n"
+ "mov x19, #0x8\n"
+ "ldr d26, [x25], #0x8\n"
+ "tbz x13, #0, 47f\n"
+ "ld1 { v24.s }[2], [x10]\n"
+ "ld1 { v25.s }[2], [x26]\n"
+ "ld1 { v26.s }[2], [x25]\n"
+ "b 47f\n"
+ "46:" // Height 3: Partial accumulate: partial_1_0
+ "ldr s24, [x10, #0x0]\n"
"mov x19, #0x0\n"
- "ldr s24, [x17, #0x0]\n"
- "ldr s25, [x13, #0x0]\n"
- "ldr s26, [x11, #0x0]\n"
- "50:" // Height 3: Partial accumulate: Done
- "sub x17, x17, x19\n"
- "sub x13, x13, x19\n"
- "sub x11, x11, x19\n"
- "b 53f\n"
- "51:" // Height 3: full accumulate
- "ldr q24, [x17, #0x0]\n"
- "ldr q25, [x13, #0x0]\n"
- "ldr q26, [x11, #0x0]\n"
- "b 53f\n"
- "52:" // Height 3: no accumulate
+ "ldr s25, [x26, #0x0]\n"
+ "ldr s26, [x25, #0x0]\n"
+ "47:" // Height 3: Partial accumulate: Done
+ "sub x10, x10, x19\n"
+ "b 50f\n"
+ "48:" // Height 3: full accumulate
+ "ldr q24, [x10, #0x0]\n"
+ "ldr q25, [x26, #0x0]\n"
+ "ldr q26, [x25, #0x0]\n"
+ "b 50f\n"
+ "49:" // Height 3: no accumulate
"movi v24.16b, #0x0\n"
"movi v25.16b, #0x0\n"
"movi v26.16b, #0x0\n"
- "53:" // Height 3: setup done
- "mov x16, #0x0\n"
- "54:" // Height 3: String loop
+ "50:" // Height 3: setup done
+ "mov x9, #0x0\n"
+ "51:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w15, [x20, x16, LSL #0x2]\n"
- "tbz %x[flags], #3, 55f\n"
- "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n"
+ "ldr w28, [x20, x9, LSL #0x2]\n"
+ "tbz %x[flags], #3, 52f\n"
+ "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x14, [x20, #0x0]\n"
- "ldr x12, [x20, #0x8]\n"
- "ldr x10, [x20, #0x10]\n"
- "cbnz x16, 56f\n"
+ "ldr x27, [x20, #0x0]\n"
+ "ldr x26, [x20, #0x8]\n"
+ "ldr x25, [x20, #0x10]\n"
+ "cbnz x9, 53f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x14, x14, x19, LSL #2\n"
- "add x12, x12, x19, LSL #2\n"
- "add x10, x10, x19, LSL #2\n"
- "b 56f\n"
- "55:" // Height 3: setup direct input
- "mov x14, %x[input_ptr]\n"
- "add x12, x14, x19, LSL #2\n"
- "add x10, x12, x19, LSL #2\n"
- "56:" // Height 3: input setup done
- "cmp x15, #0x4\n"
- "blt 59f\n"
- "cmp x15, #0x8\n"
- "blt 58f\n"
- "57:" // Height 3: Multiply loop: Main loop head
- "ldr q0, [x14, #0x0]\n"
- "ldr q1, [x12, #0x0]\n"
- "ldr q2, [x10, #0x0]\n"
- "ldr q8, [x7, #0x0]\n"
+ "add x27, x27, x19, LSL #2\n"
+ "add x26, x26, x19, LSL #2\n"
+ "add x25, x25, x19, LSL #2\n"
+ "b 53f\n"
+ "52:" // Height 3: setup direct input
+ "mov x27, %x[input_ptr]\n"
+ "add x26, x27, x19, LSL #2\n"
+ "add x25, x26, x19, LSL #2\n"
+ "53:" // Height 3: input setup done
+ "cmp x28, #0x4\n"
+ "blt 56f\n"
+ "ldr q0, [x27, #0x0]\n"
+ "ldr q1, [x26, #0x0]\n"
+ "cmp x28, #0x8\n"
+ "ldr q2, [x25, #0x0]\n"
+ "ldr q8, [x12, #0x0]\n"
+ "blt 55f\n"
+ "54:" // Height 3: Multiply loop: Main loop head
"fmla v24.4s, v8.4s, v0.s[0]\n"
- "ldr q9, [x7, #0x10]\n"
+ "ldr q9, [x12, #0x10]\n"
+ "add x27, x27, #0x10\n"
"fmla v25.4s, v8.4s, v1.s[0]\n"
- "ldr q10, [x7, #0x20]\n"
+ "ldr q10, [x12, #0x20]\n"
+ "add x26, x26, #0x10\n"
"fmla v26.4s, v8.4s, v2.s[0]\n"
- "ldr q11, [x7, #0x30]\n"
- "add x14, x14, #0x10\n"
+ "ldr q11, [x12, #0x30]\n"
+ "add x25, x25, #0x10\n"
"fmla v24.4s, v9.4s, v0.s[1]\n"
- "prfm pldl1keep, [x14, #0x80]\n"
- "add x12, x12, #0x10\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
+ "sub x28, x28, #0x4\n"
"fmla v25.4s, v9.4s, v1.s[1]\n"
- "prfm pldl1keep, [x12, #0x80]\n"
- "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "cmp x28, #0x8\n"
"fmla v26.4s, v9.4s, v2.s[1]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
- "sub x15, x15, #0x4\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x12, x12, #0x40\n"
"fmla v24.4s, v10.4s, v0.s[2]\n"
- "cmp x15, #0x8\n"
- "add x7, x7, #0x40\n"
+ "ldr q8, [x12, #0x0]\n"
"fmla v25.4s, v10.4s, v1.s[2]\n"
"fmla v26.4s, v10.4s, v2.s[2]\n"
"fmla v24.4s, v11.4s, v0.s[3]\n"
+ "ldr q0, [x27, #0x0]\n"
"fmla v25.4s, v11.4s, v1.s[3]\n"
+ "ldr q1, [x26, #0x0]\n"
"fmla v26.4s, v11.4s, v2.s[3]\n"
- "bge 57b\n"
- "58:" // Height 3: Multiply loop: Single iteration only
- "sub x15, x15, #0x4\n"
- "ldr q0, [x14, #0x0]\n"
- "ldr q1, [x12, #0x0]\n"
- "ldr q2, [x10, #0x0]\n"
- "ldr q12, [x7, #0x0]\n"
+ "ldr q2, [x25, #0x0]\n"
+ "bge 54b\n"
+ "55:" // Height 3: Multiply loop: Single iteration only
+ "fmla v24.4s, v8.4s, v0.s[0]\n"
+ "ldr q9, [x12, #0x10]\n"
+ "sub x28, x28, #0x4\n"
+ "fmla v25.4s, v8.4s, v1.s[0]\n"
+ "ldr q10, [x12, #0x20]\n"
+ "add x27, x27, #0x10\n"
+ "fmla v26.4s, v8.4s, v2.s[0]\n"
+ "ldr q11, [x12, #0x30]\n"
+ "add x26, x26, #0x10\n"
+ "fmla v24.4s, v9.4s, v0.s[1]\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
+ "add x25, x25, #0x10\n"
+ "fmla v25.4s, v9.4s, v1.s[1]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x12, x12, #0x40\n"
+ "fmla v26.4s, v9.4s, v2.s[1]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "fmla v24.4s, v10.4s, v0.s[2]\n"
+ "fmla v25.4s, v10.4s, v1.s[2]\n"
+ "fmla v26.4s, v10.4s, v2.s[2]\n"
+ "fmla v24.4s, v11.4s, v0.s[3]\n"
+ "fmla v25.4s, v11.4s, v1.s[3]\n"
+ "fmla v26.4s, v11.4s, v2.s[3]\n"
+ "56:" // Height 3: Multiply loop: Main loop skip
+ "cbz x28, 58f\n"
+ "57:" // Height 3: Multiply loop: Odd block loop
+ "ldr s0, [x27], #0x4\n"
+ "sub x28, x28, #0x1\n"
+ "ldr s1, [x26], #0x4\n"
+ "ldr s2, [x25], #0x4\n"
+ "ldr q12, [x12, #0x0]\n"
"fmla v24.4s, v12.4s, v0.s[0]\n"
- "ldr q13, [x7, #0x10]\n"
+ "add x12, x12, #0x10\n"
"fmla v25.4s, v12.4s, v1.s[0]\n"
- "ldr q14, [x7, #0x20]\n"
"fmla v26.4s, v12.4s, v2.s[0]\n"
- "ldr q15, [x7, #0x30]\n"
- "add x14, x14, #0x10\n"
- "fmla v24.4s, v13.4s, v0.s[1]\n"
- "prfm pldl1keep, [x14, #0x80]\n"
- "add x12, x12, #0x10\n"
- "fmla v25.4s, v13.4s, v1.s[1]\n"
- "prfm pldl1keep, [x12, #0x80]\n"
- "add x10, x10, #0x10\n"
- "fmla v26.4s, v13.4s, v2.s[1]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
- "add x7, x7, #0x40\n"
- "fmla v24.4s, v14.4s, v0.s[2]\n"
- "fmla v25.4s, v14.4s, v1.s[2]\n"
- "fmla v26.4s, v14.4s, v2.s[2]\n"
- "fmla v24.4s, v15.4s, v0.s[3]\n"
- "fmla v25.4s, v15.4s, v1.s[3]\n"
- "fmla v26.4s, v15.4s, v2.s[3]\n"
- "59:" // Height 3: Multiply loop: Main loop skip
- "cbz x15, 61f\n"
- "60:" // Height 3: Multiply loop: Odd block loop
- "ldr s0, [x14], #0x4\n"
- "ldr s1, [x12], #0x4\n"
- "ldr s2, [x10], #0x4\n"
- "ldr q16, [x7, #0x0]\n"
- "fmla v24.4s, v16.4s, v0.s[0]\n"
- "sub x15, x15, #0x1\n"
- "fmla v25.4s, v16.4s, v1.s[0]\n"
- "add x7, x7, #0x10\n"
- "fmla v26.4s, v16.4s, v2.s[0]\n"
- "cbnz x15, 60b\n"
- "61:" // Height 3: Multiply loop: No odd multiplies
+ "cbnz x28, 57b\n"
+ "58:" // Height 3: Multiply loop: No odd multiplies
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x16, x16, #0x1\n"
- "cmp x16, x19\n"
- "bne 54b\n"
- "prfm pstl1keep, [x17, #0x0]\n"
- "prfm pstl1keep, [x13, #0x0]\n"
- "prfm pstl1keep, [x11, #0x0]\n"
- "tbz %x[flags], #1, 62f\n"
+ "add x9, x9, #0x1\n"
+ "cmp x9, x19\n"
+ "bne 51b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "prfm pstl1keep, [x10, #0x0]\n"
+ "add x26, x10, x19, LSL #2\n"
+ "prfm pstl1keep, [x26, #0x0]\n"
+ "add x25, x26, x19, LSL #2\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "tbz %x[flags], #1, 59f\n"
"add x19, %x[args_ptr], %[offset_min]\n"
"ld1r { v17.4s }, [x19]\n"
"add x19, %x[args_ptr], %[offset_max]\n"
@@ -573,161 +552,181 @@ void a64_hybrid_fp32_mla_8x4 (
"fmax v24.4s, v24.4s, v17.4s\n"
"fmax v25.4s, v25.4s, v17.4s\n"
"fmax v26.4s, v26.4s, v17.4s\n"
- "62:" // Height 3: No activation
- "cmp x6, #0x4\n"
- "bge 65f\n"
- "tbz x6, #1, 63f\n"
- "str d24, [x17], #0x8\n"
- "str d25, [x13], #0x8\n"
- "str d26, [x11], #0x8\n"
- "tbz x6, #0, 64f\n"
- "st1 { v24.s }[2], [x17]\n"
- "st1 { v25.s }[2], [x13]\n"
- "st1 { v26.s }[2], [x11]\n"
- "b 64f\n"
- "63:" // Height 3: Partial direct writeback: partial_1_0
- "str s24, [x17, #0x0]\n"
- "str s25, [x13, #0x0]\n"
- "str s26, [x11, #0x0]\n"
- "64:" // Height 3: Partial direct writeback: Done
- "b 66f\n"
- "65:" // Height 3: Full writeback
- "str q24, [x17, #0x0]\n"
- "str q25, [x13, #0x0]\n"
- "str q26, [x11, #0x0]\n"
- "add x17, x17, #0x10\n"
- "add x13, x13, #0x10\n"
- "add x11, x11, #0x10\n"
- "66:" // Height 3: Writeback done
- "subs x6, x6, #0x4\n"
- "bgt 47b\n"
- "b 178f\n"
- "67:" // Height 4
- "ldr x6, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x8, %x[bias]\n"
- "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "tbz %x[flags], #2, 68f\n"
- "ldr x17, [%x[output_ptr], #0x0]\n"
- "add x17, x17, x19, LSL #2\n"
- "ldr x13, [%x[output_ptr], #0x8]\n"
- "ldr x11, [%x[output_ptr], #0x10]\n"
- "add x13, x13, x19, LSL #2\n"
- "ldr x9, [%x[output_ptr], #0x18]\n"
- "add x11, x11, x19, LSL #2\n"
- "add x9, x9, x19, LSL #2\n"
- "b 69f\n"
- "68:" // Height 4: setup direct output
- "mov x17, %x[output_ptr]\n"
- "add x13, x17, x19, LSL #2\n"
- "add x11, x13, x19, LSL #2\n"
- "add x9, x11, x19, LSL #2\n"
- "69:" // Height 4: Column loop
- "cbz x8, 70f\n"
- "ldr q24, [x8, #0x0]\n"
+ "59:" // Height 3: No activation
+ "cmp x13, #0x4\n"
+ "bge 62f\n"
+ "tbz x13, #1, 60f\n"
+ "str d24, [x10], #0x8\n"
+ "str d25, [x26], #0x8\n"
+ "str d26, [x25], #0x8\n"
+ "tbz x13, #0, 61f\n"
+ "st1 { v24.s }[2], [x10]\n"
+ "st1 { v25.s }[2], [x26]\n"
+ "st1 { v26.s }[2], [x25]\n"
+ "b 61f\n"
+ "60:" // Height 3: Partial direct writeback: partial_1_0
+ "str s24, [x10, #0x0]\n"
+ "str s25, [x26, #0x0]\n"
+ "str s26, [x25, #0x0]\n"
+ "61:" // Height 3: Partial direct writeback: Done
+ "b 63f\n"
+ "62:" // Height 3: Full writeback
+ "str q24, [x10, #0x0]\n"
+ "add x10, x10, #0x10\n"
+ "str q25, [x26, #0x0]\n"
+ "str q26, [x25, #0x0]\n"
+ "63:" // Height 3: Writeback done
+ "subs x13, x13, #0x4\n"
+ "bgt 44b\n"
+ "b 170f\n"
+ "64:" // Height 4
+ "ldr x13, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x11, %x[bias]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x10, %x[output_ptr]\n"
+ "65:" // Height 4: Column loop
+ "cbz x11, 66f\n"
+ "ldr q24, [x11, #0x0]\n"
"mov v25.16b, v24.16b\n"
- "add x8, x8, #0x10\n"
+ "add x11, x11, #0x10\n"
"mov v26.16b, v24.16b\n"
"mov v27.16b, v24.16b\n"
- "b 75f\n"
- "70:" // Height 4: no bias
- "tbz %x[flags], #0, 74f\n"
- "cmp x6, #0x4\n"
- "bge 73f\n"
- "tbz x6, #1, 71f\n"
- "ldr d24, [x17], #0x8\n"
- "ldr d25, [x13], #0x8\n"
- "ldr d26, [x11], #0x8\n"
- "ldr d27, [x9], #0x8\n"
+ "b 71f\n"
+ "66:" // Height 4: no bias
+ "tbz %x[flags], #0, 70f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x13, #0x4\n"
+ "add x26, x10, x19, LSL #2\n"
+ "add x25, x26, x19, LSL #2\n"
+ "add x24, x25, x19, LSL #2\n"
+ "bge 69f\n"
+ "tbz x13, #1, 67f\n"
+ "ldr d24, [x10], #0x8\n"
+ "ldr d25, [x26], #0x8\n"
"mov x19, #0x8\n"
- "tbz x6, #0, 72f\n"
- "ld1 { v24.s }[2], [x17]\n"
- "ld1 { v25.s }[2], [x13]\n"
- "ld1 { v26.s }[2], [x11]\n"
- "ld1 { v27.s }[2], [x9]\n"
- "b 72f\n"
- "71:" // Height 4: Partial accumulate: partial_1_0
+ "ldr d26, [x25], #0x8\n"
+ "ldr d27, [x24], #0x8\n"
+ "tbz x13, #0, 68f\n"
+ "ld1 { v24.s }[2], [x10]\n"
+ "ld1 { v25.s }[2], [x26]\n"
+ "ld1 { v26.s }[2], [x25]\n"
+ "ld1 { v27.s }[2], [x24]\n"
+ "b 68f\n"
+ "67:" // Height 4: Partial accumulate: partial_1_0
+ "ldr s24, [x10, #0x0]\n"
"mov x19, #0x0\n"
- "ldr s24, [x17, #0x0]\n"
- "ldr s25, [x13, #0x0]\n"
- "ldr s26, [x11, #0x0]\n"
- "ldr s27, [x9, #0x0]\n"
- "72:" // Height 4: Partial accumulate: Done
- "sub x17, x17, x19\n"
- "sub x13, x13, x19\n"
- "sub x11, x11, x19\n"
- "sub x9, x9, x19\n"
- "b 75f\n"
- "73:" // Height 4: full accumulate
- "ldr q24, [x17, #0x0]\n"
- "ldr q25, [x13, #0x0]\n"
- "ldr q26, [x11, #0x0]\n"
- "ldr q27, [x9, #0x0]\n"
- "b 75f\n"
- "74:" // Height 4: no accumulate
+ "ldr s25, [x26, #0x0]\n"
+ "ldr s26, [x25, #0x0]\n"
+ "ldr s27, [x24, #0x0]\n"
+ "68:" // Height 4: Partial accumulate: Done
+ "sub x10, x10, x19\n"
+ "b 71f\n"
+ "69:" // Height 4: full accumulate
+ "ldr q24, [x10, #0x0]\n"
+ "ldr q25, [x26, #0x0]\n"
+ "ldr q26, [x25, #0x0]\n"
+ "ldr q27, [x24, #0x0]\n"
+ "b 71f\n"
+ "70:" // Height 4: no accumulate
"movi v24.16b, #0x0\n"
"movi v25.16b, #0x0\n"
"movi v26.16b, #0x0\n"
"movi v27.16b, #0x0\n"
- "75:" // Height 4: setup done
- "mov x16, #0x0\n"
- "76:" // Height 4: String loop
+ "71:" // Height 4: setup done
+ "mov x9, #0x0\n"
+ "72:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w15, [x20, x16, LSL #0x2]\n"
- "tbz %x[flags], #3, 77f\n"
- "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n"
+ "ldr w28, [x20, x9, LSL #0x2]\n"
+ "tbz %x[flags], #3, 73f\n"
+ "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x14, [x20, #0x0]\n"
- "ldr x12, [x20, #0x8]\n"
- "ldr x10, [x20, #0x10]\n"
- "ldr x28, [x20, #0x18]\n"
- "cbnz x16, 78f\n"
+ "ldr x27, [x20, #0x0]\n"
+ "ldr x26, [x20, #0x8]\n"
+ "ldr x25, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "cbnz x9, 74f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x14, x14, x19, LSL #2\n"
- "add x12, x12, x19, LSL #2\n"
- "add x10, x10, x19, LSL #2\n"
- "add x28, x28, x19, LSL #2\n"
- "b 78f\n"
- "77:" // Height 4: setup direct input
- "mov x14, %x[input_ptr]\n"
- "add x12, x14, x19, LSL #2\n"
- "add x10, x12, x19, LSL #2\n"
- "add x28, x10, x19, LSL #2\n"
- "78:" // Height 4: input setup done
- "cmp x15, #0x4\n"
- "blt 81f\n"
- "cmp x15, #0x8\n"
- "blt 80f\n"
- "79:" // Height 4: Multiply loop: Main loop head
- "ldr q0, [x14, #0x0]\n"
- "ldr q1, [x12, #0x0]\n"
- "ldr q2, [x10, #0x0]\n"
- "ldr q3, [x28, #0x0]\n"
- "ldr q8, [x7, #0x0]\n"
+ "add x27, x27, x19, LSL #2\n"
+ "add x26, x26, x19, LSL #2\n"
+ "add x25, x25, x19, LSL #2\n"
+ "add x24, x24, x19, LSL #2\n"
+ "b 74f\n"
+ "73:" // Height 4: setup direct input
+ "mov x27, %x[input_ptr]\n"
+ "add x26, x27, x19, LSL #2\n"
+ "add x25, x26, x19, LSL #2\n"
+ "add x24, x25, x19, LSL #2\n"
+ "74:" // Height 4: input setup done
+ "cmp x28, #0x4\n"
+ "blt 77f\n"
+ "ldr q0, [x27, #0x0]\n"
+ "ldr q1, [x26, #0x0]\n"
+ "cmp x28, #0x8\n"
+ "ldr q2, [x25, #0x0]\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q8, [x12, #0x0]\n"
+ "blt 76f\n"
+ "75:" // Height 4: Multiply loop: Main loop head
"fmla v24.4s, v8.4s, v0.s[0]\n"
- "ldr q9, [x7, #0x10]\n"
+ "ldr q9, [x12, #0x10]\n"
+ "add x27, x27, #0x10\n"
"fmla v25.4s, v8.4s, v1.s[0]\n"
- "ldr q10, [x7, #0x20]\n"
+ "ldr q10, [x12, #0x20]\n"
+ "add x26, x26, #0x10\n"
"fmla v26.4s, v8.4s, v2.s[0]\n"
- "ldr q11, [x7, #0x30]\n"
+ "ldr q11, [x12, #0x30]\n"
+ "add x25, x25, #0x10\n"
"fmla v27.4s, v8.4s, v3.s[0]\n"
- "add x14, x14, #0x10\n"
- "prfm pldl1keep, [x14, #0x80]\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
+ "add x24, x24, #0x10\n"
"fmla v24.4s, v9.4s, v0.s[1]\n"
- "add x12, x12, #0x10\n"
- "prfm pldl1keep, [x12, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "sub x28, x28, #0x4\n"
"fmla v25.4s, v9.4s, v1.s[1]\n"
- "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "cmp x28, #0x8\n"
+ "fmla v26.4s, v9.4s, v2.s[1]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "add x12, x12, #0x40\n"
+ "fmla v27.4s, v9.4s, v3.s[1]\n"
+ "ldr q8, [x12, #0x0]\n"
+ "fmla v24.4s, v10.4s, v0.s[2]\n"
+ "fmla v25.4s, v10.4s, v1.s[2]\n"
+ "fmla v26.4s, v10.4s, v2.s[2]\n"
+ "fmla v27.4s, v10.4s, v3.s[2]\n"
+ "fmla v24.4s, v11.4s, v0.s[3]\n"
+ "ldr q0, [x27, #0x0]\n"
+ "fmla v25.4s, v11.4s, v1.s[3]\n"
+ "ldr q1, [x26, #0x0]\n"
+ "fmla v26.4s, v11.4s, v2.s[3]\n"
+ "ldr q2, [x25, #0x0]\n"
+ "fmla v27.4s, v11.4s, v3.s[3]\n"
+ "ldr q3, [x24, #0x0]\n"
+ "bge 75b\n"
+ "76:" // Height 4: Multiply loop: Single iteration only
+ "fmla v24.4s, v8.4s, v0.s[0]\n"
+ "ldr q9, [x12, #0x10]\n"
+ "sub x28, x28, #0x4\n"
+ "fmla v25.4s, v8.4s, v1.s[0]\n"
+ "ldr q10, [x12, #0x20]\n"
+ "add x27, x27, #0x10\n"
+ "fmla v26.4s, v8.4s, v2.s[0]\n"
+ "ldr q11, [x12, #0x30]\n"
+ "add x26, x26, #0x10\n"
+ "fmla v27.4s, v8.4s, v3.s[0]\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
+ "add x25, x25, #0x10\n"
+ "fmla v24.4s, v9.4s, v0.s[1]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x24, x24, #0x10\n"
+ "fmla v25.4s, v9.4s, v1.s[1]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x12, x12, #0x40\n"
"fmla v26.4s, v9.4s, v2.s[1]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
- "add x28, x28, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
"fmla v27.4s, v9.4s, v3.s[1]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "sub x15, x15, #0x4\n"
"fmla v24.4s, v10.4s, v0.s[2]\n"
- "cmp x15, #0x8\n"
- "add x7, x7, #0x40\n"
"fmla v25.4s, v10.4s, v1.s[2]\n"
"fmla v26.4s, v10.4s, v2.s[2]\n"
"fmla v27.4s, v10.4s, v3.s[2]\n"
@@ -735,67 +734,35 @@ void a64_hybrid_fp32_mla_8x4 (
"fmla v25.4s, v11.4s, v1.s[3]\n"
"fmla v26.4s, v11.4s, v2.s[3]\n"
"fmla v27.4s, v11.4s, v3.s[3]\n"
- "bge 79b\n"
- "80:" // Height 4: Multiply loop: Single iteration only
- "sub x15, x15, #0x4\n"
- "ldr q0, [x14, #0x0]\n"
- "ldr q1, [x12, #0x0]\n"
- "ldr q2, [x10, #0x0]\n"
- "ldr q3, [x28, #0x0]\n"
- "ldr q12, [x7, #0x0]\n"
+ "77:" // Height 4: Multiply loop: Main loop skip
+ "cbz x28, 79f\n"
+ "78:" // Height 4: Multiply loop: Odd block loop
+ "ldr s0, [x27], #0x4\n"
+ "sub x28, x28, #0x1\n"
+ "ldr s1, [x26], #0x4\n"
+ "ldr s2, [x25], #0x4\n"
+ "ldr s3, [x24], #0x4\n"
+ "ldr q12, [x12, #0x0]\n"
"fmla v24.4s, v12.4s, v0.s[0]\n"
- "ldr q13, [x7, #0x10]\n"
+ "add x12, x12, #0x10\n"
"fmla v25.4s, v12.4s, v1.s[0]\n"
- "ldr q14, [x7, #0x20]\n"
"fmla v26.4s, v12.4s, v2.s[0]\n"
- "ldr q15, [x7, #0x30]\n"
"fmla v27.4s, v12.4s, v3.s[0]\n"
- "add x14, x14, #0x10\n"
- "prfm pldl1keep, [x14, #0x80]\n"
- "fmla v24.4s, v13.4s, v0.s[1]\n"
- "add x12, x12, #0x10\n"
- "prfm pldl1keep, [x12, #0x80]\n"
- "fmla v25.4s, v13.4s, v1.s[1]\n"
- "add x10, x10, #0x10\n"
- "fmla v26.4s, v13.4s, v2.s[1]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
- "add x28, x28, #0x10\n"
- "fmla v27.4s, v13.4s, v3.s[1]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "add x7, x7, #0x40\n"
- "fmla v24.4s, v14.4s, v0.s[2]\n"
- "fmla v25.4s, v14.4s, v1.s[2]\n"
- "fmla v26.4s, v14.4s, v2.s[2]\n"
- "fmla v27.4s, v14.4s, v3.s[2]\n"
- "fmla v24.4s, v15.4s, v0.s[3]\n"
- "fmla v25.4s, v15.4s, v1.s[3]\n"
- "fmla v26.4s, v15.4s, v2.s[3]\n"
- "fmla v27.4s, v15.4s, v3.s[3]\n"
- "81:" // Height 4: Multiply loop: Main loop skip
- "cbz x15, 83f\n"
- "82:" // Height 4: Multiply loop: Odd block loop
- "ldr s0, [x14], #0x4\n"
- "ldr s1, [x12], #0x4\n"
- "ldr s2, [x10], #0x4\n"
- "ldr s3, [x28], #0x4\n"
- "ldr q16, [x7, #0x0]\n"
- "fmla v24.4s, v16.4s, v0.s[0]\n"
- "sub x15, x15, #0x1\n"
- "fmla v25.4s, v16.4s, v1.s[0]\n"
- "add x7, x7, #0x10\n"
- "fmla v26.4s, v16.4s, v2.s[0]\n"
- "fmla v27.4s, v16.4s, v3.s[0]\n"
- "cbnz x15, 82b\n"
- "83:" // Height 4: Multiply loop: No odd multiplies
+ "cbnz x28, 78b\n"
+ "79:" // Height 4: Multiply loop: No odd multiplies
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x16, x16, #0x1\n"
- "cmp x16, x19\n"
- "bne 76b\n"
- "prfm pstl1keep, [x17, #0x0]\n"
- "prfm pstl1keep, [x13, #0x0]\n"
- "prfm pstl1keep, [x11, #0x0]\n"
- "prfm pstl1keep, [x9, #0x0]\n"
- "tbz %x[flags], #1, 84f\n"
+ "add x9, x9, #0x1\n"
+ "cmp x9, x19\n"
+ "bne 72b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "prfm pstl1keep, [x10, #0x0]\n"
+ "add x26, x10, x19, LSL #2\n"
+ "prfm pstl1keep, [x26, #0x0]\n"
+ "add x25, x26, x19, LSL #2\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "add x24, x25, x19, LSL #2\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "tbz %x[flags], #1, 80f\n"
"add x19, %x[args_ptr], %[offset_min]\n"
"ld1r { v17.4s }, [x19]\n"
"add x19, %x[args_ptr], %[offset_max]\n"
@@ -808,184 +775,207 @@ void a64_hybrid_fp32_mla_8x4 (
"fmax v25.4s, v25.4s, v17.4s\n"
"fmax v26.4s, v26.4s, v17.4s\n"
"fmax v27.4s, v27.4s, v17.4s\n"
- "84:" // Height 4: No activation
- "cmp x6, #0x4\n"
- "bge 87f\n"
- "tbz x6, #1, 85f\n"
- "str d24, [x17], #0x8\n"
- "str d25, [x13], #0x8\n"
- "str d26, [x11], #0x8\n"
- "str d27, [x9], #0x8\n"
- "tbz x6, #0, 86f\n"
- "st1 { v24.s }[2], [x17]\n"
- "st1 { v25.s }[2], [x13]\n"
- "st1 { v26.s }[2], [x11]\n"
- "st1 { v27.s }[2], [x9]\n"
- "b 86f\n"
- "85:" // Height 4: Partial direct writeback: partial_1_0
- "str s24, [x17, #0x0]\n"
- "str s25, [x13, #0x0]\n"
- "str s26, [x11, #0x0]\n"
- "str s27, [x9, #0x0]\n"
- "86:" // Height 4: Partial direct writeback: Done
- "b 88f\n"
- "87:" // Height 4: Full writeback
- "str q24, [x17, #0x0]\n"
- "str q25, [x13, #0x0]\n"
- "str q26, [x11, #0x0]\n"
- "str q27, [x9, #0x0]\n"
- "add x17, x17, #0x10\n"
- "add x13, x13, #0x10\n"
- "add x11, x11, #0x10\n"
- "add x9, x9, #0x10\n"
- "88:" // Height 4: Writeback done
- "subs x6, x6, #0x4\n"
- "bgt 69b\n"
- "b 178f\n"
- "89:" // Height 5
- "ldr x6, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x8, %x[bias]\n"
- "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "tbz %x[flags], #2, 90f\n"
- "ldr x17, [%x[output_ptr], #0x0]\n"
- "add x17, x17, x19, LSL #2\n"
- "ldr x13, [%x[output_ptr], #0x8]\n"
- "ldr x11, [%x[output_ptr], #0x10]\n"
- "add x13, x13, x19, LSL #2\n"
- "ldr x9, [%x[output_ptr], #0x18]\n"
- "ldr x27, [%x[output_ptr], #0x20]\n"
- "add x11, x11, x19, LSL #2\n"
- "add x9, x9, x19, LSL #2\n"
- "add x27, x27, x19, LSL #2\n"
- "b 91f\n"
- "90:" // Height 5: setup direct output
- "mov x17, %x[output_ptr]\n"
- "add x13, x17, x19, LSL #2\n"
- "add x11, x13, x19, LSL #2\n"
- "add x9, x11, x19, LSL #2\n"
- "add x27, x9, x19, LSL #2\n"
- "91:" // Height 5: Column loop
- "cbz x8, 92f\n"
- "ldr q24, [x8, #0x0]\n"
+ "80:" // Height 4: No activation
+ "cmp x13, #0x4\n"
+ "bge 83f\n"
+ "tbz x13, #1, 81f\n"
+ "str d24, [x10], #0x8\n"
+ "str d25, [x26], #0x8\n"
+ "str d26, [x25], #0x8\n"
+ "str d27, [x24], #0x8\n"
+ "tbz x13, #0, 82f\n"
+ "st1 { v24.s }[2], [x10]\n"
+ "st1 { v25.s }[2], [x26]\n"
+ "st1 { v26.s }[2], [x25]\n"
+ "st1 { v27.s }[2], [x24]\n"
+ "b 82f\n"
+ "81:" // Height 4: Partial direct writeback: partial_1_0
+ "str s24, [x10, #0x0]\n"
+ "str s25, [x26, #0x0]\n"
+ "str s26, [x25, #0x0]\n"
+ "str s27, [x24, #0x0]\n"
+ "82:" // Height 4: Partial direct writeback: Done
+ "b 84f\n"
+ "83:" // Height 4: Full writeback
+ "str q24, [x10, #0x0]\n"
+ "add x10, x10, #0x10\n"
+ "str q25, [x26, #0x0]\n"
+ "str q26, [x25, #0x0]\n"
+ "str q27, [x24, #0x0]\n"
+ "84:" // Height 4: Writeback done
+ "subs x13, x13, #0x4\n"
+ "bgt 65b\n"
+ "b 170f\n"
+ "85:" // Height 5
+ "ldr x13, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x11, %x[bias]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x10, %x[output_ptr]\n"
+ "86:" // Height 5: Column loop
+ "cbz x11, 87f\n"
+ "ldr q24, [x11, #0x0]\n"
"mov v25.16b, v24.16b\n"
- "add x8, x8, #0x10\n"
+ "add x11, x11, #0x10\n"
"mov v26.16b, v24.16b\n"
"mov v27.16b, v24.16b\n"
"mov v28.16b, v24.16b\n"
- "b 97f\n"
- "92:" // Height 5: no bias
- "tbz %x[flags], #0, 96f\n"
- "cmp x6, #0x4\n"
- "bge 95f\n"
- "tbz x6, #1, 93f\n"
- "ldr d24, [x17], #0x8\n"
- "ldr d25, [x13], #0x8\n"
- "ldr d26, [x11], #0x8\n"
- "ldr d27, [x9], #0x8\n"
- "ldr d28, [x27], #0x8\n"
+ "b 92f\n"
+ "87:" // Height 5: no bias
+ "tbz %x[flags], #0, 91f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x13, #0x4\n"
+ "add x26, x10, x19, LSL #2\n"
+ "add x25, x26, x19, LSL #2\n"
+ "add x24, x25, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "bge 90f\n"
+ "tbz x13, #1, 88f\n"
+ "ldr d24, [x10], #0x8\n"
+ "ldr d25, [x26], #0x8\n"
"mov x19, #0x8\n"
- "tbz x6, #0, 94f\n"
- "ld1 { v24.s }[2], [x17]\n"
- "ld1 { v25.s }[2], [x13]\n"
- "ld1 { v26.s }[2], [x11]\n"
- "ld1 { v27.s }[2], [x9]\n"
- "ld1 { v28.s }[2], [x27]\n"
- "b 94f\n"
- "93:" // Height 5: Partial accumulate: partial_1_0
+ "ldr d26, [x25], #0x8\n"
+ "ldr d27, [x24], #0x8\n"
+ "ldr d28, [x23], #0x8\n"
+ "tbz x13, #0, 89f\n"
+ "ld1 { v24.s }[2], [x10]\n"
+ "ld1 { v25.s }[2], [x26]\n"
+ "ld1 { v26.s }[2], [x25]\n"
+ "ld1 { v27.s }[2], [x24]\n"
+ "ld1 { v28.s }[2], [x23]\n"
+ "b 89f\n"
+ "88:" // Height 5: Partial accumulate: partial_1_0
+ "ldr s24, [x10, #0x0]\n"
"mov x19, #0x0\n"
- "ldr s24, [x17, #0x0]\n"
- "ldr s25, [x13, #0x0]\n"
- "ldr s26, [x11, #0x0]\n"
- "ldr s27, [x9, #0x0]\n"
- "ldr s28, [x27, #0x0]\n"
- "94:" // Height 5: Partial accumulate: Done
- "sub x17, x17, x19\n"
- "sub x13, x13, x19\n"
- "sub x11, x11, x19\n"
- "sub x9, x9, x19\n"
- "sub x27, x27, x19\n"
- "b 97f\n"
- "95:" // Height 5: full accumulate
- "ldr q24, [x17, #0x0]\n"
- "ldr q25, [x13, #0x0]\n"
- "ldr q26, [x11, #0x0]\n"
- "ldr q27, [x9, #0x0]\n"
- "ldr q28, [x27, #0x0]\n"
- "b 97f\n"
- "96:" // Height 5: no accumulate
+ "ldr s25, [x26, #0x0]\n"
+ "ldr s26, [x25, #0x0]\n"
+ "ldr s27, [x24, #0x0]\n"
+ "ldr s28, [x23, #0x0]\n"
+ "89:" // Height 5: Partial accumulate: Done
+ "sub x10, x10, x19\n"
+ "b 92f\n"
+ "90:" // Height 5: full accumulate
+ "ldr q24, [x10, #0x0]\n"
+ "ldr q25, [x26, #0x0]\n"
+ "ldr q26, [x25, #0x0]\n"
+ "ldr q27, [x24, #0x0]\n"
+ "ldr q28, [x23, #0x0]\n"
+ "b 92f\n"
+ "91:" // Height 5: no accumulate
"movi v24.16b, #0x0\n"
"movi v25.16b, #0x0\n"
"movi v26.16b, #0x0\n"
"movi v27.16b, #0x0\n"
"movi v28.16b, #0x0\n"
- "97:" // Height 5: setup done
- "mov x16, #0x0\n"
- "98:" // Height 5: String loop
+ "92:" // Height 5: setup done
+ "mov x9, #0x0\n"
+ "93:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w15, [x20, x16, LSL #0x2]\n"
- "tbz %x[flags], #3, 99f\n"
- "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n"
+ "ldr w28, [x20, x9, LSL #0x2]\n"
+ "tbz %x[flags], #3, 94f\n"
+ "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x14, [x20, #0x0]\n"
- "ldr x12, [x20, #0x8]\n"
- "ldr x10, [x20, #0x10]\n"
- "ldr x28, [x20, #0x18]\n"
- "ldr x26, [x20, #0x20]\n"
- "cbnz x16, 100f\n"
+ "ldr x27, [x20, #0x0]\n"
+ "ldr x26, [x20, #0x8]\n"
+ "ldr x25, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "ldr x23, [x20, #0x20]\n"
+ "cbnz x9, 95f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x14, x14, x19, LSL #2\n"
- "add x12, x12, x19, LSL #2\n"
- "add x10, x10, x19, LSL #2\n"
- "add x28, x28, x19, LSL #2\n"
+ "add x27, x27, x19, LSL #2\n"
"add x26, x26, x19, LSL #2\n"
- "b 100f\n"
- "99:" // Height 5: setup direct input
- "mov x14, %x[input_ptr]\n"
- "add x12, x14, x19, LSL #2\n"
- "add x10, x12, x19, LSL #2\n"
- "add x28, x10, x19, LSL #2\n"
- "add x26, x28, x19, LSL #2\n"
- "100:" // Height 5: input setup done
- "cmp x15, #0x4\n"
- "blt 103f\n"
- "cmp x15, #0x8\n"
- "blt 102f\n"
- "101:" // Height 5: Multiply loop: Main loop head
- "ldr q0, [x14, #0x0]\n"
- "ldr q1, [x12, #0x0]\n"
- "ldr q2, [x10, #0x0]\n"
- "ldr q3, [x28, #0x0]\n"
- "ldr q4, [x26, #0x0]\n"
- "ldr q8, [x7, #0x0]\n"
+ "add x25, x25, x19, LSL #2\n"
+ "add x24, x24, x19, LSL #2\n"
+ "add x23, x23, x19, LSL #2\n"
+ "b 95f\n"
+ "94:" // Height 5: setup direct input
+ "mov x27, %x[input_ptr]\n"
+ "add x26, x27, x19, LSL #2\n"
+ "add x25, x26, x19, LSL #2\n"
+ "add x24, x25, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "95:" // Height 5: input setup done
+ "cmp x28, #0x4\n"
+ "blt 98f\n"
+ "ldr q0, [x27, #0x0]\n"
+ "ldr q1, [x26, #0x0]\n"
+ "cmp x28, #0x8\n"
+ "ldr q2, [x25, #0x0]\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q4, [x23, #0x0]\n"
+ "ldr q8, [x12, #0x0]\n"
+ "blt 97f\n"
+ "96:" // Height 5: Multiply loop: Main loop head
"fmla v24.4s, v8.4s, v0.s[0]\n"
- "ldr q9, [x7, #0x10]\n"
+ "ldr q9, [x12, #0x10]\n"
+ "add x27, x27, #0x10\n"
"fmla v25.4s, v8.4s, v1.s[0]\n"
- "ldr q10, [x7, #0x20]\n"
+ "ldr q10, [x12, #0x20]\n"
+ "add x26, x26, #0x10\n"
"fmla v26.4s, v8.4s, v2.s[0]\n"
- "ldr q11, [x7, #0x30]\n"
+ "ldr q11, [x12, #0x30]\n"
+ "add x25, x25, #0x10\n"
"fmla v27.4s, v8.4s, v3.s[0]\n"
- "add x14, x14, #0x10\n"
- "prfm pldl1keep, [x14, #0x80]\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
+ "add x24, x24, #0x10\n"
"fmla v28.4s, v8.4s, v4.s[0]\n"
- "add x12, x12, #0x10\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x23, x23, #0x10\n"
"fmla v24.4s, v9.4s, v0.s[1]\n"
- "prfm pldl1keep, [x12, #0x80]\n"
- "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "sub x28, x28, #0x4\n"
"fmla v25.4s, v9.4s, v1.s[1]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
- "add x28, x28, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "cmp x28, #0x8\n"
"fmla v26.4s, v9.4s, v2.s[1]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "add x26, x26, #0x10\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "add x12, x12, #0x40\n"
"fmla v27.4s, v9.4s, v3.s[1]\n"
+ "ldr q8, [x12, #0x0]\n"
+ "fmla v28.4s, v9.4s, v4.s[1]\n"
+ "fmla v24.4s, v10.4s, v0.s[2]\n"
+ "fmla v25.4s, v10.4s, v1.s[2]\n"
+ "fmla v26.4s, v10.4s, v2.s[2]\n"
+ "fmla v27.4s, v10.4s, v3.s[2]\n"
+ "fmla v28.4s, v10.4s, v4.s[2]\n"
+ "fmla v24.4s, v11.4s, v0.s[3]\n"
+ "ldr q0, [x27, #0x0]\n"
+ "fmla v25.4s, v11.4s, v1.s[3]\n"
+ "ldr q1, [x26, #0x0]\n"
+ "fmla v26.4s, v11.4s, v2.s[3]\n"
+ "ldr q2, [x25, #0x0]\n"
+ "fmla v27.4s, v11.4s, v3.s[3]\n"
+ "ldr q3, [x24, #0x0]\n"
+ "fmla v28.4s, v11.4s, v4.s[3]\n"
+ "ldr q4, [x23, #0x0]\n"
+ "bge 96b\n"
+ "97:" // Height 5: Multiply loop: Single iteration only
+ "fmla v24.4s, v8.4s, v0.s[0]\n"
+ "ldr q9, [x12, #0x10]\n"
+ "sub x28, x28, #0x4\n"
+ "fmla v25.4s, v8.4s, v1.s[0]\n"
+ "ldr q10, [x12, #0x20]\n"
+ "add x27, x27, #0x10\n"
+ "fmla v26.4s, v8.4s, v2.s[0]\n"
+ "ldr q11, [x12, #0x30]\n"
+ "add x26, x26, #0x10\n"
+ "fmla v27.4s, v8.4s, v3.s[0]\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
+ "add x25, x25, #0x10\n"
+ "fmla v28.4s, v8.4s, v4.s[0]\n"
"prfm pldl1keep, [x26, #0x80]\n"
- "sub x15, x15, #0x4\n"
+ "add x24, x24, #0x10\n"
+ "fmla v24.4s, v9.4s, v0.s[1]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x23, x23, #0x10\n"
+ "fmla v25.4s, v9.4s, v1.s[1]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "add x12, x12, #0x40\n"
+ "fmla v26.4s, v9.4s, v2.s[1]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "fmla v27.4s, v9.4s, v3.s[1]\n"
"fmla v28.4s, v9.4s, v4.s[1]\n"
- "cmp x15, #0x8\n"
"fmla v24.4s, v10.4s, v0.s[2]\n"
- "add x7, x7, #0x40\n"
"fmla v25.4s, v10.4s, v1.s[2]\n"
"fmla v26.4s, v10.4s, v2.s[2]\n"
"fmla v27.4s, v10.4s, v3.s[2]\n"
@@ -995,77 +985,39 @@ void a64_hybrid_fp32_mla_8x4 (
"fmla v26.4s, v11.4s, v2.s[3]\n"
"fmla v27.4s, v11.4s, v3.s[3]\n"
"fmla v28.4s, v11.4s, v4.s[3]\n"
- "bge 101b\n"
- "102:" // Height 5: Multiply loop: Single iteration only
- "sub x15, x15, #0x4\n"
- "ldr q0, [x14, #0x0]\n"
- "ldr q1, [x12, #0x0]\n"
- "ldr q2, [x10, #0x0]\n"
- "ldr q3, [x28, #0x0]\n"
- "ldr q4, [x26, #0x0]\n"
- "ldr q12, [x7, #0x0]\n"
+ "98:" // Height 5: Multiply loop: Main loop skip
+ "cbz x28, 100f\n"
+ "99:" // Height 5: Multiply loop: Odd block loop
+ "ldr s0, [x27], #0x4\n"
+ "sub x28, x28, #0x1\n"
+ "ldr s1, [x26], #0x4\n"
+ "ldr s2, [x25], #0x4\n"
+ "ldr s3, [x24], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
+ "ldr q12, [x12, #0x0]\n"
"fmla v24.4s, v12.4s, v0.s[0]\n"
- "ldr q13, [x7, #0x10]\n"
+ "add x12, x12, #0x10\n"
"fmla v25.4s, v12.4s, v1.s[0]\n"
- "ldr q14, [x7, #0x20]\n"
"fmla v26.4s, v12.4s, v2.s[0]\n"
- "ldr q15, [x7, #0x30]\n"
"fmla v27.4s, v12.4s, v3.s[0]\n"
- "add x14, x14, #0x10\n"
- "prfm pldl1keep, [x14, #0x80]\n"
"fmla v28.4s, v12.4s, v4.s[0]\n"
- "add x12, x12, #0x10\n"
- "fmla v24.4s, v13.4s, v0.s[1]\n"
- "prfm pldl1keep, [x12, #0x80]\n"
- "add x10, x10, #0x10\n"
- "fmla v25.4s, v13.4s, v1.s[1]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
- "add x28, x28, #0x10\n"
- "fmla v26.4s, v13.4s, v2.s[1]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "add x26, x26, #0x10\n"
- "fmla v27.4s, v13.4s, v3.s[1]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "add x7, x7, #0x40\n"
- "fmla v28.4s, v13.4s, v4.s[1]\n"
- "fmla v24.4s, v14.4s, v0.s[2]\n"
- "fmla v25.4s, v14.4s, v1.s[2]\n"
- "fmla v26.4s, v14.4s, v2.s[2]\n"
- "fmla v27.4s, v14.4s, v3.s[2]\n"
- "fmla v28.4s, v14.4s, v4.s[2]\n"
- "fmla v24.4s, v15.4s, v0.s[3]\n"
- "fmla v25.4s, v15.4s, v1.s[3]\n"
- "fmla v26.4s, v15.4s, v2.s[3]\n"
- "fmla v27.4s, v15.4s, v3.s[3]\n"
- "fmla v28.4s, v15.4s, v4.s[3]\n"
- "103:" // Height 5: Multiply loop: Main loop skip
- "cbz x15, 105f\n"
- "104:" // Height 5: Multiply loop: Odd block loop
- "ldr s0, [x14], #0x4\n"
- "ldr s1, [x12], #0x4\n"
- "ldr s2, [x10], #0x4\n"
- "ldr s3, [x28], #0x4\n"
- "ldr s4, [x26], #0x4\n"
- "ldr q16, [x7, #0x0]\n"
- "fmla v24.4s, v16.4s, v0.s[0]\n"
- "sub x15, x15, #0x1\n"
- "fmla v25.4s, v16.4s, v1.s[0]\n"
- "add x7, x7, #0x10\n"
- "fmla v26.4s, v16.4s, v2.s[0]\n"
- "fmla v27.4s, v16.4s, v3.s[0]\n"
- "fmla v28.4s, v16.4s, v4.s[0]\n"
- "cbnz x15, 104b\n"
- "105:" // Height 5: Multiply loop: No odd multiplies
+ "cbnz x28, 99b\n"
+ "100:" // Height 5: Multiply loop: No odd multiplies
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x16, x16, #0x1\n"
- "cmp x16, x19\n"
- "bne 98b\n"
- "prfm pstl1keep, [x17, #0x0]\n"
- "prfm pstl1keep, [x13, #0x0]\n"
- "prfm pstl1keep, [x11, #0x0]\n"
- "prfm pstl1keep, [x9, #0x0]\n"
- "prfm pstl1keep, [x27, #0x0]\n"
- "tbz %x[flags], #1, 106f\n"
+ "add x9, x9, #0x1\n"
+ "cmp x9, x19\n"
+ "bne 93b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "prfm pstl1keep, [x10, #0x0]\n"
+ "add x26, x10, x19, LSL #2\n"
+ "prfm pstl1keep, [x26, #0x0]\n"
+ "add x25, x26, x19, LSL #2\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "add x24, x25, x19, LSL #2\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "add x23, x24, x19, LSL #2\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "tbz %x[flags], #1, 101f\n"
"add x19, %x[args_ptr], %[offset_min]\n"
"ld1r { v17.4s }, [x19]\n"
"add x19, %x[args_ptr], %[offset_max]\n"
@@ -1080,206 +1032,232 @@ void a64_hybrid_fp32_mla_8x4 (
"fmax v27.4s, v27.4s, v17.4s\n"
"fmin v28.4s, v28.4s, v16.4s\n"
"fmax v28.4s, v28.4s, v17.4s\n"
- "106:" // Height 5: No activation
- "cmp x6, #0x4\n"
- "bge 109f\n"
- "tbz x6, #1, 107f\n"
- "str d24, [x17], #0x8\n"
- "str d25, [x13], #0x8\n"
- "str d26, [x11], #0x8\n"
- "str d27, [x9], #0x8\n"
- "str d28, [x27], #0x8\n"
- "tbz x6, #0, 108f\n"
- "st1 { v24.s }[2], [x17]\n"
- "st1 { v25.s }[2], [x13]\n"
- "st1 { v26.s }[2], [x11]\n"
- "st1 { v27.s }[2], [x9]\n"
- "st1 { v28.s }[2], [x27]\n"
- "b 108f\n"
- "107:" // Height 5: Partial direct writeback: partial_1_0
- "str s24, [x17, #0x0]\n"
- "str s25, [x13, #0x0]\n"
- "str s26, [x11, #0x0]\n"
- "str s27, [x9, #0x0]\n"
- "str s28, [x27, #0x0]\n"
- "108:" // Height 5: Partial direct writeback: Done
- "b 110f\n"
- "109:" // Height 5: Full writeback
- "str q24, [x17, #0x0]\n"
- "str q25, [x13, #0x0]\n"
- "str q26, [x11, #0x0]\n"
- "str q27, [x9, #0x0]\n"
- "str q28, [x27, #0x0]\n"
- "add x17, x17, #0x10\n"
- "add x13, x13, #0x10\n"
- "add x11, x11, #0x10\n"
- "add x9, x9, #0x10\n"
- "add x27, x27, #0x10\n"
- "110:" // Height 5: Writeback done
- "subs x6, x6, #0x4\n"
- "bgt 91b\n"
- "b 178f\n"
- "111:" // Height 6
- "ldr x6, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x8, %x[bias]\n"
- "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "tbz %x[flags], #2, 112f\n"
- "ldr x17, [%x[output_ptr], #0x0]\n"
- "add x17, x17, x19, LSL #2\n"
- "ldr x13, [%x[output_ptr], #0x8]\n"
- "ldr x11, [%x[output_ptr], #0x10]\n"
- "add x13, x13, x19, LSL #2\n"
- "ldr x9, [%x[output_ptr], #0x18]\n"
- "ldr x27, [%x[output_ptr], #0x20]\n"
- "add x11, x11, x19, LSL #2\n"
- "ldr x25, [%x[output_ptr], #0x28]\n"
- "add x9, x9, x19, LSL #2\n"
- "add x27, x27, x19, LSL #2\n"
- "add x25, x25, x19, LSL #2\n"
- "b 113f\n"
- "112:" // Height 6: setup direct output
- "mov x17, %x[output_ptr]\n"
- "add x13, x17, x19, LSL #2\n"
- "add x11, x13, x19, LSL #2\n"
- "add x9, x11, x19, LSL #2\n"
- "add x27, x9, x19, LSL #2\n"
- "add x25, x27, x19, LSL #2\n"
- "113:" // Height 6: Column loop
- "cbz x8, 114f\n"
- "ldr q24, [x8, #0x0]\n"
+ "101:" // Height 5: No activation
+ "cmp x13, #0x4\n"
+ "bge 104f\n"
+ "tbz x13, #1, 102f\n"
+ "str d24, [x10], #0x8\n"
+ "str d25, [x26], #0x8\n"
+ "str d26, [x25], #0x8\n"
+ "str d27, [x24], #0x8\n"
+ "str d28, [x23], #0x8\n"
+ "tbz x13, #0, 103f\n"
+ "st1 { v24.s }[2], [x10]\n"
+ "st1 { v25.s }[2], [x26]\n"
+ "st1 { v26.s }[2], [x25]\n"
+ "st1 { v27.s }[2], [x24]\n"
+ "st1 { v28.s }[2], [x23]\n"
+ "b 103f\n"
+ "102:" // Height 5: Partial direct writeback: partial_1_0
+ "str s24, [x10, #0x0]\n"
+ "str s25, [x26, #0x0]\n"
+ "str s26, [x25, #0x0]\n"
+ "str s27, [x24, #0x0]\n"
+ "str s28, [x23, #0x0]\n"
+ "103:" // Height 5: Partial direct writeback: Done
+ "b 105f\n"
+ "104:" // Height 5: Full writeback
+ "str q24, [x10, #0x0]\n"
+ "add x10, x10, #0x10\n"
+ "str q25, [x26, #0x0]\n"
+ "str q26, [x25, #0x0]\n"
+ "str q27, [x24, #0x0]\n"
+ "str q28, [x23, #0x0]\n"
+ "105:" // Height 5: Writeback done
+ "subs x13, x13, #0x4\n"
+ "bgt 86b\n"
+ "b 170f\n"
+ "106:" // Height 6
+ "ldr x13, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x11, %x[bias]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x10, %x[output_ptr]\n"
+ "107:" // Height 6: Column loop
+ "cbz x11, 108f\n"
+ "ldr q24, [x11, #0x0]\n"
"mov v25.16b, v24.16b\n"
- "add x8, x8, #0x10\n"
+ "add x11, x11, #0x10\n"
"mov v26.16b, v24.16b\n"
"mov v27.16b, v24.16b\n"
"mov v28.16b, v24.16b\n"
"mov v29.16b, v24.16b\n"
- "b 119f\n"
- "114:" // Height 6: no bias
- "tbz %x[flags], #0, 118f\n"
- "cmp x6, #0x4\n"
- "bge 117f\n"
- "tbz x6, #1, 115f\n"
- "ldr d24, [x17], #0x8\n"
- "ldr d25, [x13], #0x8\n"
- "ldr d26, [x11], #0x8\n"
- "ldr d27, [x9], #0x8\n"
- "ldr d28, [x27], #0x8\n"
- "ldr d29, [x25], #0x8\n"
+ "b 113f\n"
+ "108:" // Height 6: no bias
+ "tbz %x[flags], #0, 112f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x13, #0x4\n"
+ "add x26, x10, x19, LSL #2\n"
+ "add x25, x26, x19, LSL #2\n"
+ "add x24, x25, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "bge 111f\n"
+ "tbz x13, #1, 109f\n"
+ "ldr d24, [x10], #0x8\n"
+ "ldr d25, [x26], #0x8\n"
"mov x19, #0x8\n"
- "tbz x6, #0, 116f\n"
- "ld1 { v24.s }[2], [x17]\n"
- "ld1 { v25.s }[2], [x13]\n"
- "ld1 { v26.s }[2], [x11]\n"
- "ld1 { v27.s }[2], [x9]\n"
- "ld1 { v28.s }[2], [x27]\n"
- "ld1 { v29.s }[2], [x25]\n"
- "b 116f\n"
- "115:" // Height 6: Partial accumulate: partial_1_0
+ "ldr d26, [x25], #0x8\n"
+ "ldr d27, [x24], #0x8\n"
+ "ldr d28, [x23], #0x8\n"
+ "ldr d29, [x22], #0x8\n"
+ "tbz x13, #0, 110f\n"
+ "ld1 { v24.s }[2], [x10]\n"
+ "ld1 { v25.s }[2], [x26]\n"
+ "ld1 { v26.s }[2], [x25]\n"
+ "ld1 { v27.s }[2], [x24]\n"
+ "ld1 { v28.s }[2], [x23]\n"
+ "ld1 { v29.s }[2], [x22]\n"
+ "b 110f\n"
+ "109:" // Height 6: Partial accumulate: partial_1_0
+ "ldr s24, [x10, #0x0]\n"
"mov x19, #0x0\n"
- "ldr s24, [x17, #0x0]\n"
- "ldr s25, [x13, #0x0]\n"
- "ldr s26, [x11, #0x0]\n"
- "ldr s27, [x9, #0x0]\n"
- "ldr s28, [x27, #0x0]\n"
- "ldr s29, [x25, #0x0]\n"
- "116:" // Height 6: Partial accumulate: Done
- "sub x17, x17, x19\n"
- "sub x13, x13, x19\n"
- "sub x11, x11, x19\n"
- "sub x9, x9, x19\n"
- "sub x27, x27, x19\n"
- "sub x25, x25, x19\n"
- "b 119f\n"
- "117:" // Height 6: full accumulate
- "ldr q24, [x17, #0x0]\n"
- "ldr q25, [x13, #0x0]\n"
- "ldr q26, [x11, #0x0]\n"
- "ldr q27, [x9, #0x0]\n"
- "ldr q28, [x27, #0x0]\n"
- "ldr q29, [x25, #0x0]\n"
- "b 119f\n"
- "118:" // Height 6: no accumulate
+ "ldr s25, [x26, #0x0]\n"
+ "ldr s26, [x25, #0x0]\n"
+ "ldr s27, [x24, #0x0]\n"
+ "ldr s28, [x23, #0x0]\n"
+ "ldr s29, [x22, #0x0]\n"
+ "110:" // Height 6: Partial accumulate: Done
+ "sub x10, x10, x19\n"
+ "b 113f\n"
+ "111:" // Height 6: full accumulate
+ "ldr q24, [x10, #0x0]\n"
+ "ldr q25, [x26, #0x0]\n"
+ "ldr q26, [x25, #0x0]\n"
+ "ldr q27, [x24, #0x0]\n"
+ "ldr q28, [x23, #0x0]\n"
+ "ldr q29, [x22, #0x0]\n"
+ "b 113f\n"
+ "112:" // Height 6: no accumulate
"movi v24.16b, #0x0\n"
"movi v25.16b, #0x0\n"
"movi v26.16b, #0x0\n"
"movi v27.16b, #0x0\n"
"movi v28.16b, #0x0\n"
"movi v29.16b, #0x0\n"
- "119:" // Height 6: setup done
- "mov x16, #0x0\n"
- "120:" // Height 6: String loop
+ "113:" // Height 6: setup done
+ "mov x9, #0x0\n"
+ "114:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w15, [x20, x16, LSL #0x2]\n"
- "tbz %x[flags], #3, 121f\n"
- "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n"
+ "ldr w28, [x20, x9, LSL #0x2]\n"
+ "tbz %x[flags], #3, 115f\n"
+ "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x14, [x20, #0x0]\n"
- "ldr x12, [x20, #0x8]\n"
- "ldr x10, [x20, #0x10]\n"
- "ldr x28, [x20, #0x18]\n"
- "ldr x26, [x20, #0x20]\n"
- "ldr x24, [x20, #0x28]\n"
- "cbnz x16, 122f\n"
+ "ldr x27, [x20, #0x0]\n"
+ "ldr x26, [x20, #0x8]\n"
+ "ldr x25, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "ldr x23, [x20, #0x20]\n"
+ "ldr x22, [x20, #0x28]\n"
+ "cbnz x9, 116f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x14, x14, x19, LSL #2\n"
- "add x12, x12, x19, LSL #2\n"
- "add x10, x10, x19, LSL #2\n"
- "add x28, x28, x19, LSL #2\n"
+ "add x27, x27, x19, LSL #2\n"
"add x26, x26, x19, LSL #2\n"
+ "add x25, x25, x19, LSL #2\n"
"add x24, x24, x19, LSL #2\n"
- "b 122f\n"
- "121:" // Height 6: setup direct input
- "mov x14, %x[input_ptr]\n"
- "add x12, x14, x19, LSL #2\n"
- "add x10, x12, x19, LSL #2\n"
- "add x28, x10, x19, LSL #2\n"
- "add x26, x28, x19, LSL #2\n"
- "add x24, x26, x19, LSL #2\n"
- "122:" // Height 6: input setup done
- "cmp x15, #0x4\n"
- "blt 125f\n"
- "cmp x15, #0x8\n"
- "blt 124f\n"
- "123:" // Height 6: Multiply loop: Main loop head
- "ldr q0, [x14, #0x0]\n"
- "ldr q1, [x12, #0x0]\n"
- "ldr q2, [x10, #0x0]\n"
- "ldr q3, [x28, #0x0]\n"
- "ldr q4, [x26, #0x0]\n"
- "ldr q5, [x24, #0x0]\n"
- "ldr q8, [x7, #0x0]\n"
+ "add x23, x23, x19, LSL #2\n"
+ "add x22, x22, x19, LSL #2\n"
+ "b 116f\n"
+ "115:" // Height 6: setup direct input
+ "mov x27, %x[input_ptr]\n"
+ "add x26, x27, x19, LSL #2\n"
+ "add x25, x26, x19, LSL #2\n"
+ "add x24, x25, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "116:" // Height 6: input setup done
+ "cmp x28, #0x4\n"
+ "blt 119f\n"
+ "ldr q0, [x27, #0x0]\n"
+ "ldr q1, [x26, #0x0]\n"
+ "cmp x28, #0x8\n"
+ "ldr q2, [x25, #0x0]\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q4, [x23, #0x0]\n"
+ "ldr q5, [x22, #0x0]\n"
+ "ldr q8, [x12, #0x0]\n"
+ "blt 118f\n"
+ "117:" // Height 6: Multiply loop: Main loop head
"fmla v24.4s, v8.4s, v0.s[0]\n"
- "ldr q9, [x7, #0x10]\n"
+ "ldr q9, [x12, #0x10]\n"
+ "add x27, x27, #0x10\n"
"fmla v25.4s, v8.4s, v1.s[0]\n"
- "ldr q10, [x7, #0x20]\n"
+ "ldr q10, [x12, #0x20]\n"
+ "add x26, x26, #0x10\n"
"fmla v26.4s, v8.4s, v2.s[0]\n"
- "ldr q11, [x7, #0x30]\n"
+ "ldr q11, [x12, #0x30]\n"
+ "add x25, x25, #0x10\n"
"fmla v27.4s, v8.4s, v3.s[0]\n"
- "add x14, x14, #0x10\n"
- "prfm pldl1keep, [x14, #0x80]\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
+ "add x24, x24, #0x10\n"
"fmla v28.4s, v8.4s, v4.s[0]\n"
- "add x12, x12, #0x10\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x23, x23, #0x10\n"
"fmla v29.4s, v8.4s, v5.s[0]\n"
- "prfm pldl1keep, [x12, #0x80]\n"
- "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x22, x22, #0x10\n"
"fmla v24.4s, v9.4s, v0.s[1]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
- "add x28, x28, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "sub x28, x28, #0x4\n"
"fmla v25.4s, v9.4s, v1.s[1]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "add x26, x26, #0x10\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "cmp x28, #0x8\n"
"fmla v26.4s, v9.4s, v2.s[1]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "add x12, x12, #0x40\n"
+ "fmla v27.4s, v9.4s, v3.s[1]\n"
+ "ldr q8, [x12, #0x0]\n"
+ "fmla v28.4s, v9.4s, v4.s[1]\n"
+ "fmla v29.4s, v9.4s, v5.s[1]\n"
+ "fmla v24.4s, v10.4s, v0.s[2]\n"
+ "fmla v25.4s, v10.4s, v1.s[2]\n"
+ "fmla v26.4s, v10.4s, v2.s[2]\n"
+ "fmla v27.4s, v10.4s, v3.s[2]\n"
+ "fmla v28.4s, v10.4s, v4.s[2]\n"
+ "fmla v29.4s, v10.4s, v5.s[2]\n"
+ "fmla v24.4s, v11.4s, v0.s[3]\n"
+ "ldr q0, [x27, #0x0]\n"
+ "fmla v25.4s, v11.4s, v1.s[3]\n"
+ "ldr q1, [x26, #0x0]\n"
+ "fmla v26.4s, v11.4s, v2.s[3]\n"
+ "ldr q2, [x25, #0x0]\n"
+ "fmla v27.4s, v11.4s, v3.s[3]\n"
+ "ldr q3, [x24, #0x0]\n"
+ "fmla v28.4s, v11.4s, v4.s[3]\n"
+ "ldr q4, [x23, #0x0]\n"
+ "fmla v29.4s, v11.4s, v5.s[3]\n"
+ "ldr q5, [x22, #0x0]\n"
+ "bge 117b\n"
+ "118:" // Height 6: Multiply loop: Single iteration only
+ "fmla v24.4s, v8.4s, v0.s[0]\n"
+ "ldr q9, [x12, #0x10]\n"
+ "sub x28, x28, #0x4\n"
+ "fmla v25.4s, v8.4s, v1.s[0]\n"
+ "ldr q10, [x12, #0x20]\n"
+ "add x27, x27, #0x10\n"
+ "fmla v26.4s, v8.4s, v2.s[0]\n"
+ "ldr q11, [x12, #0x30]\n"
+ "add x26, x26, #0x10\n"
+ "fmla v27.4s, v8.4s, v3.s[0]\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
+ "add x25, x25, #0x10\n"
+ "fmla v28.4s, v8.4s, v4.s[0]\n"
"prfm pldl1keep, [x26, #0x80]\n"
"add x24, x24, #0x10\n"
- "fmla v27.4s, v9.4s, v3.s[1]\n"
+ "fmla v29.4s, v8.4s, v5.s[0]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x23, x23, #0x10\n"
+ "fmla v24.4s, v9.4s, v0.s[1]\n"
"prfm pldl1keep, [x24, #0x80]\n"
- "sub x15, x15, #0x4\n"
+ "add x22, x22, #0x10\n"
+ "fmla v25.4s, v9.4s, v1.s[1]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "add x12, x12, #0x40\n"
+ "fmla v26.4s, v9.4s, v2.s[1]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "fmla v27.4s, v9.4s, v3.s[1]\n"
"fmla v28.4s, v9.4s, v4.s[1]\n"
- "cmp x15, #0x8\n"
"fmla v29.4s, v9.4s, v5.s[1]\n"
- "add x7, x7, #0x40\n"
"fmla v24.4s, v10.4s, v0.s[2]\n"
"fmla v25.4s, v10.4s, v1.s[2]\n"
"fmla v26.4s, v10.4s, v2.s[2]\n"
@@ -1292,87 +1270,43 @@ void a64_hybrid_fp32_mla_8x4 (
"fmla v27.4s, v11.4s, v3.s[3]\n"
"fmla v28.4s, v11.4s, v4.s[3]\n"
"fmla v29.4s, v11.4s, v5.s[3]\n"
- "bge 123b\n"
- "124:" // Height 6: Multiply loop: Single iteration only
- "sub x15, x15, #0x4\n"
- "ldr q0, [x14, #0x0]\n"
- "ldr q1, [x12, #0x0]\n"
- "ldr q2, [x10, #0x0]\n"
- "ldr q3, [x28, #0x0]\n"
- "ldr q4, [x26, #0x0]\n"
- "ldr q5, [x24, #0x0]\n"
- "ldr q12, [x7, #0x0]\n"
+ "119:" // Height 6: Multiply loop: Main loop skip
+ "cbz x28, 121f\n"
+ "120:" // Height 6: Multiply loop: Odd block loop
+ "ldr s0, [x27], #0x4\n"
+ "sub x28, x28, #0x1\n"
+ "ldr s1, [x26], #0x4\n"
+ "ldr s2, [x25], #0x4\n"
+ "ldr s3, [x24], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
+ "ldr s5, [x22], #0x4\n"
+ "ldr q12, [x12, #0x0]\n"
"fmla v24.4s, v12.4s, v0.s[0]\n"
- "ldr q13, [x7, #0x10]\n"
+ "add x12, x12, #0x10\n"
"fmla v25.4s, v12.4s, v1.s[0]\n"
- "ldr q14, [x7, #0x20]\n"
"fmla v26.4s, v12.4s, v2.s[0]\n"
- "ldr q15, [x7, #0x30]\n"
"fmla v27.4s, v12.4s, v3.s[0]\n"
- "add x14, x14, #0x10\n"
- "prfm pldl1keep, [x14, #0x80]\n"
"fmla v28.4s, v12.4s, v4.s[0]\n"
- "add x12, x12, #0x10\n"
"fmla v29.4s, v12.4s, v5.s[0]\n"
- "prfm pldl1keep, [x12, #0x80]\n"
- "add x10, x10, #0x10\n"
- "fmla v24.4s, v13.4s, v0.s[1]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
- "add x28, x28, #0x10\n"
- "fmla v25.4s, v13.4s, v1.s[1]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "add x26, x26, #0x10\n"
- "fmla v26.4s, v13.4s, v2.s[1]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "add x24, x24, #0x10\n"
- "fmla v27.4s, v13.4s, v3.s[1]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "add x7, x7, #0x40\n"
- "fmla v28.4s, v13.4s, v4.s[1]\n"
- "fmla v29.4s, v13.4s, v5.s[1]\n"
- "fmla v24.4s, v14.4s, v0.s[2]\n"
- "fmla v25.4s, v14.4s, v1.s[2]\n"
- "fmla v26.4s, v14.4s, v2.s[2]\n"
- "fmla v27.4s, v14.4s, v3.s[2]\n"
- "fmla v28.4s, v14.4s, v4.s[2]\n"
- "fmla v29.4s, v14.4s, v5.s[2]\n"
- "fmla v24.4s, v15.4s, v0.s[3]\n"
- "fmla v25.4s, v15.4s, v1.s[3]\n"
- "fmla v26.4s, v15.4s, v2.s[3]\n"
- "fmla v27.4s, v15.4s, v3.s[3]\n"
- "fmla v28.4s, v15.4s, v4.s[3]\n"
- "fmla v29.4s, v15.4s, v5.s[3]\n"
- "125:" // Height 6: Multiply loop: Main loop skip
- "cbz x15, 127f\n"
- "126:" // Height 6: Multiply loop: Odd block loop
- "ldr s0, [x14], #0x4\n"
- "ldr s1, [x12], #0x4\n"
- "ldr s2, [x10], #0x4\n"
- "ldr s3, [x28], #0x4\n"
- "ldr s4, [x26], #0x4\n"
- "ldr s5, [x24], #0x4\n"
- "ldr q16, [x7, #0x0]\n"
- "fmla v24.4s, v16.4s, v0.s[0]\n"
- "sub x15, x15, #0x1\n"
- "fmla v25.4s, v16.4s, v1.s[0]\n"
- "add x7, x7, #0x10\n"
- "fmla v26.4s, v16.4s, v2.s[0]\n"
- "fmla v27.4s, v16.4s, v3.s[0]\n"
- "fmla v28.4s, v16.4s, v4.s[0]\n"
- "fmla v29.4s, v16.4s, v5.s[0]\n"
- "cbnz x15, 126b\n"
- "127:" // Height 6: Multiply loop: No odd multiplies
+ "cbnz x28, 120b\n"
+ "121:" // Height 6: Multiply loop: No odd multiplies
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x16, x16, #0x1\n"
- "cmp x16, x19\n"
- "bne 120b\n"
- "prfm pstl1keep, [x17, #0x0]\n"
- "prfm pstl1keep, [x13, #0x0]\n"
- "prfm pstl1keep, [x11, #0x0]\n"
- "prfm pstl1keep, [x9, #0x0]\n"
- "prfm pstl1keep, [x27, #0x0]\n"
+ "add x9, x9, #0x1\n"
+ "cmp x9, x19\n"
+ "bne 114b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "prfm pstl1keep, [x10, #0x0]\n"
+ "add x26, x10, x19, LSL #2\n"
+ "prfm pstl1keep, [x26, #0x0]\n"
+ "add x25, x26, x19, LSL #2\n"
"prfm pstl1keep, [x25, #0x0]\n"
- "tbz %x[flags], #1, 128f\n"
+ "add x24, x25, x19, LSL #2\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "add x23, x24, x19, LSL #2\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "add x22, x23, x19, LSL #2\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "tbz %x[flags], #1, 122f\n"
"add x19, %x[args_ptr], %[offset_min]\n"
"ld1r { v17.4s }, [x19]\n"
"add x19, %x[args_ptr], %[offset_max]\n"
@@ -1389,140 +1323,112 @@ void a64_hybrid_fp32_mla_8x4 (
"fmin v29.4s, v29.4s, v16.4s\n"
"fmax v28.4s, v28.4s, v17.4s\n"
"fmax v29.4s, v29.4s, v17.4s\n"
- "128:" // Height 6: No activation
- "cmp x6, #0x4\n"
- "bge 131f\n"
- "tbz x6, #1, 129f\n"
- "str d24, [x17], #0x8\n"
- "str d25, [x13], #0x8\n"
- "str d26, [x11], #0x8\n"
- "str d27, [x9], #0x8\n"
- "str d28, [x27], #0x8\n"
- "str d29, [x25], #0x8\n"
- "tbz x6, #0, 130f\n"
- "st1 { v24.s }[2], [x17]\n"
- "st1 { v25.s }[2], [x13]\n"
- "st1 { v26.s }[2], [x11]\n"
- "st1 { v27.s }[2], [x9]\n"
- "st1 { v28.s }[2], [x27]\n"
- "st1 { v29.s }[2], [x25]\n"
- "b 130f\n"
- "129:" // Height 6: Partial direct writeback: partial_1_0
- "str s24, [x17, #0x0]\n"
- "str s25, [x13, #0x0]\n"
- "str s26, [x11, #0x0]\n"
- "str s27, [x9, #0x0]\n"
- "str s28, [x27, #0x0]\n"
- "str s29, [x25, #0x0]\n"
- "130:" // Height 6: Partial direct writeback: Done
- "b 132f\n"
- "131:" // Height 6: Full writeback
- "str q24, [x17, #0x0]\n"
- "str q25, [x13, #0x0]\n"
- "str q26, [x11, #0x0]\n"
- "str q27, [x9, #0x0]\n"
- "str q28, [x27, #0x0]\n"
- "str q29, [x25, #0x0]\n"
- "add x17, x17, #0x10\n"
- "add x13, x13, #0x10\n"
- "add x11, x11, #0x10\n"
- "add x9, x9, #0x10\n"
- "add x27, x27, #0x10\n"
- "add x25, x25, #0x10\n"
- "132:" // Height 6: Writeback done
- "subs x6, x6, #0x4\n"
- "bgt 113b\n"
- "b 178f\n"
- "133:" // Height 7
- "ldr x6, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x8, %x[bias]\n"
- "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "tbz %x[flags], #2, 134f\n"
- "ldr x17, [%x[output_ptr], #0x0]\n"
- "add x17, x17, x19, LSL #2\n"
- "ldr x13, [%x[output_ptr], #0x8]\n"
- "ldr x11, [%x[output_ptr], #0x10]\n"
- "add x13, x13, x19, LSL #2\n"
- "ldr x9, [%x[output_ptr], #0x18]\n"
- "ldr x27, [%x[output_ptr], #0x20]\n"
- "add x11, x11, x19, LSL #2\n"
- "ldr x25, [%x[output_ptr], #0x28]\n"
- "add x9, x9, x19, LSL #2\n"
- "ldr x23, [%x[output_ptr], #0x30]\n"
- "add x27, x27, x19, LSL #2\n"
- "add x25, x25, x19, LSL #2\n"
- "add x23, x23, x19, LSL #2\n"
- "b 135f\n"
- "134:" // Height 7: setup direct output
- "mov x17, %x[output_ptr]\n"
- "add x13, x17, x19, LSL #2\n"
- "add x11, x13, x19, LSL #2\n"
- "add x9, x11, x19, LSL #2\n"
- "add x27, x9, x19, LSL #2\n"
- "add x25, x27, x19, LSL #2\n"
- "add x23, x25, x19, LSL #2\n"
- "135:" // Height 7: Column loop
- "cbz x8, 136f\n"
- "ldr q24, [x8, #0x0]\n"
+ "122:" // Height 6: No activation
+ "cmp x13, #0x4\n"
+ "bge 125f\n"
+ "tbz x13, #1, 123f\n"
+ "str d24, [x10], #0x8\n"
+ "str d25, [x26], #0x8\n"
+ "str d26, [x25], #0x8\n"
+ "str d27, [x24], #0x8\n"
+ "str d28, [x23], #0x8\n"
+ "str d29, [x22], #0x8\n"
+ "tbz x13, #0, 124f\n"
+ "st1 { v24.s }[2], [x10]\n"
+ "st1 { v25.s }[2], [x26]\n"
+ "st1 { v26.s }[2], [x25]\n"
+ "st1 { v27.s }[2], [x24]\n"
+ "st1 { v28.s }[2], [x23]\n"
+ "st1 { v29.s }[2], [x22]\n"
+ "b 124f\n"
+ "123:" // Height 6: Partial direct writeback: partial_1_0
+ "str s24, [x10, #0x0]\n"
+ "str s25, [x26, #0x0]\n"
+ "str s26, [x25, #0x0]\n"
+ "str s27, [x24, #0x0]\n"
+ "str s28, [x23, #0x0]\n"
+ "str s29, [x22, #0x0]\n"
+ "124:" // Height 6: Partial direct writeback: Done
+ "b 126f\n"
+ "125:" // Height 6: Full writeback
+ "str q24, [x10, #0x0]\n"
+ "add x10, x10, #0x10\n"
+ "str q25, [x26, #0x0]\n"
+ "str q26, [x25, #0x0]\n"
+ "str q27, [x24, #0x0]\n"
+ "str q28, [x23, #0x0]\n"
+ "str q29, [x22, #0x0]\n"
+ "126:" // Height 6: Writeback done
+ "subs x13, x13, #0x4\n"
+ "bgt 107b\n"
+ "b 170f\n"
+ "127:" // Height 7
+ "ldr x13, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x11, %x[bias]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x10, %x[output_ptr]\n"
+ "128:" // Height 7: Column loop
+ "cbz x11, 129f\n"
+ "ldr q24, [x11, #0x0]\n"
"mov v25.16b, v24.16b\n"
- "add x8, x8, #0x10\n"
+ "add x11, x11, #0x10\n"
"mov v26.16b, v24.16b\n"
"mov v27.16b, v24.16b\n"
"mov v28.16b, v24.16b\n"
"mov v29.16b, v24.16b\n"
"mov v30.16b, v24.16b\n"
- "b 141f\n"
- "136:" // Height 7: no bias
- "tbz %x[flags], #0, 140f\n"
- "cmp x6, #0x4\n"
- "bge 139f\n"
- "tbz x6, #1, 137f\n"
- "ldr d24, [x17], #0x8\n"
- "ldr d25, [x13], #0x8\n"
- "ldr d26, [x11], #0x8\n"
- "ldr d27, [x9], #0x8\n"
- "ldr d28, [x27], #0x8\n"
- "ldr d29, [x25], #0x8\n"
- "ldr d30, [x23], #0x8\n"
+ "b 134f\n"
+ "129:" // Height 7: no bias
+ "tbz %x[flags], #0, 133f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x13, #0x4\n"
+ "add x26, x10, x19, LSL #2\n"
+ "add x25, x26, x19, LSL #2\n"
+ "add x24, x25, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "add x21, x22, x19, LSL #2\n"
+ "bge 132f\n"
+ "tbz x13, #1, 130f\n"
+ "ldr d24, [x10], #0x8\n"
+ "ldr d25, [x26], #0x8\n"
"mov x19, #0x8\n"
- "tbz x6, #0, 138f\n"
- "ld1 { v24.s }[2], [x17]\n"
- "ld1 { v25.s }[2], [x13]\n"
- "ld1 { v26.s }[2], [x11]\n"
- "ld1 { v27.s }[2], [x9]\n"
- "ld1 { v28.s }[2], [x27]\n"
- "ld1 { v29.s }[2], [x25]\n"
- "ld1 { v30.s }[2], [x23]\n"
- "b 138f\n"
- "137:" // Height 7: Partial accumulate: partial_1_0
+ "ldr d26, [x25], #0x8\n"
+ "ldr d27, [x24], #0x8\n"
+ "ldr d28, [x23], #0x8\n"
+ "ldr d29, [x22], #0x8\n"
+ "ldr d30, [x21], #0x8\n"
+ "tbz x13, #0, 131f\n"
+ "ld1 { v24.s }[2], [x10]\n"
+ "ld1 { v25.s }[2], [x26]\n"
+ "ld1 { v26.s }[2], [x25]\n"
+ "ld1 { v27.s }[2], [x24]\n"
+ "ld1 { v28.s }[2], [x23]\n"
+ "ld1 { v29.s }[2], [x22]\n"
+ "ld1 { v30.s }[2], [x21]\n"
+ "b 131f\n"
+ "130:" // Height 7: Partial accumulate: partial_1_0
+ "ldr s24, [x10, #0x0]\n"
"mov x19, #0x0\n"
- "ldr s24, [x17, #0x0]\n"
- "ldr s25, [x13, #0x0]\n"
- "ldr s26, [x11, #0x0]\n"
- "ldr s27, [x9, #0x0]\n"
- "ldr s28, [x27, #0x0]\n"
- "ldr s29, [x25, #0x0]\n"
- "ldr s30, [x23, #0x0]\n"
- "138:" // Height 7: Partial accumulate: Done
- "sub x17, x17, x19\n"
- "sub x13, x13, x19\n"
- "sub x11, x11, x19\n"
- "sub x9, x9, x19\n"
- "sub x27, x27, x19\n"
- "sub x25, x25, x19\n"
- "sub x23, x23, x19\n"
- "b 141f\n"
- "139:" // Height 7: full accumulate
- "ldr q24, [x17, #0x0]\n"
- "ldr q25, [x13, #0x0]\n"
- "ldr q26, [x11, #0x0]\n"
- "ldr q27, [x9, #0x0]\n"
- "ldr q28, [x27, #0x0]\n"
- "ldr q29, [x25, #0x0]\n"
- "ldr q30, [x23, #0x0]\n"
- "b 141f\n"
- "140:" // Height 7: no accumulate
+ "ldr s25, [x26, #0x0]\n"
+ "ldr s26, [x25, #0x0]\n"
+ "ldr s27, [x24, #0x0]\n"
+ "ldr s28, [x23, #0x0]\n"
+ "ldr s29, [x22, #0x0]\n"
+ "ldr s30, [x21, #0x0]\n"
+ "131:" // Height 7: Partial accumulate: Done
+ "sub x10, x10, x19\n"
+ "b 134f\n"
+ "132:" // Height 7: full accumulate
+ "ldr q24, [x10, #0x0]\n"
+ "ldr q25, [x26, #0x0]\n"
+ "ldr q26, [x25, #0x0]\n"
+ "ldr q27, [x24, #0x0]\n"
+ "ldr q28, [x23, #0x0]\n"
+ "ldr q29, [x22, #0x0]\n"
+ "ldr q30, [x21, #0x0]\n"
+ "b 134f\n"
+ "133:" // Height 7: no accumulate
"movi v24.16b, #0x0\n"
"movi v25.16b, #0x0\n"
"movi v26.16b, #0x0\n"
@@ -1530,87 +1436,144 @@ void a64_hybrid_fp32_mla_8x4 (
"movi v28.16b, #0x0\n"
"movi v29.16b, #0x0\n"
"movi v30.16b, #0x0\n"
- "141:" // Height 7: setup done
- "mov x16, #0x0\n"
- "142:" // Height 7: String loop
+ "134:" // Height 7: setup done
+ "mov x9, #0x0\n"
+ "135:" // Height 7: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w15, [x20, x16, LSL #0x2]\n"
- "tbz %x[flags], #3, 143f\n"
- "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n"
+ "ldr w28, [x20, x9, LSL #0x2]\n"
+ "tbz %x[flags], #3, 136f\n"
+ "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x14, [x20, #0x0]\n"
- "ldr x12, [x20, #0x8]\n"
- "ldr x10, [x20, #0x10]\n"
- "ldr x28, [x20, #0x18]\n"
- "ldr x26, [x20, #0x20]\n"
- "ldr x24, [x20, #0x28]\n"
- "ldr x22, [x20, #0x30]\n"
- "cbnz x16, 144f\n"
+ "ldr x27, [x20, #0x0]\n"
+ "ldr x26, [x20, #0x8]\n"
+ "ldr x25, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "ldr x23, [x20, #0x20]\n"
+ "ldr x22, [x20, #0x28]\n"
+ "ldr x21, [x20, #0x30]\n"
+ "cbnz x9, 137f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x14, x14, x19, LSL #2\n"
- "add x12, x12, x19, LSL #2\n"
- "add x10, x10, x19, LSL #2\n"
- "add x28, x28, x19, LSL #2\n"
+ "add x27, x27, x19, LSL #2\n"
"add x26, x26, x19, LSL #2\n"
+ "add x25, x25, x19, LSL #2\n"
"add x24, x24, x19, LSL #2\n"
+ "add x23, x23, x19, LSL #2\n"
"add x22, x22, x19, LSL #2\n"
- "b 144f\n"
- "143:" // Height 7: setup direct input
- "mov x14, %x[input_ptr]\n"
- "add x12, x14, x19, LSL #2\n"
- "add x10, x12, x19, LSL #2\n"
- "add x28, x10, x19, LSL #2\n"
- "add x26, x28, x19, LSL #2\n"
- "add x24, x26, x19, LSL #2\n"
- "add x22, x24, x19, LSL #2\n"
- "144:" // Height 7: input setup done
- "cmp x15, #0x4\n"
- "blt 147f\n"
- "cmp x15, #0x8\n"
- "blt 146f\n"
- "145:" // Height 7: Multiply loop: Main loop head
- "ldr q0, [x14, #0x0]\n"
- "ldr q1, [x12, #0x0]\n"
- "ldr q2, [x10, #0x0]\n"
- "ldr q3, [x28, #0x0]\n"
- "ldr q4, [x26, #0x0]\n"
- "ldr q5, [x24, #0x0]\n"
- "ldr q6, [x22, #0x0]\n"
- "ldr q8, [x7, #0x0]\n"
+ "add x21, x21, x19, LSL #2\n"
+ "b 137f\n"
+ "136:" // Height 7: setup direct input
+ "mov x27, %x[input_ptr]\n"
+ "add x26, x27, x19, LSL #2\n"
+ "add x25, x26, x19, LSL #2\n"
+ "add x24, x25, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "add x21, x22, x19, LSL #2\n"
+ "137:" // Height 7: input setup done
+ "cmp x28, #0x4\n"
+ "blt 140f\n"
+ "ldr q0, [x27, #0x0]\n"
+ "ldr q1, [x26, #0x0]\n"
+ "cmp x28, #0x8\n"
+ "ldr q2, [x25, #0x0]\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q4, [x23, #0x0]\n"
+ "ldr q5, [x22, #0x0]\n"
+ "ldr q6, [x21, #0x0]\n"
+ "ldr q8, [x12, #0x0]\n"
+ "blt 139f\n"
+ "138:" // Height 7: Multiply loop: Main loop head
"fmla v24.4s, v8.4s, v0.s[0]\n"
- "ldr q9, [x7, #0x10]\n"
+ "ldr q9, [x12, #0x10]\n"
+ "add x27, x27, #0x10\n"
"fmla v25.4s, v8.4s, v1.s[0]\n"
- "ldr q10, [x7, #0x20]\n"
+ "ldr q10, [x12, #0x20]\n"
+ "add x26, x26, #0x10\n"
"fmla v26.4s, v8.4s, v2.s[0]\n"
- "ldr q11, [x7, #0x30]\n"
+ "ldr q11, [x12, #0x30]\n"
+ "add x25, x25, #0x10\n"
"fmla v27.4s, v8.4s, v3.s[0]\n"
- "add x14, x14, #0x10\n"
- "prfm pldl1keep, [x14, #0x80]\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
+ "add x24, x24, #0x10\n"
"fmla v28.4s, v8.4s, v4.s[0]\n"
- "add x12, x12, #0x10\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x23, x23, #0x10\n"
"fmla v29.4s, v8.4s, v5.s[0]\n"
- "prfm pldl1keep, [x12, #0x80]\n"
- "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x22, x22, #0x10\n"
"fmla v30.4s, v8.4s, v6.s[0]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
- "add x28, x28, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "add x21, x21, #0x10\n"
"fmla v24.4s, v9.4s, v0.s[1]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "add x26, x26, #0x10\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "sub x28, x28, #0x4\n"
"fmla v25.4s, v9.4s, v1.s[1]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "cmp x28, #0x8\n"
+ "fmla v26.4s, v9.4s, v2.s[1]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
+ "add x12, x12, #0x40\n"
+ "fmla v27.4s, v9.4s, v3.s[1]\n"
+ "ldr q8, [x12, #0x0]\n"
+ "fmla v28.4s, v9.4s, v4.s[1]\n"
+ "fmla v29.4s, v9.4s, v5.s[1]\n"
+ "fmla v30.4s, v9.4s, v6.s[1]\n"
+ "fmla v24.4s, v10.4s, v0.s[2]\n"
+ "fmla v25.4s, v10.4s, v1.s[2]\n"
+ "fmla v26.4s, v10.4s, v2.s[2]\n"
+ "fmla v27.4s, v10.4s, v3.s[2]\n"
+ "fmla v28.4s, v10.4s, v4.s[2]\n"
+ "fmla v29.4s, v10.4s, v5.s[2]\n"
+ "fmla v30.4s, v10.4s, v6.s[2]\n"
+ "fmla v24.4s, v11.4s, v0.s[3]\n"
+ "ldr q0, [x27, #0x0]\n"
+ "fmla v25.4s, v11.4s, v1.s[3]\n"
+ "ldr q1, [x26, #0x0]\n"
+ "fmla v26.4s, v11.4s, v2.s[3]\n"
+ "ldr q2, [x25, #0x0]\n"
+ "fmla v27.4s, v11.4s, v3.s[3]\n"
+ "ldr q3, [x24, #0x0]\n"
+ "fmla v28.4s, v11.4s, v4.s[3]\n"
+ "ldr q4, [x23, #0x0]\n"
+ "fmla v29.4s, v11.4s, v5.s[3]\n"
+ "ldr q5, [x22, #0x0]\n"
+ "fmla v30.4s, v11.4s, v6.s[3]\n"
+ "ldr q6, [x21, #0x0]\n"
+ "bge 138b\n"
+ "139:" // Height 7: Multiply loop: Single iteration only
+ "fmla v24.4s, v8.4s, v0.s[0]\n"
+ "ldr q9, [x12, #0x10]\n"
+ "sub x28, x28, #0x4\n"
+ "fmla v25.4s, v8.4s, v1.s[0]\n"
+ "ldr q10, [x12, #0x20]\n"
+ "add x27, x27, #0x10\n"
+ "fmla v26.4s, v8.4s, v2.s[0]\n"
+ "ldr q11, [x12, #0x30]\n"
+ "add x26, x26, #0x10\n"
+ "fmla v27.4s, v8.4s, v3.s[0]\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
+ "add x25, x25, #0x10\n"
+ "fmla v28.4s, v8.4s, v4.s[0]\n"
"prfm pldl1keep, [x26, #0x80]\n"
"add x24, x24, #0x10\n"
- "fmla v26.4s, v9.4s, v2.s[1]\n"
+ "fmla v29.4s, v8.4s, v5.s[0]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x23, x23, #0x10\n"
+ "fmla v30.4s, v8.4s, v6.s[0]\n"
"prfm pldl1keep, [x24, #0x80]\n"
"add x22, x22, #0x10\n"
- "fmla v27.4s, v9.4s, v3.s[1]\n"
+ "fmla v24.4s, v9.4s, v0.s[1]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "add x21, x21, #0x10\n"
+ "fmla v25.4s, v9.4s, v1.s[1]\n"
"prfm pldl1keep, [x22, #0x80]\n"
- "sub x15, x15, #0x4\n"
+ "add x12, x12, #0x40\n"
+ "fmla v26.4s, v9.4s, v2.s[1]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
+ "fmla v27.4s, v9.4s, v3.s[1]\n"
"fmla v28.4s, v9.4s, v4.s[1]\n"
- "cmp x15, #0x8\n"
"fmla v29.4s, v9.4s, v5.s[1]\n"
- "add x7, x7, #0x40\n"
"fmla v30.4s, v9.4s, v6.s[1]\n"
"fmla v24.4s, v10.4s, v0.s[2]\n"
"fmla v25.4s, v10.4s, v1.s[2]\n"
@@ -1626,97 +1589,47 @@ void a64_hybrid_fp32_mla_8x4 (
"fmla v28.4s, v11.4s, v4.s[3]\n"
"fmla v29.4s, v11.4s, v5.s[3]\n"
"fmla v30.4s, v11.4s, v6.s[3]\n"
- "bge 145b\n"
- "146:" // Height 7: Multiply loop: Single iteration only
- "sub x15, x15, #0x4\n"
- "ldr q0, [x14, #0x0]\n"
- "ldr q1, [x12, #0x0]\n"
- "ldr q2, [x10, #0x0]\n"
- "ldr q3, [x28, #0x0]\n"
- "ldr q4, [x26, #0x0]\n"
- "ldr q5, [x24, #0x0]\n"
- "ldr q6, [x22, #0x0]\n"
- "ldr q12, [x7, #0x0]\n"
+ "140:" // Height 7: Multiply loop: Main loop skip
+ "cbz x28, 142f\n"
+ "141:" // Height 7: Multiply loop: Odd block loop
+ "ldr s0, [x27], #0x4\n"
+ "sub x28, x28, #0x1\n"
+ "ldr s1, [x26], #0x4\n"
+ "ldr s2, [x25], #0x4\n"
+ "ldr s3, [x24], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
+ "ldr s5, [x22], #0x4\n"
+ "ldr s6, [x21], #0x4\n"
+ "ldr q12, [x12, #0x0]\n"
"fmla v24.4s, v12.4s, v0.s[0]\n"
- "ldr q13, [x7, #0x10]\n"
+ "add x12, x12, #0x10\n"
"fmla v25.4s, v12.4s, v1.s[0]\n"
- "ldr q14, [x7, #0x20]\n"
"fmla v26.4s, v12.4s, v2.s[0]\n"
- "ldr q15, [x7, #0x30]\n"
"fmla v27.4s, v12.4s, v3.s[0]\n"
- "add x14, x14, #0x10\n"
- "prfm pldl1keep, [x14, #0x80]\n"
"fmla v28.4s, v12.4s, v4.s[0]\n"
- "add x12, x12, #0x10\n"
"fmla v29.4s, v12.4s, v5.s[0]\n"
- "prfm pldl1keep, [x12, #0x80]\n"
- "add x10, x10, #0x10\n"
"fmla v30.4s, v12.4s, v6.s[0]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
- "add x28, x28, #0x10\n"
- "fmla v24.4s, v13.4s, v0.s[1]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "add x26, x26, #0x10\n"
- "fmla v25.4s, v13.4s, v1.s[1]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "add x24, x24, #0x10\n"
- "fmla v26.4s, v13.4s, v2.s[1]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "add x22, x22, #0x10\n"
- "fmla v27.4s, v13.4s, v3.s[1]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "add x7, x7, #0x40\n"
- "fmla v28.4s, v13.4s, v4.s[1]\n"
- "fmla v29.4s, v13.4s, v5.s[1]\n"
- "fmla v30.4s, v13.4s, v6.s[1]\n"
- "fmla v24.4s, v14.4s, v0.s[2]\n"
- "fmla v25.4s, v14.4s, v1.s[2]\n"
- "fmla v26.4s, v14.4s, v2.s[2]\n"
- "fmla v27.4s, v14.4s, v3.s[2]\n"
- "fmla v28.4s, v14.4s, v4.s[2]\n"
- "fmla v29.4s, v14.4s, v5.s[2]\n"
- "fmla v30.4s, v14.4s, v6.s[2]\n"
- "fmla v24.4s, v15.4s, v0.s[3]\n"
- "fmla v25.4s, v15.4s, v1.s[3]\n"
- "fmla v26.4s, v15.4s, v2.s[3]\n"
- "fmla v27.4s, v15.4s, v3.s[3]\n"
- "fmla v28.4s, v15.4s, v4.s[3]\n"
- "fmla v29.4s, v15.4s, v5.s[3]\n"
- "fmla v30.4s, v15.4s, v6.s[3]\n"
- "147:" // Height 7: Multiply loop: Main loop skip
- "cbz x15, 149f\n"
- "148:" // Height 7: Multiply loop: Odd block loop
- "ldr s0, [x14], #0x4\n"
- "ldr s1, [x12], #0x4\n"
- "ldr s2, [x10], #0x4\n"
- "ldr s3, [x28], #0x4\n"
- "ldr s4, [x26], #0x4\n"
- "ldr s5, [x24], #0x4\n"
- "ldr s6, [x22], #0x4\n"
- "ldr q16, [x7, #0x0]\n"
- "fmla v24.4s, v16.4s, v0.s[0]\n"
- "sub x15, x15, #0x1\n"
- "fmla v25.4s, v16.4s, v1.s[0]\n"
- "add x7, x7, #0x10\n"
- "fmla v26.4s, v16.4s, v2.s[0]\n"
- "fmla v27.4s, v16.4s, v3.s[0]\n"
- "fmla v28.4s, v16.4s, v4.s[0]\n"
- "fmla v29.4s, v16.4s, v5.s[0]\n"
- "fmla v30.4s, v16.4s, v6.s[0]\n"
- "cbnz x15, 148b\n"
- "149:" // Height 7: Multiply loop: No odd multiplies
+ "cbnz x28, 141b\n"
+ "142:" // Height 7: Multiply loop: No odd multiplies
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x16, x16, #0x1\n"
- "cmp x16, x19\n"
- "bne 142b\n"
- "prfm pstl1keep, [x17, #0x0]\n"
- "prfm pstl1keep, [x13, #0x0]\n"
- "prfm pstl1keep, [x11, #0x0]\n"
- "prfm pstl1keep, [x9, #0x0]\n"
- "prfm pstl1keep, [x27, #0x0]\n"
+ "add x9, x9, #0x1\n"
+ "cmp x9, x19\n"
+ "bne 135b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "prfm pstl1keep, [x10, #0x0]\n"
+ "add x26, x10, x19, LSL #2\n"
+ "prfm pstl1keep, [x26, #0x0]\n"
+ "add x25, x26, x19, LSL #2\n"
"prfm pstl1keep, [x25, #0x0]\n"
+ "add x24, x25, x19, LSL #2\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "add x23, x24, x19, LSL #2\n"
"prfm pstl1keep, [x23, #0x0]\n"
- "tbz %x[flags], #1, 150f\n"
+ "add x22, x23, x19, LSL #2\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "add x21, x22, x19, LSL #2\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "tbz %x[flags], #1, 143f\n"
"add x19, %x[args_ptr], %[offset_min]\n"
"ld1r { v17.4s }, [x19]\n"
"add x19, %x[args_ptr], %[offset_max]\n"
@@ -1735,156 +1648,125 @@ void a64_hybrid_fp32_mla_8x4 (
"fmax v28.4s, v28.4s, v17.4s\n"
"fmax v29.4s, v29.4s, v17.4s\n"
"fmax v30.4s, v30.4s, v17.4s\n"
- "150:" // Height 7: No activation
- "cmp x6, #0x4\n"
- "bge 153f\n"
- "tbz x6, #1, 151f\n"
- "str d24, [x17], #0x8\n"
- "str d25, [x13], #0x8\n"
- "str d26, [x11], #0x8\n"
- "str d27, [x9], #0x8\n"
- "str d28, [x27], #0x8\n"
- "str d29, [x25], #0x8\n"
- "str d30, [x23], #0x8\n"
- "tbz x6, #0, 152f\n"
- "st1 { v24.s }[2], [x17]\n"
- "st1 { v25.s }[2], [x13]\n"
- "st1 { v26.s }[2], [x11]\n"
- "st1 { v27.s }[2], [x9]\n"
- "st1 { v28.s }[2], [x27]\n"
- "st1 { v29.s }[2], [x25]\n"
- "st1 { v30.s }[2], [x23]\n"
- "b 152f\n"
- "151:" // Height 7: Partial direct writeback: partial_1_0
- "str s24, [x17, #0x0]\n"
- "str s25, [x13, #0x0]\n"
- "str s26, [x11, #0x0]\n"
- "str s27, [x9, #0x0]\n"
- "str s28, [x27, #0x0]\n"
- "str s29, [x25, #0x0]\n"
- "str s30, [x23, #0x0]\n"
- "152:" // Height 7: Partial direct writeback: Done
- "b 154f\n"
- "153:" // Height 7: Full writeback
- "str q24, [x17, #0x0]\n"
- "str q25, [x13, #0x0]\n"
- "str q26, [x11, #0x0]\n"
- "str q27, [x9, #0x0]\n"
- "str q28, [x27, #0x0]\n"
- "str q29, [x25, #0x0]\n"
- "str q30, [x23, #0x0]\n"
- "add x17, x17, #0x10\n"
- "add x13, x13, #0x10\n"
- "add x11, x11, #0x10\n"
- "add x9, x9, #0x10\n"
- "add x27, x27, #0x10\n"
- "add x25, x25, #0x10\n"
- "add x23, x23, #0x10\n"
- "154:" // Height 7: Writeback done
- "subs x6, x6, #0x4\n"
- "bgt 135b\n"
- "b 178f\n"
- "155:" // Height 8
- "ldr x6, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x8, %x[bias]\n"
- "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "tbz %x[flags], #2, 156f\n"
- "ldr x17, [%x[output_ptr], #0x0]\n"
- "add x17, x17, x19, LSL #2\n"
- "ldr x13, [%x[output_ptr], #0x8]\n"
- "ldr x11, [%x[output_ptr], #0x10]\n"
- "add x13, x13, x19, LSL #2\n"
- "ldr x9, [%x[output_ptr], #0x18]\n"
- "ldr x27, [%x[output_ptr], #0x20]\n"
- "add x11, x11, x19, LSL #2\n"
- "ldr x25, [%x[output_ptr], #0x28]\n"
- "add x9, x9, x19, LSL #2\n"
- "ldr x23, [%x[output_ptr], #0x30]\n"
- "ldr x21, [%x[output_ptr], #0x38]\n"
- "add x27, x27, x19, LSL #2\n"
- "add x25, x25, x19, LSL #2\n"
- "add %x[output_ptr], %x[output_ptr], #0x40\n"
- "add x23, x23, x19, LSL #2\n"
- "add x21, x21, x19, LSL #2\n"
- "b 157f\n"
- "156:" // Height 8: setup direct output
- "mov x17, %x[output_ptr]\n"
- "add x13, x17, x19, LSL #2\n"
- "add x11, x13, x19, LSL #2\n"
- "add x9, x11, x19, LSL #2\n"
- "add x27, x9, x19, LSL #2\n"
- "add x25, x27, x19, LSL #2\n"
- "add x23, x25, x19, LSL #2\n"
- "add x21, x23, x19, LSL #2\n"
- "add %x[output_ptr], x21, x19, LSL #2\n"
- "157:" // Height 8: Column loop
- "cbz x8, 158f\n"
- "ldr q24, [x8, #0x0]\n"
+ "143:" // Height 7: No activation
+ "cmp x13, #0x4\n"
+ "bge 146f\n"
+ "tbz x13, #1, 144f\n"
+ "str d24, [x10], #0x8\n"
+ "str d25, [x26], #0x8\n"
+ "str d26, [x25], #0x8\n"
+ "str d27, [x24], #0x8\n"
+ "str d28, [x23], #0x8\n"
+ "str d29, [x22], #0x8\n"
+ "str d30, [x21], #0x8\n"
+ "tbz x13, #0, 145f\n"
+ "st1 { v24.s }[2], [x10]\n"
+ "st1 { v25.s }[2], [x26]\n"
+ "st1 { v26.s }[2], [x25]\n"
+ "st1 { v27.s }[2], [x24]\n"
+ "st1 { v28.s }[2], [x23]\n"
+ "st1 { v29.s }[2], [x22]\n"
+ "st1 { v30.s }[2], [x21]\n"
+ "b 145f\n"
+ "144:" // Height 7: Partial direct writeback: partial_1_0
+ "str s24, [x10, #0x0]\n"
+ "str s25, [x26, #0x0]\n"
+ "str s26, [x25, #0x0]\n"
+ "str s27, [x24, #0x0]\n"
+ "str s28, [x23, #0x0]\n"
+ "str s29, [x22, #0x0]\n"
+ "str s30, [x21, #0x0]\n"
+ "145:" // Height 7: Partial direct writeback: Done
+ "b 147f\n"
+ "146:" // Height 7: Full writeback
+ "str q24, [x10, #0x0]\n"
+ "add x10, x10, #0x10\n"
+ "str q25, [x26, #0x0]\n"
+ "str q26, [x25, #0x0]\n"
+ "str q27, [x24, #0x0]\n"
+ "str q28, [x23, #0x0]\n"
+ "str q29, [x22, #0x0]\n"
+ "str q30, [x21, #0x0]\n"
+ "147:" // Height 7: Writeback done
+ "subs x13, x13, #0x4\n"
+ "bgt 128b\n"
+ "b 170f\n"
+ "148:" // Height 8
+ "ldr x13, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x11, %x[bias]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x10, %x[output_ptr]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov x19, #0x20\n"
+ "madd %x[output_ptr], x20, x19, %x[output_ptr]\n"
+ "149:" // Height 8: Column loop
+ "cbz x11, 150f\n"
+ "ldr q24, [x11, #0x0]\n"
"mov v25.16b, v24.16b\n"
- "add x8, x8, #0x10\n"
+ "add x11, x11, #0x10\n"
"mov v26.16b, v24.16b\n"
"mov v27.16b, v24.16b\n"
"mov v28.16b, v24.16b\n"
"mov v29.16b, v24.16b\n"
"mov v30.16b, v24.16b\n"
"mov v31.16b, v24.16b\n"
- "b 163f\n"
- "158:" // Height 8: no bias
- "tbz %x[flags], #0, 162f\n"
- "cmp x6, #0x4\n"
- "bge 161f\n"
- "tbz x6, #1, 159f\n"
- "ldr d24, [x17], #0x8\n"
- "ldr d25, [x13], #0x8\n"
- "ldr d26, [x11], #0x8\n"
- "ldr d27, [x9], #0x8\n"
- "ldr d28, [x27], #0x8\n"
- "ldr d29, [x25], #0x8\n"
- "ldr d30, [x23], #0x8\n"
- "ldr d31, [x21], #0x8\n"
+ "b 155f\n"
+ "150:" // Height 8: no bias
+ "tbz %x[flags], #0, 154f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x13, #0x4\n"
+ "add x26, x10, x19, LSL #2\n"
+ "add x25, x26, x19, LSL #2\n"
+ "add x24, x25, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "add x21, x22, x19, LSL #2\n"
+ "add x20, x21, x19, LSL #2\n"
+ "bge 153f\n"
+ "tbz x13, #1, 151f\n"
+ "ldr d24, [x10], #0x8\n"
+ "ldr d25, [x26], #0x8\n"
"mov x19, #0x8\n"
- "tbz x6, #0, 160f\n"
- "ld1 { v24.s }[2], [x17]\n"
- "ld1 { v25.s }[2], [x13]\n"
- "ld1 { v26.s }[2], [x11]\n"
- "ld1 { v27.s }[2], [x9]\n"
- "ld1 { v28.s }[2], [x27]\n"
- "ld1 { v29.s }[2], [x25]\n"
- "ld1 { v30.s }[2], [x23]\n"
- "ld1 { v31.s }[2], [x21]\n"
- "b 160f\n"
- "159:" // Height 8: Partial accumulate: partial_1_0
+ "ldr d26, [x25], #0x8\n"
+ "ldr d27, [x24], #0x8\n"
+ "ldr d28, [x23], #0x8\n"
+ "ldr d29, [x22], #0x8\n"
+ "ldr d30, [x21], #0x8\n"
+ "ldr d31, [x20], #0x8\n"
+ "tbz x13, #0, 152f\n"
+ "ld1 { v24.s }[2], [x10]\n"
+ "ld1 { v25.s }[2], [x26]\n"
+ "ld1 { v26.s }[2], [x25]\n"
+ "ld1 { v27.s }[2], [x24]\n"
+ "ld1 { v28.s }[2], [x23]\n"
+ "ld1 { v29.s }[2], [x22]\n"
+ "ld1 { v30.s }[2], [x21]\n"
+ "ld1 { v31.s }[2], [x20]\n"
+ "b 152f\n"
+ "151:" // Height 8: Partial accumulate: partial_1_0
+ "ldr s24, [x10, #0x0]\n"
"mov x19, #0x0\n"
- "ldr s24, [x17, #0x0]\n"
- "ldr s25, [x13, #0x0]\n"
- "ldr s26, [x11, #0x0]\n"
- "ldr s27, [x9, #0x0]\n"
- "ldr s28, [x27, #0x0]\n"
- "ldr s29, [x25, #0x0]\n"
- "ldr s30, [x23, #0x0]\n"
- "ldr s31, [x21, #0x0]\n"
- "160:" // Height 8: Partial accumulate: Done
- "sub x17, x17, x19\n"
- "sub x13, x13, x19\n"
- "sub x11, x11, x19\n"
- "sub x9, x9, x19\n"
- "sub x27, x27, x19\n"
- "sub x25, x25, x19\n"
- "sub x23, x23, x19\n"
- "sub x21, x21, x19\n"
- "b 163f\n"
- "161:" // Height 8: full accumulate
- "ldr q24, [x17, #0x0]\n"
- "ldr q25, [x13, #0x0]\n"
- "ldr q26, [x11, #0x0]\n"
- "ldr q27, [x9, #0x0]\n"
- "ldr q28, [x27, #0x0]\n"
- "ldr q29, [x25, #0x0]\n"
- "ldr q30, [x23, #0x0]\n"
- "ldr q31, [x21, #0x0]\n"
- "b 163f\n"
- "162:" // Height 8: no accumulate
+ "ldr s25, [x26, #0x0]\n"
+ "ldr s26, [x25, #0x0]\n"
+ "ldr s27, [x24, #0x0]\n"
+ "ldr s28, [x23, #0x0]\n"
+ "ldr s29, [x22, #0x0]\n"
+ "ldr s30, [x21, #0x0]\n"
+ "ldr s31, [x20, #0x0]\n"
+ "152:" // Height 8: Partial accumulate: Done
+ "sub x10, x10, x19\n"
+ "b 155f\n"
+ "153:" // Height 8: full accumulate
+ "ldr q24, [x10, #0x0]\n"
+ "ldr q25, [x26, #0x0]\n"
+ "ldr q26, [x25, #0x0]\n"
+ "ldr q27, [x24, #0x0]\n"
+ "ldr q28, [x23, #0x0]\n"
+ "ldr q29, [x22, #0x0]\n"
+ "ldr q30, [x21, #0x0]\n"
+ "ldr q31, [x20, #0x0]\n"
+ "b 155f\n"
+ "154:" // Height 8: no accumulate
"movi v24.16b, #0x0\n"
"movi v25.16b, #0x0\n"
"movi v26.16b, #0x0\n"
@@ -1893,94 +1775,158 @@ void a64_hybrid_fp32_mla_8x4 (
"movi v29.16b, #0x0\n"
"movi v30.16b, #0x0\n"
"movi v31.16b, #0x0\n"
- "163:" // Height 8: setup done
- "mov x16, #0x0\n"
- "164:" // Height 8: String loop
+ "155:" // Height 8: setup done
+ "mov x9, #0x0\n"
+ "156:" // Height 8: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w15, [x20, x16, LSL #0x2]\n"
- "tbz %x[flags], #3, 165f\n"
- "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n"
+ "ldr w28, [x20, x9, LSL #0x2]\n"
+ "tbz %x[flags], #3, 157f\n"
+ "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x14, [x20, #0x0]\n"
- "ldr x12, [x20, #0x8]\n"
- "ldr x10, [x20, #0x10]\n"
- "ldr x28, [x20, #0x18]\n"
- "ldr x26, [x20, #0x20]\n"
- "ldr x24, [x20, #0x28]\n"
- "ldr x22, [x20, #0x30]\n"
+ "ldr x27, [x20, #0x0]\n"
+ "ldr x26, [x20, #0x8]\n"
+ "ldr x25, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "ldr x23, [x20, #0x20]\n"
+ "ldr x22, [x20, #0x28]\n"
+ "ldr x21, [x20, #0x30]\n"
"ldr x20, [x20, #0x38]\n"
- "cbnz x16, 166f\n"
+ "cbnz x9, 158f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x14, x14, x19, LSL #2\n"
- "add x12, x12, x19, LSL #2\n"
- "add x10, x10, x19, LSL #2\n"
- "add x28, x28, x19, LSL #2\n"
+ "add x27, x27, x19, LSL #2\n"
"add x26, x26, x19, LSL #2\n"
+ "add x25, x25, x19, LSL #2\n"
"add x24, x24, x19, LSL #2\n"
+ "add x23, x23, x19, LSL #2\n"
"add x22, x22, x19, LSL #2\n"
+ "add x21, x21, x19, LSL #2\n"
"add x20, x20, x19, LSL #2\n"
- "b 166f\n"
- "165:" // Height 8: setup direct input
- "mov x14, %x[input_ptr]\n"
- "add x12, x14, x19, LSL #2\n"
- "add x10, x12, x19, LSL #2\n"
- "add x28, x10, x19, LSL #2\n"
- "add x26, x28, x19, LSL #2\n"
- "add x24, x26, x19, LSL #2\n"
- "add x22, x24, x19, LSL #2\n"
- "add x20, x22, x19, LSL #2\n"
- "166:" // Height 8: input setup done
- "cmp x15, #0x4\n"
- "blt 169f\n"
- "cmp x15, #0x8\n"
- "blt 168f\n"
- "167:" // Height 8: Multiply loop: Main loop head
- "ldr q0, [x14, #0x0]\n"
- "ldr q1, [x12, #0x0]\n"
- "ldr q2, [x10, #0x0]\n"
- "ldr q3, [x28, #0x0]\n"
- "ldr q4, [x26, #0x0]\n"
- "ldr q5, [x24, #0x0]\n"
- "ldr q6, [x22, #0x0]\n"
+ "b 158f\n"
+ "157:" // Height 8: setup direct input
+ "mov x27, %x[input_ptr]\n"
+ "add x26, x27, x19, LSL #2\n"
+ "add x25, x26, x19, LSL #2\n"
+ "add x24, x25, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "add x21, x22, x19, LSL #2\n"
+ "add x20, x21, x19, LSL #2\n"
+ "158:" // Height 8: input setup done
+ "cmp x28, #0x4\n"
+ "blt 161f\n"
+ "ldr q0, [x27, #0x0]\n"
+ "ldr q1, [x26, #0x0]\n"
+ "cmp x28, #0x8\n"
+ "ldr q2, [x25, #0x0]\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q4, [x23, #0x0]\n"
+ "ldr q5, [x22, #0x0]\n"
+ "ldr q6, [x21, #0x0]\n"
"ldr q7, [x20, #0x0]\n"
- "ldr q8, [x7, #0x0]\n"
+ "ldr q8, [x12, #0x0]\n"
+ "blt 160f\n"
+ "159:" // Height 8: Multiply loop: Main loop head
"fmla v24.4s, v8.4s, v0.s[0]\n"
- "ldr q9, [x7, #0x10]\n"
+ "ldr q9, [x12, #0x10]\n"
+ "add x27, x27, #0x10\n"
"fmla v25.4s, v8.4s, v1.s[0]\n"
- "ldr q10, [x7, #0x20]\n"
+ "ldr q10, [x12, #0x20]\n"
+ "add x26, x26, #0x10\n"
"fmla v26.4s, v8.4s, v2.s[0]\n"
- "ldr q11, [x7, #0x30]\n"
+ "ldr q11, [x12, #0x30]\n"
+ "add x25, x25, #0x10\n"
"fmla v27.4s, v8.4s, v3.s[0]\n"
- "add x14, x14, #0x10\n"
- "prfm pldl1keep, [x14, #0x80]\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
+ "add x24, x24, #0x10\n"
"fmla v28.4s, v8.4s, v4.s[0]\n"
- "add x12, x12, #0x10\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x23, x23, #0x10\n"
"fmla v29.4s, v8.4s, v5.s[0]\n"
- "prfm pldl1keep, [x12, #0x80]\n"
- "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x22, x22, #0x10\n"
"fmla v30.4s, v8.4s, v6.s[0]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
- "add x28, x28, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "add x21, x21, #0x10\n"
"fmla v31.4s, v8.4s, v7.s[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "add x26, x26, #0x10\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "add x20, x20, #0x10\n"
"fmla v24.4s, v9.4s, v0.s[1]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "sub x28, x28, #0x4\n"
+ "fmla v25.4s, v9.4s, v1.s[1]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
+ "cmp x28, #0x8\n"
+ "fmla v26.4s, v9.4s, v2.s[1]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "add x12, x12, #0x40\n"
+ "fmla v27.4s, v9.4s, v3.s[1]\n"
+ "ldr q8, [x12, #0x0]\n"
+ "fmla v28.4s, v9.4s, v4.s[1]\n"
+ "fmla v29.4s, v9.4s, v5.s[1]\n"
+ "fmla v30.4s, v9.4s, v6.s[1]\n"
+ "fmla v31.4s, v9.4s, v7.s[1]\n"
+ "fmla v24.4s, v10.4s, v0.s[2]\n"
+ "fmla v25.4s, v10.4s, v1.s[2]\n"
+ "fmla v26.4s, v10.4s, v2.s[2]\n"
+ "fmla v27.4s, v10.4s, v3.s[2]\n"
+ "fmla v28.4s, v10.4s, v4.s[2]\n"
+ "fmla v29.4s, v10.4s, v5.s[2]\n"
+ "fmla v30.4s, v10.4s, v6.s[2]\n"
+ "fmla v31.4s, v10.4s, v7.s[2]\n"
+ "fmla v24.4s, v11.4s, v0.s[3]\n"
+ "ldr q0, [x27, #0x0]\n"
+ "fmla v25.4s, v11.4s, v1.s[3]\n"
+ "ldr q1, [x26, #0x0]\n"
+ "fmla v26.4s, v11.4s, v2.s[3]\n"
+ "ldr q2, [x25, #0x0]\n"
+ "fmla v27.4s, v11.4s, v3.s[3]\n"
+ "ldr q3, [x24, #0x0]\n"
+ "fmla v28.4s, v11.4s, v4.s[3]\n"
+ "ldr q4, [x23, #0x0]\n"
+ "fmla v29.4s, v11.4s, v5.s[3]\n"
+ "ldr q5, [x22, #0x0]\n"
+ "fmla v30.4s, v11.4s, v6.s[3]\n"
+ "ldr q6, [x21, #0x0]\n"
+ "fmla v31.4s, v11.4s, v7.s[3]\n"
+ "ldr q7, [x20, #0x0]\n"
+ "bge 159b\n"
+ "160:" // Height 8: Multiply loop: Single iteration only
+ "fmla v24.4s, v8.4s, v0.s[0]\n"
+ "ldr q9, [x12, #0x10]\n"
+ "sub x28, x28, #0x4\n"
+ "fmla v25.4s, v8.4s, v1.s[0]\n"
+ "ldr q10, [x12, #0x20]\n"
+ "add x27, x27, #0x10\n"
+ "fmla v26.4s, v8.4s, v2.s[0]\n"
+ "ldr q11, [x12, #0x30]\n"
+ "add x26, x26, #0x10\n"
+ "fmla v27.4s, v8.4s, v3.s[0]\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
+ "add x25, x25, #0x10\n"
+ "fmla v28.4s, v8.4s, v4.s[0]\n"
"prfm pldl1keep, [x26, #0x80]\n"
"add x24, x24, #0x10\n"
- "fmla v25.4s, v9.4s, v1.s[1]\n"
+ "fmla v29.4s, v8.4s, v5.s[0]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x23, x23, #0x10\n"
+ "fmla v30.4s, v8.4s, v6.s[0]\n"
"prfm pldl1keep, [x24, #0x80]\n"
"add x22, x22, #0x10\n"
- "fmla v26.4s, v9.4s, v2.s[1]\n"
+ "fmla v31.4s, v8.4s, v7.s[0]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "add x21, x21, #0x10\n"
+ "fmla v24.4s, v9.4s, v0.s[1]\n"
"prfm pldl1keep, [x22, #0x80]\n"
"add x20, x20, #0x10\n"
- "fmla v27.4s, v9.4s, v3.s[1]\n"
+ "fmla v25.4s, v9.4s, v1.s[1]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
+ "add x12, x12, #0x40\n"
+ "fmla v26.4s, v9.4s, v2.s[1]\n"
"prfm pldl1keep, [x20, #0x80]\n"
- "sub x15, x15, #0x4\n"
+ "fmla v27.4s, v9.4s, v3.s[1]\n"
"fmla v28.4s, v9.4s, v4.s[1]\n"
- "cmp x15, #0x8\n"
"fmla v29.4s, v9.4s, v5.s[1]\n"
- "add x7, x7, #0x40\n"
"fmla v30.4s, v9.4s, v6.s[1]\n"
"fmla v31.4s, v9.4s, v7.s[1]\n"
"fmla v24.4s, v10.4s, v0.s[2]\n"
@@ -1999,107 +1945,51 @@ void a64_hybrid_fp32_mla_8x4 (
"fmla v29.4s, v11.4s, v5.s[3]\n"
"fmla v30.4s, v11.4s, v6.s[3]\n"
"fmla v31.4s, v11.4s, v7.s[3]\n"
- "bge 167b\n"
- "168:" // Height 8: Multiply loop: Single iteration only
- "sub x15, x15, #0x4\n"
- "ldr q0, [x14, #0x0]\n"
- "ldr q1, [x12, #0x0]\n"
- "ldr q2, [x10, #0x0]\n"
- "ldr q3, [x28, #0x0]\n"
- "ldr q4, [x26, #0x0]\n"
- "ldr q5, [x24, #0x0]\n"
- "ldr q6, [x22, #0x0]\n"
- "ldr q7, [x20, #0x0]\n"
- "ldr q12, [x7, #0x0]\n"
+ "161:" // Height 8: Multiply loop: Main loop skip
+ "cbz x28, 163f\n"
+ "162:" // Height 8: Multiply loop: Odd block loop
+ "ldr s0, [x27], #0x4\n"
+ "sub x28, x28, #0x1\n"
+ "ldr s1, [x26], #0x4\n"
+ "ldr s2, [x25], #0x4\n"
+ "ldr s3, [x24], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
+ "ldr s5, [x22], #0x4\n"
+ "ldr s6, [x21], #0x4\n"
+ "ldr s7, [x20], #0x4\n"
+ "ldr q12, [x12, #0x0]\n"
"fmla v24.4s, v12.4s, v0.s[0]\n"
- "ldr q13, [x7, #0x10]\n"
+ "add x12, x12, #0x10\n"
"fmla v25.4s, v12.4s, v1.s[0]\n"
- "ldr q14, [x7, #0x20]\n"
"fmla v26.4s, v12.4s, v2.s[0]\n"
- "ldr q15, [x7, #0x30]\n"
"fmla v27.4s, v12.4s, v3.s[0]\n"
- "add x14, x14, #0x10\n"
- "prfm pldl1keep, [x14, #0x80]\n"
"fmla v28.4s, v12.4s, v4.s[0]\n"
- "add x12, x12, #0x10\n"
"fmla v29.4s, v12.4s, v5.s[0]\n"
- "prfm pldl1keep, [x12, #0x80]\n"
- "add x10, x10, #0x10\n"
"fmla v30.4s, v12.4s, v6.s[0]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
- "add x28, x28, #0x10\n"
"fmla v31.4s, v12.4s, v7.s[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "add x26, x26, #0x10\n"
- "fmla v24.4s, v13.4s, v0.s[1]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "add x24, x24, #0x10\n"
- "fmla v25.4s, v13.4s, v1.s[1]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "add x22, x22, #0x10\n"
- "fmla v26.4s, v13.4s, v2.s[1]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "add x20, x20, #0x10\n"
- "fmla v27.4s, v13.4s, v3.s[1]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
- "add x7, x7, #0x40\n"
- "fmla v28.4s, v13.4s, v4.s[1]\n"
- "fmla v29.4s, v13.4s, v5.s[1]\n"
- "fmla v30.4s, v13.4s, v6.s[1]\n"
- "fmla v31.4s, v13.4s, v7.s[1]\n"
- "fmla v24.4s, v14.4s, v0.s[2]\n"
- "fmla v25.4s, v14.4s, v1.s[2]\n"
- "fmla v26.4s, v14.4s, v2.s[2]\n"
- "fmla v27.4s, v14.4s, v3.s[2]\n"
- "fmla v28.4s, v14.4s, v4.s[2]\n"
- "fmla v29.4s, v14.4s, v5.s[2]\n"
- "fmla v30.4s, v14.4s, v6.s[2]\n"
- "fmla v31.4s, v14.4s, v7.s[2]\n"
- "fmla v24.4s, v15.4s, v0.s[3]\n"
- "fmla v25.4s, v15.4s, v1.s[3]\n"
- "fmla v26.4s, v15.4s, v2.s[3]\n"
- "fmla v27.4s, v15.4s, v3.s[3]\n"
- "fmla v28.4s, v15.4s, v4.s[3]\n"
- "fmla v29.4s, v15.4s, v5.s[3]\n"
- "fmla v30.4s, v15.4s, v6.s[3]\n"
- "fmla v31.4s, v15.4s, v7.s[3]\n"
- "169:" // Height 8: Multiply loop: Main loop skip
- "cbz x15, 171f\n"
- "170:" // Height 8: Multiply loop: Odd block loop
- "ldr s0, [x14], #0x4\n"
- "ldr s1, [x12], #0x4\n"
- "ldr s2, [x10], #0x4\n"
- "ldr s3, [x28], #0x4\n"
- "ldr s4, [x26], #0x4\n"
- "ldr s5, [x24], #0x4\n"
- "ldr s6, [x22], #0x4\n"
- "ldr s7, [x20], #0x4\n"
- "ldr q16, [x7, #0x0]\n"
- "fmla v24.4s, v16.4s, v0.s[0]\n"
- "sub x15, x15, #0x1\n"
- "fmla v25.4s, v16.4s, v1.s[0]\n"
- "add x7, x7, #0x10\n"
- "fmla v26.4s, v16.4s, v2.s[0]\n"
- "fmla v27.4s, v16.4s, v3.s[0]\n"
- "fmla v28.4s, v16.4s, v4.s[0]\n"
- "fmla v29.4s, v16.4s, v5.s[0]\n"
- "fmla v30.4s, v16.4s, v6.s[0]\n"
- "fmla v31.4s, v16.4s, v7.s[0]\n"
- "cbnz x15, 170b\n"
- "171:" // Height 8: Multiply loop: No odd multiplies
+ "cbnz x28, 162b\n"
+ "163:" // Height 8: Multiply loop: No odd multiplies
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x16, x16, #0x1\n"
- "cmp x16, x19\n"
- "bne 164b\n"
- "prfm pstl1keep, [x17, #0x0]\n"
- "prfm pstl1keep, [x13, #0x0]\n"
- "prfm pstl1keep, [x11, #0x0]\n"
- "prfm pstl1keep, [x9, #0x0]\n"
- "prfm pstl1keep, [x27, #0x0]\n"
+ "add x9, x9, #0x1\n"
+ "cmp x9, x19\n"
+ "bne 156b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "prfm pstl1keep, [x10, #0x0]\n"
+ "add x26, x10, x19, LSL #2\n"
+ "prfm pstl1keep, [x26, #0x0]\n"
+ "add x25, x26, x19, LSL #2\n"
"prfm pstl1keep, [x25, #0x0]\n"
+ "add x24, x25, x19, LSL #2\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "add x23, x24, x19, LSL #2\n"
"prfm pstl1keep, [x23, #0x0]\n"
+ "add x22, x23, x19, LSL #2\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "add x21, x22, x19, LSL #2\n"
"prfm pstl1keep, [x21, #0x0]\n"
- "tbz %x[flags], #1, 172f\n"
+ "add x20, x21, x19, LSL #2\n"
+ "prfm pstl1keep, [x20, #0x0]\n"
+ "tbz %x[flags], #1, 164f\n"
"add x19, %x[args_ptr], %[offset_min]\n"
"ld1r { v17.4s }, [x19]\n"
"add x19, %x[args_ptr], %[offset_max]\n"
@@ -2120,75 +2010,68 @@ void a64_hybrid_fp32_mla_8x4 (
"fmax v30.4s, v30.4s, v17.4s\n"
"fmin v31.4s, v31.4s, v16.4s\n"
"fmax v31.4s, v31.4s, v17.4s\n"
- "172:" // Height 8: No activation
- "cmp x6, #0x4\n"
- "bge 175f\n"
- "tbz x6, #1, 173f\n"
- "str d24, [x17], #0x8\n"
- "str d25, [x13], #0x8\n"
- "str d26, [x11], #0x8\n"
- "str d27, [x9], #0x8\n"
- "str d28, [x27], #0x8\n"
- "str d29, [x25], #0x8\n"
- "str d30, [x23], #0x8\n"
- "str d31, [x21], #0x8\n"
- "tbz x6, #0, 174f\n"
- "st1 { v24.s }[2], [x17]\n"
- "st1 { v25.s }[2], [x13]\n"
- "st1 { v26.s }[2], [x11]\n"
- "st1 { v27.s }[2], [x9]\n"
- "st1 { v28.s }[2], [x27]\n"
- "st1 { v29.s }[2], [x25]\n"
- "st1 { v30.s }[2], [x23]\n"
- "st1 { v31.s }[2], [x21]\n"
- "b 174f\n"
- "173:" // Height 8: Partial direct writeback: partial_1_0
- "str s24, [x17, #0x0]\n"
- "str s25, [x13, #0x0]\n"
- "str s26, [x11, #0x0]\n"
- "str s27, [x9, #0x0]\n"
- "str s28, [x27, #0x0]\n"
- "str s29, [x25, #0x0]\n"
- "str s30, [x23, #0x0]\n"
- "str s31, [x21, #0x0]\n"
- "174:" // Height 8: Partial direct writeback: Done
- "b 176f\n"
- "175:" // Height 8: Full writeback
- "str q24, [x17, #0x0]\n"
- "str q25, [x13, #0x0]\n"
- "str q26, [x11, #0x0]\n"
- "str q27, [x9, #0x0]\n"
- "str q28, [x27, #0x0]\n"
- "str q29, [x25, #0x0]\n"
- "str q30, [x23, #0x0]\n"
- "str q31, [x21, #0x0]\n"
- "add x17, x17, #0x10\n"
- "add x13, x13, #0x10\n"
- "add x11, x11, #0x10\n"
- "add x9, x9, #0x10\n"
- "add x27, x27, #0x10\n"
- "add x25, x25, #0x10\n"
- "add x23, x23, #0x10\n"
- "add x21, x21, #0x10\n"
- "176:" // Height 8: Writeback done
- "subs x6, x6, #0x4\n"
- "bgt 157b\n"
+ "164:" // Height 8: No activation
+ "cmp x13, #0x4\n"
+ "bge 167f\n"
+ "tbz x13, #1, 165f\n"
+ "str d24, [x10], #0x8\n"
+ "str d25, [x26], #0x8\n"
+ "str d26, [x25], #0x8\n"
+ "str d27, [x24], #0x8\n"
+ "str d28, [x23], #0x8\n"
+ "str d29, [x22], #0x8\n"
+ "str d30, [x21], #0x8\n"
+ "str d31, [x20], #0x8\n"
+ "tbz x13, #0, 166f\n"
+ "st1 { v24.s }[2], [x10]\n"
+ "st1 { v25.s }[2], [x26]\n"
+ "st1 { v26.s }[2], [x25]\n"
+ "st1 { v27.s }[2], [x24]\n"
+ "st1 { v28.s }[2], [x23]\n"
+ "st1 { v29.s }[2], [x22]\n"
+ "st1 { v30.s }[2], [x21]\n"
+ "st1 { v31.s }[2], [x20]\n"
+ "b 166f\n"
+ "165:" // Height 8: Partial direct writeback: partial_1_0
+ "str s24, [x10, #0x0]\n"
+ "str s25, [x26, #0x0]\n"
+ "str s26, [x25, #0x0]\n"
+ "str s27, [x24, #0x0]\n"
+ "str s28, [x23, #0x0]\n"
+ "str s29, [x22, #0x0]\n"
+ "str s30, [x21, #0x0]\n"
+ "str s31, [x20, #0x0]\n"
+ "166:" // Height 8: Partial direct writeback: Done
+ "b 168f\n"
+ "167:" // Height 8: Full writeback
+ "str q24, [x10, #0x0]\n"
+ "add x10, x10, #0x10\n"
+ "str q25, [x26, #0x0]\n"
+ "str q26, [x25, #0x0]\n"
+ "str q27, [x24, #0x0]\n"
+ "str q28, [x23, #0x0]\n"
+ "str q29, [x22, #0x0]\n"
+ "str q30, [x21, #0x0]\n"
+ "str q31, [x20, #0x0]\n"
+ "168:" // Height 8: Writeback done
+ "subs x13, x13, #0x4\n"
+ "bgt 149b\n"
"subs %x[M], %x[M], #0x8\n"
- "beq 178f\n"
+ "beq 170f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "tbz %x[flags], #3, 177f\n"
+ "tbz %x[flags], #3, 169f\n"
"add x20, x20, #0x8\n"
"str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
"b 1b\n"
- "177:" // Update direct input
+ "169:" // Update direct input
"mov x19, #0x20\n"
"madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
"b 1b\n"
- "178:" // Exit
+ "170:" // Exit
: [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v16", "v17", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16.hpp
index 5f14072f0a..caef6396be 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16.hpp
@@ -37,9 +37,9 @@
namespace arm_gemm
{
-
// Actual kernel implementations
void a64_hybrid_s8qa_dot_4x16( ARGLIST );
+void a64_hybrid_s8qa_dot_4x16_a55( ARGLIST );
class cls_a64_hybrid_s8qa_dot_4x16
{
@@ -72,7 +72,8 @@ public:
StdTransformsFixed<operand_type, result_type, 4, 16, 4> transforms = {};
- static PerformanceParameters get_performance_parameters(const CPUInfo *ci) {
+ static PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
switch (ci->get_cpu_model()) {
case CPUModel::A55r1:
return { 7.5301 };
@@ -83,9 +84,15 @@ public:
// Default to the generic kernel
kern_type kernel=a64_hybrid_s8qa_dot_4x16;
-
- cls_a64_hybrid_s8qa_dot_4x16(const CPUInfo *)
+ cls_a64_hybrid_s8qa_dot_4x16(const CPUInfo *ci)
{
+ switch(ci->get_cpu_model()) {
+ default:
+ break;
+ case CPUModel::A55r1:
+ kernel=a64_hybrid_s8qa_dot_4x16_a55;
+ break;
+ }
}
};
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/a55.cpp
new file mode 100644
index 0000000000..fb85506ee8
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/a55.cpp
@@ -0,0 +1,2148 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+#include <cstdint>
+
+namespace arm_gemm {
+
+void a64_hybrid_s8qa_dot_4x16_a55 (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<int8_t> A_arg,
+ size_t M, size_t N, const int8_t *B_ptr, IndirectOutputArg<int8_t> output_arg,
+ const Requantize32 *qp, const int32_t *col_bias, unsigned int
+)
+{
+ struct KernelArgs {
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const int8_t *B_ptr = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ if (qp->c_offset > qp->minval) {
+ flags |= 0x20;
+ }
+ __asm__ __volatile__(
+
+ "1:" // Row loop
+ "cmp %x[M], #0x4\n"
+ "bge 91f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 61f\n"
+ "beq 31f\n"
+ "movi v11.4s, #0x0\n"
+ "movi v15.16b, #0x1\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[col_bias]\n"
+ "bic %x[flags], %x[flags], #0x80000000\n"
+ "mov x28, %x[output_ptr]\n"
+ "2:" // Height 1: Column loop
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "3:" // Height 1: setup done
+ "mov x27, #0x0\n"
+ "4:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 5f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x25, [x20, #0x0]\n"
+ "cbnz x27, 6f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19\n"
+ "b 6f\n"
+ "5:" // Height 1: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "6:" // Height 1: input setup done
+ "cmp x26, #0x10\n"
+ "blt 11f\n"
+ "ldr q0, [x25, #0x0]\n"
+ "ldr q4, [x10, #0x0]\n"
+ "cmp x26, #0x20\n"
+ "blt 9f\n"
+ "7:" // Height 1: Multiply loop: Main loop head
+ ".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n"
+ "ldr d5, [x10, #0x10]\n"
+ "ldr x24, [x10, #0x18]\n"
+ "add x25, x25, #0x10\n"
+ "ldr d6, [x10, #0x20]\n"
+ "ldr x23, [x10, #0x28]\n"
+ "mov v5.d[1], x24\n"
+ "ldr d7, [x10, #0x30]\n"
+ "ldr x19, [x10, #0x38]\n"
+ "mov v6.d[1], x23\n"
+ ".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n"
+ "ldr d8, [x10, #0x40]\n"
+ "mov v7.d[1], x19\n"
+ ".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n"
+ "ldr x23, [x10, #0x48]\n"
+ ".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n"
+ "ldr d9, [x10, #0x50]\n"
+ "ldr x19, [x10, #0x58]\n"
+ "mov v8.d[1], x23\n"
+ "ldr d10, [x10, #0x60]\n"
+ "ldr x23, [x10, #0x68]\n"
+ "mov v9.d[1], x19\n"
+ ".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n"
+ "ldr d4, [x10, #0x70]\n"
+ "mov v10.d[1], x23\n"
+ ".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n"
+ "ldr x19, [x10, #0x78]\n"
+ ".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n"
+ "ldr d5, [x10, #0x80]\n"
+ "ldr x24, [x10, #0x88]\n"
+ "mov v4.d[1], x19\n"
+ "ldr d6, [x10, #0x90]\n"
+ "ldr x23, [x10, #0x98]\n"
+ "mov v5.d[1], x24\n"
+ ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n"
+ "ldr d7, [x10, #0xa0]\n"
+ "mov v6.d[1], x23\n"
+ ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n"
+ "ldr x19, [x10, #0xa8]\n"
+ ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n"
+ "ldr d8, [x10, #0xb0]\n"
+ "ldr x23, [x10, #0xb8]\n"
+ "mov v7.d[1], x19\n"
+ "ldr d9, [x10, #0xc0]\n"
+ "ldr x19, [x10, #0xc8]\n"
+ "mov v8.d[1], x23\n"
+ ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n"
+ "ldr d10, [x10, #0xd0]\n"
+ "mov v9.d[1], x19\n"
+ ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n"
+ "ldr x23, [x10, #0xd8]\n"
+ ".inst 0x4fa0e930 // sdot v16.4s, v9.16b, v0.4b[3]\n"
+ "ldr d4, [x10, #0xe0]\n"
+ "ldr x19, [x10, #0xe8]\n"
+ "mov v10.d[1], x23\n"
+ "ldr d5, [x10, #0xf0]\n"
+ "ldr x24, [x10, #0xf8]\n"
+ "add x10, x10, #0x100\n"
+ "mov v4.d[1], x19\n"
+ ".inst 0x4fa0e951 // sdot v17.4s, v10.16b, v0.4b[3]\n"
+ "mov v5.d[1], x24\n"
+ ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n"
+ ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n"
+ "tbnz %x[flags], #31, 8f\n"
+ ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
+ "8:" // Height 1: Multiply loop: unique 1: skip row sum
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "sub x26, x26, #0x10\n"
+ "ldr q0, [x25, #0x0]\n"
+ "cmp x26, #0x20\n"
+ "ldr q4, [x10, #0x0]\n"
+ "bge 7b\n"
+ "9:" // Height 1: Multiply loop: Single iteration only
+ ".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n"
+ "ldr q5, [x10, #0x10]\n"
+ "ldr q6, [x10, #0x20]\n"
+ "sub x26, x26, #0x10\n"
+ "ldr q7, [x10, #0x30]\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n"
+ "ldr q8, [x10, #0x40]\n"
+ ".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n"
+ "ldr q9, [x10, #0x50]\n"
+ ".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n"
+ "ldr q10, [x10, #0x60]\n"
+ ".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n"
+ "ldr q4, [x10, #0x70]\n"
+ "ldr q5, [x10, #0x80]\n"
+ ".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n"
+ "ldr q6, [x10, #0x90]\n"
+ ".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n"
+ "ldr q7, [x10, #0xa0]\n"
+ ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n"
+ "ldr q8, [x10, #0xb0]\n"
+ ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n"
+ "ldr q9, [x10, #0xc0]\n"
+ "ldr q10, [x10, #0xd0]\n"
+ ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n"
+ "ldr q4, [x10, #0xe0]\n"
+ ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n"
+ "ldr q5, [x10, #0xf0]\n"
+ ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n"
+ "add x10, x10, #0x100\n"
+ ".inst 0x4fa0e930 // sdot v16.4s, v9.16b, v0.4b[3]\n"
+ ".inst 0x4fa0e951 // sdot v17.4s, v10.16b, v0.4b[3]\n"
+ ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n"
+ ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n"
+ "tbnz %x[flags], #31, 10f\n"
+ ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
+ "10:" // Height 1: Multiply loop: unique 2: skip row sum
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "11:" // Height 1: Multiply loop: Main loop skip
+ "cbz x26, 18f\n"
+ "cmp x26, #0x4\n"
+ "blt 14f\n"
+ "12:" // Height 1: Multiply loop: Odd block loop
+ "ldr s0, [x25], #0x4\n"
+ "tbnz %x[flags], #31, 13f\n"
+ ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
+ "13:" // Height 1: Multiply loop: unique 3: skip row sum
+ "ldr q6, [x10, #0x0]\n"
+ "sub x26, x26, #0x4\n"
+ "ldr q7, [x10, #0x10]\n"
+ "cmp x26, #0x4\n"
+ "ldr q8, [x10, #0x20]\n"
+ ".inst 0x4f80e0d0 // sdot v16.4s, v6.16b, v0.4b[0]\n"
+ "ldr q9, [x10, #0x30]\n"
+ ".inst 0x4f80e0f1 // sdot v17.4s, v7.16b, v0.4b[0]\n"
+ "add x10, x10, #0x40\n"
+ ".inst 0x4f80e112 // sdot v18.4s, v8.16b, v0.4b[0]\n"
+ ".inst 0x4f80e133 // sdot v19.4s, v9.16b, v0.4b[0]\n"
+ "bge 12b\n"
+ "cbz x26, 18f\n"
+ "14:" // Height 1: Multiply loop: Skip odd blocks
+ "tbz x26, #1, 15f\n"
+ "ldr h0, [x25], #0x2\n"
+ "tbz x26, #0, 16f\n"
+ "ld1 { v0.b }[2], [x25]\n"
+ "b 16f\n"
+ "15:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x25, #0x0]\n"
+ "16:" // Height 1: Multiply loop: Ragged operand read: Done
+ "tbnz %x[flags], #31, 17f\n"
+ ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
+ "17:" // Height 1: Multiply loop: unique 4: skip row sum
+ "ldr q10, [x10, #0x0]\n"
+ "ldr q4, [x10, #0x10]\n"
+ "ldr q5, [x10, #0x20]\n"
+ ".inst 0x4f80e150 // sdot v16.4s, v10.16b, v0.4b[0]\n"
+ "ldr q6, [x10, #0x30]\n"
+ ".inst 0x4f80e091 // sdot v17.4s, v4.16b, v0.4b[0]\n"
+ "add x10, x10, #0x40\n"
+ ".inst 0x4f80e0b2 // sdot v18.4s, v5.16b, v0.4b[0]\n"
+ ".inst 0x4f80e0d3 // sdot v19.4s, v6.16b, v0.4b[0]\n"
+ "18:" // Height 1: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 4b\n"
+ "prfm pstl1keep, [x28, #0x0]\n"
+ "tbnz %x[flags], #31, 19f\n"
+ "addp v11.4s, v11.4s, v11.4s\n"
+ "add x22, %x[qp], %[b_offset]\n"
+ "ld1r { v1.4s }, [x22]\n"
+ "neg v1.4s, v1.4s\n"
+ "addp v11.4s, v11.4s, v11.4s\n"
+ "mul v11.4s, v11.4s, v1.4s\n"
+ "19:" // Height 1: skip row sum fixup
+ "add v16.4s, v16.4s, v11.4s\n"
+ "add v17.4s, v17.4s, v11.4s\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "add v19.4s, v19.4s, v11.4s\n"
+ "ldr q0, [x9, #0x0]\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "ldr q1, [x9, #0x10]\n"
+ "add x23, %x[qp], %[per_layer_right_shift]\n"
+ "ldr q2, [x9, #0x20]\n"
+ "add x22, %x[qp], %[per_layer_mul]\n"
+ "add v16.4s, v16.4s, v0.4s\n"
+ "ldr q3, [x9, #0x30]\n"
+ "add v17.4s, v17.4s, v1.4s\n"
+ "ld1r { v0.4s }, [x23]\n"
+ "add v18.4s, v18.4s, v2.4s\n"
+ "ld1r { v4.4s }, [x22]\n"
+ "add v19.4s, v19.4s, v3.4s\n"
+ "add x9, x9, #0x40\n"
+ "sqrdmulh v16.4s, v16.4s, v4.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v4.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v4.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v4.4s\n"
+ "tbz %x[flags], #5, 20f\n"
+ "and v4.16b, v16.16b, v0.16b\n"
+ "and v5.16b, v17.16b, v0.16b\n"
+ "and v6.16b, v18.16b, v0.16b\n"
+ "and v7.16b, v19.16b, v0.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v4.4s\n"
+ "sqadd v17.4s, v17.4s, v5.4s\n"
+ "sqadd v18.4s, v18.4s, v6.4s\n"
+ "sqadd v19.4s, v19.4s, v7.4s\n"
+ "20:" // Height 1: no shift correction
+ "srshl v16.4s, v16.4s, v0.4s\n"
+ "srshl v17.4s, v17.4s, v0.4s\n"
+ "srshl v18.4s, v18.4s, v0.4s\n"
+ "srshl v19.4s, v19.4s, v0.4s\n"
+ "add x22, %x[qp], %[c_offset]\n"
+ "add x23, %x[qp], %[minval]\n"
+ "ld1r { v4.4s }, [x22]\n"
+ "add x22, %x[qp], %[maxval]\n"
+ "ld1r { v5.4s }, [x23]\n"
+ "cmp x11, #0x10\n"
+ "ld1r { v6.4s }, [x22]\n"
+ "add v16.4s, v16.4s, v4.4s\n"
+ "add v17.4s, v17.4s, v4.4s\n"
+ "add v18.4s, v18.4s, v4.4s\n"
+ "add v19.4s, v19.4s, v4.4s\n"
+ "smin v16.4s, v16.4s, v6.4s\n"
+ "smin v17.4s, v17.4s, v6.4s\n"
+ "smin v18.4s, v18.4s, v6.4s\n"
+ "smin v19.4s, v19.4s, v6.4s\n"
+ "smax v16.4s, v16.4s, v5.4s\n"
+ "smax v17.4s, v17.4s, v5.4s\n"
+ "smax v18.4s, v18.4s, v5.4s\n"
+ "smax v19.4s, v19.4s, v5.4s\n"
+ "uzp1 v16.8h, v16.8h, v17.8h\n"
+ "uzp1 v17.8h, v18.8h, v19.8h\n"
+ "uzp1 v16.16b, v16.16b, v17.16b\n"
+ "bge 29f\n"
+ "tbz x11, #3, 24f\n"
+ "str d16, [x28], #0x8\n"
+ "tbz x11, #2, 22f\n"
+ "st1 { v16.s }[2], [x28], #0x4\n"
+ "tbz x11, #1, 21f\n"
+ "st1 { v16.h }[6], [x28], #0x2\n"
+ "tbz x11, #0, 28f\n"
+ "st1 { v16.b }[14], [x28]\n"
+ "b 28f\n"
+ "21:" // Height 1: Partial direct writeback: partial_1_12
+ "tbz x11, #0, 28f\n"
+ "st1 { v16.b }[12], [x28]\n"
+ "b 28f\n"
+ "22:" // Height 1: Partial direct writeback: partial_2_8
+ "tbz x11, #1, 23f\n"
+ "st1 { v16.h }[4], [x28], #0x2\n"
+ "tbz x11, #0, 28f\n"
+ "st1 { v16.b }[10], [x28]\n"
+ "b 28f\n"
+ "23:" // Height 1: Partial direct writeback: partial_1_8
+ "tbz x11, #0, 28f\n"
+ "st1 { v16.b }[8], [x28]\n"
+ "b 28f\n"
+ "24:" // Height 1: Partial direct writeback: partial_4_0
+ "tbz x11, #2, 26f\n"
+ "str s16, [x28], #0x4\n"
+ "tbz x11, #1, 25f\n"
+ "st1 { v16.h }[2], [x28], #0x2\n"
+ "tbz x11, #0, 28f\n"
+ "st1 { v16.b }[6], [x28]\n"
+ "b 28f\n"
+ "25:" // Height 1: Partial direct writeback: partial_1_4
+ "tbz x11, #0, 28f\n"
+ "st1 { v16.b }[4], [x28]\n"
+ "b 28f\n"
+ "26:" // Height 1: Partial direct writeback: partial_2_0
+ "tbz x11, #1, 27f\n"
+ "str h16, [x28], #0x2\n"
+ "tbz x11, #0, 28f\n"
+ "st1 { v16.b }[2], [x28]\n"
+ "b 28f\n"
+ "27:" // Height 1: Partial direct writeback: partial_1_0
+ "str b16, [x28, #0x0]\n"
+ "28:" // Height 1: Partial direct writeback: Done
+ "b 30f\n"
+ "29:" // Height 1: Full writeback
+ "str q16, [x28, #0x0]\n"
+ "add x28, x28, #0x10\n"
+ "30:" // Height 1: Writeback done
+ "subs x11, x11, #0x10\n"
+ "bgt 2b\n"
+ "b 122f\n"
+ "31:" // Height 2
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v15.16b, #0x1\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[col_bias]\n"
+ "bic %x[flags], %x[flags], #0x80000000\n"
+ "mov x28, %x[output_ptr]\n"
+ "32:" // Height 2: Column loop
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "33:" // Height 2: setup done
+ "mov x27, #0x0\n"
+ "34:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 35f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x22, [x20, #0x8]\n"
+ "cbnz x27, 36f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19\n"
+ "add x22, x22, x19\n"
+ "b 36f\n"
+ "35:" // Height 2: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x22, x25, x19\n"
+ "36:" // Height 2: input setup done
+ "cmp x26, #0x10\n"
+ "blt 41f\n"
+ "ldr q0, [x25, #0x0]\n"
+ "ldr q1, [x22, #0x0]\n"
+ "cmp x26, #0x20\n"
+ "ldr q4, [x10, #0x0]\n"
+ "blt 39f\n"
+ "37:" // Height 2: Multiply loop: Main loop head
+ ".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n"
+ ".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n"
+ "ldr d5, [x10, #0x10]\n"
+ "add x25, x25, #0x10\n"
+ "ldr x24, [x10, #0x18]\n"
+ "add x22, x22, #0x10\n"
+ "ldr d6, [x10, #0x20]\n"
+ "ldr x23, [x10, #0x28]\n"
+ "mov v5.d[1], x24\n"
+ "ldr d7, [x10, #0x30]\n"
+ "ldr x19, [x10, #0x38]\n"
+ "mov v6.d[1], x23\n"
+ ".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n"
+ "ldr d8, [x10, #0x40]\n"
+ "mov v7.d[1], x19\n"
+ ".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n"
+ "ldr x23, [x10, #0x48]\n"
+ ".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0f7 // sdot v23.4s, v7.16b, v1.4b[0]\n"
+ "ldr d9, [x10, #0x50]\n"
+ "mov v8.d[1], x23\n"
+ "ldr x19, [x10, #0x58]\n"
+ "ldr d10, [x10, #0x60]\n"
+ ".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e114 // sdot v20.4s, v8.16b, v1.4b[1]\n"
+ "mov v9.d[1], x19\n"
+ "ldr x23, [x10, #0x68]\n"
+ "ldr d4, [x10, #0x70]\n"
+ ".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e135 // sdot v21.4s, v9.16b, v1.4b[1]\n"
+ "mov v10.d[1], x23\n"
+ "ldr x19, [x10, #0x78]\n"
+ "ldr d5, [x10, #0x80]\n"
+ ".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e156 // sdot v22.4s, v10.16b, v1.4b[1]\n"
+ "mov v4.d[1], x19\n"
+ "ldr x24, [x10, #0x88]\n"
+ "ldr d6, [x10, #0x90]\n"
+ ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n"
+ "mov v5.d[1], x24\n"
+ "ldr x23, [x10, #0x98]\n"
+ "ldr d7, [x10, #0xa0]\n"
+ ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8b4 // sdot v20.4s, v5.16b, v1.4b[2]\n"
+ "mov v6.d[1], x23\n"
+ "ldr x19, [x10, #0xa8]\n"
+ "ldr d8, [x10, #0xb0]\n"
+ ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8d5 // sdot v21.4s, v6.16b, v1.4b[2]\n"
+ "mov v7.d[1], x19\n"
+ "ldr x23, [x10, #0xb8]\n"
+ "ldr d9, [x10, #0xc0]\n"
+ ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8f6 // sdot v22.4s, v7.16b, v1.4b[2]\n"
+ "mov v8.d[1], x23\n"
+ "ldr x19, [x10, #0xc8]\n"
+ "ldr d10, [x10, #0xd0]\n"
+ ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n"
+ ".inst 0x4f81e917 // sdot v23.4s, v8.16b, v1.4b[2]\n"
+ "mov v9.d[1], x19\n"
+ "ldr x23, [x10, #0xd8]\n"
+ "ldr d4, [x10, #0xe0]\n"
+ ".inst 0x4fa0e930 // sdot v16.4s, v9.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e934 // sdot v20.4s, v9.16b, v1.4b[3]\n"
+ "mov v10.d[1], x23\n"
+ "ldr x19, [x10, #0xe8]\n"
+ "ldr d5, [x10, #0xf0]\n"
+ ".inst 0x4fa0e951 // sdot v17.4s, v10.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e955 // sdot v21.4s, v10.16b, v1.4b[3]\n"
+ "mov v4.d[1], x19\n"
+ "ldr x24, [x10, #0xf8]\n"
+ "add x10, x10, #0x100\n"
+ ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e896 // sdot v22.4s, v4.16b, v1.4b[3]\n"
+ "mov v5.d[1], x24\n"
+ ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8b7 // sdot v23.4s, v5.16b, v1.4b[3]\n"
+ "tbnz %x[flags], #31, 38f\n"
+ ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
+ "38:" // Height 2: Multiply loop: unique 5: skip row sum
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "sub x26, x26, #0x10\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "cmp x26, #0x20\n"
+ "ldr q0, [x25, #0x0]\n"
+ "ldr q1, [x22, #0x0]\n"
+ "ldr q4, [x10, #0x0]\n"
+ "bge 37b\n"
+ "39:" // Height 2: Multiply loop: Single iteration only
+ ".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n"
+ ".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n"
+ "ldr q5, [x10, #0x10]\n"
+ "sub x26, x26, #0x10\n"
+ "ldr q6, [x10, #0x20]\n"
+ "add x25, x25, #0x10\n"
+ "ldr q7, [x10, #0x30]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n"
+ ".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0f7 // sdot v23.4s, v7.16b, v1.4b[0]\n"
+ "ldr q8, [x10, #0x40]\n"
+ "ldr q9, [x10, #0x50]\n"
+ "ldr q10, [x10, #0x60]\n"
+ ".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e114 // sdot v20.4s, v8.16b, v1.4b[1]\n"
+ ".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e135 // sdot v21.4s, v9.16b, v1.4b[1]\n"
+ ".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e156 // sdot v22.4s, v10.16b, v1.4b[1]\n"
+ "ldr q4, [x10, #0x70]\n"
+ "ldr q5, [x10, #0x80]\n"
+ "ldr q6, [x10, #0x90]\n"
+ ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n"
+ ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8b4 // sdot v20.4s, v5.16b, v1.4b[2]\n"
+ ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8d5 // sdot v21.4s, v6.16b, v1.4b[2]\n"
+ "ldr q7, [x10, #0xa0]\n"
+ "ldr q8, [x10, #0xb0]\n"
+ "ldr q9, [x10, #0xc0]\n"
+ ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8f6 // sdot v22.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n"
+ ".inst 0x4f81e917 // sdot v23.4s, v8.16b, v1.4b[2]\n"
+ ".inst 0x4fa0e930 // sdot v16.4s, v9.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e934 // sdot v20.4s, v9.16b, v1.4b[3]\n"
+ "ldr q10, [x10, #0xd0]\n"
+ "ldr q4, [x10, #0xe0]\n"
+ "ldr q5, [x10, #0xf0]\n"
+ "add x10, x10, #0x100\n"
+ ".inst 0x4fa0e951 // sdot v17.4s, v10.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e955 // sdot v21.4s, v10.16b, v1.4b[3]\n"
+ ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e896 // sdot v22.4s, v4.16b, v1.4b[3]\n"
+ ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8b7 // sdot v23.4s, v5.16b, v1.4b[3]\n"
+ "tbnz %x[flags], #31, 40f\n"
+ ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
+ "40:" // Height 2: Multiply loop: unique 6: skip row sum
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "41:" // Height 2: Multiply loop: Main loop skip
+ "cbz x26, 48f\n"
+ "cmp x26, #0x4\n"
+ "blt 44f\n"
+ "42:" // Height 2: Multiply loop: Odd block loop
+ "ldr s0, [x25], #0x4\n"
+ "ldr s1, [x22], #0x4\n"
+ "tbnz %x[flags], #31, 43f\n"
+ ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
+ "43:" // Height 2: Multiply loop: unique 7: skip row sum
+ "ldr q6, [x10, #0x0]\n"
+ "sub x26, x26, #0x4\n"
+ "ldr q7, [x10, #0x10]\n"
+ "cmp x26, #0x4\n"
+ "ldr q8, [x10, #0x20]\n"
+ ".inst 0x4f80e0d0 // sdot v16.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0d4 // sdot v20.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f80e0f1 // sdot v17.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0f5 // sdot v21.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f80e112 // sdot v18.4s, v8.16b, v0.4b[0]\n"
+ ".inst 0x4f81e116 // sdot v22.4s, v8.16b, v1.4b[0]\n"
+ "ldr q9, [x10, #0x30]\n"
+ "add x10, x10, #0x40\n"
+ ".inst 0x4f80e133 // sdot v19.4s, v9.16b, v0.4b[0]\n"
+ ".inst 0x4f81e137 // sdot v23.4s, v9.16b, v1.4b[0]\n"
+ "bge 42b\n"
+ "cbz x26, 48f\n"
+ "44:" // Height 2: Multiply loop: Skip odd blocks
+ "tbz x26, #1, 45f\n"
+ "ldr h0, [x25], #0x2\n"
+ "ldr h1, [x22], #0x2\n"
+ "tbz x26, #0, 46f\n"
+ "ld1 { v0.b }[2], [x25]\n"
+ "ld1 { v1.b }[2], [x22]\n"
+ "b 46f\n"
+ "45:" // Height 2: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x25, #0x0]\n"
+ "ldr b1, [x22, #0x0]\n"
+ "46:" // Height 2: Multiply loop: Ragged operand read: Done
+ "tbnz %x[flags], #31, 47f\n"
+ ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
+ "47:" // Height 2: Multiply loop: unique 8: skip row sum
+ "ldr q10, [x10, #0x0]\n"
+ "ldr q4, [x10, #0x10]\n"
+ "ldr q5, [x10, #0x20]\n"
+ ".inst 0x4f80e150 // sdot v16.4s, v10.16b, v0.4b[0]\n"
+ ".inst 0x4f81e154 // sdot v20.4s, v10.16b, v1.4b[0]\n"
+ ".inst 0x4f80e091 // sdot v17.4s, v4.16b, v0.4b[0]\n"
+ ".inst 0x4f81e095 // sdot v21.4s, v4.16b, v1.4b[0]\n"
+ ".inst 0x4f80e0b2 // sdot v18.4s, v5.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0b6 // sdot v22.4s, v5.16b, v1.4b[0]\n"
+ "ldr q6, [x10, #0x30]\n"
+ "add x10, x10, #0x40\n"
+ ".inst 0x4f80e0d3 // sdot v19.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0d7 // sdot v23.4s, v6.16b, v1.4b[0]\n"
+ "48:" // Height 2: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 34b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "prfm pstl1keep, [x28, #0x0]\n"
+ "add x21, x28, x19\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "tbnz %x[flags], #31, 49f\n"
+ "addp v11.4s, v11.4s, v11.4s\n"
+ "addp v12.4s, v12.4s, v12.4s\n"
+ "add x22, %x[qp], %[b_offset]\n"
+ "ld1r { v2.4s }, [x22]\n"
+ "neg v2.4s, v2.4s\n"
+ "addp v11.4s, v11.4s, v11.4s\n"
+ "addp v12.4s, v12.4s, v12.4s\n"
+ "mul v11.4s, v11.4s, v2.4s\n"
+ "mul v12.4s, v12.4s, v2.4s\n"
+ "49:" // Height 2: skip row sum fixup
+ "add v16.4s, v16.4s, v11.4s\n"
+ "add v17.4s, v17.4s, v11.4s\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "add v19.4s, v19.4s, v11.4s\n"
+ "add v20.4s, v20.4s, v12.4s\n"
+ "add v21.4s, v21.4s, v12.4s\n"
+ "add v22.4s, v22.4s, v12.4s\n"
+ "add v23.4s, v23.4s, v12.4s\n"
+ "ldr q0, [x9, #0x0]\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "ldr q1, [x9, #0x10]\n"
+ "add x23, %x[qp], %[per_layer_right_shift]\n"
+ "ldr q2, [x9, #0x20]\n"
+ "add x22, %x[qp], %[per_layer_mul]\n"
+ "add v16.4s, v16.4s, v0.4s\n"
+ "add v20.4s, v20.4s, v0.4s\n"
+ "add v17.4s, v17.4s, v1.4s\n"
+ "add v21.4s, v21.4s, v1.4s\n"
+ "add v18.4s, v18.4s, v2.4s\n"
+ "add v22.4s, v22.4s, v2.4s\n"
+ "ldr q3, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
+ "ld1r { v0.4s }, [x23]\n"
+ "ld1r { v4.4s }, [x22]\n"
+ "add v19.4s, v19.4s, v3.4s\n"
+ "add v23.4s, v23.4s, v3.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v4.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v4.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v4.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v4.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v4.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v4.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v4.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v4.4s\n"
+ "tbz %x[flags], #5, 50f\n"
+ "and v4.16b, v16.16b, v0.16b\n"
+ "and v5.16b, v17.16b, v0.16b\n"
+ "and v6.16b, v18.16b, v0.16b\n"
+ "and v8.16b, v20.16b, v0.16b\n"
+ "and v9.16b, v21.16b, v0.16b\n"
+ "and v10.16b, v22.16b, v0.16b\n"
+ "and v7.16b, v19.16b, v0.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v4.4s\n"
+ "sqadd v17.4s, v17.4s, v5.4s\n"
+ "sqadd v18.4s, v18.4s, v6.4s\n"
+ "sqadd v20.4s, v20.4s, v8.4s\n"
+ "sqadd v21.4s, v21.4s, v9.4s\n"
+ "sqadd v22.4s, v22.4s, v10.4s\n"
+ "sqadd v19.4s, v19.4s, v7.4s\n"
+ "and v4.16b, v23.16b, v0.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v4.4s\n"
+ "50:" // Height 2: no shift correction
+ "srshl v16.4s, v16.4s, v0.4s\n"
+ "srshl v17.4s, v17.4s, v0.4s\n"
+ "srshl v18.4s, v18.4s, v0.4s\n"
+ "srshl v19.4s, v19.4s, v0.4s\n"
+ "srshl v20.4s, v20.4s, v0.4s\n"
+ "srshl v21.4s, v21.4s, v0.4s\n"
+ "srshl v22.4s, v22.4s, v0.4s\n"
+ "srshl v23.4s, v23.4s, v0.4s\n"
+ "add x22, %x[qp], %[c_offset]\n"
+ "add x23, %x[qp], %[minval]\n"
+ "ld1r { v4.4s }, [x22]\n"
+ "add x22, %x[qp], %[maxval]\n"
+ "ld1r { v5.4s }, [x23]\n"
+ "cmp x11, #0x10\n"
+ "ld1r { v6.4s }, [x22]\n"
+ "add v16.4s, v16.4s, v4.4s\n"
+ "add v17.4s, v17.4s, v4.4s\n"
+ "add v18.4s, v18.4s, v4.4s\n"
+ "add v19.4s, v19.4s, v4.4s\n"
+ "add v20.4s, v20.4s, v4.4s\n"
+ "add v21.4s, v21.4s, v4.4s\n"
+ "add v22.4s, v22.4s, v4.4s\n"
+ "add v23.4s, v23.4s, v4.4s\n"
+ "smin v16.4s, v16.4s, v6.4s\n"
+ "smin v17.4s, v17.4s, v6.4s\n"
+ "smin v18.4s, v18.4s, v6.4s\n"
+ "smin v19.4s, v19.4s, v6.4s\n"
+ "smin v20.4s, v20.4s, v6.4s\n"
+ "smin v21.4s, v21.4s, v6.4s\n"
+ "smin v22.4s, v22.4s, v6.4s\n"
+ "smin v23.4s, v23.4s, v6.4s\n"
+ "smax v16.4s, v16.4s, v5.4s\n"
+ "smax v17.4s, v17.4s, v5.4s\n"
+ "smax v18.4s, v18.4s, v5.4s\n"
+ "smax v19.4s, v19.4s, v5.4s\n"
+ "smax v20.4s, v20.4s, v5.4s\n"
+ "smax v21.4s, v21.4s, v5.4s\n"
+ "smax v22.4s, v22.4s, v5.4s\n"
+ "smax v23.4s, v23.4s, v5.4s\n"
+ "uzp1 v16.8h, v16.8h, v17.8h\n"
+ "uzp1 v17.8h, v18.8h, v19.8h\n"
+ "uzp1 v20.8h, v20.8h, v21.8h\n"
+ "uzp1 v21.8h, v22.8h, v23.8h\n"
+ "uzp1 v16.16b, v16.16b, v17.16b\n"
+ "uzp1 v20.16b, v20.16b, v21.16b\n"
+ "bge 59f\n"
+ "tbz x11, #3, 54f\n"
+ "str d16, [x28], #0x8\n"
+ "str d20, [x21], #0x8\n"
+ "tbz x11, #2, 52f\n"
+ "st1 { v16.s }[2], [x28], #0x4\n"
+ "st1 { v20.s }[2], [x21], #0x4\n"
+ "tbz x11, #1, 51f\n"
+ "st1 { v16.h }[6], [x28], #0x2\n"
+ "st1 { v20.h }[6], [x21], #0x2\n"
+ "tbz x11, #0, 58f\n"
+ "st1 { v16.b }[14], [x28]\n"
+ "st1 { v20.b }[14], [x21]\n"
+ "b 58f\n"
+ "51:" // Height 2: Partial direct writeback: partial_1_12
+ "tbz x11, #0, 58f\n"
+ "st1 { v16.b }[12], [x28]\n"
+ "st1 { v20.b }[12], [x21]\n"
+ "b 58f\n"
+ "52:" // Height 2: Partial direct writeback: partial_2_8
+ "tbz x11, #1, 53f\n"
+ "st1 { v16.h }[4], [x28], #0x2\n"
+ "st1 { v20.h }[4], [x21], #0x2\n"
+ "tbz x11, #0, 58f\n"
+ "st1 { v16.b }[10], [x28]\n"
+ "st1 { v20.b }[10], [x21]\n"
+ "b 58f\n"
+ "53:" // Height 2: Partial direct writeback: partial_1_8
+ "tbz x11, #0, 58f\n"
+ "st1 { v16.b }[8], [x28]\n"
+ "st1 { v20.b }[8], [x21]\n"
+ "b 58f\n"
+ "54:" // Height 2: Partial direct writeback: partial_4_0
+ "tbz x11, #2, 56f\n"
+ "str s16, [x28], #0x4\n"
+ "str s20, [x21], #0x4\n"
+ "tbz x11, #1, 55f\n"
+ "st1 { v16.h }[2], [x28], #0x2\n"
+ "st1 { v20.h }[2], [x21], #0x2\n"
+ "tbz x11, #0, 58f\n"
+ "st1 { v16.b }[6], [x28]\n"
+ "st1 { v20.b }[6], [x21]\n"
+ "b 58f\n"
+ "55:" // Height 2: Partial direct writeback: partial_1_4
+ "tbz x11, #0, 58f\n"
+ "st1 { v16.b }[4], [x28]\n"
+ "st1 { v20.b }[4], [x21]\n"
+ "b 58f\n"
+ "56:" // Height 2: Partial direct writeback: partial_2_0
+ "tbz x11, #1, 57f\n"
+ "str h16, [x28], #0x2\n"
+ "str h20, [x21], #0x2\n"
+ "tbz x11, #0, 58f\n"
+ "st1 { v16.b }[2], [x28]\n"
+ "st1 { v20.b }[2], [x21]\n"
+ "b 58f\n"
+ "57:" // Height 2: Partial direct writeback: partial_1_0
+ "str b16, [x28, #0x0]\n"
+ "str b20, [x21, #0x0]\n"
+ "58:" // Height 2: Partial direct writeback: Done
+ "b 60f\n"
+ "59:" // Height 2: Full writeback
+ "str q16, [x28, #0x0]\n"
+ "add x28, x28, #0x10\n"
+ "str q20, [x21, #0x0]\n"
+ "60:" // Height 2: Writeback done
+ "subs x11, x11, #0x10\n"
+ "bgt 32b\n"
+ "b 122f\n"
+ "61:" // Height 3
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v15.16b, #0x1\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x9, %x[col_bias]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "bic %x[flags], %x[flags], #0x80000000\n"
+ "mov x28, %x[output_ptr]\n"
+ "62:" // Height 3: Column loop
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "63:" // Height 3: setup done
+ "mov x27, #0x0\n"
+ "64:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 65f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x22, [x20, #0x8]\n"
+ "ldr x21, [x20, #0x10]\n"
+ "cbnz x27, 66f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19\n"
+ "add x22, x22, x19\n"
+ "add x21, x21, x19\n"
+ "b 66f\n"
+ "65:" // Height 3: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x22, x25, x19\n"
+ "add x21, x22, x19\n"
+ "66:" // Height 3: input setup done. Pipelined multiply loop: 16 input bytes per row per pass into 3 rows x 16 int32 accumulators (row 0: v16-v19, row 1: v20-v23, row 2: v24-v27)
+ "cmp x26, #0x10\n"
+ "blt 71f\n" // fewer than 16 input bytes left in this string: go straight to the tail handling
+ "ldr q0, [x25, #0x0]\n" // v0 / v1 / v2 = next 16 input bytes of row 0 / 1 / 2
+ "ldr q1, [x22, #0x0]\n"
+ "cmp x26, #0x20\n" // flags for the 'blt 69f' below; the vector loads in between do not alter them
+ "ldr q2, [x21, #0x0]\n"
+ "ldr q4, [x10, #0x0]\n" // first 16-byte vector of the second operand, read through x10 (NOTE(review): x10 is set before this excerpt; Height 4's prologue loads it from offsetof_B_ptr)
+ "blt 69f\n" // 16..31 bytes left: run the single non-looping iteration at label 69
+ "67:" // Height 3: Multiply loop: Main loop head
+ ".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n"
+ ".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n"
+ ".inst 0x4f82e098 // sdot v24.4s, v4.16b, v2.4b[0]\n"
+ "ldr d5, [x10, #0x10]\n" // in this loop each 128-bit operand vector is assembled from a 64-bit 'ldr d' (low half) ...
+ "ldr x24, [x10, #0x18]\n" // ... a 64-bit 'ldr x' of the high half into a general register ...
+ "add x25, x25, #0x10\n" // row pointers advance past the 16 bytes already held in v0-v2
+ "ldr d6, [x10, #0x20]\n"
+ "add x22, x22, #0x10\n"
+ "ldr x23, [x10, #0x28]\n"
+ "add x21, x21, #0x10\n"
+ "mov v5.d[1], x24\n" // ... and an insert into the upper 64 bits
+ "ldr d7, [x10, #0x30]\n"
+ "ldr x19, [x10, #0x38]\n"
+ "mov v6.d[1], x23\n"
+ ".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0b9 // sdot v25.4s, v5.16b, v2.4b[0]\n"
+ "mov v7.d[1], x19\n"
+ ".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0da // sdot v26.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0f7 // sdot v23.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0fb // sdot v27.4s, v7.16b, v2.4b[0]\n"
+ "ldr d8, [x10, #0x40]\n" // offsets 0x40-0x70 pair with input element [1] (input bytes 4..7 of each row)
+ "ldr x23, [x10, #0x48]\n"
+ "ldr d9, [x10, #0x50]\n"
+ "ldr x19, [x10, #0x58]\n"
+ "mov v8.d[1], x23\n"
+ "ldr d10, [x10, #0x60]\n"
+ "ldr x23, [x10, #0x68]\n"
+ "mov v9.d[1], x19\n"
+ ".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e114 // sdot v20.4s, v8.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e118 // sdot v24.4s, v8.16b, v2.4b[1]\n"
+ "mov v10.d[1], x23\n"
+ ".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e135 // sdot v21.4s, v9.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e139 // sdot v25.4s, v9.16b, v2.4b[1]\n"
+ ".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e156 // sdot v22.4s, v10.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e15a // sdot v26.4s, v10.16b, v2.4b[1]\n"
+ "ldr d4, [x10, #0x70]\n"
+ "ldr x19, [x10, #0x78]\n"
+ "ldr d5, [x10, #0x80]\n" // offsets 0x80-0xb0 pair with input element [2] (input bytes 8..11 of each row)
+ "ldr x24, [x10, #0x88]\n"
+ "mov v4.d[1], x19\n"
+ "ldr d6, [x10, #0x90]\n"
+ "ldr x23, [x10, #0x98]\n"
+ "mov v5.d[1], x24\n"
+ ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e09b // sdot v27.4s, v4.16b, v2.4b[1]\n"
+ "mov v6.d[1], x23\n"
+ ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8b4 // sdot v20.4s, v5.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8b8 // sdot v24.4s, v5.16b, v2.4b[2]\n"
+ ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8d5 // sdot v21.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d9 // sdot v25.4s, v6.16b, v2.4b[2]\n"
+ "ldr d7, [x10, #0xa0]\n"
+ "ldr x19, [x10, #0xa8]\n"
+ "ldr d8, [x10, #0xb0]\n"
+ "ldr x23, [x10, #0xb8]\n"
+ "mov v7.d[1], x19\n"
+ "ldr d9, [x10, #0xc0]\n" // offsets 0xc0-0xf0 pair with input element [3] (input bytes 12..15 of each row)
+ "ldr x19, [x10, #0xc8]\n"
+ "mov v8.d[1], x23\n"
+ ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8f6 // sdot v22.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8fa // sdot v26.4s, v7.16b, v2.4b[2]\n"
+ "mov v9.d[1], x19\n"
+ ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n"
+ ".inst 0x4f81e917 // sdot v23.4s, v8.16b, v1.4b[2]\n"
+ ".inst 0x4f82e91b // sdot v27.4s, v8.16b, v2.4b[2]\n"
+ ".inst 0x4fa0e930 // sdot v16.4s, v9.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e934 // sdot v20.4s, v9.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e938 // sdot v24.4s, v9.16b, v2.4b[3]\n"
+ "ldr d10, [x10, #0xd0]\n"
+ "ldr x23, [x10, #0xd8]\n"
+ "ldr d4, [x10, #0xe0]\n"
+ "ldr x19, [x10, #0xe8]\n"
+ "mov v10.d[1], x23\n"
+ "ldr d5, [x10, #0xf0]\n"
+ "ldr x24, [x10, #0xf8]\n"
+ "add x10, x10, #0x100\n" // sixteen 16-byte operand vectors (offsets 0x00-0xf0) are consumed per pass
+ "mov v4.d[1], x19\n"
+ ".inst 0x4fa0e951 // sdot v17.4s, v10.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e955 // sdot v21.4s, v10.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e959 // sdot v25.4s, v10.16b, v2.4b[3]\n"
+ "mov v5.d[1], x24\n"
+ ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e896 // sdot v22.4s, v4.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e89a // sdot v26.4s, v4.16b, v2.4b[3]\n"
+ ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8b7 // sdot v23.4s, v5.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8bb // sdot v27.4s, v5.16b, v2.4b[3]\n"
+ "tbnz %x[flags], #31, 68f\n" // flags bit 31 set: skip the three sdot-with-v15 accumulations into v11/v12/v13 (NOTE(review): v15 is initialised before this excerpt; Height 4's prologue sets it to all-ones bytes, which would make these per-row input sums)
+ ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
+ ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
+ "68:" // Height 3: Multiply loop: unique 9: skip row sum
+ "prfm pldl1keep, [x25, #0x80]\n" // prefetch 0x80 bytes ahead of each row pointer
+ "sub x26, x26, #0x10\n" // 16 input bytes per row consumed
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "cmp x26, #0x20\n" // stay in the loop only while at least 32 bytes remain; the final 16-byte step runs at label 69
+ "prfm pldl1keep, [x21, #0x80]\n"
+ "ldr q0, [x25, #0x0]\n" // reload the inputs and the first operand vector for the next pass (or for label 69)
+ "ldr q1, [x22, #0x0]\n"
+ "ldr q2, [x21, #0x0]\n"
+ "ldr q4, [x10, #0x0]\n"
+ "bge 67b\n"
+ "69:" // Height 3: Multiply loop: Single iteration only. Same 16-byte step as the main loop, using v0-v2/v4 already loaded, plain 128-bit 'ldr q' loads and no reload for a further pass
+ ".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n"
+ ".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n"
+ ".inst 0x4f82e098 // sdot v24.4s, v4.16b, v2.4b[0]\n"
+ "ldr q5, [x10, #0x10]\n"
+ "ldr q6, [x10, #0x20]\n"
+ "sub x26, x26, #0x10\n" // x26 was 16..31 on entry, so 0..15 bytes remain for the tail code at label 71
+ "ldr q7, [x10, #0x30]\n"
+ "add x25, x25, #0x10\n" // row pointers advance past the 16 bytes held in v0-v2
+ ".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0b9 // sdot v25.4s, v5.16b, v2.4b[0]\n"
+ ".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0da // sdot v26.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0f7 // sdot v23.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0fb // sdot v27.4s, v7.16b, v2.4b[0]\n"
+ "ldr q8, [x10, #0x40]\n"
+ "ldr q9, [x10, #0x50]\n"
+ "add x22, x22, #0x10\n"
+ "ldr q10, [x10, #0x60]\n"
+ "add x21, x21, #0x10\n"
+ ".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e114 // sdot v20.4s, v8.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e118 // sdot v24.4s, v8.16b, v2.4b[1]\n"
+ ".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e135 // sdot v21.4s, v9.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e139 // sdot v25.4s, v9.16b, v2.4b[1]\n"
+ ".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e156 // sdot v22.4s, v10.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e15a // sdot v26.4s, v10.16b, v2.4b[1]\n"
+ "ldr q4, [x10, #0x70]\n"
+ "ldr q5, [x10, #0x80]\n"
+ "ldr q6, [x10, #0x90]\n"
+ ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e09b // sdot v27.4s, v4.16b, v2.4b[1]\n"
+ ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8b4 // sdot v20.4s, v5.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8b8 // sdot v24.4s, v5.16b, v2.4b[2]\n"
+ ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8d5 // sdot v21.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d9 // sdot v25.4s, v6.16b, v2.4b[2]\n"
+ "ldr q7, [x10, #0xa0]\n"
+ "ldr q8, [x10, #0xb0]\n"
+ "ldr q9, [x10, #0xc0]\n"
+ ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8f6 // sdot v22.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8fa // sdot v26.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n"
+ ".inst 0x4f81e917 // sdot v23.4s, v8.16b, v1.4b[2]\n"
+ ".inst 0x4f82e91b // sdot v27.4s, v8.16b, v2.4b[2]\n"
+ ".inst 0x4fa0e930 // sdot v16.4s, v9.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e934 // sdot v20.4s, v9.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e938 // sdot v24.4s, v9.16b, v2.4b[3]\n"
+ "ldr q10, [x10, #0xd0]\n"
+ "ldr q4, [x10, #0xe0]\n"
+ "ldr q5, [x10, #0xf0]\n"
+ "add x10, x10, #0x100\n" // sixteen 16-byte operand vectors consumed, as in the main loop
+ ".inst 0x4fa0e951 // sdot v17.4s, v10.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e955 // sdot v21.4s, v10.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e959 // sdot v25.4s, v10.16b, v2.4b[3]\n"
+ ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e896 // sdot v22.4s, v4.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e89a // sdot v26.4s, v4.16b, v2.4b[3]\n"
+ ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8b7 // sdot v23.4s, v5.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8bb // sdot v27.4s, v5.16b, v2.4b[3]\n"
+ "tbnz %x[flags], #31, 70f\n" // flags bit 31 set: skip accumulating v0/v1/v2 against v15 into v11/v12/v13
+ ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
+ ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
+ "70:" // Height 3: Multiply loop: unique 10: skip row sum
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
+ "71:" // Height 3: Multiply loop: Main loop skip. Tail of a string: whole 4-byte groups first (label 72), then a final 1-3 byte group (labels 74-77)
+ "cbz x26, 78f\n" // no input bytes left in this string
+ "cmp x26, #0x4\n"
+ "blt 74f\n" // 1..3 bytes left: only the ragged read applies
+ "72:" // Height 3: Multiply loop: Odd block loop
+ "ldr s0, [x25], #0x4\n" // 4 input bytes per row, pointer post-incremented
+ "ldr s1, [x22], #0x4\n"
+ "ldr s2, [x21], #0x4\n"
+ "tbnz %x[flags], #31, 73f\n" // flags bit 31 set: skip accumulating into v11/v12/v13
+ ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
+ ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
+ "73:" // Height 3: Multiply loop: unique 11: skip row sum
+ "ldr q6, [x10, #0x0]\n" // four operand vectors (0x40 bytes) pair with one 4-byte input group
+ "sub x26, x26, #0x4\n"
+ "ldr q7, [x10, #0x10]\n"
+ "cmp x26, #0x4\n" // flags are consumed by 'bge 72b' after the dot products; nothing in between modifies them
+ "ldr q8, [x10, #0x20]\n"
+ ".inst 0x4f80e0d0 // sdot v16.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0d4 // sdot v20.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d8 // sdot v24.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f80e0f1 // sdot v17.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0f5 // sdot v21.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f9 // sdot v25.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f80e112 // sdot v18.4s, v8.16b, v0.4b[0]\n"
+ ".inst 0x4f81e116 // sdot v22.4s, v8.16b, v1.4b[0]\n"
+ ".inst 0x4f82e11a // sdot v26.4s, v8.16b, v2.4b[0]\n"
+ "ldr q9, [x10, #0x30]\n"
+ "add x10, x10, #0x40\n"
+ ".inst 0x4f80e133 // sdot v19.4s, v9.16b, v0.4b[0]\n"
+ ".inst 0x4f81e137 // sdot v23.4s, v9.16b, v1.4b[0]\n"
+ ".inst 0x4f82e13b // sdot v27.4s, v9.16b, v2.4b[0]\n"
+ "bge 72b\n" // another whole 4-byte group remains
+ "cbz x26, 78f\n"
+ "74:" // Height 3: Multiply loop: Skip odd blocks
+ "tbz x26, #1, 75f\n" // x26 is 1..3 here; bit 1 clear means exactly one byte is left
+ "ldr h0, [x25], #0x2\n" // two bytes per row; a scalar SIMD load also zeroes the rest of the vector register
+ "ldr h1, [x22], #0x2\n"
+ "ldr h2, [x21], #0x2\n"
+ "tbz x26, #0, 76f\n" // bit 0 clear: exactly two bytes
+ "ld1 { v0.b }[2], [x25]\n" // third byte goes into byte lane 2; byte lane 3 keeps the zero written by 'ldr h'
+ "ld1 { v1.b }[2], [x22]\n"
+ "ld1 { v2.b }[2], [x21]\n"
+ "b 76f\n"
+ "75:" // Height 3: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x25, #0x0]\n" // single byte per row; remaining lanes are zeroed by the scalar load
+ "ldr b1, [x22, #0x0]\n"
+ "ldr b2, [x21, #0x0]\n"
+ "76:" // Height 3: Multiply loop: Ragged operand read: Done
+ "tbnz %x[flags], #31, 77f\n" // flags bit 31 set: skip accumulating into v11/v12/v13
+ ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
+ ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
+ "77:" // Height 3: Multiply loop: unique 12: skip row sum
+ "ldr q10, [x10, #0x0]\n" // a full 0x40 bytes of the second operand are still read for the partial group
+ "ldr q4, [x10, #0x10]\n"
+ "ldr q5, [x10, #0x20]\n"
+ ".inst 0x4f80e150 // sdot v16.4s, v10.16b, v0.4b[0]\n"
+ ".inst 0x4f81e154 // sdot v20.4s, v10.16b, v1.4b[0]\n"
+ ".inst 0x4f82e158 // sdot v24.4s, v10.16b, v2.4b[0]\n"
+ ".inst 0x4f80e091 // sdot v17.4s, v4.16b, v0.4b[0]\n"
+ ".inst 0x4f81e095 // sdot v21.4s, v4.16b, v1.4b[0]\n"
+ ".inst 0x4f82e099 // sdot v25.4s, v4.16b, v2.4b[0]\n"
+ ".inst 0x4f80e0b2 // sdot v18.4s, v5.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0b6 // sdot v22.4s, v5.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0ba // sdot v26.4s, v5.16b, v2.4b[0]\n"
+ "ldr q6, [x10, #0x30]\n"
+ "add x10, x10, #0x40\n"
+ ".inst 0x4f80e0d3 // sdot v19.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0d7 // sdot v23.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0db // sdot v27.4s, v6.16b, v2.4b[0]\n"
+ "78:" // Height 3: Multiply loop: No odd multiplies. End of the string loop, then output row pointers and the one-time scaling of v11-v13 by -b_offset
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" // w19 = num_strings (32-bit field of the args block)
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 64b\n" // more strings to accumulate for this column block
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" // x19 = output_offset, used as the byte distance between output rows
+ "prfm pstl1keep, [x28, #0x0]\n" // prefetch-for-store on each output row; x28 addresses row 0
+ "add x21, x28, x19\n" // x21 = output row 1
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "add x20, x21, x19\n" // x20 = output row 2
+ "prfm pstl1keep, [x20, #0x0]\n"
+ "tbnz %x[flags], #31, 79f\n" // flags bit 31 set: v11-v13 were already reduced and scaled for an earlier column block
+ "addp v11.4s, v11.4s, v11.4s\n" // first of two pairwise adds; together they leave the sum of all four lanes in every lane
+ "addp v12.4s, v12.4s, v12.4s\n"
+ "addp v13.4s, v13.4s, v13.4s\n"
+ "add x22, %x[qp], %[b_offset]\n" // address of the b_offset field inside the qp block
+ "ld1r { v3.4s }, [x22]\n" // broadcast b_offset to all four lanes
+ "neg v3.4s, v3.4s\n" // v3 = -b_offset
+ "addp v11.4s, v11.4s, v11.4s\n" // second pairwise add: every lane now holds the full 4-lane total
+ "addp v12.4s, v12.4s, v12.4s\n"
+ "addp v13.4s, v13.4s, v13.4s\n"
+ "mul v11.4s, v11.4s, v3.4s\n" // v11/v12/v13 = -b_offset * (lane total) for rows 0/1/2
+ "mul v12.4s, v12.4s, v3.4s\n"
+ "mul v13.4s, v13.4s, v3.4s\n"
+ "79:" // Height 3: skip row sum fixup. Requantize, part 1: add the per-row and per-column terms, apply the per-layer multiplier, then the optional pre-shift correction
+ "add v16.4s, v16.4s, v11.4s\n" // row 0 accumulators (v16-v19) receive v11
+ "add v17.4s, v17.4s, v11.4s\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "add v19.4s, v19.4s, v11.4s\n"
+ "add v20.4s, v20.4s, v12.4s\n" // row 1 accumulators (v20-v23) receive v12
+ "add v21.4s, v21.4s, v12.4s\n"
+ "add v22.4s, v22.4s, v12.4s\n"
+ "add v23.4s, v23.4s, v12.4s\n"
+ "add v24.4s, v24.4s, v13.4s\n" // row 2 accumulators (v24-v27) receive v13
+ "add v25.4s, v25.4s, v13.4s\n"
+ "add v26.4s, v26.4s, v13.4s\n"
+ "add v27.4s, v27.4s, v13.4s\n"
+ "ldr q0, [x9, #0x0]\n" // v0-v2 (and v3 below) = 16 int32 per-column values read through x9 (NOTE(review): x9 is set before this excerpt; Height 4's prologue copies col_bias into it)
+ "orr %x[flags], %x[flags], #0x80000000\n" // set flags bit 31 so later column blocks skip row-sum accumulation and reuse the scaled v11-v13 as they are
+ "ldr q1, [x9, #0x10]\n"
+ "add x23, %x[qp], %[per_layer_right_shift]\n" // x23 = address of qp's per_layer_right_shift field
+ "ldr q2, [x9, #0x20]\n"
+ "add x22, %x[qp], %[per_layer_mul]\n" // x22 = address of qp's per_layer_mul field
+ "add v16.4s, v16.4s, v0.4s\n" // the same per-column vector is added to the matching accumulator of every row
+ "add v20.4s, v20.4s, v0.4s\n"
+ "add v17.4s, v17.4s, v1.4s\n"
+ "add v21.4s, v21.4s, v1.4s\n"
+ "add v18.4s, v18.4s, v2.4s\n"
+ "add v22.4s, v22.4s, v2.4s\n"
+ "add v24.4s, v24.4s, v0.4s\n"
+ "add v25.4s, v25.4s, v1.4s\n"
+ "add v26.4s, v26.4s, v2.4s\n"
+ "ldr q3, [x9, #0x30]\n"
+ "ld1r { v0.4s }, [x23]\n" // v0 is free again (all its per-column adds are done) and now holds the broadcast shift value
+ "add x9, x9, #0x40\n" // advance past the 16 int32 values used by this column block
+ "ld1r { v4.4s }, [x22]\n" // v4 = broadcast per-layer multiplier
+ "add v19.4s, v19.4s, v3.4s\n"
+ "add v23.4s, v23.4s, v3.4s\n"
+ "add v27.4s, v27.4s, v3.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v4.4s\n" // saturating rounding doubling multiply, high half, applied to all 12 accumulators
+ "sqrdmulh v17.4s, v17.4s, v4.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v4.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v4.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v4.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v4.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v4.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v4.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v4.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v4.4s\n"
+ "sqrdmulh v26.4s, v26.4s, v4.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v4.4s\n"
+ "tbz %x[flags], #5, 80f\n" // flags bit 5 clear: no correction before the shift
+ "and v4.16b, v16.16b, v0.16b\n" // v4-v10 become scratch (the multiplier in v4 is no longer needed); sign bit survives the AND only where value and shift operand are both negative
+ "and v5.16b, v17.16b, v0.16b\n"
+ "and v6.16b, v18.16b, v0.16b\n"
+ "and v7.16b, v19.16b, v0.16b\n"
+ "and v8.16b, v20.16b, v0.16b\n"
+ "and v9.16b, v21.16b, v0.16b\n"
+ "and v10.16b, v22.16b, v0.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n" // arithmetic shift by 31: -1 in lanes whose sign bit survived, 0 elsewhere
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v4.4s\n" // saturating add of that 0 / -1 correction
+ "and v4.16b, v23.16b, v0.16b\n" // scratch registers are recycled for the remaining five accumulators (v23-v27)
+ "sqadd v17.4s, v17.4s, v5.4s\n"
+ "sqadd v18.4s, v18.4s, v6.4s\n"
+ "sqadd v19.4s, v19.4s, v7.4s\n"
+ "sqadd v20.4s, v20.4s, v8.4s\n"
+ "sqadd v21.4s, v21.4s, v9.4s\n"
+ "sqadd v22.4s, v22.4s, v10.4s\n"
+ "and v5.16b, v24.16b, v0.16b\n"
+ "and v6.16b, v25.16b, v0.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v7.16b, v26.16b, v0.16b\n"
+ "and v8.16b, v27.16b, v0.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v4.4s\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "sqadd v24.4s, v24.4s, v5.4s\n"
+ "sqadd v25.4s, v25.4s, v6.4s\n"
+ "sqadd v26.4s, v26.4s, v7.4s\n"
+ "sqadd v27.4s, v27.4s, v8.4s\n"
+ "80:" // Height 3: no shift correction. Requantize, part 2: rounding shift, add c_offset, clamp to [minval, maxval], narrow each row's 16 int32 lanes to 16 bytes
+ "srshl v16.4s, v16.4s, v0.4s\n" // signed rounding shift by the per-lane amount in v0 (a negative amount is a rounding right shift)
+ "srshl v17.4s, v17.4s, v0.4s\n"
+ "srshl v18.4s, v18.4s, v0.4s\n"
+ "srshl v19.4s, v19.4s, v0.4s\n"
+ "srshl v20.4s, v20.4s, v0.4s\n"
+ "srshl v21.4s, v21.4s, v0.4s\n"
+ "srshl v22.4s, v22.4s, v0.4s\n"
+ "srshl v23.4s, v23.4s, v0.4s\n"
+ "srshl v24.4s, v24.4s, v0.4s\n"
+ "srshl v25.4s, v25.4s, v0.4s\n"
+ "srshl v26.4s, v26.4s, v0.4s\n"
+ "srshl v27.4s, v27.4s, v0.4s\n"
+ "add x22, %x[qp], %[c_offset]\n" // x22 = address of qp's c_offset field
+ "add x23, %x[qp], %[minval]\n" // x23 = address of qp's minval field
+ "ld1r { v4.4s }, [x22]\n" // v4 = broadcast c_offset
+ "add x22, %x[qp], %[maxval]\n" // x22 reused for the address of qp's maxval field
+ "ld1r { v5.4s }, [x23]\n" // v5 = broadcast minval
+ "cmp x11, #0x10\n" // x11 = output columns still to write; these flags feed the 'bge 89f' after the narrowing (no flag-setting instruction in between)
+ "ld1r { v6.4s }, [x22]\n" // v6 = broadcast maxval
+ "add v16.4s, v16.4s, v4.4s\n"
+ "add v17.4s, v17.4s, v4.4s\n"
+ "add v18.4s, v18.4s, v4.4s\n"
+ "add v19.4s, v19.4s, v4.4s\n"
+ "add v20.4s, v20.4s, v4.4s\n"
+ "add v21.4s, v21.4s, v4.4s\n"
+ "add v22.4s, v22.4s, v4.4s\n"
+ "add v23.4s, v23.4s, v4.4s\n"
+ "add v24.4s, v24.4s, v4.4s\n"
+ "add v25.4s, v25.4s, v4.4s\n"
+ "smin v16.4s, v16.4s, v6.4s\n" // upper clamp against maxval
+ "smin v17.4s, v17.4s, v6.4s\n"
+ "smin v18.4s, v18.4s, v6.4s\n"
+ "smin v19.4s, v19.4s, v6.4s\n"
+ "smin v20.4s, v20.4s, v6.4s\n"
+ "smin v21.4s, v21.4s, v6.4s\n"
+ "smin v22.4s, v22.4s, v6.4s\n"
+ "smin v23.4s, v23.4s, v6.4s\n"
+ "smin v24.4s, v24.4s, v6.4s\n"
+ "smin v25.4s, v25.4s, v6.4s\n"
+ "smax v16.4s, v16.4s, v5.4s\n" // lower clamp against minval
+ "smax v17.4s, v17.4s, v5.4s\n"
+ "smax v18.4s, v18.4s, v5.4s\n"
+ "smax v19.4s, v19.4s, v5.4s\n"
+ "smax v20.4s, v20.4s, v5.4s\n"
+ "smax v21.4s, v21.4s, v5.4s\n"
+ "smax v22.4s, v22.4s, v5.4s\n"
+ "smax v23.4s, v23.4s, v5.4s\n"
+ "smax v24.4s, v24.4s, v5.4s\n"
+ "smax v25.4s, v25.4s, v5.4s\n"
+ "add v26.4s, v26.4s, v4.4s\n" // v26/v27 get the same offset and clamp, interleaved with the narrowing of the other registers
+ "add v27.4s, v27.4s, v4.4s\n"
+ "uzp1 v16.8h, v16.8h, v17.8h\n" // keep the even halfwords, i.e. the low 16 bits of each int32 lane: two vectors narrow into one
+ "uzp1 v17.8h, v18.8h, v19.8h\n"
+ "uzp1 v20.8h, v20.8h, v21.8h\n"
+ "uzp1 v21.8h, v22.8h, v23.8h\n"
+ "uzp1 v24.8h, v24.8h, v25.8h\n"
+ "smin v26.4s, v26.4s, v6.4s\n"
+ "smin v27.4s, v27.4s, v6.4s\n"
+ "uzp1 v16.16b, v16.16b, v17.16b\n" // keep the even bytes: v16 = the 16 output bytes of row 0
+ "uzp1 v20.16b, v20.16b, v21.16b\n" // v20 = the 16 output bytes of row 1
+ "smax v26.4s, v26.4s, v5.4s\n"
+ "smax v27.4s, v27.4s, v5.4s\n"
+ "uzp1 v25.8h, v26.8h, v27.8h\n"
+ "uzp1 v24.16b, v24.16b, v25.16b\n" // v24 = the 16 output bytes of row 2
+ "bge 89f\n" // uses the earlier 'cmp x11, #0x10': 16 or more columns left -> store a full 16 bytes per row; otherwise bits 3/2/1/0 of x11 select 8/4/2/1-byte stores
+ "tbz x11, #3, 84f\n"
+ "str d16, [x28], #0x8\n" // bytes 0..7 of each row; pointers post-incremented so the next piece lands right behind
+ "str d20, [x21], #0x8\n"
+ "str d24, [x20], #0x8\n"
+ "tbz x11, #2, 82f\n"
+ "st1 { v16.s }[2], [x28], #0x4\n" // bytes 8..11
+ "st1 { v20.s }[2], [x21], #0x4\n"
+ "st1 { v24.s }[2], [x20], #0x4\n"
+ "tbz x11, #1, 81f\n"
+ "st1 { v16.h }[6], [x28], #0x2\n" // bytes 12..13
+ "st1 { v20.h }[6], [x21], #0x2\n"
+ "st1 { v24.h }[6], [x20], #0x2\n"
+ "tbz x11, #0, 88f\n"
+ "st1 { v16.b }[14], [x28]\n" // byte 14 (15 columns in total)
+ "st1 { v20.b }[14], [x21]\n"
+ "st1 { v24.b }[14], [x20]\n"
+ "b 88f\n"
+ "81:" // Height 3: Partial direct writeback: partial_1_12
+ "tbz x11, #0, 88f\n"
+ "st1 { v16.b }[12], [x28]\n"
+ "st1 { v20.b }[12], [x21]\n"
+ "st1 { v24.b }[12], [x20]\n"
+ "b 88f\n"
+ "82:" // Height 3: Partial direct writeback: partial_2_8
+ "tbz x11, #1, 83f\n"
+ "st1 { v16.h }[4], [x28], #0x2\n"
+ "st1 { v20.h }[4], [x21], #0x2\n"
+ "st1 { v24.h }[4], [x20], #0x2\n"
+ "tbz x11, #0, 88f\n"
+ "st1 { v16.b }[10], [x28]\n"
+ "st1 { v20.b }[10], [x21]\n"
+ "st1 { v24.b }[10], [x20]\n"
+ "b 88f\n"
+ "83:" // Height 3: Partial direct writeback: partial_1_8
+ "tbz x11, #0, 88f\n"
+ "st1 { v16.b }[8], [x28]\n"
+ "st1 { v20.b }[8], [x21]\n"
+ "st1 { v24.b }[8], [x20]\n"
+ "b 88f\n"
+ "84:" // Height 3: Partial direct writeback: partial_4_0
+ "tbz x11, #2, 86f\n"
+ "str s16, [x28], #0x4\n" // bytes 0..3 when fewer than 8 columns remain
+ "str s20, [x21], #0x4\n"
+ "str s24, [x20], #0x4\n"
+ "tbz x11, #1, 85f\n"
+ "st1 { v16.h }[2], [x28], #0x2\n"
+ "st1 { v20.h }[2], [x21], #0x2\n"
+ "st1 { v24.h }[2], [x20], #0x2\n"
+ "tbz x11, #0, 88f\n"
+ "st1 { v16.b }[6], [x28]\n"
+ "st1 { v20.b }[6], [x21]\n"
+ "st1 { v24.b }[6], [x20]\n"
+ "b 88f\n"
+ "85:" // Height 3: Partial direct writeback: partial_1_4
+ "tbz x11, #0, 88f\n"
+ "st1 { v16.b }[4], [x28]\n"
+ "st1 { v20.b }[4], [x21]\n"
+ "st1 { v24.b }[4], [x20]\n"
+ "b 88f\n"
+ "86:" // Height 3: Partial direct writeback: partial_2_0
+ "tbz x11, #1, 87f\n"
+ "str h16, [x28], #0x2\n"
+ "str h20, [x21], #0x2\n"
+ "str h24, [x20], #0x2\n"
+ "tbz x11, #0, 88f\n"
+ "st1 { v16.b }[2], [x28]\n"
+ "st1 { v20.b }[2], [x21]\n"
+ "st1 { v24.b }[2], [x20]\n"
+ "b 88f\n"
+ "87:" // Height 3: Partial direct writeback: partial_1_0
+ "str b16, [x28, #0x0]\n" // bits 3..1 of x11 are clear here; one byte is stored without testing bit 0 (presumably x11 is never 0 at this point - the column loop only repeats while x11 > 0)
+ "str b20, [x21, #0x0]\n"
+ "str b24, [x20, #0x0]\n"
+ "88:" // Height 3: Partial direct writeback: Done
+ "b 90f\n"
+ "89:" // Height 3: Full writeback
+ "str q16, [x28, #0x0]\n"
+ "add x28, x28, #0x10\n" // only x28 is advanced: x21 and x20 are rebuilt from x28 + output_offset after each string loop
+ "str q20, [x21, #0x0]\n"
+ "str q24, [x20, #0x0]\n"
+ "90:" // Height 3: Writeback done
+ "subs x11, x11, #0x10\n" // 16 columns handled per column block
+ "bgt 62b\n" // columns remain: back to label 62 (defined before this excerpt; presumably the Height 3 column-loop head)
+ "b 122f\n" // all columns done; label 122 lies beyond this excerpt
+ "91:" // Height 4
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.16b, #0x1\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[col_bias]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "bic %x[flags], %x[flags], #0x80000000\n"
+ "mov x28, %x[output_ptr]\n"
+ "mov x19, #0x4\n"
+ "madd %x[output_ptr], x20, x19, %x[output_ptr]\n"
+ "92:" // Height 4: Column loop
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+ "93:" // Height 4: setup done
+ "mov x27, #0x0\n"
+ "94:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 95f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x22, [x20, #0x8]\n"
+ "ldr x21, [x20, #0x10]\n"
+ "ldr x20, [x20, #0x18]\n"
+ "cbnz x27, 96f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19\n"
+ "add x22, x22, x19\n"
+ "add x21, x21, x19\n"
+ "add x20, x20, x19\n"
+ "b 96f\n"
+ "95:" // Height 4: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x22, x25, x19\n"
+ "add x21, x22, x19\n"
+ "add x20, x21, x19\n"
+ "96:" // Height 4: input setup done
+ "cmp x26, #0x10\n"
+ "blt 101f\n"
+ "ldr q0, [x25, #0x0]\n"
+ "ldr q1, [x22, #0x0]\n"
+ "cmp x26, #0x20\n"
+ "ldr q2, [x21, #0x0]\n"
+ "ldr q3, [x20, #0x0]\n"
+ "ldr q4, [x10, #0x0]\n"
+ "blt 99f\n"
+ "97:" // Height 4: Multiply loop: Main loop head
+ ".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n"
+ ".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n"
+ ".inst 0x4f82e098 // sdot v24.4s, v4.16b, v2.4b[0]\n"
+ ".inst 0x4f83e09c // sdot v28.4s, v4.16b, v3.4b[0]\n"
+ "ldr d5, [x10, #0x10]\n"
+ "add x25, x25, #0x10\n"
+ "ldr x24, [x10, #0x18]\n"
+ "add x22, x22, #0x10\n"
+ "ldr d6, [x10, #0x20]\n"
+ "add x21, x21, #0x10\n"
+ "ldr x23, [x10, #0x28]\n"
+ "add x20, x20, #0x10\n"
+ "mov v5.d[1], x24\n"
+ "ldr d7, [x10, #0x30]\n"
+ "ldr x19, [x10, #0x38]\n"
+ "mov v6.d[1], x23\n"
+ ".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0b9 // sdot v25.4s, v5.16b, v2.4b[0]\n"
+ "mov v7.d[1], x19\n"
+ ".inst 0x4f83e0bd // sdot v29.4s, v5.16b, v3.4b[0]\n"
+ ".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0da // sdot v26.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0de // sdot v30.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0f7 // sdot v23.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0fb // sdot v27.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0ff // sdot v31.4s, v7.16b, v3.4b[0]\n"
+ "ldr d8, [x10, #0x40]\n"
+ "ldr x23, [x10, #0x48]\n"
+ "ldr d9, [x10, #0x50]\n"
+ "ldr x19, [x10, #0x58]\n"
+ "mov v8.d[1], x23\n"
+ "ldr d10, [x10, #0x60]\n"
+ "ldr x23, [x10, #0x68]\n"
+ "mov v9.d[1], x19\n"
+ ".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e114 // sdot v20.4s, v8.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e118 // sdot v24.4s, v8.16b, v2.4b[1]\n"
+ "mov v10.d[1], x23\n"
+ ".inst 0x4fa3e11c // sdot v28.4s, v8.16b, v3.4b[1]\n"
+ ".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e135 // sdot v21.4s, v9.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e139 // sdot v25.4s, v9.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e13d // sdot v29.4s, v9.16b, v3.4b[1]\n"
+ ".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e156 // sdot v22.4s, v10.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e15a // sdot v26.4s, v10.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e15e // sdot v30.4s, v10.16b, v3.4b[1]\n"
+ "ldr d4, [x10, #0x70]\n"
+ "ldr x19, [x10, #0x78]\n"
+ "ldr d5, [x10, #0x80]\n"
+ "ldr x24, [x10, #0x88]\n"
+ "mov v4.d[1], x19\n"
+ "ldr d6, [x10, #0x90]\n"
+ "ldr x23, [x10, #0x98]\n"
+ "mov v5.d[1], x24\n"
+ ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e09b // sdot v27.4s, v4.16b, v2.4b[1]\n"
+ "mov v6.d[1], x23\n"
+ ".inst 0x4fa3e09f // sdot v31.4s, v4.16b, v3.4b[1]\n"
+ ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8b4 // sdot v20.4s, v5.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8b8 // sdot v24.4s, v5.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8bc // sdot v28.4s, v5.16b, v3.4b[2]\n"
+ ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8d5 // sdot v21.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d9 // sdot v25.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8dd // sdot v29.4s, v6.16b, v3.4b[2]\n"
+ "ldr d7, [x10, #0xa0]\n"
+ "ldr x19, [x10, #0xa8]\n"
+ "ldr d8, [x10, #0xb0]\n"
+ "ldr x23, [x10, #0xb8]\n"
+ "mov v7.d[1], x19\n"
+ "ldr d9, [x10, #0xc0]\n"
+ "ldr x19, [x10, #0xc8]\n"
+ "mov v8.d[1], x23\n"
+ ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8f6 // sdot v22.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8fa // sdot v26.4s, v7.16b, v2.4b[2]\n"
+ "mov v9.d[1], x19\n"
+ ".inst 0x4f83e8fe // sdot v30.4s, v7.16b, v3.4b[2]\n"
+ ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n"
+ ".inst 0x4f81e917 // sdot v23.4s, v8.16b, v1.4b[2]\n"
+ ".inst 0x4f82e91b // sdot v27.4s, v8.16b, v2.4b[2]\n"
+ ".inst 0x4f83e91f // sdot v31.4s, v8.16b, v3.4b[2]\n"
+ ".inst 0x4fa0e930 // sdot v16.4s, v9.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e934 // sdot v20.4s, v9.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e938 // sdot v24.4s, v9.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e93c // sdot v28.4s, v9.16b, v3.4b[3]\n"
+ "ldr d10, [x10, #0xd0]\n"
+ "ldr x23, [x10, #0xd8]\n"
+ "ldr d4, [x10, #0xe0]\n"
+ "ldr x19, [x10, #0xe8]\n"
+ "mov v10.d[1], x23\n"
+ "ldr d5, [x10, #0xf0]\n"
+ "ldr x24, [x10, #0xf8]\n"
+ "add x10, x10, #0x100\n"
+ "mov v4.d[1], x19\n"
+ ".inst 0x4fa0e951 // sdot v17.4s, v10.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e955 // sdot v21.4s, v10.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e959 // sdot v25.4s, v10.16b, v2.4b[3]\n"
+ "mov v5.d[1], x24\n"
+ ".inst 0x4fa3e95d // sdot v29.4s, v10.16b, v3.4b[3]\n"
+ ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e896 // sdot v22.4s, v4.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e89a // sdot v26.4s, v4.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e89e // sdot v30.4s, v4.16b, v3.4b[3]\n"
+ ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8b7 // sdot v23.4s, v5.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8bb // sdot v27.4s, v5.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8bf // sdot v31.4s, v5.16b, v3.4b[3]\n"
+ "tbnz %x[flags], #31, 98f\n"
+ ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
+ ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
+ ".inst 0x4e8f946e // sdot v14.4s, v3.16b, v15.16b\n"
+ "98:" // Height 4: Multiply loop: unique 13: skip row sum
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "sub x26, x26, #0x10\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "cmp x26, #0x20\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "ldr q0, [x25, #0x0]\n"
+ "ldr q1, [x22, #0x0]\n"
+ "ldr q2, [x21, #0x0]\n"
+ "ldr q3, [x20, #0x0]\n"
+ "ldr q4, [x10, #0x0]\n"
+ "bge 97b\n"
+ "99:" // Height 4: Multiply loop: Single iteration only
+ ".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n"
+ ".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n"
+ ".inst 0x4f82e098 // sdot v24.4s, v4.16b, v2.4b[0]\n"
+ ".inst 0x4f83e09c // sdot v28.4s, v4.16b, v3.4b[0]\n"
+ "ldr q5, [x10, #0x10]\n"
+ "sub x26, x26, #0x10\n"
+ "ldr q6, [x10, #0x20]\n"
+ "add x25, x25, #0x10\n"
+ "ldr q7, [x10, #0x30]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0b9 // sdot v25.4s, v5.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0bd // sdot v29.4s, v5.16b, v3.4b[0]\n"
+ ".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0da // sdot v26.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0de // sdot v30.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0f7 // sdot v23.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0fb // sdot v27.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0ff // sdot v31.4s, v7.16b, v3.4b[0]\n"
+ "ldr q8, [x10, #0x40]\n"
+ "add x21, x21, #0x10\n"
+ "ldr q9, [x10, #0x50]\n"
+ "add x20, x20, #0x10\n"
+ "ldr q10, [x10, #0x60]\n"
+ ".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e114 // sdot v20.4s, v8.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e118 // sdot v24.4s, v8.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e11c // sdot v28.4s, v8.16b, v3.4b[1]\n"
+ ".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e135 // sdot v21.4s, v9.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e139 // sdot v25.4s, v9.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e13d // sdot v29.4s, v9.16b, v3.4b[1]\n"
+ ".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e156 // sdot v22.4s, v10.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e15a // sdot v26.4s, v10.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e15e // sdot v30.4s, v10.16b, v3.4b[1]\n"
+ "ldr q4, [x10, #0x70]\n"
+ "ldr q5, [x10, #0x80]\n"
+ "ldr q6, [x10, #0x90]\n"
+ ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e09b // sdot v27.4s, v4.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e09f // sdot v31.4s, v4.16b, v3.4b[1]\n"
+ ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8b4 // sdot v20.4s, v5.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8b8 // sdot v24.4s, v5.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8bc // sdot v28.4s, v5.16b, v3.4b[2]\n"
+ ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8d5 // sdot v21.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d9 // sdot v25.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8dd // sdot v29.4s, v6.16b, v3.4b[2]\n"
+ "ldr q7, [x10, #0xa0]\n"
+ "ldr q8, [x10, #0xb0]\n"
+ "ldr q9, [x10, #0xc0]\n"
+ ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8f6 // sdot v22.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8fa // sdot v26.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8fe // sdot v30.4s, v7.16b, v3.4b[2]\n"
+ ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n"
+ ".inst 0x4f81e917 // sdot v23.4s, v8.16b, v1.4b[2]\n"
+ ".inst 0x4f82e91b // sdot v27.4s, v8.16b, v2.4b[2]\n"
+ ".inst 0x4f83e91f // sdot v31.4s, v8.16b, v3.4b[2]\n"
+ ".inst 0x4fa0e930 // sdot v16.4s, v9.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e934 // sdot v20.4s, v9.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e938 // sdot v24.4s, v9.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e93c // sdot v28.4s, v9.16b, v3.4b[3]\n"
+ "ldr q10, [x10, #0xd0]\n"
+ "ldr q4, [x10, #0xe0]\n"
+ "ldr q5, [x10, #0xf0]\n"
+ "add x10, x10, #0x100\n"
+ ".inst 0x4fa0e951 // sdot v17.4s, v10.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e955 // sdot v21.4s, v10.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e959 // sdot v25.4s, v10.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e95d // sdot v29.4s, v10.16b, v3.4b[3]\n"
+ ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e896 // sdot v22.4s, v4.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e89a // sdot v26.4s, v4.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e89e // sdot v30.4s, v4.16b, v3.4b[3]\n"
+ ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8b7 // sdot v23.4s, v5.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8bb // sdot v27.4s, v5.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8bf // sdot v31.4s, v5.16b, v3.4b[3]\n"
+ "tbnz %x[flags], #31, 100f\n"
+ ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
+ ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
+ ".inst 0x4e8f946e // sdot v14.4s, v3.16b, v15.16b\n"
+ "100:" // Height 4: Multiply loop: unique 14: skip row sum
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "101:" // Height 4: Multiply loop: Main loop skip
+ "cbz x26, 108f\n"
+ "cmp x26, #0x4\n"
+ "blt 104f\n"
+ "102:" // Height 4: Multiply loop: Odd block loop
+ "ldr s0, [x25], #0x4\n"
+ "ldr s1, [x22], #0x4\n"
+ "ldr s2, [x21], #0x4\n"
+ "ldr s3, [x20], #0x4\n"
+ "tbnz %x[flags], #31, 103f\n"
+ ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
+ ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
+ ".inst 0x4e8f946e // sdot v14.4s, v3.16b, v15.16b\n"
+ "103:" // Height 4: Multiply loop: unique 15: skip row sum
+ "ldr q6, [x10, #0x0]\n"
+ "sub x26, x26, #0x4\n"
+ "ldr q7, [x10, #0x10]\n"
+ "cmp x26, #0x4\n"
+ "ldr q8, [x10, #0x20]\n"
+ ".inst 0x4f80e0d0 // sdot v16.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0d4 // sdot v20.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d8 // sdot v24.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0dc // sdot v28.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f80e0f1 // sdot v17.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0f5 // sdot v21.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f9 // sdot v25.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0fd // sdot v29.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f80e112 // sdot v18.4s, v8.16b, v0.4b[0]\n"
+ ".inst 0x4f81e116 // sdot v22.4s, v8.16b, v1.4b[0]\n"
+ ".inst 0x4f82e11a // sdot v26.4s, v8.16b, v2.4b[0]\n"
+ ".inst 0x4f83e11e // sdot v30.4s, v8.16b, v3.4b[0]\n"
+ "ldr q9, [x10, #0x30]\n"
+ "add x10, x10, #0x40\n"
+ ".inst 0x4f80e133 // sdot v19.4s, v9.16b, v0.4b[0]\n"
+ ".inst 0x4f81e137 // sdot v23.4s, v9.16b, v1.4b[0]\n"
+ ".inst 0x4f82e13b // sdot v27.4s, v9.16b, v2.4b[0]\n"
+ ".inst 0x4f83e13f // sdot v31.4s, v9.16b, v3.4b[0]\n"
+ "bge 102b\n"
+ "cbz x26, 108f\n"
+ "104:" // Height 4: Multiply loop: Skip odd blocks
+ "tbz x26, #1, 105f\n"
+ "ldr h0, [x25], #0x2\n"
+ "ldr h1, [x22], #0x2\n"
+ "ldr h2, [x21], #0x2\n"
+ "ldr h3, [x20], #0x2\n"
+ "tbz x26, #0, 106f\n"
+ "ld1 { v0.b }[2], [x25]\n"
+ "ld1 { v1.b }[2], [x22]\n"
+ "ld1 { v2.b }[2], [x21]\n"
+ "ld1 { v3.b }[2], [x20]\n"
+ "b 106f\n"
+ "105:" // Height 4: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x25, #0x0]\n"
+ "ldr b1, [x22, #0x0]\n"
+ "ldr b2, [x21, #0x0]\n"
+ "ldr b3, [x20, #0x0]\n"
+ "106:" // Height 4: Multiply loop: Ragged operand read: Done
+ "tbnz %x[flags], #31, 107f\n"
+ ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
+ ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
+ ".inst 0x4e8f946e // sdot v14.4s, v3.16b, v15.16b\n"
+ "107:" // Height 4: Multiply loop: unique 16: skip row sum
+ "ldr q10, [x10, #0x0]\n"
+ "ldr q4, [x10, #0x10]\n"
+ "ldr q5, [x10, #0x20]\n"
+ ".inst 0x4f80e150 // sdot v16.4s, v10.16b, v0.4b[0]\n"
+ ".inst 0x4f81e154 // sdot v20.4s, v10.16b, v1.4b[0]\n"
+ ".inst 0x4f82e158 // sdot v24.4s, v10.16b, v2.4b[0]\n"
+ ".inst 0x4f83e15c // sdot v28.4s, v10.16b, v3.4b[0]\n"
+ ".inst 0x4f80e091 // sdot v17.4s, v4.16b, v0.4b[0]\n"
+ ".inst 0x4f81e095 // sdot v21.4s, v4.16b, v1.4b[0]\n"
+ ".inst 0x4f82e099 // sdot v25.4s, v4.16b, v2.4b[0]\n"
+ ".inst 0x4f83e09d // sdot v29.4s, v4.16b, v3.4b[0]\n"
+ ".inst 0x4f80e0b2 // sdot v18.4s, v5.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0b6 // sdot v22.4s, v5.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0ba // sdot v26.4s, v5.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0be // sdot v30.4s, v5.16b, v3.4b[0]\n"
+ "ldr q6, [x10, #0x30]\n"
+ "add x10, x10, #0x40\n"
+ ".inst 0x4f80e0d3 // sdot v19.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0d7 // sdot v23.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0db // sdot v27.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0df // sdot v31.4s, v6.16b, v3.4b[0]\n"
+ "108:" // Height 4: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 94b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "prfm pstl1keep, [x28, #0x0]\n"
+ "add x21, x28, x19\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "add x20, x21, x19\n"
+ "prfm pstl1keep, [x20, #0x0]\n"
+ "add x19, x20, x19\n"
+ "prfm pstl1keep, [x19, #0x0]\n"
+ "tbnz %x[flags], #31, 109f\n"
+ "addp v11.4s, v11.4s, v11.4s\n"
+ "addp v12.4s, v12.4s, v12.4s\n"
+ "addp v13.4s, v13.4s, v13.4s\n"
+ "addp v14.4s, v14.4s, v14.4s\n"
+ "add x22, %x[qp], %[b_offset]\n"
+ "ld1r { v4.4s }, [x22]\n"
+ "addp v11.4s, v11.4s, v11.4s\n"
+ "addp v12.4s, v12.4s, v12.4s\n"
+ "neg v4.4s, v4.4s\n"
+ "addp v13.4s, v13.4s, v13.4s\n"
+ "addp v14.4s, v14.4s, v14.4s\n"
+ "mul v11.4s, v11.4s, v4.4s\n"
+ "mul v12.4s, v12.4s, v4.4s\n"
+ "mul v13.4s, v13.4s, v4.4s\n"
+ "mul v14.4s, v14.4s, v4.4s\n"
+ "109:" // Height 4: skip row sum fixup
+ "add v16.4s, v16.4s, v11.4s\n"
+ "add v17.4s, v17.4s, v11.4s\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "add v19.4s, v19.4s, v11.4s\n"
+ "add v20.4s, v20.4s, v12.4s\n"
+ "add v21.4s, v21.4s, v12.4s\n"
+ "add v22.4s, v22.4s, v12.4s\n"
+ "add v23.4s, v23.4s, v12.4s\n"
+ "add v24.4s, v24.4s, v13.4s\n"
+ "add v25.4s, v25.4s, v13.4s\n"
+ "add v26.4s, v26.4s, v13.4s\n"
+ "add v27.4s, v27.4s, v13.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "ldr q0, [x9, #0x0]\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "ldr q1, [x9, #0x10]\n"
+ "add x23, %x[qp], %[per_layer_right_shift]\n"
+ "ldr q2, [x9, #0x20]\n"
+ "add x22, %x[qp], %[per_layer_mul]\n"
+ "add v16.4s, v16.4s, v0.4s\n"
+ "add v20.4s, v20.4s, v0.4s\n"
+ "add v17.4s, v17.4s, v1.4s\n"
+ "add v21.4s, v21.4s, v1.4s\n"
+ "add v18.4s, v18.4s, v2.4s\n"
+ "add v22.4s, v22.4s, v2.4s\n"
+ "add v24.4s, v24.4s, v0.4s\n"
+ "add v25.4s, v25.4s, v1.4s\n"
+ "add v26.4s, v26.4s, v2.4s\n"
+ "add v28.4s, v28.4s, v0.4s\n"
+ "add v29.4s, v29.4s, v1.4s\n"
+ "add v30.4s, v30.4s, v2.4s\n"
+ "ldr q3, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
+ "ld1r { v0.4s }, [x23]\n"
+ "ld1r { v4.4s }, [x22]\n"
+ "add v19.4s, v19.4s, v3.4s\n"
+ "add v23.4s, v23.4s, v3.4s\n"
+ "add v27.4s, v27.4s, v3.4s\n"
+ "add v31.4s, v31.4s, v3.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v4.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v4.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v4.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v4.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v4.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v4.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v4.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v4.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v4.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v4.4s\n"
+ "sqrdmulh v26.4s, v26.4s, v4.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v4.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v4.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v4.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v4.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v4.4s\n"
+ "tbz %x[flags], #5, 110f\n"
+ "and v4.16b, v16.16b, v0.16b\n"
+ "and v5.16b, v17.16b, v0.16b\n"
+ "and v6.16b, v18.16b, v0.16b\n"
+ "and v7.16b, v19.16b, v0.16b\n"
+ "and v8.16b, v20.16b, v0.16b\n"
+ "and v9.16b, v21.16b, v0.16b\n"
+ "and v10.16b, v22.16b, v0.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v4.4s\n"
+ "and v4.16b, v23.16b, v0.16b\n"
+ "sqadd v17.4s, v17.4s, v5.4s\n"
+ "sqadd v18.4s, v18.4s, v6.4s\n"
+ "sqadd v19.4s, v19.4s, v7.4s\n"
+ "sqadd v20.4s, v20.4s, v8.4s\n"
+ "sqadd v21.4s, v21.4s, v9.4s\n"
+ "sqadd v22.4s, v22.4s, v10.4s\n"
+ "and v5.16b, v24.16b, v0.16b\n"
+ "and v6.16b, v25.16b, v0.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v7.16b, v26.16b, v0.16b\n"
+ "and v8.16b, v27.16b, v0.16b\n"
+ "and v9.16b, v28.16b, v0.16b\n"
+ "and v10.16b, v29.16b, v0.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v4.4s\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "and v4.16b, v30.16b, v0.16b\n"
+ "sqadd v24.4s, v24.4s, v5.4s\n"
+ "sqadd v25.4s, v25.4s, v6.4s\n"
+ "sqadd v26.4s, v26.4s, v7.4s\n"
+ "and v5.16b, v31.16b, v0.16b\n"
+ "sqadd v27.4s, v27.4s, v8.4s\n"
+ "sqadd v28.4s, v28.4s, v9.4s\n"
+ "sqadd v29.4s, v29.4s, v10.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sqadd v30.4s, v30.4s, v4.4s\n"
+ "sqadd v31.4s, v31.4s, v5.4s\n"
+ "110:" // Height 4: no shift correction
+ "srshl v16.4s, v16.4s, v0.4s\n"
+ "srshl v17.4s, v17.4s, v0.4s\n"
+ "srshl v18.4s, v18.4s, v0.4s\n"
+ "srshl v19.4s, v19.4s, v0.4s\n"
+ "srshl v20.4s, v20.4s, v0.4s\n"
+ "srshl v21.4s, v21.4s, v0.4s\n"
+ "srshl v22.4s, v22.4s, v0.4s\n"
+ "srshl v23.4s, v23.4s, v0.4s\n"
+ "srshl v24.4s, v24.4s, v0.4s\n"
+ "srshl v25.4s, v25.4s, v0.4s\n"
+ "srshl v26.4s, v26.4s, v0.4s\n"
+ "srshl v27.4s, v27.4s, v0.4s\n"
+ "srshl v28.4s, v28.4s, v0.4s\n"
+ "srshl v29.4s, v29.4s, v0.4s\n"
+ "srshl v30.4s, v30.4s, v0.4s\n"
+ "srshl v31.4s, v31.4s, v0.4s\n"
+ "add x22, %x[qp], %[c_offset]\n"
+ "add x23, %x[qp], %[minval]\n"
+ "ld1r { v4.4s }, [x22]\n"
+ "add x22, %x[qp], %[maxval]\n"
+ "ld1r { v5.4s }, [x23]\n"
+ "cmp x11, #0x10\n"
+ "ld1r { v6.4s }, [x22]\n"
+ "add v16.4s, v16.4s, v4.4s\n"
+ "add v17.4s, v17.4s, v4.4s\n"
+ "add v18.4s, v18.4s, v4.4s\n"
+ "add v19.4s, v19.4s, v4.4s\n"
+ "add v20.4s, v20.4s, v4.4s\n"
+ "add v21.4s, v21.4s, v4.4s\n"
+ "add v22.4s, v22.4s, v4.4s\n"
+ "add v23.4s, v23.4s, v4.4s\n"
+ "add v24.4s, v24.4s, v4.4s\n"
+ "add v25.4s, v25.4s, v4.4s\n"
+ "smin v16.4s, v16.4s, v6.4s\n"
+ "smin v17.4s, v17.4s, v6.4s\n"
+ "smin v18.4s, v18.4s, v6.4s\n"
+ "smin v19.4s, v19.4s, v6.4s\n"
+ "smin v20.4s, v20.4s, v6.4s\n"
+ "smin v21.4s, v21.4s, v6.4s\n"
+ "smin v22.4s, v22.4s, v6.4s\n"
+ "smin v23.4s, v23.4s, v6.4s\n"
+ "smin v24.4s, v24.4s, v6.4s\n"
+ "smin v25.4s, v25.4s, v6.4s\n"
+ "smax v16.4s, v16.4s, v5.4s\n"
+ "smax v17.4s, v17.4s, v5.4s\n"
+ "smax v18.4s, v18.4s, v5.4s\n"
+ "smax v19.4s, v19.4s, v5.4s\n"
+ "smax v20.4s, v20.4s, v5.4s\n"
+ "smax v21.4s, v21.4s, v5.4s\n"
+ "smax v22.4s, v22.4s, v5.4s\n"
+ "smax v23.4s, v23.4s, v5.4s\n"
+ "smax v24.4s, v24.4s, v5.4s\n"
+ "smax v25.4s, v25.4s, v5.4s\n"
+ "add v26.4s, v26.4s, v4.4s\n"
+ "add v27.4s, v27.4s, v4.4s\n"
+ "add v28.4s, v28.4s, v4.4s\n"
+ "add v29.4s, v29.4s, v4.4s\n"
+ "add v30.4s, v30.4s, v4.4s\n"
+ "add v31.4s, v31.4s, v4.4s\n"
+ "uzp1 v16.8h, v16.8h, v17.8h\n"
+ "uzp1 v17.8h, v18.8h, v19.8h\n"
+ "uzp1 v20.8h, v20.8h, v21.8h\n"
+ "uzp1 v21.8h, v22.8h, v23.8h\n"
+ "smin v26.4s, v26.4s, v6.4s\n"
+ "smin v27.4s, v27.4s, v6.4s\n"
+ "smin v28.4s, v28.4s, v6.4s\n"
+ "smin v29.4s, v29.4s, v6.4s\n"
+ "smin v30.4s, v30.4s, v6.4s\n"
+ "smin v31.4s, v31.4s, v6.4s\n"
+ "uzp1 v24.8h, v24.8h, v25.8h\n"
+ "uzp1 v16.16b, v16.16b, v17.16b\n"
+ "uzp1 v20.16b, v20.16b, v21.16b\n"
+ "smax v26.4s, v26.4s, v5.4s\n"
+ "smax v27.4s, v27.4s, v5.4s\n"
+ "smax v28.4s, v28.4s, v5.4s\n"
+ "smax v29.4s, v29.4s, v5.4s\n"
+ "smax v30.4s, v30.4s, v5.4s\n"
+ "smax v31.4s, v31.4s, v5.4s\n"
+ "uzp1 v25.8h, v26.8h, v27.8h\n"
+ "uzp1 v28.8h, v28.8h, v29.8h\n"
+ "uzp1 v29.8h, v30.8h, v31.8h\n"
+ "uzp1 v24.16b, v24.16b, v25.16b\n"
+ "uzp1 v28.16b, v28.16b, v29.16b\n"
+ "bge 119f\n"
+ "tbz x11, #3, 114f\n"
+ "str d16, [x28], #0x8\n"
+ "str d20, [x21], #0x8\n"
+ "str d24, [x20], #0x8\n"
+ "str d28, [x19], #0x8\n"
+ "tbz x11, #2, 112f\n"
+ "st1 { v16.s }[2], [x28], #0x4\n"
+ "st1 { v20.s }[2], [x21], #0x4\n"
+ "st1 { v24.s }[2], [x20], #0x4\n"
+ "st1 { v28.s }[2], [x19], #0x4\n"
+ "tbz x11, #1, 111f\n"
+ "st1 { v16.h }[6], [x28], #0x2\n"
+ "st1 { v20.h }[6], [x21], #0x2\n"
+ "st1 { v24.h }[6], [x20], #0x2\n"
+ "st1 { v28.h }[6], [x19], #0x2\n"
+ "tbz x11, #0, 118f\n"
+ "st1 { v16.b }[14], [x28]\n"
+ "st1 { v20.b }[14], [x21]\n"
+ "st1 { v24.b }[14], [x20]\n"
+ "st1 { v28.b }[14], [x19]\n"
+ "b 118f\n"
+ "111:" // Height 4: Partial direct writeback: partial_1_12
+ "tbz x11, #0, 118f\n"
+ "st1 { v16.b }[12], [x28]\n"
+ "st1 { v20.b }[12], [x21]\n"
+ "st1 { v24.b }[12], [x20]\n"
+ "st1 { v28.b }[12], [x19]\n"
+ "b 118f\n"
+ "112:" // Height 4: Partial direct writeback: partial_2_8
+ "tbz x11, #1, 113f\n"
+ "st1 { v16.h }[4], [x28], #0x2\n"
+ "st1 { v20.h }[4], [x21], #0x2\n"
+ "st1 { v24.h }[4], [x20], #0x2\n"
+ "st1 { v28.h }[4], [x19], #0x2\n"
+ "tbz x11, #0, 118f\n"
+ "st1 { v16.b }[10], [x28]\n"
+ "st1 { v20.b }[10], [x21]\n"
+ "st1 { v24.b }[10], [x20]\n"
+ "st1 { v28.b }[10], [x19]\n"
+ "b 118f\n"
+ "113:" // Height 4: Partial direct writeback: partial_1_8
+ "tbz x11, #0, 118f\n"
+ "st1 { v16.b }[8], [x28]\n"
+ "st1 { v20.b }[8], [x21]\n"
+ "st1 { v24.b }[8], [x20]\n"
+ "st1 { v28.b }[8], [x19]\n"
+ "b 118f\n"
+ "114:" // Height 4: Partial direct writeback: partial_4_0
+ "tbz x11, #2, 116f\n"
+ "str s16, [x28], #0x4\n"
+ "str s20, [x21], #0x4\n"
+ "str s24, [x20], #0x4\n"
+ "str s28, [x19], #0x4\n"
+ "tbz x11, #1, 115f\n"
+ "st1 { v16.h }[2], [x28], #0x2\n"
+ "st1 { v20.h }[2], [x21], #0x2\n"
+ "st1 { v24.h }[2], [x20], #0x2\n"
+ "st1 { v28.h }[2], [x19], #0x2\n"
+ "tbz x11, #0, 118f\n"
+ "st1 { v16.b }[6], [x28]\n"
+ "st1 { v20.b }[6], [x21]\n"
+ "st1 { v24.b }[6], [x20]\n"
+ "st1 { v28.b }[6], [x19]\n"
+ "b 118f\n"
+ "115:" // Height 4: Partial direct writeback: partial_1_4
+ "tbz x11, #0, 118f\n"
+ "st1 { v16.b }[4], [x28]\n"
+ "st1 { v20.b }[4], [x21]\n"
+ "st1 { v24.b }[4], [x20]\n"
+ "st1 { v28.b }[4], [x19]\n"
+ "b 118f\n"
+ "116:" // Height 4: Partial direct writeback: partial_2_0
+ "tbz x11, #1, 117f\n"
+ "str h16, [x28], #0x2\n"
+ "str h20, [x21], #0x2\n"
+ "str h24, [x20], #0x2\n"
+ "str h28, [x19], #0x2\n"
+ "tbz x11, #0, 118f\n"
+ "st1 { v16.b }[2], [x28]\n"
+ "st1 { v20.b }[2], [x21]\n"
+ "st1 { v24.b }[2], [x20]\n"
+ "st1 { v28.b }[2], [x19]\n"
+ "b 118f\n"
+ "117:" // Height 4: Partial direct writeback: partial_1_0
+ "str b16, [x28, #0x0]\n"
+ "str b20, [x21, #0x0]\n"
+ "str b24, [x20, #0x0]\n"
+ "str b28, [x19, #0x0]\n"
+ "118:" // Height 4: Partial direct writeback: Done
+ "b 120f\n"
+ "119:" // Height 4: Full writeback
+ "str q16, [x28, #0x0]\n"
+ "add x28, x28, #0x10\n"
+ "str q20, [x21, #0x0]\n"
+ "str q24, [x20, #0x0]\n"
+ "str q28, [x19, #0x0]\n"
+ "120:" // Height 4: Writeback done
+ "subs x11, x11, #0x10\n"
+ "bgt 92b\n"
+ "subs %x[M], %x[M], #0x4\n"
+ "beq 122f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 121f\n"
+ "add x20, x20, #0x4\n"
+ "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "121:" // Update direct input
+ "mov x19, #0x4\n"
+ "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "b 1b\n"
+ "122:" // Exit
+
+ : [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/generic.cpp
index 17575bd611..0adfb99f23 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/generic.cpp
@@ -81,216 +81,207 @@ void a64_hybrid_s8qa_dot_4x16 (
"1:" // Row loop
"cmp %x[M], #0x4\n"
- "bge 94f\n"
+ "bge 91f\n"
"cmp %x[M], #0x2\n"
- "bgt 63f\n"
- "beq 32f\n"
+ "bgt 61f\n"
+ "beq 31f\n"
"movi v11.4s, #0x0\n"
- "ldr x12, [%x[args_ptr], %[offsetof_N]]\n"
- "movi v12.4s, #0x0\n"
- "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x10, %x[col_bias]\n"
- "movi v13.4s, #0x0\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "bic %x[flags], %x[flags], #0x80000000\n"
- "movi v14.4s, #0x0\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
"movi v15.16b, #0x1\n"
- "tbz %x[flags], #2, 2f\n"
- "ldr x9, [%x[output_ptr], #0x0]\n"
- "add x9, x9, x19\n"
- "b 3f\n"
- "2:" // Height 1: setup direct output
- "mov x9, %x[output_ptr]\n"
- "3:" // Height 1: Column loop
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x27, %x[col_bias]\n"
+ "bic %x[flags], %x[flags], #0x80000000\n"
+ "mov x26, %x[output_ptr]\n"
+ "2:" // Height 1: Column loop
"movi v16.4s, #0x0\n"
"movi v17.4s, #0x0\n"
"movi v18.4s, #0x0\n"
"movi v19.4s, #0x0\n"
- "4:" // Height 1: setup done
- "mov x28, #0x0\n"
- "5:" // Height 1: String loop
+ "3:" // Height 1: setup done
+ "mov x25, #0x0\n"
+ "4:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w27, [x20, x28, LSL #0x2]\n"
- "tbz %x[flags], #3, 6f\n"
- "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "ldr w24, [x20, x25, LSL #0x2]\n"
+ "tbz %x[flags], #3, 5f\n"
+ "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x26, [x20, #0x0]\n"
- "cbnz x28, 7f\n"
+ "ldr x23, [x20, #0x0]\n"
+ "cbnz x25, 6f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x26, x26, x19\n"
- "b 7f\n"
- "6:" // Height 1: setup direct input
- "mov x26, %x[input_ptr]\n"
- "7:" // Height 1: input setup done
- "cmp x27, #0x10\n"
- "blt 12f\n"
- "cmp x27, #0x20\n"
- "blt 10f\n"
- "8:" // Height 1: Multiply loop: Main loop head
- "ldr q0, [x26, #0x0]\n"
- "ldr q4, [x11, #0x0]\n"
+ "add x23, x23, x19\n"
+ "b 6f\n"
+ "5:" // Height 1: setup direct input
+ "mov x23, %x[input_ptr]\n"
+ "6:" // Height 1: input setup done
+ "cmp x24, #0x10\n"
+ "blt 11f\n"
+ "ldr q0, [x23, #0x0]\n"
+ "ldr q4, [x28, #0x0]\n"
+ "cmp x24, #0x20\n"
+ "blt 9f\n"
+ "7:" // Height 1: Multiply loop: Main loop head
".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n"
- "ldr q5, [x11, #0x10]\n"
- "ldr q6, [x11, #0x20]\n"
+ "ldr q5, [x28, #0x10]\n"
+ "add x23, x23, #0x10\n"
".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n"
- "ldr q7, [x11, #0x30]\n"
+ "ldr q6, [x28, #0x20]\n"
+ "ldr q7, [x28, #0x30]\n"
".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n"
- "ldr q8, [x11, #0x40]\n"
- "ldr q9, [x11, #0x50]\n"
+ "ldr q8, [x28, #0x40]\n"
".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n"
- "ldr q10, [x11, #0x60]\n"
- "ldr q4, [x11, #0x70]\n"
+ "ldr q9, [x28, #0x50]\n"
+ "ldr q10, [x28, #0x60]\n"
".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n"
+ "ldr q4, [x28, #0x70]\n"
+ "ldr q5, [x28, #0x80]\n"
".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n"
- "ldr q5, [x11, #0x80]\n"
- "ldr q6, [x11, #0x90]\n"
".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n"
- "ldr q7, [x11, #0xa0]\n"
+ "ldr q6, [x28, #0x90]\n"
+ "ldr q7, [x28, #0xa0]\n"
".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n"
- "ldr q8, [x11, #0xb0]\n"
- "ldr q9, [x11, #0xc0]\n"
+ "ldr q8, [x28, #0xb0]\n"
".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n"
+ "ldr q9, [x28, #0xc0]\n"
+ "ldr q10, [x28, #0xd0]\n"
".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n"
- "ldr q10, [x11, #0xd0]\n"
- "ldr q4, [x11, #0xe0]\n"
".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n"
- "ldr q5, [x11, #0xf0]\n"
- "add x26, x26, #0x10\n"
+ "ldr q4, [x28, #0xe0]\n"
+ "ldr q5, [x28, #0xf0]\n"
+ ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n"
+ "add x28, x28, #0x100\n"
+ ".inst 0x4fa0e930 // sdot v16.4s, v9.16b, v0.4b[3]\n"
+ ".inst 0x4fa0e951 // sdot v17.4s, v10.16b, v0.4b[3]\n"
+ ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n"
+ ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n"
+ "tbnz %x[flags], #31, 8f\n"
+ ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
+ "8:" // Height 1: Multiply loop: unique 1: skip row sum
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "sub x24, x24, #0x10\n"
+ "ldr q0, [x23, #0x0]\n"
+ "cmp x24, #0x20\n"
+ "ldr q4, [x28, #0x0]\n"
+ "bge 7b\n"
+ "9:" // Height 1: Multiply loop: Single iteration only
+ ".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n"
+ "ldr q5, [x28, #0x10]\n"
+ "sub x24, x24, #0x10\n"
+ ".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n"
+ "ldr q6, [x28, #0x20]\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x28, #0x30]\n"
+ "ldr q8, [x28, #0x40]\n"
+ ".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n"
+ "ldr q9, [x28, #0x50]\n"
+ ".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n"
+ "ldr q10, [x28, #0x60]\n"
+ "ldr q4, [x28, #0x70]\n"
+ ".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n"
+ "ldr q5, [x28, #0x80]\n"
+ "ldr q6, [x28, #0x90]\n"
+ ".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n"
+ ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n"
+ "ldr q7, [x28, #0xa0]\n"
+ "ldr q8, [x28, #0xb0]\n"
+ ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n"
+ "ldr q9, [x28, #0xc0]\n"
+ ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n"
+ "ldr q10, [x28, #0xd0]\n"
+ "ldr q4, [x28, #0xe0]\n"
+ ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n"
".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n"
+ "ldr q5, [x28, #0xf0]\n"
+ "add x28, x28, #0x100\n"
".inst 0x4fa0e930 // sdot v16.4s, v9.16b, v0.4b[3]\n"
- "add x11, x11, #0x100\n"
".inst 0x4fa0e951 // sdot v17.4s, v10.16b, v0.4b[3]\n"
".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n"
".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n"
- "tbnz %x[flags], #31, 9f\n"
+ "tbnz %x[flags], #31, 10f\n"
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
- "9:" // Height 1: Multiply loop: unique 1: skip row sum
- "prfm pldl1keep, [x26, #0x80]\n"
- "sub x27, x27, #0x10\n"
- "cmp x27, #0x20\n"
- "bge 8b\n"
- "10:" // Height 1: Multiply loop: Single iteration only
- "sub x27, x27, #0x10\n"
- "ldr q0, [x26, #0x0]\n"
- "ldr q6, [x11, #0x0]\n"
+ "10:" // Height 1: Multiply loop: unique 2: skip row sum
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "11:" // Height 1: Multiply loop: Main loop skip
+ "cbz x24, 18f\n"
+ "cmp x24, #0x4\n"
+ "blt 14f\n"
+ "12:" // Height 1: Multiply loop: Odd block loop
+ "ldr s0, [x23], #0x4\n"
+ "tbnz %x[flags], #31, 13f\n"
+ ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
+ "13:" // Height 1: Multiply loop: unique 3: skip row sum
+ "ldr q6, [x28, #0x0]\n"
".inst 0x4f80e0d0 // sdot v16.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x11, #0x10]\n"
- "ldr q8, [x11, #0x20]\n"
+ "ldr q7, [x28, #0x10]\n"
+ "sub x24, x24, #0x4\n"
".inst 0x4f80e0f1 // sdot v17.4s, v7.16b, v0.4b[0]\n"
- "ldr q9, [x11, #0x30]\n"
+ "ldr q8, [x28, #0x20]\n"
+ "cmp x24, #0x4\n"
".inst 0x4f80e112 // sdot v18.4s, v8.16b, v0.4b[0]\n"
- "ldr q10, [x11, #0x40]\n"
- "ldr q4, [x11, #0x50]\n"
+ "ldr q9, [x28, #0x30]\n"
+ "add x28, x28, #0x40\n"
".inst 0x4f80e133 // sdot v19.4s, v9.16b, v0.4b[0]\n"
- "ldr q5, [x11, #0x60]\n"
- "ldr q6, [x11, #0x70]\n"
- ".inst 0x4fa0e150 // sdot v16.4s, v10.16b, v0.4b[1]\n"
- ".inst 0x4fa0e091 // sdot v17.4s, v4.16b, v0.4b[1]\n"
- "ldr q7, [x11, #0x80]\n"
- "ldr q8, [x11, #0x90]\n"
- ".inst 0x4fa0e0b2 // sdot v18.4s, v5.16b, v0.4b[1]\n"
- "ldr q9, [x11, #0xa0]\n"
- ".inst 0x4fa0e0d3 // sdot v19.4s, v6.16b, v0.4b[1]\n"
- "ldr q10, [x11, #0xb0]\n"
- "ldr q4, [x11, #0xc0]\n"
- ".inst 0x4f80e8f0 // sdot v16.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f80e911 // sdot v17.4s, v8.16b, v0.4b[2]\n"
- "ldr q5, [x11, #0xd0]\n"
- "ldr q6, [x11, #0xe0]\n"
- ".inst 0x4f80e932 // sdot v18.4s, v9.16b, v0.4b[2]\n"
- "ldr q7, [x11, #0xf0]\n"
- "add x26, x26, #0x10\n"
- ".inst 0x4f80e953 // sdot v19.4s, v10.16b, v0.4b[2]\n"
- ".inst 0x4fa0e890 // sdot v16.4s, v4.16b, v0.4b[3]\n"
- "add x11, x11, #0x100\n"
- ".inst 0x4fa0e8b1 // sdot v17.4s, v5.16b, v0.4b[3]\n"
- ".inst 0x4fa0e8d2 // sdot v18.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa0e8f3 // sdot v19.4s, v7.16b, v0.4b[3]\n"
- "tbnz %x[flags], #31, 11f\n"
+ "bge 12b\n"
+ "cbz x24, 18f\n"
+ "14:" // Height 1: Multiply loop: Skip odd blocks
+ "tbz x24, #1, 15f\n"
+ "ldr h0, [x23], #0x2\n"
+ "tbz x24, #0, 16f\n"
+ "ld1 { v0.b }[2], [x23]\n"
+ "b 16f\n"
+ "15:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x23, #0x0]\n"
+ "16:" // Height 1: Multiply loop: Ragged operand read: Done
+ "tbnz %x[flags], #31, 17f\n"
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
- "11:" // Height 1: Multiply loop: unique 2: skip row sum
- "prfm pldl1keep, [x26, #0x80]\n"
- "12:" // Height 1: Multiply loop: Main loop skip
- "cbz x27, 19f\n"
- "cmp x27, #0x4\n"
- "blt 15f\n"
- "13:" // Height 1: Multiply loop: Odd block loop
- "ldr s0, [x26], #0x4\n"
- "tbnz %x[flags], #31, 14f\n"
- ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
- "14:" // Height 1: Multiply loop: unique 3: skip row sum
- "ldr q8, [x11, #0x0]\n"
- ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q9, [x11, #0x10]\n"
- "ldr q10, [x11, #0x20]\n"
- ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr q4, [x11, #0x30]\n"
- ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- "sub x27, x27, #0x4\n"
- "add x11, x11, #0x40\n"
- ".inst 0x4f80e093 // sdot v19.4s, v4.16b, v0.4b[0]\n"
- "cmp x27, #0x4\n"
- "bge 13b\n"
- "cbz x27, 19f\n"
- "15:" // Height 1: Multiply loop: Skip odd blocks
- "tbz x27, #1, 16f\n"
- "ldr h0, [x26], #0x2\n"
- "tbz x27, #0, 17f\n"
- "ld1 { v0.b }[2], [x26]\n"
- "b 17f\n"
- "16:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x26, #0x0]\n"
- "17:" // Height 1: Multiply loop: Ragged operand read: Done
- "tbnz %x[flags], #31, 18f\n"
- ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
- "18:" // Height 1: Multiply loop: unique 4: skip row sum
- "ldr q5, [x11, #0x0]\n"
- ".inst 0x4f80e0b0 // sdot v16.4s, v5.16b, v0.4b[0]\n"
- "ldr q6, [x11, #0x10]\n"
- "ldr q7, [x11, #0x20]\n"
- ".inst 0x4f80e0d1 // sdot v17.4s, v6.16b, v0.4b[0]\n"
- "ldr q8, [x11, #0x30]\n"
- ".inst 0x4f80e0f2 // sdot v18.4s, v7.16b, v0.4b[0]\n"
- "add x11, x11, #0x40\n"
- ".inst 0x4f80e113 // sdot v19.4s, v8.16b, v0.4b[0]\n"
- "19:" // Height 1: Multiply loop: No odd multiplies
+ "17:" // Height 1: Multiply loop: unique 4: skip row sum
+ "ldr q10, [x28, #0x0]\n"
+ ".inst 0x4f80e150 // sdot v16.4s, v10.16b, v0.4b[0]\n"
+ "ldr q4, [x28, #0x10]\n"
+ "ldr q5, [x28, #0x20]\n"
+ ".inst 0x4f80e091 // sdot v17.4s, v4.16b, v0.4b[0]\n"
+ "ldr q6, [x28, #0x30]\n"
+ "add x28, x28, #0x40\n"
+ ".inst 0x4f80e0b2 // sdot v18.4s, v5.16b, v0.4b[0]\n"
+ ".inst 0x4f80e0d3 // sdot v19.4s, v6.16b, v0.4b[0]\n"
+ "18:" // Height 1: Multiply loop: No odd multiplies
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x28, x28, #0x1\n"
- "cmp x28, x19\n"
- "bne 5b\n"
- "prfm pstl1keep, [x9, #0x0]\n"
- "tbnz %x[flags], #31, 20f\n"
+ "add x25, x25, #0x1\n"
+ "cmp x25, x19\n"
+ "bne 4b\n"
+ "prfm pstl1keep, [x26, #0x0]\n"
+ "tbnz %x[flags], #31, 19f\n"
"addp v11.4s, v11.4s, v11.4s\n"
- "add x19, %x[qp], %[b_offset]\n"
+ "add x22, %x[qp], %[b_offset]\n"
"addp v11.4s, v11.4s, v11.4s\n"
- "ld1r { v1.4s }, [x19]\n"
+ "ld1r { v1.4s }, [x22]\n"
"neg v1.4s, v1.4s\n"
"mul v11.4s, v11.4s, v1.4s\n"
- "20:" // Height 1: skip row sum fixup
+ "19:" // Height 1: skip row sum fixup
"add v16.4s, v16.4s, v11.4s\n"
+ "ldr q0, [x27, #0x0]\n"
"orr %x[flags], %x[flags], #0x80000000\n"
"add v17.4s, v17.4s, v11.4s\n"
- "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x27, #0x10]\n"
+ "add x23, %x[qp], %[per_layer_right_shift]\n"
"add v18.4s, v18.4s, v11.4s\n"
- "ldr q1, [x10, #0x10]\n"
+ "ldr q2, [x27, #0x20]\n"
+ "add x22, %x[qp], %[per_layer_mul]\n"
"add v19.4s, v19.4s, v11.4s\n"
- "ldr q2, [x10, #0x20]\n"
- "ldr q3, [x10, #0x30]\n"
+ "ldr q3, [x27, #0x30]\n"
+ "add x27, x27, #0x40\n"
"add v16.4s, v16.4s, v0.4s\n"
- "add x20, %x[qp], %[per_layer_right_shift]\n"
- "ld1r { v0.4s }, [x20]\n"
+ "ld1r { v0.4s }, [x23]\n"
+ "ld1r { v4.4s }, [x22]\n"
"add v17.4s, v17.4s, v1.4s\n"
- "add x19, %x[qp], %[per_layer_mul]\n"
"add v18.4s, v18.4s, v2.4s\n"
- "ld1r { v4.4s }, [x19]\n"
- "add x10, x10, #0x40\n"
"add v19.4s, v19.4s, v3.4s\n"
"sqrdmulh v16.4s, v16.4s, v4.4s\n"
"sqrdmulh v17.4s, v17.4s, v4.4s\n"
"sqrdmulh v18.4s, v18.4s, v4.4s\n"
"sqrdmulh v19.4s, v19.4s, v4.4s\n"
- "tbz %x[flags], #5, 21f\n"
+ "tbz %x[flags], #5, 20f\n"
"and v4.16b, v16.16b, v0.16b\n"
"sshr v4.4s, v4.4s, #0x1f\n"
"and v5.16b, v17.16b, v0.16b\n"
@@ -303,18 +294,18 @@ void a64_hybrid_s8qa_dot_4x16 (
"sqadd v17.4s, v17.4s, v5.4s\n"
"sqadd v18.4s, v18.4s, v6.4s\n"
"sqadd v19.4s, v19.4s, v7.4s\n"
- "21:" // Height 1: no shift correction
+ "20:" // Height 1: no shift correction
"srshl v16.4s, v16.4s, v0.4s\n"
- "add x19, %x[qp], %[c_offset]\n"
- "ld1r { v4.4s }, [x19]\n"
+ "add x22, %x[qp], %[c_offset]\n"
+ "ld1r { v4.4s }, [x22]\n"
"srshl v17.4s, v17.4s, v0.4s\n"
- "add x19, %x[qp], %[minval]\n"
+ "add x22, %x[qp], %[minval]\n"
"srshl v18.4s, v18.4s, v0.4s\n"
- "ld1r { v5.4s }, [x19]\n"
- "add x19, %x[qp], %[maxval]\n"
+ "ld1r { v5.4s }, [x22]\n"
+ "add x22, %x[qp], %[maxval]\n"
"srshl v19.4s, v19.4s, v0.4s\n"
- "ld1r { v6.4s }, [x19]\n"
- "cmp x12, #0x10\n"
+ "ld1r { v6.4s }, [x22]\n"
+ "cmp x9, #0x10\n"
"add v16.4s, v16.4s, v4.4s\n"
"add v17.4s, v17.4s, v4.4s\n"
"add v18.4s, v18.4s, v4.4s\n"
@@ -330,80 +321,69 @@ void a64_hybrid_s8qa_dot_4x16 (
"smax v19.4s, v19.4s, v5.4s\n"
"uzp1 v17.8h, v18.8h, v19.8h\n"
"uzp1 v16.16b, v16.16b, v17.16b\n"
- "bge 30f\n"
- "tbz x12, #3, 25f\n"
- "str d16, [x9], #0x8\n"
- "tbz x12, #2, 23f\n"
- "st1 { v16.s }[2], [x9], #0x4\n"
- "tbz x12, #1, 22f\n"
- "st1 { v16.h }[6], [x9], #0x2\n"
- "tbz x12, #0, 29f\n"
- "st1 { v16.b }[14], [x9]\n"
- "b 29f\n"
- "22:" // Height 1: Partial direct writeback: partial_1_12
- "tbz x12, #0, 29f\n"
- "st1 { v16.b }[12], [x9]\n"
- "b 29f\n"
- "23:" // Height 1: Partial direct writeback: partial_2_8
- "tbz x12, #1, 24f\n"
- "st1 { v16.h }[4], [x9], #0x2\n"
- "tbz x12, #0, 29f\n"
- "st1 { v16.b }[10], [x9]\n"
- "b 29f\n"
- "24:" // Height 1: Partial direct writeback: partial_1_8
- "tbz x12, #0, 29f\n"
- "st1 { v16.b }[8], [x9]\n"
- "b 29f\n"
- "25:" // Height 1: Partial direct writeback: partial_4_0
- "tbz x12, #2, 27f\n"
- "str s16, [x9], #0x4\n"
- "tbz x12, #1, 26f\n"
- "st1 { v16.h }[2], [x9], #0x2\n"
- "tbz x12, #0, 29f\n"
- "st1 { v16.b }[6], [x9]\n"
- "b 29f\n"
- "26:" // Height 1: Partial direct writeback: partial_1_4
- "tbz x12, #0, 29f\n"
- "st1 { v16.b }[4], [x9]\n"
- "b 29f\n"
- "27:" // Height 1: Partial direct writeback: partial_2_0
- "tbz x12, #1, 28f\n"
- "str h16, [x9], #0x2\n"
- "tbz x12, #0, 29f\n"
- "st1 { v16.b }[2], [x9]\n"
- "b 29f\n"
- "28:" // Height 1: Partial direct writeback: partial_1_0
- "str b16, [x9, #0x0]\n"
- "29:" // Height 1: Partial direct writeback: Done
- "b 31f\n"
- "30:" // Height 1: Full writeback
- "str q16, [x9, #0x0]\n"
- "add x9, x9, #0x10\n"
- "31:" // Height 1: Writeback done
- "subs x12, x12, #0x10\n"
- "bgt 3b\n"
- "b 126f\n"
- "32:" // Height 2
+ "bge 29f\n"
+ "tbz x9, #3, 24f\n"
+ "str d16, [x26], #0x8\n"
+ "tbz x9, #2, 22f\n"
+ "st1 { v16.s }[2], [x26], #0x4\n"
+ "tbz x9, #1, 21f\n"
+ "st1 { v16.h }[6], [x26], #0x2\n"
+ "tbz x9, #0, 28f\n"
+ "st1 { v16.b }[14], [x26]\n"
+ "b 28f\n"
+ "21:" // Height 1: Partial direct writeback: partial_1_12
+ "tbz x9, #0, 28f\n"
+ "st1 { v16.b }[12], [x26]\n"
+ "b 28f\n"
+ "22:" // Height 1: Partial direct writeback: partial_2_8
+ "tbz x9, #1, 23f\n"
+ "st1 { v16.h }[4], [x26], #0x2\n"
+ "tbz x9, #0, 28f\n"
+ "st1 { v16.b }[10], [x26]\n"
+ "b 28f\n"
+ "23:" // Height 1: Partial direct writeback: partial_1_8
+ "tbz x9, #0, 28f\n"
+ "st1 { v16.b }[8], [x26]\n"
+ "b 28f\n"
+ "24:" // Height 1: Partial direct writeback: partial_4_0
+ "tbz x9, #2, 26f\n"
+ "str s16, [x26], #0x4\n"
+ "tbz x9, #1, 25f\n"
+ "st1 { v16.h }[2], [x26], #0x2\n"
+ "tbz x9, #0, 28f\n"
+ "st1 { v16.b }[6], [x26]\n"
+ "b 28f\n"
+ "25:" // Height 1: Partial direct writeback: partial_1_4
+ "tbz x9, #0, 28f\n"
+ "st1 { v16.b }[4], [x26]\n"
+ "b 28f\n"
+ "26:" // Height 1: Partial direct writeback: partial_2_0
+ "tbz x9, #1, 27f\n"
+ "str h16, [x26], #0x2\n"
+ "tbz x9, #0, 28f\n"
+ "st1 { v16.b }[2], [x26]\n"
+ "b 28f\n"
+ "27:" // Height 1: Partial direct writeback: partial_1_0
+ "str b16, [x26, #0x0]\n"
+ "28:" // Height 1: Partial direct writeback: Done
+ "b 30f\n"
+ "29:" // Height 1: Full writeback
+ "str q16, [x26, #0x0]\n"
+ "add x26, x26, #0x10\n"
+ "30:" // Height 1: Writeback done
+ "subs x9, x9, #0x10\n"
+ "bgt 2b\n"
+ "b 122f\n"
+ "31:" // Height 2
"movi v11.4s, #0x0\n"
- "ldr x12, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x10, %x[col_bias]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x27, %x[col_bias]\n"
"movi v12.4s, #0x0\n"
- "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"bic %x[flags], %x[flags], #0x80000000\n"
- "movi v13.4s, #0x0\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "movi v14.4s, #0x0\n"
"movi v15.16b, #0x1\n"
- "tbz %x[flags], #2, 33f\n"
- "ldr x9, [%x[output_ptr], #0x0]\n"
- "ldr x25, [%x[output_ptr], #0x8]\n"
- "add x9, x9, x19\n"
- "add x25, x25, x19\n"
- "b 34f\n"
- "33:" // Height 2: setup direct output
- "mov x9, %x[output_ptr]\n"
- "add x25, x9, x19\n"
- "34:" // Height 2: Column loop
+ "mov x26, %x[output_ptr]\n"
+ "32:" // Height 2: Column loop
"movi v16.4s, #0x0\n"
"movi v17.4s, #0x0\n"
"movi v18.4s, #0x0\n"
@@ -412,71 +392,135 @@ void a64_hybrid_s8qa_dot_4x16 (
"movi v21.4s, #0x0\n"
"movi v22.4s, #0x0\n"
"movi v23.4s, #0x0\n"
- "35:" // Height 2: setup done
- "mov x28, #0x0\n"
- "36:" // Height 2: String loop
+ "33:" // Height 2: setup done
+ "mov x25, #0x0\n"
+ "34:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w27, [x20, x28, LSL #0x2]\n"
- "tbz %x[flags], #3, 37f\n"
- "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "ldr w24, [x20, x25, LSL #0x2]\n"
+ "tbz %x[flags], #3, 35f\n"
+ "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x26, [x20, #0x0]\n"
- "ldr x24, [x20, #0x8]\n"
- "cbnz x28, 38f\n"
+ "ldr x23, [x20, #0x0]\n"
+ "ldr x22, [x20, #0x8]\n"
+ "cbnz x25, 36f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x26, x26, x19\n"
- "add x24, x24, x19\n"
- "b 38f\n"
- "37:" // Height 2: setup direct input
- "mov x26, %x[input_ptr]\n"
- "add x24, x26, x19\n"
- "38:" // Height 2: input setup done
- "cmp x27, #0x10\n"
- "blt 43f\n"
- "cmp x27, #0x20\n"
+ "add x23, x23, x19\n"
+ "add x22, x22, x19\n"
+ "b 36f\n"
+ "35:" // Height 2: setup direct input
+ "mov x23, %x[input_ptr]\n"
+ "add x22, x23, x19\n"
+ "36:" // Height 2: input setup done
+ "cmp x24, #0x10\n"
"blt 41f\n"
- "39:" // Height 2: Multiply loop: Main loop head
- "ldr q0, [x26, #0x0]\n"
- "ldr q1, [x24, #0x0]\n"
- "ldr q4, [x11, #0x0]\n"
+ "ldr q0, [x23, #0x0]\n"
+ "ldr q1, [x22, #0x0]\n"
+ "cmp x24, #0x20\n"
+ "ldr q4, [x28, #0x0]\n"
+ "blt 39f\n"
+ "37:" // Height 2: Multiply loop: Main loop head
".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n"
- "ldr q5, [x11, #0x10]\n"
+ "ldr q5, [x28, #0x10]\n"
+ "add x23, x23, #0x10\n"
".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n"
- "ldr q6, [x11, #0x20]\n"
- "ldr q7, [x11, #0x30]\n"
+ "ldr q6, [x28, #0x20]\n"
+ "add x22, x22, #0x10\n"
".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n"
- "ldr q8, [x11, #0x40]\n"
+ "ldr q7, [x28, #0x30]\n"
".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n"
- "ldr q9, [x11, #0x50]\n"
+ "ldr q8, [x28, #0x40]\n"
".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n"
- "ldr q10, [x11, #0x60]\n"
+ "ldr q9, [x28, #0x50]\n"
".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n"
- "ldr q4, [x11, #0x70]\n"
+ "ldr q10, [x28, #0x60]\n"
+ "ldr q4, [x28, #0x70]\n"
".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n"
- "ldr q5, [x11, #0x80]\n"
".inst 0x4f81e0f7 // sdot v23.4s, v7.16b, v1.4b[0]\n"
- "ldr q6, [x11, #0x90]\n"
+ "ldr q5, [x28, #0x80]\n"
".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n"
- "ldr q7, [x11, #0xa0]\n"
+ "ldr q6, [x28, #0x90]\n"
".inst 0x4fa1e114 // sdot v20.4s, v8.16b, v1.4b[1]\n"
- "ldr q8, [x11, #0xb0]\n"
+ "ldr q7, [x28, #0xa0]\n"
".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n"
- "add x26, x26, #0x10\n"
+ "ldr q8, [x28, #0xb0]\n"
+ ".inst 0x4fa1e135 // sdot v21.4s, v9.16b, v1.4b[1]\n"
+ "ldr q9, [x28, #0xc0]\n"
+ ".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e156 // sdot v22.4s, v10.16b, v1.4b[1]\n"
+ "ldr q10, [x28, #0xd0]\n"
+ ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n"
+ "ldr q4, [x28, #0xe0]\n"
+ ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8b4 // sdot v20.4s, v5.16b, v1.4b[2]\n"
+ "ldr q5, [x28, #0xf0]\n"
+ "add x28, x28, #0x100\n"
+ ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8d5 // sdot v21.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8f6 // sdot v22.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n"
+ ".inst 0x4f81e917 // sdot v23.4s, v8.16b, v1.4b[2]\n"
+ ".inst 0x4fa0e930 // sdot v16.4s, v9.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e934 // sdot v20.4s, v9.16b, v1.4b[3]\n"
+ ".inst 0x4fa0e951 // sdot v17.4s, v10.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e955 // sdot v21.4s, v10.16b, v1.4b[3]\n"
+ ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e896 // sdot v22.4s, v4.16b, v1.4b[3]\n"
+ ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8b7 // sdot v23.4s, v5.16b, v1.4b[3]\n"
+ "tbnz %x[flags], #31, 38f\n"
+ ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
+ "38:" // Height 2: Multiply loop: unique 5: skip row sum
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "sub x24, x24, #0x10\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "cmp x24, #0x20\n"
+ "ldr q0, [x23, #0x0]\n"
+ "ldr q1, [x22, #0x0]\n"
+ "ldr q4, [x28, #0x0]\n"
+ "bge 37b\n"
+ "39:" // Height 2: Multiply loop: Single iteration only
+ ".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n"
+ "ldr q5, [x28, #0x10]\n"
+ "sub x24, x24, #0x10\n"
+ ".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n"
+ "ldr q6, [x28, #0x20]\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n"
+ "ldr q7, [x28, #0x30]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n"
+ "ldr q8, [x28, #0x40]\n"
+ ".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n"
+ "ldr q9, [x28, #0x50]\n"
+ ".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n"
+ "ldr q10, [x28, #0x60]\n"
+ "ldr q4, [x28, #0x70]\n"
+ ".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0f7 // sdot v23.4s, v7.16b, v1.4b[0]\n"
+ "ldr q5, [x28, #0x80]\n"
+ ".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n"
+ "ldr q6, [x28, #0x90]\n"
+ ".inst 0x4fa1e114 // sdot v20.4s, v8.16b, v1.4b[1]\n"
+ "ldr q7, [x28, #0xa0]\n"
+ ".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n"
+ "ldr q8, [x28, #0xb0]\n"
".inst 0x4fa1e135 // sdot v21.4s, v9.16b, v1.4b[1]\n"
- "ldr q9, [x11, #0xc0]\n"
+ "ldr q9, [x28, #0xc0]\n"
".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n"
- "add x24, x24, #0x10\n"
".inst 0x4fa1e156 // sdot v22.4s, v10.16b, v1.4b[1]\n"
- "ldr q10, [x11, #0xd0]\n"
+ "ldr q10, [x28, #0xd0]\n"
".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n"
".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n"
- "ldr q4, [x11, #0xe0]\n"
+ "ldr q4, [x28, #0xe0]\n"
".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n"
".inst 0x4f81e8b4 // sdot v20.4s, v5.16b, v1.4b[2]\n"
- "ldr q5, [x11, #0xf0]\n"
+ "ldr q5, [x28, #0xf0]\n"
+ "add x28, x28, #0x100\n"
".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n"
- "add x11, x11, #0x100\n"
".inst 0x4f81e8d5 // sdot v21.4s, v6.16b, v1.4b[2]\n"
".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n"
".inst 0x4f81e8f6 // sdot v22.4s, v7.16b, v1.4b[2]\n"
@@ -493,171 +537,109 @@ void a64_hybrid_s8qa_dot_4x16 (
"tbnz %x[flags], #31, 40f\n"
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
- "40:" // Height 2: Multiply loop: unique 5: skip row sum
- "prfm pldl1keep, [x26, #0x80]\n"
- "sub x27, x27, #0x10\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "cmp x27, #0x20\n"
- "bge 39b\n"
- "41:" // Height 2: Multiply loop: Single iteration only
- "sub x27, x27, #0x10\n"
- "ldr q0, [x26, #0x0]\n"
- "ldr q1, [x24, #0x0]\n"
- "ldr q6, [x11, #0x0]\n"
+ "40:" // Height 2: Multiply loop: unique 6: skip row sum
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "41:" // Height 2: Multiply loop: Main loop skip
+ "cbz x24, 48f\n"
+ "cmp x24, #0x4\n"
+ "blt 44f\n"
+ "42:" // Height 2: Multiply loop: Odd block loop
+ "ldr s0, [x23], #0x4\n"
+ "ldr s1, [x22], #0x4\n"
+ "tbnz %x[flags], #31, 43f\n"
+ ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
+ "43:" // Height 2: Multiply loop: unique 7: skip row sum
+ "ldr q6, [x28, #0x0]\n"
".inst 0x4f80e0d0 // sdot v16.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x11, #0x10]\n"
+ "ldr q7, [x28, #0x10]\n"
+ "sub x24, x24, #0x4\n"
".inst 0x4f81e0d4 // sdot v20.4s, v6.16b, v1.4b[0]\n"
- "ldr q8, [x11, #0x20]\n"
- "ldr q9, [x11, #0x30]\n"
+ "ldr q8, [x28, #0x20]\n"
+ "cmp x24, #0x4\n"
".inst 0x4f80e0f1 // sdot v17.4s, v7.16b, v0.4b[0]\n"
- "ldr q10, [x11, #0x40]\n"
+ "ldr q9, [x28, #0x30]\n"
+ "add x28, x28, #0x40\n"
".inst 0x4f81e0f5 // sdot v21.4s, v7.16b, v1.4b[0]\n"
- "ldr q4, [x11, #0x50]\n"
".inst 0x4f80e112 // sdot v18.4s, v8.16b, v0.4b[0]\n"
- "ldr q5, [x11, #0x60]\n"
".inst 0x4f81e116 // sdot v22.4s, v8.16b, v1.4b[0]\n"
- "ldr q6, [x11, #0x70]\n"
".inst 0x4f80e133 // sdot v19.4s, v9.16b, v0.4b[0]\n"
- "ldr q7, [x11, #0x80]\n"
".inst 0x4f81e137 // sdot v23.4s, v9.16b, v1.4b[0]\n"
- "ldr q8, [x11, #0x90]\n"
- ".inst 0x4fa0e150 // sdot v16.4s, v10.16b, v0.4b[1]\n"
- "ldr q9, [x11, #0xa0]\n"
- ".inst 0x4fa1e154 // sdot v20.4s, v10.16b, v1.4b[1]\n"
- "ldr q10, [x11, #0xb0]\n"
- ".inst 0x4fa0e091 // sdot v17.4s, v4.16b, v0.4b[1]\n"
- "add x26, x26, #0x10\n"
- ".inst 0x4fa1e095 // sdot v21.4s, v4.16b, v1.4b[1]\n"
- "ldr q4, [x11, #0xc0]\n"
- ".inst 0x4fa0e0b2 // sdot v18.4s, v5.16b, v0.4b[1]\n"
- "add x24, x24, #0x10\n"
- ".inst 0x4fa1e0b6 // sdot v22.4s, v5.16b, v1.4b[1]\n"
- "ldr q5, [x11, #0xd0]\n"
- ".inst 0x4fa0e0d3 // sdot v19.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0d7 // sdot v23.4s, v6.16b, v1.4b[1]\n"
- "ldr q6, [x11, #0xe0]\n"
- ".inst 0x4f80e8f0 // sdot v16.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8f4 // sdot v20.4s, v7.16b, v1.4b[2]\n"
- "ldr q7, [x11, #0xf0]\n"
- ".inst 0x4f80e911 // sdot v17.4s, v8.16b, v0.4b[2]\n"
- "add x11, x11, #0x100\n"
- ".inst 0x4f81e915 // sdot v21.4s, v8.16b, v1.4b[2]\n"
- ".inst 0x4f80e932 // sdot v18.4s, v9.16b, v0.4b[2]\n"
- ".inst 0x4f81e936 // sdot v22.4s, v9.16b, v1.4b[2]\n"
- ".inst 0x4f80e953 // sdot v19.4s, v10.16b, v0.4b[2]\n"
- ".inst 0x4f81e957 // sdot v23.4s, v10.16b, v1.4b[2]\n"
- ".inst 0x4fa0e890 // sdot v16.4s, v4.16b, v0.4b[3]\n"
- ".inst 0x4fa1e894 // sdot v20.4s, v4.16b, v1.4b[3]\n"
- ".inst 0x4fa0e8b1 // sdot v17.4s, v5.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8b5 // sdot v21.4s, v5.16b, v1.4b[3]\n"
- ".inst 0x4fa0e8d2 // sdot v18.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8d6 // sdot v22.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa0e8f3 // sdot v19.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8f7 // sdot v23.4s, v7.16b, v1.4b[3]\n"
- "tbnz %x[flags], #31, 42f\n"
- ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
- ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
- "42:" // Height 2: Multiply loop: unique 6: skip row sum
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "43:" // Height 2: Multiply loop: Main loop skip
- "cbz x27, 50f\n"
- "cmp x27, #0x4\n"
- "blt 46f\n"
- "44:" // Height 2: Multiply loop: Odd block loop
- "ldr s0, [x26], #0x4\n"
- "ldr s1, [x24], #0x4\n"
- "tbnz %x[flags], #31, 45f\n"
+ "bge 42b\n"
+ "cbz x24, 48f\n"
+ "44:" // Height 2: Multiply loop: Skip odd blocks
+ "tbz x24, #1, 45f\n"
+ "ldr h0, [x23], #0x2\n"
+ "ldr h1, [x22], #0x2\n"
+ "tbz x24, #0, 46f\n"
+ "ld1 { v0.b }[2], [x23]\n"
+ "ld1 { v1.b }[2], [x22]\n"
+ "b 46f\n"
+ "45:" // Height 2: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x23, #0x0]\n"
+ "ldr b1, [x22, #0x0]\n"
+ "46:" // Height 2: Multiply loop: Ragged operand read: Done
+ "tbnz %x[flags], #31, 47f\n"
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
- "45:" // Height 2: Multiply loop: unique 7: skip row sum
- "ldr q8, [x11, #0x0]\n"
- ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q9, [x11, #0x10]\n"
- ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr q10, [x11, #0x20]\n"
- "ldr q4, [x11, #0x30]\n"
- ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "sub x27, x27, #0x4\n"
- ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- "cmp x27, #0x4\n"
- ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- "add x11, x11, #0x40\n"
- ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- ".inst 0x4f80e093 // sdot v19.4s, v4.16b, v0.4b[0]\n"
- ".inst 0x4f81e097 // sdot v23.4s, v4.16b, v1.4b[0]\n"
- "bge 44b\n"
- "cbz x27, 50f\n"
- "46:" // Height 2: Multiply loop: Skip odd blocks
- "tbz x27, #1, 47f\n"
- "ldr h0, [x26], #0x2\n"
- "ldr h1, [x24], #0x2\n"
- "tbz x27, #0, 48f\n"
- "ld1 { v0.b }[2], [x26]\n"
- "ld1 { v1.b }[2], [x24]\n"
- "b 48f\n"
- "47:" // Height 2: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x26, #0x0]\n"
- "ldr b1, [x24, #0x0]\n"
- "48:" // Height 2: Multiply loop: Ragged operand read: Done
- "tbnz %x[flags], #31, 49f\n"
- ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
- ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
- "49:" // Height 2: Multiply loop: unique 8: skip row sum
- "ldr q5, [x11, #0x0]\n"
- ".inst 0x4f80e0b0 // sdot v16.4s, v5.16b, v0.4b[0]\n"
- "ldr q6, [x11, #0x10]\n"
- ".inst 0x4f81e0b4 // sdot v20.4s, v5.16b, v1.4b[0]\n"
- "ldr q7, [x11, #0x20]\n"
- "ldr q8, [x11, #0x30]\n"
- ".inst 0x4f80e0d1 // sdot v17.4s, v6.16b, v0.4b[0]\n"
- "add x11, x11, #0x40\n"
- ".inst 0x4f81e0d5 // sdot v21.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f80e0f2 // sdot v18.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0f6 // sdot v22.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f80e113 // sdot v19.4s, v8.16b, v0.4b[0]\n"
- ".inst 0x4f81e117 // sdot v23.4s, v8.16b, v1.4b[0]\n"
- "50:" // Height 2: Multiply loop: No odd multiplies
+ "47:" // Height 2: Multiply loop: unique 8: skip row sum
+ "ldr q10, [x28, #0x0]\n"
+ ".inst 0x4f80e150 // sdot v16.4s, v10.16b, v0.4b[0]\n"
+ "ldr q4, [x28, #0x10]\n"
+ ".inst 0x4f81e154 // sdot v20.4s, v10.16b, v1.4b[0]\n"
+ "ldr q5, [x28, #0x20]\n"
+ "ldr q6, [x28, #0x30]\n"
+ ".inst 0x4f80e091 // sdot v17.4s, v4.16b, v0.4b[0]\n"
+ "add x28, x28, #0x40\n"
+ ".inst 0x4f81e095 // sdot v21.4s, v4.16b, v1.4b[0]\n"
+ ".inst 0x4f80e0b2 // sdot v18.4s, v5.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0b6 // sdot v22.4s, v5.16b, v1.4b[0]\n"
+ ".inst 0x4f80e0d3 // sdot v19.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0d7 // sdot v23.4s, v6.16b, v1.4b[0]\n"
+ "48:" // Height 2: Multiply loop: No odd multiplies
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x28, x28, #0x1\n"
- "cmp x28, x19\n"
- "bne 36b\n"
- "prfm pstl1keep, [x9, #0x0]\n"
- "prfm pstl1keep, [x25, #0x0]\n"
- "tbnz %x[flags], #31, 51f\n"
+ "add x25, x25, #0x1\n"
+ "cmp x25, x19\n"
+ "bne 34b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "prfm pstl1keep, [x26, #0x0]\n"
+ "add x21, x26, x19\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "tbnz %x[flags], #31, 49f\n"
"addp v11.4s, v11.4s, v11.4s\n"
- "add x19, %x[qp], %[b_offset]\n"
- "ld1r { v2.4s }, [x19]\n"
+ "add x22, %x[qp], %[b_offset]\n"
+ "ld1r { v2.4s }, [x22]\n"
"addp v12.4s, v12.4s, v12.4s\n"
"addp v11.4s, v11.4s, v11.4s\n"
"addp v12.4s, v12.4s, v12.4s\n"
"neg v2.4s, v2.4s\n"
"mul v11.4s, v11.4s, v2.4s\n"
"mul v12.4s, v12.4s, v2.4s\n"
- "51:" // Height 2: skip row sum fixup
+ "49:" // Height 2: skip row sum fixup
"add v16.4s, v16.4s, v11.4s\n"
+ "ldr q0, [x27, #0x0]\n"
"orr %x[flags], %x[flags], #0x80000000\n"
"add v17.4s, v17.4s, v11.4s\n"
- "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x27, #0x10]\n"
+ "add x23, %x[qp], %[per_layer_right_shift]\n"
"add v18.4s, v18.4s, v11.4s\n"
- "ldr q1, [x10, #0x10]\n"
+ "ldr q2, [x27, #0x20]\n"
+ "add x22, %x[qp], %[per_layer_mul]\n"
"add v19.4s, v19.4s, v11.4s\n"
- "ldr q2, [x10, #0x20]\n"
+ "ldr q3, [x27, #0x30]\n"
+ "add x27, x27, #0x40\n"
"add v20.4s, v20.4s, v12.4s\n"
- "ldr q3, [x10, #0x30]\n"
+ "ld1r { v4.4s }, [x22]\n"
"add v21.4s, v21.4s, v12.4s\n"
- "add x20, %x[qp], %[per_layer_right_shift]\n"
"add v22.4s, v22.4s, v12.4s\n"
- "add x19, %x[qp], %[per_layer_mul]\n"
- "ld1r { v4.4s }, [x19]\n"
"add v23.4s, v23.4s, v12.4s\n"
- "add x10, x10, #0x40\n"
"add v16.4s, v16.4s, v0.4s\n"
"add v17.4s, v17.4s, v1.4s\n"
"add v18.4s, v18.4s, v2.4s\n"
"add v19.4s, v19.4s, v3.4s\n"
"add v20.4s, v20.4s, v0.4s\n"
- "ld1r { v0.4s }, [x20]\n"
+ "ld1r { v0.4s }, [x23]\n"
"add v21.4s, v21.4s, v1.4s\n"
"add v22.4s, v22.4s, v2.4s\n"
"add v23.4s, v23.4s, v3.4s\n"
@@ -669,7 +651,7 @@ void a64_hybrid_s8qa_dot_4x16 (
"sqrdmulh v21.4s, v21.4s, v4.4s\n"
"sqrdmulh v22.4s, v22.4s, v4.4s\n"
"sqrdmulh v23.4s, v23.4s, v4.4s\n"
- "tbz %x[flags], #5, 52f\n"
+ "tbz %x[flags], #5, 50f\n"
"and v4.16b, v16.16b, v0.16b\n"
"sshr v4.4s, v4.4s, #0x1f\n"
"and v5.16b, v17.16b, v0.16b\n"
@@ -694,18 +676,18 @@ void a64_hybrid_s8qa_dot_4x16 (
"sqadd v21.4s, v21.4s, v9.4s\n"
"sqadd v22.4s, v22.4s, v10.4s\n"
"sqadd v23.4s, v23.4s, v4.4s\n"
- "52:" // Height 2: no shift correction
+ "50:" // Height 2: no shift correction
"srshl v16.4s, v16.4s, v0.4s\n"
- "add x19, %x[qp], %[c_offset]\n"
- "ld1r { v4.4s }, [x19]\n"
+ "add x22, %x[qp], %[c_offset]\n"
+ "ld1r { v4.4s }, [x22]\n"
"srshl v17.4s, v17.4s, v0.4s\n"
- "add x19, %x[qp], %[minval]\n"
+ "add x22, %x[qp], %[minval]\n"
"srshl v18.4s, v18.4s, v0.4s\n"
- "ld1r { v5.4s }, [x19]\n"
- "add x19, %x[qp], %[maxval]\n"
+ "ld1r { v5.4s }, [x22]\n"
+ "add x22, %x[qp], %[maxval]\n"
"srshl v19.4s, v19.4s, v0.4s\n"
- "ld1r { v6.4s }, [x19]\n"
- "cmp x12, #0x10\n"
+ "ld1r { v6.4s }, [x22]\n"
+ "cmp x9, #0x10\n"
"srshl v20.4s, v20.4s, v0.4s\n"
"srshl v21.4s, v21.4s, v0.4s\n"
"srshl v22.4s, v22.4s, v0.4s\n"
@@ -740,100 +722,86 @@ void a64_hybrid_s8qa_dot_4x16 (
"uzp1 v16.16b, v16.16b, v17.16b\n"
"uzp1 v21.8h, v22.8h, v23.8h\n"
"uzp1 v20.16b, v20.16b, v21.16b\n"
- "bge 61f\n"
- "tbz x12, #3, 56f\n"
- "str d16, [x9], #0x8\n"
- "str d20, [x25], #0x8\n"
- "tbz x12, #2, 54f\n"
- "st1 { v16.s }[2], [x9], #0x4\n"
- "st1 { v20.s }[2], [x25], #0x4\n"
- "tbz x12, #1, 53f\n"
- "st1 { v16.h }[6], [x9], #0x2\n"
- "st1 { v20.h }[6], [x25], #0x2\n"
- "tbz x12, #0, 60f\n"
- "st1 { v16.b }[14], [x9]\n"
- "st1 { v20.b }[14], [x25]\n"
- "b 60f\n"
- "53:" // Height 2: Partial direct writeback: partial_1_12
- "tbz x12, #0, 60f\n"
- "st1 { v16.b }[12], [x9]\n"
- "st1 { v20.b }[12], [x25]\n"
- "b 60f\n"
- "54:" // Height 2: Partial direct writeback: partial_2_8
- "tbz x12, #1, 55f\n"
- "st1 { v16.h }[4], [x9], #0x2\n"
- "st1 { v20.h }[4], [x25], #0x2\n"
- "tbz x12, #0, 60f\n"
- "st1 { v16.b }[10], [x9]\n"
- "st1 { v20.b }[10], [x25]\n"
- "b 60f\n"
- "55:" // Height 2: Partial direct writeback: partial_1_8
- "tbz x12, #0, 60f\n"
- "st1 { v16.b }[8], [x9]\n"
- "st1 { v20.b }[8], [x25]\n"
- "b 60f\n"
- "56:" // Height 2: Partial direct writeback: partial_4_0
- "tbz x12, #2, 58f\n"
- "str s16, [x9], #0x4\n"
- "str s20, [x25], #0x4\n"
- "tbz x12, #1, 57f\n"
- "st1 { v16.h }[2], [x9], #0x2\n"
- "st1 { v20.h }[2], [x25], #0x2\n"
- "tbz x12, #0, 60f\n"
- "st1 { v16.b }[6], [x9]\n"
- "st1 { v20.b }[6], [x25]\n"
- "b 60f\n"
- "57:" // Height 2: Partial direct writeback: partial_1_4
- "tbz x12, #0, 60f\n"
- "st1 { v16.b }[4], [x9]\n"
- "st1 { v20.b }[4], [x25]\n"
+ "bge 59f\n"
+ "tbz x9, #3, 54f\n"
+ "str d16, [x26], #0x8\n"
+ "str d20, [x21], #0x8\n"
+ "tbz x9, #2, 52f\n"
+ "st1 { v16.s }[2], [x26], #0x4\n"
+ "st1 { v20.s }[2], [x21], #0x4\n"
+ "tbz x9, #1, 51f\n"
+ "st1 { v16.h }[6], [x26], #0x2\n"
+ "st1 { v20.h }[6], [x21], #0x2\n"
+ "tbz x9, #0, 58f\n"
+ "st1 { v16.b }[14], [x26]\n"
+ "st1 { v20.b }[14], [x21]\n"
+ "b 58f\n"
+ "51:" // Height 2: Partial direct writeback: partial_1_12
+ "tbz x9, #0, 58f\n"
+ "st1 { v16.b }[12], [x26]\n"
+ "st1 { v20.b }[12], [x21]\n"
+ "b 58f\n"
+ "52:" // Height 2: Partial direct writeback: partial_2_8
+ "tbz x9, #1, 53f\n"
+ "st1 { v16.h }[4], [x26], #0x2\n"
+ "st1 { v20.h }[4], [x21], #0x2\n"
+ "tbz x9, #0, 58f\n"
+ "st1 { v16.b }[10], [x26]\n"
+ "st1 { v20.b }[10], [x21]\n"
+ "b 58f\n"
+ "53:" // Height 2: Partial direct writeback: partial_1_8
+ "tbz x9, #0, 58f\n"
+ "st1 { v16.b }[8], [x26]\n"
+ "st1 { v20.b }[8], [x21]\n"
+ "b 58f\n"
+ "54:" // Height 2: Partial direct writeback: partial_4_0
+ "tbz x9, #2, 56f\n"
+ "str s16, [x26], #0x4\n"
+ "str s20, [x21], #0x4\n"
+ "tbz x9, #1, 55f\n"
+ "st1 { v16.h }[2], [x26], #0x2\n"
+ "st1 { v20.h }[2], [x21], #0x2\n"
+ "tbz x9, #0, 58f\n"
+ "st1 { v16.b }[6], [x26]\n"
+ "st1 { v20.b }[6], [x21]\n"
+ "b 58f\n"
+ "55:" // Height 2: Partial direct writeback: partial_1_4
+ "tbz x9, #0, 58f\n"
+ "st1 { v16.b }[4], [x26]\n"
+ "st1 { v20.b }[4], [x21]\n"
+ "b 58f\n"
+ "56:" // Height 2: Partial direct writeback: partial_2_0
+ "tbz x9, #1, 57f\n"
+ "str h16, [x26], #0x2\n"
+ "str h20, [x21], #0x2\n"
+ "tbz x9, #0, 58f\n"
+ "st1 { v16.b }[2], [x26]\n"
+ "st1 { v20.b }[2], [x21]\n"
+ "b 58f\n"
+ "57:" // Height 2: Partial direct writeback: partial_1_0
+ "str b16, [x26, #0x0]\n"
+ "str b20, [x21, #0x0]\n"
+ "58:" // Height 2: Partial direct writeback: Done
"b 60f\n"
- "58:" // Height 2: Partial direct writeback: partial_2_0
- "tbz x12, #1, 59f\n"
- "str h16, [x9], #0x2\n"
- "str h20, [x25], #0x2\n"
- "tbz x12, #0, 60f\n"
- "st1 { v16.b }[2], [x9]\n"
- "st1 { v20.b }[2], [x25]\n"
- "b 60f\n"
- "59:" // Height 2: Partial direct writeback: partial_1_0
- "str b16, [x9, #0x0]\n"
- "str b20, [x25, #0x0]\n"
- "60:" // Height 2: Partial direct writeback: Done
- "b 62f\n"
- "61:" // Height 2: Full writeback
- "str q16, [x9, #0x0]\n"
- "str q20, [x25, #0x0]\n"
- "add x9, x9, #0x10\n"
- "add x25, x25, #0x10\n"
- "62:" // Height 2: Writeback done
- "subs x12, x12, #0x10\n"
- "bgt 34b\n"
- "b 126f\n"
- "63:" // Height 3
+ "59:" // Height 2: Full writeback
+ "str q16, [x26, #0x0]\n"
+ "add x26, x26, #0x10\n"
+ "str q20, [x21, #0x0]\n"
+ "60:" // Height 2: Writeback done
+ "subs x9, x9, #0x10\n"
+ "bgt 32b\n"
+ "b 122f\n"
+ "61:" // Height 3
"movi v11.4s, #0x0\n"
- "ldr x12, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x10, %x[col_bias]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x27, %x[col_bias]\n"
"movi v12.4s, #0x0\n"
- "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"bic %x[flags], %x[flags], #0x80000000\n"
"movi v13.4s, #0x0\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "movi v14.4s, #0x0\n"
+ "mov x26, %x[output_ptr]\n"
"movi v15.16b, #0x1\n"
- "tbz %x[flags], #2, 64f\n"
- "ldr x9, [%x[output_ptr], #0x0]\n"
- "ldr x25, [%x[output_ptr], #0x8]\n"
- "add x9, x9, x19\n"
- "ldr x23, [%x[output_ptr], #0x10]\n"
- "add x25, x25, x19\n"
- "add x23, x23, x19\n"
- "b 65f\n"
- "64:" // Height 3: setup direct output
- "mov x9, %x[output_ptr]\n"
- "add x25, x9, x19\n"
- "add x23, x25, x19\n"
- "65:" // Height 3: Column loop
+ "62:" // Height 3: Column loop
"movi v16.4s, #0x0\n"
"movi v17.4s, #0x0\n"
"movi v18.4s, #0x0\n"
@@ -846,85 +814,169 @@ void a64_hybrid_s8qa_dot_4x16 (
"movi v25.4s, #0x0\n"
"movi v26.4s, #0x0\n"
"movi v27.4s, #0x0\n"
- "66:" // Height 3: setup done
- "mov x28, #0x0\n"
- "67:" // Height 3: String loop
+ "63:" // Height 3: setup done
+ "mov x25, #0x0\n"
+ "64:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w27, [x20, x28, LSL #0x2]\n"
- "tbz %x[flags], #3, 68f\n"
- "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "ldr w24, [x20, x25, LSL #0x2]\n"
+ "tbz %x[flags], #3, 65f\n"
+ "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x26, [x20, #0x0]\n"
- "ldr x24, [x20, #0x8]\n"
- "ldr x22, [x20, #0x10]\n"
- "cbnz x28, 69f\n"
+ "ldr x23, [x20, #0x0]\n"
+ "ldr x22, [x20, #0x8]\n"
+ "ldr x21, [x20, #0x10]\n"
+ "cbnz x25, 66f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x26, x26, x19\n"
- "add x24, x24, x19\n"
+ "add x23, x23, x19\n"
"add x22, x22, x19\n"
- "b 69f\n"
- "68:" // Height 3: setup direct input
- "mov x26, %x[input_ptr]\n"
- "add x24, x26, x19\n"
- "add x22, x24, x19\n"
- "69:" // Height 3: input setup done
- "cmp x27, #0x10\n"
- "blt 74f\n"
- "cmp x27, #0x20\n"
- "blt 72f\n"
- "70:" // Height 3: Multiply loop: Main loop head
- "ldr q0, [x26, #0x0]\n"
- "ldr q1, [x24, #0x0]\n"
- "ldr q2, [x22, #0x0]\n"
- "ldr q4, [x11, #0x0]\n"
+ "add x21, x21, x19\n"
+ "b 66f\n"
+ "65:" // Height 3: setup direct input
+ "mov x23, %x[input_ptr]\n"
+ "add x22, x23, x19\n"
+ "add x21, x22, x19\n"
+ "66:" // Height 3: input setup done
+ "cmp x24, #0x10\n"
+ "blt 71f\n"
+ "ldr q0, [x23, #0x0]\n"
+ "ldr q1, [x22, #0x0]\n"
+ "cmp x24, #0x20\n"
+ "ldr q2, [x21, #0x0]\n"
+ "ldr q4, [x28, #0x0]\n"
+ "blt 69f\n"
+ "67:" // Height 3: Multiply loop: Main loop head
".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n"
- "ldr q5, [x11, #0x10]\n"
+ "ldr q5, [x28, #0x10]\n"
+ "add x23, x23, #0x10\n"
".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n"
- "ldr q6, [x11, #0x20]\n"
+ "ldr q6, [x28, #0x20]\n"
+ "add x22, x22, #0x10\n"
".inst 0x4f82e098 // sdot v24.4s, v4.16b, v2.4b[0]\n"
- "ldr q7, [x11, #0x30]\n"
- "ldr q8, [x11, #0x40]\n"
+ "ldr q7, [x28, #0x30]\n"
+ "add x21, x21, #0x10\n"
".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n"
- "ldr q9, [x11, #0x50]\n"
+ "ldr q8, [x28, #0x40]\n"
".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n"
- "ldr q10, [x11, #0x60]\n"
+ "ldr q9, [x28, #0x50]\n"
".inst 0x4f82e0b9 // sdot v25.4s, v5.16b, v2.4b[0]\n"
- "ldr q4, [x11, #0x70]\n"
+ "ldr q10, [x28, #0x60]\n"
".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n"
- "ldr q5, [x11, #0x80]\n"
+ "ldr q4, [x28, #0x70]\n"
".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n"
- "add x26, x26, #0x10\n"
+ "ldr q5, [x28, #0x80]\n"
".inst 0x4f82e0da // sdot v26.4s, v6.16b, v2.4b[0]\n"
- "ldr q6, [x11, #0x90]\n"
+ "ldr q6, [x28, #0x90]\n"
".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n"
- "add x24, x24, #0x10\n"
".inst 0x4f81e0f7 // sdot v23.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0fb // sdot v27.4s, v7.16b, v2.4b[0]\n"
+ "ldr q7, [x28, #0xa0]\n"
+ ".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e114 // sdot v20.4s, v8.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e118 // sdot v24.4s, v8.16b, v2.4b[1]\n"
+ "ldr q8, [x28, #0xb0]\n"
+ ".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e135 // sdot v21.4s, v9.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e139 // sdot v25.4s, v9.16b, v2.4b[1]\n"
+ "ldr q9, [x28, #0xc0]\n"
+ ".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e156 // sdot v22.4s, v10.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e15a // sdot v26.4s, v10.16b, v2.4b[1]\n"
+ "ldr q10, [x28, #0xd0]\n"
+ ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e09b // sdot v27.4s, v4.16b, v2.4b[1]\n"
+ "ldr q4, [x28, #0xe0]\n"
+ ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8b4 // sdot v20.4s, v5.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8b8 // sdot v24.4s, v5.16b, v2.4b[2]\n"
+ "ldr q5, [x28, #0xf0]\n"
+ "add x28, x28, #0x100\n"
+ ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8d5 // sdot v21.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d9 // sdot v25.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8f6 // sdot v22.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8fa // sdot v26.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n"
+ ".inst 0x4f81e917 // sdot v23.4s, v8.16b, v1.4b[2]\n"
+ ".inst 0x4f82e91b // sdot v27.4s, v8.16b, v2.4b[2]\n"
+ ".inst 0x4fa0e930 // sdot v16.4s, v9.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e934 // sdot v20.4s, v9.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e938 // sdot v24.4s, v9.16b, v2.4b[3]\n"
+ ".inst 0x4fa0e951 // sdot v17.4s, v10.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e955 // sdot v21.4s, v10.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e959 // sdot v25.4s, v10.16b, v2.4b[3]\n"
+ ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e896 // sdot v22.4s, v4.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e89a // sdot v26.4s, v4.16b, v2.4b[3]\n"
+ ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8b7 // sdot v23.4s, v5.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8bb // sdot v27.4s, v5.16b, v2.4b[3]\n"
+ "tbnz %x[flags], #31, 68f\n"
+ ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
+ ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
+ "68:" // Height 3: Multiply loop: unique 9: skip row sum
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "sub x24, x24, #0x10\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "cmp x24, #0x20\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
+ "ldr q0, [x23, #0x0]\n"
+ "ldr q1, [x22, #0x0]\n"
+ "ldr q2, [x21, #0x0]\n"
+ "ldr q4, [x28, #0x0]\n"
+ "bge 67b\n"
+ "69:" // Height 3: Multiply loop: Single iteration only
+ ".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n"
+ "ldr q5, [x28, #0x10]\n"
+ "sub x24, x24, #0x10\n"
+ ".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n"
+ "ldr q6, [x28, #0x20]\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x4f82e098 // sdot v24.4s, v4.16b, v2.4b[0]\n"
+ "ldr q7, [x28, #0x30]\n"
"add x22, x22, #0x10\n"
+ ".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n"
+ "ldr q8, [x28, #0x40]\n"
+ "add x21, x21, #0x10\n"
+ ".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n"
+ "ldr q9, [x28, #0x50]\n"
+ ".inst 0x4f82e0b9 // sdot v25.4s, v5.16b, v2.4b[0]\n"
+ "ldr q10, [x28, #0x60]\n"
+ ".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n"
+ "ldr q4, [x28, #0x70]\n"
+ ".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n"
+ "ldr q5, [x28, #0x80]\n"
+ ".inst 0x4f82e0da // sdot v26.4s, v6.16b, v2.4b[0]\n"
+ "ldr q6, [x28, #0x90]\n"
+ ".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0f7 // sdot v23.4s, v7.16b, v1.4b[0]\n"
".inst 0x4f82e0fb // sdot v27.4s, v7.16b, v2.4b[0]\n"
- "ldr q7, [x11, #0xa0]\n"
+ "ldr q7, [x28, #0xa0]\n"
".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n"
".inst 0x4fa1e114 // sdot v20.4s, v8.16b, v1.4b[1]\n"
".inst 0x4fa2e118 // sdot v24.4s, v8.16b, v2.4b[1]\n"
- "ldr q8, [x11, #0xb0]\n"
+ "ldr q8, [x28, #0xb0]\n"
".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n"
".inst 0x4fa1e135 // sdot v21.4s, v9.16b, v1.4b[1]\n"
".inst 0x4fa2e139 // sdot v25.4s, v9.16b, v2.4b[1]\n"
- "ldr q9, [x11, #0xc0]\n"
+ "ldr q9, [x28, #0xc0]\n"
".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n"
".inst 0x4fa1e156 // sdot v22.4s, v10.16b, v1.4b[1]\n"
".inst 0x4fa2e15a // sdot v26.4s, v10.16b, v2.4b[1]\n"
- "ldr q10, [x11, #0xd0]\n"
+ "ldr q10, [x28, #0xd0]\n"
".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n"
".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n"
".inst 0x4fa2e09b // sdot v27.4s, v4.16b, v2.4b[1]\n"
- "ldr q4, [x11, #0xe0]\n"
+ "ldr q4, [x28, #0xe0]\n"
".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n"
".inst 0x4f81e8b4 // sdot v20.4s, v5.16b, v1.4b[2]\n"
".inst 0x4f82e8b8 // sdot v24.4s, v5.16b, v2.4b[2]\n"
- "ldr q5, [x11, #0xf0]\n"
+ "ldr q5, [x28, #0xf0]\n"
+ "add x28, x28, #0x100\n"
".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n"
- "add x11, x11, #0x100\n"
".inst 0x4f81e8d5 // sdot v21.4s, v6.16b, v1.4b[2]\n"
".inst 0x4f82e8d9 // sdot v25.4s, v6.16b, v2.4b[2]\n"
".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n"
@@ -945,181 +997,100 @@ void a64_hybrid_s8qa_dot_4x16 (
".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n"
".inst 0x4fa1e8b7 // sdot v23.4s, v5.16b, v1.4b[3]\n"
".inst 0x4fa2e8bb // sdot v27.4s, v5.16b, v2.4b[3]\n"
- "tbnz %x[flags], #31, 71f\n"
+ "tbnz %x[flags], #31, 70f\n"
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
- "71:" // Height 3: Multiply loop: unique 9: skip row sum
- "prfm pldl1keep, [x26, #0x80]\n"
- "sub x27, x27, #0x10\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "cmp x27, #0x20\n"
+ "70:" // Height 3: Multiply loop: unique 10: skip row sum
+ "prfm pldl1keep, [x23, #0x80]\n"
"prfm pldl1keep, [x22, #0x80]\n"
- "bge 70b\n"
- "72:" // Height 3: Multiply loop: Single iteration only
- "sub x27, x27, #0x10\n"
- "ldr q0, [x26, #0x0]\n"
- "ldr q1, [x24, #0x0]\n"
- "ldr q2, [x22, #0x0]\n"
- "ldr q6, [x11, #0x0]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
+ "71:" // Height 3: Multiply loop: Main loop skip
+ "cbz x24, 78f\n"
+ "cmp x24, #0x4\n"
+ "blt 74f\n"
+ "72:" // Height 3: Multiply loop: Odd block loop
+ "ldr s0, [x23], #0x4\n"
+ "ldr s1, [x22], #0x4\n"
+ "ldr s2, [x21], #0x4\n"
+ "tbnz %x[flags], #31, 73f\n"
+ ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
+ ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
+ "73:" // Height 3: Multiply loop: unique 11: skip row sum
+ "ldr q6, [x28, #0x0]\n"
".inst 0x4f80e0d0 // sdot v16.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x11, #0x10]\n"
+ "ldr q7, [x28, #0x10]\n"
+ "sub x24, x24, #0x4\n"
".inst 0x4f81e0d4 // sdot v20.4s, v6.16b, v1.4b[0]\n"
- "ldr q8, [x11, #0x20]\n"
+ "ldr q8, [x28, #0x20]\n"
+ "cmp x24, #0x4\n"
".inst 0x4f82e0d8 // sdot v24.4s, v6.16b, v2.4b[0]\n"
- "ldr q9, [x11, #0x30]\n"
- "ldr q10, [x11, #0x40]\n"
+ "ldr q9, [x28, #0x30]\n"
+ "add x28, x28, #0x40\n"
".inst 0x4f80e0f1 // sdot v17.4s, v7.16b, v0.4b[0]\n"
- "ldr q4, [x11, #0x50]\n"
".inst 0x4f81e0f5 // sdot v21.4s, v7.16b, v1.4b[0]\n"
- "ldr q5, [x11, #0x60]\n"
".inst 0x4f82e0f9 // sdot v25.4s, v7.16b, v2.4b[0]\n"
- "ldr q6, [x11, #0x70]\n"
".inst 0x4f80e112 // sdot v18.4s, v8.16b, v0.4b[0]\n"
- "ldr q7, [x11, #0x80]\n"
".inst 0x4f81e116 // sdot v22.4s, v8.16b, v1.4b[0]\n"
- "add x26, x26, #0x10\n"
".inst 0x4f82e11a // sdot v26.4s, v8.16b, v2.4b[0]\n"
- "ldr q8, [x11, #0x90]\n"
".inst 0x4f80e133 // sdot v19.4s, v9.16b, v0.4b[0]\n"
- "add x24, x24, #0x10\n"
".inst 0x4f81e137 // sdot v23.4s, v9.16b, v1.4b[0]\n"
- "add x22, x22, #0x10\n"
".inst 0x4f82e13b // sdot v27.4s, v9.16b, v2.4b[0]\n"
- "ldr q9, [x11, #0xa0]\n"
- ".inst 0x4fa0e150 // sdot v16.4s, v10.16b, v0.4b[1]\n"
- ".inst 0x4fa1e154 // sdot v20.4s, v10.16b, v1.4b[1]\n"
- ".inst 0x4fa2e158 // sdot v24.4s, v10.16b, v2.4b[1]\n"
- "ldr q10, [x11, #0xb0]\n"
- ".inst 0x4fa0e091 // sdot v17.4s, v4.16b, v0.4b[1]\n"
- ".inst 0x4fa1e095 // sdot v21.4s, v4.16b, v1.4b[1]\n"
- ".inst 0x4fa2e099 // sdot v25.4s, v4.16b, v2.4b[1]\n"
- "ldr q4, [x11, #0xc0]\n"
- ".inst 0x4fa0e0b2 // sdot v18.4s, v5.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0b6 // sdot v22.4s, v5.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0ba // sdot v26.4s, v5.16b, v2.4b[1]\n"
- "ldr q5, [x11, #0xd0]\n"
- ".inst 0x4fa0e0d3 // sdot v19.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0d7 // sdot v23.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0db // sdot v27.4s, v6.16b, v2.4b[1]\n"
- "ldr q6, [x11, #0xe0]\n"
- ".inst 0x4f80e8f0 // sdot v16.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8f4 // sdot v20.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8f8 // sdot v24.4s, v7.16b, v2.4b[2]\n"
- "ldr q7, [x11, #0xf0]\n"
- ".inst 0x4f80e911 // sdot v17.4s, v8.16b, v0.4b[2]\n"
- "add x11, x11, #0x100\n"
- ".inst 0x4f81e915 // sdot v21.4s, v8.16b, v1.4b[2]\n"
- ".inst 0x4f82e919 // sdot v25.4s, v8.16b, v2.4b[2]\n"
- ".inst 0x4f80e932 // sdot v18.4s, v9.16b, v0.4b[2]\n"
- ".inst 0x4f81e936 // sdot v22.4s, v9.16b, v1.4b[2]\n"
- ".inst 0x4f82e93a // sdot v26.4s, v9.16b, v2.4b[2]\n"
- ".inst 0x4f80e953 // sdot v19.4s, v10.16b, v0.4b[2]\n"
- ".inst 0x4f81e957 // sdot v23.4s, v10.16b, v1.4b[2]\n"
- ".inst 0x4f82e95b // sdot v27.4s, v10.16b, v2.4b[2]\n"
- ".inst 0x4fa0e890 // sdot v16.4s, v4.16b, v0.4b[3]\n"
- ".inst 0x4fa1e894 // sdot v20.4s, v4.16b, v1.4b[3]\n"
- ".inst 0x4fa2e898 // sdot v24.4s, v4.16b, v2.4b[3]\n"
- ".inst 0x4fa0e8b1 // sdot v17.4s, v5.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8b5 // sdot v21.4s, v5.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8b9 // sdot v25.4s, v5.16b, v2.4b[3]\n"
- ".inst 0x4fa0e8d2 // sdot v18.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8d6 // sdot v22.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8da // sdot v26.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x4fa0e8f3 // sdot v19.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8f7 // sdot v23.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8fb // sdot v27.4s, v7.16b, v2.4b[3]\n"
- "tbnz %x[flags], #31, 73f\n"
- ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
- ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
- ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
- "73:" // Height 3: Multiply loop: unique 10: skip row sum
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "74:" // Height 3: Multiply loop: Main loop skip
- "cbz x27, 81f\n"
- "cmp x27, #0x4\n"
- "blt 77f\n"
- "75:" // Height 3: Multiply loop: Odd block loop
- "ldr s0, [x26], #0x4\n"
- "ldr s1, [x24], #0x4\n"
- "ldr s2, [x22], #0x4\n"
- "tbnz %x[flags], #31, 76f\n"
- ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
- ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
- ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
- "76:" // Height 3: Multiply loop: unique 11: skip row sum
- "ldr q8, [x11, #0x0]\n"
- ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q9, [x11, #0x10]\n"
- ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr q10, [x11, #0x20]\n"
- ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr q4, [x11, #0x30]\n"
- "sub x27, x27, #0x4\n"
- ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "cmp x27, #0x4\n"
- ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- "add x11, x11, #0x40\n"
- ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
- ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
- ".inst 0x4f80e093 // sdot v19.4s, v4.16b, v0.4b[0]\n"
- ".inst 0x4f81e097 // sdot v23.4s, v4.16b, v1.4b[0]\n"
- ".inst 0x4f82e09b // sdot v27.4s, v4.16b, v2.4b[0]\n"
- "bge 75b\n"
- "cbz x27, 81f\n"
- "77:" // Height 3: Multiply loop: Skip odd blocks
- "tbz x27, #1, 78f\n"
- "ldr h0, [x26], #0x2\n"
- "ldr h1, [x24], #0x2\n"
- "ldr h2, [x22], #0x2\n"
- "tbz x27, #0, 79f\n"
- "ld1 { v0.b }[2], [x26]\n"
- "ld1 { v1.b }[2], [x24]\n"
- "ld1 { v2.b }[2], [x22]\n"
- "b 79f\n"
- "78:" // Height 3: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x26, #0x0]\n"
- "ldr b1, [x24, #0x0]\n"
- "ldr b2, [x22, #0x0]\n"
- "79:" // Height 3: Multiply loop: Ragged operand read: Done
- "tbnz %x[flags], #31, 80f\n"
+ "bge 72b\n"
+ "cbz x24, 78f\n"
+ "74:" // Height 3: Multiply loop: Skip odd blocks
+ "tbz x24, #1, 75f\n"
+ "ldr h0, [x23], #0x2\n"
+ "ldr h1, [x22], #0x2\n"
+ "ldr h2, [x21], #0x2\n"
+ "tbz x24, #0, 76f\n"
+ "ld1 { v0.b }[2], [x23]\n"
+ "ld1 { v1.b }[2], [x22]\n"
+ "ld1 { v2.b }[2], [x21]\n"
+ "b 76f\n"
+ "75:" // Height 3: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x23, #0x0]\n"
+ "ldr b1, [x22, #0x0]\n"
+ "ldr b2, [x21, #0x0]\n"
+ "76:" // Height 3: Multiply loop: Ragged operand read: Done
+ "tbnz %x[flags], #31, 77f\n"
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
- "80:" // Height 3: Multiply loop: unique 12: skip row sum
- "ldr q5, [x11, #0x0]\n"
- ".inst 0x4f80e0b0 // sdot v16.4s, v5.16b, v0.4b[0]\n"
- "ldr q6, [x11, #0x10]\n"
- ".inst 0x4f81e0b4 // sdot v20.4s, v5.16b, v1.4b[0]\n"
- "ldr q7, [x11, #0x20]\n"
- ".inst 0x4f82e0b8 // sdot v24.4s, v5.16b, v2.4b[0]\n"
- "ldr q8, [x11, #0x30]\n"
- "add x11, x11, #0x40\n"
- ".inst 0x4f80e0d1 // sdot v17.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0d5 // sdot v21.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d9 // sdot v25.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f80e0f2 // sdot v18.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0f6 // sdot v22.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0fa // sdot v26.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f80e113 // sdot v19.4s, v8.16b, v0.4b[0]\n"
- ".inst 0x4f81e117 // sdot v23.4s, v8.16b, v1.4b[0]\n"
- ".inst 0x4f82e11b // sdot v27.4s, v8.16b, v2.4b[0]\n"
- "81:" // Height 3: Multiply loop: No odd multiplies
+ "77:" // Height 3: Multiply loop: unique 12: skip row sum
+ "ldr q10, [x28, #0x0]\n"
+ ".inst 0x4f80e150 // sdot v16.4s, v10.16b, v0.4b[0]\n"
+ "ldr q4, [x28, #0x10]\n"
+ ".inst 0x4f81e154 // sdot v20.4s, v10.16b, v1.4b[0]\n"
+ "ldr q5, [x28, #0x20]\n"
+ ".inst 0x4f82e158 // sdot v24.4s, v10.16b, v2.4b[0]\n"
+ "ldr q6, [x28, #0x30]\n"
+ "add x28, x28, #0x40\n"
+ ".inst 0x4f80e091 // sdot v17.4s, v4.16b, v0.4b[0]\n"
+ ".inst 0x4f81e095 // sdot v21.4s, v4.16b, v1.4b[0]\n"
+ ".inst 0x4f82e099 // sdot v25.4s, v4.16b, v2.4b[0]\n"
+ ".inst 0x4f80e0b2 // sdot v18.4s, v5.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0b6 // sdot v22.4s, v5.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0ba // sdot v26.4s, v5.16b, v2.4b[0]\n"
+ ".inst 0x4f80e0d3 // sdot v19.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0d7 // sdot v23.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0db // sdot v27.4s, v6.16b, v2.4b[0]\n"
+ "78:" // Height 3: Multiply loop: No odd multiplies
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x28, x28, #0x1\n"
- "cmp x28, x19\n"
- "bne 67b\n"
- "prfm pstl1keep, [x9, #0x0]\n"
- "prfm pstl1keep, [x25, #0x0]\n"
- "prfm pstl1keep, [x23, #0x0]\n"
- "tbnz %x[flags], #31, 82f\n"
+ "add x25, x25, #0x1\n"
+ "cmp x25, x19\n"
+ "bne 64b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "prfm pstl1keep, [x26, #0x0]\n"
+ "add x21, x26, x19\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "add x20, x21, x19\n"
+ "prfm pstl1keep, [x20, #0x0]\n"
+ "tbnz %x[flags], #31, 79f\n"
"addp v11.4s, v11.4s, v11.4s\n"
- "add x19, %x[qp], %[b_offset]\n"
- "ld1r { v3.4s }, [x19]\n"
+ "add x22, %x[qp], %[b_offset]\n"
+ "ld1r { v3.4s }, [x22]\n"
"addp v12.4s, v12.4s, v12.4s\n"
"addp v13.4s, v13.4s, v13.4s\n"
"addp v11.4s, v11.4s, v11.4s\n"
@@ -1129,24 +1100,24 @@ void a64_hybrid_s8qa_dot_4x16 (
"mul v11.4s, v11.4s, v3.4s\n"
"mul v12.4s, v12.4s, v3.4s\n"
"mul v13.4s, v13.4s, v3.4s\n"
- "82:" // Height 3: skip row sum fixup
+ "79:" // Height 3: skip row sum fixup
"add v16.4s, v16.4s, v11.4s\n"
+ "ldr q0, [x27, #0x0]\n"
"orr %x[flags], %x[flags], #0x80000000\n"
"add v17.4s, v17.4s, v11.4s\n"
- "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x27, #0x10]\n"
+ "add x23, %x[qp], %[per_layer_right_shift]\n"
"add v18.4s, v18.4s, v11.4s\n"
- "ldr q1, [x10, #0x10]\n"
+ "ldr q2, [x27, #0x20]\n"
+ "add x22, %x[qp], %[per_layer_mul]\n"
"add v19.4s, v19.4s, v11.4s\n"
- "ldr q2, [x10, #0x20]\n"
+ "ldr q3, [x27, #0x30]\n"
+ "add x27, x27, #0x40\n"
"add v20.4s, v20.4s, v12.4s\n"
- "ldr q3, [x10, #0x30]\n"
+ "ld1r { v4.4s }, [x22]\n"
"add v21.4s, v21.4s, v12.4s\n"
- "add x20, %x[qp], %[per_layer_right_shift]\n"
"add v22.4s, v22.4s, v12.4s\n"
- "add x19, %x[qp], %[per_layer_mul]\n"
- "ld1r { v4.4s }, [x19]\n"
"add v23.4s, v23.4s, v12.4s\n"
- "add x10, x10, #0x40\n"
"add v24.4s, v24.4s, v13.4s\n"
"add v25.4s, v25.4s, v13.4s\n"
"add v26.4s, v26.4s, v13.4s\n"
@@ -1160,7 +1131,7 @@ void a64_hybrid_s8qa_dot_4x16 (
"add v22.4s, v22.4s, v2.4s\n"
"add v23.4s, v23.4s, v3.4s\n"
"add v24.4s, v24.4s, v0.4s\n"
- "ld1r { v0.4s }, [x20]\n"
+ "ld1r { v0.4s }, [x23]\n"
"add v25.4s, v25.4s, v1.4s\n"
"add v26.4s, v26.4s, v2.4s\n"
"add v27.4s, v27.4s, v3.4s\n"
@@ -1176,7 +1147,7 @@ void a64_hybrid_s8qa_dot_4x16 (
"sqrdmulh v25.4s, v25.4s, v4.4s\n"
"sqrdmulh v26.4s, v26.4s, v4.4s\n"
"sqrdmulh v27.4s, v27.4s, v4.4s\n"
- "tbz %x[flags], #5, 83f\n"
+ "tbz %x[flags], #5, 80f\n"
"and v4.16b, v16.16b, v0.16b\n"
"sshr v4.4s, v4.4s, #0x1f\n"
"and v5.16b, v17.16b, v0.16b\n"
@@ -1213,18 +1184,18 @@ void a64_hybrid_s8qa_dot_4x16 (
"sshr v8.4s, v8.4s, #0x1f\n"
"sqadd v26.4s, v26.4s, v7.4s\n"
"sqadd v27.4s, v27.4s, v8.4s\n"
- "83:" // Height 3: no shift correction
+ "80:" // Height 3: no shift correction
"srshl v16.4s, v16.4s, v0.4s\n"
- "add x19, %x[qp], %[c_offset]\n"
- "ld1r { v4.4s }, [x19]\n"
+ "add x22, %x[qp], %[c_offset]\n"
+ "ld1r { v4.4s }, [x22]\n"
"srshl v17.4s, v17.4s, v0.4s\n"
- "add x19, %x[qp], %[minval]\n"
+ "add x22, %x[qp], %[minval]\n"
"srshl v18.4s, v18.4s, v0.4s\n"
- "ld1r { v5.4s }, [x19]\n"
- "add x19, %x[qp], %[maxval]\n"
+ "ld1r { v5.4s }, [x22]\n"
+ "add x22, %x[qp], %[maxval]\n"
"srshl v19.4s, v19.4s, v0.4s\n"
- "ld1r { v6.4s }, [x19]\n"
- "cmp x12, #0x10\n"
+ "ld1r { v6.4s }, [x22]\n"
+ "cmp x9, #0x10\n"
"srshl v20.4s, v20.4s, v0.4s\n"
"srshl v21.4s, v21.4s, v0.4s\n"
"srshl v22.4s, v22.4s, v0.4s\n"
@@ -1278,122 +1249,106 @@ void a64_hybrid_s8qa_dot_4x16 (
"uzp1 v16.16b, v16.16b, v17.16b\n"
"uzp1 v20.16b, v20.16b, v21.16b\n"
"uzp1 v24.16b, v24.16b, v25.16b\n"
- "bge 92f\n"
- "tbz x12, #3, 87f\n"
- "str d16, [x9], #0x8\n"
- "str d20, [x25], #0x8\n"
- "str d24, [x23], #0x8\n"
- "tbz x12, #2, 85f\n"
- "st1 { v16.s }[2], [x9], #0x4\n"
- "st1 { v20.s }[2], [x25], #0x4\n"
- "st1 { v24.s }[2], [x23], #0x4\n"
- "tbz x12, #1, 84f\n"
- "st1 { v16.h }[6], [x9], #0x2\n"
- "st1 { v20.h }[6], [x25], #0x2\n"
- "st1 { v24.h }[6], [x23], #0x2\n"
- "tbz x12, #0, 91f\n"
- "st1 { v16.b }[14], [x9]\n"
- "st1 { v20.b }[14], [x25]\n"
- "st1 { v24.b }[14], [x23]\n"
- "b 91f\n"
- "84:" // Height 3: Partial direct writeback: partial_1_12
- "tbz x12, #0, 91f\n"
- "st1 { v16.b }[12], [x9]\n"
- "st1 { v20.b }[12], [x25]\n"
- "st1 { v24.b }[12], [x23]\n"
- "b 91f\n"
- "85:" // Height 3: Partial direct writeback: partial_2_8
- "tbz x12, #1, 86f\n"
- "st1 { v16.h }[4], [x9], #0x2\n"
- "st1 { v20.h }[4], [x25], #0x2\n"
- "st1 { v24.h }[4], [x23], #0x2\n"
- "tbz x12, #0, 91f\n"
- "st1 { v16.b }[10], [x9]\n"
- "st1 { v20.b }[10], [x25]\n"
- "st1 { v24.b }[10], [x23]\n"
- "b 91f\n"
- "86:" // Height 3: Partial direct writeback: partial_1_8
- "tbz x12, #0, 91f\n"
- "st1 { v16.b }[8], [x9]\n"
- "st1 { v20.b }[8], [x25]\n"
- "st1 { v24.b }[8], [x23]\n"
- "b 91f\n"
- "87:" // Height 3: Partial direct writeback: partial_4_0
- "tbz x12, #2, 89f\n"
- "str s16, [x9], #0x4\n"
- "str s20, [x25], #0x4\n"
- "str s24, [x23], #0x4\n"
- "tbz x12, #1, 88f\n"
- "st1 { v16.h }[2], [x9], #0x2\n"
- "st1 { v20.h }[2], [x25], #0x2\n"
- "st1 { v24.h }[2], [x23], #0x2\n"
- "tbz x12, #0, 91f\n"
- "st1 { v16.b }[6], [x9]\n"
- "st1 { v20.b }[6], [x25]\n"
- "st1 { v24.b }[6], [x23]\n"
- "b 91f\n"
- "88:" // Height 3: Partial direct writeback: partial_1_4
- "tbz x12, #0, 91f\n"
- "st1 { v16.b }[4], [x9]\n"
- "st1 { v20.b }[4], [x25]\n"
- "st1 { v24.b }[4], [x23]\n"
- "b 91f\n"
- "89:" // Height 3: Partial direct writeback: partial_2_0
- "tbz x12, #1, 90f\n"
- "str h16, [x9], #0x2\n"
- "str h20, [x25], #0x2\n"
- "str h24, [x23], #0x2\n"
- "tbz x12, #0, 91f\n"
- "st1 { v16.b }[2], [x9]\n"
- "st1 { v20.b }[2], [x25]\n"
- "st1 { v24.b }[2], [x23]\n"
- "b 91f\n"
- "90:" // Height 3: Partial direct writeback: partial_1_0
- "str b16, [x9, #0x0]\n"
- "str b20, [x25, #0x0]\n"
- "str b24, [x23, #0x0]\n"
- "91:" // Height 3: Partial direct writeback: Done
- "b 93f\n"
- "92:" // Height 3: Full writeback
- "str q16, [x9, #0x0]\n"
- "str q20, [x25, #0x0]\n"
- "str q24, [x23, #0x0]\n"
- "add x9, x9, #0x10\n"
- "add x25, x25, #0x10\n"
- "add x23, x23, #0x10\n"
- "93:" // Height 3: Writeback done
- "subs x12, x12, #0x10\n"
- "bgt 65b\n"
- "b 126f\n"
- "94:" // Height 4
+ "bge 89f\n"
+ "tbz x9, #3, 84f\n"
+ "str d16, [x26], #0x8\n"
+ "str d20, [x21], #0x8\n"
+ "str d24, [x20], #0x8\n"
+ "tbz x9, #2, 82f\n"
+ "st1 { v16.s }[2], [x26], #0x4\n"
+ "st1 { v20.s }[2], [x21], #0x4\n"
+ "st1 { v24.s }[2], [x20], #0x4\n"
+ "tbz x9, #1, 81f\n"
+ "st1 { v16.h }[6], [x26], #0x2\n"
+ "st1 { v20.h }[6], [x21], #0x2\n"
+ "st1 { v24.h }[6], [x20], #0x2\n"
+ "tbz x9, #0, 88f\n"
+ "st1 { v16.b }[14], [x26]\n"
+ "st1 { v20.b }[14], [x21]\n"
+ "st1 { v24.b }[14], [x20]\n"
+ "b 88f\n"
+ "81:" // Height 3: Partial direct writeback: partial_1_12
+ "tbz x9, #0, 88f\n"
+ "st1 { v16.b }[12], [x26]\n"
+ "st1 { v20.b }[12], [x21]\n"
+ "st1 { v24.b }[12], [x20]\n"
+ "b 88f\n"
+ "82:" // Height 3: Partial direct writeback: partial_2_8
+ "tbz x9, #1, 83f\n"
+ "st1 { v16.h }[4], [x26], #0x2\n"
+ "st1 { v20.h }[4], [x21], #0x2\n"
+ "st1 { v24.h }[4], [x20], #0x2\n"
+ "tbz x9, #0, 88f\n"
+ "st1 { v16.b }[10], [x26]\n"
+ "st1 { v20.b }[10], [x21]\n"
+ "st1 { v24.b }[10], [x20]\n"
+ "b 88f\n"
+ "83:" // Height 3: Partial direct writeback: partial_1_8
+ "tbz x9, #0, 88f\n"
+ "st1 { v16.b }[8], [x26]\n"
+ "st1 { v20.b }[8], [x21]\n"
+ "st1 { v24.b }[8], [x20]\n"
+ "b 88f\n"
+ "84:" // Height 3: Partial direct writeback: partial_4_0
+ "tbz x9, #2, 86f\n"
+ "str s16, [x26], #0x4\n"
+ "str s20, [x21], #0x4\n"
+ "str s24, [x20], #0x4\n"
+ "tbz x9, #1, 85f\n"
+ "st1 { v16.h }[2], [x26], #0x2\n"
+ "st1 { v20.h }[2], [x21], #0x2\n"
+ "st1 { v24.h }[2], [x20], #0x2\n"
+ "tbz x9, #0, 88f\n"
+ "st1 { v16.b }[6], [x26]\n"
+ "st1 { v20.b }[6], [x21]\n"
+ "st1 { v24.b }[6], [x20]\n"
+ "b 88f\n"
+ "85:" // Height 3: Partial direct writeback: partial_1_4
+ "tbz x9, #0, 88f\n"
+ "st1 { v16.b }[4], [x26]\n"
+ "st1 { v20.b }[4], [x21]\n"
+ "st1 { v24.b }[4], [x20]\n"
+ "b 88f\n"
+ "86:" // Height 3: Partial direct writeback: partial_2_0
+ "tbz x9, #1, 87f\n"
+ "str h16, [x26], #0x2\n"
+ "str h20, [x21], #0x2\n"
+ "str h24, [x20], #0x2\n"
+ "tbz x9, #0, 88f\n"
+ "st1 { v16.b }[2], [x26]\n"
+ "st1 { v20.b }[2], [x21]\n"
+ "st1 { v24.b }[2], [x20]\n"
+ "b 88f\n"
+ "87:" // Height 3: Partial direct writeback: partial_1_0
+ "str b16, [x26, #0x0]\n"
+ "str b20, [x21, #0x0]\n"
+ "str b24, [x20, #0x0]\n"
+ "88:" // Height 3: Partial direct writeback: Done
+ "b 90f\n"
+ "89:" // Height 3: Full writeback
+ "str q16, [x26, #0x0]\n"
+ "add x26, x26, #0x10\n"
+ "str q20, [x21, #0x0]\n"
+ "str q24, [x20, #0x0]\n"
+ "90:" // Height 3: Writeback done
+ "subs x9, x9, #0x10\n"
+ "bgt 62b\n"
+ "b 122f\n"
+ "91:" // Height 4
"movi v11.4s, #0x0\n"
- "ldr x12, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x10, %x[col_bias]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x27, %x[col_bias]\n"
"movi v12.4s, #0x0\n"
- "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"bic %x[flags], %x[flags], #0x80000000\n"
"movi v13.4s, #0x0\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov x26, %x[output_ptr]\n"
"movi v14.4s, #0x0\n"
+ "mov x19, #0x4\n"
"movi v15.16b, #0x1\n"
- "tbz %x[flags], #2, 95f\n"
- "ldr x9, [%x[output_ptr], #0x0]\n"
- "ldr x25, [%x[output_ptr], #0x8]\n"
- "add x9, x9, x19\n"
- "ldr x23, [%x[output_ptr], #0x10]\n"
- "ldr x21, [%x[output_ptr], #0x18]\n"
- "add x25, x25, x19\n"
- "add %x[output_ptr], %x[output_ptr], #0x20\n"
- "add x23, x23, x19\n"
- "add x21, x21, x19\n"
- "b 96f\n"
- "95:" // Height 4: setup direct output
- "mov x9, %x[output_ptr]\n"
- "add x25, x9, x19\n"
- "add x23, x25, x19\n"
- "add x21, x23, x19\n"
- "add %x[output_ptr], x21, x19\n"
- "96:" // Height 4: Column loop
+ "madd %x[output_ptr], x20, x19, %x[output_ptr]\n"
+ "92:" // Height 4: Column loop
"movi v16.4s, #0x0\n"
"movi v17.4s, #0x0\n"
"movi v18.4s, #0x0\n"
@@ -1410,99 +1365,99 @@ void a64_hybrid_s8qa_dot_4x16 (
"movi v29.4s, #0x0\n"
"movi v30.4s, #0x0\n"
"movi v31.4s, #0x0\n"
- "97:" // Height 4: setup done
- "mov x28, #0x0\n"
- "98:" // Height 4: String loop
+ "93:" // Height 4: setup done
+ "mov x25, #0x0\n"
+ "94:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w27, [x20, x28, LSL #0x2]\n"
- "tbz %x[flags], #3, 99f\n"
- "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "ldr w24, [x20, x25, LSL #0x2]\n"
+ "tbz %x[flags], #3, 95f\n"
+ "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x26, [x20, #0x0]\n"
- "ldr x24, [x20, #0x8]\n"
- "ldr x22, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x0]\n"
+ "ldr x22, [x20, #0x8]\n"
+ "ldr x21, [x20, #0x10]\n"
"ldr x20, [x20, #0x18]\n"
- "cbnz x28, 100f\n"
+ "cbnz x25, 96f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x26, x26, x19\n"
- "add x24, x24, x19\n"
+ "add x23, x23, x19\n"
"add x22, x22, x19\n"
+ "add x21, x21, x19\n"
"add x20, x20, x19\n"
- "b 100f\n"
- "99:" // Height 4: setup direct input
- "mov x26, %x[input_ptr]\n"
- "add x24, x26, x19\n"
- "add x22, x24, x19\n"
- "add x20, x22, x19\n"
- "100:" // Height 4: input setup done
- "cmp x27, #0x10\n"
- "blt 105f\n"
- "cmp x27, #0x20\n"
- "blt 103f\n"
- "101:" // Height 4: Multiply loop: Main loop head
- "ldr q0, [x26, #0x0]\n"
- "ldr q1, [x24, #0x0]\n"
- "ldr q2, [x22, #0x0]\n"
+ "b 96f\n"
+ "95:" // Height 4: setup direct input
+ "mov x23, %x[input_ptr]\n"
+ "add x22, x23, x19\n"
+ "add x21, x22, x19\n"
+ "add x20, x21, x19\n"
+ "96:" // Height 4: input setup done
+ "cmp x24, #0x10\n"
+ "blt 101f\n"
+ "ldr q0, [x23, #0x0]\n"
+ "ldr q1, [x22, #0x0]\n"
+ "cmp x24, #0x20\n"
+ "ldr q2, [x21, #0x0]\n"
"ldr q3, [x20, #0x0]\n"
- "ldr q4, [x11, #0x0]\n"
+ "ldr q4, [x28, #0x0]\n"
+ "blt 99f\n"
+ "97:" // Height 4: Multiply loop: Main loop head
".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n"
- "ldr q5, [x11, #0x10]\n"
+ "ldr q5, [x28, #0x10]\n"
+ "add x23, x23, #0x10\n"
".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n"
- "ldr q6, [x11, #0x20]\n"
+ "ldr q6, [x28, #0x20]\n"
+ "add x22, x22, #0x10\n"
".inst 0x4f82e098 // sdot v24.4s, v4.16b, v2.4b[0]\n"
- "ldr q7, [x11, #0x30]\n"
+ "ldr q7, [x28, #0x30]\n"
+ "add x21, x21, #0x10\n"
".inst 0x4f83e09c // sdot v28.4s, v4.16b, v3.4b[0]\n"
- "ldr q8, [x11, #0x40]\n"
- "ldr q9, [x11, #0x50]\n"
+ "ldr q8, [x28, #0x40]\n"
+ "add x20, x20, #0x10\n"
".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n"
- "ldr q10, [x11, #0x60]\n"
+ "ldr q9, [x28, #0x50]\n"
".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n"
- "ldr q4, [x11, #0x70]\n"
+ "ldr q10, [x28, #0x60]\n"
".inst 0x4f82e0b9 // sdot v25.4s, v5.16b, v2.4b[0]\n"
- "add x26, x26, #0x10\n"
+ "ldr q4, [x28, #0x70]\n"
".inst 0x4f83e0bd // sdot v29.4s, v5.16b, v3.4b[0]\n"
- "ldr q5, [x11, #0x80]\n"
+ "ldr q5, [x28, #0x80]\n"
".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n"
- "add x24, x24, #0x10\n"
".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n"
- "add x22, x22, #0x10\n"
".inst 0x4f82e0da // sdot v26.4s, v6.16b, v2.4b[0]\n"
- "add x20, x20, #0x10\n"
".inst 0x4f83e0de // sdot v30.4s, v6.16b, v3.4b[0]\n"
- "ldr q6, [x11, #0x90]\n"
+ "ldr q6, [x28, #0x90]\n"
".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n"
".inst 0x4f81e0f7 // sdot v23.4s, v7.16b, v1.4b[0]\n"
".inst 0x4f82e0fb // sdot v27.4s, v7.16b, v2.4b[0]\n"
".inst 0x4f83e0ff // sdot v31.4s, v7.16b, v3.4b[0]\n"
- "ldr q7, [x11, #0xa0]\n"
+ "ldr q7, [x28, #0xa0]\n"
".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n"
".inst 0x4fa1e114 // sdot v20.4s, v8.16b, v1.4b[1]\n"
".inst 0x4fa2e118 // sdot v24.4s, v8.16b, v2.4b[1]\n"
".inst 0x4fa3e11c // sdot v28.4s, v8.16b, v3.4b[1]\n"
- "ldr q8, [x11, #0xb0]\n"
+ "ldr q8, [x28, #0xb0]\n"
".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n"
".inst 0x4fa1e135 // sdot v21.4s, v9.16b, v1.4b[1]\n"
".inst 0x4fa2e139 // sdot v25.4s, v9.16b, v2.4b[1]\n"
".inst 0x4fa3e13d // sdot v29.4s, v9.16b, v3.4b[1]\n"
- "ldr q9, [x11, #0xc0]\n"
+ "ldr q9, [x28, #0xc0]\n"
".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n"
".inst 0x4fa1e156 // sdot v22.4s, v10.16b, v1.4b[1]\n"
".inst 0x4fa2e15a // sdot v26.4s, v10.16b, v2.4b[1]\n"
".inst 0x4fa3e15e // sdot v30.4s, v10.16b, v3.4b[1]\n"
- "ldr q10, [x11, #0xd0]\n"
+ "ldr q10, [x28, #0xd0]\n"
".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n"
".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n"
".inst 0x4fa2e09b // sdot v27.4s, v4.16b, v2.4b[1]\n"
".inst 0x4fa3e09f // sdot v31.4s, v4.16b, v3.4b[1]\n"
- "ldr q4, [x11, #0xe0]\n"
+ "ldr q4, [x28, #0xe0]\n"
".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n"
".inst 0x4f81e8b4 // sdot v20.4s, v5.16b, v1.4b[2]\n"
".inst 0x4f82e8b8 // sdot v24.4s, v5.16b, v2.4b[2]\n"
".inst 0x4f83e8bc // sdot v28.4s, v5.16b, v3.4b[2]\n"
- "ldr q5, [x11, #0xf0]\n"
+ "ldr q5, [x28, #0xf0]\n"
+ "add x28, x28, #0x100\n"
".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n"
- "add x11, x11, #0x100\n"
".inst 0x4f81e8d5 // sdot v21.4s, v6.16b, v1.4b[2]\n"
".inst 0x4f82e8d9 // sdot v25.4s, v6.16b, v2.4b[2]\n"
".inst 0x4f83e8dd // sdot v29.4s, v6.16b, v3.4b[2]\n"
@@ -1530,218 +1485,222 @@ void a64_hybrid_s8qa_dot_4x16 (
".inst 0x4fa1e8b7 // sdot v23.4s, v5.16b, v1.4b[3]\n"
".inst 0x4fa2e8bb // sdot v27.4s, v5.16b, v2.4b[3]\n"
".inst 0x4fa3e8bf // sdot v31.4s, v5.16b, v3.4b[3]\n"
- "tbnz %x[flags], #31, 102f\n"
+ "tbnz %x[flags], #31, 98f\n"
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
".inst 0x4e8f946e // sdot v14.4s, v3.16b, v15.16b\n"
- "102:" // Height 4: Multiply loop: unique 13: skip row sum
- "prfm pldl1keep, [x26, #0x80]\n"
- "sub x27, x27, #0x10\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "cmp x27, #0x20\n"
+ "98:" // Height 4: Multiply loop: unique 13: skip row sum
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "sub x24, x24, #0x10\n"
"prfm pldl1keep, [x22, #0x80]\n"
+ "cmp x24, #0x20\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
"prfm pldl1keep, [x20, #0x80]\n"
- "bge 101b\n"
- "103:" // Height 4: Multiply loop: Single iteration only
- "sub x27, x27, #0x10\n"
- "ldr q0, [x26, #0x0]\n"
- "ldr q1, [x24, #0x0]\n"
- "ldr q2, [x22, #0x0]\n"
+ "ldr q0, [x23, #0x0]\n"
+ "ldr q1, [x22, #0x0]\n"
+ "ldr q2, [x21, #0x0]\n"
"ldr q3, [x20, #0x0]\n"
- "ldr q6, [x11, #0x0]\n"
+ "ldr q4, [x28, #0x0]\n"
+ "bge 97b\n"
+ "99:" // Height 4: Multiply loop: Single iteration only
+ ".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n"
+ "ldr q5, [x28, #0x10]\n"
+ "sub x24, x24, #0x10\n"
+ ".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n"
+ "ldr q6, [x28, #0x20]\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x4f82e098 // sdot v24.4s, v4.16b, v2.4b[0]\n"
+ "ldr q7, [x28, #0x30]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x4f83e09c // sdot v28.4s, v4.16b, v3.4b[0]\n"
+ "ldr q8, [x28, #0x40]\n"
+ "add x21, x21, #0x10\n"
+ ".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n"
+ "ldr q9, [x28, #0x50]\n"
+ "add x20, x20, #0x10\n"
+ ".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n"
+ "ldr q10, [x28, #0x60]\n"
+ ".inst 0x4f82e0b9 // sdot v25.4s, v5.16b, v2.4b[0]\n"
+ "ldr q4, [x28, #0x70]\n"
+ ".inst 0x4f83e0bd // sdot v29.4s, v5.16b, v3.4b[0]\n"
+ "ldr q5, [x28, #0x80]\n"
+ ".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0da // sdot v26.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0de // sdot v30.4s, v6.16b, v3.4b[0]\n"
+ "ldr q6, [x28, #0x90]\n"
+ ".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0f7 // sdot v23.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0fb // sdot v27.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0ff // sdot v31.4s, v7.16b, v3.4b[0]\n"
+ "ldr q7, [x28, #0xa0]\n"
+ ".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e114 // sdot v20.4s, v8.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e118 // sdot v24.4s, v8.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e11c // sdot v28.4s, v8.16b, v3.4b[1]\n"
+ "ldr q8, [x28, #0xb0]\n"
+ ".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e135 // sdot v21.4s, v9.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e139 // sdot v25.4s, v9.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e13d // sdot v29.4s, v9.16b, v3.4b[1]\n"
+ "ldr q9, [x28, #0xc0]\n"
+ ".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e156 // sdot v22.4s, v10.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e15a // sdot v26.4s, v10.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e15e // sdot v30.4s, v10.16b, v3.4b[1]\n"
+ "ldr q10, [x28, #0xd0]\n"
+ ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e09b // sdot v27.4s, v4.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e09f // sdot v31.4s, v4.16b, v3.4b[1]\n"
+ "ldr q4, [x28, #0xe0]\n"
+ ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8b4 // sdot v20.4s, v5.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8b8 // sdot v24.4s, v5.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8bc // sdot v28.4s, v5.16b, v3.4b[2]\n"
+ "ldr q5, [x28, #0xf0]\n"
+ "add x28, x28, #0x100\n"
+ ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8d5 // sdot v21.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d9 // sdot v25.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8dd // sdot v29.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8f6 // sdot v22.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8fa // sdot v26.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8fe // sdot v30.4s, v7.16b, v3.4b[2]\n"
+ ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n"
+ ".inst 0x4f81e917 // sdot v23.4s, v8.16b, v1.4b[2]\n"
+ ".inst 0x4f82e91b // sdot v27.4s, v8.16b, v2.4b[2]\n"
+ ".inst 0x4f83e91f // sdot v31.4s, v8.16b, v3.4b[2]\n"
+ ".inst 0x4fa0e930 // sdot v16.4s, v9.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e934 // sdot v20.4s, v9.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e938 // sdot v24.4s, v9.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e93c // sdot v28.4s, v9.16b, v3.4b[3]\n"
+ ".inst 0x4fa0e951 // sdot v17.4s, v10.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e955 // sdot v21.4s, v10.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e959 // sdot v25.4s, v10.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e95d // sdot v29.4s, v10.16b, v3.4b[3]\n"
+ ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e896 // sdot v22.4s, v4.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e89a // sdot v26.4s, v4.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e89e // sdot v30.4s, v4.16b, v3.4b[3]\n"
+ ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8b7 // sdot v23.4s, v5.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8bb // sdot v27.4s, v5.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8bf // sdot v31.4s, v5.16b, v3.4b[3]\n"
+ "tbnz %x[flags], #31, 100f\n"
+ ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
+ ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
+ ".inst 0x4e8f946e // sdot v14.4s, v3.16b, v15.16b\n"
+ "100:" // Height 4: Multiply loop: unique 14: skip row sum
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "101:" // Height 4: Multiply loop: Main loop skip
+ "cbz x24, 108f\n"
+ "cmp x24, #0x4\n"
+ "blt 104f\n"
+ "102:" // Height 4: Multiply loop: Odd block loop
+ "ldr s0, [x23], #0x4\n"
+ "ldr s1, [x22], #0x4\n"
+ "ldr s2, [x21], #0x4\n"
+ "ldr s3, [x20], #0x4\n"
+ "tbnz %x[flags], #31, 103f\n"
+ ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
+ ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
+ ".inst 0x4e8f946e // sdot v14.4s, v3.16b, v15.16b\n"
+ "103:" // Height 4: Multiply loop: unique 15: skip row sum
+ "ldr q6, [x28, #0x0]\n"
".inst 0x4f80e0d0 // sdot v16.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x11, #0x10]\n"
+ "ldr q7, [x28, #0x10]\n"
+ "sub x24, x24, #0x4\n"
".inst 0x4f81e0d4 // sdot v20.4s, v6.16b, v1.4b[0]\n"
- "ldr q8, [x11, #0x20]\n"
+ "ldr q8, [x28, #0x20]\n"
+ "cmp x24, #0x4\n"
".inst 0x4f82e0d8 // sdot v24.4s, v6.16b, v2.4b[0]\n"
- "ldr q9, [x11, #0x30]\n"
+ "ldr q9, [x28, #0x30]\n"
+ "add x28, x28, #0x40\n"
".inst 0x4f83e0dc // sdot v28.4s, v6.16b, v3.4b[0]\n"
- "ldr q10, [x11, #0x40]\n"
- "ldr q4, [x11, #0x50]\n"
".inst 0x4f80e0f1 // sdot v17.4s, v7.16b, v0.4b[0]\n"
- "ldr q5, [x11, #0x60]\n"
".inst 0x4f81e0f5 // sdot v21.4s, v7.16b, v1.4b[0]\n"
- "ldr q6, [x11, #0x70]\n"
".inst 0x4f82e0f9 // sdot v25.4s, v7.16b, v2.4b[0]\n"
- "add x26, x26, #0x10\n"
".inst 0x4f83e0fd // sdot v29.4s, v7.16b, v3.4b[0]\n"
- "ldr q7, [x11, #0x80]\n"
".inst 0x4f80e112 // sdot v18.4s, v8.16b, v0.4b[0]\n"
- "add x24, x24, #0x10\n"
".inst 0x4f81e116 // sdot v22.4s, v8.16b, v1.4b[0]\n"
- "add x22, x22, #0x10\n"
".inst 0x4f82e11a // sdot v26.4s, v8.16b, v2.4b[0]\n"
- "add x20, x20, #0x10\n"
".inst 0x4f83e11e // sdot v30.4s, v8.16b, v3.4b[0]\n"
- "ldr q8, [x11, #0x90]\n"
".inst 0x4f80e133 // sdot v19.4s, v9.16b, v0.4b[0]\n"
".inst 0x4f81e137 // sdot v23.4s, v9.16b, v1.4b[0]\n"
".inst 0x4f82e13b // sdot v27.4s, v9.16b, v2.4b[0]\n"
".inst 0x4f83e13f // sdot v31.4s, v9.16b, v3.4b[0]\n"
- "ldr q9, [x11, #0xa0]\n"
- ".inst 0x4fa0e150 // sdot v16.4s, v10.16b, v0.4b[1]\n"
- ".inst 0x4fa1e154 // sdot v20.4s, v10.16b, v1.4b[1]\n"
- ".inst 0x4fa2e158 // sdot v24.4s, v10.16b, v2.4b[1]\n"
- ".inst 0x4fa3e15c // sdot v28.4s, v10.16b, v3.4b[1]\n"
- "ldr q10, [x11, #0xb0]\n"
- ".inst 0x4fa0e091 // sdot v17.4s, v4.16b, v0.4b[1]\n"
- ".inst 0x4fa1e095 // sdot v21.4s, v4.16b, v1.4b[1]\n"
- ".inst 0x4fa2e099 // sdot v25.4s, v4.16b, v2.4b[1]\n"
- ".inst 0x4fa3e09d // sdot v29.4s, v4.16b, v3.4b[1]\n"
- "ldr q4, [x11, #0xc0]\n"
- ".inst 0x4fa0e0b2 // sdot v18.4s, v5.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0b6 // sdot v22.4s, v5.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0ba // sdot v26.4s, v5.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0be // sdot v30.4s, v5.16b, v3.4b[1]\n"
- "ldr q5, [x11, #0xd0]\n"
- ".inst 0x4fa0e0d3 // sdot v19.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4fa1e0d7 // sdot v23.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x4fa2e0db // sdot v27.4s, v6.16b, v2.4b[1]\n"
- ".inst 0x4fa3e0df // sdot v31.4s, v6.16b, v3.4b[1]\n"
- "ldr q6, [x11, #0xe0]\n"
- ".inst 0x4f80e8f0 // sdot v16.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4f81e8f4 // sdot v20.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4f82e8f8 // sdot v24.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x4f83e8fc // sdot v28.4s, v7.16b, v3.4b[2]\n"
- "ldr q7, [x11, #0xf0]\n"
- ".inst 0x4f80e911 // sdot v17.4s, v8.16b, v0.4b[2]\n"
- "add x11, x11, #0x100\n"
- ".inst 0x4f81e915 // sdot v21.4s, v8.16b, v1.4b[2]\n"
- ".inst 0x4f82e919 // sdot v25.4s, v8.16b, v2.4b[2]\n"
- ".inst 0x4f83e91d // sdot v29.4s, v8.16b, v3.4b[2]\n"
- ".inst 0x4f80e932 // sdot v18.4s, v9.16b, v0.4b[2]\n"
- ".inst 0x4f81e936 // sdot v22.4s, v9.16b, v1.4b[2]\n"
- ".inst 0x4f82e93a // sdot v26.4s, v9.16b, v2.4b[2]\n"
- ".inst 0x4f83e93e // sdot v30.4s, v9.16b, v3.4b[2]\n"
- ".inst 0x4f80e953 // sdot v19.4s, v10.16b, v0.4b[2]\n"
- ".inst 0x4f81e957 // sdot v23.4s, v10.16b, v1.4b[2]\n"
- ".inst 0x4f82e95b // sdot v27.4s, v10.16b, v2.4b[2]\n"
- ".inst 0x4f83e95f // sdot v31.4s, v10.16b, v3.4b[2]\n"
- ".inst 0x4fa0e890 // sdot v16.4s, v4.16b, v0.4b[3]\n"
- ".inst 0x4fa1e894 // sdot v20.4s, v4.16b, v1.4b[3]\n"
- ".inst 0x4fa2e898 // sdot v24.4s, v4.16b, v2.4b[3]\n"
- ".inst 0x4fa3e89c // sdot v28.4s, v4.16b, v3.4b[3]\n"
- ".inst 0x4fa0e8b1 // sdot v17.4s, v5.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8b5 // sdot v21.4s, v5.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8b9 // sdot v25.4s, v5.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8bd // sdot v29.4s, v5.16b, v3.4b[3]\n"
- ".inst 0x4fa0e8d2 // sdot v18.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8d6 // sdot v22.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8da // sdot v26.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8de // sdot v30.4s, v6.16b, v3.4b[3]\n"
- ".inst 0x4fa0e8f3 // sdot v19.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x4fa1e8f7 // sdot v23.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x4fa2e8fb // sdot v27.4s, v7.16b, v2.4b[3]\n"
- ".inst 0x4fa3e8ff // sdot v31.4s, v7.16b, v3.4b[3]\n"
- "tbnz %x[flags], #31, 104f\n"
- ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
- ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
- ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
- ".inst 0x4e8f946e // sdot v14.4s, v3.16b, v15.16b\n"
- "104:" // Height 4: Multiply loop: unique 14: skip row sum
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
- "105:" // Height 4: Multiply loop: Main loop skip
- "cbz x27, 112f\n"
- "cmp x27, #0x4\n"
- "blt 108f\n"
- "106:" // Height 4: Multiply loop: Odd block loop
- "ldr s0, [x26], #0x4\n"
- "ldr s1, [x24], #0x4\n"
- "ldr s2, [x22], #0x4\n"
- "ldr s3, [x20], #0x4\n"
- "tbnz %x[flags], #31, 107f\n"
- ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
- ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
- ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
- ".inst 0x4e8f946e // sdot v14.4s, v3.16b, v15.16b\n"
- "107:" // Height 4: Multiply loop: unique 15: skip row sum
- "ldr q8, [x11, #0x0]\n"
- ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q9, [x11, #0x10]\n"
- ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr q10, [x11, #0x20]\n"
- ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr q4, [x11, #0x30]\n"
- ".inst 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
- "sub x27, x27, #0x4\n"
- "add x11, x11, #0x40\n"
- ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "cmp x27, #0x4\n"
- ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
- ".inst 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
- ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
- ".inst 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
- ".inst 0x4f80e093 // sdot v19.4s, v4.16b, v0.4b[0]\n"
- ".inst 0x4f81e097 // sdot v23.4s, v4.16b, v1.4b[0]\n"
- ".inst 0x4f82e09b // sdot v27.4s, v4.16b, v2.4b[0]\n"
- ".inst 0x4f83e09f // sdot v31.4s, v4.16b, v3.4b[0]\n"
- "bge 106b\n"
- "cbz x27, 112f\n"
- "108:" // Height 4: Multiply loop: Skip odd blocks
- "tbz x27, #1, 109f\n"
- "ldr h0, [x26], #0x2\n"
- "ldr h1, [x24], #0x2\n"
- "ldr h2, [x22], #0x2\n"
+ "bge 102b\n"
+ "cbz x24, 108f\n"
+ "104:" // Height 4: Multiply loop: Skip odd blocks
+ "tbz x24, #1, 105f\n"
+ "ldr h0, [x23], #0x2\n"
+ "ldr h1, [x22], #0x2\n"
+ "ldr h2, [x21], #0x2\n"
"ldr h3, [x20], #0x2\n"
- "tbz x27, #0, 110f\n"
- "ld1 { v0.b }[2], [x26]\n"
- "ld1 { v1.b }[2], [x24]\n"
- "ld1 { v2.b }[2], [x22]\n"
+ "tbz x24, #0, 106f\n"
+ "ld1 { v0.b }[2], [x23]\n"
+ "ld1 { v1.b }[2], [x22]\n"
+ "ld1 { v2.b }[2], [x21]\n"
"ld1 { v3.b }[2], [x20]\n"
- "b 110f\n"
- "109:" // Height 4: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x26, #0x0]\n"
- "ldr b1, [x24, #0x0]\n"
- "ldr b2, [x22, #0x0]\n"
+ "b 106f\n"
+ "105:" // Height 4: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x23, #0x0]\n"
+ "ldr b1, [x22, #0x0]\n"
+ "ldr b2, [x21, #0x0]\n"
"ldr b3, [x20, #0x0]\n"
- "110:" // Height 4: Multiply loop: Ragged operand read: Done
- "tbnz %x[flags], #31, 111f\n"
+ "106:" // Height 4: Multiply loop: Ragged operand read: Done
+ "tbnz %x[flags], #31, 107f\n"
".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
".inst 0x4e8f946e // sdot v14.4s, v3.16b, v15.16b\n"
- "111:" // Height 4: Multiply loop: unique 16: skip row sum
- "ldr q5, [x11, #0x0]\n"
- ".inst 0x4f80e0b0 // sdot v16.4s, v5.16b, v0.4b[0]\n"
- "ldr q6, [x11, #0x10]\n"
- ".inst 0x4f81e0b4 // sdot v20.4s, v5.16b, v1.4b[0]\n"
- "ldr q7, [x11, #0x20]\n"
- ".inst 0x4f82e0b8 // sdot v24.4s, v5.16b, v2.4b[0]\n"
- "ldr q8, [x11, #0x30]\n"
- ".inst 0x4f83e0bc // sdot v28.4s, v5.16b, v3.4b[0]\n"
- "add x11, x11, #0x40\n"
- ".inst 0x4f80e0d1 // sdot v17.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4f81e0d5 // sdot v21.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4f82e0d9 // sdot v25.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4f83e0dd // sdot v29.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4f80e0f2 // sdot v18.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4f81e0f6 // sdot v22.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4f82e0fa // sdot v26.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x4f83e0fe // sdot v30.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x4f80e113 // sdot v19.4s, v8.16b, v0.4b[0]\n"
- ".inst 0x4f81e117 // sdot v23.4s, v8.16b, v1.4b[0]\n"
- ".inst 0x4f82e11b // sdot v27.4s, v8.16b, v2.4b[0]\n"
- ".inst 0x4f83e11f // sdot v31.4s, v8.16b, v3.4b[0]\n"
- "112:" // Height 4: Multiply loop: No odd multiplies
+ "107:" // Height 4: Multiply loop: unique 16: skip row sum
+ "ldr q10, [x28, #0x0]\n"
+ ".inst 0x4f80e150 // sdot v16.4s, v10.16b, v0.4b[0]\n"
+ "ldr q4, [x28, #0x10]\n"
+ ".inst 0x4f81e154 // sdot v20.4s, v10.16b, v1.4b[0]\n"
+ "ldr q5, [x28, #0x20]\n"
+ ".inst 0x4f82e158 // sdot v24.4s, v10.16b, v2.4b[0]\n"
+ "ldr q6, [x28, #0x30]\n"
+ "add x28, x28, #0x40\n"
+ ".inst 0x4f83e15c // sdot v28.4s, v10.16b, v3.4b[0]\n"
+ ".inst 0x4f80e091 // sdot v17.4s, v4.16b, v0.4b[0]\n"
+ ".inst 0x4f81e095 // sdot v21.4s, v4.16b, v1.4b[0]\n"
+ ".inst 0x4f82e099 // sdot v25.4s, v4.16b, v2.4b[0]\n"
+ ".inst 0x4f83e09d // sdot v29.4s, v4.16b, v3.4b[0]\n"
+ ".inst 0x4f80e0b2 // sdot v18.4s, v5.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0b6 // sdot v22.4s, v5.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0ba // sdot v26.4s, v5.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0be // sdot v30.4s, v5.16b, v3.4b[0]\n"
+ ".inst 0x4f80e0d3 // sdot v19.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0d7 // sdot v23.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0db // sdot v27.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0df // sdot v31.4s, v6.16b, v3.4b[0]\n"
+ "108:" // Height 4: Multiply loop: No odd multiplies
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x28, x28, #0x1\n"
- "cmp x28, x19\n"
- "bne 98b\n"
- "prfm pstl1keep, [x9, #0x0]\n"
- "prfm pstl1keep, [x25, #0x0]\n"
- "prfm pstl1keep, [x23, #0x0]\n"
+ "add x25, x25, #0x1\n"
+ "cmp x25, x19\n"
+ "bne 94b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "prfm pstl1keep, [x26, #0x0]\n"
+ "add x21, x26, x19\n"
"prfm pstl1keep, [x21, #0x0]\n"
- "tbnz %x[flags], #31, 113f\n"
+ "add x20, x21, x19\n"
+ "prfm pstl1keep, [x20, #0x0]\n"
+ "add x19, x20, x19\n"
+ "prfm pstl1keep, [x19, #0x0]\n"
+ "tbnz %x[flags], #31, 109f\n"
"addp v11.4s, v11.4s, v11.4s\n"
- "add x19, %x[qp], %[b_offset]\n"
- "ld1r { v4.4s }, [x19]\n"
+ "add x22, %x[qp], %[b_offset]\n"
+ "ld1r { v4.4s }, [x22]\n"
"addp v12.4s, v12.4s, v12.4s\n"
"addp v13.4s, v13.4s, v13.4s\n"
"addp v14.4s, v14.4s, v14.4s\n"
@@ -1754,24 +1713,24 @@ void a64_hybrid_s8qa_dot_4x16 (
"mul v12.4s, v12.4s, v4.4s\n"
"mul v13.4s, v13.4s, v4.4s\n"
"mul v14.4s, v14.4s, v4.4s\n"
- "113:" // Height 4: skip row sum fixup
+ "109:" // Height 4: skip row sum fixup
"add v16.4s, v16.4s, v11.4s\n"
+ "ldr q0, [x27, #0x0]\n"
"orr %x[flags], %x[flags], #0x80000000\n"
"add v17.4s, v17.4s, v11.4s\n"
- "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x27, #0x10]\n"
+ "add x23, %x[qp], %[per_layer_right_shift]\n"
"add v18.4s, v18.4s, v11.4s\n"
- "ldr q1, [x10, #0x10]\n"
+ "ldr q2, [x27, #0x20]\n"
+ "add x22, %x[qp], %[per_layer_mul]\n"
"add v19.4s, v19.4s, v11.4s\n"
- "ldr q2, [x10, #0x20]\n"
+ "ldr q3, [x27, #0x30]\n"
+ "add x27, x27, #0x40\n"
"add v20.4s, v20.4s, v12.4s\n"
- "ldr q3, [x10, #0x30]\n"
+ "ld1r { v4.4s }, [x22]\n"
"add v21.4s, v21.4s, v12.4s\n"
- "add x20, %x[qp], %[per_layer_right_shift]\n"
"add v22.4s, v22.4s, v12.4s\n"
- "add x19, %x[qp], %[per_layer_mul]\n"
- "ld1r { v4.4s }, [x19]\n"
"add v23.4s, v23.4s, v12.4s\n"
- "add x10, x10, #0x40\n"
"add v24.4s, v24.4s, v13.4s\n"
"add v25.4s, v25.4s, v13.4s\n"
"add v26.4s, v26.4s, v13.4s\n"
@@ -1793,7 +1752,7 @@ void a64_hybrid_s8qa_dot_4x16 (
"add v26.4s, v26.4s, v2.4s\n"
"add v27.4s, v27.4s, v3.4s\n"
"add v28.4s, v28.4s, v0.4s\n"
- "ld1r { v0.4s }, [x20]\n"
+ "ld1r { v0.4s }, [x23]\n"
"add v29.4s, v29.4s, v1.4s\n"
"add v30.4s, v30.4s, v2.4s\n"
"add v31.4s, v31.4s, v3.4s\n"
@@ -1813,7 +1772,7 @@ void a64_hybrid_s8qa_dot_4x16 (
"sqrdmulh v29.4s, v29.4s, v4.4s\n"
"sqrdmulh v30.4s, v30.4s, v4.4s\n"
"sqrdmulh v31.4s, v31.4s, v4.4s\n"
- "tbz %x[flags], #5, 114f\n"
+ "tbz %x[flags], #5, 110f\n"
"and v4.16b, v16.16b, v0.16b\n"
"sshr v4.4s, v4.4s, #0x1f\n"
"and v5.16b, v17.16b, v0.16b\n"
@@ -1862,18 +1821,18 @@ void a64_hybrid_s8qa_dot_4x16 (
"sqadd v29.4s, v29.4s, v10.4s\n"
"sqadd v30.4s, v30.4s, v4.4s\n"
"sqadd v31.4s, v31.4s, v5.4s\n"
- "114:" // Height 4: no shift correction
+ "110:" // Height 4: no shift correction
"srshl v16.4s, v16.4s, v0.4s\n"
- "add x19, %x[qp], %[c_offset]\n"
- "ld1r { v4.4s }, [x19]\n"
+ "add x22, %x[qp], %[c_offset]\n"
+ "ld1r { v4.4s }, [x22]\n"
"srshl v17.4s, v17.4s, v0.4s\n"
- "add x19, %x[qp], %[minval]\n"
+ "add x22, %x[qp], %[minval]\n"
"srshl v18.4s, v18.4s, v0.4s\n"
- "ld1r { v5.4s }, [x19]\n"
- "add x19, %x[qp], %[maxval]\n"
+ "ld1r { v5.4s }, [x22]\n"
+ "add x22, %x[qp], %[maxval]\n"
"srshl v19.4s, v19.4s, v0.4s\n"
- "ld1r { v6.4s }, [x19]\n"
- "cmp x12, #0x10\n"
+ "ld1r { v6.4s }, [x22]\n"
+ "cmp x9, #0x10\n"
"srshl v20.4s, v20.4s, v0.4s\n"
"srshl v21.4s, v21.4s, v0.4s\n"
"srshl v22.4s, v22.4s, v0.4s\n"
@@ -1946,125 +1905,122 @@ void a64_hybrid_s8qa_dot_4x16 (
"uzp1 v20.16b, v20.16b, v21.16b\n"
"uzp1 v24.16b, v24.16b, v25.16b\n"
"uzp1 v28.16b, v28.16b, v29.16b\n"
- "bge 123f\n"
- "tbz x12, #3, 118f\n"
- "str d16, [x9], #0x8\n"
- "str d20, [x25], #0x8\n"
- "str d24, [x23], #0x8\n"
- "str d28, [x21], #0x8\n"
- "tbz x12, #2, 116f\n"
- "st1 { v16.s }[2], [x9], #0x4\n"
- "st1 { v20.s }[2], [x25], #0x4\n"
- "st1 { v24.s }[2], [x23], #0x4\n"
- "st1 { v28.s }[2], [x21], #0x4\n"
- "tbz x12, #1, 115f\n"
- "st1 { v16.h }[6], [x9], #0x2\n"
- "st1 { v20.h }[6], [x25], #0x2\n"
- "st1 { v24.h }[6], [x23], #0x2\n"
- "st1 { v28.h }[6], [x21], #0x2\n"
- "tbz x12, #0, 122f\n"
- "st1 { v16.b }[14], [x9]\n"
- "st1 { v20.b }[14], [x25]\n"
- "st1 { v24.b }[14], [x23]\n"
- "st1 { v28.b }[14], [x21]\n"
- "b 122f\n"
- "115:" // Height 4: Partial direct writeback: partial_1_12
- "tbz x12, #0, 122f\n"
- "st1 { v16.b }[12], [x9]\n"
- "st1 { v20.b }[12], [x25]\n"
- "st1 { v24.b }[12], [x23]\n"
- "st1 { v28.b }[12], [x21]\n"
- "b 122f\n"
- "116:" // Height 4: Partial direct writeback: partial_2_8
- "tbz x12, #1, 117f\n"
- "st1 { v16.h }[4], [x9], #0x2\n"
- "st1 { v20.h }[4], [x25], #0x2\n"
- "st1 { v24.h }[4], [x23], #0x2\n"
- "st1 { v28.h }[4], [x21], #0x2\n"
- "tbz x12, #0, 122f\n"
- "st1 { v16.b }[10], [x9]\n"
- "st1 { v20.b }[10], [x25]\n"
- "st1 { v24.b }[10], [x23]\n"
- "st1 { v28.b }[10], [x21]\n"
- "b 122f\n"
- "117:" // Height 4: Partial direct writeback: partial_1_8
- "tbz x12, #0, 122f\n"
- "st1 { v16.b }[8], [x9]\n"
- "st1 { v20.b }[8], [x25]\n"
- "st1 { v24.b }[8], [x23]\n"
- "st1 { v28.b }[8], [x21]\n"
- "b 122f\n"
- "118:" // Height 4: Partial direct writeback: partial_4_0
- "tbz x12, #2, 120f\n"
- "str s16, [x9], #0x4\n"
- "str s20, [x25], #0x4\n"
- "str s24, [x23], #0x4\n"
- "str s28, [x21], #0x4\n"
- "tbz x12, #1, 119f\n"
- "st1 { v16.h }[2], [x9], #0x2\n"
- "st1 { v20.h }[2], [x25], #0x2\n"
- "st1 { v24.h }[2], [x23], #0x2\n"
- "st1 { v28.h }[2], [x21], #0x2\n"
- "tbz x12, #0, 122f\n"
- "st1 { v16.b }[6], [x9]\n"
- "st1 { v20.b }[6], [x25]\n"
- "st1 { v24.b }[6], [x23]\n"
- "st1 { v28.b }[6], [x21]\n"
- "b 122f\n"
- "119:" // Height 4: Partial direct writeback: partial_1_4
- "tbz x12, #0, 122f\n"
- "st1 { v16.b }[4], [x9]\n"
- "st1 { v20.b }[4], [x25]\n"
- "st1 { v24.b }[4], [x23]\n"
- "st1 { v28.b }[4], [x21]\n"
- "b 122f\n"
- "120:" // Height 4: Partial direct writeback: partial_2_0
- "tbz x12, #1, 121f\n"
- "str h16, [x9], #0x2\n"
- "str h20, [x25], #0x2\n"
- "str h24, [x23], #0x2\n"
- "str h28, [x21], #0x2\n"
- "tbz x12, #0, 122f\n"
- "st1 { v16.b }[2], [x9]\n"
- "st1 { v20.b }[2], [x25]\n"
- "st1 { v24.b }[2], [x23]\n"
- "st1 { v28.b }[2], [x21]\n"
- "b 122f\n"
- "121:" // Height 4: Partial direct writeback: partial_1_0
- "str b16, [x9, #0x0]\n"
- "str b20, [x25, #0x0]\n"
- "str b24, [x23, #0x0]\n"
- "str b28, [x21, #0x0]\n"
- "122:" // Height 4: Partial direct writeback: Done
- "b 124f\n"
- "123:" // Height 4: Full writeback
- "str q16, [x9, #0x0]\n"
- "str q20, [x25, #0x0]\n"
- "str q24, [x23, #0x0]\n"
- "str q28, [x21, #0x0]\n"
- "add x9, x9, #0x10\n"
- "add x25, x25, #0x10\n"
- "add x23, x23, #0x10\n"
- "add x21, x21, #0x10\n"
- "124:" // Height 4: Writeback done
- "subs x12, x12, #0x10\n"
- "bgt 96b\n"
+ "bge 119f\n"
+ "tbz x9, #3, 114f\n"
+ "str d16, [x26], #0x8\n"
+ "str d20, [x21], #0x8\n"
+ "str d24, [x20], #0x8\n"
+ "str d28, [x19], #0x8\n"
+ "tbz x9, #2, 112f\n"
+ "st1 { v16.s }[2], [x26], #0x4\n"
+ "st1 { v20.s }[2], [x21], #0x4\n"
+ "st1 { v24.s }[2], [x20], #0x4\n"
+ "st1 { v28.s }[2], [x19], #0x4\n"
+ "tbz x9, #1, 111f\n"
+ "st1 { v16.h }[6], [x26], #0x2\n"
+ "st1 { v20.h }[6], [x21], #0x2\n"
+ "st1 { v24.h }[6], [x20], #0x2\n"
+ "st1 { v28.h }[6], [x19], #0x2\n"
+ "tbz x9, #0, 118f\n"
+ "st1 { v16.b }[14], [x26]\n"
+ "st1 { v20.b }[14], [x21]\n"
+ "st1 { v24.b }[14], [x20]\n"
+ "st1 { v28.b }[14], [x19]\n"
+ "b 118f\n"
+ "111:" // Height 4: Partial direct writeback: partial_1_12
+ "tbz x9, #0, 118f\n"
+ "st1 { v16.b }[12], [x26]\n"
+ "st1 { v20.b }[12], [x21]\n"
+ "st1 { v24.b }[12], [x20]\n"
+ "st1 { v28.b }[12], [x19]\n"
+ "b 118f\n"
+ "112:" // Height 4: Partial direct writeback: partial_2_8
+ "tbz x9, #1, 113f\n"
+ "st1 { v16.h }[4], [x26], #0x2\n"
+ "st1 { v20.h }[4], [x21], #0x2\n"
+ "st1 { v24.h }[4], [x20], #0x2\n"
+ "st1 { v28.h }[4], [x19], #0x2\n"
+ "tbz x9, #0, 118f\n"
+ "st1 { v16.b }[10], [x26]\n"
+ "st1 { v20.b }[10], [x21]\n"
+ "st1 { v24.b }[10], [x20]\n"
+ "st1 { v28.b }[10], [x19]\n"
+ "b 118f\n"
+ "113:" // Height 4: Partial direct writeback: partial_1_8
+ "tbz x9, #0, 118f\n"
+ "st1 { v16.b }[8], [x26]\n"
+ "st1 { v20.b }[8], [x21]\n"
+ "st1 { v24.b }[8], [x20]\n"
+ "st1 { v28.b }[8], [x19]\n"
+ "b 118f\n"
+ "114:" // Height 4: Partial direct writeback: partial_4_0
+ "tbz x9, #2, 116f\n"
+ "str s16, [x26], #0x4\n"
+ "str s20, [x21], #0x4\n"
+ "str s24, [x20], #0x4\n"
+ "str s28, [x19], #0x4\n"
+ "tbz x9, #1, 115f\n"
+ "st1 { v16.h }[2], [x26], #0x2\n"
+ "st1 { v20.h }[2], [x21], #0x2\n"
+ "st1 { v24.h }[2], [x20], #0x2\n"
+ "st1 { v28.h }[2], [x19], #0x2\n"
+ "tbz x9, #0, 118f\n"
+ "st1 { v16.b }[6], [x26]\n"
+ "st1 { v20.b }[6], [x21]\n"
+ "st1 { v24.b }[6], [x20]\n"
+ "st1 { v28.b }[6], [x19]\n"
+ "b 118f\n"
+ "115:" // Height 4: Partial direct writeback: partial_1_4
+ "tbz x9, #0, 118f\n"
+ "st1 { v16.b }[4], [x26]\n"
+ "st1 { v20.b }[4], [x21]\n"
+ "st1 { v24.b }[4], [x20]\n"
+ "st1 { v28.b }[4], [x19]\n"
+ "b 118f\n"
+ "116:" // Height 4: Partial direct writeback: partial_2_0
+ "tbz x9, #1, 117f\n"
+ "str h16, [x26], #0x2\n"
+ "str h20, [x21], #0x2\n"
+ "str h24, [x20], #0x2\n"
+ "str h28, [x19], #0x2\n"
+ "tbz x9, #0, 118f\n"
+ "st1 { v16.b }[2], [x26]\n"
+ "st1 { v20.b }[2], [x21]\n"
+ "st1 { v24.b }[2], [x20]\n"
+ "st1 { v28.b }[2], [x19]\n"
+ "b 118f\n"
+ "117:" // Height 4: Partial direct writeback: partial_1_0
+ "str b16, [x26, #0x0]\n"
+ "str b20, [x21, #0x0]\n"
+ "str b24, [x20, #0x0]\n"
+ "str b28, [x19, #0x0]\n"
+ "118:" // Height 4: Partial direct writeback: Done
+ "b 120f\n"
+ "119:" // Height 4: Full writeback
+ "str q16, [x26, #0x0]\n"
+ "add x26, x26, #0x10\n"
+ "str q20, [x21, #0x0]\n"
+ "str q24, [x20, #0x0]\n"
+ "str q28, [x19, #0x0]\n"
+ "120:" // Height 4: Writeback done
+ "subs x9, x9, #0x10\n"
+ "bgt 92b\n"
"subs %x[M], %x[M], #0x4\n"
- "beq 126f\n"
+ "beq 122f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "tbz %x[flags], #3, 125f\n"
+ "tbz %x[flags], #3, 121f\n"
"add x20, x20, #0x4\n"
"str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
"b 1b\n"
- "125:" // Update direct input
+ "121:" // Update direct input
"mov x19, #0x4\n"
"madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
"b 1b\n"
- "126:" // Exit
+ "122:" // Exit
: [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16.hpp
index 4ddc743f01..eb5bdfe55c 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16.hpp
@@ -37,9 +37,9 @@
namespace arm_gemm
{
-
// Actual kernel implementations
void a64_hybrid_s8qs_dot_6x16( ARGLIST );
+void a64_hybrid_s8qs_dot_6x16_a55( ARGLIST );
class cls_a64_hybrid_s8qs_dot_6x16
{
@@ -72,10 +72,11 @@ public:
StdTransformsFixed<operand_type, result_type, 6, 16, 4> transforms = {};
- static PerformanceParameters get_performance_parameters(const CPUInfo *ci) {
+ static PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
switch (ci->get_cpu_model()) {
case CPUModel::A55r1:
- return { 7.5301 };
+ return { 8.28 };
default:
return { 27.5482 };
}
@@ -83,9 +84,15 @@ public:
// Default to the generic kernel
kern_type kernel=a64_hybrid_s8qs_dot_6x16;
-
- cls_a64_hybrid_s8qs_dot_6x16(const CPUInfo *)
+ cls_a64_hybrid_s8qs_dot_6x16(const CPUInfo *ci) // picks `kernel` from the CPU model reported by ci; ci is dereferenced without a null check
{
+ switch(ci->get_cpu_model()) {
+ default: // no dedicated variant: `kernel` keeps its in-class default, the generic a64_hybrid_s8qs_dot_6x16
+ break;
+ case CPUModel::A55r1: // A55r1 switches to the _a55 variant declared next to the generic kernel
+ kernel=a64_hybrid_s8qs_dot_6x16_a55;
+ break;
+ }
}
};
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/a55.cpp
new file mode 100644
index 0000000000..6da3f7a9e3
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/a55.cpp
@@ -0,0 +1,3772 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+#include <cstdint>
+
+namespace arm_gemm {
+
+void a64_hybrid_s8qs_dot_6x16_a55 (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<int8_t> A_arg,
+ size_t M, size_t N, const int8_t *B_ptr, IndirectOutputArg<int8_t> output_arg,
+ const Requantize32 *qp, const int32_t *col_bias, unsigned int col_base
+)
+{
+ struct KernelArgs { // Argument bundle; the asm loads these fields through args_ptr + offsetof_* operands (presumably bound to &ka in the operand list, which is not visible here)
+ const int32_t *multiplier_ptr = {}; // stays null unless qp->per_channel_requant; then qp->per_channel_muls + col_base
+ const int32_t *shift_ptr = {}; // stays null unless qp->per_channel_requant; then qp->per_channel_right_shifts + col_base
+ unsigned int num_strings = {}; // copy of the num_strings argument; compared against the string-loop counter in the asm
+ const unsigned int *string_lengths = {}; // copy of the string_lengths argument; the asm indexes it as 32-bit entries
+ size_t N = {}; // copy of N; loaded as the column-loop counter and reduced by 0x10 per pass
+ const int8_t *B_ptr = {}; // copy of the B_ptr argument; starting address for the B operand loads
+ size_t output_offset = {}; // output_arg.indirect.offset when indirect, otherwise output_arg.direct.stride
+ size_t input_initial_col = {}; // A_arg.indirect.start_col when indirect; left at zero for direct input
+ size_t input_offset = {}; // A_arg.indirect.start_row when indirect, otherwise A_arg.direct.stride
+ } ka;
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ if (qp->per_channel_requant) {
+ flags |= 0x10;
+ ka.multiplier_ptr=qp->per_channel_muls + col_base;
+ ka.shift_ptr=qp->per_channel_right_shifts + col_base;
+ }
+ if (qp->c_offset > qp->minval) {
+ flags |= 0x20;
+ }
+ __asm__ __volatile__(
+
+ "1:" // Row loop
+ "cmp %x[M], #0x6\n"
+ "bge 136f\n"
+ "cmp %x[M], #0x4\n"
+ "bgt 109f\n"
+ "beq 82f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 55f\n"
+ "beq 28f\n"
+ "ldr x17, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
+ "ldr x16, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
+ "mov x15, %x[col_bias]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x13, %x[output_ptr]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "2:" // Height 1: Column loop
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "3:" // Height 1: setup done
+ "mov x11, #0x0\n"
+ "4:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w10, [x20, x11, LSL #0x2]\n"
+ "tbz %x[flags], #3, 5f\n"
+ "ldr x20, [%x[input_ptr], x11, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x9, [x20, #0x0]\n"
+ "cbnz x11, 6f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x9, x9, x19\n"
+ "b 6f\n"
+ "5:" // Height 1: setup direct input
+ "mov x9, %x[input_ptr]\n"
+ "6:" // Height 1: input setup done
+ "cmp x10, #0x10\n"
+ "blt 9f\n"
+ "ldr q0, [x9, #0x0]\n"
+ "ldr q6, [x12, #0x0]\n"
+ "cmp x10, #0x20\n"
+ "blt 8f\n"
+ "7:" // Height 1: Multiply loop: Main loop head
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr d7, [x12, #0x10]\n"
+ "ldr x19, [x12, #0x18]\n"
+ "add x9, x9, #0x10\n"
+ "ldr d6, [x12, #0x20]\n"
+ "sub x10, x10, #0x10\n"
+ "ldr x28, [x12, #0x28]\n"
+ "cmp x10, #0x20\n"
+ "mov v7.d[1], x19\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
+ "ldr x19, [x12, #0x38]\n"
+ "mov v6.d[1], x28\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ "ldr d7, [x12, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "ldr d6, [x12, #0x40]\n"
+ "ldr x28, [x12, #0x48]\n"
+ "mov v7.d[1], x19\n"
+ "ldr x19, [x12, #0x58]\n"
+ "ldr x27, [x9, #0x8]\n"
+ "mov v6.d[1], x28\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ "ldr d7, [x12, #0x50]\n"
+ ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
+ "ldr d6, [x12, #0x60]\n"
+ "ldr x28, [x12, #0x68]\n"
+ "mov v7.d[1], x19\n"
+ "ldr x19, [x12, #0x78]\n"
+ "mov v6.d[1], x28\n"
+ ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
+ "ldr d7, [x12, #0x70]\n"
+ ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
+ "ldr d6, [x12, #0x80]\n"
+ "ldr x28, [x12, #0x88]\n"
+ "mov v7.d[1], x19\n"
+ "ldr x19, [x12, #0x98]\n"
+ "mov v6.d[1], x28\n"
+ ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
+ "ldr d7, [x12, #0x90]\n"
+ ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
+ "ldr d6, [x12, #0xa0]\n"
+ "ldr x28, [x12, #0xa8]\n"
+ "mov v7.d[1], x19\n"
+ "ldr x19, [x12, #0xb8]\n"
+ "mov v6.d[1], x28\n"
+ ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
+ "ldr d7, [x12, #0xb0]\n"
+ ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
+ "ldr d6, [x12, #0xc0]\n"
+ "ldr x28, [x12, #0xc8]\n"
+ "mov v7.d[1], x19\n"
+ "ldr x19, [x12, #0xd8]\n"
+ "mov v6.d[1], x28\n"
+ ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
+ "ldr d7, [x12, #0xd0]\n"
+ ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
+ "ldr d6, [x12, #0xe0]\n"
+ "ldr x28, [x12, #0xe8]\n"
+ "mov v7.d[1], x19\n"
+ "ldr x19, [x12, #0xf8]\n"
+ "mov v6.d[1], x28\n"
+ ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
+ "ldr d7, [x12, #0xf0]\n"
+ "add x12, x12, #0x100\n"
+ ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ "ldr d6, [x12, #0x0]\n"
+ "ldr x28, [x12, #0x8]\n"
+ "mov v7.d[1], x19\n"
+ "mov v6.d[1], x28\n"
+ ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ "ldr d0, [x9, #0x0]\n"
+ "mov v0.d[1], x27\n"
+ "bge 7b\n"
+ "8:" // Height 1: Multiply loop: Single iteration only
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x12, #0x10]\n"
+ "ldr q6, [x12, #0x20]\n"
+ "sub x10, x10, #0x10\n"
+ "add x9, x9, #0x10\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x12, #0x30]\n"
+ "ldr q6, [x12, #0x40]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ "ldr q7, [x12, #0x50]\n"
+ ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
+ "ldr q6, [x12, #0x60]\n"
+ ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
+ "ldr q7, [x12, #0x70]\n"
+ ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
+ "ldr q6, [x12, #0x80]\n"
+ ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
+ "ldr q7, [x12, #0x90]\n"
+ ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
+ "ldr q6, [x12, #0xa0]\n"
+ ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
+ "ldr q7, [x12, #0xb0]\n"
+ ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
+ "ldr q6, [x12, #0xc0]\n"
+ ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
+ "ldr q7, [x12, #0xd0]\n"
+ ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
+ "ldr q6, [x12, #0xe0]\n"
+ ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
+ "ldr q7, [x12, #0xf0]\n"
+ ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ "add x12, x12, #0x100\n"
+ ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ "9:" // Height 1: Multiply loop: Main loop skip
+ "cbz x10, 14f\n"
+ "cmp x10, #0x4\n"
+ "blt 11f\n"
+ "10:" // Height 1: Multiply loop: Odd block loop
+ "ldr s0, [x9], #0x4\n"
+ "sub x10, x10, #0x4\n"
+ "ldr q6, [x12, #0x0]\n"
+ "cmp x10, #0x4\n"
+ "ldr q7, [x12, #0x10]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q6, [x12, #0x20]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ "ldr q7, [x12, #0x30]\n"
+ "add x12, x12, #0x40\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ "bge 10b\n"
+ "cbz x10, 14f\n"
+ "11:" // Height 1: Multiply loop: Skip odd blocks
+ "tbz x10, #1, 12f\n"
+ "ldr h0, [x9], #0x2\n"
+ "tbz x10, #0, 13f\n"
+ "ld1 { v0.b }[2], [x9]\n"
+ "b 13f\n"
+ "12:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x9, #0x0]\n"
+ "13:" // Height 1: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x12, #0x0]\n"
+ "ldr q7, [x12, #0x10]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q6, [x12, #0x20]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ "ldr q7, [x12, #0x30]\n"
+ "add x12, x12, #0x40\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ "14:" // Height 1: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x11, x11, #0x1\n"
+ "cmp x11, x19\n"
+ "bne 4b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "ldr q0, [x15, #0x0]\n"
+ "ldr q1, [x15, #0x10]\n"
+ "ldr q2, [x15, #0x20]\n"
+ "add v8.4s, v8.4s, v0.4s\n"
+ "ldr q3, [x15, #0x30]\n"
+ "add v9.4s, v9.4s, v1.4s\n"
+ "add x15, x15, #0x40\n"
+ "add v10.4s, v10.4s, v2.4s\n"
+ "add v11.4s, v11.4s, v3.4s\n"
+ "tbz %x[flags], #4, 15f\n"
+ "ldr q0, [x16, #0x0]\n"
+ "ldr q4, [x17, #0x0]\n"
+ "ldr q1, [x16, #0x10]\n"
+ "ldr q5, [x17, #0x10]\n"
+ "ldr q2, [x16, #0x20]\n"
+ "ldr q6, [x17, #0x20]\n"
+ "ldr q3, [x16, #0x30]\n"
+ "add x16, x16, #0x40\n"
+ "ldr q7, [x17, #0x30]\n"
+ "add x17, x17, #0x40\n"
+ "b 16f\n"
+ "15:" // Height 1: per layer parameters
+ "add x25, %x[qp], %[per_layer_right_shift]\n"
+ "add x24, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v0.4s }, [x25]\n"
+ "ld1r { v4.4s }, [x24]\n"
+ "mov v1.16b, v0.16b\n"
+ "mov v2.16b, v0.16b\n"
+ "mov v5.16b, v4.16b\n"
+ "mov v6.16b, v4.16b\n"
+ "mov v3.16b, v0.16b\n"
+ "mov v7.16b, v4.16b\n"
+ "16:" // Height 1: parameters loaded
+ "sqrdmulh v8.4s, v8.4s, v4.4s\n"
+ "sqrdmulh v9.4s, v9.4s, v5.4s\n"
+ "sqrdmulh v10.4s, v10.4s, v6.4s\n"
+ "sqrdmulh v11.4s, v11.4s, v7.4s\n"
+ "tbz %x[flags], #5, 17f\n"
+ "and v4.16b, v8.16b, v0.16b\n"
+ "and v5.16b, v9.16b, v1.16b\n"
+ "and v6.16b, v10.16b, v2.16b\n"
+ "and v7.16b, v11.16b, v3.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v8.4s, v8.4s, v4.4s\n"
+ "sqadd v9.4s, v9.4s, v5.4s\n"
+ "sqadd v10.4s, v10.4s, v6.4s\n"
+ "sqadd v11.4s, v11.4s, v7.4s\n"
+ "17:" // Height 1: no shift correction
+ "srshl v8.4s, v8.4s, v0.4s\n"
+ "srshl v9.4s, v9.4s, v1.4s\n"
+ "srshl v10.4s, v10.4s, v2.4s\n"
+ "srshl v11.4s, v11.4s, v3.4s\n"
+ "add x24, %x[qp], %[c_offset]\n"
+ "add x25, %x[qp], %[minval]\n"
+ "ld1r { v4.4s }, [x24]\n"
+ "add x24, %x[qp], %[maxval]\n"
+ "ld1r { v5.4s }, [x25]\n"
+ "cmp x14, #0x10\n"
+ "ld1r { v6.4s }, [x24]\n"
+ "add v8.4s, v8.4s, v4.4s\n"
+ "add v9.4s, v9.4s, v4.4s\n"
+ "add v10.4s, v10.4s, v4.4s\n"
+ "add v11.4s, v11.4s, v4.4s\n"
+ "smin v8.4s, v8.4s, v6.4s\n"
+ "smin v9.4s, v9.4s, v6.4s\n"
+ "smin v10.4s, v10.4s, v6.4s\n"
+ "smin v11.4s, v11.4s, v6.4s\n"
+ "smax v8.4s, v8.4s, v5.4s\n"
+ "smax v9.4s, v9.4s, v5.4s\n"
+ "smax v10.4s, v10.4s, v5.4s\n"
+ "smax v11.4s, v11.4s, v5.4s\n"
+ "uzp1 v8.8h, v8.8h, v9.8h\n"
+ "uzp1 v9.8h, v10.8h, v11.8h\n"
+ "uzp1 v8.16b, v8.16b, v9.16b\n"
+ "bge 26f\n"
+ "tbz x14, #3, 21f\n"
+ "str d8, [x13], #0x8\n"
+ "tbz x14, #2, 19f\n"
+ "st1 { v8.s }[2], [x13], #0x4\n"
+ "tbz x14, #1, 18f\n"
+ "st1 { v8.h }[6], [x13], #0x2\n"
+ "tbz x14, #0, 25f\n"
+ "st1 { v8.b }[14], [x13]\n"
+ "b 25f\n"
+ "18:" // Height 1: Partial direct writeback: partial_1_12
+ "tbz x14, #0, 25f\n"
+ "st1 { v8.b }[12], [x13]\n"
+ "b 25f\n"
+ "19:" // Height 1: Partial direct writeback: partial_2_8
+ "tbz x14, #1, 20f\n"
+ "st1 { v8.h }[4], [x13], #0x2\n"
+ "tbz x14, #0, 25f\n"
+ "st1 { v8.b }[10], [x13]\n"
+ "b 25f\n"
+ "20:" // Height 1: Partial direct writeback: partial_1_8
+ "tbz x14, #0, 25f\n"
+ "st1 { v8.b }[8], [x13]\n"
+ "b 25f\n"
+ "21:" // Height 1: Partial direct writeback: partial_4_0
+ "tbz x14, #2, 23f\n"
+ "str s8, [x13], #0x4\n"
+ "tbz x14, #1, 22f\n"
+ "st1 { v8.h }[2], [x13], #0x2\n"
+ "tbz x14, #0, 25f\n"
+ "st1 { v8.b }[6], [x13]\n"
+ "b 25f\n"
+ "22:" // Height 1: Partial direct writeback: partial_1_4
+ "tbz x14, #0, 25f\n"
+ "st1 { v8.b }[4], [x13]\n"
+ "b 25f\n"
+ "23:" // Height 1: Partial direct writeback: partial_2_0
+ "tbz x14, #1, 24f\n"
+ "str h8, [x13], #0x2\n"
+ "tbz x14, #0, 25f\n"
+ "st1 { v8.b }[2], [x13]\n"
+ "b 25f\n"
+ "24:" // Height 1: Partial direct writeback: partial_1_0
+ "str b8, [x13, #0x0]\n"
+ "25:" // Height 1: Partial direct writeback: Done
+ "b 27f\n"
+ "26:" // Height 1: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "add x13, x13, #0x10\n"
+ "27:" // Height 1: Writeback done
+ "subs x14, x14, #0x10\n"
+ "bgt 2b\n"
+ "b 164f\n"
+ "28:" // Height 2
+ "ldr x17, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
+ "mov x15, %x[col_bias]\n"
+ "ldr x16, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "29:" // Height 2: Column loop
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "30:" // Height 2: setup done
+ "mov x11, #0x0\n"
+ "31:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w10, [x20, x11, LSL #0x2]\n"
+ "tbz %x[flags], #3, 32f\n"
+ "ldr x20, [%x[input_ptr], x11, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x9, [x20, #0x0]\n"
+ "ldr x26, [x20, #0x8]\n"
+ "cbnz x11, 33f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x9, x9, x19\n"
+ "add x26, x26, x19\n"
+ "b 33f\n"
+ "32:" // Height 2: setup direct input
+ "mov x9, %x[input_ptr]\n"
+ "add x26, x9, x19\n"
+ "33:" // Height 2: input setup done
+ "cmp x10, #0x10\n"
+ "blt 36f\n"
+ "ldr q0, [x9, #0x0]\n"
+ "ldr q1, [x26, #0x0]\n"
+ "cmp x10, #0x20\n"
+ "ldr q6, [x12, #0x0]\n"
+ "blt 35f\n"
+ "34:" // Height 2: Multiply loop: Main loop head
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ "ldr d7, [x12, #0x10]\n"
+ "add x9, x9, #0x10\n"
+ "ldr x19, [x12, #0x18]\n"
+ "add x26, x26, #0x10\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
+ "sub x10, x10, #0x10\n"
+ "ldr d6, [x12, #0x20]\n"
+ "cmp x10, #0x20\n"
+ "mov v7.d[1], x19\n"
+ "ldr x28, [x12, #0x28]\n"
+ "ldr x19, [x12, #0x38]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ "mov v6.d[1], x28\n"
+ "ldr d7, [x12, #0x30]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ "mov v7.d[1], x19\n"
+ "ldr d6, [x12, #0x40]\n"
+ "ldr x28, [x12, #0x48]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ "ldr d7, [x12, #0x50]\n"
+ "mov v6.d[1], x28\n"
+ "ldr x19, [x12, #0x58]\n"
+ "ldr x28, [x12, #0x68]\n"
+ ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
+ "mov v7.d[1], x19\n"
+ "ldr d6, [x12, #0x60]\n"
+ "ldr x19, [x12, #0x78]\n"
+ ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
+ "mov v6.d[1], x28\n"
+ "ldr d7, [x12, #0x70]\n"
+ "ldr x28, [x12, #0x88]\n"
+ ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
+ "mov v7.d[1], x19\n"
+ "ldr d6, [x12, #0x80]\n"
+ "ldr x19, [x12, #0x98]\n"
+ ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
+ "mov v6.d[1], x28\n"
+ "ldr d7, [x12, #0x90]\n"
+ "ldr x28, [x12, #0xa8]\n"
+ ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
+ "mov v7.d[1], x19\n"
+ "ldr d6, [x12, #0xa0]\n"
+ "ldr x19, [x12, #0xb8]\n"
+ ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
+ "mov v6.d[1], x28\n"
+ "ldr d7, [x12, #0xb0]\n"
+ "ldr x28, [x12, #0xc8]\n"
+ ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
+ "mov v7.d[1], x19\n"
+ "ldr d6, [x12, #0xc0]\n"
+ "ldr x19, [x12, #0xd8]\n"
+ ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
+ "mov v6.d[1], x28\n"
+ "ldr d7, [x12, #0xd0]\n"
+ "ldr x28, [x12, #0xe8]\n"
+ ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
+ "mov v7.d[1], x19\n"
+ "ldr d6, [x12, #0xe0]\n"
+ "ldr x19, [x12, #0xf8]\n"
+ ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
+ "mov v6.d[1], x28\n"
+ "ldr d7, [x12, #0xf0]\n"
+ "ldr x27, [x9, #0x8]\n"
+ "add x12, x12, #0x100\n"
+ ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
+ "mov v7.d[1], x19\n"
+ "ldr x25, [x26, #0x8]\n"
+ "ldr d6, [x12, #0x0]\n"
+ ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
+ "ldr d0, [x9, #0x0]\n"
+ "ldr d1, [x26, #0x0]\n"
+ "ldr x28, [x12, #0x8]\n"
+ "mov v0.d[1], x27\n"
+ "mov v1.d[1], x25\n"
+ "mov v6.d[1], x28\n"
+ "bge 34b\n"
+ "35:" // Height 2: Multiply loop: Single iteration only
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ "ldr q7, [x12, #0x10]\n"
+ "sub x10, x10, #0x10\n"
+ "ldr q6, [x12, #0x20]\n"
+ "add x9, x9, #0x10\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ "ldr q7, [x12, #0x30]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "ldr q6, [x12, #0x40]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ "ldr q7, [x12, #0x50]\n"
+ ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
+ "ldr q6, [x12, #0x60]\n"
+ ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
+ "ldr q7, [x12, #0x70]\n"
+ ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
+ "ldr q6, [x12, #0x80]\n"
+ ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
+ "ldr q7, [x12, #0x90]\n"
+ ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
+ "ldr q6, [x12, #0xa0]\n"
+ ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
+ "ldr q7, [x12, #0xb0]\n"
+ ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
+ "ldr q6, [x12, #0xc0]\n"
+ ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
+ "ldr q7, [x12, #0xd0]\n"
+ ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
+ "ldr q6, [x12, #0xe0]\n"
+ ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
+ "ldr q7, [x12, #0xf0]\n"
+ "add x12, x12, #0x100\n"
+ ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
+ "36:" // Height 2: Multiply loop: Main loop skip
+ "cbz x10, 41f\n"
+ "cmp x10, #0x4\n"
+ "blt 38f\n"
+ "37:" // Height 2: Multiply loop: Odd block loop
+ "ldr s0, [x9], #0x4\n"
+ "sub x10, x10, #0x4\n"
+ "ldr s1, [x26], #0x4\n"
+ "cmp x10, #0x4\n"
+ "ldr q6, [x12, #0x0]\n"
+ "ldr q7, [x12, #0x10]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ "ldr q6, [x12, #0x20]\n"
+ "ldr q7, [x12, #0x30]\n"
+ "add x12, x12, #0x40\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ "bge 37b\n"
+ "cbz x10, 41f\n"
+ "38:" // Height 2: Multiply loop: Skip odd blocks
+ "tbz x10, #1, 39f\n"
+ "ldr h0, [x9], #0x2\n"
+ "ldr h1, [x26], #0x2\n"
+ "tbz x10, #0, 40f\n"
+ "ld1 { v0.b }[2], [x9]\n"
+ "ld1 { v1.b }[2], [x26]\n"
+ "b 40f\n"
+ "39:" // Height 2: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x9, #0x0]\n"
+ "ldr b1, [x26, #0x0]\n"
+ "40:" // Height 2: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x12, #0x0]\n"
+ "ldr q7, [x12, #0x10]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ "ldr q6, [x12, #0x20]\n"
+ "ldr q7, [x12, #0x30]\n"
+ "add x12, x12, #0x40\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ "41:" // Height 2: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x11, x11, #0x1\n"
+ "cmp x11, x19\n"
+ "bne 31b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "ldr q0, [x15, #0x0]\n"
+ "ldr q1, [x15, #0x10]\n"
+ "add x23, x13, x19\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "add v8.4s, v8.4s, v0.4s\n"
+ "add v12.4s, v12.4s, v0.4s\n"
+ "add v9.4s, v9.4s, v1.4s\n"
+ "add v13.4s, v13.4s, v1.4s\n"
+ "ldr q2, [x15, #0x20]\n"
+ "ldr q3, [x15, #0x30]\n"
+ "add x15, x15, #0x40\n"
+ "add v10.4s, v10.4s, v2.4s\n"
+ "add v14.4s, v14.4s, v2.4s\n"
+ "add v11.4s, v11.4s, v3.4s\n"
+ "add v15.4s, v15.4s, v3.4s\n"
+ "tbz %x[flags], #4, 42f\n"
+ "ldr q0, [x16, #0x0]\n"
+ "ldr q4, [x17, #0x0]\n"
+ "ldr q1, [x16, #0x10]\n"
+ "ldr q5, [x17, #0x10]\n"
+ "ldr q2, [x16, #0x20]\n"
+ "ldr q6, [x17, #0x20]\n"
+ "ldr q3, [x16, #0x30]\n"
+ "add x16, x16, #0x40\n"
+ "ldr q7, [x17, #0x30]\n"
+ "add x17, x17, #0x40\n"
+ "b 43f\n"
+ "42:" // Height 2: per layer parameters
+ "add x25, %x[qp], %[per_layer_right_shift]\n"
+ "add x24, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v0.4s }, [x25]\n"
+ "ld1r { v4.4s }, [x24]\n"
+ "mov v1.16b, v0.16b\n"
+ "mov v2.16b, v0.16b\n"
+ "mov v5.16b, v4.16b\n"
+ "mov v6.16b, v4.16b\n"
+ "mov v3.16b, v0.16b\n"
+ "mov v7.16b, v4.16b\n"
+ "43:" // Height 2: parameters loaded
+ "sqrdmulh v8.4s, v8.4s, v4.4s\n"
+ "sqrdmulh v9.4s, v9.4s, v5.4s\n"
+ "sqrdmulh v10.4s, v10.4s, v6.4s\n"
+ "sqrdmulh v11.4s, v11.4s, v7.4s\n"
+ "sqrdmulh v12.4s, v12.4s, v4.4s\n"
+ "sqrdmulh v13.4s, v13.4s, v5.4s\n"
+ "sqrdmulh v14.4s, v14.4s, v6.4s\n"
+ "sqrdmulh v15.4s, v15.4s, v7.4s\n"
+ "tbz %x[flags], #5, 44f\n"
+ "and v4.16b, v8.16b, v0.16b\n"
+ "and v5.16b, v9.16b, v1.16b\n"
+ "and v6.16b, v10.16b, v2.16b\n"
+ "and v7.16b, v11.16b, v3.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v8.4s, v8.4s, v4.4s\n"
+ "sqadd v9.4s, v9.4s, v5.4s\n"
+ "sqadd v10.4s, v10.4s, v6.4s\n"
+ "sqadd v11.4s, v11.4s, v7.4s\n"
+ "and v4.16b, v12.16b, v0.16b\n"
+ "and v5.16b, v13.16b, v1.16b\n"
+ "and v6.16b, v14.16b, v2.16b\n"
+ "and v7.16b, v15.16b, v3.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v12.4s, v12.4s, v4.4s\n"
+ "sqadd v13.4s, v13.4s, v5.4s\n"
+ "sqadd v14.4s, v14.4s, v6.4s\n"
+ "sqadd v15.4s, v15.4s, v7.4s\n"
+ "44:" // Height 2: no shift correction
+ "srshl v8.4s, v8.4s, v0.4s\n"
+ "srshl v9.4s, v9.4s, v1.4s\n"
+ "srshl v10.4s, v10.4s, v2.4s\n"
+ "srshl v11.4s, v11.4s, v3.4s\n"
+ "srshl v12.4s, v12.4s, v0.4s\n"
+ "srshl v13.4s, v13.4s, v1.4s\n"
+ "srshl v14.4s, v14.4s, v2.4s\n"
+ "srshl v15.4s, v15.4s, v3.4s\n"
+ "add x24, %x[qp], %[c_offset]\n"
+ "add x25, %x[qp], %[minval]\n"
+ "ld1r { v4.4s }, [x24]\n"
+ "add x24, %x[qp], %[maxval]\n"
+ "ld1r { v5.4s }, [x25]\n"
+ "cmp x14, #0x10\n"
+ "ld1r { v6.4s }, [x24]\n"
+ "add v8.4s, v8.4s, v4.4s\n"
+ "add v9.4s, v9.4s, v4.4s\n"
+ "add v10.4s, v10.4s, v4.4s\n"
+ "add v11.4s, v11.4s, v4.4s\n"
+ "add v12.4s, v12.4s, v4.4s\n"
+ "add v13.4s, v13.4s, v4.4s\n"
+ "add v14.4s, v14.4s, v4.4s\n"
+ "add v15.4s, v15.4s, v4.4s\n"
+ "smin v8.4s, v8.4s, v6.4s\n"
+ "smin v9.4s, v9.4s, v6.4s\n"
+ "smin v10.4s, v10.4s, v6.4s\n"
+ "smin v11.4s, v11.4s, v6.4s\n"
+ "smin v12.4s, v12.4s, v6.4s\n"
+ "smin v13.4s, v13.4s, v6.4s\n"
+ "smin v14.4s, v14.4s, v6.4s\n"
+ "smin v15.4s, v15.4s, v6.4s\n"
+ "smax v8.4s, v8.4s, v5.4s\n"
+ "smax v9.4s, v9.4s, v5.4s\n"
+ "smax v10.4s, v10.4s, v5.4s\n"
+ "smax v11.4s, v11.4s, v5.4s\n"
+ "smax v12.4s, v12.4s, v5.4s\n"
+ "smax v13.4s, v13.4s, v5.4s\n"
+ "smax v14.4s, v14.4s, v5.4s\n"
+ "smax v15.4s, v15.4s, v5.4s\n"
+ "uzp1 v8.8h, v8.8h, v9.8h\n"
+ "uzp1 v9.8h, v10.8h, v11.8h\n"
+ "uzp1 v12.8h, v12.8h, v13.8h\n"
+ "uzp1 v13.8h, v14.8h, v15.8h\n"
+ "uzp1 v8.16b, v8.16b, v9.16b\n"
+ "uzp1 v12.16b, v12.16b, v13.16b\n"
+ "bge 53f\n"
+ "tbz x14, #3, 48f\n"
+ "str d8, [x13], #0x8\n"
+ "str d12, [x23], #0x8\n"
+ "tbz x14, #2, 46f\n"
+ "st1 { v8.s }[2], [x13], #0x4\n"
+ "st1 { v12.s }[2], [x23], #0x4\n"
+ "tbz x14, #1, 45f\n"
+ "st1 { v8.h }[6], [x13], #0x2\n"
+ "st1 { v12.h }[6], [x23], #0x2\n"
+ "tbz x14, #0, 52f\n"
+ "st1 { v8.b }[14], [x13]\n"
+ "st1 { v12.b }[14], [x23]\n"
+ "b 52f\n"
+ "45:" // Height 2: Partial direct writeback: partial_1_12
+ "tbz x14, #0, 52f\n"
+ "st1 { v8.b }[12], [x13]\n"
+ "st1 { v12.b }[12], [x23]\n"
+ "b 52f\n"
+ "46:" // Height 2: Partial direct writeback: partial_2_8
+ "tbz x14, #1, 47f\n"
+ "st1 { v8.h }[4], [x13], #0x2\n"
+ "st1 { v12.h }[4], [x23], #0x2\n"
+ "tbz x14, #0, 52f\n"
+ "st1 { v8.b }[10], [x13]\n"
+ "st1 { v12.b }[10], [x23]\n"
+ "b 52f\n"
+ "47:" // Height 2: Partial direct writeback: partial_1_8
+ "tbz x14, #0, 52f\n"
+ "st1 { v8.b }[8], [x13]\n"
+ "st1 { v12.b }[8], [x23]\n"
+ "b 52f\n"
+ "48:" // Height 2: Partial direct writeback: partial_4_0
+ "tbz x14, #2, 50f\n"
+ "str s8, [x13], #0x4\n"
+ "str s12, [x23], #0x4\n"
+ "tbz x14, #1, 49f\n"
+ "st1 { v8.h }[2], [x13], #0x2\n"
+ "st1 { v12.h }[2], [x23], #0x2\n"
+ "tbz x14, #0, 52f\n"
+ "st1 { v8.b }[6], [x13]\n"
+ "st1 { v12.b }[6], [x23]\n"
+ "b 52f\n"
+ "49:" // Height 2: Partial direct writeback: partial_1_4
+ "tbz x14, #0, 52f\n"
+ "st1 { v8.b }[4], [x13]\n"
+ "st1 { v12.b }[4], [x23]\n"
+ "b 52f\n"
+ "50:" // Height 2: Partial direct writeback: partial_2_0
+ "tbz x14, #1, 51f\n"
+ "str h8, [x13], #0x2\n"
+ "str h12, [x23], #0x2\n"
+ "tbz x14, #0, 52f\n"
+ "st1 { v8.b }[2], [x13]\n"
+ "st1 { v12.b }[2], [x23]\n"
+ "b 52f\n"
+ "51:" // Height 2: Partial direct writeback: partial_1_0
+ "str b8, [x13, #0x0]\n"
+ "str b12, [x23, #0x0]\n"
+ "52:" // Height 2: Partial direct writeback: Done
+ "b 54f\n"
+ "53:" // Height 2: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "add x13, x13, #0x10\n"
+ "str q12, [x23, #0x0]\n"
+ "54:" // Height 2: Writeback done
+ "subs x14, x14, #0x10\n"
+ "bgt 29b\n"
+ "b 164f\n"
+ "55:" // Height 3
+ "ldr x17, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
+ "mov x15, %x[col_bias]\n"
+ "ldr x16, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "56:" // Height 3: Column loop
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "57:" // Height 3: setup done
+ "mov x11, #0x0\n"
+ "58:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w10, [x20, x11, LSL #0x2]\n"
+ "tbz %x[flags], #3, 59f\n"
+ "ldr x20, [%x[input_ptr], x11, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x9, [x20, #0x0]\n"
+ "ldr x26, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "cbnz x11, 60f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x9, x9, x19\n"
+ "add x26, x26, x19\n"
+ "add x24, x24, x19\n"
+ "b 60f\n"
+ "59:" // Height 3: setup direct input
+ "mov x9, %x[input_ptr]\n"
+ "add x26, x9, x19\n"
+ "add x24, x26, x19\n"
+ "60:" // Height 3: input setup done
+ "cmp x10, #0x10\n"
+ "blt 63f\n"
+ "ldr q0, [x9, #0x0]\n"
+ "ldr q1, [x26, #0x0]\n"
+ "cmp x10, #0x20\n"
+ "ldr q2, [x24, #0x0]\n"
+ "ldr q6, [x12, #0x0]\n"
+ "blt 62f\n"
+ "61:" // Height 3: Multiply loop: Main loop head
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ "ldr d7, [x12, #0x10]\n"
+ "ldr x19, [x12, #0x18]\n"
+ "add x9, x9, #0x10\n"
+ "ldr d6, [x12, #0x20]\n"
+ "add x26, x26, #0x10\n"
+ "ldr x28, [x12, #0x28]\n"
+ "add x24, x24, #0x10\n"
+ "mov v7.d[1], x19\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
+ "ldr x19, [x12, #0x38]\n"
+ "sub x10, x10, #0x10\n"
+ "mov v6.d[1], x28\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ "ldr d7, [x12, #0x30]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "cmp x10, #0x20\n"
+ "ldr d6, [x12, #0x40]\n"
+ "mov v7.d[1], x19\n"
+ "ldr x28, [x12, #0x48]\n"
+ "ldr x19, [x12, #0x58]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ "mov v6.d[1], x28\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ "ldr d7, [x12, #0x50]\n"
+ ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "mov v7.d[1], x19\n"
+ "ldr d6, [x12, #0x60]\n"
+ "ldr x28, [x12, #0x68]\n"
+ ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
+ "ldr d7, [x12, #0x70]\n"
+ "mov v6.d[1], x28\n"
+ "ldr x19, [x12, #0x78]\n"
+ "ldr x28, [x12, #0x88]\n"
+ ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
+ "mov v7.d[1], x19\n"
+ ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
+ "ldr d6, [x12, #0x80]\n"
+ ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
+ "ldr d7, [x12, #0x90]\n"
+ "mov v6.d[1], x28\n"
+ "ldr x19, [x12, #0x98]\n"
+ "ldr x28, [x12, #0xa8]\n"
+ ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
+ "mov v7.d[1], x19\n"
+ ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
+ "ldr d6, [x12, #0xa0]\n"
+ ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
+ "ldr d7, [x12, #0xb0]\n"
+ "mov v6.d[1], x28\n"
+ "ldr x19, [x12, #0xb8]\n"
+ "ldr x28, [x12, #0xc8]\n"
+ ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
+ "mov v7.d[1], x19\n"
+ ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
+ "ldr d6, [x12, #0xc0]\n"
+ ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
+ "ldr d7, [x12, #0xd0]\n"
+ "mov v6.d[1], x28\n"
+ "ldr x19, [x12, #0xd8]\n"
+ "ldr x28, [x12, #0xe8]\n"
+ ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
+ "mov v7.d[1], x19\n"
+ ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
+ "ldr d6, [x12, #0xe0]\n"
+ ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
+ "ldr d7, [x12, #0xf0]\n"
+ "mov v6.d[1], x28\n"
+ "ldr x19, [x12, #0xf8]\n"
+ "ldr x27, [x9, #0x8]\n"
+ "add x12, x12, #0x100\n"
+ ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
+ "mov v7.d[1], x19\n"
+ ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
+ "ldr x25, [x26, #0x8]\n"
+ ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
+ "ldr d0, [x9, #0x0]\n"
+ "ldr d1, [x26, #0x0]\n"
+ "ldr d2, [x24, #0x0]\n"
+ "mov v0.d[1], x27\n"
+ "ldr x21, [x24, #0x8]\n"
+ "mov v1.d[1], x25\n"
+ "ldr d6, [x12, #0x0]\n"
+ "ldr x28, [x12, #0x8]\n"
+ "mov v2.d[1], x21\n"
+ "mov v6.d[1], x28\n"
+ "bge 61b\n"
+ "62:" // Height 3: Multiply loop: Single iteration only -- one 16-byte step per row; v0-v2 (row data) and v6 (vector at [x12]) are used as already loaded on entry
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ "ldr q7, [x12, #0x10]\n" // full 128-bit loads here (no 64-bit split loads in this block)
+ "ldr q6, [x12, #0x20]\n"
+ "sub x10, x10, #0x10\n" // 16 bytes of each row are consumed by this block
+ "add x9, x9, #0x10\n" // step row pointers x9/x26/x24 past the 16 bytes held in v0/v1/v2
+ "add x26, x26, #0x10\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ "prfm pldl1keep, [x9, #0x80]\n" // prefetch-for-load 0x80 bytes beyond the advanced row pointer
+ "add x24, x24, #0x10\n"
+ "ldr q7, [x12, #0x30]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "ldr q6, [x12, #0x40]\n" // offsets 0x40-0x70 pair with byte group [1] of v0-v2
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ "ldr q7, [x12, #0x50]\n"
+ ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
+ "ldr q6, [x12, #0x60]\n"
+ "ldr q7, [x12, #0x70]\n"
+ ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
+ "ldr q6, [x12, #0x80]\n" // offsets 0x80-0xb0 pair with byte group [2]
+ "ldr q7, [x12, #0x90]\n"
+ ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
+ "ldr q6, [x12, #0xa0]\n"
+ "ldr q7, [x12, #0xb0]\n"
+ ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
+ "ldr q6, [x12, #0xc0]\n" // offsets 0xc0-0xf0 pair with byte group [3]
+ "ldr q7, [x12, #0xd0]\n"
+ ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
+ "ldr q6, [x12, #0xe0]\n"
+ "ldr q7, [x12, #0xf0]\n"
+ "add x12, x12, #0x100\n" // sixteen 16-byte vectors (0x100 bytes) consumed from x12
+ ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
+ "63:" // Height 3: Multiply loop: Main loop skip
+ "cbz x10, 68f\n" // nothing left in x10: skip all tail handling
+ "cmp x10, #0x4\n"
+ "blt 65f\n" // fewer than 4 bytes left: go straight to the ragged read
+ "64:" // Height 3: Multiply loop: Odd block loop -- 4 bytes per row and 0x40 bytes from x12 per pass
+ "ldr s0, [x9], #0x4\n" // post-indexed 4-byte load; only element [0] of v0 is used below
+ "sub x10, x10, #0x4\n"
+ "ldr s1, [x26], #0x4\n"
+ "cmp x10, #0x4\n" // flags for "bge 64b" below; no instruction in between writes NZCV
+ "ldr s2, [x24], #0x4\n"
+ "ldr q6, [x12, #0x0]\n"
+ "ldr q7, [x12, #0x10]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ "ldr q6, [x12, #0x20]\n"
+ "ldr q7, [x12, #0x30]\n"
+ "add x12, x12, #0x40\n" // four 16-byte vectors consumed from x12
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ "bge 64b\n" // repeat while at least 4 bytes remain
+ "cbz x10, 68f\n" // exact multiple of 4: no ragged tail
+ "65:" // Height 3: Multiply loop: Skip odd blocks -- x10 is non-zero and below 4 here (1..3, assuming x10 is never negative)
+ "tbz x10, #1, 66f\n" // bit 1 clear: only a single byte remains
+ "ldr h0, [x9], #0x2\n" // 2-byte scalar load; the rest of v0 is zeroed by the load
+ "ldr h1, [x26], #0x2\n"
+ "ldr h2, [x24], #0x2\n"
+ "tbz x10, #0, 67f\n" // bit 0 clear: exactly 2 bytes remained
+ "ld1 { v0.b }[2], [x9]\n" // third byte inserted into byte lane 2; other lanes are kept
+ "ld1 { v1.b }[2], [x26]\n"
+ "ld1 { v2.b }[2], [x24]\n"
+ "b 67f\n"
+ "66:" // Height 3: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x9, #0x0]\n" // 1-byte scalar load; the rest of v0 is zeroed by the load
+ "ldr b1, [x26, #0x0]\n"
+ "ldr b2, [x24, #0x0]\n"
+ "67:" // Height 3: Multiply loop: Ragged operand read: Done -- bytes of element [0] that were not loaded are zero, so they add nothing to the dot products
+ "ldr q6, [x12, #0x0]\n"
+ "ldr q7, [x12, #0x10]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ "ldr q6, [x12, #0x20]\n"
+ "ldr q7, [x12, #0x30]\n"
+ "add x12, x12, #0x40\n" // a full 0x40-byte group is consumed from x12 even for a partial read
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ "68:" // Height 3: Multiply loop: No odd multiplies -- close the string loop, then add bias and fetch requantise parameters
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x11, x11, #0x1\n" // x11 is the string counter compared against num_strings
+ "cmp x11, x19\n"
+ "bne 58b\n" // more strings to accumulate (label 58 is earlier in the file)
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" // x19 = byte distance between output rows (added to x13 below)
+ "prfm pstl1keep, [x13, #0x0]\n" // prefetch-for-store on output row 0
+ "ldr q0, [x15, #0x0]\n" // NOTE(review): x15 presumably walks col_bias, as in the Height 4 setup at label 82 -- confirm
+ "ldr q1, [x15, #0x10]\n"
+ "add x23, x13, x19\n" // x23 = output row 1
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "add x22, x23, x19\n" // x22 = output row 2
+ "add v8.4s, v8.4s, v0.4s\n" // the same four per-column vectors v0-v3 are added to every row's accumulators
+ "add v12.4s, v12.4s, v0.4s\n"
+ "add v9.4s, v9.4s, v1.4s\n"
+ "add v13.4s, v13.4s, v1.4s\n"
+ "add v16.4s, v16.4s, v0.4s\n"
+ "add v17.4s, v17.4s, v1.4s\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "ldr q2, [x15, #0x20]\n"
+ "ldr q3, [x15, #0x30]\n"
+ "add x15, x15, #0x40\n" // 16 x 32-bit values consumed from x15
+ "add v10.4s, v10.4s, v2.4s\n"
+ "add v14.4s, v14.4s, v2.4s\n"
+ "add v11.4s, v11.4s, v3.4s\n"
+ "add v15.4s, v15.4s, v3.4s\n"
+ "add v18.4s, v18.4s, v2.4s\n"
+ "add v19.4s, v19.4s, v3.4s\n"
+ "tbz %x[flags], #4, 69f\n" // flags bit 4 clear: use the per-layer parameters at label 69
+ "ldr q0, [x16, #0x0]\n" // bit 4 set: v0-v3 from x16 and v4-v7 from x17, 16 lanes each (same roles as the per-layer shift/mul at label 69)
+ "ldr q4, [x17, #0x0]\n"
+ "ldr q1, [x16, #0x10]\n"
+ "ldr q5, [x17, #0x10]\n"
+ "ldr q2, [x16, #0x20]\n"
+ "ldr q6, [x17, #0x20]\n"
+ "ldr q3, [x16, #0x30]\n"
+ "add x16, x16, #0x40\n"
+ "ldr q7, [x17, #0x30]\n"
+ "add x17, x17, #0x40\n"
+ "b 70f\n"
+ "69:" // Height 3: per layer parameters
+ "add x25, %x[qp], %[per_layer_right_shift]\n" // x25/x24 are scratch addresses here; x24 is no longer needed as an input pointer
+ "add x24, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v0.4s }, [x25]\n" // broadcast the single shift value to all lanes of v0
+ "ld1r { v4.4s }, [x24]\n" // broadcast the single multiplier to all lanes of v4
+ "mov v1.16b, v0.16b\n" // replicate so v0-v3 / v4-v7 have the same layout as the per-column path
+ "mov v2.16b, v0.16b\n"
+ "mov v5.16b, v4.16b\n"
+ "mov v6.16b, v4.16b\n"
+ "mov v3.16b, v0.16b\n"
+ "mov v7.16b, v4.16b\n"
+ "70:" // Height 3: parameters loaded -- v0-v3 hold shifts, v4-v7 hold multipliers
+ "sqrdmulh v8.4s, v8.4s, v4.4s\n" // saturating rounding doubling multiply, high half kept, per 32-bit lane
+ "sqrdmulh v9.4s, v9.4s, v5.4s\n"
+ "sqrdmulh v10.4s, v10.4s, v6.4s\n"
+ "sqrdmulh v11.4s, v11.4s, v7.4s\n"
+ "sqrdmulh v12.4s, v12.4s, v4.4s\n"
+ "sqrdmulh v13.4s, v13.4s, v5.4s\n"
+ "sqrdmulh v14.4s, v14.4s, v6.4s\n"
+ "sqrdmulh v15.4s, v15.4s, v7.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v4.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v5.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v6.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v7.4s\n"
+ "tbz %x[flags], #5, 71f\n" // flags bit 5 clear: skip the correction below
+ "and v4.16b, v8.16b, v0.16b\n" // v4-v7 are free after the multiplies and are reused as temporaries
+ "and v5.16b, v9.16b, v1.16b\n"
+ "and v6.16b, v10.16b, v2.16b\n"
+ "and v7.16b, v11.16b, v3.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n" // lane = -1 where value and shift are both negative, else 0
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v8.4s, v8.4s, v4.4s\n" // saturating add of that 0/-1 mask to row 0
+ "and v4.16b, v12.16b, v0.16b\n" // same three steps for row 1 (v12-v15)
+ "sqadd v9.4s, v9.4s, v5.4s\n"
+ "sqadd v10.4s, v10.4s, v6.4s\n"
+ "sqadd v11.4s, v11.4s, v7.4s\n"
+ "and v5.16b, v13.16b, v1.16b\n"
+ "and v6.16b, v14.16b, v2.16b\n"
+ "and v7.16b, v15.16b, v3.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v12.4s, v12.4s, v4.4s\n"
+ "and v4.16b, v16.16b, v0.16b\n" // and for row 2 (v16-v19)
+ "sqadd v13.4s, v13.4s, v5.4s\n"
+ "and v5.16b, v17.16b, v1.16b\n"
+ "sqadd v14.4s, v14.4s, v6.4s\n"
+ "sqadd v15.4s, v15.4s, v7.4s\n"
+ "and v6.16b, v18.16b, v2.16b\n"
+ "and v7.16b, v19.16b, v3.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v4.4s\n"
+ "sqadd v17.4s, v17.4s, v5.4s\n"
+ "sqadd v18.4s, v18.4s, v6.4s\n"
+ "sqadd v19.4s, v19.4s, v7.4s\n"
+ "71:" // Height 3: no shift correction -- shift, add output offset, clamp, then narrow 32-bit lanes to bytes
+ "srshl v8.4s, v8.4s, v0.4s\n" // signed rounding shift by the per-lane amount in v0-v3 (a negative amount shifts right)
+ "srshl v9.4s, v9.4s, v1.4s\n"
+ "srshl v10.4s, v10.4s, v2.4s\n"
+ "srshl v11.4s, v11.4s, v3.4s\n"
+ "srshl v12.4s, v12.4s, v0.4s\n"
+ "srshl v13.4s, v13.4s, v1.4s\n"
+ "srshl v14.4s, v14.4s, v2.4s\n"
+ "srshl v15.4s, v15.4s, v3.4s\n"
+ "srshl v16.4s, v16.4s, v0.4s\n"
+ "srshl v17.4s, v17.4s, v1.4s\n"
+ "srshl v18.4s, v18.4s, v2.4s\n"
+ "srshl v19.4s, v19.4s, v3.4s\n"
+ "add x24, %x[qp], %[c_offset]\n"
+ "add x25, %x[qp], %[minval]\n"
+ "ld1r { v4.4s }, [x24]\n" // v4 = c_offset in every lane
+ "add x24, %x[qp], %[maxval]\n"
+ "ld1r { v5.4s }, [x25]\n" // v5 = minval in every lane
+ "cmp x14, #0x10\n" // flags are consumed by "bge 80f" at the end of this block; nothing in between writes NZCV
+ "ld1r { v6.4s }, [x24]\n" // v6 = maxval in every lane
+ "add v8.4s, v8.4s, v4.4s\n"
+ "add v9.4s, v9.4s, v4.4s\n"
+ "add v10.4s, v10.4s, v4.4s\n"
+ "add v11.4s, v11.4s, v4.4s\n"
+ "add v12.4s, v12.4s, v4.4s\n"
+ "add v13.4s, v13.4s, v4.4s\n"
+ "add v14.4s, v14.4s, v4.4s\n"
+ "add v15.4s, v15.4s, v4.4s\n"
+ "add v16.4s, v16.4s, v4.4s\n"
+ "add v17.4s, v17.4s, v4.4s\n"
+ "smin v8.4s, v8.4s, v6.4s\n" // upper clamp at maxval
+ "smin v9.4s, v9.4s, v6.4s\n"
+ "smin v10.4s, v10.4s, v6.4s\n"
+ "smin v11.4s, v11.4s, v6.4s\n"
+ "smin v12.4s, v12.4s, v6.4s\n"
+ "smin v13.4s, v13.4s, v6.4s\n"
+ "smin v14.4s, v14.4s, v6.4s\n"
+ "smin v15.4s, v15.4s, v6.4s\n"
+ "smin v16.4s, v16.4s, v6.4s\n"
+ "smin v17.4s, v17.4s, v6.4s\n"
+ "smax v8.4s, v8.4s, v5.4s\n" // lower clamp at minval
+ "smax v9.4s, v9.4s, v5.4s\n"
+ "smax v10.4s, v10.4s, v5.4s\n"
+ "smax v11.4s, v11.4s, v5.4s\n"
+ "smax v12.4s, v12.4s, v5.4s\n"
+ "smax v13.4s, v13.4s, v5.4s\n"
+ "smax v14.4s, v14.4s, v5.4s\n"
+ "smax v15.4s, v15.4s, v5.4s\n"
+ "smax v16.4s, v16.4s, v5.4s\n"
+ "smax v17.4s, v17.4s, v5.4s\n"
+ "add v18.4s, v18.4s, v4.4s\n" // v18/v19 get the same offset + clamp, interleaved with the narrowing below
+ "add v19.4s, v19.4s, v4.4s\n"
+ "uzp1 v8.8h, v8.8h, v9.8h\n" // even halfwords = low 16 bits of each 32-bit lane: v8 <- v8,v9
+ "uzp1 v9.8h, v10.8h, v11.8h\n" // v9 <- low halves of v10,v11
+ "uzp1 v12.8h, v12.8h, v13.8h\n"
+ "uzp1 v13.8h, v14.8h, v15.8h\n"
+ "uzp1 v16.8h, v16.8h, v17.8h\n"
+ "smin v18.4s, v18.4s, v6.4s\n"
+ "smin v19.4s, v19.4s, v6.4s\n"
+ "uzp1 v8.16b, v8.16b, v9.16b\n" // even bytes = low 8 bits of each halfword: v8 now holds the 16 output bytes of row 0
+ "uzp1 v12.16b, v12.16b, v13.16b\n" // v12 = 16 output bytes of row 1
+ "smax v18.4s, v18.4s, v5.4s\n"
+ "smax v19.4s, v19.4s, v5.4s\n"
+ "uzp1 v17.8h, v18.8h, v19.8h\n"
+ "uzp1 v16.16b, v16.16b, v17.16b\n" // v16 = 16 output bytes of row 2
+ "bge 80f\n" // x14 >= 16 (cmp above): store a full 16-byte vector per row
+ "tbz x14, #3, 75f\n" // partial store, x14 < 16: bits 3..0 of x14 select 8/4/2/1-byte stores in turn
+ "str d8, [x13], #0x8\n" // bytes 0-7 of each row (x13/x23/x22 = rows 0/1/2), post-incremented
+ "str d12, [x23], #0x8\n"
+ "str d16, [x22], #0x8\n"
+ "tbz x14, #2, 73f\n"
+ "st1 { v8.s }[2], [x13], #0x4\n" // bytes 8-11
+ "st1 { v12.s }[2], [x23], #0x4\n"
+ "st1 { v16.s }[2], [x22], #0x4\n"
+ "tbz x14, #1, 72f\n"
+ "st1 { v8.h }[6], [x13], #0x2\n" // bytes 12-13
+ "st1 { v12.h }[6], [x23], #0x2\n"
+ "st1 { v16.h }[6], [x22], #0x2\n"
+ "tbz x14, #0, 79f\n"
+ "st1 { v8.b }[14], [x13]\n" // byte 14
+ "st1 { v12.b }[14], [x23]\n"
+ "st1 { v16.b }[14], [x22]\n"
+ "b 79f\n"
+ "72:" // Height 3: Partial direct writeback: partial_1_12
+ "tbz x14, #0, 79f\n"
+ "st1 { v8.b }[12], [x13]\n"
+ "st1 { v12.b }[12], [x23]\n"
+ "st1 { v16.b }[12], [x22]\n"
+ "b 79f\n"
+ "73:" // Height 3: Partial direct writeback: partial_2_8
+ "tbz x14, #1, 74f\n"
+ "st1 { v8.h }[4], [x13], #0x2\n" // bytes 8-9
+ "st1 { v12.h }[4], [x23], #0x2\n"
+ "st1 { v16.h }[4], [x22], #0x2\n"
+ "tbz x14, #0, 79f\n"
+ "st1 { v8.b }[10], [x13]\n"
+ "st1 { v12.b }[10], [x23]\n"
+ "st1 { v16.b }[10], [x22]\n"
+ "b 79f\n"
+ "74:" // Height 3: Partial direct writeback: partial_1_8
+ "tbz x14, #0, 79f\n"
+ "st1 { v8.b }[8], [x13]\n"
+ "st1 { v12.b }[8], [x23]\n"
+ "st1 { v16.b }[8], [x22]\n"
+ "b 79f\n"
+ "75:" // Height 3: Partial direct writeback: partial_4_0
+ "tbz x14, #2, 77f\n"
+ "str s8, [x13], #0x4\n" // bytes 0-3
+ "str s12, [x23], #0x4\n"
+ "str s16, [x22], #0x4\n"
+ "tbz x14, #1, 76f\n"
+ "st1 { v8.h }[2], [x13], #0x2\n" // bytes 4-5
+ "st1 { v12.h }[2], [x23], #0x2\n"
+ "st1 { v16.h }[2], [x22], #0x2\n"
+ "tbz x14, #0, 79f\n"
+ "st1 { v8.b }[6], [x13]\n"
+ "st1 { v12.b }[6], [x23]\n"
+ "st1 { v16.b }[6], [x22]\n"
+ "b 79f\n"
+ "76:" // Height 3: Partial direct writeback: partial_1_4
+ "tbz x14, #0, 79f\n"
+ "st1 { v8.b }[4], [x13]\n"
+ "st1 { v12.b }[4], [x23]\n"
+ "st1 { v16.b }[4], [x22]\n"
+ "b 79f\n"
+ "77:" // Height 3: Partial direct writeback: partial_2_0
+ "tbz x14, #1, 78f\n"
+ "str h8, [x13], #0x2\n" // bytes 0-1
+ "str h12, [x23], #0x2\n"
+ "str h16, [x22], #0x2\n"
+ "tbz x14, #0, 79f\n"
+ "st1 { v8.b }[2], [x13]\n"
+ "st1 { v12.b }[2], [x23]\n"
+ "st1 { v16.b }[2], [x22]\n"
+ "b 79f\n"
+ "78:" // Height 3: Partial direct writeback: partial_1_0 -- reached only with bits 3, 2 and 1 of x14 clear
+ "str b8, [x13, #0x0]\n" // stored without testing bit 0; presumably x14 is never 0 here -- TODO confirm
+ "str b12, [x23, #0x0]\n"
+ "str b16, [x22, #0x0]\n"
+ "79:" // Height 3: Partial direct writeback: Done
+ "b 81f\n"
+ "80:" // Height 3: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "add x13, x13, #0x10\n" // only x13 is advanced; x23/x22 are recomputed from x13 before the next writeback
+ "str q12, [x23, #0x0]\n"
+ "str q16, [x22, #0x0]\n"
+ "81:" // Height 3: Writeback done
+ "subs x14, x14, #0x10\n" // 16 columns handled per pass
+ "bgt 56b\n" // columns remain: branch back to label 56 (earlier in the file)
+ "b 164f\n" // Height 3 finished; label 164 is later in the file
+ "82:" // Height 4 -- same structure as the Height 3 path, with a fourth row in v3 / v20-v23
+ "ldr x17, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" // x17 = multiplier pointer
+ "mov x15, %x[col_bias]\n" // x15 = column bias pointer
+ "ldr x16, [%x[args_ptr], %[offsetof_shift_ptr]]\n" // x16 = shift pointer
+ "mov x13, %x[output_ptr]\n" // x13 = output row 0
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n" // x14 = N, counted down by 16 per column pass
+ "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n" // x12 = B pointer, advanced as B is consumed
+ "83:" // Height 4: Column loop
+ "movi v8.4s, #0x0\n" // clear 16 accumulators: v8-v11, v12-v15, v16-v19, v20-v23 = rows 0..3
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "84:" // Height 4: setup done
+ "mov x11, #0x0\n" // x11 = string index
+ "85:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w10, [x20, x11, LSL #0x2]\n" // x10 = string_lengths[x11] (32-bit entries), counted down as input bytes are consumed
+ "tbz %x[flags], #3, 86f\n" // flags bit 3 clear: direct input at label 86
+ "ldr x20, [%x[input_ptr], x11, LSL #0x3]\n" // bit 3 set: input_ptr[x11] is a pointer to an array of row pointers
+ "add x20, x20, x19, LSL #3\n" // skip input_offset 8-byte entries
+ "ldr x9, [x20, #0x0]\n" // x9/x26/x24/x23 = input rows 0..3
+ "ldr x26, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "cbnz x11, 87f\n" // the initial column offset is applied to the first string only
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x9, x9, x19\n"
+ "add x26, x26, x19\n"
+ "add x24, x24, x19\n"
+ "add x23, x23, x19\n"
+ "b 87f\n"
+ "86:" // Height 4: setup direct input
+ "mov x9, %x[input_ptr]\n" // row 0 starts at input_ptr
+ "add x26, x9, x19\n" // each further row is input_offset (x19) bytes after the previous one
+ "add x24, x26, x19\n"
+ "add x23, x24, x19\n"
+ "87:" // Height 4: input setup done
+ "cmp x10, #0x10\n"
+ "blt 90f\n" // fewer than 16 bytes in this string: tail handling only
+ "ldr q0, [x9, #0x0]\n" // preload 16 bytes of each of the four rows into v0-v3
+ "ldr q1, [x26, #0x0]\n"
+ "cmp x10, #0x20\n" // flags for "blt 89f" below; loads do not alter them
+ "ldr q2, [x24, #0x0]\n"
+ "ldr q3, [x23, #0x0]\n"
+ "ldr q6, [x12, #0x0]\n" // preload the first B vector
+ "blt 89f\n" // fewer than 32 bytes: exactly one 16-byte step, done at label 89
+ "88:" // Height 4: Multiply loop: Main loop head -- one 16-byte step per row per pass, plus the loads for the next step
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
+ "ldr d7, [x12, #0x10]\n" // each 128-bit load in this loop is split: low 64 bits via ldr d ...
+ "add x9, x9, #0x10\n" // row pointers now address the next step's data
+ "ldr x19, [x12, #0x18]\n" // ... high 64 bits via a general register (x19 / x28 are scratch) ...
+ "add x26, x26, #0x10\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
+ "add x24, x24, #0x10\n"
+ "ldr d6, [x12, #0x20]\n"
+ "add x23, x23, #0x10\n"
+ "mov v7.d[1], x19\n" // ... inserted into the upper half. NOTE(review): presumably a Cortex-A55 scheduling choice (commit title mentions A55) -- confirm
+ "ldr x28, [x12, #0x28]\n"
+ "ldr x19, [x12, #0x38]\n"
+ "sub x10, x10, #0x10\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ "mov v6.d[1], x28\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
+ "ldr d7, [x12, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
+ "mov v7.d[1], x19\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "ldr d6, [x12, #0x40]\n"
+ "cmp x10, #0x20\n" // loop condition for "bge 88b" at the bottom; nothing after this writes NZCV
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
+ "ldr x28, [x12, #0x48]\n"
+ "ldr d7, [x12, #0x50]\n"
+ "ldr x19, [x12, #0x58]\n"
+ "mov v6.d[1], x28\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "ldr x28, [x12, #0x68]\n"
+ ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
+ "mov v7.d[1], x19\n"
+ "ldr d6, [x12, #0x60]\n"
+ "ldr x19, [x12, #0x78]\n"
+ ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
+ "mov v6.d[1], x28\n"
+ ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
+ "ldr d7, [x12, #0x70]\n"
+ ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
+ "mov v7.d[1], x19\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "ldr d6, [x12, #0x80]\n"
+ ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
+ "ldr x28, [x12, #0x88]\n"
+ "ldr d7, [x12, #0x90]\n"
+ "ldr x19, [x12, #0x98]\n"
+ "mov v6.d[1], x28\n"
+ "ldr x28, [x12, #0xa8]\n"
+ "ldr x27, [x9, #0x8]\n" // upper 64 bits of the NEXT step's row 0 (x9 was already advanced); inserted into v0 at the bottom
+ ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
+ "mov v7.d[1], x19\n"
+ "ldr d6, [x12, #0xa0]\n"
+ "ldr x19, [x12, #0xb8]\n"
+ ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
+ "mov v6.d[1], x28\n"
+ ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
+ "ldr d7, [x12, #0xb0]\n"
+ ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
+ "mov v7.d[1], x19\n"
+ "ldr d6, [x12, #0xc0]\n"
+ "ldr x28, [x12, #0xc8]\n"
+ ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
+ "mov v6.d[1], x28\n"
+ "ldr d7, [x12, #0xd0]\n"
+ "ldr x19, [x12, #0xd8]\n"
+ ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
+ "mov v7.d[1], x19\n"
+ "ldr d6, [x12, #0xe0]\n"
+ "ldr x28, [x12, #0xe8]\n"
+ ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
+ "mov v6.d[1], x28\n"
+ "ldr d7, [x12, #0xf0]\n"
+ "ldr x19, [x12, #0xf8]\n"
+ "add x12, x12, #0x100\n" // 0x100 bytes of B consumed per pass
+ ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n"
+ "mov v7.d[1], x19\n"
+ "ldr d6, [x12, #0x0]\n" // v6 is free after the four sdots above: start loading the next step's first B vector
+ "ldr x28, [x12, #0x8]\n"
+ ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n"
+ "mov v6.d[1], x28\n"
+ "ldr d0, [x9, #0x0]\n" // last use of v0-v3 was just above: reload all four rows for the next step
+ "ldr d1, [x26, #0x0]\n"
+ "ldr x25, [x26, #0x8]\n"
+ "mov v0.d[1], x27\n"
+ "ldr d2, [x24, #0x0]\n"
+ "ldr x21, [x24, #0x8]\n"
+ "mov v1.d[1], x25\n"
+ "ldr d3, [x23, #0x0]\n"
+ "ldr x19, [x23, #0x8]\n"
+ "mov v2.d[1], x21\n"
+ "mov v3.d[1], x19\n"
+ "bge 88b\n" // at least 32 bytes left (cmp above): another full pass; otherwise fall into label 89 with v0-v3/v6 loaded
+ "89:" // Height 4: Multiply loop: Single iteration only -- one 16-byte step; v0-v3 and v6 are already loaded on entry, nothing is preloaded for a following step
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
+ "ldr q7, [x12, #0x10]\n" // full 128-bit loads here, unlike the split loads of the main loop
+ "sub x10, x10, #0x10\n" // x10 = bytes left for the tail code at label 90
+ "ldr q6, [x12, #0x20]\n"
+ "add x9, x9, #0x10\n" // step the row pointers past the data held in v0-v3
+ "prfm pldl1keep, [x9, #0x80]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
+ "ldr q7, [x12, #0x30]\n"
+ "add x24, x24, #0x10\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x23, x23, #0x10\n"
+ "ldr q6, [x12, #0x40]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
+ "ldr q7, [x12, #0x50]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "ldr q6, [x12, #0x60]\n"
+ ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
+ ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
+ "ldr q7, [x12, #0x70]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "ldr q6, [x12, #0x80]\n"
+ ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
+ ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
+ "ldr q7, [x12, #0x90]\n"
+ "ldr q6, [x12, #0xa0]\n"
+ ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
+ ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
+ "ldr q7, [x12, #0xb0]\n"
+ "ldr q6, [x12, #0xc0]\n"
+ ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
+ ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
+ "ldr q7, [x12, #0xd0]\n"
+ "ldr q6, [x12, #0xe0]\n"
+ ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
+ ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n"
+ "ldr q7, [x12, #0xf0]\n"
+ "add x12, x12, #0x100\n" // 0x100 bytes of B consumed by this step
+ ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n"
+ "90:" // Height 4: Multiply loop: Main loop skip
+ "cbz x10, 95f\n" // string fully consumed: no tail work
+ "cmp x10, #0x4\n"
+ "blt 92f\n" // fewer than 4 bytes left: go straight to the ragged read
+ "91:" // Height 4: Multiply loop: Odd block loop -- 4 bytes per row and 0x40 bytes of B per pass
+ "ldr s0, [x9], #0x4\n" // post-indexed 4-byte load; only element [0] is used below
+ "sub x10, x10, #0x4\n"
+ "ldr s1, [x26], #0x4\n"
+ "cmp x10, #0x4\n" // flags for "bge 91b" below; no instruction in between writes NZCV
+ "ldr s2, [x24], #0x4\n"
+ "ldr s3, [x23], #0x4\n"
+ "ldr q6, [x12, #0x0]\n"
+ "ldr q7, [x12, #0x10]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
+ "ldr q6, [x12, #0x20]\n"
+ "ldr q7, [x12, #0x30]\n"
+ "add x12, x12, #0x40\n" // four B vectors consumed
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
+ "bge 91b\n" // repeat while at least 4 bytes remain
+ "cbz x10, 95f\n" // exact multiple of 4: no ragged tail
+ "92:" // Height 4: Multiply loop: Skip odd blocks -- 1 to 3 bytes remain (x10 comes from a zero-extended 32-bit load, so it is not negative)
+ "tbz x10, #1, 93f\n" // bit 1 clear: only a single byte remains
+ "ldr h0, [x9], #0x2\n" // 2-byte scalar load; the rest of the vector is zeroed by the load
+ "ldr h1, [x26], #0x2\n"
+ "ldr h2, [x24], #0x2\n"
+ "ldr h3, [x23], #0x2\n"
+ "tbz x10, #0, 94f\n" // bit 0 clear: exactly 2 bytes remained
+ "ld1 { v0.b }[2], [x9]\n" // third byte inserted into byte lane 2; other lanes are kept
+ "ld1 { v1.b }[2], [x26]\n"
+ "ld1 { v2.b }[2], [x24]\n"
+ "ld1 { v3.b }[2], [x23]\n"
+ "b 94f\n"
+ "93:" // Height 4: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x9, #0x0]\n" // 1-byte scalar load; the rest of the vector is zeroed by the load
+ "ldr b1, [x26, #0x0]\n"
+ "ldr b2, [x24, #0x0]\n"
+ "ldr b3, [x23, #0x0]\n"
+ "94:" // Height 4: Multiply loop: Ragged operand read: Done -- bytes of element [0] that were not loaded are zero, so they add nothing to the dot products
+ "ldr q6, [x12, #0x0]\n"
+ "ldr q7, [x12, #0x10]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
+ "ldr q6, [x12, #0x20]\n"
+ "ldr q7, [x12, #0x30]\n"
+ "add x12, x12, #0x40\n" // a full 0x40-byte B group is consumed even for a partial read
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
+ "95:" // Height 4: Multiply loop: No odd multiplies -- close the string loop, then add bias and fetch requantise parameters
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x11, x11, #0x1\n" // next string index
+ "cmp x11, x19\n"
+ "bne 85b\n" // more strings to accumulate into the same accumulators
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" // x19 = byte distance between output rows
+ "prfm pstl1keep, [x13, #0x0]\n" // prefetch-for-store on output row 0
+ "ldr q0, [x15, #0x0]\n" // column bias for the first 8 columns of this block
+ "ldr q1, [x15, #0x10]\n"
+ "add x23, x13, x19\n" // x23 = output row 1 (its use as the row-3 input pointer is over)
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "add x22, x23, x19\n" // x22 = output row 2
+ "add v8.4s, v8.4s, v0.4s\n" // the same per-column bias is added to every row's accumulators
+ "add v12.4s, v12.4s, v0.4s\n"
+ "add v9.4s, v9.4s, v1.4s\n"
+ "add v13.4s, v13.4s, v1.4s\n"
+ "add v16.4s, v16.4s, v0.4s\n"
+ "add v17.4s, v17.4s, v1.4s\n"
+ "add v20.4s, v20.4s, v0.4s\n"
+ "add v21.4s, v21.4s, v1.4s\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "add x21, x22, x19\n" // x21 = output row 3
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "ldr q2, [x15, #0x20]\n" // column bias for the last 8 columns of this block
+ "ldr q3, [x15, #0x30]\n"
+ "add x15, x15, #0x40\n" // 16 x 32-bit bias values consumed
+ "add v10.4s, v10.4s, v2.4s\n"
+ "add v14.4s, v14.4s, v2.4s\n"
+ "add v11.4s, v11.4s, v3.4s\n"
+ "add v15.4s, v15.4s, v3.4s\n"
+ "add v18.4s, v18.4s, v2.4s\n"
+ "add v19.4s, v19.4s, v3.4s\n"
+ "add v22.4s, v22.4s, v2.4s\n"
+ "add v23.4s, v23.4s, v3.4s\n"
+ "tbz %x[flags], #4, 96f\n" // flags bit 4 clear: use the per-layer parameters at label 96
+ "ldr q0, [x16, #0x0]\n" // bit 4 set: 16 per-column shifts from the shift pointer (x16) into v0-v3 ...
+ "ldr q4, [x17, #0x0]\n" // ... and 16 per-column multipliers from the multiplier pointer (x17) into v4-v7
+ "ldr q1, [x16, #0x10]\n"
+ "ldr q5, [x17, #0x10]\n"
+ "ldr q2, [x16, #0x20]\n"
+ "ldr q6, [x17, #0x20]\n"
+ "ldr q3, [x16, #0x30]\n"
+ "add x16, x16, #0x40\n"
+ "ldr q7, [x17, #0x30]\n"
+ "add x17, x17, #0x40\n"
+ "b 97f\n"
+ "96:" // Height 4: per layer parameters
+ "add x25, %x[qp], %[per_layer_right_shift]\n" // x25/x24 are scratch addresses here; x24 is no longer needed as an input pointer
+ "add x24, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v0.4s }, [x25]\n" // broadcast the single shift value to all lanes of v0
+ "ld1r { v4.4s }, [x24]\n" // broadcast the single multiplier to all lanes of v4
+ "mov v1.16b, v0.16b\n" // replicate so v0-v3 / v4-v7 have the same layout as the per-column path
+ "mov v2.16b, v0.16b\n"
+ "mov v5.16b, v4.16b\n"
+ "mov v6.16b, v4.16b\n"
+ "mov v3.16b, v0.16b\n"
+ "mov v7.16b, v4.16b\n"
+ "97:" // Height 4: parameters loaded
+ "sqrdmulh v8.4s, v8.4s, v4.4s\n" // saturating rounding doubling multiply, high half: scale all 16 accumulator vectors (4 rows x 4 column groups) by v4-v7
+ "sqrdmulh v9.4s, v9.4s, v5.4s\n"
+ "sqrdmulh v10.4s, v10.4s, v6.4s\n"
+ "sqrdmulh v11.4s, v11.4s, v7.4s\n"
+ "sqrdmulh v12.4s, v12.4s, v4.4s\n"
+ "sqrdmulh v13.4s, v13.4s, v5.4s\n"
+ "sqrdmulh v14.4s, v14.4s, v6.4s\n"
+ "sqrdmulh v15.4s, v15.4s, v7.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v4.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v5.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v6.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v7.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v4.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v5.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v6.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v7.4s\n"
+ "tbz %x[flags], #5, 98f\n" // flags bit 5 clear: skip the correction below; v4-v7 are free to reuse as temporaries from here on
+ "and v4.16b, v8.16b, v0.16b\n" // value AND shift: the sign bit survives only where both are negative
+ "and v5.16b, v9.16b, v1.16b\n"
+ "and v6.16b, v10.16b, v2.16b\n"
+ "and v7.16b, v11.16b, v3.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n" // arithmetic >> 31 turns that sign bit into -1 (else 0)
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v8.4s, v8.4s, v4.4s\n" // saturating add of the 0/-1 term. NOTE(review): presumably compensates srshl's rounding of negative values - confirm
+ "and v4.16b, v12.16b, v0.16b\n" // same three steps for row 1 ...
+ "sqadd v9.4s, v9.4s, v5.4s\n"
+ "sqadd v10.4s, v10.4s, v6.4s\n"
+ "sqadd v11.4s, v11.4s, v7.4s\n"
+ "and v5.16b, v13.16b, v1.16b\n"
+ "and v6.16b, v14.16b, v2.16b\n"
+ "and v7.16b, v15.16b, v3.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v12.4s, v12.4s, v4.4s\n"
+ "and v4.16b, v16.16b, v0.16b\n" // ... row 2 ...
+ "sqadd v13.4s, v13.4s, v5.4s\n"
+ "and v5.16b, v17.16b, v1.16b\n"
+ "sqadd v14.4s, v14.4s, v6.4s\n"
+ "sqadd v15.4s, v15.4s, v7.4s\n"
+ "and v6.16b, v18.16b, v2.16b\n"
+ "and v7.16b, v19.16b, v3.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v4.4s\n"
+ "and v4.16b, v20.16b, v0.16b\n" // ... and row 3
+ "sqadd v17.4s, v17.4s, v5.4s\n"
+ "and v5.16b, v21.16b, v1.16b\n"
+ "sqadd v18.4s, v18.4s, v6.4s\n"
+ "sqadd v19.4s, v19.4s, v7.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v6.16b, v22.16b, v2.16b\n"
+ "and v7.16b, v23.16b, v3.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v4.4s\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v21.4s, v21.4s, v5.4s\n"
+ "sqadd v22.4s, v22.4s, v6.4s\n"
+ "sqadd v23.4s, v23.4s, v7.4s\n"
+ "98:" // Height 4: no shift correction
+ "srshl v8.4s, v8.4s, v0.4s\n" // rounding shift left by the signed per-lane amount in v0-v3 (a negative amount shifts right)
+ "srshl v9.4s, v9.4s, v1.4s\n"
+ "srshl v10.4s, v10.4s, v2.4s\n"
+ "srshl v11.4s, v11.4s, v3.4s\n"
+ "srshl v12.4s, v12.4s, v0.4s\n"
+ "srshl v13.4s, v13.4s, v1.4s\n"
+ "srshl v14.4s, v14.4s, v2.4s\n"
+ "srshl v15.4s, v15.4s, v3.4s\n"
+ "srshl v16.4s, v16.4s, v0.4s\n"
+ "srshl v17.4s, v17.4s, v1.4s\n"
+ "srshl v18.4s, v18.4s, v2.4s\n"
+ "srshl v19.4s, v19.4s, v3.4s\n"
+ "srshl v20.4s, v20.4s, v0.4s\n"
+ "srshl v21.4s, v21.4s, v1.4s\n"
+ "srshl v22.4s, v22.4s, v2.4s\n"
+ "srshl v23.4s, v23.4s, v3.4s\n"
+ "add x24, %x[qp], %[c_offset]\n" // x24 -> qp + c_offset
+ "add x25, %x[qp], %[minval]\n" // x25 -> qp + minval
+ "ld1r { v4.4s }, [x24]\n" // v4 = c_offset in every lane
+ "add x24, %x[qp], %[maxval]\n" // x24 -> qp + maxval
+ "ld1r { v5.4s }, [x25]\n" // v5 = minval in every lane
+ "cmp x14, #0x10\n" // remaining columns vs 16; consumed by "bge 107f" at the end of this phase (no instruction in between writes the flags)
+ "ld1r { v6.4s }, [x24]\n" // v6 = maxval in every lane
+ "add v8.4s, v8.4s, v4.4s\n" // add c_offset
+ "add v9.4s, v9.4s, v4.4s\n"
+ "add v10.4s, v10.4s, v4.4s\n"
+ "add v11.4s, v11.4s, v4.4s\n"
+ "add v12.4s, v12.4s, v4.4s\n"
+ "add v13.4s, v13.4s, v4.4s\n"
+ "add v14.4s, v14.4s, v4.4s\n"
+ "add v15.4s, v15.4s, v4.4s\n"
+ "add v16.4s, v16.4s, v4.4s\n"
+ "add v17.4s, v17.4s, v4.4s\n"
+ "smin v8.4s, v8.4s, v6.4s\n" // clamp from above to maxval
+ "smin v9.4s, v9.4s, v6.4s\n"
+ "smin v10.4s, v10.4s, v6.4s\n"
+ "smin v11.4s, v11.4s, v6.4s\n"
+ "smin v12.4s, v12.4s, v6.4s\n"
+ "smin v13.4s, v13.4s, v6.4s\n"
+ "smin v14.4s, v14.4s, v6.4s\n"
+ "smin v15.4s, v15.4s, v6.4s\n"
+ "smin v16.4s, v16.4s, v6.4s\n"
+ "smin v17.4s, v17.4s, v6.4s\n"
+ "smax v8.4s, v8.4s, v5.4s\n" // clamp from below to minval
+ "smax v9.4s, v9.4s, v5.4s\n"
+ "smax v10.4s, v10.4s, v5.4s\n"
+ "smax v11.4s, v11.4s, v5.4s\n"
+ "smax v12.4s, v12.4s, v5.4s\n"
+ "smax v13.4s, v13.4s, v5.4s\n"
+ "smax v14.4s, v14.4s, v5.4s\n"
+ "smax v15.4s, v15.4s, v5.4s\n"
+ "smax v16.4s, v16.4s, v5.4s\n"
+ "smax v17.4s, v17.4s, v5.4s\n"
+ "add v18.4s, v18.4s, v4.4s\n" // offset + clamp for the remaining vectors is interleaved with the narrowing of the finished ones
+ "add v19.4s, v19.4s, v4.4s\n"
+ "add v20.4s, v20.4s, v4.4s\n"
+ "add v21.4s, v21.4s, v4.4s\n"
+ "add v22.4s, v22.4s, v4.4s\n"
+ "add v23.4s, v23.4s, v4.4s\n"
+ "uzp1 v8.8h, v8.8h, v9.8h\n" // uzp1 on .8h keeps the even halfwords, i.e. the low 16 bits of every 32-bit lane: 32 -> 16 bit narrowing
+ "uzp1 v9.8h, v10.8h, v11.8h\n"
+ "uzp1 v12.8h, v12.8h, v13.8h\n"
+ "uzp1 v13.8h, v14.8h, v15.8h\n"
+ "smin v18.4s, v18.4s, v6.4s\n"
+ "smin v19.4s, v19.4s, v6.4s\n"
+ "smin v20.4s, v20.4s, v6.4s\n"
+ "smin v21.4s, v21.4s, v6.4s\n"
+ "smin v22.4s, v22.4s, v6.4s\n"
+ "smin v23.4s, v23.4s, v6.4s\n"
+ "uzp1 v16.8h, v16.8h, v17.8h\n"
+ "uzp1 v8.16b, v8.16b, v9.16b\n" // uzp1 on .16b keeps the low byte of every halfword: v8 = the 16 output bytes of row 0
+ "uzp1 v12.16b, v12.16b, v13.16b\n" // v12 = row 1
+ "smax v18.4s, v18.4s, v5.4s\n"
+ "smax v19.4s, v19.4s, v5.4s\n"
+ "smax v20.4s, v20.4s, v5.4s\n"
+ "smax v21.4s, v21.4s, v5.4s\n"
+ "smax v22.4s, v22.4s, v5.4s\n"
+ "smax v23.4s, v23.4s, v5.4s\n"
+ "uzp1 v17.8h, v18.8h, v19.8h\n"
+ "uzp1 v20.8h, v20.8h, v21.8h\n"
+ "uzp1 v21.8h, v22.8h, v23.8h\n"
+ "uzp1 v16.16b, v16.16b, v17.16b\n" // v16 = row 2
+ "uzp1 v20.16b, v20.16b, v21.16b\n" // v20 = row 3
+ "bge 107f\n" // x14 >= 16 (from the cmp above): store whole 16-byte vectors
+ "tbz x14, #3, 102f\n" // fewer than 16 columns left: emit 8/4/2/1-byte pieces according to bits 3..0 of x14. Bit 3 clear -> fewer than 8
+ "str d8, [x13], #0x8\n" // bytes 0-7 of rows 0-3 (v8/v12/v16/v20 -> x13/x23/x22/x21), pointers post-incremented
+ "str d12, [x23], #0x8\n"
+ "str d16, [x22], #0x8\n"
+ "str d20, [x21], #0x8\n"
+ "tbz x14, #2, 100f\n"
+ "st1 { v8.s }[2], [x13], #0x4\n" // bytes 8-11
+ "st1 { v12.s }[2], [x23], #0x4\n"
+ "st1 { v16.s }[2], [x22], #0x4\n"
+ "st1 { v20.s }[2], [x21], #0x4\n"
+ "tbz x14, #1, 99f\n"
+ "st1 { v8.h }[6], [x13], #0x2\n" // bytes 12-13
+ "st1 { v12.h }[6], [x23], #0x2\n"
+ "st1 { v16.h }[6], [x22], #0x2\n"
+ "st1 { v20.h }[6], [x21], #0x2\n"
+ "tbz x14, #0, 106f\n"
+ "st1 { v8.b }[14], [x13]\n" // byte 14 (15 columns in total)
+ "st1 { v12.b }[14], [x23]\n"
+ "st1 { v16.b }[14], [x22]\n"
+ "st1 { v20.b }[14], [x21]\n"
+ "b 106f\n"
+ "99:" // Height 4: Partial direct writeback: partial_1_12
+ "tbz x14, #0, 106f\n"
+ "st1 { v8.b }[12], [x13]\n" // byte 12 (13 columns in total)
+ "st1 { v12.b }[12], [x23]\n"
+ "st1 { v16.b }[12], [x22]\n"
+ "st1 { v20.b }[12], [x21]\n"
+ "b 106f\n"
+ "100:" // Height 4: Partial direct writeback: partial_2_8
+ "tbz x14, #1, 101f\n"
+ "st1 { v8.h }[4], [x13], #0x2\n" // bytes 8-9
+ "st1 { v12.h }[4], [x23], #0x2\n"
+ "st1 { v16.h }[4], [x22], #0x2\n"
+ "st1 { v20.h }[4], [x21], #0x2\n"
+ "tbz x14, #0, 106f\n"
+ "st1 { v8.b }[10], [x13]\n" // byte 10
+ "st1 { v12.b }[10], [x23]\n"
+ "st1 { v16.b }[10], [x22]\n"
+ "st1 { v20.b }[10], [x21]\n"
+ "b 106f\n"
+ "101:" // Height 4: Partial direct writeback: partial_1_8
+ "tbz x14, #0, 106f\n"
+ "st1 { v8.b }[8], [x13]\n" // byte 8
+ "st1 { v12.b }[8], [x23]\n"
+ "st1 { v16.b }[8], [x22]\n"
+ "st1 { v20.b }[8], [x21]\n"
+ "b 106f\n"
+ "102:" // Height 4: Partial direct writeback: partial_4_0
+ "tbz x14, #2, 104f\n"
+ "str s8, [x13], #0x4\n" // bytes 0-3
+ "str s12, [x23], #0x4\n"
+ "str s16, [x22], #0x4\n"
+ "str s20, [x21], #0x4\n"
+ "tbz x14, #1, 103f\n"
+ "st1 { v8.h }[2], [x13], #0x2\n" // bytes 4-5
+ "st1 { v12.h }[2], [x23], #0x2\n"
+ "st1 { v16.h }[2], [x22], #0x2\n"
+ "st1 { v20.h }[2], [x21], #0x2\n"
+ "tbz x14, #0, 106f\n"
+ "st1 { v8.b }[6], [x13]\n" // byte 6
+ "st1 { v12.b }[6], [x23]\n"
+ "st1 { v16.b }[6], [x22]\n"
+ "st1 { v20.b }[6], [x21]\n"
+ "b 106f\n"
+ "103:" // Height 4: Partial direct writeback: partial_1_4
+ "tbz x14, #0, 106f\n"
+ "st1 { v8.b }[4], [x13]\n" // byte 4
+ "st1 { v12.b }[4], [x23]\n"
+ "st1 { v16.b }[4], [x22]\n"
+ "st1 { v20.b }[4], [x21]\n"
+ "b 106f\n"
+ "104:" // Height 4: Partial direct writeback: partial_2_0
+ "tbz x14, #1, 105f\n"
+ "str h8, [x13], #0x2\n" // bytes 0-1
+ "str h12, [x23], #0x2\n"
+ "str h16, [x22], #0x2\n"
+ "str h20, [x21], #0x2\n"
+ "tbz x14, #0, 106f\n"
+ "st1 { v8.b }[2], [x13]\n" // byte 2
+ "st1 { v12.b }[2], [x23]\n"
+ "st1 { v16.b }[2], [x22]\n"
+ "st1 { v20.b }[2], [x21]\n"
+ "b 106f\n"
+ "105:" // Height 4: Partial direct writeback: partial_1_0
+ "str b8, [x13, #0x0]\n" // bits 3..1 are clear here and bit 0 is not tested. NOTE(review): relies on x14 being non-zero at this point - confirm against the column-loop entry
+ "str b12, [x23, #0x0]\n"
+ "str b16, [x22, #0x0]\n"
+ "str b20, [x21, #0x0]\n"
+ "106:" // Height 4: Partial direct writeback: Done
+ "b 108f\n"
+ "107:" // Height 4: Full writeback
+ "str q8, [x13, #0x0]\n" // all 16 bytes of each row
+ "add x13, x13, #0x10\n" // only x13 is advanced: x23/x22/x21 are re-derived from x13 after the next column block's multiply
+ "str q12, [x23, #0x0]\n"
+ "str q16, [x22, #0x0]\n"
+ "str q20, [x21, #0x0]\n"
+ "108:" // Height 4: Writeback done
+ "subs x14, x14, #0x10\n" // 16 columns accounted for (goes to zero or negative after a partial block)
+ "bgt 83b\n" // columns remain: next column block (label 83 lies above this excerpt)
+ "b 164f\n" // Height 4 finished: leave via label 164 (beyond this excerpt)
+ "109:" // Height 5
+ "ldr x17, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" // x17 = multiplier_ptr from the args block
+ "mov x15, %x[col_bias]\n" // x15 = col_bias
+ "ldr x16, [%x[args_ptr], %[offsetof_shift_ptr]]\n" // x16 = shift_ptr from the args block
+ "mov x13, %x[output_ptr]\n" // x13 = output row 0
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n" // x14 = N, counted down 16 columns at a time
+ "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n" // x12 = B_ptr, read sequentially by the multiply loops
+ "110:" // Height 5: Column loop
+ "movi v8.4s, #0x0\n" // clear the 20 accumulators: rows 0..4 use v8-11, v12-15, v16-19, v20-23, v24-27 (4 x s32 lanes each = 16 columns per row)
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "111:" // Height 5: setup done
+ "mov x11, #0x0\n" // x11 = string index
+ "112:" // Height 5: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" // x20 = string_lengths array
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" // x19 = input_offset (used differently by the two paths below)
+ "ldr w10, [x20, x11, LSL #0x2]\n" // w10 = string_lengths[x11] (32-bit entries); counted down in bytes by the multiply loops
+ "tbz %x[flags], #3, 113f\n" // flags bit 3 clear: input_ptr is the data itself (direct input)
+ "ldr x20, [%x[input_ptr], x11, LSL #0x3]\n" // indirect input: x20 = input_ptr[x11], a table of row pointers
+ "add x20, x20, x19, LSL #3\n" // skip input_offset 8-byte entries of that table
+ "ldr x9, [x20, #0x0]\n" // x9/x26/x24/x23/x22 = pointers to rows 0..4
+ "ldr x26, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "cbnz x11, 114f\n" // the initial column offset is applied to the first string only
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x9, x9, x19\n"
+ "add x26, x26, x19\n"
+ "add x24, x24, x19\n"
+ "add x23, x23, x19\n"
+ "add x22, x22, x19\n"
+ "b 114f\n"
+ "113:" // Height 5: setup direct input
+ "mov x9, %x[input_ptr]\n" // row 0 starts at input_ptr
+ "add x26, x9, x19\n" // each further row lies input_offset bytes after the previous one
+ "add x24, x26, x19\n"
+ "add x23, x24, x19\n"
+ "add x22, x23, x19\n"
+ "114:" // Height 5: input setup done
+ "cmp x10, #0x10\n" // fewer than 16 bytes in this string: skip the 16-deep code entirely
+ "blt 117f\n"
+ "ldr q0, [x9, #0x0]\n" // preload 16 input bytes of each of the five rows into v0-v4
+ "ldr q1, [x26, #0x0]\n"
+ "cmp x10, #0x20\n" // flags for "blt 116f" below (the loads in between do not write flags)
+ "ldr q2, [x24, #0x0]\n"
+ "ldr q3, [x23, #0x0]\n"
+ "ldr q4, [x22, #0x0]\n"
+ "ldr q6, [x12, #0x0]\n" // first 16-byte B vector
+ "blt 116f\n" // fewer than 32 bytes: exactly one 16-deep step, done at label 116
+ "115:" // Height 5: Multiply loop: Main loop head
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" // each step: 16 B vectors (4 column groups x 4 byte-quads of K) times byte-quad 0..3 of every row
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
+ "ldr d7, [x12, #0x10]\n" // B vectors are fetched in two 64-bit halves: low half with a d-register load ...
+ "ldr x19, [x12, #0x18]\n" // ... high half into a general register ...
+ "add x9, x9, #0x10\n" // row pointers step past the 16 bytes already held in v0-v4
+ "ldr d6, [x12, #0x20]\n"
+ "add x26, x26, #0x10\n"
+ "ldr x28, [x12, #0x28]\n"
+ "add x24, x24, #0x10\n"
+ "mov v7.d[1], x19\n" // ... and merged here. NOTE(review): presumably the A55-oriented load scheduling the commit title refers to - confirm
+ "prfm pldl1keep, [x9, #0x80]\n" // load-prefetch hint 0x80 bytes ahead in row 0
+ "ldr x19, [x12, #0x38]\n"
+ "add x23, x23, #0x10\n"
+ "mov v6.d[1], x28\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
+ "ldr d7, [x12, #0x30]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x22, x22, #0x10\n"
+ "ldr d6, [x12, #0x40]\n"
+ "sub x10, x10, #0x10\n" // 16 bytes of this string consumed
+ "mov v7.d[1], x19\n"
+ "ldr x28, [x12, #0x48]\n"
+ "ldr x19, [x12, #0x58]\n"
+ "cmp x10, #0x20\n" // loop condition for "bge 115b" at the bottom; nothing between here and there writes the flags
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ "mov v6.d[1], x28\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n"
+ "ldr d7, [x12, #0x50]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "ldr d6, [x12, #0x60]\n"
+ "mov v7.d[1], x19\n"
+ "ldr x28, [x12, #0x68]\n"
+ "ldr x19, [x12, #0x78]\n"
+ ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
+ "mov v6.d[1], x28\n"
+ ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n"
+ ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n"
+ "ldr d7, [x12, #0x70]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "ldr d6, [x12, #0x80]\n"
+ "mov v7.d[1], x19\n"
+ "ldr x28, [x12, #0x88]\n"
+ "ldr x19, [x12, #0x98]\n"
+ ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
+ "mov v6.d[1], x28\n"
+ ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n"
+ ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n"
+ "ldr d7, [x12, #0x90]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "ldr d6, [x12, #0xa0]\n"
+ "mov v7.d[1], x19\n"
+ "ldr x28, [x12, #0xa8]\n"
+ "ldr x19, [x12, #0xb8]\n"
+ ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
+ "mov v6.d[1], x28\n"
+ ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n"
+ ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n"
+ "ldr d7, [x12, #0xb0]\n"
+ "ldr d6, [x12, #0xc0]\n"
+ "ldr x28, [x12, #0xc8]\n"
+ "mov v7.d[1], x19\n"
+ "ldr x19, [x12, #0xd8]\n"
+ "ldr x27, [x9, #0x8]\n" // x9 was already advanced, so this is the high half of row 0's next 16 bytes; x25/x21/x19 stage the other rows below
+ "mov v6.d[1], x28\n"
+ ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n"
+ ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n"
+ "ldr d7, [x12, #0xd0]\n"
+ "ldr d6, [x12, #0xe0]\n"
+ "ldr x28, [x12, #0xe8]\n"
+ "mov v7.d[1], x19\n"
+ "ldr x19, [x12, #0xf8]\n"
+ "ldr x25, [x26, #0x8]\n"
+ "mov v6.d[1], x28\n"
+ ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n"
+ ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n"
+ "ldr d7, [x12, #0xf0]\n"
+ "ldr x21, [x24, #0x8]\n"
+ "add x12, x12, #0x100\n" // 16 B vectors (0x100 bytes) consumed per step
+ "ldr d6, [x12, #0x0]\n" // begin fetching the first B vector of the next step
+ "mov v7.d[1], x19\n"
+ "ldr x28, [x12, #0x8]\n"
+ "ldr x19, [x23, #0x8]\n" // x19 is free again once v7 has been completed above
+ ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
+ "mov v6.d[1], x28\n"
+ ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n"
+ "ldr d0, [x9, #0x0]\n" // every use of v0-v4 for this step is done: refill them with the next 16 bytes per row (low half by load, high half from the staged registers)
+ "ldr d1, [x26, #0x0]\n"
+ "ldr d2, [x24, #0x0]\n"
+ "mov v0.d[1], x27\n"
+ "ldr d3, [x23, #0x0]\n"
+ "mov v1.d[1], x25\n"
+ "ldr d4, [x22, #0x0]\n"
+ "mov v2.d[1], x21\n"
+ "ldr x21, [x22, #0x8]\n" // x21 reused for row 4's high half after row 2's has been inserted
+ "mov v3.d[1], x19\n"
+ "mov v4.d[1], x21\n"
+ "bge 115b\n" // at least 32 bytes still pending (including the 16 just preloaded): run another full step
+ "116:" // Height 5: Multiply loop: Single iteration only
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" // same 16-deep step as the main loop, on the already preloaded v0-v4 / v6, without preloading a following step
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
+ "ldr q7, [x12, #0x10]\n" // plain 128-bit loads of B here, unlike the split 64-bit loads in the main loop
+ "ldr q6, [x12, #0x20]\n"
+ "sub x10, x10, #0x10\n" // 16 bytes of this string consumed; what remains is handled from label 117
+ "add x9, x9, #0x10\n" // row pointers step past the 16 bytes held in v0-v4
+ "add x26, x26, #0x10\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
+ "prfm pldl1keep, [x9, #0x80]\n" // load-prefetch hints 0x80 bytes ahead in each row
+ "add x24, x24, #0x10\n"
+ "ldr q7, [x12, #0x30]\n"
+ "add x23, x23, #0x10\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x22, x22, #0x10\n"
+ "ldr q6, [x12, #0x40]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n"
+ "ldr q7, [x12, #0x50]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "ldr q6, [x12, #0x60]\n"
+ ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n"
+ ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n"
+ "ldr q7, [x12, #0x70]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "ldr q6, [x12, #0x80]\n"
+ ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n"
+ ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n"
+ "ldr q7, [x12, #0x90]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "ldr q6, [x12, #0xa0]\n"
+ ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n"
+ ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n"
+ "ldr q7, [x12, #0xb0]\n"
+ "ldr q6, [x12, #0xc0]\n"
+ ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n"
+ ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n"
+ "ldr q7, [x12, #0xd0]\n"
+ "ldr q6, [x12, #0xe0]\n"
+ ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n"
+ ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n"
+ "ldr q7, [x12, #0xf0]\n"
+ "add x12, x12, #0x100\n" // 16 B vectors (0x100 bytes) consumed
+ ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n"
+ "117:" // Height 5: Multiply loop: Main loop skip
+ "cbz x10, 122f\n" // nothing left in this string
+ "cmp x10, #0x4\n"
+ "blt 119f\n" // 1-3 bytes left: go straight to the ragged read
+ "118:" // Height 5: Multiply loop: Odd block loop
+ "ldr s0, [x9], #0x4\n" // 4 input bytes per row into byte-quad 0 of v0-v4, pointers post-incremented
+ "sub x10, x10, #0x4\n"
+ "ldr s1, [x26], #0x4\n"
+ "cmp x10, #0x4\n" // flags for "bge 118b" below (nothing in between writes flags)
+ "ldr s2, [x24], #0x4\n"
+ "ldr s3, [x23], #0x4\n"
+ "ldr s4, [x22], #0x4\n"
+ "ldr q6, [x12, #0x0]\n" // a 4-deep step uses four B vectors (0x40 bytes), one per column group
+ "ldr q7, [x12, #0x10]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
+ "ldr q6, [x12, #0x20]\n"
+ "ldr q7, [x12, #0x30]\n"
+ "add x12, x12, #0x40\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
+ "bge 118b\n" // at least 4 bytes still left: another 4-deep step
+ "cbz x10, 122f\n"
+ "119:" // Height 5: Multiply loop: Skip odd blocks
+ "tbz x10, #1, 120f\n" // x10 is 1..3 here; bit 1 clear means exactly one byte
+ "ldr h0, [x9], #0x2\n" // two bytes per row; a scalar load into a SIMD register zeroes the rest of the register
+ "ldr h1, [x26], #0x2\n"
+ "ldr h2, [x24], #0x2\n"
+ "ldr h3, [x23], #0x2\n"
+ "ldr h4, [x22], #0x2\n"
+ "tbz x10, #0, 121f\n" // exactly two bytes: done reading
+ "ld1 { v0.b }[2], [x9]\n" // third byte inserted into byte lane 2
+ "ld1 { v1.b }[2], [x26]\n"
+ "ld1 { v2.b }[2], [x24]\n"
+ "ld1 { v3.b }[2], [x23]\n"
+ "ld1 { v4.b }[2], [x22]\n"
+ "b 121f\n"
+ "120:" // Height 5: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x9, #0x0]\n" // single byte per row, remaining bytes of the register zeroed by the load
+ "ldr b1, [x26, #0x0]\n"
+ "ldr b2, [x24, #0x0]\n"
+ "ldr b3, [x23, #0x0]\n"
+ "ldr b4, [x22, #0x0]\n"
+ "121:" // Height 5: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x12, #0x0]\n" // the zero-padded 1-3 bytes go through one more ordinary 4-deep step
+ "ldr q7, [x12, #0x10]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
+ "ldr q6, [x12, #0x20]\n"
+ "ldr q7, [x12, #0x30]\n"
+ "add x12, x12, #0x40\n" // B still advances by a full 0x40 bytes for the padded step
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
+ "122:" // Height 5: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" // w19 = num_strings field of the args block
+ "add x11, x11, #0x1\n" // next string index
+ "cmp x11, x19\n"
+ "bne 112b\n" // strings remain: back to the string loop, accumulators are kept
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" // x19 = output_offset, used below as the byte step between output rows
+ "prfm pstl1keep, [x13, #0x0]\n" // store-prefetch hint for output row 0
+ "ldr q0, [x15, #0x0]\n" // v0/v1 = 32-bit col_bias values for columns 0-3 / 4-7
+ "ldr q1, [x15, #0x10]\n"
+ "add x23, x13, x19\n" // x23 = output row 1 (x23/x22 held input row pointers during the multiply and are reused now)
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "add x22, x23, x19\n" // x22 = output row 2
+ "add v8.4s, v8.4s, v0.4s\n" // the same bias vector is added to every row of a column group
+ "add v12.4s, v12.4s, v0.4s\n"
+ "add v9.4s, v9.4s, v1.4s\n"
+ "add v13.4s, v13.4s, v1.4s\n"
+ "add v16.4s, v16.4s, v0.4s\n"
+ "add v17.4s, v17.4s, v1.4s\n"
+ "add v20.4s, v20.4s, v0.4s\n"
+ "add v21.4s, v21.4s, v1.4s\n"
+ "add v24.4s, v24.4s, v0.4s\n"
+ "add v25.4s, v25.4s, v1.4s\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "add x21, x22, x19\n" // x21 = output row 3
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "add x20, x21, x19\n" // x20 = output row 4
+ "prfm pstl1keep, [x20, #0x0]\n"
+ "ldr q2, [x15, #0x20]\n" // v2/v3 = bias for columns 8-11 / 12-15
+ "ldr q3, [x15, #0x30]\n"
+ "add x15, x15, #0x40\n" // step the bias pointer past these 16 columns
+ "add v10.4s, v10.4s, v2.4s\n"
+ "add v14.4s, v14.4s, v2.4s\n"
+ "add v11.4s, v11.4s, v3.4s\n"
+ "add v15.4s, v15.4s, v3.4s\n"
+ "add v18.4s, v18.4s, v2.4s\n"
+ "add v19.4s, v19.4s, v3.4s\n"
+ "add v22.4s, v22.4s, v2.4s\n"
+ "add v23.4s, v23.4s, v3.4s\n"
+ "add v26.4s, v26.4s, v2.4s\n"
+ "add v27.4s, v27.4s, v3.4s\n"
+ "tbz %x[flags], #4, 123f\n" // flags bit 4 clear: take the per-layer parameters at label 123 instead
+ "ldr q0, [x16, #0x0]\n" // v0-v3 <- 16 per-column shifts from the shift_ptr stream (x16)
+ "ldr q4, [x17, #0x0]\n" // v4-v7 <- 16 per-column multipliers from the multiplier_ptr stream (x17)
+ "ldr q1, [x16, #0x10]\n"
+ "ldr q5, [x17, #0x10]\n"
+ "ldr q2, [x16, #0x20]\n"
+ "ldr q6, [x17, #0x20]\n"
+ "ldr q3, [x16, #0x30]\n"
+ "add x16, x16, #0x40\n" // both parameter pointers advance by 16 columns
+ "ldr q7, [x17, #0x30]\n"
+ "add x17, x17, #0x40\n"
+ "b 124f\n"
+ "123:" // Height 5: per layer parameters
+ "add x25, %x[qp], %[per_layer_right_shift]\n" // x25 -> qp + per_layer_right_shift
+ "add x24, %x[qp], %[per_layer_mul]\n" // x24 -> qp + per_layer_mul
+ "ld1r { v0.4s }, [x25]\n" // broadcast the single shift value to all four lanes
+ "ld1r { v4.4s }, [x24]\n" // broadcast the single multiplier to all four lanes
+ "mov v1.16b, v0.16b\n" // replicate into v1-v3 / v5-v7 so the code after label 124 is identical for both paths
+ "mov v2.16b, v0.16b\n"
+ "mov v5.16b, v4.16b\n"
+ "mov v6.16b, v4.16b\n"
+ "mov v3.16b, v0.16b\n"
+ "mov v7.16b, v4.16b\n"
+ "124:" // Height 5: parameters loaded
+ "sqrdmulh v8.4s, v8.4s, v4.4s\n" // saturating rounding doubling multiply, high half: scale all 20 accumulator vectors (5 rows x 4 column groups) by v4-v7
+ "sqrdmulh v9.4s, v9.4s, v5.4s\n"
+ "sqrdmulh v10.4s, v10.4s, v6.4s\n"
+ "sqrdmulh v11.4s, v11.4s, v7.4s\n"
+ "sqrdmulh v12.4s, v12.4s, v4.4s\n"
+ "sqrdmulh v13.4s, v13.4s, v5.4s\n"
+ "sqrdmulh v14.4s, v14.4s, v6.4s\n"
+ "sqrdmulh v15.4s, v15.4s, v7.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v4.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v5.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v6.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v7.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v4.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v5.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v6.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v7.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v4.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v5.4s\n"
+ "sqrdmulh v26.4s, v26.4s, v6.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v7.4s\n"
+ "tbz %x[flags], #5, 125f\n" // flags bit 5 clear: skip the correction below; v4-v7 are free to reuse as temporaries from here on
+ "and v4.16b, v8.16b, v0.16b\n" // value AND shift: the sign bit survives only where both are negative
+ "and v5.16b, v9.16b, v1.16b\n"
+ "and v6.16b, v10.16b, v2.16b\n"
+ "and v7.16b, v11.16b, v3.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n" // arithmetic >> 31 turns that sign bit into -1 (else 0)
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v8.4s, v8.4s, v4.4s\n" // saturating add of the 0/-1 term. NOTE(review): presumably compensates srshl's rounding of negative values - confirm
+ "and v4.16b, v12.16b, v0.16b\n" // same three steps for row 1 ...
+ "sqadd v9.4s, v9.4s, v5.4s\n"
+ "sqadd v10.4s, v10.4s, v6.4s\n"
+ "sqadd v11.4s, v11.4s, v7.4s\n"
+ "and v5.16b, v13.16b, v1.16b\n"
+ "and v6.16b, v14.16b, v2.16b\n"
+ "and v7.16b, v15.16b, v3.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v12.4s, v12.4s, v4.4s\n"
+ "and v4.16b, v16.16b, v0.16b\n" // ... row 2 ...
+ "sqadd v13.4s, v13.4s, v5.4s\n"
+ "and v5.16b, v17.16b, v1.16b\n"
+ "sqadd v14.4s, v14.4s, v6.4s\n"
+ "sqadd v15.4s, v15.4s, v7.4s\n"
+ "and v6.16b, v18.16b, v2.16b\n"
+ "and v7.16b, v19.16b, v3.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v4.4s\n"
+ "and v4.16b, v20.16b, v0.16b\n" // ... row 3 ...
+ "sqadd v17.4s, v17.4s, v5.4s\n"
+ "and v5.16b, v21.16b, v1.16b\n"
+ "sqadd v18.4s, v18.4s, v6.4s\n"
+ "sqadd v19.4s, v19.4s, v7.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v6.16b, v22.16b, v2.16b\n"
+ "and v7.16b, v23.16b, v3.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v4.4s\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "and v4.16b, v24.16b, v0.16b\n" // ... and row 4
+ "sqadd v21.4s, v21.4s, v5.4s\n"
+ "and v5.16b, v25.16b, v1.16b\n"
+ "sqadd v22.4s, v22.4s, v6.4s\n"
+ "and v6.16b, v26.16b, v2.16b\n"
+ "sqadd v23.4s, v23.4s, v7.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "and v7.16b, v27.16b, v3.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v24.4s, v24.4s, v4.4s\n"
+ "sqadd v25.4s, v25.4s, v5.4s\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v6.4s\n"
+ "sqadd v27.4s, v27.4s, v7.4s\n"
+ "125:" // Height 5: no shift correction
+ "srshl v8.4s, v8.4s, v0.4s\n" // rounding shift left by the signed per-lane amount in v0-v3 (a negative amount shifts right)
+ "srshl v9.4s, v9.4s, v1.4s\n"
+ "srshl v10.4s, v10.4s, v2.4s\n"
+ "srshl v11.4s, v11.4s, v3.4s\n"
+ "srshl v12.4s, v12.4s, v0.4s\n"
+ "srshl v13.4s, v13.4s, v1.4s\n"
+ "srshl v14.4s, v14.4s, v2.4s\n"
+ "srshl v15.4s, v15.4s, v3.4s\n"
+ "srshl v16.4s, v16.4s, v0.4s\n"
+ "srshl v17.4s, v17.4s, v1.4s\n"
+ "srshl v18.4s, v18.4s, v2.4s\n"
+ "srshl v19.4s, v19.4s, v3.4s\n"
+ "srshl v20.4s, v20.4s, v0.4s\n"
+ "srshl v21.4s, v21.4s, v1.4s\n"
+ "srshl v22.4s, v22.4s, v2.4s\n"
+ "srshl v23.4s, v23.4s, v3.4s\n"
+ "srshl v24.4s, v24.4s, v0.4s\n"
+ "srshl v25.4s, v25.4s, v1.4s\n"
+ "srshl v26.4s, v26.4s, v2.4s\n"
+ "srshl v27.4s, v27.4s, v3.4s\n"
+ "add x24, %x[qp], %[c_offset]\n" // x24 -> qp + c_offset
+ "add x25, %x[qp], %[minval]\n" // x25 -> qp + minval
+ "ld1r { v4.4s }, [x24]\n" // v4 = c_offset in every lane
+ "add x24, %x[qp], %[maxval]\n" // x24 -> qp + maxval
+ "ld1r { v5.4s }, [x25]\n" // v5 = minval in every lane
+ "cmp x14, #0x10\n" // remaining columns vs 16; consumed by "bge 134f" at the end of this phase (no instruction in between writes the flags)
+ "ld1r { v6.4s }, [x24]\n" // v6 = maxval in every lane
+ "add v8.4s, v8.4s, v4.4s\n" // add c_offset
+ "add v9.4s, v9.4s, v4.4s\n"
+ "add v10.4s, v10.4s, v4.4s\n"
+ "add v11.4s, v11.4s, v4.4s\n"
+ "add v12.4s, v12.4s, v4.4s\n"
+ "add v13.4s, v13.4s, v4.4s\n"
+ "add v14.4s, v14.4s, v4.4s\n"
+ "add v15.4s, v15.4s, v4.4s\n"
+ "add v16.4s, v16.4s, v4.4s\n"
+ "add v17.4s, v17.4s, v4.4s\n"
+ "smin v8.4s, v8.4s, v6.4s\n" // clamp from above to maxval
+ "smin v9.4s, v9.4s, v6.4s\n"
+ "smin v10.4s, v10.4s, v6.4s\n"
+ "smin v11.4s, v11.4s, v6.4s\n"
+ "smin v12.4s, v12.4s, v6.4s\n"
+ "smin v13.4s, v13.4s, v6.4s\n"
+ "smin v14.4s, v14.4s, v6.4s\n"
+ "smin v15.4s, v15.4s, v6.4s\n"
+ "smin v16.4s, v16.4s, v6.4s\n"
+ "smin v17.4s, v17.4s, v6.4s\n"
+ "smax v8.4s, v8.4s, v5.4s\n" // clamp from below to minval
+ "smax v9.4s, v9.4s, v5.4s\n"
+ "smax v10.4s, v10.4s, v5.4s\n"
+ "smax v11.4s, v11.4s, v5.4s\n"
+ "smax v12.4s, v12.4s, v5.4s\n"
+ "smax v13.4s, v13.4s, v5.4s\n"
+ "smax v14.4s, v14.4s, v5.4s\n"
+ "smax v15.4s, v15.4s, v5.4s\n"
+ "smax v16.4s, v16.4s, v5.4s\n"
+ "smax v17.4s, v17.4s, v5.4s\n"
+ "add v18.4s, v18.4s, v4.4s\n" // same offset + clamp for the remaining ten vectors
+ "add v19.4s, v19.4s, v4.4s\n"
+ "add v20.4s, v20.4s, v4.4s\n"
+ "add v21.4s, v21.4s, v4.4s\n"
+ "add v22.4s, v22.4s, v4.4s\n"
+ "add v23.4s, v23.4s, v4.4s\n"
+ "add v24.4s, v24.4s, v4.4s\n"
+ "add v25.4s, v25.4s, v4.4s\n"
+ "add v26.4s, v26.4s, v4.4s\n"
+ "add v27.4s, v27.4s, v4.4s\n"
+ "smin v18.4s, v18.4s, v6.4s\n"
+ "smin v19.4s, v19.4s, v6.4s\n"
+ "smin v20.4s, v20.4s, v6.4s\n"
+ "smin v21.4s, v21.4s, v6.4s\n"
+ "smin v22.4s, v22.4s, v6.4s\n"
+ "smin v23.4s, v23.4s, v6.4s\n"
+ "smin v24.4s, v24.4s, v6.4s\n"
+ "smin v25.4s, v25.4s, v6.4s\n"
+ "smin v26.4s, v26.4s, v6.4s\n"
+ "smin v27.4s, v27.4s, v6.4s\n"
+ "smax v18.4s, v18.4s, v5.4s\n"
+ "smax v19.4s, v19.4s, v5.4s\n"
+ "smax v20.4s, v20.4s, v5.4s\n"
+ "smax v21.4s, v21.4s, v5.4s\n"
+ "smax v22.4s, v22.4s, v5.4s\n"
+ "smax v23.4s, v23.4s, v5.4s\n"
+ "smax v24.4s, v24.4s, v5.4s\n"
+ "smax v25.4s, v25.4s, v5.4s\n"
+ "smax v26.4s, v26.4s, v5.4s\n"
+ "smax v27.4s, v27.4s, v5.4s\n"
+ "uzp1 v8.8h, v8.8h, v9.8h\n" // uzp1 on .8h keeps the even halfwords, i.e. the low 16 bits of every 32-bit lane: 32 -> 16 bit narrowing
+ "uzp1 v9.8h, v10.8h, v11.8h\n"
+ "uzp1 v12.8h, v12.8h, v13.8h\n"
+ "uzp1 v13.8h, v14.8h, v15.8h\n"
+ "uzp1 v16.8h, v16.8h, v17.8h\n"
+ "uzp1 v17.8h, v18.8h, v19.8h\n"
+ "uzp1 v20.8h, v20.8h, v21.8h\n"
+ "uzp1 v21.8h, v22.8h, v23.8h\n"
+ "uzp1 v24.8h, v24.8h, v25.8h\n"
+ "uzp1 v25.8h, v26.8h, v27.8h\n"
+ "uzp1 v8.16b, v8.16b, v9.16b\n" // uzp1 on .16b keeps the low byte of every halfword: v8/v12/v16/v20/v24 = the 16 output bytes of rows 0..4
+ "uzp1 v12.16b, v12.16b, v13.16b\n"
+ "uzp1 v16.16b, v16.16b, v17.16b\n"
+ "uzp1 v20.16b, v20.16b, v21.16b\n"
+ "uzp1 v24.16b, v24.16b, v25.16b\n"
+ "bge 134f\n"
+ "tbz x14, #3, 129f\n"
+ "str d8, [x13], #0x8\n"
+ "str d12, [x23], #0x8\n"
+ "str d16, [x22], #0x8\n"
+ "str d20, [x21], #0x8\n"
+ "str d24, [x20], #0x8\n"
+ "tbz x14, #2, 127f\n"
+ "st1 { v8.s }[2], [x13], #0x4\n"
+ "st1 { v12.s }[2], [x23], #0x4\n"
+ "st1 { v16.s }[2], [x22], #0x4\n"
+ "st1 { v20.s }[2], [x21], #0x4\n"
+ "st1 { v24.s }[2], [x20], #0x4\n"
+ "tbz x14, #1, 126f\n"
+ "st1 { v8.h }[6], [x13], #0x2\n"
+ "st1 { v12.h }[6], [x23], #0x2\n"
+ "st1 { v16.h }[6], [x22], #0x2\n"
+ "st1 { v20.h }[6], [x21], #0x2\n"
+ "st1 { v24.h }[6], [x20], #0x2\n"
+ "tbz x14, #0, 133f\n"
+ "st1 { v8.b }[14], [x13]\n"
+ "st1 { v12.b }[14], [x23]\n"
+ "st1 { v16.b }[14], [x22]\n"
+ "st1 { v20.b }[14], [x21]\n"
+ "st1 { v24.b }[14], [x20]\n"
+ "b 133f\n"
+ "126:" // Height 5: Partial direct writeback: partial_1_12
+ "tbz x14, #0, 133f\n"
+ "st1 { v8.b }[12], [x13]\n"
+ "st1 { v12.b }[12], [x23]\n"
+ "st1 { v16.b }[12], [x22]\n"
+ "st1 { v20.b }[12], [x21]\n"
+ "st1 { v24.b }[12], [x20]\n"
+ "b 133f\n"
+ "127:" // Height 5: Partial direct writeback: partial_2_8
+ "tbz x14, #1, 128f\n"
+ "st1 { v8.h }[4], [x13], #0x2\n"
+ "st1 { v12.h }[4], [x23], #0x2\n"
+ "st1 { v16.h }[4], [x22], #0x2\n"
+ "st1 { v20.h }[4], [x21], #0x2\n"
+ "st1 { v24.h }[4], [x20], #0x2\n"
+ "tbz x14, #0, 133f\n"
+ "st1 { v8.b }[10], [x13]\n"
+ "st1 { v12.b }[10], [x23]\n"
+ "st1 { v16.b }[10], [x22]\n"
+ "st1 { v20.b }[10], [x21]\n"
+ "st1 { v24.b }[10], [x20]\n"
+ "b 133f\n"
+ "128:" // Height 5: Partial direct writeback: partial_1_8
+ "tbz x14, #0, 133f\n"
+ "st1 { v8.b }[8], [x13]\n"
+ "st1 { v12.b }[8], [x23]\n"
+ "st1 { v16.b }[8], [x22]\n"
+ "st1 { v20.b }[8], [x21]\n"
+ "st1 { v24.b }[8], [x20]\n"
+ "b 133f\n"
+ "129:" // Height 5: Partial direct writeback: partial_4_0
+ "tbz x14, #2, 131f\n"
+ "str s8, [x13], #0x4\n"
+ "str s12, [x23], #0x4\n"
+ "str s16, [x22], #0x4\n"
+ "str s20, [x21], #0x4\n"
+ "str s24, [x20], #0x4\n"
+ "tbz x14, #1, 130f\n"
+ "st1 { v8.h }[2], [x13], #0x2\n"
+ "st1 { v12.h }[2], [x23], #0x2\n"
+ "st1 { v16.h }[2], [x22], #0x2\n"
+ "st1 { v20.h }[2], [x21], #0x2\n"
+ "st1 { v24.h }[2], [x20], #0x2\n"
+ "tbz x14, #0, 133f\n"
+ "st1 { v8.b }[6], [x13]\n"
+ "st1 { v12.b }[6], [x23]\n"
+ "st1 { v16.b }[6], [x22]\n"
+ "st1 { v20.b }[6], [x21]\n"
+ "st1 { v24.b }[6], [x20]\n"
+ "b 133f\n"
+ "130:" // Height 5: Partial direct writeback: partial_1_4
+ "tbz x14, #0, 133f\n"
+ "st1 { v8.b }[4], [x13]\n"
+ "st1 { v12.b }[4], [x23]\n"
+ "st1 { v16.b }[4], [x22]\n"
+ "st1 { v20.b }[4], [x21]\n"
+ "st1 { v24.b }[4], [x20]\n"
+ "b 133f\n"
+ "131:" // Height 5: Partial direct writeback: partial_2_0
+ "tbz x14, #1, 132f\n"
+ "str h8, [x13], #0x2\n"
+ "str h12, [x23], #0x2\n"
+ "str h16, [x22], #0x2\n"
+ "str h20, [x21], #0x2\n"
+ "str h24, [x20], #0x2\n"
+ "tbz x14, #0, 133f\n"
+ "st1 { v8.b }[2], [x13]\n"
+ "st1 { v12.b }[2], [x23]\n"
+ "st1 { v16.b }[2], [x22]\n"
+ "st1 { v20.b }[2], [x21]\n"
+ "st1 { v24.b }[2], [x20]\n"
+ "b 133f\n"
+ "132:" // Height 5: Partial direct writeback: partial_1_0
+ "str b8, [x13, #0x0]\n"
+ "str b12, [x23, #0x0]\n"
+ "str b16, [x22, #0x0]\n"
+ "str b20, [x21, #0x0]\n"
+ "str b24, [x20, #0x0]\n"
+ "133:" // Height 5: Partial direct writeback: Done
+ "b 135f\n"
+ "134:" // Height 5: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "add x13, x13, #0x10\n"
+ "str q12, [x23, #0x0]\n"
+ "str q16, [x22, #0x0]\n"
+ "str q20, [x21, #0x0]\n"
+ "str q24, [x20, #0x0]\n"
+ "135:" // Height 5: Writeback done
+ "subs x14, x14, #0x10\n"
+ "bgt 110b\n"
+ "b 164f\n"
+ "136:" // Height 6
+ "ldr x17, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
+ "mov x15, %x[col_bias]\n"
+ "ldr x16, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x20, #0x6\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "madd %x[output_ptr], x19, x20, %x[output_ptr]\n"
+ "137:" // Height 6: Column loop
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+ "138:" // Height 6: setup done
+ "mov x11, #0x0\n"
+ "139:" // Height 6: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w10, [x20, x11, LSL #0x2]\n"
+ "tbz %x[flags], #3, 140f\n"
+ "ldr x20, [%x[input_ptr], x11, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x9, [x20, #0x0]\n"
+ "ldr x26, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x20, [x20, #0x28]\n"
+ "cbnz x11, 141f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x9, x9, x19\n"
+ "add x26, x26, x19\n"
+ "add x24, x24, x19\n"
+ "add x23, x23, x19\n"
+ "add x22, x22, x19\n"
+ "add x20, x20, x19\n"
+ "b 141f\n"
+ "140:" // Height 6: setup direct input
+ "mov x9, %x[input_ptr]\n"
+ "add x26, x9, x19\n"
+ "add x24, x26, x19\n"
+ "add x23, x24, x19\n"
+ "add x22, x23, x19\n"
+ "add x20, x22, x19\n"
+ "141:" // Height 6: input setup done
+ "cmp x10, #0x10\n"
+ "blt 144f\n"
+ "ldr q0, [x9, #0x0]\n"
+ "ldr q1, [x26, #0x0]\n"
+ "cmp x10, #0x20\n"
+ "ldr q2, [x24, #0x0]\n"
+ "ldr q3, [x23, #0x0]\n"
+ "ldr q4, [x22, #0x0]\n"
+ "ldr q5, [x20, #0x0]\n"
+ "ldr q6, [x12, #0x0]\n"
+ "blt 143f\n"
+ "142:" // Height 6: Multiply loop: Main loop head
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n"
+ "ldr d7, [x12, #0x10]\n"
+ "add x9, x9, #0x10\n"
+ "ldr x19, [x12, #0x18]\n"
+ "add x26, x26, #0x10\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
+ "add x24, x24, #0x10\n"
+ "ldr d6, [x12, #0x20]\n"
+ "add x23, x23, #0x10\n"
+ "mov v7.d[1], x19\n"
+ "ldr x28, [x12, #0x28]\n"
+ "ldr x19, [x12, #0x38]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ "mov v6.d[1], x28\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n"
+ "ldr d7, [x12, #0x30]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x20, x20, #0x10\n"
+ "ldr d6, [x12, #0x40]\n"
+ "sub x10, x10, #0x10\n"
+ "mov v7.d[1], x19\n"
+ "ldr x28, [x12, #0x48]\n"
+ "ldr x19, [x12, #0x58]\n"
+ "cmp x10, #0x20\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ "mov v6.d[1], x28\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n"
+ ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n"
+ ".inst 0x4fa5e0dc // sdot v28.4s, v6.16b, v5.4b[1]\n"
+ "ldr d7, [x12, #0x50]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "ldr d6, [x12, #0x60]\n"
+ "mov v7.d[1], x19\n"
+ "ldr x28, [x12, #0x68]\n"
+ "ldr x19, [x12, #0x78]\n"
+ ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
+ "mov v6.d[1], x28\n"
+ ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n"
+ ".inst 0x4fa5e0fd // sdot v29.4s, v7.16b, v5.4b[1]\n"
+ ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n"
+ ".inst 0x4fa5e0de // sdot v30.4s, v6.16b, v5.4b[1]\n"
+ "ldr d7, [x12, #0x70]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "ldr d6, [x12, #0x80]\n"
+ "mov v7.d[1], x19\n"
+ "ldr x28, [x12, #0x88]\n"
+ "ldr x19, [x12, #0x98]\n"
+ ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
+ "mov v6.d[1], x28\n"
+ ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n"
+ ".inst 0x4fa5e0ff // sdot v31.4s, v7.16b, v5.4b[1]\n"
+ ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n"
+ ".inst 0x4f85e8dc // sdot v28.4s, v6.16b, v5.4b[2]\n"
+ "ldr d7, [x12, #0x90]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "ldr d6, [x12, #0xa0]\n"
+ "mov v7.d[1], x19\n"
+ "ldr x28, [x12, #0xa8]\n"
+ "ldr x19, [x12, #0xb8]\n"
+ ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
+ "mov v6.d[1], x28\n"
+ ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n"
+ ".inst 0x4f85e8fd // sdot v29.4s, v7.16b, v5.4b[2]\n"
+ ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n"
+ ".inst 0x4f85e8de // sdot v30.4s, v6.16b, v5.4b[2]\n"
+ "ldr d7, [x12, #0xb0]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "ldr d6, [x12, #0xc0]\n"
+ "mov v7.d[1], x19\n"
+ "ldr x28, [x12, #0xc8]\n"
+ "ldr x19, [x12, #0xd8]\n"
+ ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
+ "mov v6.d[1], x28\n"
+ ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n"
+ ".inst 0x4f85e8ff // sdot v31.4s, v7.16b, v5.4b[2]\n"
+ ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n"
+ ".inst 0x4fa5e8dc // sdot v28.4s, v6.16b, v5.4b[3]\n"
+ "ldr d7, [x12, #0xd0]\n"
+ "ldr d6, [x12, #0xe0]\n"
+ "ldr x28, [x12, #0xe8]\n"
+ "mov v7.d[1], x19\n"
+ "ldr x19, [x12, #0xf8]\n"
+ "ldr x27, [x9, #0x8]\n"
+ "mov v6.d[1], x28\n"
+ ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n"
+ ".inst 0x4fa5e8fd // sdot v29.4s, v7.16b, v5.4b[3]\n"
+ ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n"
+ ".inst 0x4fa5e8de // sdot v30.4s, v6.16b, v5.4b[3]\n"
+ "ldr d7, [x12, #0xf0]\n"
+ "ldr x25, [x26, #0x8]\n"
+ "add x12, x12, #0x100\n"
+ "ldr d6, [x12, #0x0]\n"
+ "mov v7.d[1], x19\n"
+ "ldr x28, [x12, #0x8]\n"
+ "ldr x21, [x24, #0x8]\n"
+ ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
+ "mov v6.d[1], x28\n"
+ ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n"
+ ".inst 0x4fa5e8ff // sdot v31.4s, v7.16b, v5.4b[3]\n"
+ "ldr d0, [x9, #0x0]\n"
+ "ldr d1, [x26, #0x0]\n"
+ "ldr d2, [x24, #0x0]\n"
+ "mov v0.d[1], x27\n"
+ "ldr d3, [x23, #0x0]\n"
+ "mov v1.d[1], x25\n"
+ "ldr x19, [x23, #0x8]\n"
+ "mov v2.d[1], x21\n"
+ "ldr d4, [x22, #0x0]\n"
+ "ldr x21, [x22, #0x8]\n"
+ "mov v3.d[1], x19\n"
+ "ldr d5, [x20, #0x0]\n"
+ "ldr x19, [x20, #0x8]\n"
+ "mov v4.d[1], x21\n"
+ "mov v5.d[1], x19\n"
+ "bge 142b\n"
+ "143:" // Height 6: Multiply loop: Single iteration only
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n"
+ "ldr q7, [x12, #0x10]\n"
+ "sub x10, x10, #0x10\n"
+ "ldr q6, [x12, #0x20]\n"
+ "add x9, x9, #0x10\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n"
+ "ldr q7, [x12, #0x30]\n"
+ "add x24, x24, #0x10\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x23, x23, #0x10\n"
+ "ldr q6, [x12, #0x40]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n"
+ ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n"
+ ".inst 0x4fa5e0dc // sdot v28.4s, v6.16b, v5.4b[1]\n"
+ "ldr q7, [x12, #0x50]\n"
+ "add x20, x20, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "ldr q6, [x12, #0x60]\n"
+ ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n"
+ ".inst 0x4fa5e0fd // sdot v29.4s, v7.16b, v5.4b[1]\n"
+ ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n"
+ ".inst 0x4fa5e0de // sdot v30.4s, v6.16b, v5.4b[1]\n"
+ "ldr q7, [x12, #0x70]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "ldr q6, [x12, #0x80]\n"
+ ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n"
+ ".inst 0x4fa5e0ff // sdot v31.4s, v7.16b, v5.4b[1]\n"
+ ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n"
+ ".inst 0x4f85e8dc // sdot v28.4s, v6.16b, v5.4b[2]\n"
+ "ldr q7, [x12, #0x90]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "ldr q6, [x12, #0xa0]\n"
+ ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n"
+ ".inst 0x4f85e8fd // sdot v29.4s, v7.16b, v5.4b[2]\n"
+ ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n"
+ ".inst 0x4f85e8de // sdot v30.4s, v6.16b, v5.4b[2]\n"
+ "ldr q7, [x12, #0xb0]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "ldr q6, [x12, #0xc0]\n"
+ ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n"
+ ".inst 0x4f85e8ff // sdot v31.4s, v7.16b, v5.4b[2]\n"
+ ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n"
+ ".inst 0x4fa5e8dc // sdot v28.4s, v6.16b, v5.4b[3]\n"
+ "ldr q7, [x12, #0xd0]\n"
+ "ldr q6, [x12, #0xe0]\n"
+ ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n"
+ ".inst 0x4fa5e8fd // sdot v29.4s, v7.16b, v5.4b[3]\n"
+ ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n"
+ ".inst 0x4fa5e8de // sdot v30.4s, v6.16b, v5.4b[3]\n"
+ "ldr q7, [x12, #0xf0]\n"
+ "add x12, x12, #0x100\n"
+ ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n"
+ ".inst 0x4fa5e8ff // sdot v31.4s, v7.16b, v5.4b[3]\n"
+ "144:" // Height 6: Multiply loop: Main loop skip
+ "cbz x10, 149f\n"
+ "cmp x10, #0x4\n"
+ "blt 146f\n"
+ "145:" // Height 6: Multiply loop: Odd block loop
+ "ldr s0, [x9], #0x4\n"
+ "sub x10, x10, #0x4\n"
+ "ldr s1, [x26], #0x4\n"
+ "cmp x10, #0x4\n"
+ "ldr s2, [x24], #0x4\n"
+ "ldr s3, [x23], #0x4\n"
+ "ldr s4, [x22], #0x4\n"
+ "ldr s5, [x20], #0x4\n"
+ "ldr q6, [x12, #0x0]\n"
+ "ldr q7, [x12, #0x10]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n"
+ "ldr q6, [x12, #0x20]\n"
+ "ldr q7, [x12, #0x30]\n"
+ "add x12, x12, #0x40\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n"
+ "bge 145b\n"
+ "cbz x10, 149f\n"
+ "146:" // Height 6: Multiply loop: Skip odd blocks
+ "tbz x10, #1, 147f\n"
+ "ldr h0, [x9], #0x2\n"
+ "ldr h1, [x26], #0x2\n"
+ "ldr h2, [x24], #0x2\n"
+ "ldr h3, [x23], #0x2\n"
+ "ldr h4, [x22], #0x2\n"
+ "ldr h5, [x20], #0x2\n"
+ "tbz x10, #0, 148f\n"
+ "ld1 { v0.b }[2], [x9]\n"
+ "ld1 { v1.b }[2], [x26]\n"
+ "ld1 { v2.b }[2], [x24]\n"
+ "ld1 { v3.b }[2], [x23]\n"
+ "ld1 { v4.b }[2], [x22]\n"
+ "ld1 { v5.b }[2], [x20]\n"
+ "b 148f\n"
+ "147:" // Height 6: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x9, #0x0]\n"
+ "ldr b1, [x26, #0x0]\n"
+ "ldr b2, [x24, #0x0]\n"
+ "ldr b3, [x23, #0x0]\n"
+ "ldr b4, [x22, #0x0]\n"
+ "ldr b5, [x20, #0x0]\n"
+ "148:" // Height 6: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x12, #0x0]\n"
+ "ldr q7, [x12, #0x10]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n"
+ "ldr q6, [x12, #0x20]\n"
+ "ldr q7, [x12, #0x30]\n"
+ "add x12, x12, #0x40\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n"
+ "149:" // Height 6: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x11, x11, #0x1\n"
+ "cmp x11, x19\n"
+ "bne 139b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "ldr q0, [x15, #0x0]\n"
+ "ldr q1, [x15, #0x10]\n"
+ "add x23, x13, x19\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "add x22, x23, x19\n"
+ "add v8.4s, v8.4s, v0.4s\n"
+ "add v12.4s, v12.4s, v0.4s\n"
+ "add v9.4s, v9.4s, v1.4s\n"
+ "add v13.4s, v13.4s, v1.4s\n"
+ "add v16.4s, v16.4s, v0.4s\n"
+ "add v17.4s, v17.4s, v1.4s\n"
+ "add v20.4s, v20.4s, v0.4s\n"
+ "add v21.4s, v21.4s, v1.4s\n"
+ "add v24.4s, v24.4s, v0.4s\n"
+ "add v25.4s, v25.4s, v1.4s\n"
+ "add v28.4s, v28.4s, v0.4s\n"
+ "add v29.4s, v29.4s, v1.4s\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "add x21, x22, x19\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "add x20, x21, x19\n"
+ "prfm pstl1keep, [x20, #0x0]\n"
+ "add x19, x20, x19\n"
+ "prfm pstl1keep, [x19, #0x0]\n"
+ "ldr q2, [x15, #0x20]\n"
+ "ldr q3, [x15, #0x30]\n"
+ "add x15, x15, #0x40\n"
+ "add v10.4s, v10.4s, v2.4s\n"
+ "add v14.4s, v14.4s, v2.4s\n"
+ "add v11.4s, v11.4s, v3.4s\n"
+ "add v15.4s, v15.4s, v3.4s\n"
+ "add v18.4s, v18.4s, v2.4s\n"
+ "add v19.4s, v19.4s, v3.4s\n"
+ "add v22.4s, v22.4s, v2.4s\n"
+ "add v23.4s, v23.4s, v3.4s\n"
+ "add v26.4s, v26.4s, v2.4s\n"
+ "add v27.4s, v27.4s, v3.4s\n"
+ "add v30.4s, v30.4s, v2.4s\n"
+ "add v31.4s, v31.4s, v3.4s\n"
+ "tbz %x[flags], #4, 150f\n"
+ "ldr q0, [x16, #0x0]\n"
+ "ldr q4, [x17, #0x0]\n"
+ "ldr q1, [x16, #0x10]\n"
+ "ldr q5, [x17, #0x10]\n"
+ "ldr q2, [x16, #0x20]\n"
+ "ldr q6, [x17, #0x20]\n"
+ "ldr q3, [x16, #0x30]\n"
+ "add x16, x16, #0x40\n"
+ "ldr q7, [x17, #0x30]\n"
+ "add x17, x17, #0x40\n"
+ "b 151f\n"
+ "150:" // Height 6: per layer parameters
+ "add x25, %x[qp], %[per_layer_right_shift]\n"
+ "add x24, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v0.4s }, [x25]\n"
+ "ld1r { v4.4s }, [x24]\n"
+ "mov v1.16b, v0.16b\n"
+ "mov v2.16b, v0.16b\n"
+ "mov v5.16b, v4.16b\n"
+ "mov v6.16b, v4.16b\n"
+ "mov v3.16b, v0.16b\n"
+ "mov v7.16b, v4.16b\n"
+ "151:" // Height 6: parameters loaded
+ "sqrdmulh v8.4s, v8.4s, v4.4s\n"
+ "sqrdmulh v9.4s, v9.4s, v5.4s\n"
+ "sqrdmulh v10.4s, v10.4s, v6.4s\n"
+ "sqrdmulh v11.4s, v11.4s, v7.4s\n"
+ "sqrdmulh v12.4s, v12.4s, v4.4s\n"
+ "sqrdmulh v13.4s, v13.4s, v5.4s\n"
+ "sqrdmulh v14.4s, v14.4s, v6.4s\n"
+ "sqrdmulh v15.4s, v15.4s, v7.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v4.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v5.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v6.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v7.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v4.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v5.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v6.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v7.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v4.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v5.4s\n"
+ "sqrdmulh v26.4s, v26.4s, v6.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v7.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v4.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v5.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v6.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v7.4s\n"
+ "tbz %x[flags], #5, 152f\n"
+ "and v4.16b, v8.16b, v0.16b\n"
+ "and v5.16b, v9.16b, v1.16b\n"
+ "and v6.16b, v10.16b, v2.16b\n"
+ "and v7.16b, v11.16b, v3.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v8.4s, v8.4s, v4.4s\n"
+ "and v4.16b, v12.16b, v0.16b\n"
+ "sqadd v9.4s, v9.4s, v5.4s\n"
+ "sqadd v10.4s, v10.4s, v6.4s\n"
+ "sqadd v11.4s, v11.4s, v7.4s\n"
+ "and v5.16b, v13.16b, v1.16b\n"
+ "and v6.16b, v14.16b, v2.16b\n"
+ "and v7.16b, v15.16b, v3.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v12.4s, v12.4s, v4.4s\n"
+ "and v4.16b, v16.16b, v0.16b\n"
+ "sqadd v13.4s, v13.4s, v5.4s\n"
+ "and v5.16b, v17.16b, v1.16b\n"
+ "sqadd v14.4s, v14.4s, v6.4s\n"
+ "sqadd v15.4s, v15.4s, v7.4s\n"
+ "and v6.16b, v18.16b, v2.16b\n"
+ "and v7.16b, v19.16b, v3.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v4.4s\n"
+ "and v4.16b, v20.16b, v0.16b\n"
+ "sqadd v17.4s, v17.4s, v5.4s\n"
+ "and v5.16b, v21.16b, v1.16b\n"
+ "sqadd v18.4s, v18.4s, v6.4s\n"
+ "sqadd v19.4s, v19.4s, v7.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v6.16b, v22.16b, v2.16b\n"
+ "and v7.16b, v23.16b, v3.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v4.4s\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "and v4.16b, v24.16b, v0.16b\n"
+ "sqadd v21.4s, v21.4s, v5.4s\n"
+ "and v5.16b, v25.16b, v1.16b\n"
+ "sqadd v22.4s, v22.4s, v6.4s\n"
+ "and v6.16b, v26.16b, v2.16b\n"
+ "sqadd v23.4s, v23.4s, v7.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "and v7.16b, v27.16b, v3.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v24.4s, v24.4s, v4.4s\n"
+ "and v4.16b, v28.16b, v0.16b\n"
+ "sqadd v25.4s, v25.4s, v5.4s\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "and v5.16b, v29.16b, v1.16b\n"
+ "sqadd v26.4s, v26.4s, v6.4s\n"
+ "and v6.16b, v30.16b, v2.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v27.4s, v27.4s, v7.4s\n"
+ "and v7.16b, v31.16b, v3.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v28.4s, v28.4s, v4.4s\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v29.4s, v29.4s, v5.4s\n"
+ "sqadd v30.4s, v30.4s, v6.4s\n"
+ "sqadd v31.4s, v31.4s, v7.4s\n"
+ "152:" // Height 6: no shift correction
+ "srshl v8.4s, v8.4s, v0.4s\n"
+ "srshl v9.4s, v9.4s, v1.4s\n"
+ "srshl v10.4s, v10.4s, v2.4s\n"
+ "srshl v11.4s, v11.4s, v3.4s\n"
+ "srshl v12.4s, v12.4s, v0.4s\n"
+ "srshl v13.4s, v13.4s, v1.4s\n"
+ "srshl v14.4s, v14.4s, v2.4s\n"
+ "srshl v15.4s, v15.4s, v3.4s\n"
+ "srshl v16.4s, v16.4s, v0.4s\n"
+ "srshl v17.4s, v17.4s, v1.4s\n"
+ "srshl v18.4s, v18.4s, v2.4s\n"
+ "srshl v19.4s, v19.4s, v3.4s\n"
+ "srshl v20.4s, v20.4s, v0.4s\n"
+ "srshl v21.4s, v21.4s, v1.4s\n"
+ "srshl v22.4s, v22.4s, v2.4s\n"
+ "srshl v23.4s, v23.4s, v3.4s\n"
+ "srshl v24.4s, v24.4s, v0.4s\n"
+ "srshl v25.4s, v25.4s, v1.4s\n"
+ "srshl v26.4s, v26.4s, v2.4s\n"
+ "srshl v27.4s, v27.4s, v3.4s\n"
+ "srshl v28.4s, v28.4s, v0.4s\n"
+ "srshl v29.4s, v29.4s, v1.4s\n"
+ "srshl v30.4s, v30.4s, v2.4s\n"
+ "srshl v31.4s, v31.4s, v3.4s\n"
+ "add x24, %x[qp], %[c_offset]\n"
+ "add x25, %x[qp], %[minval]\n"
+ "ld1r { v4.4s }, [x24]\n"
+ "add x24, %x[qp], %[maxval]\n"
+ "ld1r { v5.4s }, [x25]\n"
+ "cmp x14, #0x10\n"
+ "ld1r { v6.4s }, [x24]\n"
+ "add v8.4s, v8.4s, v4.4s\n"
+ "add v9.4s, v9.4s, v4.4s\n"
+ "add v10.4s, v10.4s, v4.4s\n"
+ "add v11.4s, v11.4s, v4.4s\n"
+ "add v12.4s, v12.4s, v4.4s\n"
+ "add v13.4s, v13.4s, v4.4s\n"
+ "add v14.4s, v14.4s, v4.4s\n"
+ "add v15.4s, v15.4s, v4.4s\n"
+ "add v16.4s, v16.4s, v4.4s\n"
+ "add v17.4s, v17.4s, v4.4s\n"
+ "smin v8.4s, v8.4s, v6.4s\n"
+ "smin v9.4s, v9.4s, v6.4s\n"
+ "smin v10.4s, v10.4s, v6.4s\n"
+ "smin v11.4s, v11.4s, v6.4s\n"
+ "smin v12.4s, v12.4s, v6.4s\n"
+ "smin v13.4s, v13.4s, v6.4s\n"
+ "smin v14.4s, v14.4s, v6.4s\n"
+ "smin v15.4s, v15.4s, v6.4s\n"
+ "smin v16.4s, v16.4s, v6.4s\n"
+ "smin v17.4s, v17.4s, v6.4s\n"
+ "smax v8.4s, v8.4s, v5.4s\n"
+ "smax v9.4s, v9.4s, v5.4s\n"
+ "smax v10.4s, v10.4s, v5.4s\n"
+ "smax v11.4s, v11.4s, v5.4s\n"
+ "smax v12.4s, v12.4s, v5.4s\n"
+ "smax v13.4s, v13.4s, v5.4s\n"
+ "smax v14.4s, v14.4s, v5.4s\n"
+ "smax v15.4s, v15.4s, v5.4s\n"
+ "smax v16.4s, v16.4s, v5.4s\n"
+ "smax v17.4s, v17.4s, v5.4s\n"
+ "add v18.4s, v18.4s, v4.4s\n"
+ "add v19.4s, v19.4s, v4.4s\n"
+ "add v20.4s, v20.4s, v4.4s\n"
+ "add v21.4s, v21.4s, v4.4s\n"
+ "add v22.4s, v22.4s, v4.4s\n"
+ "add v23.4s, v23.4s, v4.4s\n"
+ "add v24.4s, v24.4s, v4.4s\n"
+ "add v25.4s, v25.4s, v4.4s\n"
+ "add v26.4s, v26.4s, v4.4s\n"
+ "add v27.4s, v27.4s, v4.4s\n"
+ "smin v18.4s, v18.4s, v6.4s\n"
+ "smin v19.4s, v19.4s, v6.4s\n"
+ "smin v20.4s, v20.4s, v6.4s\n"
+ "smin v21.4s, v21.4s, v6.4s\n"
+ "smin v22.4s, v22.4s, v6.4s\n"
+ "smin v23.4s, v23.4s, v6.4s\n"
+ "smin v24.4s, v24.4s, v6.4s\n"
+ "smin v25.4s, v25.4s, v6.4s\n"
+ "smin v26.4s, v26.4s, v6.4s\n"
+ "smin v27.4s, v27.4s, v6.4s\n"
+ "smax v18.4s, v18.4s, v5.4s\n"
+ "smax v19.4s, v19.4s, v5.4s\n"
+ "smax v20.4s, v20.4s, v5.4s\n"
+ "smax v21.4s, v21.4s, v5.4s\n"
+ "smax v22.4s, v22.4s, v5.4s\n"
+ "smax v23.4s, v23.4s, v5.4s\n"
+ "smax v24.4s, v24.4s, v5.4s\n"
+ "smax v25.4s, v25.4s, v5.4s\n"
+ "smax v26.4s, v26.4s, v5.4s\n"
+ "smax v27.4s, v27.4s, v5.4s\n"
+ "add v28.4s, v28.4s, v4.4s\n"
+ "add v29.4s, v29.4s, v4.4s\n"
+ "add v30.4s, v30.4s, v4.4s\n"
+ "add v31.4s, v31.4s, v4.4s\n"
+ "uzp1 v8.8h, v8.8h, v9.8h\n"
+ "uzp1 v9.8h, v10.8h, v11.8h\n"
+ "uzp1 v12.8h, v12.8h, v13.8h\n"
+ "uzp1 v13.8h, v14.8h, v15.8h\n"
+ "uzp1 v16.8h, v16.8h, v17.8h\n"
+ "uzp1 v17.8h, v18.8h, v19.8h\n"
+ "smin v28.4s, v28.4s, v6.4s\n"
+ "smin v29.4s, v29.4s, v6.4s\n"
+ "smin v30.4s, v30.4s, v6.4s\n"
+ "smin v31.4s, v31.4s, v6.4s\n"
+ "uzp1 v20.8h, v20.8h, v21.8h\n"
+ "uzp1 v21.8h, v22.8h, v23.8h\n"
+ "uzp1 v24.8h, v24.8h, v25.8h\n"
+ "uzp1 v25.8h, v26.8h, v27.8h\n"
+ "uzp1 v8.16b, v8.16b, v9.16b\n"
+ "uzp1 v12.16b, v12.16b, v13.16b\n"
+ "smax v28.4s, v28.4s, v5.4s\n"
+ "smax v29.4s, v29.4s, v5.4s\n"
+ "smax v30.4s, v30.4s, v5.4s\n"
+ "smax v31.4s, v31.4s, v5.4s\n"
+ "uzp1 v16.16b, v16.16b, v17.16b\n"
+ "uzp1 v20.16b, v20.16b, v21.16b\n"
+ "uzp1 v24.16b, v24.16b, v25.16b\n"
+ "uzp1 v28.8h, v28.8h, v29.8h\n"
+ "uzp1 v29.8h, v30.8h, v31.8h\n"
+ "uzp1 v28.16b, v28.16b, v29.16b\n"
+ "bge 161f\n"
+ "tbz x14, #3, 156f\n"
+ "str d8, [x13], #0x8\n"
+ "str d12, [x23], #0x8\n"
+ "str d16, [x22], #0x8\n"
+ "str d20, [x21], #0x8\n"
+ "str d24, [x20], #0x8\n"
+ "str d28, [x19], #0x8\n"
+ "tbz x14, #2, 154f\n"
+ "st1 { v8.s }[2], [x13], #0x4\n"
+ "st1 { v12.s }[2], [x23], #0x4\n"
+ "st1 { v16.s }[2], [x22], #0x4\n"
+ "st1 { v20.s }[2], [x21], #0x4\n"
+ "st1 { v24.s }[2], [x20], #0x4\n"
+ "st1 { v28.s }[2], [x19], #0x4\n"
+ "tbz x14, #1, 153f\n"
+ "st1 { v8.h }[6], [x13], #0x2\n"
+ "st1 { v12.h }[6], [x23], #0x2\n"
+ "st1 { v16.h }[6], [x22], #0x2\n"
+ "st1 { v20.h }[6], [x21], #0x2\n"
+ "st1 { v24.h }[6], [x20], #0x2\n"
+ "st1 { v28.h }[6], [x19], #0x2\n"
+ "tbz x14, #0, 160f\n"
+ "st1 { v8.b }[14], [x13]\n"
+ "st1 { v12.b }[14], [x23]\n"
+ "st1 { v16.b }[14], [x22]\n"
+ "st1 { v20.b }[14], [x21]\n"
+ "st1 { v24.b }[14], [x20]\n"
+ "st1 { v28.b }[14], [x19]\n"
+ "b 160f\n"
+ "153:" // Height 6: Partial direct writeback: partial_1_12
+ "tbz x14, #0, 160f\n"
+ "st1 { v8.b }[12], [x13]\n"
+ "st1 { v12.b }[12], [x23]\n"
+ "st1 { v16.b }[12], [x22]\n"
+ "st1 { v20.b }[12], [x21]\n"
+ "st1 { v24.b }[12], [x20]\n"
+ "st1 { v28.b }[12], [x19]\n"
+ "b 160f\n"
+ "154:" // Height 6: Partial direct writeback: partial_2_8
+ "tbz x14, #1, 155f\n"
+ "st1 { v8.h }[4], [x13], #0x2\n"
+ "st1 { v12.h }[4], [x23], #0x2\n"
+ "st1 { v16.h }[4], [x22], #0x2\n"
+ "st1 { v20.h }[4], [x21], #0x2\n"
+ "st1 { v24.h }[4], [x20], #0x2\n"
+ "st1 { v28.h }[4], [x19], #0x2\n"
+ "tbz x14, #0, 160f\n"
+ "st1 { v8.b }[10], [x13]\n"
+ "st1 { v12.b }[10], [x23]\n"
+ "st1 { v16.b }[10], [x22]\n"
+ "st1 { v20.b }[10], [x21]\n"
+ "st1 { v24.b }[10], [x20]\n"
+ "st1 { v28.b }[10], [x19]\n"
+ "b 160f\n"
+ "155:" // Height 6: Partial direct writeback: partial_1_8
+ "tbz x14, #0, 160f\n"
+ "st1 { v8.b }[8], [x13]\n"
+ "st1 { v12.b }[8], [x23]\n"
+ "st1 { v16.b }[8], [x22]\n"
+ "st1 { v20.b }[8], [x21]\n"
+ "st1 { v24.b }[8], [x20]\n"
+ "st1 { v28.b }[8], [x19]\n"
+ "b 160f\n"
+ "156:" // Height 6: Partial direct writeback: partial_4_0
+ "tbz x14, #2, 158f\n"
+ "str s8, [x13], #0x4\n"
+ "str s12, [x23], #0x4\n"
+ "str s16, [x22], #0x4\n"
+ "str s20, [x21], #0x4\n"
+ "str s24, [x20], #0x4\n"
+ "str s28, [x19], #0x4\n"
+ "tbz x14, #1, 157f\n"
+ "st1 { v8.h }[2], [x13], #0x2\n"
+ "st1 { v12.h }[2], [x23], #0x2\n"
+ "st1 { v16.h }[2], [x22], #0x2\n"
+ "st1 { v20.h }[2], [x21], #0x2\n"
+ "st1 { v24.h }[2], [x20], #0x2\n"
+ "st1 { v28.h }[2], [x19], #0x2\n"
+ "tbz x14, #0, 160f\n"
+ "st1 { v8.b }[6], [x13]\n"
+ "st1 { v12.b }[6], [x23]\n"
+ "st1 { v16.b }[6], [x22]\n"
+ "st1 { v20.b }[6], [x21]\n"
+ "st1 { v24.b }[6], [x20]\n"
+ "st1 { v28.b }[6], [x19]\n"
+ "b 160f\n"
+ "157:" // Height 6: Partial direct writeback: partial_1_4
+ "tbz x14, #0, 160f\n"
+ "st1 { v8.b }[4], [x13]\n"
+ "st1 { v12.b }[4], [x23]\n"
+ "st1 { v16.b }[4], [x22]\n"
+ "st1 { v20.b }[4], [x21]\n"
+ "st1 { v24.b }[4], [x20]\n"
+ "st1 { v28.b }[4], [x19]\n"
+ "b 160f\n"
+ "158:" // Height 6: Partial direct writeback: partial_2_0
+ "tbz x14, #1, 159f\n"
+ "str h8, [x13], #0x2\n"
+ "str h12, [x23], #0x2\n"
+ "str h16, [x22], #0x2\n"
+ "str h20, [x21], #0x2\n"
+ "str h24, [x20], #0x2\n"
+ "str h28, [x19], #0x2\n"
+ "tbz x14, #0, 160f\n"
+ "st1 { v8.b }[2], [x13]\n"
+ "st1 { v12.b }[2], [x23]\n"
+ "st1 { v16.b }[2], [x22]\n"
+ "st1 { v20.b }[2], [x21]\n"
+ "st1 { v24.b }[2], [x20]\n"
+ "st1 { v28.b }[2], [x19]\n"
+ "b 160f\n"
+ "159:" // Height 6: Partial direct writeback: partial_1_0
+ "str b8, [x13, #0x0]\n"
+ "str b12, [x23, #0x0]\n"
+ "str b16, [x22, #0x0]\n"
+ "str b20, [x21, #0x0]\n"
+ "str b24, [x20, #0x0]\n"
+ "str b28, [x19, #0x0]\n"
+ "160:" // Height 6: Partial direct writeback: Done
+ "b 162f\n"
+ "161:" // Height 6: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "add x13, x13, #0x10\n"
+ "str q12, [x23, #0x0]\n"
+ "str q16, [x22, #0x0]\n"
+ "str q20, [x21, #0x0]\n"
+ "str q24, [x20, #0x0]\n"
+ "str q28, [x19, #0x0]\n"
+ "162:" // Height 6: Writeback done
+ "subs x14, x14, #0x10\n"
+ "bgt 137b\n"
+ "subs %x[M], %x[M], #0x6\n"
+ "beq 164f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 163f\n"
+ "add x20, x20, #0x6\n"
+ "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "163:" // Update direct input
+ "mov x19, #0x6\n"
+ "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "b 1b\n"
+ "164:" // Exit
+
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [flags] "r" (flags), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_multiplier_ptr] "I" (offsetof(KernelArgs, multiplier_ptr)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_shift_ptr] "I" (offsetof(KernelArgs, shift_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/generic.cpp
index 9847e6553b..5a4df161aa 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/generic.cpp
@@ -88,210 +88,204 @@ void a64_hybrid_s8qs_dot_6x16 (
"1:" // Row loop
"cmp %x[M], #0x6\n"
- "bge 141f\n"
+ "bge 136f\n"
"cmp %x[M], #0x4\n"
- "bgt 113f\n"
- "beq 85f\n"
+ "bgt 109f\n"
+ "beq 82f\n"
"cmp %x[M], #0x2\n"
- "bgt 57f\n"
- "beq 29f\n"
- "ldr x8, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
- "ldr x17, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
- "mov x16, %x[col_bias]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "tbz %x[flags], #2, 2f\n"
- "ldr x13, [%x[output_ptr], #0x0]\n"
- "add x13, x13, x19\n"
- "b 3f\n"
- "2:" // Height 1: setup direct output
- "mov x13, %x[output_ptr]\n"
- "3:" // Height 1: Column loop
+ "bgt 55f\n"
+ "beq 28f\n"
+ "ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
+ "mov x11, %x[col_bias]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "2:" // Height 1: Column loop
"movi v8.4s, #0x0\n"
"movi v9.4s, #0x0\n"
"movi v10.4s, #0x0\n"
"movi v11.4s, #0x0\n"
- "4:" // Height 1: setup done
- "mov x12, #0x0\n"
- "5:" // Height 1: String loop
+ "3:" // Height 1: setup done
+ "mov x27, #0x0\n"
+ "4:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w11, [x20, x12, LSL #0x2]\n"
- "tbz %x[flags], #3, 6f\n"
- "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 5f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x10, [x20, #0x0]\n"
- "cbnz x12, 7f\n"
+ "ldr x25, [x20, #0x0]\n"
+ "cbnz x27, 6f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x10, x10, x19\n"
- "b 7f\n"
- "6:" // Height 1: setup direct input
- "mov x10, %x[input_ptr]\n"
- "7:" // Height 1: input setup done
- "cmp x11, #0x10\n"
- "blt 10f\n"
- "cmp x11, #0x20\n"
+ "add x25, x25, x19\n"
+ "b 6f\n"
+ "5:" // Height 1: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "6:" // Height 1: input setup done
+ "cmp x26, #0x10\n"
"blt 9f\n"
- "8:" // Height 1: Multiply loop: Main loop head
- "ldr q0, [x10, #0x0]\n"
- "ldr q6, [x14, #0x0]\n"
+ "ldr q0, [x25, #0x0]\n"
+ "ldr q6, [x28, #0x0]\n"
+ "cmp x26, #0x20\n"
+ "blt 8f\n"
+ "7:" // Height 1: Multiply loop: Main loop head
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x14, #0x10]\n"
- "ldr q6, [x14, #0x20]\n"
+ "ldr q7, [x28, #0x10]\n"
+ "add x25, x25, #0x10\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "ldr q7, [x14, #0x30]\n"
+ "ldr q6, [x28, #0x20]\n"
+ "sub x26, x26, #0x10\n"
".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- "ldr q6, [x14, #0x40]\n"
- "add x10, x10, #0x10\n"
+ "ldr q7, [x28, #0x30]\n"
+ "cmp x26, #0x20\n"
".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
- "ldr q7, [x14, #0x50]\n"
+ "ldr q6, [x28, #0x40]\n"
+ "ldr q7, [x28, #0x50]\n"
".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
- "ldr q6, [x14, #0x60]\n"
- "sub x11, x11, #0x10\n"
+ "ldr q6, [x28, #0x60]\n"
".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
- "ldr q7, [x14, #0x70]\n"
- "cmp x11, #0x20\n"
+ "ldr q7, [x28, #0x70]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
- "ldr q6, [x14, #0x80]\n"
+ "ldr q6, [x28, #0x80]\n"
".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
- "ldr q7, [x14, #0x90]\n"
+ "ldr q7, [x28, #0x90]\n"
".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
- "ldr q6, [x14, #0xa0]\n"
+ "ldr q6, [x28, #0xa0]\n"
".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
- "ldr q7, [x14, #0xb0]\n"
+ "ldr q7, [x28, #0xb0]\n"
".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
- "ldr q6, [x14, #0xc0]\n"
+ "ldr q6, [x28, #0xc0]\n"
".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
- "ldr q7, [x14, #0xd0]\n"
+ "ldr q7, [x28, #0xd0]\n"
".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
- "ldr q6, [x14, #0xe0]\n"
+ "ldr q6, [x28, #0xe0]\n"
".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
- "ldr q7, [x14, #0xf0]\n"
- "add x14, x14, #0x100\n"
+ "ldr q7, [x28, #0xf0]\n"
+ "add x28, x28, #0x100\n"
".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ "ldr q6, [x28, #0x0]\n"
".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
- "bge 8b\n"
- "9:" // Height 1: Multiply loop: Single iteration only
- "sub x11, x11, #0x10\n"
- "ldr q0, [x10, #0x0]\n"
- "ldr q6, [x14, #0x0]\n"
+ "ldr q0, [x25, #0x0]\n"
+ "bge 7b\n"
+ "8:" // Height 1: Multiply loop: Single iteration only
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x14, #0x10]\n"
- "ldr q6, [x14, #0x20]\n"
+ "ldr q7, [x28, #0x10]\n"
+ "sub x26, x26, #0x10\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "ldr q7, [x14, #0x30]\n"
+ "ldr q6, [x28, #0x20]\n"
+ "add x25, x25, #0x10\n"
".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- "ldr q6, [x14, #0x40]\n"
- "add x10, x10, #0x10\n"
+ "ldr q7, [x28, #0x30]\n"
+ "ldr q6, [x28, #0x40]\n"
".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
- "ldr q7, [x14, #0x50]\n"
+ "ldr q7, [x28, #0x50]\n"
".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
- "ldr q6, [x14, #0x60]\n"
+ "ldr q6, [x28, #0x60]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
- "ldr q7, [x14, #0x70]\n"
+ "ldr q7, [x28, #0x70]\n"
".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
- "ldr q6, [x14, #0x80]\n"
+ "ldr q6, [x28, #0x80]\n"
".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
- "ldr q7, [x14, #0x90]\n"
+ "ldr q7, [x28, #0x90]\n"
".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
- "ldr q6, [x14, #0xa0]\n"
+ "ldr q6, [x28, #0xa0]\n"
".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
- "ldr q7, [x14, #0xb0]\n"
+ "ldr q7, [x28, #0xb0]\n"
".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
- "ldr q6, [x14, #0xc0]\n"
+ "ldr q6, [x28, #0xc0]\n"
".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
- "ldr q7, [x14, #0xd0]\n"
+ "ldr q7, [x28, #0xd0]\n"
".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
- "ldr q6, [x14, #0xe0]\n"
+ "ldr q6, [x28, #0xe0]\n"
".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
- "ldr q7, [x14, #0xf0]\n"
- "add x14, x14, #0x100\n"
+ "ldr q7, [x28, #0xf0]\n"
+ "add x28, x28, #0x100\n"
".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
- "10:" // Height 1: Multiply loop: Main loop skip
- "cbz x11, 15f\n"
- "cmp x11, #0x4\n"
- "blt 12f\n"
- "11:" // Height 1: Multiply loop: Odd block loop
- "ldr s0, [x10], #0x4\n"
- "ldr q6, [x14, #0x0]\n"
+ "9:" // Height 1: Multiply loop: Main loop skip
+ "cbz x26, 14f\n"
+ "cmp x26, #0x4\n"
+ "blt 11f\n"
+ "10:" // Height 1: Multiply loop: Odd block loop
+ "ldr s0, [x25], #0x4\n"
+ "sub x26, x26, #0x4\n"
+ "ldr q6, [x28, #0x0]\n"
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x14, #0x10]\n"
- "ldr q6, [x14, #0x20]\n"
+ "ldr q7, [x28, #0x10]\n"
+ "cmp x26, #0x4\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "ldr q7, [x14, #0x30]\n"
+ "ldr q6, [x28, #0x20]\n"
+ "ldr q7, [x28, #0x30]\n"
".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- "sub x11, x11, #0x4\n"
- "add x14, x14, #0x40\n"
+ "add x28, x28, #0x40\n"
".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- "cmp x11, #0x4\n"
- "bge 11b\n"
- "cbz x11, 15f\n"
- "12:" // Height 1: Multiply loop: Skip odd blocks
- "tbz x11, #1, 13f\n"
- "ldr h0, [x10], #0x2\n"
- "tbz x11, #0, 14f\n"
- "ld1 { v0.b }[2], [x10]\n"
- "b 14f\n"
- "13:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x10, #0x0]\n"
- "14:" // Height 1: Multiply loop: Ragged operand read: Done
- "ldr q6, [x14, #0x0]\n"
+ "bge 10b\n"
+ "cbz x26, 14f\n"
+ "11:" // Height 1: Multiply loop: Skip odd blocks
+ "tbz x26, #1, 12f\n"
+ "ldr h0, [x25], #0x2\n"
+ "tbz x26, #0, 13f\n"
+ "ld1 { v0.b }[2], [x25]\n"
+ "b 13f\n"
+ "12:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x25, #0x0]\n"
+ "13:" // Height 1: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x28, #0x0]\n"
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x14, #0x10]\n"
- "ldr q6, [x14, #0x20]\n"
+ "ldr q7, [x28, #0x10]\n"
+ "ldr q6, [x28, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "ldr q7, [x14, #0x30]\n"
+ "ldr q7, [x28, #0x30]\n"
+ "add x28, x28, #0x40\n"
".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- "add x14, x14, #0x40\n"
".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- "15:" // Height 1: Multiply loop: No odd multiplies
+ "14:" // Height 1: Multiply loop: No odd multiplies
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x12, x12, #0x1\n"
- "cmp x12, x19\n"
- "bne 5b\n"
- "prfm pstl1keep, [x13, #0x0]\n"
- "ldr q0, [x16, #0x0]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 4b\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "ldr q0, [x11, #0x0]\n"
"add v8.4s, v8.4s, v0.4s\n"
- "ldr q1, [x16, #0x10]\n"
- "ldr q2, [x16, #0x20]\n"
+ "ldr q1, [x11, #0x10]\n"
+ "ldr q2, [x11, #0x20]\n"
"add v9.4s, v9.4s, v1.4s\n"
- "ldr q3, [x16, #0x30]\n"
+ "ldr q3, [x11, #0x30]\n"
+ "add x11, x11, #0x40\n"
"add v10.4s, v10.4s, v2.4s\n"
- "add x16, x16, #0x40\n"
"add v11.4s, v11.4s, v3.4s\n"
- "tbz %x[flags], #4, 16f\n"
- "ldr q0, [x17, #0x0]\n"
- "ldr q4, [x8, #0x0]\n"
- "ldr q1, [x17, #0x10]\n"
- "ldr q5, [x8, #0x10]\n"
- "ldr q2, [x17, #0x20]\n"
- "ldr q6, [x8, #0x20]\n"
- "ldr q3, [x17, #0x30]\n"
- "ldr q7, [x8, #0x30]\n"
- "add x17, x17, #0x40\n"
- "add x8, x8, #0x40\n"
- "b 17f\n"
- "16:" // Height 1: per layer parameters
- "add x19, %x[qp], %[per_layer_right_shift]\n"
- "ld1r { v0.4s }, [x19]\n"
+ "tbz %x[flags], #4, 15f\n"
+ "ldr q0, [x12, #0x0]\n"
+ "ldr q4, [x13, #0x0]\n"
+ "ldr q1, [x12, #0x10]\n"
+ "ldr q5, [x13, #0x10]\n"
+ "ldr q2, [x12, #0x20]\n"
+ "ldr q6, [x13, #0x20]\n"
+ "ldr q3, [x12, #0x30]\n"
+ "add x12, x12, #0x40\n"
+ "ldr q7, [x13, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "b 16f\n"
+ "15:" // Height 1: per layer parameters
+ "add x24, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x24]\n"
"mov v1.16b, v0.16b\n"
- "add x19, %x[qp], %[per_layer_mul]\n"
- "ld1r { v4.4s }, [x19]\n"
+ "add x24, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v4.4s }, [x24]\n"
"mov v2.16b, v0.16b\n"
"mov v3.16b, v0.16b\n"
"mov v5.16b, v4.16b\n"
"mov v6.16b, v4.16b\n"
"mov v7.16b, v4.16b\n"
- "17:" // Height 1: parameters loaded
+ "16:" // Height 1: parameters loaded
"sqrdmulh v8.4s, v8.4s, v4.4s\n"
"sqrdmulh v9.4s, v9.4s, v5.4s\n"
"sqrdmulh v10.4s, v10.4s, v6.4s\n"
"sqrdmulh v11.4s, v11.4s, v7.4s\n"
- "tbz %x[flags], #5, 18f\n"
+ "tbz %x[flags], #5, 17f\n"
"and v4.16b, v8.16b, v0.16b\n"
"sshr v4.4s, v4.4s, #0x1f\n"
"and v5.16b, v9.16b, v1.16b\n"
@@ -304,18 +298,18 @@ void a64_hybrid_s8qs_dot_6x16 (
"sqadd v9.4s, v9.4s, v5.4s\n"
"sqadd v10.4s, v10.4s, v6.4s\n"
"sqadd v11.4s, v11.4s, v7.4s\n"
- "18:" // Height 1: no shift correction
+ "17:" // Height 1: no shift correction
"srshl v8.4s, v8.4s, v0.4s\n"
- "add x19, %x[qp], %[c_offset]\n"
- "ld1r { v4.4s }, [x19]\n"
+ "add x24, %x[qp], %[c_offset]\n"
+ "ld1r { v4.4s }, [x24]\n"
"srshl v9.4s, v9.4s, v1.4s\n"
- "add x19, %x[qp], %[minval]\n"
+ "add x24, %x[qp], %[minval]\n"
"srshl v10.4s, v10.4s, v2.4s\n"
- "ld1r { v5.4s }, [x19]\n"
- "add x19, %x[qp], %[maxval]\n"
+ "ld1r { v5.4s }, [x24]\n"
+ "add x24, %x[qp], %[maxval]\n"
"srshl v11.4s, v11.4s, v3.4s\n"
- "ld1r { v6.4s }, [x19]\n"
- "cmp x15, #0x10\n"
+ "ld1r { v6.4s }, [x24]\n"
+ "cmp x10, #0x10\n"
"add v8.4s, v8.4s, v4.4s\n"
"add v9.4s, v9.4s, v4.4s\n"
"add v10.4s, v10.4s, v4.4s\n"
@@ -331,76 +325,67 @@ void a64_hybrid_s8qs_dot_6x16 (
"smax v11.4s, v11.4s, v5.4s\n"
"uzp1 v9.8h, v10.8h, v11.8h\n"
"uzp1 v8.16b, v8.16b, v9.16b\n"
- "bge 27f\n"
- "tbz x15, #3, 22f\n"
- "str d8, [x13], #0x8\n"
- "tbz x15, #2, 20f\n"
- "st1 { v8.s }[2], [x13], #0x4\n"
- "tbz x15, #1, 19f\n"
- "st1 { v8.h }[6], [x13], #0x2\n"
- "tbz x15, #0, 26f\n"
- "st1 { v8.b }[14], [x13]\n"
- "b 26f\n"
- "19:" // Height 1: Partial direct writeback: partial_1_12
- "tbz x15, #0, 26f\n"
- "st1 { v8.b }[12], [x13]\n"
- "b 26f\n"
- "20:" // Height 1: Partial direct writeback: partial_2_8
- "tbz x15, #1, 21f\n"
- "st1 { v8.h }[4], [x13], #0x2\n"
- "tbz x15, #0, 26f\n"
- "st1 { v8.b }[10], [x13]\n"
- "b 26f\n"
- "21:" // Height 1: Partial direct writeback: partial_1_8
- "tbz x15, #0, 26f\n"
- "st1 { v8.b }[8], [x13]\n"
- "b 26f\n"
- "22:" // Height 1: Partial direct writeback: partial_4_0
- "tbz x15, #2, 24f\n"
- "str s8, [x13], #0x4\n"
- "tbz x15, #1, 23f\n"
- "st1 { v8.h }[2], [x13], #0x2\n"
- "tbz x15, #0, 26f\n"
- "st1 { v8.b }[6], [x13]\n"
- "b 26f\n"
- "23:" // Height 1: Partial direct writeback: partial_1_4
- "tbz x15, #0, 26f\n"
- "st1 { v8.b }[4], [x13]\n"
- "b 26f\n"
- "24:" // Height 1: Partial direct writeback: partial_2_0
- "tbz x15, #1, 25f\n"
- "str h8, [x13], #0x2\n"
- "tbz x15, #0, 26f\n"
- "st1 { v8.b }[2], [x13]\n"
- "b 26f\n"
- "25:" // Height 1: Partial direct writeback: partial_1_0
- "str b8, [x13, #0x0]\n"
- "26:" // Height 1: Partial direct writeback: Done
- "b 28f\n"
- "27:" // Height 1: Full writeback
- "str q8, [x13, #0x0]\n"
- "add x13, x13, #0x10\n"
- "28:" // Height 1: Writeback done
- "subs x15, x15, #0x10\n"
- "bgt 3b\n"
- "b 170f\n"
- "29:" // Height 2
- "ldr x8, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
- "mov x16, %x[col_bias]\n"
- "ldr x17, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "tbz %x[flags], #2, 30f\n"
- "ldr x13, [%x[output_ptr], #0x0]\n"
- "add x13, x13, x19\n"
- "ldr x9, [%x[output_ptr], #0x8]\n"
- "add x9, x9, x19\n"
- "b 31f\n"
- "30:" // Height 2: setup direct output
- "mov x13, %x[output_ptr]\n"
- "add x9, x13, x19\n"
- "31:" // Height 2: Column loop
+ "bge 26f\n"
+ "tbz x10, #3, 21f\n"
+ "str d8, [x9], #0x8\n"
+ "tbz x10, #2, 19f\n"
+ "st1 { v8.s }[2], [x9], #0x4\n"
+ "tbz x10, #1, 18f\n"
+ "st1 { v8.h }[6], [x9], #0x2\n"
+ "tbz x10, #0, 25f\n"
+ "st1 { v8.b }[14], [x9]\n"
+ "b 25f\n"
+ "18:" // Height 1: Partial direct writeback: partial_1_12
+ "tbz x10, #0, 25f\n"
+ "st1 { v8.b }[12], [x9]\n"
+ "b 25f\n"
+ "19:" // Height 1: Partial direct writeback: partial_2_8
+ "tbz x10, #1, 20f\n"
+ "st1 { v8.h }[4], [x9], #0x2\n"
+ "tbz x10, #0, 25f\n"
+ "st1 { v8.b }[10], [x9]\n"
+ "b 25f\n"
+ "20:" // Height 1: Partial direct writeback: partial_1_8
+ "tbz x10, #0, 25f\n"
+ "st1 { v8.b }[8], [x9]\n"
+ "b 25f\n"
+ "21:" // Height 1: Partial direct writeback: partial_4_0
+ "tbz x10, #2, 23f\n"
+ "str s8, [x9], #0x4\n"
+ "tbz x10, #1, 22f\n"
+ "st1 { v8.h }[2], [x9], #0x2\n"
+ "tbz x10, #0, 25f\n"
+ "st1 { v8.b }[6], [x9]\n"
+ "b 25f\n"
+ "22:" // Height 1: Partial direct writeback: partial_1_4
+ "tbz x10, #0, 25f\n"
+ "st1 { v8.b }[4], [x9]\n"
+ "b 25f\n"
+ "23:" // Height 1: Partial direct writeback: partial_2_0
+ "tbz x10, #1, 24f\n"
+ "str h8, [x9], #0x2\n"
+ "tbz x10, #0, 25f\n"
+ "st1 { v8.b }[2], [x9]\n"
+ "b 25f\n"
+ "24:" // Height 1: Partial direct writeback: partial_1_0
+ "str b8, [x9, #0x0]\n"
+ "25:" // Height 1: Partial direct writeback: Done
+ "b 27f\n"
+ "26:" // Height 1: Full writeback
+ "str q8, [x9, #0x0]\n"
+ "add x9, x9, #0x10\n"
+ "27:" // Height 1: Writeback done
+ "subs x10, x10, #0x10\n"
+ "bgt 2b\n"
+ "b 164f\n"
+ "28:" // Height 2
+ "ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
+ "mov x11, %x[col_bias]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "29:" // Height 2: Column loop
"movi v8.4s, #0x0\n"
"movi v9.4s, #0x0\n"
"movi v10.4s, #0x0\n"
@@ -409,239 +394,241 @@ void a64_hybrid_s8qs_dot_6x16 (
"movi v13.4s, #0x0\n"
"movi v14.4s, #0x0\n"
"movi v15.4s, #0x0\n"
- "32:" // Height 2: setup done
- "mov x12, #0x0\n"
- "33:" // Height 2: String loop
+ "30:" // Height 2: setup done
+ "mov x27, #0x0\n"
+ "31:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w11, [x20, x12, LSL #0x2]\n"
- "tbz %x[flags], #3, 34f\n"
- "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 32f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x10, [x20, #0x0]\n"
- "ldr x28, [x20, #0x8]\n"
- "cbnz x12, 35f\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "cbnz x27, 33f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x10, x10, x19\n"
- "add x28, x28, x19\n"
- "b 35f\n"
- "34:" // Height 2: setup direct input
- "mov x10, %x[input_ptr]\n"
- "add x28, x10, x19\n"
- "35:" // Height 2: input setup done
- "cmp x11, #0x10\n"
- "blt 38f\n"
- "cmp x11, #0x20\n"
- "blt 37f\n"
- "36:" // Height 2: Multiply loop: Main loop head
- "ldr q0, [x10, #0x0]\n"
- "ldr q1, [x28, #0x0]\n"
- "ldr q6, [x14, #0x0]\n"
+ "add x25, x25, x19\n"
+ "add x24, x24, x19\n"
+ "b 33f\n"
+ "32:" // Height 2: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19\n"
+ "33:" // Height 2: input setup done
+ "cmp x26, #0x10\n"
+ "blt 36f\n"
+ "ldr q0, [x25, #0x0]\n"
+ "ldr q1, [x24, #0x0]\n"
+ "cmp x26, #0x20\n"
+ "ldr q6, [x28, #0x0]\n"
+ "blt 35f\n"
+ "34:" // Height 2: Multiply loop: Main loop head
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x14, #0x10]\n"
+ "ldr q7, [x28, #0x10]\n"
+ "add x25, x25, #0x10\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr q6, [x14, #0x20]\n"
- "add x10, x10, #0x10\n"
+ "ldr q6, [x28, #0x20]\n"
+ "add x24, x24, #0x10\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
- "add x28, x28, #0x10\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "sub x26, x26, #0x10\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "ldr q7, [x14, #0x30]\n"
+ "ldr q7, [x28, #0x30]\n"
+ "cmp x26, #0x20\n"
".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- "sub x11, x11, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- "ldr q6, [x14, #0x40]\n"
- "cmp x11, #0x20\n"
+ "ldr q6, [x28, #0x40]\n"
".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- "ldr q7, [x14, #0x50]\n"
+ "ldr q7, [x28, #0x50]\n"
".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
- "ldr q6, [x14, #0x60]\n"
+ "ldr q6, [x28, #0x60]\n"
".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
- "ldr q7, [x14, #0x70]\n"
+ "ldr q7, [x28, #0x70]\n"
".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
- "ldr q6, [x14, #0x80]\n"
+ "ldr q6, [x28, #0x80]\n"
".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
- "ldr q7, [x14, #0x90]\n"
+ "ldr q7, [x28, #0x90]\n"
".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
- "ldr q6, [x14, #0xa0]\n"
+ "ldr q6, [x28, #0xa0]\n"
".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
- "ldr q7, [x14, #0xb0]\n"
+ "ldr q7, [x28, #0xb0]\n"
".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
- "ldr q6, [x14, #0xc0]\n"
+ "ldr q6, [x28, #0xc0]\n"
".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
- "ldr q7, [x14, #0xd0]\n"
+ "ldr q7, [x28, #0xd0]\n"
".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
- "ldr q6, [x14, #0xe0]\n"
+ "ldr q6, [x28, #0xe0]\n"
".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
- "ldr q7, [x14, #0xf0]\n"
- "add x14, x14, #0x100\n"
+ "ldr q7, [x28, #0xf0]\n"
+ "add x28, x28, #0x100\n"
".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
+ "ldr q6, [x28, #0x0]\n"
".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ "ldr q0, [x25, #0x0]\n"
".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
- "bge 36b\n"
- "37:" // Height 2: Multiply loop: Single iteration only
- "sub x11, x11, #0x10\n"
- "ldr q0, [x10, #0x0]\n"
- "ldr q1, [x28, #0x0]\n"
- "ldr q6, [x14, #0x0]\n"
+ "ldr q1, [x24, #0x0]\n"
+ "bge 34b\n"
+ "35:" // Height 2: Multiply loop: Single iteration only
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x14, #0x10]\n"
+ "ldr q7, [x28, #0x10]\n"
+ "sub x26, x26, #0x10\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr q6, [x14, #0x20]\n"
- "add x10, x10, #0x10\n"
+ "ldr q6, [x28, #0x20]\n"
+ "add x25, x25, #0x10\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
- "add x28, x28, #0x10\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x24, x24, #0x10\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "ldr q7, [x14, #0x30]\n"
+ "ldr q7, [x28, #0x30]\n"
".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- "ldr q6, [x14, #0x40]\n"
+ "ldr q6, [x28, #0x40]\n"
".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- "ldr q7, [x14, #0x50]\n"
+ "ldr q7, [x28, #0x50]\n"
".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
- "ldr q6, [x14, #0x60]\n"
+ "ldr q6, [x28, #0x60]\n"
".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
- "ldr q7, [x14, #0x70]\n"
+ "ldr q7, [x28, #0x70]\n"
".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
- "ldr q6, [x14, #0x80]\n"
+ "ldr q6, [x28, #0x80]\n"
".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
- "ldr q7, [x14, #0x90]\n"
+ "ldr q7, [x28, #0x90]\n"
".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
- "ldr q6, [x14, #0xa0]\n"
+ "ldr q6, [x28, #0xa0]\n"
".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
- "ldr q7, [x14, #0xb0]\n"
+ "ldr q7, [x28, #0xb0]\n"
".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
- "ldr q6, [x14, #0xc0]\n"
+ "ldr q6, [x28, #0xc0]\n"
".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
- "ldr q7, [x14, #0xd0]\n"
+ "ldr q7, [x28, #0xd0]\n"
".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
- "ldr q6, [x14, #0xe0]\n"
+ "ldr q6, [x28, #0xe0]\n"
".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
- "ldr q7, [x14, #0xf0]\n"
- "add x14, x14, #0x100\n"
+ "ldr q7, [x28, #0xf0]\n"
+ "add x28, x28, #0x100\n"
".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
- "38:" // Height 2: Multiply loop: Main loop skip
- "cbz x11, 43f\n"
- "cmp x11, #0x4\n"
- "blt 40f\n"
- "39:" // Height 2: Multiply loop: Odd block loop
- "ldr s0, [x10], #0x4\n"
- "ldr s1, [x28], #0x4\n"
- "ldr q6, [x14, #0x0]\n"
+ "36:" // Height 2: Multiply loop: Main loop skip
+ "cbz x26, 41f\n"
+ "cmp x26, #0x4\n"
+ "blt 38f\n"
+ "37:" // Height 2: Multiply loop: Odd block loop
+ "ldr s0, [x25], #0x4\n"
+ "sub x26, x26, #0x4\n"
+ "ldr s1, [x24], #0x4\n"
+ "cmp x26, #0x4\n"
+ "ldr q6, [x28, #0x0]\n"
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x14, #0x10]\n"
+ "ldr q7, [x28, #0x10]\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr q6, [x14, #0x20]\n"
- "sub x11, x11, #0x4\n"
+ "ldr q6, [x28, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "cmp x11, #0x4\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "ldr q7, [x14, #0x30]\n"
+ "ldr q7, [x28, #0x30]\n"
+ "add x28, x28, #0x40\n"
".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- "add x14, x14, #0x40\n"
".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- "bge 39b\n"
- "cbz x11, 43f\n"
- "40:" // Height 2: Multiply loop: Skip odd blocks
- "tbz x11, #1, 41f\n"
- "ldr h0, [x10], #0x2\n"
- "ldr h1, [x28], #0x2\n"
- "tbz x11, #0, 42f\n"
- "ld1 { v0.b }[2], [x10]\n"
- "ld1 { v1.b }[2], [x28]\n"
- "b 42f\n"
- "41:" // Height 2: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x10, #0x0]\n"
- "ldr b1, [x28, #0x0]\n"
- "42:" // Height 2: Multiply loop: Ragged operand read: Done
- "ldr q6, [x14, #0x0]\n"
+ "bge 37b\n"
+ "cbz x26, 41f\n"
+ "38:" // Height 2: Multiply loop: Skip odd blocks
+ "tbz x26, #1, 39f\n"
+ "ldr h0, [x25], #0x2\n"
+ "ldr h1, [x24], #0x2\n"
+ "tbz x26, #0, 40f\n"
+ "ld1 { v0.b }[2], [x25]\n"
+ "ld1 { v1.b }[2], [x24]\n"
+ "b 40f\n"
+ "39:" // Height 2: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x25, #0x0]\n"
+ "ldr b1, [x24, #0x0]\n"
+ "40:" // Height 2: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x28, #0x0]\n"
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x14, #0x10]\n"
+ "ldr q7, [x28, #0x10]\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr q6, [x14, #0x20]\n"
+ "ldr q6, [x28, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "ldr q7, [x14, #0x30]\n"
+ "ldr q7, [x28, #0x30]\n"
+ "add x28, x28, #0x40\n"
".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- "add x14, x14, #0x40\n"
".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- "43:" // Height 2: Multiply loop: No odd multiplies
+ "41:" // Height 2: Multiply loop: No odd multiplies
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x12, x12, #0x1\n"
- "cmp x12, x19\n"
- "bne 33b\n"
- "prfm pstl1keep, [x13, #0x0]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 31b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
"prfm pstl1keep, [x9, #0x0]\n"
- "ldr q0, [x16, #0x0]\n"
+ "add x23, x9, x19\n"
+ "ldr q0, [x11, #0x0]\n"
"add v8.4s, v8.4s, v0.4s\n"
- "ldr q1, [x16, #0x10]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
"add v12.4s, v12.4s, v0.4s\n"
- "ldr q2, [x16, #0x20]\n"
- "ldr q3, [x16, #0x30]\n"
+ "ldr q1, [x11, #0x10]\n"
+ "ldr q2, [x11, #0x20]\n"
"add v9.4s, v9.4s, v1.4s\n"
- "add x16, x16, #0x40\n"
- "add v13.4s, v13.4s, v1.4s\n"
+ "ldr q3, [x11, #0x30]\n"
+ "add x11, x11, #0x40\n"
"add v10.4s, v10.4s, v2.4s\n"
- "add v11.4s, v11.4s, v3.4s\n"
+ "add v13.4s, v13.4s, v1.4s\n"
"add v14.4s, v14.4s, v2.4s\n"
+ "add v11.4s, v11.4s, v3.4s\n"
"add v15.4s, v15.4s, v3.4s\n"
- "tbz %x[flags], #4, 44f\n"
- "ldr q0, [x17, #0x0]\n"
- "ldr q4, [x8, #0x0]\n"
- "ldr q1, [x17, #0x10]\n"
- "ldr q5, [x8, #0x10]\n"
- "ldr q2, [x17, #0x20]\n"
- "ldr q6, [x8, #0x20]\n"
- "ldr q3, [x17, #0x30]\n"
- "ldr q7, [x8, #0x30]\n"
- "add x17, x17, #0x40\n"
- "add x8, x8, #0x40\n"
- "b 45f\n"
- "44:" // Height 2: per layer parameters
- "add x19, %x[qp], %[per_layer_right_shift]\n"
- "ld1r { v0.4s }, [x19]\n"
+ "tbz %x[flags], #4, 42f\n"
+ "ldr q0, [x12, #0x0]\n"
+ "ldr q4, [x13, #0x0]\n"
+ "ldr q1, [x12, #0x10]\n"
+ "ldr q5, [x13, #0x10]\n"
+ "ldr q2, [x12, #0x20]\n"
+ "ldr q6, [x13, #0x20]\n"
+ "ldr q3, [x12, #0x30]\n"
+ "add x12, x12, #0x40\n"
+ "ldr q7, [x13, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "b 43f\n"
+ "42:" // Height 2: per layer parameters
+ "add x24, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x24]\n"
"mov v1.16b, v0.16b\n"
- "add x19, %x[qp], %[per_layer_mul]\n"
- "ld1r { v4.4s }, [x19]\n"
+ "add x24, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v4.4s }, [x24]\n"
"mov v2.16b, v0.16b\n"
"mov v3.16b, v0.16b\n"
"mov v5.16b, v4.16b\n"
"mov v6.16b, v4.16b\n"
"mov v7.16b, v4.16b\n"
- "45:" // Height 2: parameters loaded
+ "43:" // Height 2: parameters loaded
"sqrdmulh v8.4s, v8.4s, v4.4s\n"
"sqrdmulh v9.4s, v9.4s, v5.4s\n"
"sqrdmulh v10.4s, v10.4s, v6.4s\n"
@@ -650,7 +637,7 @@ void a64_hybrid_s8qs_dot_6x16 (
"sqrdmulh v13.4s, v13.4s, v5.4s\n"
"sqrdmulh v14.4s, v14.4s, v6.4s\n"
"sqrdmulh v15.4s, v15.4s, v7.4s\n"
- "tbz %x[flags], #5, 46f\n"
+ "tbz %x[flags], #5, 44f\n"
"and v4.16b, v8.16b, v0.16b\n"
"sshr v4.4s, v4.4s, #0x1f\n"
"and v5.16b, v9.16b, v1.16b\n"
@@ -675,18 +662,18 @@ void a64_hybrid_s8qs_dot_6x16 (
"sqadd v13.4s, v13.4s, v5.4s\n"
"sqadd v14.4s, v14.4s, v6.4s\n"
"sqadd v15.4s, v15.4s, v7.4s\n"
- "46:" // Height 2: no shift correction
+ "44:" // Height 2: no shift correction
"srshl v8.4s, v8.4s, v0.4s\n"
- "add x19, %x[qp], %[c_offset]\n"
- "ld1r { v4.4s }, [x19]\n"
+ "add x24, %x[qp], %[c_offset]\n"
+ "ld1r { v4.4s }, [x24]\n"
"srshl v9.4s, v9.4s, v1.4s\n"
- "add x19, %x[qp], %[minval]\n"
+ "add x24, %x[qp], %[minval]\n"
"srshl v10.4s, v10.4s, v2.4s\n"
- "ld1r { v5.4s }, [x19]\n"
- "add x19, %x[qp], %[maxval]\n"
+ "ld1r { v5.4s }, [x24]\n"
+ "add x24, %x[qp], %[maxval]\n"
"srshl v11.4s, v11.4s, v3.4s\n"
- "ld1r { v6.4s }, [x19]\n"
- "cmp x15, #0x10\n"
+ "ld1r { v6.4s }, [x24]\n"
+ "cmp x10, #0x10\n"
"srshl v12.4s, v12.4s, v0.4s\n"
"srshl v13.4s, v13.4s, v1.4s\n"
"srshl v14.4s, v14.4s, v2.4s\n"
@@ -721,96 +708,83 @@ void a64_hybrid_s8qs_dot_6x16 (
"uzp1 v8.16b, v8.16b, v9.16b\n"
"uzp1 v13.8h, v14.8h, v15.8h\n"
"uzp1 v12.16b, v12.16b, v13.16b\n"
- "bge 55f\n"
- "tbz x15, #3, 50f\n"
- "str d8, [x13], #0x8\n"
- "str d12, [x9], #0x8\n"
- "tbz x15, #2, 48f\n"
- "st1 { v8.s }[2], [x13], #0x4\n"
- "st1 { v12.s }[2], [x9], #0x4\n"
- "tbz x15, #1, 47f\n"
- "st1 { v8.h }[6], [x13], #0x2\n"
- "st1 { v12.h }[6], [x9], #0x2\n"
- "tbz x15, #0, 54f\n"
- "st1 { v8.b }[14], [x13]\n"
- "st1 { v12.b }[14], [x9]\n"
- "b 54f\n"
- "47:" // Height 2: Partial direct writeback: partial_1_12
- "tbz x15, #0, 54f\n"
- "st1 { v8.b }[12], [x13]\n"
- "st1 { v12.b }[12], [x9]\n"
- "b 54f\n"
- "48:" // Height 2: Partial direct writeback: partial_2_8
- "tbz x15, #1, 49f\n"
- "st1 { v8.h }[4], [x13], #0x2\n"
- "st1 { v12.h }[4], [x9], #0x2\n"
- "tbz x15, #0, 54f\n"
- "st1 { v8.b }[10], [x13]\n"
- "st1 { v12.b }[10], [x9]\n"
- "b 54f\n"
- "49:" // Height 2: Partial direct writeback: partial_1_8
- "tbz x15, #0, 54f\n"
- "st1 { v8.b }[8], [x13]\n"
- "st1 { v12.b }[8], [x9]\n"
- "b 54f\n"
- "50:" // Height 2: Partial direct writeback: partial_4_0
- "tbz x15, #2, 52f\n"
- "str s8, [x13], #0x4\n"
- "str s12, [x9], #0x4\n"
- "tbz x15, #1, 51f\n"
- "st1 { v8.h }[2], [x13], #0x2\n"
- "st1 { v12.h }[2], [x9], #0x2\n"
- "tbz x15, #0, 54f\n"
- "st1 { v8.b }[6], [x13]\n"
- "st1 { v12.b }[6], [x9]\n"
- "b 54f\n"
- "51:" // Height 2: Partial direct writeback: partial_1_4
- "tbz x15, #0, 54f\n"
- "st1 { v8.b }[4], [x13]\n"
- "st1 { v12.b }[4], [x9]\n"
+ "bge 53f\n"
+ "tbz x10, #3, 48f\n"
+ "str d8, [x9], #0x8\n"
+ "str d12, [x23], #0x8\n"
+ "tbz x10, #2, 46f\n"
+ "st1 { v8.s }[2], [x9], #0x4\n"
+ "st1 { v12.s }[2], [x23], #0x4\n"
+ "tbz x10, #1, 45f\n"
+ "st1 { v8.h }[6], [x9], #0x2\n"
+ "st1 { v12.h }[6], [x23], #0x2\n"
+ "tbz x10, #0, 52f\n"
+ "st1 { v8.b }[14], [x9]\n"
+ "st1 { v12.b }[14], [x23]\n"
+ "b 52f\n"
+ "45:" // Height 2: Partial direct writeback: partial_1_12
+ "tbz x10, #0, 52f\n"
+ "st1 { v8.b }[12], [x9]\n"
+ "st1 { v12.b }[12], [x23]\n"
+ "b 52f\n"
+ "46:" // Height 2: Partial direct writeback: partial_2_8
+ "tbz x10, #1, 47f\n"
+ "st1 { v8.h }[4], [x9], #0x2\n"
+ "st1 { v12.h }[4], [x23], #0x2\n"
+ "tbz x10, #0, 52f\n"
+ "st1 { v8.b }[10], [x9]\n"
+ "st1 { v12.b }[10], [x23]\n"
+ "b 52f\n"
+ "47:" // Height 2: Partial direct writeback: partial_1_8
+ "tbz x10, #0, 52f\n"
+ "st1 { v8.b }[8], [x9]\n"
+ "st1 { v12.b }[8], [x23]\n"
+ "b 52f\n"
+ "48:" // Height 2: Partial direct writeback: partial_4_0
+ "tbz x10, #2, 50f\n"
+ "str s8, [x9], #0x4\n"
+ "str s12, [x23], #0x4\n"
+ "tbz x10, #1, 49f\n"
+ "st1 { v8.h }[2], [x9], #0x2\n"
+ "st1 { v12.h }[2], [x23], #0x2\n"
+ "tbz x10, #0, 52f\n"
+ "st1 { v8.b }[6], [x9]\n"
+ "st1 { v12.b }[6], [x23]\n"
+ "b 52f\n"
+ "49:" // Height 2: Partial direct writeback: partial_1_4
+ "tbz x10, #0, 52f\n"
+ "st1 { v8.b }[4], [x9]\n"
+ "st1 { v12.b }[4], [x23]\n"
+ "b 52f\n"
+ "50:" // Height 2: Partial direct writeback: partial_2_0
+ "tbz x10, #1, 51f\n"
+ "str h8, [x9], #0x2\n"
+ "str h12, [x23], #0x2\n"
+ "tbz x10, #0, 52f\n"
+ "st1 { v8.b }[2], [x9]\n"
+ "st1 { v12.b }[2], [x23]\n"
+ "b 52f\n"
+ "51:" // Height 2: Partial direct writeback: partial_1_0
+ "str b8, [x9, #0x0]\n"
+ "str b12, [x23, #0x0]\n"
+ "52:" // Height 2: Partial direct writeback: Done
"b 54f\n"
- "52:" // Height 2: Partial direct writeback: partial_2_0
- "tbz x15, #1, 53f\n"
- "str h8, [x13], #0x2\n"
- "str h12, [x9], #0x2\n"
- "tbz x15, #0, 54f\n"
- "st1 { v8.b }[2], [x13]\n"
- "st1 { v12.b }[2], [x9]\n"
- "b 54f\n"
- "53:" // Height 2: Partial direct writeback: partial_1_0
- "str b8, [x13, #0x0]\n"
- "str b12, [x9, #0x0]\n"
- "54:" // Height 2: Partial direct writeback: Done
- "b 56f\n"
- "55:" // Height 2: Full writeback
- "str q8, [x13, #0x0]\n"
- "str q12, [x9, #0x0]\n"
- "add x13, x13, #0x10\n"
+ "53:" // Height 2: Full writeback
+ "str q8, [x9, #0x0]\n"
"add x9, x9, #0x10\n"
- "56:" // Height 2: Writeback done
- "subs x15, x15, #0x10\n"
- "bgt 31b\n"
- "b 170f\n"
- "57:" // Height 3
- "ldr x8, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
- "mov x16, %x[col_bias]\n"
- "ldr x17, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "tbz %x[flags], #2, 58f\n"
- "ldr x13, [%x[output_ptr], #0x0]\n"
- "add x13, x13, x19\n"
- "ldr x9, [%x[output_ptr], #0x8]\n"
- "ldr x27, [%x[output_ptr], #0x10]\n"
- "add x9, x9, x19\n"
- "add x27, x27, x19\n"
- "b 59f\n"
- "58:" // Height 3: setup direct output
- "mov x13, %x[output_ptr]\n"
- "add x9, x13, x19\n"
- "add x27, x9, x19\n"
- "59:" // Height 3: Column loop
+ "str q12, [x23, #0x0]\n"
+ "54:" // Height 2: Writeback done
+ "subs x10, x10, #0x10\n"
+ "bgt 29b\n"
+ "b 164f\n"
+ "55:" // Height 3
+ "ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
+ "mov x11, %x[col_bias]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "56:" // Height 3: Column loop
"movi v8.4s, #0x0\n"
"movi v9.4s, #0x0\n"
"movi v10.4s, #0x0\n"
@@ -823,297 +797,300 @@ void a64_hybrid_s8qs_dot_6x16 (
"movi v17.4s, #0x0\n"
"movi v18.4s, #0x0\n"
"movi v19.4s, #0x0\n"
- "60:" // Height 3: setup done
- "mov x12, #0x0\n"
- "61:" // Height 3: String loop
+ "57:" // Height 3: setup done
+ "mov x27, #0x0\n"
+ "58:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w11, [x20, x12, LSL #0x2]\n"
- "tbz %x[flags], #3, 62f\n"
- "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 59f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x10, [x20, #0x0]\n"
- "ldr x28, [x20, #0x8]\n"
- "ldr x26, [x20, #0x10]\n"
- "cbnz x12, 63f\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "cbnz x27, 60f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x10, x10, x19\n"
- "add x28, x28, x19\n"
- "add x26, x26, x19\n"
- "b 63f\n"
- "62:" // Height 3: setup direct input
- "mov x10, %x[input_ptr]\n"
- "add x28, x10, x19\n"
- "add x26, x28, x19\n"
- "63:" // Height 3: input setup done
- "cmp x11, #0x10\n"
- "blt 66f\n"
- "cmp x11, #0x20\n"
- "blt 65f\n"
- "64:" // Height 3: Multiply loop: Main loop head
- "ldr q0, [x10, #0x0]\n"
- "ldr q1, [x28, #0x0]\n"
- "ldr q2, [x26, #0x0]\n"
- "ldr q6, [x14, #0x0]\n"
+ "add x25, x25, x19\n"
+ "add x24, x24, x19\n"
+ "add x23, x23, x19\n"
+ "b 60f\n"
+ "59:" // Height 3: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19\n"
+ "add x23, x24, x19\n"
+ "60:" // Height 3: input setup done
+ "cmp x26, #0x10\n"
+ "blt 63f\n"
+ "ldr q0, [x25, #0x0]\n"
+ "ldr q1, [x24, #0x0]\n"
+ "cmp x26, #0x20\n"
+ "ldr q2, [x23, #0x0]\n"
+ "ldr q6, [x28, #0x0]\n"
+ "blt 62f\n"
+ "61:" // Height 3: Multiply loop: Main loop head
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x14, #0x10]\n"
+ "ldr q7, [x28, #0x10]\n"
+ "add x25, x25, #0x10\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "add x10, x10, #0x10\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x24, x24, #0x10\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "ldr q6, [x14, #0x20]\n"
+ "ldr q6, [x28, #0x20]\n"
+ "add x23, x23, #0x10\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "add x28, x28, #0x10\n"
- "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "sub x26, x26, #0x10\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "add x26, x26, #0x10\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "cmp x26, #0x20\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "ldr q7, [x14, #0x30]\n"
+ "ldr q7, [x28, #0x30]\n"
".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- "sub x11, x11, #0x10\n"
".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- "cmp x11, #0x20\n"
".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- "ldr q6, [x14, #0x40]\n"
+ "ldr q6, [x28, #0x40]\n"
".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- "ldr q7, [x14, #0x50]\n"
+ "ldr q7, [x28, #0x50]\n"
".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
- "ldr q6, [x14, #0x60]\n"
+ "ldr q6, [x28, #0x60]\n"
".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
- "ldr q7, [x14, #0x70]\n"
+ "ldr q7, [x28, #0x70]\n"
".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
- "ldr q6, [x14, #0x80]\n"
+ "ldr q6, [x28, #0x80]\n"
".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
- "ldr q7, [x14, #0x90]\n"
+ "ldr q7, [x28, #0x90]\n"
".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
- "ldr q6, [x14, #0xa0]\n"
+ "ldr q6, [x28, #0xa0]\n"
".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
- "ldr q7, [x14, #0xb0]\n"
+ "ldr q7, [x28, #0xb0]\n"
".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
- "ldr q6, [x14, #0xc0]\n"
+ "ldr q6, [x28, #0xc0]\n"
".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
- "ldr q7, [x14, #0xd0]\n"
+ "ldr q7, [x28, #0xd0]\n"
".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
- "ldr q6, [x14, #0xe0]\n"
+ "ldr q6, [x28, #0xe0]\n"
".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
- "ldr q7, [x14, #0xf0]\n"
- "add x14, x14, #0x100\n"
+ "ldr q7, [x28, #0xf0]\n"
+ "add x28, x28, #0x100\n"
".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
+ "ldr q6, [x28, #0x0]\n"
".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ "ldr q0, [x25, #0x0]\n"
".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
+ "ldr q1, [x24, #0x0]\n"
".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
- "bge 64b\n"
- "65:" // Height 3: Multiply loop: Single iteration only
- "sub x11, x11, #0x10\n"
- "ldr q0, [x10, #0x0]\n"
- "ldr q1, [x28, #0x0]\n"
- "ldr q2, [x26, #0x0]\n"
- "ldr q6, [x14, #0x0]\n"
+ "ldr q2, [x23, #0x0]\n"
+ "bge 61b\n"
+ "62:" // Height 3: Multiply loop: Single iteration only
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x14, #0x10]\n"
+ "ldr q7, [x28, #0x10]\n"
+ "sub x26, x26, #0x10\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "add x10, x10, #0x10\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "add x25, x25, #0x10\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "ldr q6, [x14, #0x20]\n"
+ "add x24, x24, #0x10\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "add x28, x28, #0x10\n"
- "prfm pldl1keep, [x28, #0x80]\n"
+ "ldr q6, [x28, #0x20]\n"
+ "add x23, x23, #0x10\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "add x26, x26, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "ldr q7, [x14, #0x30]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "ldr q7, [x28, #0x30]\n"
".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- "ldr q6, [x14, #0x40]\n"
+ "ldr q6, [x28, #0x40]\n"
".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- "ldr q7, [x14, #0x50]\n"
+ "ldr q7, [x28, #0x50]\n"
".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
- "ldr q6, [x14, #0x60]\n"
+ "ldr q6, [x28, #0x60]\n"
".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
- "ldr q7, [x14, #0x70]\n"
+ "ldr q7, [x28, #0x70]\n"
".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
- "ldr q6, [x14, #0x80]\n"
+ "ldr q6, [x28, #0x80]\n"
".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
- "ldr q7, [x14, #0x90]\n"
+ "ldr q7, [x28, #0x90]\n"
".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
- "ldr q6, [x14, #0xa0]\n"
+ "ldr q6, [x28, #0xa0]\n"
".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
- "ldr q7, [x14, #0xb0]\n"
+ "ldr q7, [x28, #0xb0]\n"
".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
- "ldr q6, [x14, #0xc0]\n"
+ "ldr q6, [x28, #0xc0]\n"
".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
- "ldr q7, [x14, #0xd0]\n"
+ "ldr q7, [x28, #0xd0]\n"
".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
- "ldr q6, [x14, #0xe0]\n"
+ "ldr q6, [x28, #0xe0]\n"
".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
- "ldr q7, [x14, #0xf0]\n"
- "add x14, x14, #0x100\n"
+ "ldr q7, [x28, #0xf0]\n"
+ "add x28, x28, #0x100\n"
".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
- "66:" // Height 3: Multiply loop: Main loop skip
- "cbz x11, 71f\n"
- "cmp x11, #0x4\n"
- "blt 68f\n"
- "67:" // Height 3: Multiply loop: Odd block loop
- "ldr s0, [x10], #0x4\n"
- "ldr s1, [x28], #0x4\n"
- "ldr s2, [x26], #0x4\n"
- "ldr q6, [x14, #0x0]\n"
+ "63:" // Height 3: Multiply loop: Main loop skip
+ "cbz x26, 68f\n"
+ "cmp x26, #0x4\n"
+ "blt 65f\n"
+ "64:" // Height 3: Multiply loop: Odd block loop
+ "ldr s0, [x25], #0x4\n"
+ "sub x26, x26, #0x4\n"
+ "ldr s1, [x24], #0x4\n"
+ "cmp x26, #0x4\n"
+ "ldr s2, [x23], #0x4\n"
+ "ldr q6, [x28, #0x0]\n"
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x14, #0x10]\n"
+ "ldr q7, [x28, #0x10]\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "sub x11, x11, #0x4\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "ldr q6, [x14, #0x20]\n"
- "cmp x11, #0x4\n"
+ "ldr q6, [x28, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- "ldr q7, [x14, #0x30]\n"
+ "ldr q7, [x28, #0x30]\n"
+ "add x28, x28, #0x40\n"
".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- "add x14, x14, #0x40\n"
".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- "bge 67b\n"
- "cbz x11, 71f\n"
- "68:" // Height 3: Multiply loop: Skip odd blocks
- "tbz x11, #1, 69f\n"
- "ldr h0, [x10], #0x2\n"
- "ldr h1, [x28], #0x2\n"
- "ldr h2, [x26], #0x2\n"
- "tbz x11, #0, 70f\n"
- "ld1 { v0.b }[2], [x10]\n"
- "ld1 { v1.b }[2], [x28]\n"
- "ld1 { v2.b }[2], [x26]\n"
- "b 70f\n"
- "69:" // Height 3: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x10, #0x0]\n"
- "ldr b1, [x28, #0x0]\n"
- "ldr b2, [x26, #0x0]\n"
- "70:" // Height 3: Multiply loop: Ragged operand read: Done
- "ldr q6, [x14, #0x0]\n"
+ "bge 64b\n"
+ "cbz x26, 68f\n"
+ "65:" // Height 3: Multiply loop: Skip odd blocks
+ "tbz x26, #1, 66f\n"
+ "ldr h0, [x25], #0x2\n"
+ "ldr h1, [x24], #0x2\n"
+ "ldr h2, [x23], #0x2\n"
+ "tbz x26, #0, 67f\n"
+ "ld1 { v0.b }[2], [x25]\n"
+ "ld1 { v1.b }[2], [x24]\n"
+ "ld1 { v2.b }[2], [x23]\n"
+ "b 67f\n"
+ "66:" // Height 3: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x25, #0x0]\n"
+ "ldr b1, [x24, #0x0]\n"
+ "ldr b2, [x23, #0x0]\n"
+ "67:" // Height 3: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x28, #0x0]\n"
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x14, #0x10]\n"
+ "ldr q7, [x28, #0x10]\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "ldr q6, [x14, #0x20]\n"
+ "ldr q6, [x28, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- "ldr q7, [x14, #0x30]\n"
+ "ldr q7, [x28, #0x30]\n"
+ "add x28, x28, #0x40\n"
".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- "add x14, x14, #0x40\n"
".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- "71:" // Height 3: Multiply loop: No odd multiplies
+ "68:" // Height 3: Multiply loop: No odd multiplies
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x12, x12, #0x1\n"
- "cmp x12, x19\n"
- "bne 61b\n"
- "prfm pstl1keep, [x13, #0x0]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 58b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
"prfm pstl1keep, [x9, #0x0]\n"
- "ldr q0, [x16, #0x0]\n"
+ "add x23, x9, x19\n"
+ "ldr q0, [x11, #0x0]\n"
"add v8.4s, v8.4s, v0.4s\n"
- "prfm pstl1keep, [x27, #0x0]\n"
- "ldr q1, [x16, #0x10]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "add x22, x23, x19\n"
"add v12.4s, v12.4s, v0.4s\n"
- "ldr q2, [x16, #0x20]\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
"add v16.4s, v16.4s, v0.4s\n"
- "ldr q3, [x16, #0x30]\n"
- "add x16, x16, #0x40\n"
+ "ldr q1, [x11, #0x10]\n"
+ "ldr q2, [x11, #0x20]\n"
"add v9.4s, v9.4s, v1.4s\n"
- "add v13.4s, v13.4s, v1.4s\n"
+ "ldr q3, [x11, #0x30]\n"
+ "add x11, x11, #0x40\n"
"add v10.4s, v10.4s, v2.4s\n"
- "add v11.4s, v11.4s, v3.4s\n"
+ "add v13.4s, v13.4s, v1.4s\n"
"add v14.4s, v14.4s, v2.4s\n"
- "add v15.4s, v15.4s, v3.4s\n"
"add v17.4s, v17.4s, v1.4s\n"
+ "add v11.4s, v11.4s, v3.4s\n"
+ "add v15.4s, v15.4s, v3.4s\n"
"add v18.4s, v18.4s, v2.4s\n"
"add v19.4s, v19.4s, v3.4s\n"
- "tbz %x[flags], #4, 72f\n"
- "ldr q0, [x17, #0x0]\n"
- "ldr q4, [x8, #0x0]\n"
- "ldr q1, [x17, #0x10]\n"
- "ldr q5, [x8, #0x10]\n"
- "ldr q2, [x17, #0x20]\n"
- "ldr q6, [x8, #0x20]\n"
- "ldr q3, [x17, #0x30]\n"
- "ldr q7, [x8, #0x30]\n"
- "add x17, x17, #0x40\n"
- "add x8, x8, #0x40\n"
- "b 73f\n"
- "72:" // Height 3: per layer parameters
- "add x19, %x[qp], %[per_layer_right_shift]\n"
- "ld1r { v0.4s }, [x19]\n"
+ "tbz %x[flags], #4, 69f\n"
+ "ldr q0, [x12, #0x0]\n"
+ "ldr q4, [x13, #0x0]\n"
+ "ldr q1, [x12, #0x10]\n"
+ "ldr q5, [x13, #0x10]\n"
+ "ldr q2, [x12, #0x20]\n"
+ "ldr q6, [x13, #0x20]\n"
+ "ldr q3, [x12, #0x30]\n"
+ "add x12, x12, #0x40\n"
+ "ldr q7, [x13, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "b 70f\n"
+ "69:" // Height 3: per layer parameters
+ "add x24, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x24]\n"
"mov v1.16b, v0.16b\n"
- "add x19, %x[qp], %[per_layer_mul]\n"
- "ld1r { v4.4s }, [x19]\n"
+ "add x24, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v4.4s }, [x24]\n"
"mov v2.16b, v0.16b\n"
"mov v3.16b, v0.16b\n"
"mov v5.16b, v4.16b\n"
"mov v6.16b, v4.16b\n"
"mov v7.16b, v4.16b\n"
- "73:" // Height 3: parameters loaded
+ "70:" // Height 3: parameters loaded
"sqrdmulh v8.4s, v8.4s, v4.4s\n"
"sqrdmulh v9.4s, v9.4s, v5.4s\n"
"sqrdmulh v10.4s, v10.4s, v6.4s\n"
@@ -1126,7 +1103,7 @@ void a64_hybrid_s8qs_dot_6x16 (
"sqrdmulh v17.4s, v17.4s, v5.4s\n"
"sqrdmulh v18.4s, v18.4s, v6.4s\n"
"sqrdmulh v19.4s, v19.4s, v7.4s\n"
- "tbz %x[flags], #5, 74f\n"
+ "tbz %x[flags], #5, 71f\n"
"and v4.16b, v8.16b, v0.16b\n"
"sshr v4.4s, v4.4s, #0x1f\n"
"and v5.16b, v9.16b, v1.16b\n"
@@ -1163,18 +1140,18 @@ void a64_hybrid_s8qs_dot_6x16 (
"sqadd v17.4s, v17.4s, v5.4s\n"
"sqadd v18.4s, v18.4s, v6.4s\n"
"sqadd v19.4s, v19.4s, v7.4s\n"
- "74:" // Height 3: no shift correction
+ "71:" // Height 3: no shift correction
"srshl v8.4s, v8.4s, v0.4s\n"
- "add x19, %x[qp], %[c_offset]\n"
- "ld1r { v4.4s }, [x19]\n"
+ "add x24, %x[qp], %[c_offset]\n"
+ "ld1r { v4.4s }, [x24]\n"
"srshl v9.4s, v9.4s, v1.4s\n"
- "add x19, %x[qp], %[minval]\n"
+ "add x24, %x[qp], %[minval]\n"
"srshl v10.4s, v10.4s, v2.4s\n"
- "ld1r { v5.4s }, [x19]\n"
- "add x19, %x[qp], %[maxval]\n"
+ "ld1r { v5.4s }, [x24]\n"
+ "add x24, %x[qp], %[maxval]\n"
"srshl v11.4s, v11.4s, v3.4s\n"
- "ld1r { v6.4s }, [x19]\n"
- "cmp x15, #0x10\n"
+ "ld1r { v6.4s }, [x24]\n"
+ "cmp x10, #0x10\n"
"srshl v12.4s, v12.4s, v0.4s\n"
"srshl v13.4s, v13.4s, v1.4s\n"
"srshl v14.4s, v14.4s, v2.4s\n"
@@ -1228,116 +1205,99 @@ void a64_hybrid_s8qs_dot_6x16 (
"uzp1 v8.16b, v8.16b, v9.16b\n"
"uzp1 v12.16b, v12.16b, v13.16b\n"
"uzp1 v16.16b, v16.16b, v17.16b\n"
- "bge 83f\n"
- "tbz x15, #3, 78f\n"
- "str d8, [x13], #0x8\n"
- "str d12, [x9], #0x8\n"
- "str d16, [x27], #0x8\n"
- "tbz x15, #2, 76f\n"
- "st1 { v8.s }[2], [x13], #0x4\n"
- "st1 { v12.s }[2], [x9], #0x4\n"
- "st1 { v16.s }[2], [x27], #0x4\n"
- "tbz x15, #1, 75f\n"
- "st1 { v8.h }[6], [x13], #0x2\n"
- "st1 { v12.h }[6], [x9], #0x2\n"
- "st1 { v16.h }[6], [x27], #0x2\n"
- "tbz x15, #0, 82f\n"
- "st1 { v8.b }[14], [x13]\n"
- "st1 { v12.b }[14], [x9]\n"
- "st1 { v16.b }[14], [x27]\n"
- "b 82f\n"
- "75:" // Height 3: Partial direct writeback: partial_1_12
- "tbz x15, #0, 82f\n"
- "st1 { v8.b }[12], [x13]\n"
- "st1 { v12.b }[12], [x9]\n"
- "st1 { v16.b }[12], [x27]\n"
- "b 82f\n"
- "76:" // Height 3: Partial direct writeback: partial_2_8
- "tbz x15, #1, 77f\n"
- "st1 { v8.h }[4], [x13], #0x2\n"
- "st1 { v12.h }[4], [x9], #0x2\n"
- "st1 { v16.h }[4], [x27], #0x2\n"
- "tbz x15, #0, 82f\n"
- "st1 { v8.b }[10], [x13]\n"
- "st1 { v12.b }[10], [x9]\n"
- "st1 { v16.b }[10], [x27]\n"
- "b 82f\n"
- "77:" // Height 3: Partial direct writeback: partial_1_8
- "tbz x15, #0, 82f\n"
- "st1 { v8.b }[8], [x13]\n"
- "st1 { v12.b }[8], [x9]\n"
- "st1 { v16.b }[8], [x27]\n"
- "b 82f\n"
- "78:" // Height 3: Partial direct writeback: partial_4_0
- "tbz x15, #2, 80f\n"
- "str s8, [x13], #0x4\n"
- "str s12, [x9], #0x4\n"
- "str s16, [x27], #0x4\n"
- "tbz x15, #1, 79f\n"
- "st1 { v8.h }[2], [x13], #0x2\n"
- "st1 { v12.h }[2], [x9], #0x2\n"
- "st1 { v16.h }[2], [x27], #0x2\n"
- "tbz x15, #0, 82f\n"
- "st1 { v8.b }[6], [x13]\n"
- "st1 { v12.b }[6], [x9]\n"
- "st1 { v16.b }[6], [x27]\n"
- "b 82f\n"
- "79:" // Height 3: Partial direct writeback: partial_1_4
- "tbz x15, #0, 82f\n"
- "st1 { v8.b }[4], [x13]\n"
- "st1 { v12.b }[4], [x9]\n"
- "st1 { v16.b }[4], [x27]\n"
- "b 82f\n"
- "80:" // Height 3: Partial direct writeback: partial_2_0
- "tbz x15, #1, 81f\n"
- "str h8, [x13], #0x2\n"
- "str h12, [x9], #0x2\n"
- "str h16, [x27], #0x2\n"
- "tbz x15, #0, 82f\n"
- "st1 { v8.b }[2], [x13]\n"
- "st1 { v12.b }[2], [x9]\n"
- "st1 { v16.b }[2], [x27]\n"
- "b 82f\n"
- "81:" // Height 3: Partial direct writeback: partial_1_0
- "str b8, [x13, #0x0]\n"
- "str b12, [x9, #0x0]\n"
- "str b16, [x27, #0x0]\n"
- "82:" // Height 3: Partial direct writeback: Done
- "b 84f\n"
- "83:" // Height 3: Full writeback
- "str q8, [x13, #0x0]\n"
- "str q12, [x9, #0x0]\n"
- "str q16, [x27, #0x0]\n"
- "add x13, x13, #0x10\n"
+ "bge 80f\n"
+ "tbz x10, #3, 75f\n"
+ "str d8, [x9], #0x8\n"
+ "str d12, [x23], #0x8\n"
+ "str d16, [x22], #0x8\n"
+ "tbz x10, #2, 73f\n"
+ "st1 { v8.s }[2], [x9], #0x4\n"
+ "st1 { v12.s }[2], [x23], #0x4\n"
+ "st1 { v16.s }[2], [x22], #0x4\n"
+ "tbz x10, #1, 72f\n"
+ "st1 { v8.h }[6], [x9], #0x2\n"
+ "st1 { v12.h }[6], [x23], #0x2\n"
+ "st1 { v16.h }[6], [x22], #0x2\n"
+ "tbz x10, #0, 79f\n"
+ "st1 { v8.b }[14], [x9]\n"
+ "st1 { v12.b }[14], [x23]\n"
+ "st1 { v16.b }[14], [x22]\n"
+ "b 79f\n"
+ "72:" // Height 3: Partial direct writeback: partial_1_12
+ "tbz x10, #0, 79f\n"
+ "st1 { v8.b }[12], [x9]\n"
+ "st1 { v12.b }[12], [x23]\n"
+ "st1 { v16.b }[12], [x22]\n"
+ "b 79f\n"
+ "73:" // Height 3: Partial direct writeback: partial_2_8
+ "tbz x10, #1, 74f\n"
+ "st1 { v8.h }[4], [x9], #0x2\n"
+ "st1 { v12.h }[4], [x23], #0x2\n"
+ "st1 { v16.h }[4], [x22], #0x2\n"
+ "tbz x10, #0, 79f\n"
+ "st1 { v8.b }[10], [x9]\n"
+ "st1 { v12.b }[10], [x23]\n"
+ "st1 { v16.b }[10], [x22]\n"
+ "b 79f\n"
+ "74:" // Height 3: Partial direct writeback: partial_1_8
+ "tbz x10, #0, 79f\n"
+ "st1 { v8.b }[8], [x9]\n"
+ "st1 { v12.b }[8], [x23]\n"
+ "st1 { v16.b }[8], [x22]\n"
+ "b 79f\n"
+ "75:" // Height 3: Partial direct writeback: partial_4_0
+ "tbz x10, #2, 77f\n"
+ "str s8, [x9], #0x4\n"
+ "str s12, [x23], #0x4\n"
+ "str s16, [x22], #0x4\n"
+ "tbz x10, #1, 76f\n"
+ "st1 { v8.h }[2], [x9], #0x2\n"
+ "st1 { v12.h }[2], [x23], #0x2\n"
+ "st1 { v16.h }[2], [x22], #0x2\n"
+ "tbz x10, #0, 79f\n"
+ "st1 { v8.b }[6], [x9]\n"
+ "st1 { v12.b }[6], [x23]\n"
+ "st1 { v16.b }[6], [x22]\n"
+ "b 79f\n"
+ "76:" // Height 3: Partial direct writeback: partial_1_4
+ "tbz x10, #0, 79f\n"
+ "st1 { v8.b }[4], [x9]\n"
+ "st1 { v12.b }[4], [x23]\n"
+ "st1 { v16.b }[4], [x22]\n"
+ "b 79f\n"
+ "77:" // Height 3: Partial direct writeback: partial_2_0
+ "tbz x10, #1, 78f\n"
+ "str h8, [x9], #0x2\n"
+ "str h12, [x23], #0x2\n"
+ "str h16, [x22], #0x2\n"
+ "tbz x10, #0, 79f\n"
+ "st1 { v8.b }[2], [x9]\n"
+ "st1 { v12.b }[2], [x23]\n"
+ "st1 { v16.b }[2], [x22]\n"
+ "b 79f\n"
+ "78:" // Height 3: Partial direct writeback: partial_1_0
+ "str b8, [x9, #0x0]\n"
+ "str b12, [x23, #0x0]\n"
+ "str b16, [x22, #0x0]\n"
+ "79:" // Height 3: Partial direct writeback: Done
+ "b 81f\n"
+ "80:" // Height 3: Full writeback
+ "str q8, [x9, #0x0]\n"
"add x9, x9, #0x10\n"
- "add x27, x27, #0x10\n"
- "84:" // Height 3: Writeback done
- "subs x15, x15, #0x10\n"
- "bgt 59b\n"
- "b 170f\n"
- "85:" // Height 4
- "ldr x8, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
- "mov x16, %x[col_bias]\n"
- "ldr x17, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "tbz %x[flags], #2, 86f\n"
- "ldr x13, [%x[output_ptr], #0x0]\n"
- "add x13, x13, x19\n"
- "ldr x9, [%x[output_ptr], #0x8]\n"
- "ldr x27, [%x[output_ptr], #0x10]\n"
- "add x9, x9, x19\n"
- "ldr x25, [%x[output_ptr], #0x18]\n"
- "add x27, x27, x19\n"
- "add x25, x25, x19\n"
- "b 87f\n"
- "86:" // Height 4: setup direct output
- "mov x13, %x[output_ptr]\n"
- "add x9, x13, x19\n"
- "add x27, x9, x19\n"
- "add x25, x27, x19\n"
- "87:" // Height 4: Column loop
+ "str q12, [x23, #0x0]\n"
+ "str q16, [x22, #0x0]\n"
+ "81:" // Height 3: Writeback done
+ "subs x10, x10, #0x10\n"
+ "bgt 56b\n"
+ "b 164f\n"
+ "82:" // Height 4
+ "ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
+ "mov x11, %x[col_bias]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "83:" // Height 4: Column loop
"movi v8.4s, #0x0\n"
"movi v9.4s, #0x0\n"
"movi v10.4s, #0x0\n"
@@ -1354,220 +1314,220 @@ void a64_hybrid_s8qs_dot_6x16 (
"movi v21.4s, #0x0\n"
"movi v22.4s, #0x0\n"
"movi v23.4s, #0x0\n"
- "88:" // Height 4: setup done
- "mov x12, #0x0\n"
- "89:" // Height 4: String loop
+ "84:" // Height 4: setup done
+ "mov x27, #0x0\n"
+ "85:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w11, [x20, x12, LSL #0x2]\n"
- "tbz %x[flags], #3, 90f\n"
- "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 86f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x10, [x20, #0x0]\n"
- "ldr x28, [x20, #0x8]\n"
- "ldr x26, [x20, #0x10]\n"
- "ldr x24, [x20, #0x18]\n"
- "cbnz x12, 91f\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "ldr x22, [x20, #0x18]\n"
+ "cbnz x27, 87f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x10, x10, x19\n"
- "add x28, x28, x19\n"
- "add x26, x26, x19\n"
+ "add x25, x25, x19\n"
"add x24, x24, x19\n"
- "b 91f\n"
- "90:" // Height 4: setup direct input
- "mov x10, %x[input_ptr]\n"
- "add x28, x10, x19\n"
- "add x26, x28, x19\n"
- "add x24, x26, x19\n"
- "91:" // Height 4: input setup done
- "cmp x11, #0x10\n"
- "blt 94f\n"
- "cmp x11, #0x20\n"
- "blt 93f\n"
- "92:" // Height 4: Multiply loop: Main loop head
- "ldr q0, [x10, #0x0]\n"
- "ldr q1, [x28, #0x0]\n"
- "ldr q2, [x26, #0x0]\n"
- "ldr q3, [x24, #0x0]\n"
- "ldr q6, [x14, #0x0]\n"
+ "add x23, x23, x19\n"
+ "add x22, x22, x19\n"
+ "b 87f\n"
+ "86:" // Height 4: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19\n"
+ "add x23, x24, x19\n"
+ "add x22, x23, x19\n"
+ "87:" // Height 4: input setup done
+ "cmp x26, #0x10\n"
+ "blt 90f\n"
+ "ldr q0, [x25, #0x0]\n"
+ "ldr q1, [x24, #0x0]\n"
+ "cmp x26, #0x20\n"
+ "ldr q2, [x23, #0x0]\n"
+ "ldr q3, [x22, #0x0]\n"
+ "ldr q6, [x28, #0x0]\n"
+ "blt 89f\n"
+ "88:" // Height 4: Multiply loop: Main loop head
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x14, #0x10]\n"
+ "ldr q7, [x28, #0x10]\n"
+ "add x25, x25, #0x10\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "add x10, x10, #0x10\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x24, x24, #0x10\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "add x28, x28, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "add x23, x23, #0x10\n"
".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "ldr q6, [x14, #0x20]\n"
+ "ldr q6, [x28, #0x20]\n"
+ "add x22, x22, #0x10\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "add x26, x26, #0x10\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "sub x26, x26, #0x10\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "add x24, x24, #0x10\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "cmp x26, #0x20\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "sub x11, x11, #0x10\n"
".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- "ldr q7, [x14, #0x30]\n"
+ "ldr q7, [x28, #0x30]\n"
".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- "cmp x11, #0x20\n"
".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
- "ldr q6, [x14, #0x40]\n"
+ "ldr q6, [x28, #0x40]\n"
".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
- "ldr q7, [x14, #0x50]\n"
+ "ldr q7, [x28, #0x50]\n"
".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
- "ldr q6, [x14, #0x60]\n"
+ "ldr q6, [x28, #0x60]\n"
".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
- "ldr q7, [x14, #0x70]\n"
+ "ldr q7, [x28, #0x70]\n"
".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
- "ldr q6, [x14, #0x80]\n"
+ "ldr q6, [x28, #0x80]\n"
".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
- "ldr q7, [x14, #0x90]\n"
+ "ldr q7, [x28, #0x90]\n"
".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
- "ldr q6, [x14, #0xa0]\n"
+ "ldr q6, [x28, #0xa0]\n"
".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
- "ldr q7, [x14, #0xb0]\n"
+ "ldr q7, [x28, #0xb0]\n"
".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
- "ldr q6, [x14, #0xc0]\n"
+ "ldr q6, [x28, #0xc0]\n"
".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
- "ldr q7, [x14, #0xd0]\n"
+ "ldr q7, [x28, #0xd0]\n"
".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
- "ldr q6, [x14, #0xe0]\n"
+ "ldr q6, [x28, #0xe0]\n"
".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
- "ldr q7, [x14, #0xf0]\n"
- "add x14, x14, #0x100\n"
+ "ldr q7, [x28, #0xf0]\n"
+ "add x28, x28, #0x100\n"
".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n"
+ "ldr q6, [x28, #0x0]\n"
".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ "ldr q0, [x25, #0x0]\n"
".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
+ "ldr q1, [x24, #0x0]\n"
".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
+ "ldr q2, [x23, #0x0]\n"
".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n"
- "bge 92b\n"
- "93:" // Height 4: Multiply loop: Single iteration only
- "sub x11, x11, #0x10\n"
- "ldr q0, [x10, #0x0]\n"
- "ldr q1, [x28, #0x0]\n"
- "ldr q2, [x26, #0x0]\n"
- "ldr q3, [x24, #0x0]\n"
- "ldr q6, [x14, #0x0]\n"
+ "ldr q3, [x22, #0x0]\n"
+ "bge 88b\n"
+ "89:" // Height 4: Multiply loop: Single iteration only
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x14, #0x10]\n"
+ "ldr q7, [x28, #0x10]\n"
+ "sub x26, x26, #0x10\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "add x10, x10, #0x10\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "add x25, x25, #0x10\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "add x28, x28, #0x10\n"
+ "add x24, x24, #0x10\n"
".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "ldr q6, [x14, #0x20]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "add x23, x23, #0x10\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "add x26, x26, #0x10\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "ldr q6, [x28, #0x20]\n"
+ "add x22, x22, #0x10\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "add x24, x24, #0x10\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- "ldr q7, [x14, #0x30]\n"
+ "ldr q7, [x28, #0x30]\n"
".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
- "ldr q6, [x14, #0x40]\n"
+ "ldr q6, [x28, #0x40]\n"
".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
- "ldr q7, [x14, #0x50]\n"
+ "ldr q7, [x28, #0x50]\n"
".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
- "ldr q6, [x14, #0x60]\n"
+ "ldr q6, [x28, #0x60]\n"
".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
- "ldr q7, [x14, #0x70]\n"
+ "ldr q7, [x28, #0x70]\n"
".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
- "ldr q6, [x14, #0x80]\n"
+ "ldr q6, [x28, #0x80]\n"
".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
- "ldr q7, [x14, #0x90]\n"
+ "ldr q7, [x28, #0x90]\n"
".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
- "ldr q6, [x14, #0xa0]\n"
+ "ldr q6, [x28, #0xa0]\n"
".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
- "ldr q7, [x14, #0xb0]\n"
+ "ldr q7, [x28, #0xb0]\n"
".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
- "ldr q6, [x14, #0xc0]\n"
+ "ldr q6, [x28, #0xc0]\n"
".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
- "ldr q7, [x14, #0xd0]\n"
+ "ldr q7, [x28, #0xd0]\n"
".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
- "ldr q6, [x14, #0xe0]\n"
+ "ldr q6, [x28, #0xe0]\n"
".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
- "ldr q7, [x14, #0xf0]\n"
- "add x14, x14, #0x100\n"
+ "ldr q7, [x28, #0xf0]\n"
+ "add x28, x28, #0x100\n"
".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
@@ -1576,31 +1536,31 @@ void a64_hybrid_s8qs_dot_6x16 (
".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n"
- "94:" // Height 4: Multiply loop: Main loop skip
- "cbz x11, 99f\n"
- "cmp x11, #0x4\n"
- "blt 96f\n"
- "95:" // Height 4: Multiply loop: Odd block loop
- "ldr s0, [x10], #0x4\n"
- "ldr s1, [x28], #0x4\n"
- "ldr s2, [x26], #0x4\n"
- "ldr s3, [x24], #0x4\n"
- "ldr q6, [x14, #0x0]\n"
+ "90:" // Height 4: Multiply loop: Main loop skip
+ "cbz x26, 95f\n"
+ "cmp x26, #0x4\n"
+ "blt 92f\n"
+ "91:" // Height 4: Multiply loop: Odd block loop
+ "ldr s0, [x25], #0x4\n"
+ "sub x26, x26, #0x4\n"
+ "ldr s1, [x24], #0x4\n"
+ "cmp x26, #0x4\n"
+ "ldr s2, [x23], #0x4\n"
+ "ldr s3, [x22], #0x4\n"
+ "ldr q6, [x28, #0x0]\n"
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x14, #0x10]\n"
+ "ldr q7, [x28, #0x10]\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "sub x11, x11, #0x4\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "cmp x11, #0x4\n"
".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- "ldr q6, [x14, #0x20]\n"
+ "ldr q6, [x28, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- "ldr q7, [x14, #0x30]\n"
+ "ldr q7, [x28, #0x30]\n"
+ "add x28, x28, #0x40\n"
".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- "add x14, x14, #0x40\n"
".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
@@ -1608,40 +1568,40 @@ void a64_hybrid_s8qs_dot_6x16 (
".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
- "bge 95b\n"
- "cbz x11, 99f\n"
- "96:" // Height 4: Multiply loop: Skip odd blocks
- "tbz x11, #1, 97f\n"
- "ldr h0, [x10], #0x2\n"
- "ldr h1, [x28], #0x2\n"
- "ldr h2, [x26], #0x2\n"
- "ldr h3, [x24], #0x2\n"
- "tbz x11, #0, 98f\n"
- "ld1 { v0.b }[2], [x10]\n"
- "ld1 { v1.b }[2], [x28]\n"
- "ld1 { v2.b }[2], [x26]\n"
- "ld1 { v3.b }[2], [x24]\n"
- "b 98f\n"
- "97:" // Height 4: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x10, #0x0]\n"
- "ldr b1, [x28, #0x0]\n"
- "ldr b2, [x26, #0x0]\n"
- "ldr b3, [x24, #0x0]\n"
- "98:" // Height 4: Multiply loop: Ragged operand read: Done
- "ldr q6, [x14, #0x0]\n"
+ "bge 91b\n"
+ "cbz x26, 95f\n"
+ "92:" // Height 4: Multiply loop: Skip odd blocks
+ "tbz x26, #1, 93f\n"
+ "ldr h0, [x25], #0x2\n"
+ "ldr h1, [x24], #0x2\n"
+ "ldr h2, [x23], #0x2\n"
+ "ldr h3, [x22], #0x2\n"
+ "tbz x26, #0, 94f\n"
+ "ld1 { v0.b }[2], [x25]\n"
+ "ld1 { v1.b }[2], [x24]\n"
+ "ld1 { v2.b }[2], [x23]\n"
+ "ld1 { v3.b }[2], [x22]\n"
+ "b 94f\n"
+ "93:" // Height 4: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x25, #0x0]\n"
+ "ldr b1, [x24, #0x0]\n"
+ "ldr b2, [x23, #0x0]\n"
+ "ldr b3, [x22, #0x0]\n"
+ "94:" // Height 4: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x28, #0x0]\n"
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x14, #0x10]\n"
+ "ldr q7, [x28, #0x10]\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- "ldr q6, [x14, #0x20]\n"
+ "ldr q6, [x28, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- "ldr q7, [x14, #0x30]\n"
+ "ldr q7, [x28, #0x30]\n"
+ "add x28, x28, #0x40\n"
".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- "add x14, x14, #0x40\n"
".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
@@ -1649,60 +1609,64 @@ void a64_hybrid_s8qs_dot_6x16 (
".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
- "99:" // Height 4: Multiply loop: No odd multiplies
+ "95:" // Height 4: Multiply loop: No odd multiplies
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x12, x12, #0x1\n"
- "cmp x12, x19\n"
- "bne 89b\n"
- "prfm pstl1keep, [x13, #0x0]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 85b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
"prfm pstl1keep, [x9, #0x0]\n"
- "ldr q0, [x16, #0x0]\n"
+ "add x23, x9, x19\n"
+ "ldr q0, [x11, #0x0]\n"
"add v8.4s, v8.4s, v0.4s\n"
- "prfm pstl1keep, [x27, #0x0]\n"
- "ldr q1, [x16, #0x10]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "add x22, x23, x19\n"
"add v12.4s, v12.4s, v0.4s\n"
- "prfm pstl1keep, [x25, #0x0]\n"
- "ldr q2, [x16, #0x20]\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "add x21, x22, x19\n"
"add v16.4s, v16.4s, v0.4s\n"
- "ldr q3, [x16, #0x30]\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
"add v20.4s, v20.4s, v0.4s\n"
- "add x16, x16, #0x40\n"
+ "ldr q1, [x11, #0x10]\n"
+ "ldr q2, [x11, #0x20]\n"
"add v9.4s, v9.4s, v1.4s\n"
- "add v13.4s, v13.4s, v1.4s\n"
+ "ldr q3, [x11, #0x30]\n"
+ "add x11, x11, #0x40\n"
"add v10.4s, v10.4s, v2.4s\n"
- "add v11.4s, v11.4s, v3.4s\n"
+ "add v13.4s, v13.4s, v1.4s\n"
"add v14.4s, v14.4s, v2.4s\n"
- "add v15.4s, v15.4s, v3.4s\n"
"add v17.4s, v17.4s, v1.4s\n"
+ "add v11.4s, v11.4s, v3.4s\n"
+ "add v15.4s, v15.4s, v3.4s\n"
"add v18.4s, v18.4s, v2.4s\n"
"add v19.4s, v19.4s, v3.4s\n"
"add v21.4s, v21.4s, v1.4s\n"
"add v22.4s, v22.4s, v2.4s\n"
"add v23.4s, v23.4s, v3.4s\n"
- "tbz %x[flags], #4, 100f\n"
- "ldr q0, [x17, #0x0]\n"
- "ldr q4, [x8, #0x0]\n"
- "ldr q1, [x17, #0x10]\n"
- "ldr q5, [x8, #0x10]\n"
- "ldr q2, [x17, #0x20]\n"
- "ldr q6, [x8, #0x20]\n"
- "ldr q3, [x17, #0x30]\n"
- "ldr q7, [x8, #0x30]\n"
- "add x17, x17, #0x40\n"
- "add x8, x8, #0x40\n"
- "b 101f\n"
- "100:" // Height 4: per layer parameters
- "add x19, %x[qp], %[per_layer_right_shift]\n"
- "ld1r { v0.4s }, [x19]\n"
+ "tbz %x[flags], #4, 96f\n"
+ "ldr q0, [x12, #0x0]\n"
+ "ldr q4, [x13, #0x0]\n"
+ "ldr q1, [x12, #0x10]\n"
+ "ldr q5, [x13, #0x10]\n"
+ "ldr q2, [x12, #0x20]\n"
+ "ldr q6, [x13, #0x20]\n"
+ "ldr q3, [x12, #0x30]\n"
+ "add x12, x12, #0x40\n"
+ "ldr q7, [x13, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "b 97f\n"
+ "96:" // Height 4: per layer parameters
+ "add x24, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x24]\n"
"mov v1.16b, v0.16b\n"
- "add x19, %x[qp], %[per_layer_mul]\n"
- "ld1r { v4.4s }, [x19]\n"
+ "add x24, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v4.4s }, [x24]\n"
"mov v2.16b, v0.16b\n"
"mov v3.16b, v0.16b\n"
"mov v5.16b, v4.16b\n"
"mov v6.16b, v4.16b\n"
"mov v7.16b, v4.16b\n"
- "101:" // Height 4: parameters loaded
+ "97:" // Height 4: parameters loaded
"sqrdmulh v8.4s, v8.4s, v4.4s\n"
"sqrdmulh v9.4s, v9.4s, v5.4s\n"
"sqrdmulh v10.4s, v10.4s, v6.4s\n"
@@ -1719,7 +1683,7 @@ void a64_hybrid_s8qs_dot_6x16 (
"sqrdmulh v21.4s, v21.4s, v5.4s\n"
"sqrdmulh v22.4s, v22.4s, v6.4s\n"
"sqrdmulh v23.4s, v23.4s, v7.4s\n"
- "tbz %x[flags], #5, 102f\n"
+ "tbz %x[flags], #5, 98f\n"
"and v4.16b, v8.16b, v0.16b\n"
"sshr v4.4s, v4.4s, #0x1f\n"
"and v5.16b, v9.16b, v1.16b\n"
@@ -1768,18 +1732,18 @@ void a64_hybrid_s8qs_dot_6x16 (
"sqadd v21.4s, v21.4s, v5.4s\n"
"sqadd v22.4s, v22.4s, v6.4s\n"
"sqadd v23.4s, v23.4s, v7.4s\n"
- "102:" // Height 4: no shift correction
+ "98:" // Height 4: no shift correction
"srshl v8.4s, v8.4s, v0.4s\n"
- "add x19, %x[qp], %[c_offset]\n"
- "ld1r { v4.4s }, [x19]\n"
+ "add x24, %x[qp], %[c_offset]\n"
+ "ld1r { v4.4s }, [x24]\n"
"srshl v9.4s, v9.4s, v1.4s\n"
- "add x19, %x[qp], %[minval]\n"
+ "add x24, %x[qp], %[minval]\n"
"srshl v10.4s, v10.4s, v2.4s\n"
- "ld1r { v5.4s }, [x19]\n"
- "add x19, %x[qp], %[maxval]\n"
+ "ld1r { v5.4s }, [x24]\n"
+ "add x24, %x[qp], %[maxval]\n"
"srshl v11.4s, v11.4s, v3.4s\n"
- "ld1r { v6.4s }, [x19]\n"
- "cmp x15, #0x10\n"
+ "ld1r { v6.4s }, [x24]\n"
+ "cmp x10, #0x10\n"
"srshl v12.4s, v12.4s, v0.4s\n"
"srshl v13.4s, v13.4s, v1.4s\n"
"srshl v14.4s, v14.4s, v2.4s\n"
@@ -1852,136 +1816,115 @@ void a64_hybrid_s8qs_dot_6x16 (
"uzp1 v12.16b, v12.16b, v13.16b\n"
"uzp1 v16.16b, v16.16b, v17.16b\n"
"uzp1 v20.16b, v20.16b, v21.16b\n"
- "bge 111f\n"
- "tbz x15, #3, 106f\n"
- "str d8, [x13], #0x8\n"
- "str d12, [x9], #0x8\n"
- "str d16, [x27], #0x8\n"
- "str d20, [x25], #0x8\n"
- "tbz x15, #2, 104f\n"
- "st1 { v8.s }[2], [x13], #0x4\n"
- "st1 { v12.s }[2], [x9], #0x4\n"
- "st1 { v16.s }[2], [x27], #0x4\n"
- "st1 { v20.s }[2], [x25], #0x4\n"
- "tbz x15, #1, 103f\n"
- "st1 { v8.h }[6], [x13], #0x2\n"
- "st1 { v12.h }[6], [x9], #0x2\n"
- "st1 { v16.h }[6], [x27], #0x2\n"
- "st1 { v20.h }[6], [x25], #0x2\n"
- "tbz x15, #0, 110f\n"
- "st1 { v8.b }[14], [x13]\n"
- "st1 { v12.b }[14], [x9]\n"
- "st1 { v16.b }[14], [x27]\n"
- "st1 { v20.b }[14], [x25]\n"
- "b 110f\n"
- "103:" // Height 4: Partial direct writeback: partial_1_12
- "tbz x15, #0, 110f\n"
- "st1 { v8.b }[12], [x13]\n"
- "st1 { v12.b }[12], [x9]\n"
- "st1 { v16.b }[12], [x27]\n"
- "st1 { v20.b }[12], [x25]\n"
- "b 110f\n"
- "104:" // Height 4: Partial direct writeback: partial_2_8
- "tbz x15, #1, 105f\n"
- "st1 { v8.h }[4], [x13], #0x2\n"
- "st1 { v12.h }[4], [x9], #0x2\n"
- "st1 { v16.h }[4], [x27], #0x2\n"
- "st1 { v20.h }[4], [x25], #0x2\n"
- "tbz x15, #0, 110f\n"
- "st1 { v8.b }[10], [x13]\n"
- "st1 { v12.b }[10], [x9]\n"
- "st1 { v16.b }[10], [x27]\n"
- "st1 { v20.b }[10], [x25]\n"
- "b 110f\n"
- "105:" // Height 4: Partial direct writeback: partial_1_8
- "tbz x15, #0, 110f\n"
- "st1 { v8.b }[8], [x13]\n"
- "st1 { v12.b }[8], [x9]\n"
- "st1 { v16.b }[8], [x27]\n"
- "st1 { v20.b }[8], [x25]\n"
- "b 110f\n"
- "106:" // Height 4: Partial direct writeback: partial_4_0
- "tbz x15, #2, 108f\n"
- "str s8, [x13], #0x4\n"
- "str s12, [x9], #0x4\n"
- "str s16, [x27], #0x4\n"
- "str s20, [x25], #0x4\n"
- "tbz x15, #1, 107f\n"
- "st1 { v8.h }[2], [x13], #0x2\n"
- "st1 { v12.h }[2], [x9], #0x2\n"
- "st1 { v16.h }[2], [x27], #0x2\n"
- "st1 { v20.h }[2], [x25], #0x2\n"
- "tbz x15, #0, 110f\n"
- "st1 { v8.b }[6], [x13]\n"
- "st1 { v12.b }[6], [x9]\n"
- "st1 { v16.b }[6], [x27]\n"
- "st1 { v20.b }[6], [x25]\n"
- "b 110f\n"
- "107:" // Height 4: Partial direct writeback: partial_1_4
- "tbz x15, #0, 110f\n"
- "st1 { v8.b }[4], [x13]\n"
- "st1 { v12.b }[4], [x9]\n"
- "st1 { v16.b }[4], [x27]\n"
- "st1 { v20.b }[4], [x25]\n"
- "b 110f\n"
- "108:" // Height 4: Partial direct writeback: partial_2_0
- "tbz x15, #1, 109f\n"
- "str h8, [x13], #0x2\n"
- "str h12, [x9], #0x2\n"
- "str h16, [x27], #0x2\n"
- "str h20, [x25], #0x2\n"
- "tbz x15, #0, 110f\n"
- "st1 { v8.b }[2], [x13]\n"
- "st1 { v12.b }[2], [x9]\n"
- "st1 { v16.b }[2], [x27]\n"
- "st1 { v20.b }[2], [x25]\n"
- "b 110f\n"
- "109:" // Height 4: Partial direct writeback: partial_1_0
- "str b8, [x13, #0x0]\n"
- "str b12, [x9, #0x0]\n"
- "str b16, [x27, #0x0]\n"
- "str b20, [x25, #0x0]\n"
- "110:" // Height 4: Partial direct writeback: Done
- "b 112f\n"
- "111:" // Height 4: Full writeback
- "str q8, [x13, #0x0]\n"
- "str q12, [x9, #0x0]\n"
- "str q16, [x27, #0x0]\n"
- "str q20, [x25, #0x0]\n"
- "add x13, x13, #0x10\n"
+ "bge 107f\n"
+ "tbz x10, #3, 102f\n"
+ "str d8, [x9], #0x8\n"
+ "str d12, [x23], #0x8\n"
+ "str d16, [x22], #0x8\n"
+ "str d20, [x21], #0x8\n"
+ "tbz x10, #2, 100f\n"
+ "st1 { v8.s }[2], [x9], #0x4\n"
+ "st1 { v12.s }[2], [x23], #0x4\n"
+ "st1 { v16.s }[2], [x22], #0x4\n"
+ "st1 { v20.s }[2], [x21], #0x4\n"
+ "tbz x10, #1, 99f\n"
+ "st1 { v8.h }[6], [x9], #0x2\n"
+ "st1 { v12.h }[6], [x23], #0x2\n"
+ "st1 { v16.h }[6], [x22], #0x2\n"
+ "st1 { v20.h }[6], [x21], #0x2\n"
+ "tbz x10, #0, 106f\n"
+ "st1 { v8.b }[14], [x9]\n"
+ "st1 { v12.b }[14], [x23]\n"
+ "st1 { v16.b }[14], [x22]\n"
+ "st1 { v20.b }[14], [x21]\n"
+ "b 106f\n"
+ "99:" // Height 4: Partial direct writeback: partial_1_12
+ "tbz x10, #0, 106f\n"
+ "st1 { v8.b }[12], [x9]\n"
+ "st1 { v12.b }[12], [x23]\n"
+ "st1 { v16.b }[12], [x22]\n"
+ "st1 { v20.b }[12], [x21]\n"
+ "b 106f\n"
+ "100:" // Height 4: Partial direct writeback: partial_2_8
+ "tbz x10, #1, 101f\n"
+ "st1 { v8.h }[4], [x9], #0x2\n"
+ "st1 { v12.h }[4], [x23], #0x2\n"
+ "st1 { v16.h }[4], [x22], #0x2\n"
+ "st1 { v20.h }[4], [x21], #0x2\n"
+ "tbz x10, #0, 106f\n"
+ "st1 { v8.b }[10], [x9]\n"
+ "st1 { v12.b }[10], [x23]\n"
+ "st1 { v16.b }[10], [x22]\n"
+ "st1 { v20.b }[10], [x21]\n"
+ "b 106f\n"
+ "101:" // Height 4: Partial direct writeback: partial_1_8
+ "tbz x10, #0, 106f\n"
+ "st1 { v8.b }[8], [x9]\n"
+ "st1 { v12.b }[8], [x23]\n"
+ "st1 { v16.b }[8], [x22]\n"
+ "st1 { v20.b }[8], [x21]\n"
+ "b 106f\n"
+ "102:" // Height 4: Partial direct writeback: partial_4_0
+ "tbz x10, #2, 104f\n"
+ "str s8, [x9], #0x4\n"
+ "str s12, [x23], #0x4\n"
+ "str s16, [x22], #0x4\n"
+ "str s20, [x21], #0x4\n"
+ "tbz x10, #1, 103f\n"
+ "st1 { v8.h }[2], [x9], #0x2\n"
+ "st1 { v12.h }[2], [x23], #0x2\n"
+ "st1 { v16.h }[2], [x22], #0x2\n"
+ "st1 { v20.h }[2], [x21], #0x2\n"
+ "tbz x10, #0, 106f\n"
+ "st1 { v8.b }[6], [x9]\n"
+ "st1 { v12.b }[6], [x23]\n"
+ "st1 { v16.b }[6], [x22]\n"
+ "st1 { v20.b }[6], [x21]\n"
+ "b 106f\n"
+ "103:" // Height 4: Partial direct writeback: partial_1_4
+ "tbz x10, #0, 106f\n"
+ "st1 { v8.b }[4], [x9]\n"
+ "st1 { v12.b }[4], [x23]\n"
+ "st1 { v16.b }[4], [x22]\n"
+ "st1 { v20.b }[4], [x21]\n"
+ "b 106f\n"
+ "104:" // Height 4: Partial direct writeback: partial_2_0
+ "tbz x10, #1, 105f\n"
+ "str h8, [x9], #0x2\n"
+ "str h12, [x23], #0x2\n"
+ "str h16, [x22], #0x2\n"
+ "str h20, [x21], #0x2\n"
+ "tbz x10, #0, 106f\n"
+ "st1 { v8.b }[2], [x9]\n"
+ "st1 { v12.b }[2], [x23]\n"
+ "st1 { v16.b }[2], [x22]\n"
+ "st1 { v20.b }[2], [x21]\n"
+ "b 106f\n"
+ "105:" // Height 4: Partial direct writeback: partial_1_0
+ "str b8, [x9, #0x0]\n"
+ "str b12, [x23, #0x0]\n"
+ "str b16, [x22, #0x0]\n"
+ "str b20, [x21, #0x0]\n"
+ "106:" // Height 4: Partial direct writeback: Done
+ "b 108f\n"
+ "107:" // Height 4: Full writeback
+ "str q8, [x9, #0x0]\n"
"add x9, x9, #0x10\n"
- "add x27, x27, #0x10\n"
- "add x25, x25, #0x10\n"
- "112:" // Height 4: Writeback done
- "subs x15, x15, #0x10\n"
- "bgt 87b\n"
- "b 170f\n"
- "113:" // Height 5
- "ldr x8, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
- "mov x16, %x[col_bias]\n"
- "ldr x17, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "tbz %x[flags], #2, 114f\n"
- "ldr x13, [%x[output_ptr], #0x0]\n"
- "add x13, x13, x19\n"
- "ldr x9, [%x[output_ptr], #0x8]\n"
- "ldr x27, [%x[output_ptr], #0x10]\n"
- "add x9, x9, x19\n"
- "ldr x25, [%x[output_ptr], #0x18]\n"
- "ldr x23, [%x[output_ptr], #0x20]\n"
- "add x27, x27, x19\n"
- "add x25, x25, x19\n"
- "add x23, x23, x19\n"
- "b 115f\n"
- "114:" // Height 5: setup direct output
- "mov x13, %x[output_ptr]\n"
- "add x9, x13, x19\n"
- "add x27, x9, x19\n"
- "add x25, x27, x19\n"
- "add x23, x25, x19\n"
- "115:" // Height 5: Column loop
+ "str q12, [x23, #0x0]\n"
+ "str q16, [x22, #0x0]\n"
+ "str q20, [x21, #0x0]\n"
+ "108:" // Height 4: Writeback done
+ "subs x10, x10, #0x10\n"
+ "bgt 83b\n"
+ "b 164f\n"
+ "109:" // Height 5
+ "ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
+ "mov x11, %x[col_bias]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "110:" // Height 5: Column loop
"movi v8.4s, #0x0\n"
"movi v9.4s, #0x0\n"
"movi v10.4s, #0x0\n"
@@ -2002,260 +1945,260 @@ void a64_hybrid_s8qs_dot_6x16 (
"movi v25.4s, #0x0\n"
"movi v26.4s, #0x0\n"
"movi v27.4s, #0x0\n"
- "116:" // Height 5: setup done
- "mov x12, #0x0\n"
- "117:" // Height 5: String loop
+ "111:" // Height 5: setup done
+ "mov x27, #0x0\n"
+ "112:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w11, [x20, x12, LSL #0x2]\n"
- "tbz %x[flags], #3, 118f\n"
- "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 113f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x10, [x20, #0x0]\n"
- "ldr x28, [x20, #0x8]\n"
- "ldr x26, [x20, #0x10]\n"
- "ldr x24, [x20, #0x18]\n"
- "ldr x22, [x20, #0x20]\n"
- "cbnz x12, 119f\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "ldr x22, [x20, #0x18]\n"
+ "ldr x21, [x20, #0x20]\n"
+ "cbnz x27, 114f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x10, x10, x19\n"
- "add x28, x28, x19\n"
- "add x26, x26, x19\n"
+ "add x25, x25, x19\n"
"add x24, x24, x19\n"
+ "add x23, x23, x19\n"
"add x22, x22, x19\n"
- "b 119f\n"
- "118:" // Height 5: setup direct input
- "mov x10, %x[input_ptr]\n"
- "add x28, x10, x19\n"
- "add x26, x28, x19\n"
- "add x24, x26, x19\n"
- "add x22, x24, x19\n"
- "119:" // Height 5: input setup done
- "cmp x11, #0x10\n"
- "blt 122f\n"
- "cmp x11, #0x20\n"
- "blt 121f\n"
- "120:" // Height 5: Multiply loop: Main loop head
- "ldr q0, [x10, #0x0]\n"
- "ldr q1, [x28, #0x0]\n"
- "ldr q2, [x26, #0x0]\n"
- "ldr q3, [x24, #0x0]\n"
- "ldr q4, [x22, #0x0]\n"
- "ldr q6, [x14, #0x0]\n"
+ "add x21, x21, x19\n"
+ "b 114f\n"
+ "113:" // Height 5: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19\n"
+ "add x23, x24, x19\n"
+ "add x22, x23, x19\n"
+ "add x21, x22, x19\n"
+ "114:" // Height 5: input setup done
+ "cmp x26, #0x10\n"
+ "blt 117f\n"
+ "ldr q0, [x25, #0x0]\n"
+ "ldr q1, [x24, #0x0]\n"
+ "cmp x26, #0x20\n"
+ "ldr q2, [x23, #0x0]\n"
+ "ldr q3, [x22, #0x0]\n"
+ "ldr q4, [x21, #0x0]\n"
+ "ldr q6, [x28, #0x0]\n"
+ "blt 116f\n"
+ "115:" // Height 5: Multiply loop: Main loop head
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x14, #0x10]\n"
+ "ldr q7, [x28, #0x10]\n"
+ "add x25, x25, #0x10\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "add x10, x10, #0x10\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x24, x24, #0x10\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "add x28, x28, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "add x23, x23, #0x10\n"
".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "add x26, x26, #0x10\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "add x22, x22, #0x10\n"
".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "ldr q6, [x14, #0x20]\n"
+ "ldr q6, [x28, #0x20]\n"
+ "add x21, x21, #0x10\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "add x24, x24, #0x10\n"
- "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "sub x26, x26, #0x10\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "add x22, x22, #0x10\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
+ "cmp x26, #0x20\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "sub x11, x11, #0x10\n"
".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- "cmp x11, #0x20\n"
".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
- "ldr q7, [x14, #0x30]\n"
+ "ldr q7, [x28, #0x30]\n"
".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
- "ldr q6, [x14, #0x40]\n"
+ "ldr q6, [x28, #0x40]\n"
".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
- "ldr q7, [x14, #0x50]\n"
+ "ldr q7, [x28, #0x50]\n"
".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n"
- "ldr q6, [x14, #0x60]\n"
+ "ldr q6, [x28, #0x60]\n"
".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n"
- "ldr q7, [x14, #0x70]\n"
+ "ldr q7, [x28, #0x70]\n"
".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n"
- "ldr q6, [x14, #0x80]\n"
+ "ldr q6, [x28, #0x80]\n"
".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n"
- "ldr q7, [x14, #0x90]\n"
+ "ldr q7, [x28, #0x90]\n"
".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n"
- "ldr q6, [x14, #0xa0]\n"
+ "ldr q6, [x28, #0xa0]\n"
".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n"
- "ldr q7, [x14, #0xb0]\n"
+ "ldr q7, [x28, #0xb0]\n"
".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n"
- "ldr q6, [x14, #0xc0]\n"
+ "ldr q6, [x28, #0xc0]\n"
".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n"
- "ldr q7, [x14, #0xd0]\n"
+ "ldr q7, [x28, #0xd0]\n"
".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n"
- "ldr q6, [x14, #0xe0]\n"
+ "ldr q6, [x28, #0xe0]\n"
".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n"
- "ldr q7, [x14, #0xf0]\n"
+ "ldr q7, [x28, #0xf0]\n"
+ "add x28, x28, #0x100\n"
".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
- "add x14, x14, #0x100\n"
".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n"
".inst 0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n"
+ "ldr q6, [x28, #0x0]\n"
".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ "ldr q0, [x25, #0x0]\n"
".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
+ "ldr q1, [x24, #0x0]\n"
".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
+ "ldr q2, [x23, #0x0]\n"
".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n"
+ "ldr q3, [x22, #0x0]\n"
".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n"
- "bge 120b\n"
- "121:" // Height 5: Multiply loop: Single iteration only
- "sub x11, x11, #0x10\n"
- "ldr q0, [x10, #0x0]\n"
- "ldr q1, [x28, #0x0]\n"
- "ldr q2, [x26, #0x0]\n"
- "ldr q3, [x24, #0x0]\n"
- "ldr q4, [x22, #0x0]\n"
- "ldr q6, [x14, #0x0]\n"
+ "ldr q4, [x21, #0x0]\n"
+ "bge 115b\n"
+ "116:" // Height 5: Multiply loop: Single iteration only
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x14, #0x10]\n"
+ "ldr q7, [x28, #0x10]\n"
+ "sub x26, x26, #0x10\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "add x10, x10, #0x10\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "add x25, x25, #0x10\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "add x28, x28, #0x10\n"
+ "add x24, x24, #0x10\n"
".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "add x26, x26, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "add x23, x23, #0x10\n"
".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "ldr q6, [x14, #0x20]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "add x22, x22, #0x10\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "add x24, x24, #0x10\n"
- "prfm pldl1keep, [x24, #0x80]\n"
+ "ldr q6, [x28, #0x20]\n"
+ "add x21, x21, #0x10\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "add x22, x22, #0x10\n"
- ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
"prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
- "ldr q7, [x14, #0x30]\n"
+ "ldr q7, [x28, #0x30]\n"
".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
- "ldr q6, [x14, #0x40]\n"
+ "ldr q6, [x28, #0x40]\n"
".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
- "ldr q7, [x14, #0x50]\n"
+ "ldr q7, [x28, #0x50]\n"
".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n"
- "ldr q6, [x14, #0x60]\n"
+ "ldr q6, [x28, #0x60]\n"
".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n"
- "ldr q7, [x14, #0x70]\n"
+ "ldr q7, [x28, #0x70]\n"
".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n"
- "ldr q6, [x14, #0x80]\n"
+ "ldr q6, [x28, #0x80]\n"
".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n"
- "ldr q7, [x14, #0x90]\n"
+ "ldr q7, [x28, #0x90]\n"
".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n"
- "ldr q6, [x14, #0xa0]\n"
+ "ldr q6, [x28, #0xa0]\n"
".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n"
- "ldr q7, [x14, #0xb0]\n"
+ "ldr q7, [x28, #0xb0]\n"
".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n"
- "ldr q6, [x14, #0xc0]\n"
+ "ldr q6, [x28, #0xc0]\n"
".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n"
- "ldr q7, [x14, #0xd0]\n"
+ "ldr q7, [x28, #0xd0]\n"
".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n"
- "ldr q6, [x14, #0xe0]\n"
+ "ldr q6, [x28, #0xe0]\n"
".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n"
- "ldr q7, [x14, #0xf0]\n"
+ "ldr q7, [x28, #0xf0]\n"
+ "add x28, x28, #0x100\n"
".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
- "add x14, x14, #0x100\n"
".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n"
@@ -2265,34 +2208,34 @@ void a64_hybrid_s8qs_dot_6x16 (
".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n"
".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n"
- "122:" // Height 5: Multiply loop: Main loop skip
- "cbz x11, 127f\n"
- "cmp x11, #0x4\n"
- "blt 124f\n"
- "123:" // Height 5: Multiply loop: Odd block loop
- "ldr s0, [x10], #0x4\n"
- "ldr s1, [x28], #0x4\n"
- "ldr s2, [x26], #0x4\n"
- "ldr s3, [x24], #0x4\n"
- "ldr s4, [x22], #0x4\n"
- "ldr q6, [x14, #0x0]\n"
+ "117:" // Height 5: Multiply loop: Main loop skip
+ "cbz x26, 122f\n"
+ "cmp x26, #0x4\n"
+ "blt 119f\n"
+ "118:" // Height 5: Multiply loop: Odd block loop
+ "ldr s0, [x25], #0x4\n"
+ "sub x26, x26, #0x4\n"
+ "ldr s1, [x24], #0x4\n"
+ "cmp x26, #0x4\n"
+ "ldr s2, [x23], #0x4\n"
+ "ldr s3, [x22], #0x4\n"
+ "ldr s4, [x21], #0x4\n"
+ "ldr q6, [x28, #0x0]\n"
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x14, #0x10]\n"
+ "ldr q7, [x28, #0x10]\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "sub x11, x11, #0x4\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "cmp x11, #0x4\n"
".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
- "ldr q6, [x14, #0x20]\n"
+ "ldr q6, [x28, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
- "ldr q7, [x14, #0x30]\n"
+ "ldr q7, [x28, #0x30]\n"
+ "add x28, x28, #0x40\n"
".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- "add x14, x14, #0x40\n"
".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
@@ -2302,45 +2245,45 @@ void a64_hybrid_s8qs_dot_6x16 (
".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
- "bge 123b\n"
- "cbz x11, 127f\n"
- "124:" // Height 5: Multiply loop: Skip odd blocks
- "tbz x11, #1, 125f\n"
- "ldr h0, [x10], #0x2\n"
- "ldr h1, [x28], #0x2\n"
- "ldr h2, [x26], #0x2\n"
- "ldr h3, [x24], #0x2\n"
- "ldr h4, [x22], #0x2\n"
- "tbz x11, #0, 126f\n"
- "ld1 { v0.b }[2], [x10]\n"
- "ld1 { v1.b }[2], [x28]\n"
- "ld1 { v2.b }[2], [x26]\n"
- "ld1 { v3.b }[2], [x24]\n"
- "ld1 { v4.b }[2], [x22]\n"
- "b 126f\n"
- "125:" // Height 5: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x10, #0x0]\n"
- "ldr b1, [x28, #0x0]\n"
- "ldr b2, [x26, #0x0]\n"
- "ldr b3, [x24, #0x0]\n"
- "ldr b4, [x22, #0x0]\n"
- "126:" // Height 5: Multiply loop: Ragged operand read: Done
- "ldr q6, [x14, #0x0]\n"
+ "bge 118b\n"
+ "cbz x26, 122f\n"
+ "119:" // Height 5: Multiply loop: Skip odd blocks
+ "tbz x26, #1, 120f\n"
+ "ldr h0, [x25], #0x2\n"
+ "ldr h1, [x24], #0x2\n"
+ "ldr h2, [x23], #0x2\n"
+ "ldr h3, [x22], #0x2\n"
+ "ldr h4, [x21], #0x2\n"
+ "tbz x26, #0, 121f\n"
+ "ld1 { v0.b }[2], [x25]\n"
+ "ld1 { v1.b }[2], [x24]\n"
+ "ld1 { v2.b }[2], [x23]\n"
+ "ld1 { v3.b }[2], [x22]\n"
+ "ld1 { v4.b }[2], [x21]\n"
+ "b 121f\n"
+ "120:" // Height 5: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x25, #0x0]\n"
+ "ldr b1, [x24, #0x0]\n"
+ "ldr b2, [x23, #0x0]\n"
+ "ldr b3, [x22, #0x0]\n"
+ "ldr b4, [x21, #0x0]\n"
+ "121:" // Height 5: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x28, #0x0]\n"
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x14, #0x10]\n"
+ "ldr q7, [x28, #0x10]\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
- "ldr q6, [x14, #0x20]\n"
+ "ldr q6, [x28, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
- "ldr q7, [x14, #0x30]\n"
+ "ldr q7, [x28, #0x30]\n"
+ "add x28, x28, #0x40\n"
".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- "add x14, x14, #0x40\n"
".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
@@ -2350,33 +2293,38 @@ void a64_hybrid_s8qs_dot_6x16 (
".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
- "127:" // Height 5: Multiply loop: No odd multiplies
+ "122:" // Height 5: Multiply loop: No odd multiplies
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x12, x12, #0x1\n"
- "cmp x12, x19\n"
- "bne 117b\n"
- "prfm pstl1keep, [x13, #0x0]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 112b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
"prfm pstl1keep, [x9, #0x0]\n"
- "ldr q0, [x16, #0x0]\n"
+ "add x23, x9, x19\n"
+ "ldr q0, [x11, #0x0]\n"
"add v8.4s, v8.4s, v0.4s\n"
- "prfm pstl1keep, [x27, #0x0]\n"
- "ldr q1, [x16, #0x10]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "add x22, x23, x19\n"
"add v12.4s, v12.4s, v0.4s\n"
- "prfm pstl1keep, [x25, #0x0]\n"
- "ldr q2, [x16, #0x20]\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "add x21, x22, x19\n"
"add v16.4s, v16.4s, v0.4s\n"
- "prfm pstl1keep, [x23, #0x0]\n"
- "ldr q3, [x16, #0x30]\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "add x20, x21, x19\n"
"add v20.4s, v20.4s, v0.4s\n"
- "add x16, x16, #0x40\n"
+ "prfm pstl1keep, [x20, #0x0]\n"
"add v24.4s, v24.4s, v0.4s\n"
+ "ldr q1, [x11, #0x10]\n"
+ "ldr q2, [x11, #0x20]\n"
"add v9.4s, v9.4s, v1.4s\n"
- "add v13.4s, v13.4s, v1.4s\n"
+ "ldr q3, [x11, #0x30]\n"
+ "add x11, x11, #0x40\n"
"add v10.4s, v10.4s, v2.4s\n"
- "add v11.4s, v11.4s, v3.4s\n"
+ "add v13.4s, v13.4s, v1.4s\n"
"add v14.4s, v14.4s, v2.4s\n"
- "add v15.4s, v15.4s, v3.4s\n"
"add v17.4s, v17.4s, v1.4s\n"
+ "add v11.4s, v11.4s, v3.4s\n"
+ "add v15.4s, v15.4s, v3.4s\n"
"add v18.4s, v18.4s, v2.4s\n"
"add v19.4s, v19.4s, v3.4s\n"
"add v21.4s, v21.4s, v1.4s\n"
@@ -2385,30 +2333,30 @@ void a64_hybrid_s8qs_dot_6x16 (
"add v25.4s, v25.4s, v1.4s\n"
"add v26.4s, v26.4s, v2.4s\n"
"add v27.4s, v27.4s, v3.4s\n"
- "tbz %x[flags], #4, 128f\n"
- "ldr q0, [x17, #0x0]\n"
- "ldr q4, [x8, #0x0]\n"
- "ldr q1, [x17, #0x10]\n"
- "ldr q5, [x8, #0x10]\n"
- "ldr q2, [x17, #0x20]\n"
- "ldr q6, [x8, #0x20]\n"
- "ldr q3, [x17, #0x30]\n"
- "ldr q7, [x8, #0x30]\n"
- "add x17, x17, #0x40\n"
- "add x8, x8, #0x40\n"
- "b 129f\n"
- "128:" // Height 5: per layer parameters
- "add x19, %x[qp], %[per_layer_right_shift]\n"
- "ld1r { v0.4s }, [x19]\n"
+ "tbz %x[flags], #4, 123f\n"
+ "ldr q0, [x12, #0x0]\n"
+ "ldr q4, [x13, #0x0]\n"
+ "ldr q1, [x12, #0x10]\n"
+ "ldr q5, [x13, #0x10]\n"
+ "ldr q2, [x12, #0x20]\n"
+ "ldr q6, [x13, #0x20]\n"
+ "ldr q3, [x12, #0x30]\n"
+ "add x12, x12, #0x40\n"
+ "ldr q7, [x13, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "b 124f\n"
+ "123:" // Height 5: per layer parameters
+ "add x24, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x24]\n"
"mov v1.16b, v0.16b\n"
- "add x19, %x[qp], %[per_layer_mul]\n"
- "ld1r { v4.4s }, [x19]\n"
+ "add x24, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v4.4s }, [x24]\n"
"mov v2.16b, v0.16b\n"
"mov v3.16b, v0.16b\n"
"mov v5.16b, v4.16b\n"
"mov v6.16b, v4.16b\n"
"mov v7.16b, v4.16b\n"
- "129:" // Height 5: parameters loaded
+ "124:" // Height 5: parameters loaded
"sqrdmulh v8.4s, v8.4s, v4.4s\n"
"sqrdmulh v9.4s, v9.4s, v5.4s\n"
"sqrdmulh v10.4s, v10.4s, v6.4s\n"
@@ -2429,7 +2377,7 @@ void a64_hybrid_s8qs_dot_6x16 (
"sqrdmulh v25.4s, v25.4s, v5.4s\n"
"sqrdmulh v26.4s, v26.4s, v6.4s\n"
"sqrdmulh v27.4s, v27.4s, v7.4s\n"
- "tbz %x[flags], #5, 130f\n"
+ "tbz %x[flags], #5, 125f\n"
"and v4.16b, v8.16b, v0.16b\n"
"sshr v4.4s, v4.4s, #0x1f\n"
"and v5.16b, v9.16b, v1.16b\n"
@@ -2490,18 +2438,18 @@ void a64_hybrid_s8qs_dot_6x16 (
"sqadd v25.4s, v25.4s, v5.4s\n"
"sqadd v26.4s, v26.4s, v6.4s\n"
"sqadd v27.4s, v27.4s, v7.4s\n"
- "130:" // Height 5: no shift correction
+ "125:" // Height 5: no shift correction
"srshl v8.4s, v8.4s, v0.4s\n"
- "add x19, %x[qp], %[c_offset]\n"
- "ld1r { v4.4s }, [x19]\n"
+ "add x24, %x[qp], %[c_offset]\n"
+ "ld1r { v4.4s }, [x24]\n"
"srshl v9.4s, v9.4s, v1.4s\n"
- "add x19, %x[qp], %[minval]\n"
+ "add x24, %x[qp], %[minval]\n"
"srshl v10.4s, v10.4s, v2.4s\n"
- "ld1r { v5.4s }, [x19]\n"
- "add x19, %x[qp], %[maxval]\n"
+ "ld1r { v5.4s }, [x24]\n"
+ "add x24, %x[qp], %[maxval]\n"
"srshl v11.4s, v11.4s, v3.4s\n"
- "ld1r { v6.4s }, [x19]\n"
- "cmp x15, #0x10\n"
+ "ld1r { v6.4s }, [x24]\n"
+ "cmp x10, #0x10\n"
"srshl v12.4s, v12.4s, v0.4s\n"
"srshl v13.4s, v13.4s, v1.4s\n"
"srshl v14.4s, v14.4s, v2.4s\n"
@@ -2593,158 +2541,134 @@ void a64_hybrid_s8qs_dot_6x16 (
"uzp1 v16.16b, v16.16b, v17.16b\n"
"uzp1 v20.16b, v20.16b, v21.16b\n"
"uzp1 v24.16b, v24.16b, v25.16b\n"
- "bge 139f\n"
- "tbz x15, #3, 134f\n"
- "str d8, [x13], #0x8\n"
- "str d12, [x9], #0x8\n"
- "str d16, [x27], #0x8\n"
- "str d20, [x25], #0x8\n"
- "str d24, [x23], #0x8\n"
- "tbz x15, #2, 132f\n"
- "st1 { v8.s }[2], [x13], #0x4\n"
- "st1 { v12.s }[2], [x9], #0x4\n"
- "st1 { v16.s }[2], [x27], #0x4\n"
- "st1 { v20.s }[2], [x25], #0x4\n"
- "st1 { v24.s }[2], [x23], #0x4\n"
- "tbz x15, #1, 131f\n"
- "st1 { v8.h }[6], [x13], #0x2\n"
- "st1 { v12.h }[6], [x9], #0x2\n"
- "st1 { v16.h }[6], [x27], #0x2\n"
- "st1 { v20.h }[6], [x25], #0x2\n"
- "st1 { v24.h }[6], [x23], #0x2\n"
- "tbz x15, #0, 138f\n"
- "st1 { v8.b }[14], [x13]\n"
- "st1 { v12.b }[14], [x9]\n"
- "st1 { v16.b }[14], [x27]\n"
- "st1 { v20.b }[14], [x25]\n"
- "st1 { v24.b }[14], [x23]\n"
- "b 138f\n"
- "131:" // Height 5: Partial direct writeback: partial_1_12
- "tbz x15, #0, 138f\n"
- "st1 { v8.b }[12], [x13]\n"
- "st1 { v12.b }[12], [x9]\n"
- "st1 { v16.b }[12], [x27]\n"
- "st1 { v20.b }[12], [x25]\n"
- "st1 { v24.b }[12], [x23]\n"
- "b 138f\n"
- "132:" // Height 5: Partial direct writeback: partial_2_8
- "tbz x15, #1, 133f\n"
- "st1 { v8.h }[4], [x13], #0x2\n"
- "st1 { v12.h }[4], [x9], #0x2\n"
- "st1 { v16.h }[4], [x27], #0x2\n"
- "st1 { v20.h }[4], [x25], #0x2\n"
- "st1 { v24.h }[4], [x23], #0x2\n"
- "tbz x15, #0, 138f\n"
- "st1 { v8.b }[10], [x13]\n"
- "st1 { v12.b }[10], [x9]\n"
- "st1 { v16.b }[10], [x27]\n"
- "st1 { v20.b }[10], [x25]\n"
- "st1 { v24.b }[10], [x23]\n"
- "b 138f\n"
- "133:" // Height 5: Partial direct writeback: partial_1_8
- "tbz x15, #0, 138f\n"
- "st1 { v8.b }[8], [x13]\n"
- "st1 { v12.b }[8], [x9]\n"
- "st1 { v16.b }[8], [x27]\n"
- "st1 { v20.b }[8], [x25]\n"
- "st1 { v24.b }[8], [x23]\n"
- "b 138f\n"
- "134:" // Height 5: Partial direct writeback: partial_4_0
- "tbz x15, #2, 136f\n"
- "str s8, [x13], #0x4\n"
- "str s12, [x9], #0x4\n"
- "str s16, [x27], #0x4\n"
- "str s20, [x25], #0x4\n"
- "str s24, [x23], #0x4\n"
- "tbz x15, #1, 135f\n"
- "st1 { v8.h }[2], [x13], #0x2\n"
- "st1 { v12.h }[2], [x9], #0x2\n"
- "st1 { v16.h }[2], [x27], #0x2\n"
- "st1 { v20.h }[2], [x25], #0x2\n"
- "st1 { v24.h }[2], [x23], #0x2\n"
- "tbz x15, #0, 138f\n"
- "st1 { v8.b }[6], [x13]\n"
- "st1 { v12.b }[6], [x9]\n"
- "st1 { v16.b }[6], [x27]\n"
- "st1 { v20.b }[6], [x25]\n"
- "st1 { v24.b }[6], [x23]\n"
- "b 138f\n"
- "135:" // Height 5: Partial direct writeback: partial_1_4
- "tbz x15, #0, 138f\n"
- "st1 { v8.b }[4], [x13]\n"
- "st1 { v12.b }[4], [x9]\n"
- "st1 { v16.b }[4], [x27]\n"
- "st1 { v20.b }[4], [x25]\n"
- "st1 { v24.b }[4], [x23]\n"
- "b 138f\n"
- "136:" // Height 5: Partial direct writeback: partial_2_0
- "tbz x15, #1, 137f\n"
- "str h8, [x13], #0x2\n"
- "str h12, [x9], #0x2\n"
- "str h16, [x27], #0x2\n"
- "str h20, [x25], #0x2\n"
- "str h24, [x23], #0x2\n"
- "tbz x15, #0, 138f\n"
- "st1 { v8.b }[2], [x13]\n"
- "st1 { v12.b }[2], [x9]\n"
- "st1 { v16.b }[2], [x27]\n"
- "st1 { v20.b }[2], [x25]\n"
- "st1 { v24.b }[2], [x23]\n"
- "b 138f\n"
- "137:" // Height 5: Partial direct writeback: partial_1_0
- "str b8, [x13, #0x0]\n"
- "str b12, [x9, #0x0]\n"
- "str b16, [x27, #0x0]\n"
- "str b20, [x25, #0x0]\n"
- "str b24, [x23, #0x0]\n"
- "138:" // Height 5: Partial direct writeback: Done
- "b 140f\n"
- "139:" // Height 5: Full writeback
- "str q8, [x13, #0x0]\n"
- "str q12, [x9, #0x0]\n"
- "str q16, [x27, #0x0]\n"
- "str q20, [x25, #0x0]\n"
- "str q24, [x23, #0x0]\n"
- "add x13, x13, #0x10\n"
+ "bge 134f\n"
+ "tbz x10, #3, 129f\n"
+ "str d8, [x9], #0x8\n"
+ "str d12, [x23], #0x8\n"
+ "str d16, [x22], #0x8\n"
+ "str d20, [x21], #0x8\n"
+ "str d24, [x20], #0x8\n"
+ "tbz x10, #2, 127f\n"
+ "st1 { v8.s }[2], [x9], #0x4\n"
+ "st1 { v12.s }[2], [x23], #0x4\n"
+ "st1 { v16.s }[2], [x22], #0x4\n"
+ "st1 { v20.s }[2], [x21], #0x4\n"
+ "st1 { v24.s }[2], [x20], #0x4\n"
+ "tbz x10, #1, 126f\n"
+ "st1 { v8.h }[6], [x9], #0x2\n"
+ "st1 { v12.h }[6], [x23], #0x2\n"
+ "st1 { v16.h }[6], [x22], #0x2\n"
+ "st1 { v20.h }[6], [x21], #0x2\n"
+ "st1 { v24.h }[6], [x20], #0x2\n"
+ "tbz x10, #0, 133f\n"
+ "st1 { v8.b }[14], [x9]\n"
+ "st1 { v12.b }[14], [x23]\n"
+ "st1 { v16.b }[14], [x22]\n"
+ "st1 { v20.b }[14], [x21]\n"
+ "st1 { v24.b }[14], [x20]\n"
+ "b 133f\n"
+ "126:" // Height 5: Partial direct writeback: partial_1_12
+ "tbz x10, #0, 133f\n"
+ "st1 { v8.b }[12], [x9]\n"
+ "st1 { v12.b }[12], [x23]\n"
+ "st1 { v16.b }[12], [x22]\n"
+ "st1 { v20.b }[12], [x21]\n"
+ "st1 { v24.b }[12], [x20]\n"
+ "b 133f\n"
+ "127:" // Height 5: Partial direct writeback: partial_2_8
+ "tbz x10, #1, 128f\n"
+ "st1 { v8.h }[4], [x9], #0x2\n"
+ "st1 { v12.h }[4], [x23], #0x2\n"
+ "st1 { v16.h }[4], [x22], #0x2\n"
+ "st1 { v20.h }[4], [x21], #0x2\n"
+ "st1 { v24.h }[4], [x20], #0x2\n"
+ "tbz x10, #0, 133f\n"
+ "st1 { v8.b }[10], [x9]\n"
+ "st1 { v12.b }[10], [x23]\n"
+ "st1 { v16.b }[10], [x22]\n"
+ "st1 { v20.b }[10], [x21]\n"
+ "st1 { v24.b }[10], [x20]\n"
+ "b 133f\n"
+ "128:" // Height 5: Partial direct writeback: partial_1_8
+ "tbz x10, #0, 133f\n"
+ "st1 { v8.b }[8], [x9]\n"
+ "st1 { v12.b }[8], [x23]\n"
+ "st1 { v16.b }[8], [x22]\n"
+ "st1 { v20.b }[8], [x21]\n"
+ "st1 { v24.b }[8], [x20]\n"
+ "b 133f\n"
+ "129:" // Height 5: Partial direct writeback: partial_4_0
+ "tbz x10, #2, 131f\n"
+ "str s8, [x9], #0x4\n"
+ "str s12, [x23], #0x4\n"
+ "str s16, [x22], #0x4\n"
+ "str s20, [x21], #0x4\n"
+ "str s24, [x20], #0x4\n"
+ "tbz x10, #1, 130f\n"
+ "st1 { v8.h }[2], [x9], #0x2\n"
+ "st1 { v12.h }[2], [x23], #0x2\n"
+ "st1 { v16.h }[2], [x22], #0x2\n"
+ "st1 { v20.h }[2], [x21], #0x2\n"
+ "st1 { v24.h }[2], [x20], #0x2\n"
+ "tbz x10, #0, 133f\n"
+ "st1 { v8.b }[6], [x9]\n"
+ "st1 { v12.b }[6], [x23]\n"
+ "st1 { v16.b }[6], [x22]\n"
+ "st1 { v20.b }[6], [x21]\n"
+ "st1 { v24.b }[6], [x20]\n"
+ "b 133f\n"
+ "130:" // Height 5: Partial direct writeback: partial_1_4
+ "tbz x10, #0, 133f\n"
+ "st1 { v8.b }[4], [x9]\n"
+ "st1 { v12.b }[4], [x23]\n"
+ "st1 { v16.b }[4], [x22]\n"
+ "st1 { v20.b }[4], [x21]\n"
+ "st1 { v24.b }[4], [x20]\n"
+ "b 133f\n"
+ "131:" // Height 5: Partial direct writeback: partial_2_0
+ "tbz x10, #1, 132f\n"
+ "str h8, [x9], #0x2\n"
+ "str h12, [x23], #0x2\n"
+ "str h16, [x22], #0x2\n"
+ "str h20, [x21], #0x2\n"
+ "str h24, [x20], #0x2\n"
+ "tbz x10, #0, 133f\n"
+ "st1 { v8.b }[2], [x9]\n"
+ "st1 { v12.b }[2], [x23]\n"
+ "st1 { v16.b }[2], [x22]\n"
+ "st1 { v20.b }[2], [x21]\n"
+ "st1 { v24.b }[2], [x20]\n"
+ "b 133f\n"
+ "132:" // Height 5: Partial direct writeback: partial_1_0
+ "str b8, [x9, #0x0]\n"
+ "str b12, [x23, #0x0]\n"
+ "str b16, [x22, #0x0]\n"
+ "str b20, [x21, #0x0]\n"
+ "str b24, [x20, #0x0]\n"
+ "133:" // Height 5: Partial direct writeback: Done
+ "b 135f\n"
+ "134:" // Height 5: Full writeback
+ "str q8, [x9, #0x0]\n"
"add x9, x9, #0x10\n"
- "add x27, x27, #0x10\n"
- "add x25, x25, #0x10\n"
- "add x23, x23, #0x10\n"
- "140:" // Height 5: Writeback done
- "subs x15, x15, #0x10\n"
- "bgt 115b\n"
- "b 170f\n"
- "141:" // Height 6
- "ldr x8, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
- "mov x16, %x[col_bias]\n"
- "ldr x17, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "str q12, [x23, #0x0]\n"
+ "str q16, [x22, #0x0]\n"
+ "str q20, [x21, #0x0]\n"
+ "str q24, [x20, #0x0]\n"
+ "135:" // Height 5: Writeback done
+ "subs x10, x10, #0x10\n"
+ "bgt 110b\n"
+ "b 164f\n"
+ "136:" // Height 6
+ "ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
+ "mov x11, %x[col_bias]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x20, #0x6\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "tbz %x[flags], #2, 142f\n"
- "ldr x13, [%x[output_ptr], #0x0]\n"
- "add x13, x13, x19\n"
- "ldr x9, [%x[output_ptr], #0x8]\n"
- "ldr x27, [%x[output_ptr], #0x10]\n"
- "add x9, x9, x19\n"
- "ldr x25, [%x[output_ptr], #0x18]\n"
- "ldr x23, [%x[output_ptr], #0x20]\n"
- "add x27, x27, x19\n"
- "ldr x21, [%x[output_ptr], #0x28]\n"
- "add %x[output_ptr], %x[output_ptr], #0x30\n"
- "add x25, x25, x19\n"
- "add x23, x23, x19\n"
- "add x21, x21, x19\n"
- "b 143f\n"
- "142:" // Height 6: setup direct output
- "mov x13, %x[output_ptr]\n"
- "add x9, x13, x19\n"
- "add x27, x9, x19\n"
- "add x25, x27, x19\n"
- "add x23, x25, x19\n"
- "add x21, x23, x19\n"
- "add %x[output_ptr], x21, x19\n"
- "143:" // Height 6: Column loop
+ "madd %x[output_ptr], x19, x20, %x[output_ptr]\n"
+ "137:" // Height 6: Column loop
"movi v8.4s, #0x0\n"
"movi v9.4s, #0x0\n"
"movi v10.4s, #0x0\n"
@@ -2769,299 +2693,299 @@ void a64_hybrid_s8qs_dot_6x16 (
"movi v29.4s, #0x0\n"
"movi v30.4s, #0x0\n"
"movi v31.4s, #0x0\n"
- "144:" // Height 6: setup done
- "mov x12, #0x0\n"
- "145:" // Height 6: String loop
+ "138:" // Height 6: setup done
+ "mov x27, #0x0\n"
+ "139:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w11, [x20, x12, LSL #0x2]\n"
- "tbz %x[flags], #3, 146f\n"
- "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 140f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x10, [x20, #0x0]\n"
- "ldr x28, [x20, #0x8]\n"
- "ldr x26, [x20, #0x10]\n"
- "ldr x24, [x20, #0x18]\n"
- "ldr x22, [x20, #0x20]\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "ldr x22, [x20, #0x18]\n"
+ "ldr x21, [x20, #0x20]\n"
"ldr x20, [x20, #0x28]\n"
- "cbnz x12, 147f\n"
+ "cbnz x27, 141f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x10, x10, x19\n"
- "add x28, x28, x19\n"
- "add x26, x26, x19\n"
+ "add x25, x25, x19\n"
"add x24, x24, x19\n"
+ "add x23, x23, x19\n"
"add x22, x22, x19\n"
+ "add x21, x21, x19\n"
"add x20, x20, x19\n"
- "b 147f\n"
- "146:" // Height 6: setup direct input
- "mov x10, %x[input_ptr]\n"
- "add x28, x10, x19\n"
- "add x26, x28, x19\n"
- "add x24, x26, x19\n"
- "add x22, x24, x19\n"
- "add x20, x22, x19\n"
- "147:" // Height 6: input setup done
- "cmp x11, #0x10\n"
- "blt 150f\n"
- "cmp x11, #0x20\n"
- "blt 149f\n"
- "148:" // Height 6: Multiply loop: Main loop head
- "ldr q0, [x10, #0x0]\n"
- "ldr q1, [x28, #0x0]\n"
- "ldr q2, [x26, #0x0]\n"
- "ldr q3, [x24, #0x0]\n"
- "ldr q4, [x22, #0x0]\n"
+ "b 141f\n"
+ "140:" // Height 6: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19\n"
+ "add x23, x24, x19\n"
+ "add x22, x23, x19\n"
+ "add x21, x22, x19\n"
+ "add x20, x21, x19\n"
+ "141:" // Height 6: input setup done
+ "cmp x26, #0x10\n"
+ "blt 144f\n"
+ "ldr q0, [x25, #0x0]\n"
+ "ldr q1, [x24, #0x0]\n"
+ "cmp x26, #0x20\n"
+ "ldr q2, [x23, #0x0]\n"
+ "ldr q3, [x22, #0x0]\n"
+ "ldr q4, [x21, #0x0]\n"
"ldr q5, [x20, #0x0]\n"
- "ldr q6, [x14, #0x0]\n"
+ "ldr q6, [x28, #0x0]\n"
+ "blt 143f\n"
+ "142:" // Height 6: Multiply loop: Main loop head
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x14, #0x10]\n"
+ "ldr q7, [x28, #0x10]\n"
+ "add x25, x25, #0x10\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "add x10, x10, #0x10\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x24, x24, #0x10\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "add x28, x28, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "add x23, x23, #0x10\n"
".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "add x26, x26, #0x10\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "add x22, x22, #0x10\n"
".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "add x24, x24, #0x10\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "add x21, x21, #0x10\n"
".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "ldr q6, [x14, #0x20]\n"
+ "ldr q6, [x28, #0x20]\n"
+ "add x20, x20, #0x10\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "add x22, x22, #0x10\n"
- "prfm pldl1keep, [x22, #0x80]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
+ "sub x26, x26, #0x10\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "add x20, x20, #0x10\n"
- ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
"prfm pldl1keep, [x20, #0x80]\n"
- "sub x11, x11, #0x10\n"
+ "cmp x26, #0x20\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- "cmp x11, #0x20\n"
".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n"
- "ldr q7, [x14, #0x30]\n"
+ "ldr q7, [x28, #0x30]\n"
".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n"
- "ldr q6, [x14, #0x40]\n"
+ "ldr q6, [x28, #0x40]\n"
".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n"
- "ldr q7, [x14, #0x50]\n"
+ "ldr q7, [x28, #0x50]\n"
".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n"
".inst 0x4fa5e0dc // sdot v28.4s, v6.16b, v5.4b[1]\n"
- "ldr q6, [x14, #0x60]\n"
+ "ldr q6, [x28, #0x60]\n"
".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n"
".inst 0x4fa5e0fd // sdot v29.4s, v7.16b, v5.4b[1]\n"
- "ldr q7, [x14, #0x70]\n"
+ "ldr q7, [x28, #0x70]\n"
".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n"
".inst 0x4fa5e0de // sdot v30.4s, v6.16b, v5.4b[1]\n"
- "ldr q6, [x14, #0x80]\n"
+ "ldr q6, [x28, #0x80]\n"
".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n"
".inst 0x4fa5e0ff // sdot v31.4s, v7.16b, v5.4b[1]\n"
- "ldr q7, [x14, #0x90]\n"
+ "ldr q7, [x28, #0x90]\n"
".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n"
".inst 0x4f85e8dc // sdot v28.4s, v6.16b, v5.4b[2]\n"
- "ldr q6, [x14, #0xa0]\n"
+ "ldr q6, [x28, #0xa0]\n"
".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n"
".inst 0x4f85e8fd // sdot v29.4s, v7.16b, v5.4b[2]\n"
- "ldr q7, [x14, #0xb0]\n"
+ "ldr q7, [x28, #0xb0]\n"
".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n"
".inst 0x4f85e8de // sdot v30.4s, v6.16b, v5.4b[2]\n"
- "ldr q6, [x14, #0xc0]\n"
+ "ldr q6, [x28, #0xc0]\n"
".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n"
".inst 0x4f85e8ff // sdot v31.4s, v7.16b, v5.4b[2]\n"
- "ldr q7, [x14, #0xd0]\n"
+ "ldr q7, [x28, #0xd0]\n"
".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n"
".inst 0x4fa5e8dc // sdot v28.4s, v6.16b, v5.4b[3]\n"
- "ldr q6, [x14, #0xe0]\n"
+ "ldr q6, [x28, #0xe0]\n"
".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n"
".inst 0x4fa5e8fd // sdot v29.4s, v7.16b, v5.4b[3]\n"
- "ldr q7, [x14, #0xf0]\n"
+ "ldr q7, [x28, #0xf0]\n"
+ "add x28, x28, #0x100\n"
".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
- "add x14, x14, #0x100\n"
".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n"
".inst 0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n"
".inst 0x4fa5e8de // sdot v30.4s, v6.16b, v5.4b[3]\n"
+ "ldr q6, [x28, #0x0]\n"
".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ "ldr q0, [x25, #0x0]\n"
".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
+ "ldr q1, [x24, #0x0]\n"
".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
+ "ldr q2, [x23, #0x0]\n"
".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n"
+ "ldr q3, [x22, #0x0]\n"
".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n"
+ "ldr q4, [x21, #0x0]\n"
".inst 0x4fa5e8ff // sdot v31.4s, v7.16b, v5.4b[3]\n"
- "bge 148b\n"
- "149:" // Height 6: Multiply loop: Single iteration only
- "sub x11, x11, #0x10\n"
- "ldr q0, [x10, #0x0]\n"
- "ldr q1, [x28, #0x0]\n"
- "ldr q2, [x26, #0x0]\n"
- "ldr q3, [x24, #0x0]\n"
- "ldr q4, [x22, #0x0]\n"
"ldr q5, [x20, #0x0]\n"
- "ldr q6, [x14, #0x0]\n"
+ "bge 142b\n"
+ "143:" // Height 6: Multiply loop: Single iteration only
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x14, #0x10]\n"
+ "ldr q7, [x28, #0x10]\n"
+ "sub x26, x26, #0x10\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "add x10, x10, #0x10\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "add x25, x25, #0x10\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "add x28, x28, #0x10\n"
- ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "add x26, x26, #0x10\n"
- ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
"add x24, x24, #0x10\n"
- ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n"
+ ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
"prfm pldl1keep, [x24, #0x80]\n"
- "ldr q6, [x14, #0x20]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
"add x22, x22, #0x10\n"
+ ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n"
"prfm pldl1keep, [x22, #0x80]\n"
- ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ "add x21, x21, #0x10\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ "ldr q6, [x28, #0x20]\n"
"add x20, x20, #0x10\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
"prfm pldl1keep, [x20, #0x80]\n"
".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n"
- "ldr q7, [x14, #0x30]\n"
+ "ldr q7, [x28, #0x30]\n"
".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n"
- "ldr q6, [x14, #0x40]\n"
+ "ldr q6, [x28, #0x40]\n"
".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n"
- "ldr q7, [x14, #0x50]\n"
+ "ldr q7, [x28, #0x50]\n"
".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n"
".inst 0x4fa5e0dc // sdot v28.4s, v6.16b, v5.4b[1]\n"
- "ldr q6, [x14, #0x60]\n"
+ "ldr q6, [x28, #0x60]\n"
".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n"
".inst 0x4fa5e0fd // sdot v29.4s, v7.16b, v5.4b[1]\n"
- "ldr q7, [x14, #0x70]\n"
+ "ldr q7, [x28, #0x70]\n"
".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n"
".inst 0x4fa5e0de // sdot v30.4s, v6.16b, v5.4b[1]\n"
- "ldr q6, [x14, #0x80]\n"
+ "ldr q6, [x28, #0x80]\n"
".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n"
".inst 0x4fa5e0ff // sdot v31.4s, v7.16b, v5.4b[1]\n"
- "ldr q7, [x14, #0x90]\n"
+ "ldr q7, [x28, #0x90]\n"
".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n"
".inst 0x4f85e8dc // sdot v28.4s, v6.16b, v5.4b[2]\n"
- "ldr q6, [x14, #0xa0]\n"
+ "ldr q6, [x28, #0xa0]\n"
".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n"
".inst 0x4f85e8fd // sdot v29.4s, v7.16b, v5.4b[2]\n"
- "ldr q7, [x14, #0xb0]\n"
+ "ldr q7, [x28, #0xb0]\n"
".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n"
".inst 0x4f85e8de // sdot v30.4s, v6.16b, v5.4b[2]\n"
- "ldr q6, [x14, #0xc0]\n"
+ "ldr q6, [x28, #0xc0]\n"
".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n"
".inst 0x4f85e8ff // sdot v31.4s, v7.16b, v5.4b[2]\n"
- "ldr q7, [x14, #0xd0]\n"
+ "ldr q7, [x28, #0xd0]\n"
".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n"
".inst 0x4fa5e8dc // sdot v28.4s, v6.16b, v5.4b[3]\n"
- "ldr q6, [x14, #0xe0]\n"
+ "ldr q6, [x28, #0xe0]\n"
".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n"
".inst 0x4fa5e8fd // sdot v29.4s, v7.16b, v5.4b[3]\n"
- "ldr q7, [x14, #0xf0]\n"
+ "ldr q7, [x28, #0xf0]\n"
+ "add x28, x28, #0x100\n"
".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
- "add x14, x14, #0x100\n"
".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n"
@@ -3073,37 +2997,37 @@ void a64_hybrid_s8qs_dot_6x16 (
".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n"
".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n"
".inst 0x4fa5e8ff // sdot v31.4s, v7.16b, v5.4b[3]\n"
- "150:" // Height 6: Multiply loop: Main loop skip
- "cbz x11, 155f\n"
- "cmp x11, #0x4\n"
- "blt 152f\n"
- "151:" // Height 6: Multiply loop: Odd block loop
- "ldr s0, [x10], #0x4\n"
- "ldr s1, [x28], #0x4\n"
- "ldr s2, [x26], #0x4\n"
- "ldr s3, [x24], #0x4\n"
- "ldr s4, [x22], #0x4\n"
+ "144:" // Height 6: Multiply loop: Main loop skip
+ "cbz x26, 149f\n"
+ "cmp x26, #0x4\n"
+ "blt 146f\n"
+ "145:" // Height 6: Multiply loop: Odd block loop
+ "ldr s0, [x25], #0x4\n"
+ "sub x26, x26, #0x4\n"
+ "ldr s1, [x24], #0x4\n"
+ "cmp x26, #0x4\n"
+ "ldr s2, [x23], #0x4\n"
+ "ldr s3, [x22], #0x4\n"
+ "ldr s4, [x21], #0x4\n"
"ldr s5, [x20], #0x4\n"
- "ldr q6, [x14, #0x0]\n"
+ "ldr q6, [x28, #0x0]\n"
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x14, #0x10]\n"
+ "ldr q7, [x28, #0x10]\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "sub x11, x11, #0x4\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "cmp x11, #0x4\n"
".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n"
- "ldr q6, [x14, #0x20]\n"
+ "ldr q6, [x28, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n"
- "ldr q7, [x14, #0x30]\n"
+ "ldr q7, [x28, #0x30]\n"
+ "add x28, x28, #0x40\n"
".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- "add x14, x14, #0x40\n"
".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
@@ -3115,50 +3039,50 @@ void a64_hybrid_s8qs_dot_6x16 (
".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n"
- "bge 151b\n"
- "cbz x11, 155f\n"
- "152:" // Height 6: Multiply loop: Skip odd blocks
- "tbz x11, #1, 153f\n"
- "ldr h0, [x10], #0x2\n"
- "ldr h1, [x28], #0x2\n"
- "ldr h2, [x26], #0x2\n"
- "ldr h3, [x24], #0x2\n"
- "ldr h4, [x22], #0x2\n"
+ "bge 145b\n"
+ "cbz x26, 149f\n"
+ "146:" // Height 6: Multiply loop: Skip odd blocks
+ "tbz x26, #1, 147f\n"
+ "ldr h0, [x25], #0x2\n"
+ "ldr h1, [x24], #0x2\n"
+ "ldr h2, [x23], #0x2\n"
+ "ldr h3, [x22], #0x2\n"
+ "ldr h4, [x21], #0x2\n"
"ldr h5, [x20], #0x2\n"
- "tbz x11, #0, 154f\n"
- "ld1 { v0.b }[2], [x10]\n"
- "ld1 { v1.b }[2], [x28]\n"
- "ld1 { v2.b }[2], [x26]\n"
- "ld1 { v3.b }[2], [x24]\n"
- "ld1 { v4.b }[2], [x22]\n"
+ "tbz x26, #0, 148f\n"
+ "ld1 { v0.b }[2], [x25]\n"
+ "ld1 { v1.b }[2], [x24]\n"
+ "ld1 { v2.b }[2], [x23]\n"
+ "ld1 { v3.b }[2], [x22]\n"
+ "ld1 { v4.b }[2], [x21]\n"
"ld1 { v5.b }[2], [x20]\n"
- "b 154f\n"
- "153:" // Height 6: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x10, #0x0]\n"
- "ldr b1, [x28, #0x0]\n"
- "ldr b2, [x26, #0x0]\n"
- "ldr b3, [x24, #0x0]\n"
- "ldr b4, [x22, #0x0]\n"
+ "b 148f\n"
+ "147:" // Height 6: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x25, #0x0]\n"
+ "ldr b1, [x24, #0x0]\n"
+ "ldr b2, [x23, #0x0]\n"
+ "ldr b3, [x22, #0x0]\n"
+ "ldr b4, [x21, #0x0]\n"
"ldr b5, [x20, #0x0]\n"
- "154:" // Height 6: Multiply loop: Ragged operand read: Done
- "ldr q6, [x14, #0x0]\n"
+ "148:" // Height 6: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x28, #0x0]\n"
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x14, #0x10]\n"
+ "ldr q7, [x28, #0x10]\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n"
- "ldr q6, [x14, #0x20]\n"
+ "ldr q6, [x28, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n"
- "ldr q7, [x14, #0x30]\n"
+ "ldr q7, [x28, #0x30]\n"
+ "add x28, x28, #0x40\n"
".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- "add x14, x14, #0x40\n"
".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
@@ -3170,35 +3094,41 @@ void a64_hybrid_s8qs_dot_6x16 (
".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n"
- "155:" // Height 6: Multiply loop: No odd multiplies
+ "149:" // Height 6: Multiply loop: No odd multiplies
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x12, x12, #0x1\n"
- "cmp x12, x19\n"
- "bne 145b\n"
- "prfm pstl1keep, [x13, #0x0]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 139b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
"prfm pstl1keep, [x9, #0x0]\n"
- "ldr q0, [x16, #0x0]\n"
+ "add x23, x9, x19\n"
+ "ldr q0, [x11, #0x0]\n"
"add v8.4s, v8.4s, v0.4s\n"
- "prfm pstl1keep, [x27, #0x0]\n"
- "ldr q1, [x16, #0x10]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "add x22, x23, x19\n"
"add v12.4s, v12.4s, v0.4s\n"
- "prfm pstl1keep, [x25, #0x0]\n"
- "ldr q2, [x16, #0x20]\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "add x21, x22, x19\n"
"add v16.4s, v16.4s, v0.4s\n"
- "prfm pstl1keep, [x23, #0x0]\n"
- "ldr q3, [x16, #0x30]\n"
- "add v20.4s, v20.4s, v0.4s\n"
"prfm pstl1keep, [x21, #0x0]\n"
- "add x16, x16, #0x40\n"
+ "add x20, x21, x19\n"
+ "add v20.4s, v20.4s, v0.4s\n"
+ "prfm pstl1keep, [x20, #0x0]\n"
+ "add x19, x20, x19\n"
"add v24.4s, v24.4s, v0.4s\n"
+ "prfm pstl1keep, [x19, #0x0]\n"
"add v28.4s, v28.4s, v0.4s\n"
+ "ldr q1, [x11, #0x10]\n"
+ "ldr q2, [x11, #0x20]\n"
"add v9.4s, v9.4s, v1.4s\n"
+ "ldr q3, [x11, #0x30]\n"
+ "add x11, x11, #0x40\n"
"add v10.4s, v10.4s, v2.4s\n"
- "add v11.4s, v11.4s, v3.4s\n"
"add v13.4s, v13.4s, v1.4s\n"
"add v14.4s, v14.4s, v2.4s\n"
- "add v15.4s, v15.4s, v3.4s\n"
"add v17.4s, v17.4s, v1.4s\n"
+ "add v11.4s, v11.4s, v3.4s\n"
+ "add v15.4s, v15.4s, v3.4s\n"
"add v18.4s, v18.4s, v2.4s\n"
"add v19.4s, v19.4s, v3.4s\n"
"add v21.4s, v21.4s, v1.4s\n"
@@ -3210,30 +3140,30 @@ void a64_hybrid_s8qs_dot_6x16 (
"add v29.4s, v29.4s, v1.4s\n"
"add v30.4s, v30.4s, v2.4s\n"
"add v31.4s, v31.4s, v3.4s\n"
- "tbz %x[flags], #4, 156f\n"
- "ldr q0, [x17, #0x0]\n"
- "ldr q4, [x8, #0x0]\n"
- "ldr q1, [x17, #0x10]\n"
- "ldr q5, [x8, #0x10]\n"
- "ldr q2, [x17, #0x20]\n"
- "ldr q6, [x8, #0x20]\n"
- "ldr q3, [x17, #0x30]\n"
- "ldr q7, [x8, #0x30]\n"
- "add x17, x17, #0x40\n"
- "add x8, x8, #0x40\n"
- "b 157f\n"
- "156:" // Height 6: per layer parameters
- "add x19, %x[qp], %[per_layer_right_shift]\n"
- "ld1r { v0.4s }, [x19]\n"
+ "tbz %x[flags], #4, 150f\n"
+ "ldr q0, [x12, #0x0]\n"
+ "ldr q4, [x13, #0x0]\n"
+ "ldr q1, [x12, #0x10]\n"
+ "ldr q5, [x13, #0x10]\n"
+ "ldr q2, [x12, #0x20]\n"
+ "ldr q6, [x13, #0x20]\n"
+ "ldr q3, [x12, #0x30]\n"
+ "add x12, x12, #0x40\n"
+ "ldr q7, [x13, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "b 151f\n"
+ "150:" // Height 6: per layer parameters
+ "add x24, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x24]\n"
"mov v1.16b, v0.16b\n"
- "add x19, %x[qp], %[per_layer_mul]\n"
- "ld1r { v4.4s }, [x19]\n"
+ "add x24, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v4.4s }, [x24]\n"
"mov v2.16b, v0.16b\n"
"mov v3.16b, v0.16b\n"
"mov v5.16b, v4.16b\n"
"mov v6.16b, v4.16b\n"
"mov v7.16b, v4.16b\n"
- "157:" // Height 6: parameters loaded
+ "151:" // Height 6: parameters loaded
"sqrdmulh v8.4s, v8.4s, v4.4s\n"
"sqrdmulh v9.4s, v9.4s, v5.4s\n"
"sqrdmulh v10.4s, v10.4s, v6.4s\n"
@@ -3258,7 +3188,7 @@ void a64_hybrid_s8qs_dot_6x16 (
"sqrdmulh v29.4s, v29.4s, v5.4s\n"
"sqrdmulh v30.4s, v30.4s, v6.4s\n"
"sqrdmulh v31.4s, v31.4s, v7.4s\n"
- "tbz %x[flags], #5, 158f\n"
+ "tbz %x[flags], #5, 152f\n"
"and v4.16b, v8.16b, v0.16b\n"
"sshr v4.4s, v4.4s, #0x1f\n"
"and v5.16b, v9.16b, v1.16b\n"
@@ -3331,18 +3261,18 @@ void a64_hybrid_s8qs_dot_6x16 (
"sqadd v29.4s, v29.4s, v5.4s\n"
"sqadd v30.4s, v30.4s, v6.4s\n"
"sqadd v31.4s, v31.4s, v7.4s\n"
- "158:" // Height 6: no shift correction
+ "152:" // Height 6: no shift correction
"srshl v8.4s, v8.4s, v0.4s\n"
- "add x19, %x[qp], %[c_offset]\n"
- "ld1r { v4.4s }, [x19]\n"
+ "add x24, %x[qp], %[c_offset]\n"
+ "ld1r { v4.4s }, [x24]\n"
"srshl v9.4s, v9.4s, v1.4s\n"
- "add x19, %x[qp], %[minval]\n"
+ "add x24, %x[qp], %[minval]\n"
"srshl v10.4s, v10.4s, v2.4s\n"
- "ld1r { v5.4s }, [x19]\n"
- "add x19, %x[qp], %[maxval]\n"
+ "ld1r { v5.4s }, [x24]\n"
+ "add x24, %x[qp], %[maxval]\n"
"srshl v11.4s, v11.4s, v3.4s\n"
- "ld1r { v6.4s }, [x19]\n"
- "cmp x15, #0x10\n"
+ "ld1r { v6.4s }, [x24]\n"
+ "cmp x10, #0x10\n"
"srshl v12.4s, v12.4s, v0.4s\n"
"srshl v13.4s, v13.4s, v1.4s\n"
"srshl v14.4s, v14.4s, v2.4s\n"
@@ -3453,159 +3383,154 @@ void a64_hybrid_s8qs_dot_6x16 (
"uzp1 v20.16b, v20.16b, v21.16b\n"
"uzp1 v24.16b, v24.16b, v25.16b\n"
"uzp1 v28.16b, v28.16b, v29.16b\n"
- "bge 167f\n"
- "tbz x15, #3, 162f\n"
- "str d8, [x13], #0x8\n"
- "str d12, [x9], #0x8\n"
- "str d16, [x27], #0x8\n"
- "str d20, [x25], #0x8\n"
- "str d24, [x23], #0x8\n"
- "str d28, [x21], #0x8\n"
- "tbz x15, #2, 160f\n"
- "st1 { v8.s }[2], [x13], #0x4\n"
- "st1 { v12.s }[2], [x9], #0x4\n"
- "st1 { v16.s }[2], [x27], #0x4\n"
- "st1 { v20.s }[2], [x25], #0x4\n"
- "st1 { v24.s }[2], [x23], #0x4\n"
- "st1 { v28.s }[2], [x21], #0x4\n"
- "tbz x15, #1, 159f\n"
- "st1 { v8.h }[6], [x13], #0x2\n"
- "st1 { v12.h }[6], [x9], #0x2\n"
- "st1 { v16.h }[6], [x27], #0x2\n"
- "st1 { v20.h }[6], [x25], #0x2\n"
- "st1 { v24.h }[6], [x23], #0x2\n"
- "st1 { v28.h }[6], [x21], #0x2\n"
- "tbz x15, #0, 166f\n"
- "st1 { v8.b }[14], [x13]\n"
- "st1 { v12.b }[14], [x9]\n"
- "st1 { v16.b }[14], [x27]\n"
- "st1 { v20.b }[14], [x25]\n"
- "st1 { v24.b }[14], [x23]\n"
- "st1 { v28.b }[14], [x21]\n"
- "b 166f\n"
- "159:" // Height 6: Partial direct writeback: partial_1_12
- "tbz x15, #0, 166f\n"
- "st1 { v8.b }[12], [x13]\n"
- "st1 { v12.b }[12], [x9]\n"
- "st1 { v16.b }[12], [x27]\n"
- "st1 { v20.b }[12], [x25]\n"
- "st1 { v24.b }[12], [x23]\n"
- "st1 { v28.b }[12], [x21]\n"
- "b 166f\n"
- "160:" // Height 6: Partial direct writeback: partial_2_8
- "tbz x15, #1, 161f\n"
- "st1 { v8.h }[4], [x13], #0x2\n"
- "st1 { v12.h }[4], [x9], #0x2\n"
- "st1 { v16.h }[4], [x27], #0x2\n"
- "st1 { v20.h }[4], [x25], #0x2\n"
- "st1 { v24.h }[4], [x23], #0x2\n"
- "st1 { v28.h }[4], [x21], #0x2\n"
- "tbz x15, #0, 166f\n"
- "st1 { v8.b }[10], [x13]\n"
- "st1 { v12.b }[10], [x9]\n"
- "st1 { v16.b }[10], [x27]\n"
- "st1 { v20.b }[10], [x25]\n"
- "st1 { v24.b }[10], [x23]\n"
- "st1 { v28.b }[10], [x21]\n"
- "b 166f\n"
- "161:" // Height 6: Partial direct writeback: partial_1_8
- "tbz x15, #0, 166f\n"
- "st1 { v8.b }[8], [x13]\n"
- "st1 { v12.b }[8], [x9]\n"
- "st1 { v16.b }[8], [x27]\n"
- "st1 { v20.b }[8], [x25]\n"
- "st1 { v24.b }[8], [x23]\n"
- "st1 { v28.b }[8], [x21]\n"
- "b 166f\n"
- "162:" // Height 6: Partial direct writeback: partial_4_0
- "tbz x15, #2, 164f\n"
- "str s8, [x13], #0x4\n"
- "str s12, [x9], #0x4\n"
- "str s16, [x27], #0x4\n"
- "str s20, [x25], #0x4\n"
- "str s24, [x23], #0x4\n"
- "str s28, [x21], #0x4\n"
- "tbz x15, #1, 163f\n"
- "st1 { v8.h }[2], [x13], #0x2\n"
- "st1 { v12.h }[2], [x9], #0x2\n"
- "st1 { v16.h }[2], [x27], #0x2\n"
- "st1 { v20.h }[2], [x25], #0x2\n"
- "st1 { v24.h }[2], [x23], #0x2\n"
- "st1 { v28.h }[2], [x21], #0x2\n"
- "tbz x15, #0, 166f\n"
- "st1 { v8.b }[6], [x13]\n"
- "st1 { v12.b }[6], [x9]\n"
- "st1 { v16.b }[6], [x27]\n"
- "st1 { v20.b }[6], [x25]\n"
- "st1 { v24.b }[6], [x23]\n"
- "st1 { v28.b }[6], [x21]\n"
- "b 166f\n"
- "163:" // Height 6: Partial direct writeback: partial_1_4
- "tbz x15, #0, 166f\n"
- "st1 { v8.b }[4], [x13]\n"
- "st1 { v12.b }[4], [x9]\n"
- "st1 { v16.b }[4], [x27]\n"
- "st1 { v20.b }[4], [x25]\n"
- "st1 { v24.b }[4], [x23]\n"
- "st1 { v28.b }[4], [x21]\n"
- "b 166f\n"
- "164:" // Height 6: Partial direct writeback: partial_2_0
- "tbz x15, #1, 165f\n"
- "str h8, [x13], #0x2\n"
- "str h12, [x9], #0x2\n"
- "str h16, [x27], #0x2\n"
- "str h20, [x25], #0x2\n"
- "str h24, [x23], #0x2\n"
- "str h28, [x21], #0x2\n"
- "tbz x15, #0, 166f\n"
- "st1 { v8.b }[2], [x13]\n"
- "st1 { v12.b }[2], [x9]\n"
- "st1 { v16.b }[2], [x27]\n"
- "st1 { v20.b }[2], [x25]\n"
- "st1 { v24.b }[2], [x23]\n"
- "st1 { v28.b }[2], [x21]\n"
- "b 166f\n"
- "165:" // Height 6: Partial direct writeback: partial_1_0
- "str b8, [x13, #0x0]\n"
- "str b12, [x9, #0x0]\n"
- "str b16, [x27, #0x0]\n"
- "str b20, [x25, #0x0]\n"
- "str b24, [x23, #0x0]\n"
- "str b28, [x21, #0x0]\n"
- "166:" // Height 6: Partial direct writeback: Done
- "b 168f\n"
- "167:" // Height 6: Full writeback
- "str q8, [x13, #0x0]\n"
- "str q12, [x9, #0x0]\n"
- "str q16, [x27, #0x0]\n"
- "str q20, [x25, #0x0]\n"
- "str q24, [x23, #0x0]\n"
- "str q28, [x21, #0x0]\n"
- "add x13, x13, #0x10\n"
+ "bge 161f\n"
+ "tbz x10, #3, 156f\n"
+ "str d8, [x9], #0x8\n"
+ "str d12, [x23], #0x8\n"
+ "str d16, [x22], #0x8\n"
+ "str d20, [x21], #0x8\n"
+ "str d24, [x20], #0x8\n"
+ "str d28, [x19], #0x8\n"
+ "tbz x10, #2, 154f\n"
+ "st1 { v8.s }[2], [x9], #0x4\n"
+ "st1 { v12.s }[2], [x23], #0x4\n"
+ "st1 { v16.s }[2], [x22], #0x4\n"
+ "st1 { v20.s }[2], [x21], #0x4\n"
+ "st1 { v24.s }[2], [x20], #0x4\n"
+ "st1 { v28.s }[2], [x19], #0x4\n"
+ "tbz x10, #1, 153f\n"
+ "st1 { v8.h }[6], [x9], #0x2\n"
+ "st1 { v12.h }[6], [x23], #0x2\n"
+ "st1 { v16.h }[6], [x22], #0x2\n"
+ "st1 { v20.h }[6], [x21], #0x2\n"
+ "st1 { v24.h }[6], [x20], #0x2\n"
+ "st1 { v28.h }[6], [x19], #0x2\n"
+ "tbz x10, #0, 160f\n"
+ "st1 { v8.b }[14], [x9]\n"
+ "st1 { v12.b }[14], [x23]\n"
+ "st1 { v16.b }[14], [x22]\n"
+ "st1 { v20.b }[14], [x21]\n"
+ "st1 { v24.b }[14], [x20]\n"
+ "st1 { v28.b }[14], [x19]\n"
+ "b 160f\n"
+ "153:" // Height 6: Partial direct writeback: partial_1_12
+ "tbz x10, #0, 160f\n"
+ "st1 { v8.b }[12], [x9]\n"
+ "st1 { v12.b }[12], [x23]\n"
+ "st1 { v16.b }[12], [x22]\n"
+ "st1 { v20.b }[12], [x21]\n"
+ "st1 { v24.b }[12], [x20]\n"
+ "st1 { v28.b }[12], [x19]\n"
+ "b 160f\n"
+ "154:" // Height 6: Partial direct writeback: partial_2_8
+ "tbz x10, #1, 155f\n"
+ "st1 { v8.h }[4], [x9], #0x2\n"
+ "st1 { v12.h }[4], [x23], #0x2\n"
+ "st1 { v16.h }[4], [x22], #0x2\n"
+ "st1 { v20.h }[4], [x21], #0x2\n"
+ "st1 { v24.h }[4], [x20], #0x2\n"
+ "st1 { v28.h }[4], [x19], #0x2\n"
+ "tbz x10, #0, 160f\n"
+ "st1 { v8.b }[10], [x9]\n"
+ "st1 { v12.b }[10], [x23]\n"
+ "st1 { v16.b }[10], [x22]\n"
+ "st1 { v20.b }[10], [x21]\n"
+ "st1 { v24.b }[10], [x20]\n"
+ "st1 { v28.b }[10], [x19]\n"
+ "b 160f\n"
+ "155:" // Height 6: Partial direct writeback: partial_1_8
+ "tbz x10, #0, 160f\n"
+ "st1 { v8.b }[8], [x9]\n"
+ "st1 { v12.b }[8], [x23]\n"
+ "st1 { v16.b }[8], [x22]\n"
+ "st1 { v20.b }[8], [x21]\n"
+ "st1 { v24.b }[8], [x20]\n"
+ "st1 { v28.b }[8], [x19]\n"
+ "b 160f\n"
+ "156:" // Height 6: Partial direct writeback: partial_4_0
+ "tbz x10, #2, 158f\n"
+ "str s8, [x9], #0x4\n"
+ "str s12, [x23], #0x4\n"
+ "str s16, [x22], #0x4\n"
+ "str s20, [x21], #0x4\n"
+ "str s24, [x20], #0x4\n"
+ "str s28, [x19], #0x4\n"
+ "tbz x10, #1, 157f\n"
+ "st1 { v8.h }[2], [x9], #0x2\n"
+ "st1 { v12.h }[2], [x23], #0x2\n"
+ "st1 { v16.h }[2], [x22], #0x2\n"
+ "st1 { v20.h }[2], [x21], #0x2\n"
+ "st1 { v24.h }[2], [x20], #0x2\n"
+ "st1 { v28.h }[2], [x19], #0x2\n"
+ "tbz x10, #0, 160f\n"
+ "st1 { v8.b }[6], [x9]\n"
+ "st1 { v12.b }[6], [x23]\n"
+ "st1 { v16.b }[6], [x22]\n"
+ "st1 { v20.b }[6], [x21]\n"
+ "st1 { v24.b }[6], [x20]\n"
+ "st1 { v28.b }[6], [x19]\n"
+ "b 160f\n"
+ "157:" // Height 6: Partial direct writeback: partial_1_4
+ "tbz x10, #0, 160f\n"
+ "st1 { v8.b }[4], [x9]\n"
+ "st1 { v12.b }[4], [x23]\n"
+ "st1 { v16.b }[4], [x22]\n"
+ "st1 { v20.b }[4], [x21]\n"
+ "st1 { v24.b }[4], [x20]\n"
+ "st1 { v28.b }[4], [x19]\n"
+ "b 160f\n"
+ "158:" // Height 6: Partial direct writeback: partial_2_0
+ "tbz x10, #1, 159f\n"
+ "str h8, [x9], #0x2\n"
+ "str h12, [x23], #0x2\n"
+ "str h16, [x22], #0x2\n"
+ "str h20, [x21], #0x2\n"
+ "str h24, [x20], #0x2\n"
+ "str h28, [x19], #0x2\n"
+ "tbz x10, #0, 160f\n"
+ "st1 { v8.b }[2], [x9]\n"
+ "st1 { v12.b }[2], [x23]\n"
+ "st1 { v16.b }[2], [x22]\n"
+ "st1 { v20.b }[2], [x21]\n"
+ "st1 { v24.b }[2], [x20]\n"
+ "st1 { v28.b }[2], [x19]\n"
+ "b 160f\n"
+ "159:" // Height 6: Partial direct writeback: partial_1_0
+ "str b8, [x9, #0x0]\n"
+ "str b12, [x23, #0x0]\n"
+ "str b16, [x22, #0x0]\n"
+ "str b20, [x21, #0x0]\n"
+ "str b24, [x20, #0x0]\n"
+ "str b28, [x19, #0x0]\n"
+ "160:" // Height 6: Partial direct writeback: Done
+ "b 162f\n"
+ "161:" // Height 6: Full writeback
+ "str q8, [x9, #0x0]\n"
"add x9, x9, #0x10\n"
- "add x27, x27, #0x10\n"
- "add x25, x25, #0x10\n"
- "add x23, x23, #0x10\n"
- "add x21, x21, #0x10\n"
- "168:" // Height 6: Writeback done
- "subs x15, x15, #0x10\n"
- "bgt 143b\n"
+ "str q12, [x23, #0x0]\n"
+ "str q16, [x22, #0x0]\n"
+ "str q20, [x21, #0x0]\n"
+ "str q24, [x20, #0x0]\n"
+ "str q28, [x19, #0x0]\n"
+ "162:" // Height 6: Writeback done
+ "subs x10, x10, #0x10\n"
+ "bgt 137b\n"
"subs %x[M], %x[M], #0x6\n"
- "beq 170f\n"
+ "beq 164f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "tbz %x[flags], #3, 169f\n"
+ "tbz %x[flags], #3, 163f\n"
"add x20, x20, #0x6\n"
"str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
"b 1b\n"
- "169:" // Update direct input
+ "163:" // Update direct input
"mov x19, #0x6\n"
"madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
"b 1b\n"
- "170:" // Exit
+ "164:" // Exit
: [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [flags] "r" (flags), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_multiplier_ptr] "I" (offsetof(KernelArgs, multiplier_ptr)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_shift_ptr] "I" (offsetof(KernelArgs, shift_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16.hpp
index 6b3f84064c..759a78a413 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16.hpp
@@ -37,9 +37,9 @@
namespace arm_gemm
{
-
// Actual kernel implementations
void a64_hybrid_s8s32_dot_6x16( ARGLIST );
+void a64_hybrid_s8s32_dot_6x16_a55( ARGLIST );
class cls_a64_hybrid_s8s32_dot_6x16
{
@@ -72,10 +72,11 @@ public:
StdTransformsFixed<operand_type, result_type, 6, 16, 4> transforms = {};
- static PerformanceParameters get_performance_parameters(const CPUInfo *ci) {
+ static PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
switch (ci->get_cpu_model()) {
case CPUModel::A55r1:
- return { 9.5238, 2.0799, 0.2279 };
+ return { 12.667, 2.0799, 0.2279 };
default:
return { 29.6736, 11.4025, 0.5591 };
}
@@ -83,9 +84,15 @@ public:
// Default to the generic kernel
kern_type kernel=a64_hybrid_s8s32_dot_6x16;
-
- cls_a64_hybrid_s8s32_dot_6x16(const CPUInfo *)
+ cls_a64_hybrid_s8s32_dot_6x16(const CPUInfo *ci) // Constructor: chooses which kernel implementation `kernel` points at, based on the CPU model reported by ci
{
+ switch(ci->get_cpu_model()) { // Dispatch on the CPU model
+ default: // Every other model: leave `kernel` at its in-class default, the generic a64_hybrid_s8s32_dot_6x16
+ break;
+ case CPUModel::A55r1: // A55r1: switch to the dedicated _a55 variant declared alongside the generic kernel
+ kernel=a64_hybrid_s8s32_dot_6x16_a55;
+ break;
+ }
}
};
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/a55.cpp
new file mode 100644
index 0000000000..6b2be0a33a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/a55.cpp
@@ -0,0 +1,3499 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+
+namespace arm_gemm {
+
+void a64_hybrid_s8s32_dot_6x16_a55 (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<int8_t> A_arg,
+ size_t M, size_t N, const int8_t *B_ptr, IndirectOutputArg<int32_t> output_arg,
+ const int32_t *, Activation, bool accumulate
+)
+{
+ struct KernelArgs {
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const int8_t *B_ptr = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ if (accumulate) {
+ flags |= 0x1;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ __asm__ __volatile__(
+
+ "1:" // Row loop
+ "cmp %x[M], #0x6\n"
+ "bge 171f\n"
+ "cmp %x[M], #0x4\n"
+ "bgt 137f\n"
+ "beq 103f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 69f\n"
+ "beq 35f\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x13, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x12, %x[output_ptr]\n"
+ "2:" // Height 1: Column loop
+ "tbz %x[flags], #0, 12f\n"
+ "cmp x14, #0x10\n"
+ "bge 11f\n"
+ "tbz x14, #3, 6f\n"
+ "ld1 { v8.4s }, [x12], #0x10\n"
+ "ld1 { v9.4s }, [x12], #0x10\n"
+ "tbz x14, #2, 4f\n"
+ "ld1 { v10.4s }, [x12], #0x10\n"
+ "tbz x14, #1, 3f\n"
+ "mov x24, #0x38\n"
+ "ldr d11, [x12], #0x8\n"
+ "tbz x14, #0, 10f\n"
+ "ld1 { v11.s }[2], [x12]\n"
+ "b 10f\n"
+ "3:" // Height 1: Partial accumulate: partial_1_12
+ "mov x24, #0x30\n"
+ "tbz x14, #0, 10f\n"
+ "ldr s11, [x12, #0x0]\n"
+ "b 10f\n"
+ "4:" // Height 1: Partial accumulate: partial_2_8
+ "tbz x14, #1, 5f\n"
+ "ldr d10, [x12], #0x8\n"
+ "mov x24, #0x28\n"
+ "tbz x14, #0, 10f\n"
+ "ld1 { v10.s }[2], [x12]\n"
+ "b 10f\n"
+ "5:" // Height 1: Partial accumulate: partial_1_8
+ "mov x24, #0x20\n"
+ "tbz x14, #0, 10f\n"
+ "ldr s10, [x12, #0x0]\n"
+ "b 10f\n"
+ "6:" // Height 1: Partial accumulate: partial_4_0
+ "tbz x14, #2, 8f\n"
+ "ld1 { v8.4s }, [x12], #0x10\n"
+ "tbz x14, #1, 7f\n"
+ "mov x24, #0x18\n"
+ "ldr d9, [x12], #0x8\n"
+ "tbz x14, #0, 10f\n"
+ "ld1 { v9.s }[2], [x12]\n"
+ "b 10f\n"
+ "7:" // Height 1: Partial accumulate: partial_1_4
+ "mov x24, #0x10\n"
+ "tbz x14, #0, 10f\n"
+ "ldr s9, [x12, #0x0]\n"
+ "b 10f\n"
+ "8:" // Height 1: Partial accumulate: partial_2_0
+ "tbz x14, #1, 9f\n"
+ "ldr d8, [x12], #0x8\n"
+ "mov x24, #0x8\n"
+ "tbz x14, #0, 10f\n"
+ "ld1 { v8.s }[2], [x12]\n"
+ "b 10f\n"
+ "9:" // Height 1: Partial accumulate: partial_1_0
+ "ldr s8, [x12, #0x0]\n"
+ "mov x24, #0x0\n"
+ "10:" // Height 1: Partial accumulate: Done
+ "sub x12, x12, x24\n"
+ "b 13f\n"
+ "11:" // Height 1: full accumulate
+ "ldr q8, [x12, #0x0]\n"
+ "ldr q9, [x12, #0x10]\n"
+ "ldr q10, [x12, #0x20]\n"
+ "ldr q11, [x12, #0x30]\n"
+ "b 13f\n"
+ "12:" // Height 1: no accumulate
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "13:" // Height 1: setup done
+ "mov x11, #0x0\n"
+ "14:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w10, [x20, x11, LSL #0x2]\n"
+ "tbz %x[flags], #3, 15f\n"
+ "ldr x20, [%x[input_ptr], x11, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x9, [x20, #0x0]\n"
+ "cbnz x11, 16f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x9, x9, x19\n"
+ "b 16f\n"
+ "15:" // Height 1: setup direct input
+ "mov x9, %x[input_ptr]\n"
+ "16:" // Height 1: input setup done
+ "cmp x10, #0x10\n"
+ "blt 19f\n"
+ "ldr q0, [x9, #0x0]\n"
+ "ldr q6, [x13, #0x0]\n"
+ "cmp x10, #0x20\n"
+ "blt 18f\n"
+ "17:" // Height 1: Multiply loop: Main loop head
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr d7, [x13, #0x10]\n"
+ "ldr x19, [x13, #0x18]\n"
+ "add x9, x9, #0x10\n"
+ "ldr d6, [x13, #0x20]\n"
+ "sub x10, x10, #0x10\n"
+ "ldr x28, [x13, #0x28]\n"
+ "cmp x10, #0x20\n"
+ "mov v7.d[1], x19\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
+ "ldr x19, [x13, #0x38]\n"
+ "mov v6.d[1], x28\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ "ldr d7, [x13, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "ldr d6, [x13, #0x40]\n"
+ "ldr x28, [x13, #0x48]\n"
+ "mov v7.d[1], x19\n"
+ "ldr x19, [x13, #0x58]\n"
+ "ldr x27, [x9, #0x8]\n"
+ "mov v6.d[1], x28\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ "ldr d7, [x13, #0x50]\n"
+ ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
+ "ldr d6, [x13, #0x60]\n"
+ "ldr x28, [x13, #0x68]\n"
+ "mov v7.d[1], x19\n"
+ "ldr x19, [x13, #0x78]\n"
+ "mov v6.d[1], x28\n"
+ ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
+ "ldr d7, [x13, #0x70]\n"
+ ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
+ "ldr d6, [x13, #0x80]\n"
+ "ldr x28, [x13, #0x88]\n"
+ "mov v7.d[1], x19\n"
+ "ldr x19, [x13, #0x98]\n"
+ "mov v6.d[1], x28\n"
+ ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
+ "ldr d7, [x13, #0x90]\n"
+ ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
+ "ldr d6, [x13, #0xa0]\n"
+ "ldr x28, [x13, #0xa8]\n"
+ "mov v7.d[1], x19\n"
+ "ldr x19, [x13, #0xb8]\n"
+ "mov v6.d[1], x28\n"
+ ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
+ "ldr d7, [x13, #0xb0]\n"
+ ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
+ "ldr d6, [x13, #0xc0]\n"
+ "ldr x28, [x13, #0xc8]\n"
+ "mov v7.d[1], x19\n"
+ "ldr x19, [x13, #0xd8]\n"
+ "mov v6.d[1], x28\n"
+ ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
+ "ldr d7, [x13, #0xd0]\n"
+ ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
+ "ldr d6, [x13, #0xe0]\n"
+ "ldr x28, [x13, #0xe8]\n"
+ "mov v7.d[1], x19\n"
+ "ldr x19, [x13, #0xf8]\n"
+ "mov v6.d[1], x28\n"
+ ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
+ "ldr d7, [x13, #0xf0]\n"
+ "add x13, x13, #0x100\n"
+ ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ "ldr d6, [x13, #0x0]\n"
+ "ldr x28, [x13, #0x8]\n"
+ "mov v7.d[1], x19\n"
+ "mov v6.d[1], x28\n"
+ ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ "ldr d0, [x9, #0x0]\n"
+ "mov v0.d[1], x27\n"
+ "bge 17b\n"
+ "18:" // Height 1: Multiply loop: Single iteration only
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x13, #0x10]\n"
+ "ldr q6, [x13, #0x20]\n"
+ "sub x10, x10, #0x10\n"
+ "add x9, x9, #0x10\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x13, #0x30]\n"
+ "ldr q6, [x13, #0x40]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ "ldr q7, [x13, #0x50]\n"
+ ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
+ "ldr q6, [x13, #0x60]\n"
+ ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
+ "ldr q7, [x13, #0x70]\n"
+ ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
+ "ldr q6, [x13, #0x80]\n"
+ ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
+ "ldr q7, [x13, #0x90]\n"
+ ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
+ "ldr q6, [x13, #0xa0]\n"
+ ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
+ "ldr q7, [x13, #0xb0]\n"
+ ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
+ "ldr q6, [x13, #0xc0]\n"
+ ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
+ "ldr q7, [x13, #0xd0]\n"
+ ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
+ "ldr q6, [x13, #0xe0]\n"
+ ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
+ "ldr q7, [x13, #0xf0]\n"
+ ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ "add x13, x13, #0x100\n"
+ ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ "19:" // Height 1: Multiply loop: Main loop skip
+ "cbz x10, 24f\n"
+ "cmp x10, #0x4\n"
+ "blt 21f\n"
+ "20:" // Height 1: Multiply loop: Odd block loop
+ "ldr s0, [x9], #0x4\n"
+ "sub x10, x10, #0x4\n"
+ "ldr q6, [x13, #0x0]\n"
+ "cmp x10, #0x4\n"
+ "ldr q7, [x13, #0x10]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q6, [x13, #0x20]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ "ldr q7, [x13, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ "bge 20b\n"
+ "cbz x10, 24f\n"
+ "21:" // Height 1: Multiply loop: Skip odd blocks
+ "tbz x10, #1, 22f\n"
+ "ldr h0, [x9], #0x2\n"
+ "tbz x10, #0, 23f\n"
+ "ld1 { v0.b }[2], [x9]\n"
+ "b 23f\n"
+ "22:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x9, #0x0]\n"
+ "23:" // Height 1: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x13, #0x0]\n"
+ "ldr q7, [x13, #0x10]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q6, [x13, #0x20]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ "ldr q7, [x13, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ "24:" // Height 1: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x11, x11, #0x1\n"
+ "cmp x11, x19\n"
+ "bne 14b\n"
+ "prfm pstl1keep, [x12, #0x0]\n"
+ "cmp x14, #0x10\n"
+ "bge 33f\n"
+ "tbz x14, #3, 28f\n"
+ "st1 { v8.4s }, [x12], #0x10\n"
+ "st1 { v9.4s }, [x12], #0x10\n"
+ "tbz x14, #2, 26f\n"
+ "st1 { v10.4s }, [x12], #0x10\n"
+ "tbz x14, #1, 25f\n"
+ "str d11, [x12], #0x8\n"
+ "tbz x14, #0, 32f\n"
+ "st1 { v11.s }[2], [x12]\n"
+ "b 32f\n"
+ "25:" // Height 1: Partial direct writeback: partial_1_12
+ "tbz x14, #0, 32f\n"
+ "str s11, [x12, #0x0]\n"
+ "b 32f\n"
+ "26:" // Height 1: Partial direct writeback: partial_2_8
+ "tbz x14, #1, 27f\n"
+ "str d10, [x12], #0x8\n"
+ "tbz x14, #0, 32f\n"
+ "st1 { v10.s }[2], [x12]\n"
+ "b 32f\n"
+ "27:" // Height 1: Partial direct writeback: partial_1_8
+ "tbz x14, #0, 32f\n"
+ "str s10, [x12, #0x0]\n"
+ "b 32f\n"
+ "28:" // Height 1: Partial direct writeback: partial_4_0
+ "tbz x14, #2, 30f\n"
+ "st1 { v8.4s }, [x12], #0x10\n"
+ "tbz x14, #1, 29f\n"
+ "str d9, [x12], #0x8\n"
+ "tbz x14, #0, 32f\n"
+ "st1 { v9.s }[2], [x12]\n"
+ "b 32f\n"
+ "29:" // Height 1: Partial direct writeback: partial_1_4
+ "tbz x14, #0, 32f\n"
+ "str s9, [x12, #0x0]\n"
+ "b 32f\n"
+ "30:" // Height 1: Partial direct writeback: partial_2_0
+ "tbz x14, #1, 31f\n"
+ "str d8, [x12], #0x8\n"
+ "tbz x14, #0, 32f\n"
+ "st1 { v8.s }[2], [x12]\n"
+ "b 32f\n"
+ "31:" // Height 1: Partial direct writeback: partial_1_0
+ "str s8, [x12, #0x0]\n"
+ "32:" // Height 1: Partial direct writeback: Done
+ "b 34f\n"
+ "33:" // Height 1: Full writeback
+ "str q8, [x12, #0x0]\n"
+ "str q9, [x12, #0x10]\n"
+ "str q10, [x12, #0x20]\n"
+ "str q11, [x12, #0x30]\n"
+ "add x12, x12, #0x40\n"
+ "34:" // Height 1: Writeback done
+ "subs x14, x14, #0x10\n"
+ "bgt 2b\n"
+ "b 206f\n"
+ "35:" // Height 2
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x12, %x[output_ptr]\n"
+ "ldr x13, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "36:" // Height 2: Column loop
+ "tbz %x[flags], #0, 46f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x14, #0x10\n"
+ "add x23, x12, x19, LSL #2\n"
+ "bge 45f\n"
+ "tbz x14, #3, 40f\n"
+ "ld1 { v8.4s }, [x12], #0x10\n"
+ "ld1 { v12.4s }, [x23], #0x10\n"
+ "ld1 { v9.4s }, [x12], #0x10\n"
+ "ld1 { v13.4s }, [x23], #0x10\n"
+ "tbz x14, #2, 38f\n"
+ "ld1 { v10.4s }, [x12], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "tbz x14, #1, 37f\n"
+ "mov x24, #0x38\n"
+ "ldr d11, [x12], #0x8\n"
+ "ldr d15, [x23], #0x8\n"
+ "tbz x14, #0, 44f\n"
+ "ld1 { v11.s }[2], [x12]\n"
+ "ld1 { v15.s }[2], [x23]\n"
+ "b 44f\n"
+ "37:" // Height 2: Partial accumulate: partial_1_12
+ "mov x24, #0x30\n"
+ "tbz x14, #0, 44f\n"
+ "ldr s11, [x12, #0x0]\n"
+ "ldr s15, [x23, #0x0]\n"
+ "b 44f\n"
+ "38:" // Height 2: Partial accumulate: partial_2_8
+ "tbz x14, #1, 39f\n"
+ "ldr d10, [x12], #0x8\n"
+ "ldr d14, [x23], #0x8\n"
+ "mov x24, #0x28\n"
+ "tbz x14, #0, 44f\n"
+ "ld1 { v10.s }[2], [x12]\n"
+ "ld1 { v14.s }[2], [x23]\n"
+ "b 44f\n"
+ "39:" // Height 2: Partial accumulate: partial_1_8
+ "mov x24, #0x20\n"
+ "tbz x14, #0, 44f\n"
+ "ldr s10, [x12, #0x0]\n"
+ "ldr s14, [x23, #0x0]\n"
+ "b 44f\n"
+ "40:" // Height 2: Partial accumulate: partial_4_0
+ "tbz x14, #2, 42f\n"
+ "ld1 { v8.4s }, [x12], #0x10\n"
+ "ld1 { v12.4s }, [x23], #0x10\n"
+ "tbz x14, #1, 41f\n"
+ "mov x24, #0x18\n"
+ "ldr d9, [x12], #0x8\n"
+ "ldr d13, [x23], #0x8\n"
+ "tbz x14, #0, 44f\n"
+ "ld1 { v9.s }[2], [x12]\n"
+ "ld1 { v13.s }[2], [x23]\n"
+ "b 44f\n"
+ "41:" // Height 2: Partial accumulate: partial_1_4
+ "mov x24, #0x10\n"
+ "tbz x14, #0, 44f\n"
+ "ldr s9, [x12, #0x0]\n"
+ "ldr s13, [x23, #0x0]\n"
+ "b 44f\n"
+ "42:" // Height 2: Partial accumulate: partial_2_0
+ "tbz x14, #1, 43f\n"
+ "ldr d8, [x12], #0x8\n"
+ "ldr d12, [x23], #0x8\n"
+ "mov x24, #0x8\n"
+ "tbz x14, #0, 44f\n"
+ "ld1 { v8.s }[2], [x12]\n"
+ "ld1 { v12.s }[2], [x23]\n"
+ "b 44f\n"
+ "43:" // Height 2: Partial accumulate: partial_1_0
+ "ldr s8, [x12, #0x0]\n"
+ "mov x24, #0x0\n"
+ "ldr s12, [x23, #0x0]\n"
+ "44:" // Height 2: Partial accumulate: Done
+ "sub x12, x12, x24\n"
+ "b 47f\n"
+ "45:" // Height 2: full accumulate
+ "ldr q8, [x12, #0x0]\n"
+ "ldr q9, [x12, #0x10]\n"
+ "ldr q10, [x12, #0x20]\n"
+ "ldr q11, [x12, #0x30]\n"
+ "ldr q12, [x23, #0x0]\n"
+ "ldr q13, [x23, #0x10]\n"
+ "ldr q14, [x23, #0x20]\n"
+ "ldr q15, [x23, #0x30]\n"
+ "b 47f\n"
+ "46:" // Height 2: no accumulate
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "47:" // Height 2: setup done
+ "mov x11, #0x0\n"
+ "48:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w10, [x20, x11, LSL #0x2]\n"
+ "tbz %x[flags], #3, 49f\n"
+ "ldr x20, [%x[input_ptr], x11, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x9, [x20, #0x0]\n"
+ "ldr x26, [x20, #0x8]\n"
+ "cbnz x11, 50f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x9, x9, x19\n"
+ "add x26, x26, x19\n"
+ "b 50f\n"
+ "49:" // Height 2: setup direct input
+ "mov x9, %x[input_ptr]\n"
+ "add x26, x9, x19\n"
+ "50:" // Height 2: input setup done
+ "cmp x10, #0x10\n"
+ "blt 53f\n"
+ "ldr q0, [x9, #0x0]\n"
+ "ldr q1, [x26, #0x0]\n"
+ "cmp x10, #0x20\n"
+ "ldr q6, [x13, #0x0]\n"
+ "blt 52f\n"
+ "51:" // Height 2: Multiply loop: Main loop head
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ "ldr d7, [x13, #0x10]\n"
+ "add x9, x9, #0x10\n"
+ "ldr x19, [x13, #0x18]\n"
+ "add x26, x26, #0x10\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
+ "sub x10, x10, #0x10\n"
+ "ldr d6, [x13, #0x20]\n"
+ "cmp x10, #0x20\n"
+ "mov v7.d[1], x19\n"
+ "ldr x28, [x13, #0x28]\n"
+ "ldr x19, [x13, #0x38]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ "mov v6.d[1], x28\n"
+ "ldr d7, [x13, #0x30]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ "mov v7.d[1], x19\n"
+ "ldr d6, [x13, #0x40]\n"
+ "ldr x28, [x13, #0x48]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ "ldr d7, [x13, #0x50]\n"
+ "mov v6.d[1], x28\n"
+ "ldr x19, [x13, #0x58]\n"
+ "ldr x28, [x13, #0x68]\n"
+ ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
+ "mov v7.d[1], x19\n"
+ "ldr d6, [x13, #0x60]\n"
+ "ldr x19, [x13, #0x78]\n"
+ ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
+ "mov v6.d[1], x28\n"
+ "ldr d7, [x13, #0x70]\n"
+ "ldr x28, [x13, #0x88]\n"
+ ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
+ "mov v7.d[1], x19\n"
+ "ldr d6, [x13, #0x80]\n"
+ "ldr x19, [x13, #0x98]\n"
+ ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
+ "mov v6.d[1], x28\n"
+ "ldr d7, [x13, #0x90]\n"
+ "ldr x28, [x13, #0xa8]\n"
+ ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
+ "mov v7.d[1], x19\n"
+ "ldr d6, [x13, #0xa0]\n"
+ "ldr x19, [x13, #0xb8]\n"
+ ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
+ "mov v6.d[1], x28\n"
+ "ldr d7, [x13, #0xb0]\n"
+ "ldr x28, [x13, #0xc8]\n"
+ ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
+ "mov v7.d[1], x19\n"
+ "ldr d6, [x13, #0xc0]\n"
+ "ldr x19, [x13, #0xd8]\n"
+ ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
+ "mov v6.d[1], x28\n"
+ "ldr d7, [x13, #0xd0]\n"
+ "ldr x28, [x13, #0xe8]\n"
+ ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
+ "mov v7.d[1], x19\n"
+ "ldr d6, [x13, #0xe0]\n"
+ "ldr x19, [x13, #0xf8]\n"
+ ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
+ "mov v6.d[1], x28\n"
+ "ldr d7, [x13, #0xf0]\n"
+ "ldr x27, [x9, #0x8]\n"
+ "add x13, x13, #0x100\n"
+ ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
+ "mov v7.d[1], x19\n"
+ "ldr x25, [x26, #0x8]\n"
+ "ldr d6, [x13, #0x0]\n"
+ ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
+ "ldr d0, [x9, #0x0]\n"
+ "ldr d1, [x26, #0x0]\n"
+ "ldr x28, [x13, #0x8]\n"
+ "mov v0.d[1], x27\n"
+ "mov v1.d[1], x25\n"
+ "mov v6.d[1], x28\n"
+ "bge 51b\n"
+ "52:" // Height 2: Multiply loop: Single iteration only
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ "ldr q7, [x13, #0x10]\n"
+ "sub x10, x10, #0x10\n"
+ "ldr q6, [x13, #0x20]\n"
+ "add x9, x9, #0x10\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ "ldr q7, [x13, #0x30]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "ldr q6, [x13, #0x40]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ "ldr q7, [x13, #0x50]\n"
+ ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
+ "ldr q6, [x13, #0x60]\n"
+ ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
+ "ldr q7, [x13, #0x70]\n"
+ ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
+ "ldr q6, [x13, #0x80]\n"
+ ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
+ "ldr q7, [x13, #0x90]\n"
+ ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
+ "ldr q6, [x13, #0xa0]\n"
+ ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
+ "ldr q7, [x13, #0xb0]\n"
+ ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
+ "ldr q6, [x13, #0xc0]\n"
+ ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
+ "ldr q7, [x13, #0xd0]\n"
+ ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
+ "ldr q6, [x13, #0xe0]\n"
+ ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
+ "ldr q7, [x13, #0xf0]\n"
+ "add x13, x13, #0x100\n"
+ ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
+ "53:" // Height 2: Multiply loop: Main loop skip
+ "cbz x10, 58f\n"
+ "cmp x10, #0x4\n"
+ "blt 55f\n"
+ "54:" // Height 2: Multiply loop: Odd block loop
+ "ldr s0, [x9], #0x4\n"
+ "sub x10, x10, #0x4\n"
+ "ldr s1, [x26], #0x4\n"
+ "cmp x10, #0x4\n"
+ "ldr q6, [x13, #0x0]\n"
+ "ldr q7, [x13, #0x10]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ "ldr q6, [x13, #0x20]\n"
+ "ldr q7, [x13, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ "bge 54b\n"
+ "cbz x10, 58f\n"
+ "55:" // Height 2: Multiply loop: Skip odd blocks
+ "tbz x10, #1, 56f\n"
+ "ldr h0, [x9], #0x2\n"
+ "ldr h1, [x26], #0x2\n"
+ "tbz x10, #0, 57f\n"
+ "ld1 { v0.b }[2], [x9]\n"
+ "ld1 { v1.b }[2], [x26]\n"
+ "b 57f\n"
+ "56:" // Height 2: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x9, #0x0]\n"
+ "ldr b1, [x26, #0x0]\n"
+ "57:" // Height 2: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x13, #0x0]\n"
+ "ldr q7, [x13, #0x10]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ "ldr q6, [x13, #0x20]\n"
+ "ldr q7, [x13, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ "58:" // Height 2: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x11, x11, #0x1\n"
+ "cmp x11, x19\n"
+ "bne 48b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "prfm pstl1keep, [x12, #0x0]\n"
+ "cmp x14, #0x10\n"
+ "add x23, x12, x19, LSL #2\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "bge 67f\n"
+ "tbz x14, #3, 62f\n"
+ "st1 { v8.4s }, [x12], #0x10\n"
+ "st1 { v9.4s }, [x12], #0x10\n"
+ "st1 { v12.4s }, [x23], #0x10\n"
+ "st1 { v13.4s }, [x23], #0x10\n"
+ "tbz x14, #2, 60f\n"
+ "st1 { v10.4s }, [x12], #0x10\n"
+ "st1 { v14.4s }, [x23], #0x10\n"
+ "tbz x14, #1, 59f\n"
+ "str d11, [x12], #0x8\n"
+ "str d15, [x23], #0x8\n"
+ "tbz x14, #0, 66f\n"
+ "st1 { v11.s }[2], [x12]\n"
+ "st1 { v15.s }[2], [x23]\n"
+ "b 66f\n"
+ "59:" // Height 2: Partial direct writeback: partial_1_12
+ "tbz x14, #0, 66f\n"
+ "str s11, [x12, #0x0]\n"
+ "str s15, [x23, #0x0]\n"
+ "b 66f\n"
+ "60:" // Height 2: Partial direct writeback: partial_2_8
+ "tbz x14, #1, 61f\n"
+ "str d10, [x12], #0x8\n"
+ "str d14, [x23], #0x8\n"
+ "tbz x14, #0, 66f\n"
+ "st1 { v10.s }[2], [x12]\n"
+ "st1 { v14.s }[2], [x23]\n"
+ "b 66f\n"
+ "61:" // Height 2: Partial direct writeback: partial_1_8
+ "tbz x14, #0, 66f\n"
+ "str s10, [x12, #0x0]\n"
+ "str s14, [x23, #0x0]\n"
+ "b 66f\n"
+ "62:" // Height 2: Partial direct writeback: partial_4_0
+ "tbz x14, #2, 64f\n"
+ "st1 { v8.4s }, [x12], #0x10\n"
+ "st1 { v12.4s }, [x23], #0x10\n"
+ "tbz x14, #1, 63f\n"
+ "str d9, [x12], #0x8\n"
+ "str d13, [x23], #0x8\n"
+ "tbz x14, #0, 66f\n"
+ "st1 { v9.s }[2], [x12]\n"
+ "st1 { v13.s }[2], [x23]\n"
+ "b 66f\n"
+ "63:" // Height 2: Partial direct writeback: partial_1_4
+ "tbz x14, #0, 66f\n"
+ "str s9, [x12, #0x0]\n"
+ "str s13, [x23, #0x0]\n"
+ "b 66f\n"
+ "64:" // Height 2: Partial direct writeback: partial_2_0
+ "tbz x14, #1, 65f\n"
+ "str d8, [x12], #0x8\n"
+ "str d12, [x23], #0x8\n"
+ "tbz x14, #0, 66f\n"
+ "st1 { v8.s }[2], [x12]\n"
+ "st1 { v12.s }[2], [x23]\n"
+ "b 66f\n"
+ "65:" // Height 2: Partial direct writeback: partial_1_0
+ "str s8, [x12, #0x0]\n"
+ "str s12, [x23, #0x0]\n"
+ "66:" // Height 2: Partial direct writeback: Done
+ "b 68f\n"
+ "67:" // Height 2: Full writeback
+ "str q8, [x12, #0x0]\n"
+ "str q9, [x12, #0x10]\n"
+ "str q10, [x12, #0x20]\n"
+ "str q11, [x12, #0x30]\n"
+ "add x12, x12, #0x40\n"
+ "str q12, [x23, #0x0]\n"
+ "str q13, [x23, #0x10]\n"
+ "str q14, [x23, #0x20]\n"
+ "str q15, [x23, #0x30]\n"
+ "68:" // Height 2: Writeback done
+ "subs x14, x14, #0x10\n"
+ "bgt 36b\n"
+ "b 206f\n"
+ "69:" // Height 3: entry for the variant that keeps three output rows in v8-v11, v12-v15 and v16-v19
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n" // x14 = N from the args struct; counted down by 16 per column-loop pass
+ "mov x12, %x[output_ptr]\n" // x12 = output pointer for the first row
+ "ldr x13, [%x[args_ptr], %[offsetof_B_ptr]]\n" // x13 = B_ptr from the args struct; advanced as B data is consumed
+ "70:" // Height 3: Column loop
+ "tbz %x[flags], #0, 80f\n" // flags bit 0 clear: start from zeroed accumulators (label 80) instead of loading them
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" // x19 = output_offset; scaled by 4 (LSL #2) below
+ "cmp x14, #0x10\n"
+ "add x23, x12, x19, LSL #2\n" // x23 = second row pointer = x12 + output_offset * 4
+ "add x22, x23, x19, LSL #2\n" // x22 = third row pointer = x23 + output_offset * 4
+ "bge 79f\n" // x14 >= 16: full-width load at 79; otherwise fall through to the partial load
+ "tbz x14, #3, 74f\n" // Partial accumulate: fewer than 16 elements remain; bits 3..0 of x14 select groups of 8/4/2/1 32-bit elements to load per row
+ "ld1 { v8.4s }, [x12], #0x10\n"
+ "ld1 { v12.4s }, [x23], #0x10\n"
+ "ld1 { v16.4s }, [x22], #0x10\n"
+ "ld1 { v9.4s }, [x12], #0x10\n"
+ "ld1 { v13.4s }, [x23], #0x10\n"
+ "ld1 { v17.4s }, [x22], #0x10\n"
+ "tbz x14, #2, 72f\n"
+ "ld1 { v10.4s }, [x12], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "ld1 { v18.4s }, [x22], #0x10\n"
+ "tbz x14, #1, 71f\n"
+ "ldr d11, [x12], #0x8\n"
+ "mov x24, #0x38\n" // x24 = total bytes x12 has been post-incremented by on this path (3 * 0x10 + 0x8); undone at label 78
+ "ldr d15, [x23], #0x8\n"
+ "ldr d19, [x22], #0x8\n"
+ "tbz x14, #0, 78f\n"
+ "ld1 { v11.s }[2], [x12]\n" // odd final element goes into lane 2, after the two loaded by the d-register load
+ "ld1 { v15.s }[2], [x23]\n"
+ "ld1 { v19.s }[2], [x22]\n"
+ "b 78f\n"
+ "71:" // Height 3: Partial accumulate: partial_1_12
+ "mov x24, #0x30\n" // 12 elements (0x30 bytes) already consumed from x12
+ "tbz x14, #0, 78f\n"
+ "ldr s11, [x12, #0x0]\n"
+ "ldr s15, [x23, #0x0]\n"
+ "ldr s19, [x22, #0x0]\n"
+ "b 78f\n"
+ "72:" // Height 3: Partial accumulate: partial_2_8
+ "tbz x14, #1, 73f\n"
+ "ldr d10, [x12], #0x8\n"
+ "ldr d14, [x23], #0x8\n"
+ "mov x24, #0x28\n" // 2 * 0x10 + 0x8 bytes consumed from x12
+ "ldr d18, [x22], #0x8\n"
+ "tbz x14, #0, 78f\n"
+ "ld1 { v10.s }[2], [x12]\n"
+ "ld1 { v14.s }[2], [x23]\n"
+ "ld1 { v18.s }[2], [x22]\n"
+ "b 78f\n"
+ "73:" // Height 3: Partial accumulate: partial_1_8
+ "mov x24, #0x20\n" // 8 elements (0x20 bytes) already consumed from x12
+ "tbz x14, #0, 78f\n"
+ "ldr s10, [x12, #0x0]\n"
+ "ldr s14, [x23, #0x0]\n"
+ "ldr s18, [x22, #0x0]\n"
+ "b 78f\n"
+ "74:" // Height 3: Partial accumulate: partial_4_0
+ "tbz x14, #2, 76f\n"
+ "ld1 { v8.4s }, [x12], #0x10\n"
+ "ld1 { v12.4s }, [x23], #0x10\n"
+ "ld1 { v16.4s }, [x22], #0x10\n"
+ "tbz x14, #1, 75f\n"
+ "ldr d9, [x12], #0x8\n"
+ "mov x24, #0x18\n" // 0x10 + 0x8 bytes consumed from x12
+ "ldr d13, [x23], #0x8\n"
+ "ldr d17, [x22], #0x8\n"
+ "tbz x14, #0, 78f\n"
+ "ld1 { v9.s }[2], [x12]\n"
+ "ld1 { v13.s }[2], [x23]\n"
+ "ld1 { v17.s }[2], [x22]\n"
+ "b 78f\n"
+ "75:" // Height 3: Partial accumulate: partial_1_4
+ "mov x24, #0x10\n" // 4 elements (0x10 bytes) already consumed from x12
+ "tbz x14, #0, 78f\n"
+ "ldr s9, [x12, #0x0]\n"
+ "ldr s13, [x23, #0x0]\n"
+ "ldr s17, [x22, #0x0]\n"
+ "b 78f\n"
+ "76:" // Height 3: Partial accumulate: partial_2_0
+ "tbz x14, #1, 77f\n"
+ "ldr d8, [x12], #0x8\n"
+ "ldr d12, [x23], #0x8\n"
+ "mov x24, #0x8\n" // 0x8 bytes consumed from x12
+ "ldr d16, [x22], #0x8\n"
+ "tbz x14, #0, 78f\n"
+ "ld1 { v8.s }[2], [x12]\n"
+ "ld1 { v12.s }[2], [x23]\n"
+ "ld1 { v16.s }[2], [x22]\n"
+ "b 78f\n"
+ "77:" // Height 3: Partial accumulate: partial_1_0
+ "ldr s8, [x12, #0x0]\n" // bits 3..1 were all clear on this path, so a single element is loaded without testing bit 0
+ "mov x24, #0x0\n" // x12 was not advanced on this path
+ "ldr s12, [x23, #0x0]\n"
+ "ldr s16, [x22, #0x0]\n"
+ "78:" // Height 3: Partial accumulate: Done
+ "sub x12, x12, x24\n" // rewind x12 to the start of this column block; x23/x22 are not rewound here
+ "b 81f\n"
+ "79:" // Height 3: full accumulate: load 16 existing 32-bit values per row (four q registers each) as the starting accumulators
+ "ldr q8, [x12, #0x0]\n" // row at x12 -> v8-v11; offset addressing, so x12 is not advanced
+ "ldr q9, [x12, #0x10]\n"
+ "ldr q10, [x12, #0x20]\n"
+ "ldr q11, [x12, #0x30]\n"
+ "ldr q12, [x23, #0x0]\n" // row at x23 -> v12-v15
+ "ldr q13, [x23, #0x10]\n"
+ "ldr q14, [x23, #0x20]\n"
+ "ldr q15, [x23, #0x30]\n"
+ "ldr q16, [x22, #0x0]\n" // row at x22 -> v16-v19
+ "ldr q17, [x22, #0x10]\n"
+ "ldr q18, [x22, #0x20]\n"
+ "ldr q19, [x22, #0x30]\n"
+ "b 81f\n"
+ "80:" // Height 3: no accumulate: reached when flags bit 0 is clear; zero all twelve accumulators
+ "movi v8.4s, #0x0\n" // v8-v11: first row
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n" // v12-v15: second row
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "movi v16.4s, #0x0\n" // v16-v19: third row
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "81:" // Height 3: setup done
+ "mov x11, #0x0\n" // x11 = index of the current string
+ "82:" // Height 3: String loop: pick up the length and the three input row pointers for string x11
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" // x20 = base of the string_lengths array
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" // x19 = input_offset; applied differently by the two input modes below
+ "ldr w10, [x20, x11, LSL #0x2]\n" // x10 = string_lengths[x11] (32-bit entries); counted down as input bytes are consumed
+ "tbz %x[flags], #3, 83f\n" // flags bit 3 clear: direct input (label 83); set: input_ptr is indexed as a table of pointers
+ "ldr x20, [%x[input_ptr], x11, LSL #0x3]\n" // x20 = input_ptr[x11] (8-byte entries)
+ "add x20, x20, x19, LSL #3\n" // step over input_offset 8-byte entries
+ "ldr x9, [x20, #0x0]\n" // x9, x26, x24 = three consecutive pointers read from there: input rows 0, 1, 2
+ "ldr x26, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "cbnz x11, 84f\n" // input_initial_col is added for the first string (x11 == 0) only
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x9, x9, x19\n"
+ "add x26, x26, x19\n"
+ "add x24, x24, x19\n"
+ "b 84f\n"
+ "83:" // Height 3: setup direct input
+ "mov x9, %x[input_ptr]\n" // row 0 starts at input_ptr itself
+ "add x26, x9, x19\n" // rows 1 and 2 follow at successive unscaled offsets of x19 (input_offset)
+ "add x24, x26, x19\n"
+ "84:" // Height 3: input setup done
+ "cmp x10, #0x10\n" // Multiply loop entry: fewer than 16 input bytes left in this string -> skip the 16-byte blocks entirely
+ "blt 87f\n"
+ "ldr q0, [x9, #0x0]\n" // preload 16 input bytes for each of the three rows into v0, v1, v2
+ "ldr q1, [x26, #0x0]\n"
+ "cmp x10, #0x20\n"
+ "ldr q2, [x24, #0x0]\n"
+ "ldr q6, [x13, #0x0]\n" // preload the first B vector
+ "blt 86f\n" // fewer than 32 bytes: run only the single-iteration block at 86
+ "85:" // Height 3: Multiply loop: Main loop head: consumes 16 input bytes per row and 0x100 bytes of B per pass, and preloads the next pass
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ "ldr d7, [x13, #0x10]\n" // in this loop each 128-bit B vector is built from two 64-bit loads: low half via a d-register load ...
+ "ldr x19, [x13, #0x18]\n" // ... high half via a general-purpose register (x19 for v7, x28 for v6)
+ "add x9, x9, #0x10\n" // advance the input pointers now; v0-v2 still hold the current 16 bytes
+ "ldr d6, [x13, #0x20]\n"
+ "add x26, x26, #0x10\n"
+ "ldr x28, [x13, #0x28]\n"
+ "add x24, x24, #0x10\n"
+ "mov v7.d[1], x19\n" // insert the high half
+ "prfm pldl1keep, [x9, #0x80]\n"
+ "ldr x19, [x13, #0x38]\n"
+ "sub x10, x10, #0x10\n" // 16 input bytes accounted for
+ "mov v6.d[1], x28\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ "ldr d7, [x13, #0x30]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "cmp x10, #0x20\n" // sets the flags consumed by "bge 85b" at the bottom; nothing in between writes the flags
+ "ldr d6, [x13, #0x40]\n"
+ "mov v7.d[1], x19\n"
+ "ldr x28, [x13, #0x48]\n"
+ "ldr x19, [x13, #0x58]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ "mov v6.d[1], x28\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ "ldr d7, [x13, #0x50]\n"
+ ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "mov v7.d[1], x19\n"
+ "ldr d6, [x13, #0x60]\n"
+ "ldr x28, [x13, #0x68]\n"
+ ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
+ "ldr d7, [x13, #0x70]\n"
+ "mov v6.d[1], x28\n"
+ "ldr x19, [x13, #0x78]\n"
+ "ldr x28, [x13, #0x88]\n"
+ ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
+ "mov v7.d[1], x19\n"
+ ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
+ "ldr d6, [x13, #0x80]\n"
+ ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
+ "ldr d7, [x13, #0x90]\n"
+ "mov v6.d[1], x28\n"
+ "ldr x19, [x13, #0x98]\n"
+ "ldr x28, [x13, #0xa8]\n"
+ ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
+ "mov v7.d[1], x19\n"
+ ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
+ "ldr d6, [x13, #0xa0]\n"
+ ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
+ "ldr d7, [x13, #0xb0]\n"
+ "mov v6.d[1], x28\n"
+ "ldr x19, [x13, #0xb8]\n"
+ "ldr x28, [x13, #0xc8]\n"
+ ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
+ "mov v7.d[1], x19\n"
+ ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
+ "ldr d6, [x13, #0xc0]\n"
+ ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
+ "ldr d7, [x13, #0xd0]\n"
+ "mov v6.d[1], x28\n"
+ "ldr x19, [x13, #0xd8]\n"
+ "ldr x28, [x13, #0xe8]\n"
+ ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
+ "mov v7.d[1], x19\n"
+ ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
+ "ldr d6, [x13, #0xe0]\n"
+ ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
+ "ldr d7, [x13, #0xf0]\n"
+ "mov v6.d[1], x28\n"
+ "ldr x19, [x13, #0xf8]\n"
+ "ldr x27, [x9, #0x8]\n" // high half of the next 16 input bytes for row 0 (x9 was already advanced above)
+ "add x13, x13, #0x100\n" // 0x100 bytes of B consumed by this pass
+ ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
+ "mov v7.d[1], x19\n"
+ ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
+ "ldr x25, [x26, #0x8]\n" // high half of the next input bytes for row 1
+ ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
+ "ldr d0, [x9, #0x0]\n" // last use of the old v0-v2 is above; reload them with the next 16 bytes per row, again as two halves
+ "ldr d1, [x26, #0x0]\n"
+ "ldr d2, [x24, #0x0]\n"
+ "mov v0.d[1], x27\n"
+ "ldr x21, [x24, #0x8]\n" // high half of the next input bytes for row 2
+ "mov v1.d[1], x25\n"
+ "ldr d6, [x13, #0x0]\n" // first B vector of the next pass (x13 already advanced)
+ "ldr x28, [x13, #0x8]\n"
+ "mov v2.d[1], x21\n"
+ "mov v6.d[1], x28\n"
+ "bge 85b\n" // at least 32 bytes were left at the compare: go round again; otherwise fall into the single-iteration block
+ "86:" // Height 3: Multiply loop: Single iteration only: last 16-byte block of this string; uses v0-v2 and v6 already loaded, does not preload another block
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ "ldr q7, [x13, #0x10]\n" // here B vectors are fetched with plain 128-bit loads
+ "ldr q6, [x13, #0x20]\n"
+ "sub x10, x10, #0x10\n" // 16 input bytes accounted for; x10 now holds the leftover byte count handled from label 87
+ "add x9, x9, #0x10\n" // step the row pointers past this block so the odd/ragged code continues from the right place
+ "add x26, x26, #0x10\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
+ "add x24, x24, #0x10\n"
+ "ldr q7, [x13, #0x30]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "ldr q6, [x13, #0x40]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ "ldr q7, [x13, #0x50]\n"
+ ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
+ "ldr q6, [x13, #0x60]\n"
+ "ldr q7, [x13, #0x70]\n"
+ ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
+ "ldr q6, [x13, #0x80]\n"
+ "ldr q7, [x13, #0x90]\n"
+ ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
+ "ldr q6, [x13, #0xa0]\n"
+ "ldr q7, [x13, #0xb0]\n"
+ ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
+ "ldr q6, [x13, #0xc0]\n"
+ "ldr q7, [x13, #0xd0]\n"
+ ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
+ "ldr q6, [x13, #0xe0]\n"
+ "ldr q7, [x13, #0xf0]\n"
+ "add x13, x13, #0x100\n" // 0x100 bytes of B consumed by this block
+ ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
+ "87:" // Height 3: Multiply loop: Main loop skip: handle the input bytes left over after the 16-byte blocks (count in x10)
+ "cbz x10, 92f\n" // nothing left in this string
+ "cmp x10, #0x4\n"
+ "blt 89f\n" // fewer than 4 bytes: go straight to the ragged read
+ "88:" // Height 3: Multiply loop: Odd block loop: 4 input bytes per row and 0x40 bytes of B per pass
+ "ldr s0, [x9], #0x4\n" // 4 bytes per row land in element 0 of v0/v1/v2, which is the only element the sdots below index
+ "sub x10, x10, #0x4\n"
+ "ldr s1, [x26], #0x4\n"
+ "cmp x10, #0x4\n" // flags consumed by "bge 88b" below
+ "ldr s2, [x24], #0x4\n"
+ "ldr q6, [x13, #0x0]\n"
+ "ldr q7, [x13, #0x10]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ "ldr q6, [x13, #0x20]\n"
+ "ldr q7, [x13, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ "bge 88b\n"
+ "cbz x10, 92f\n" // length was a multiple of 4: no ragged tail
+ "89:" // Height 3: Multiply loop: Skip odd blocks: 1 to 3 input bytes remain per row
+ "tbz x10, #1, 90f\n" // bit 1 clear: exactly one byte left
+ "ldr h0, [x9], #0x2\n" // two bytes into the low halfword; per the Arm ARM a scalar SIMD load zeroes the rest of the register
+ "ldr h1, [x26], #0x2\n"
+ "ldr h2, [x24], #0x2\n"
+ "tbz x10, #0, 91f\n"
+ "ld1 { v0.b }[2], [x9]\n" // optional third byte goes into byte lane 2
+ "ld1 { v1.b }[2], [x26]\n"
+ "ld1 { v2.b }[2], [x24]\n"
+ "b 91f\n"
+ "90:" // Height 3: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x9, #0x0]\n"
+ "ldr b1, [x26, #0x0]\n"
+ "ldr b2, [x24, #0x0]\n"
+ "91:" // Height 3: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x13, #0x0]\n" // one more 4-byte-group step; a full 0x40 bytes of B is still read and skipped
+ "ldr q7, [x13, #0x10]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ "ldr q6, [x13, #0x20]\n"
+ "ldr q7, [x13, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ "92:" // Height 3: Multiply loop: No odd multiplies: this string is finished; move to the next one or write the accumulators out
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" // w19 = num_strings (32-bit field)
+ "add x11, x11, #0x1\n"
+ "cmp x11, x19\n"
+ "bne 82b\n" // more strings: keep accumulating into the same v8-v19
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" // reload output_offset: x19 was reused as scratch in the loops above
+ "prfm pstl1keep, [x12, #0x0]\n" // prefetch-for-store hint on each output row about to be written
+ "cmp x14, #0x10\n" // flags consumed by "bge 101f" below
+ "add x23, x12, x19, LSL #2\n" // rebuild the second and third row pointers from x12, same formula as at the column loop head
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "add x22, x23, x19, LSL #2\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "bge 101f\n" // x14 >= 16: full-width store at 101; otherwise fall through to the partial store
+ "tbz x14, #3, 96f\n" // Partial direct writeback: fewer than 16 elements remain; bits 3..0 of x14 select groups of 8/4/2/1 32-bit elements to store per row
+ "st1 { v8.4s }, [x12], #0x10\n"
+ "st1 { v9.4s }, [x12], #0x10\n"
+ "st1 { v12.4s }, [x23], #0x10\n"
+ "st1 { v13.4s }, [x23], #0x10\n"
+ "st1 { v16.4s }, [x22], #0x10\n"
+ "st1 { v17.4s }, [x22], #0x10\n"
+ "tbz x14, #2, 94f\n"
+ "st1 { v10.4s }, [x12], #0x10\n"
+ "st1 { v14.4s }, [x23], #0x10\n"
+ "st1 { v18.4s }, [x22], #0x10\n"
+ "tbz x14, #1, 93f\n"
+ "str d11, [x12], #0x8\n"
+ "str d15, [x23], #0x8\n"
+ "str d19, [x22], #0x8\n"
+ "tbz x14, #0, 100f\n"
+ "st1 { v11.s }[2], [x12]\n" // odd final element comes from lane 2, after the two written by the d-register store
+ "st1 { v15.s }[2], [x23]\n"
+ "st1 { v19.s }[2], [x22]\n"
+ "b 100f\n"
+ "93:" // Height 3: Partial direct writeback: partial_1_12
+ "tbz x14, #0, 100f\n"
+ "str s11, [x12, #0x0]\n"
+ "str s15, [x23, #0x0]\n"
+ "str s19, [x22, #0x0]\n"
+ "b 100f\n"
+ "94:" // Height 3: Partial direct writeback: partial_2_8
+ "tbz x14, #1, 95f\n"
+ "str d10, [x12], #0x8\n"
+ "str d14, [x23], #0x8\n"
+ "str d18, [x22], #0x8\n"
+ "tbz x14, #0, 100f\n"
+ "st1 { v10.s }[2], [x12]\n"
+ "st1 { v14.s }[2], [x23]\n"
+ "st1 { v18.s }[2], [x22]\n"
+ "b 100f\n"
+ "95:" // Height 3: Partial direct writeback: partial_1_8
+ "tbz x14, #0, 100f\n"
+ "str s10, [x12, #0x0]\n"
+ "str s14, [x23, #0x0]\n"
+ "str s18, [x22, #0x0]\n"
+ "b 100f\n"
+ "96:" // Height 3: Partial direct writeback: partial_4_0
+ "tbz x14, #2, 98f\n"
+ "st1 { v8.4s }, [x12], #0x10\n"
+ "st1 { v12.4s }, [x23], #0x10\n"
+ "st1 { v16.4s }, [x22], #0x10\n"
+ "tbz x14, #1, 97f\n"
+ "str d9, [x12], #0x8\n"
+ "str d13, [x23], #0x8\n"
+ "str d17, [x22], #0x8\n"
+ "tbz x14, #0, 100f\n"
+ "st1 { v9.s }[2], [x12]\n"
+ "st1 { v13.s }[2], [x23]\n"
+ "st1 { v17.s }[2], [x22]\n"
+ "b 100f\n"
+ "97:" // Height 3: Partial direct writeback: partial_1_4
+ "tbz x14, #0, 100f\n"
+ "str s9, [x12, #0x0]\n"
+ "str s13, [x23, #0x0]\n"
+ "str s17, [x22, #0x0]\n"
+ "b 100f\n"
+ "98:" // Height 3: Partial direct writeback: partial_2_0
+ "tbz x14, #1, 99f\n"
+ "str d8, [x12], #0x8\n"
+ "str d12, [x23], #0x8\n"
+ "str d16, [x22], #0x8\n"
+ "tbz x14, #0, 100f\n"
+ "st1 { v8.s }[2], [x12]\n"
+ "st1 { v12.s }[2], [x23]\n"
+ "st1 { v16.s }[2], [x22]\n"
+ "b 100f\n"
+ "99:" // Height 3: Partial direct writeback: partial_1_0
+ "str s8, [x12, #0x0]\n" // bits 3..1 were all clear on this path, so a single element is stored without testing bit 0
+ "str s12, [x23, #0x0]\n"
+ "str s16, [x22, #0x0]\n"
+ "100:" // Height 3: Partial direct writeback: Done
+ "b 102f\n" // x12 is left partially advanced; the column loop ends after a partial block because x14 - 16 is then not positive
+ "101:" // Height 3: Full writeback: store 16 32-bit results per row (four q registers each)
+ "str q8, [x12, #0x0]\n"
+ "str q9, [x12, #0x10]\n"
+ "str q10, [x12, #0x20]\n"
+ "str q11, [x12, #0x30]\n"
+ "add x12, x12, #0x40\n" // advance the first-row pointer to the next block of 16 columns; x23/x22 are rebuilt from x12 when next needed
+ "str q12, [x23, #0x0]\n"
+ "str q13, [x23, #0x10]\n"
+ "str q14, [x23, #0x20]\n"
+ "str q15, [x23, #0x30]\n"
+ "str q16, [x22, #0x0]\n"
+ "str q17, [x22, #0x10]\n"
+ "str q18, [x22, #0x20]\n"
+ "str q19, [x22, #0x30]\n"
+ "102:" // Height 3: Writeback done
+ "subs x14, x14, #0x10\n" // 16 columns done (or the final partial block)
+ "bgt 70b\n" // columns remain: next pass of the column loop
+ "b 206f\n" // all columns done; label 206 lies outside this excerpt
+ "103:" // Height 4
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x12, %x[output_ptr]\n"
+ "ldr x13, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "104:" // Height 4: Column loop
+ "tbz %x[flags], #0, 114f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x14, #0x10\n"
+ "add x23, x12, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "add x21, x22, x19, LSL #2\n"
+ "bge 113f\n"
+ "tbz x14, #3, 108f\n"
+ "ld1 { v8.4s }, [x12], #0x10\n"
+ "ld1 { v12.4s }, [x23], #0x10\n"
+ "ld1 { v16.4s }, [x22], #0x10\n"
+ "ld1 { v9.4s }, [x12], #0x10\n"
+ "ld1 { v13.4s }, [x23], #0x10\n"
+ "ld1 { v17.4s }, [x22], #0x10\n"
+ "ld1 { v20.4s }, [x21], #0x10\n"
+ "ld1 { v21.4s }, [x21], #0x10\n"
+ "tbz x14, #2, 106f\n"
+ "ld1 { v10.4s }, [x12], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "ld1 { v18.4s }, [x22], #0x10\n"
+ "ld1 { v22.4s }, [x21], #0x10\n"
+ "tbz x14, #1, 105f\n"
+ "ldr d11, [x12], #0x8\n"
+ "mov x24, #0x38\n"
+ "ldr d15, [x23], #0x8\n"
+ "ldr d19, [x22], #0x8\n"
+ "ldr d23, [x21], #0x8\n"
+ "tbz x14, #0, 112f\n"
+ "ld1 { v11.s }[2], [x12]\n"
+ "ld1 { v15.s }[2], [x23]\n"
+ "ld1 { v19.s }[2], [x22]\n"
+ "ld1 { v23.s }[2], [x21]\n"
+ "b 112f\n"
+ "105:" // Height 4: Partial accumulate: partial_1_12
+ "mov x24, #0x30\n"
+ "tbz x14, #0, 112f\n"
+ "ldr s11, [x12, #0x0]\n"
+ "ldr s15, [x23, #0x0]\n"
+ "ldr s19, [x22, #0x0]\n"
+ "ldr s23, [x21, #0x0]\n"
+ "b 112f\n"
+ "106:" // Height 4: Partial accumulate: partial_2_8
+ "tbz x14, #1, 107f\n"
+ "ldr d10, [x12], #0x8\n"
+ "ldr d14, [x23], #0x8\n"
+ "mov x24, #0x28\n"
+ "ldr d18, [x22], #0x8\n"
+ "ldr d22, [x21], #0x8\n"
+ "tbz x14, #0, 112f\n"
+ "ld1 { v10.s }[2], [x12]\n"
+ "ld1 { v14.s }[2], [x23]\n"
+ "ld1 { v18.s }[2], [x22]\n"
+ "ld1 { v22.s }[2], [x21]\n"
+ "b 112f\n"
+ "107:" // Height 4: Partial accumulate: partial_1_8
+ "mov x24, #0x20\n"
+ "tbz x14, #0, 112f\n"
+ "ldr s10, [x12, #0x0]\n"
+ "ldr s14, [x23, #0x0]\n"
+ "ldr s18, [x22, #0x0]\n"
+ "ldr s22, [x21, #0x0]\n"
+ "b 112f\n"
+ "108:" // Height 4: Partial accumulate: partial_4_0
+ "tbz x14, #2, 110f\n"
+ "ld1 { v8.4s }, [x12], #0x10\n"
+ "ld1 { v12.4s }, [x23], #0x10\n"
+ "ld1 { v16.4s }, [x22], #0x10\n"
+ "ld1 { v20.4s }, [x21], #0x10\n"
+ "tbz x14, #1, 109f\n"
+ "ldr d9, [x12], #0x8\n"
+ "mov x24, #0x18\n"
+ "ldr d13, [x23], #0x8\n"
+ "ldr d17, [x22], #0x8\n"
+ "ldr d21, [x21], #0x8\n"
+ "tbz x14, #0, 112f\n"
+ "ld1 { v9.s }[2], [x12]\n"
+ "ld1 { v13.s }[2], [x23]\n"
+ "ld1 { v17.s }[2], [x22]\n"
+ "ld1 { v21.s }[2], [x21]\n"
+ "b 112f\n"
+ "109:" // Height 4: Partial accumulate: partial_1_4
+ "mov x24, #0x10\n"
+ "tbz x14, #0, 112f\n"
+ "ldr s9, [x12, #0x0]\n"
+ "ldr s13, [x23, #0x0]\n"
+ "ldr s17, [x22, #0x0]\n"
+ "ldr s21, [x21, #0x0]\n"
+ "b 112f\n"
+ "110:" // Height 4: Partial accumulate: partial_2_0
+ "tbz x14, #1, 111f\n"
+ "ldr d8, [x12], #0x8\n"
+ "ldr d12, [x23], #0x8\n"
+ "mov x24, #0x8\n"
+ "ldr d16, [x22], #0x8\n"
+ "ldr d20, [x21], #0x8\n"
+ "tbz x14, #0, 112f\n"
+ "ld1 { v8.s }[2], [x12]\n"
+ "ld1 { v12.s }[2], [x23]\n"
+ "ld1 { v16.s }[2], [x22]\n"
+ "ld1 { v20.s }[2], [x21]\n"
+ "b 112f\n"
+ "111:" // Height 4: Partial accumulate: partial_1_0
+ "ldr s8, [x12, #0x0]\n"
+ "mov x24, #0x0\n"
+ "ldr s12, [x23, #0x0]\n"
+ "ldr s16, [x22, #0x0]\n"
+ "ldr s20, [x21, #0x0]\n"
+ "112:" // Height 4: Partial accumulate: Done
+ "sub x12, x12, x24\n"
+ "b 115f\n"
+ "113:" // Height 4: full accumulate
+ "ldr q8, [x12, #0x0]\n"
+ "ldr q9, [x12, #0x10]\n"
+ "ldr q10, [x12, #0x20]\n"
+ "ldr q11, [x12, #0x30]\n"
+ "ldr q12, [x23, #0x0]\n"
+ "ldr q13, [x23, #0x10]\n"
+ "ldr q14, [x23, #0x20]\n"
+ "ldr q15, [x23, #0x30]\n"
+ "ldr q16, [x22, #0x0]\n"
+ "ldr q17, [x22, #0x10]\n"
+ "ldr q18, [x22, #0x20]\n"
+ "ldr q19, [x22, #0x30]\n"
+ "ldr q20, [x21, #0x0]\n"
+ "ldr q21, [x21, #0x10]\n"
+ "ldr q22, [x21, #0x20]\n"
+ "ldr q23, [x21, #0x30]\n"
+ "b 115f\n"
+ "114:" // Height 4: no accumulate
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "115:" // Height 4: setup done
+ "mov x11, #0x0\n"
+ "116:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w10, [x20, x11, LSL #0x2]\n"
+ "tbz %x[flags], #3, 117f\n"
+ "ldr x20, [%x[input_ptr], x11, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x9, [x20, #0x0]\n"
+ "ldr x26, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "cbnz x11, 118f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x9, x9, x19\n"
+ "add x26, x26, x19\n"
+ "add x24, x24, x19\n"
+ "add x23, x23, x19\n"
+ "b 118f\n"
+ "117:" // Height 4: setup direct input
+ "mov x9, %x[input_ptr]\n"
+ "add x26, x9, x19\n"
+ "add x24, x26, x19\n"
+ "add x23, x24, x19\n"
+ "118:" // Height 4: input setup done
+ "cmp x10, #0x10\n"
+ "blt 121f\n"
+ "ldr q0, [x9, #0x0]\n"
+ "ldr q1, [x26, #0x0]\n"
+ "cmp x10, #0x20\n"
+ "ldr q2, [x24, #0x0]\n"
+ "ldr q3, [x23, #0x0]\n"
+ "ldr q6, [x13, #0x0]\n"
+ "blt 120f\n"
+ "119:" // Height 4: Multiply loop: Main loop head
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
+ "ldr d7, [x13, #0x10]\n"
+ "add x9, x9, #0x10\n"
+ "ldr x19, [x13, #0x18]\n"
+ "add x26, x26, #0x10\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
+ "add x24, x24, #0x10\n"
+ "ldr d6, [x13, #0x20]\n"
+ "add x23, x23, #0x10\n"
+ "mov v7.d[1], x19\n"
+ "ldr x28, [x13, #0x28]\n"
+ "ldr x19, [x13, #0x38]\n"
+ "sub x10, x10, #0x10\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ "mov v6.d[1], x28\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
+ "ldr d7, [x13, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
+ "mov v7.d[1], x19\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "ldr d6, [x13, #0x40]\n"
+ "cmp x10, #0x20\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
+ "ldr x28, [x13, #0x48]\n"
+ "ldr d7, [x13, #0x50]\n"
+ "ldr x19, [x13, #0x58]\n"
+ "mov v6.d[1], x28\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "ldr x28, [x13, #0x68]\n"
+ ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
+ "mov v7.d[1], x19\n"
+ "ldr d6, [x13, #0x60]\n"
+ "ldr x19, [x13, #0x78]\n"
+ ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
+ "mov v6.d[1], x28\n"
+ ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
+ "ldr d7, [x13, #0x70]\n"
+ ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
+ "mov v7.d[1], x19\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "ldr d6, [x13, #0x80]\n"
+ ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
+ "ldr x28, [x13, #0x88]\n"
+ "ldr d7, [x13, #0x90]\n"
+ "ldr x19, [x13, #0x98]\n"
+ "mov v6.d[1], x28\n"
+ "ldr x28, [x13, #0xa8]\n"
+ "ldr x27, [x9, #0x8]\n"
+ ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
+ "mov v7.d[1], x19\n"
+ "ldr d6, [x13, #0xa0]\n"
+ "ldr x19, [x13, #0xb8]\n"
+ ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
+ "mov v6.d[1], x28\n"
+ ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
+ "ldr d7, [x13, #0xb0]\n"
+ ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
+ "mov v7.d[1], x19\n"
+ "ldr d6, [x13, #0xc0]\n"
+ "ldr x28, [x13, #0xc8]\n"
+ ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
+ "mov v6.d[1], x28\n"
+ "ldr d7, [x13, #0xd0]\n"
+ "ldr x19, [x13, #0xd8]\n"
+ ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
+ "mov v7.d[1], x19\n"
+ "ldr d6, [x13, #0xe0]\n"
+ "ldr x28, [x13, #0xe8]\n"
+ ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
+ "mov v6.d[1], x28\n"
+ "ldr d7, [x13, #0xf0]\n"
+ "ldr x19, [x13, #0xf8]\n"
+ "add x13, x13, #0x100\n"
+ ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n"
+ "mov v7.d[1], x19\n"
+ "ldr d6, [x13, #0x0]\n"
+ "ldr x28, [x13, #0x8]\n"
+ ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n"
+ "mov v6.d[1], x28\n"
+ "ldr d0, [x9, #0x0]\n"
+ "ldr d1, [x26, #0x0]\n"
+ "ldr x25, [x26, #0x8]\n"
+ "mov v0.d[1], x27\n"
+ "ldr d2, [x24, #0x0]\n"
+ "ldr x21, [x24, #0x8]\n"
+ "mov v1.d[1], x25\n"
+ "ldr d3, [x23, #0x0]\n"
+ "ldr x19, [x23, #0x8]\n"
+ "mov v2.d[1], x21\n"
+ "mov v3.d[1], x19\n"
+ "bge 119b\n"
+ "120:" // Height 4: Multiply loop: Single iteration only
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
+ "ldr q7, [x13, #0x10]\n"
+ "sub x10, x10, #0x10\n"
+ "ldr q6, [x13, #0x20]\n"
+ "add x9, x9, #0x10\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
+ "ldr q7, [x13, #0x30]\n"
+ "add x24, x24, #0x10\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x23, x23, #0x10\n"
+ "ldr q6, [x13, #0x40]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
+ "ldr q7, [x13, #0x50]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "ldr q6, [x13, #0x60]\n"
+ ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
+ ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
+ "ldr q7, [x13, #0x70]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "ldr q6, [x13, #0x80]\n"
+ ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
+ ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
+ "ldr q7, [x13, #0x90]\n"
+ "ldr q6, [x13, #0xa0]\n"
+ ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
+ ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
+ "ldr q7, [x13, #0xb0]\n"
+ "ldr q6, [x13, #0xc0]\n"
+ ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
+ ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
+ "ldr q7, [x13, #0xd0]\n"
+ "ldr q6, [x13, #0xe0]\n"
+ ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
+ ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n"
+ "ldr q7, [x13, #0xf0]\n"
+ "add x13, x13, #0x100\n"
+ ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n"
+ "121:" // Height 4: Multiply loop: Main loop skip
+ "cbz x10, 126f\n"
+ "cmp x10, #0x4\n"
+ "blt 123f\n"
+ "122:" // Height 4: Multiply loop: Odd block loop
+ "ldr s0, [x9], #0x4\n"
+ "sub x10, x10, #0x4\n"
+ "ldr s1, [x26], #0x4\n"
+ "cmp x10, #0x4\n"
+ "ldr s2, [x24], #0x4\n"
+ "ldr s3, [x23], #0x4\n"
+ "ldr q6, [x13, #0x0]\n"
+ "ldr q7, [x13, #0x10]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
+ "ldr q6, [x13, #0x20]\n"
+ "ldr q7, [x13, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
+ "bge 122b\n"
+ "cbz x10, 126f\n"
+ "123:" // Height 4: Multiply loop: Skip odd blocks
+ "tbz x10, #1, 124f\n"
+ "ldr h0, [x9], #0x2\n"
+ "ldr h1, [x26], #0x2\n"
+ "ldr h2, [x24], #0x2\n"
+ "ldr h3, [x23], #0x2\n"
+ "tbz x10, #0, 125f\n"
+ "ld1 { v0.b }[2], [x9]\n"
+ "ld1 { v1.b }[2], [x26]\n"
+ "ld1 { v2.b }[2], [x24]\n"
+ "ld1 { v3.b }[2], [x23]\n"
+ "b 125f\n"
+ "124:" // Height 4: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x9, #0x0]\n"
+ "ldr b1, [x26, #0x0]\n"
+ "ldr b2, [x24, #0x0]\n"
+ "ldr b3, [x23, #0x0]\n"
+ "125:" // Height 4: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x13, #0x0]\n"
+ "ldr q7, [x13, #0x10]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
+ "ldr q6, [x13, #0x20]\n"
+ "ldr q7, [x13, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
+ "126:" // Height 4: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x11, x11, #0x1\n"
+ "cmp x11, x19\n"
+ "bne 116b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "prfm pstl1keep, [x12, #0x0]\n"
+ "cmp x14, #0x10\n"
+ "add x23, x12, x19, LSL #2\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "add x22, x23, x19, LSL #2\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "add x21, x22, x19, LSL #2\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "bge 135f\n"
+ "tbz x14, #3, 130f\n"
+ "st1 { v8.4s }, [x12], #0x10\n"
+ "st1 { v9.4s }, [x12], #0x10\n"
+ "st1 { v12.4s }, [x23], #0x10\n"
+ "st1 { v13.4s }, [x23], #0x10\n"
+ "st1 { v16.4s }, [x22], #0x10\n"
+ "st1 { v17.4s }, [x22], #0x10\n"
+ "st1 { v20.4s }, [x21], #0x10\n"
+ "st1 { v21.4s }, [x21], #0x10\n"
+ "tbz x14, #2, 128f\n"
+ "st1 { v10.4s }, [x12], #0x10\n"
+ "st1 { v14.4s }, [x23], #0x10\n"
+ "st1 { v18.4s }, [x22], #0x10\n"
+ "st1 { v22.4s }, [x21], #0x10\n"
+ "tbz x14, #1, 127f\n"
+ "str d11, [x12], #0x8\n"
+ "str d15, [x23], #0x8\n"
+ "str d19, [x22], #0x8\n"
+ "str d23, [x21], #0x8\n"
+ "tbz x14, #0, 134f\n"
+ "st1 { v11.s }[2], [x12]\n"
+ "st1 { v15.s }[2], [x23]\n"
+ "st1 { v19.s }[2], [x22]\n"
+ "st1 { v23.s }[2], [x21]\n"
+ "b 134f\n"
+ "127:" // Height 4: Partial direct writeback: partial_1_12
+ "tbz x14, #0, 134f\n"
+ "str s11, [x12, #0x0]\n"
+ "str s15, [x23, #0x0]\n"
+ "str s19, [x22, #0x0]\n"
+ "str s23, [x21, #0x0]\n"
+ "b 134f\n"
+ "128:" // Height 4: Partial direct writeback: partial_2_8
+ "tbz x14, #1, 129f\n"
+ "str d10, [x12], #0x8\n"
+ "str d14, [x23], #0x8\n"
+ "str d18, [x22], #0x8\n"
+ "str d22, [x21], #0x8\n"
+ "tbz x14, #0, 134f\n"
+ "st1 { v10.s }[2], [x12]\n"
+ "st1 { v14.s }[2], [x23]\n"
+ "st1 { v18.s }[2], [x22]\n"
+ "st1 { v22.s }[2], [x21]\n"
+ "b 134f\n"
+ "129:" // Height 4: Partial direct writeback: partial_1_8
+ "tbz x14, #0, 134f\n"
+ "str s10, [x12, #0x0]\n"
+ "str s14, [x23, #0x0]\n"
+ "str s18, [x22, #0x0]\n"
+ "str s22, [x21, #0x0]\n"
+ "b 134f\n"
+ "130:" // Height 4: Partial direct writeback: partial_4_0
+ "tbz x14, #2, 132f\n"
+ "st1 { v8.4s }, [x12], #0x10\n"
+ "st1 { v12.4s }, [x23], #0x10\n"
+ "st1 { v16.4s }, [x22], #0x10\n"
+ "st1 { v20.4s }, [x21], #0x10\n"
+ "tbz x14, #1, 131f\n"
+ "str d9, [x12], #0x8\n"
+ "str d13, [x23], #0x8\n"
+ "str d17, [x22], #0x8\n"
+ "str d21, [x21], #0x8\n"
+ "tbz x14, #0, 134f\n"
+ "st1 { v9.s }[2], [x12]\n"
+ "st1 { v13.s }[2], [x23]\n"
+ "st1 { v17.s }[2], [x22]\n"
+ "st1 { v21.s }[2], [x21]\n"
+ "b 134f\n"
+ "131:" // Height 4: Partial direct writeback: partial_1_4
+ "tbz x14, #0, 134f\n"
+ "str s9, [x12, #0x0]\n"
+ "str s13, [x23, #0x0]\n"
+ "str s17, [x22, #0x0]\n"
+ "str s21, [x21, #0x0]\n"
+ "b 134f\n"
+ "132:" // Height 4: Partial direct writeback: partial_2_0
+ "tbz x14, #1, 133f\n"
+ "str d8, [x12], #0x8\n"
+ "str d12, [x23], #0x8\n"
+ "str d16, [x22], #0x8\n"
+ "str d20, [x21], #0x8\n"
+ "tbz x14, #0, 134f\n"
+ "st1 { v8.s }[2], [x12]\n"
+ "st1 { v12.s }[2], [x23]\n"
+ "st1 { v16.s }[2], [x22]\n"
+ "st1 { v20.s }[2], [x21]\n"
+ "b 134f\n"
+ "133:" // Height 4: Partial direct writeback: partial_1_0
+ "str s8, [x12, #0x0]\n"
+ "str s12, [x23, #0x0]\n"
+ "str s16, [x22, #0x0]\n"
+ "str s20, [x21, #0x0]\n"
+ "134:" // Height 4: Partial direct writeback: Done
+ "b 136f\n"
+ "135:" // Height 4: Full writeback
+ "str q8, [x12, #0x0]\n"
+ "str q9, [x12, #0x10]\n"
+ "str q10, [x12, #0x20]\n"
+ "str q11, [x12, #0x30]\n"
+ "add x12, x12, #0x40\n"
+ "str q12, [x23, #0x0]\n"
+ "str q13, [x23, #0x10]\n"
+ "str q14, [x23, #0x20]\n"
+ "str q15, [x23, #0x30]\n"
+ "str q16, [x22, #0x0]\n"
+ "str q17, [x22, #0x10]\n"
+ "str q18, [x22, #0x20]\n"
+ "str q19, [x22, #0x30]\n"
+ "str q20, [x21, #0x0]\n"
+ "str q21, [x21, #0x10]\n"
+ "str q22, [x21, #0x20]\n"
+ "str q23, [x21, #0x30]\n"
+ "136:" // Height 4: Writeback done
+ "subs x14, x14, #0x10\n"
+ "bgt 104b\n"
+ "b 206f\n"
+ "137:" // Height 5 (five output rows per pass; accumulators are v8-v27)
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n" // x14 = N from the args block: columns still to produce
+ "mov x12, %x[output_ptr]\n" // x12 = cursor into output row 0
+ "ldr x13, [%x[args_ptr], %[offsetof_B_ptr]]\n" // x13 = cursor into the B operand
+ "138:" // Height 5: Column loop
+ "tbz %x[flags], #0, 148f\n" // flags bit 0 clear: do not load existing output, start from zero (148)
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" // x19 = output_offset, scaled by 4 bytes below
+ "cmp x14, #0x10\n" // result is consumed by the bge below; the adds in between leave flags alone
+ "add x23, x12, x19, LSL #2\n" // x23, x22, x21, x20 = output rows 1..4
+ "add x22, x23, x19, LSL #2\n"
+ "add x21, x22, x19, LSL #2\n"
+ "add x20, x21, x19, LSL #2\n"
+ "bge 147f\n" // at least 16 columns left: load the whole 16-column tile
+ "tbz x14, #3, 142f\n" // fewer than 16: bits 3..0 of x14 select 8/4/2/1-column pieces
+ "ld1 { v8.4s }, [x12], #0x10\n"
+ "ld1 { v12.4s }, [x23], #0x10\n"
+ "ld1 { v16.4s }, [x22], #0x10\n"
+ "ld1 { v9.4s }, [x12], #0x10\n"
+ "ld1 { v13.4s }, [x23], #0x10\n"
+ "ld1 { v17.4s }, [x22], #0x10\n"
+ "ld1 { v20.4s }, [x21], #0x10\n"
+ "ld1 { v24.4s }, [x20], #0x10\n"
+ "ld1 { v21.4s }, [x21], #0x10\n"
+ "ld1 { v25.4s }, [x20], #0x10\n"
+ "tbz x14, #2, 140f\n"
+ "ld1 { v10.4s }, [x12], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "ld1 { v18.4s }, [x22], #0x10\n"
+ "ld1 { v22.4s }, [x21], #0x10\n"
+ "ld1 { v26.4s }, [x20], #0x10\n"
+ "tbz x14, #1, 139f\n"
+ "ldr d11, [x12], #0x8\n"
+ "mov x24, #0x38\n" // x24 = bytes x12 has been post-incremented by on this path (0x30 + 0x8)
+ "ldr d15, [x23], #0x8\n"
+ "ldr d19, [x22], #0x8\n"
+ "ldr d23, [x21], #0x8\n"
+ "ldr d27, [x20], #0x8\n"
+ "tbz x14, #0, 146f\n"
+ "ld1 { v11.s }[2], [x12]\n" // odd final column goes into lane 2; no post-increment
+ "ld1 { v15.s }[2], [x23]\n"
+ "ld1 { v19.s }[2], [x22]\n"
+ "ld1 { v23.s }[2], [x21]\n"
+ "ld1 { v27.s }[2], [x20]\n"
+ "b 146f\n"
+ "139:" // Height 5: Partial accumulate: partial_1_12
+ "mov x24, #0x30\n" // three full vectors consumed from row 0 so far
+ "tbz x14, #0, 146f\n"
+ "ldr s11, [x12, #0x0]\n"
+ "ldr s15, [x23, #0x0]\n"
+ "ldr s19, [x22, #0x0]\n"
+ "ldr s23, [x21, #0x0]\n"
+ "ldr s27, [x20, #0x0]\n"
+ "b 146f\n"
+ "140:" // Height 5: Partial accumulate: partial_2_8
+ "tbz x14, #1, 141f\n"
+ "ldr d10, [x12], #0x8\n"
+ "ldr d14, [x23], #0x8\n"
+ "mov x24, #0x28\n"
+ "ldr d18, [x22], #0x8\n"
+ "ldr d22, [x21], #0x8\n"
+ "ldr d26, [x20], #0x8\n"
+ "tbz x14, #0, 146f\n"
+ "ld1 { v10.s }[2], [x12]\n"
+ "ld1 { v14.s }[2], [x23]\n"
+ "ld1 { v18.s }[2], [x22]\n"
+ "ld1 { v22.s }[2], [x21]\n"
+ "ld1 { v26.s }[2], [x20]\n"
+ "b 146f\n"
+ "141:" // Height 5: Partial accumulate: partial_1_8
+ "mov x24, #0x20\n"
+ "tbz x14, #0, 146f\n"
+ "ldr s10, [x12, #0x0]\n"
+ "ldr s14, [x23, #0x0]\n"
+ "ldr s18, [x22, #0x0]\n"
+ "ldr s22, [x21, #0x0]\n"
+ "ldr s26, [x20, #0x0]\n"
+ "b 146f\n"
+ "142:" // Height 5: Partial accumulate: partial_4_0
+ "tbz x14, #2, 144f\n"
+ "ld1 { v8.4s }, [x12], #0x10\n"
+ "ld1 { v12.4s }, [x23], #0x10\n"
+ "ld1 { v16.4s }, [x22], #0x10\n"
+ "ld1 { v20.4s }, [x21], #0x10\n"
+ "ld1 { v24.4s }, [x20], #0x10\n"
+ "tbz x14, #1, 143f\n"
+ "ldr d9, [x12], #0x8\n"
+ "mov x24, #0x18\n"
+ "ldr d13, [x23], #0x8\n"
+ "ldr d17, [x22], #0x8\n"
+ "ldr d21, [x21], #0x8\n"
+ "ldr d25, [x20], #0x8\n"
+ "tbz x14, #0, 146f\n"
+ "ld1 { v9.s }[2], [x12]\n"
+ "ld1 { v13.s }[2], [x23]\n"
+ "ld1 { v17.s }[2], [x22]\n"
+ "ld1 { v21.s }[2], [x21]\n"
+ "ld1 { v25.s }[2], [x20]\n"
+ "b 146f\n"
+ "143:" // Height 5: Partial accumulate: partial_1_4
+ "mov x24, #0x10\n"
+ "tbz x14, #0, 146f\n"
+ "ldr s9, [x12, #0x0]\n"
+ "ldr s13, [x23, #0x0]\n"
+ "ldr s17, [x22, #0x0]\n"
+ "ldr s21, [x21, #0x0]\n"
+ "ldr s25, [x20, #0x0]\n"
+ "b 146f\n"
+ "144:" // Height 5: Partial accumulate: partial_2_0
+ "tbz x14, #1, 145f\n"
+ "ldr d8, [x12], #0x8\n"
+ "ldr d12, [x23], #0x8\n"
+ "mov x24, #0x8\n"
+ "ldr d16, [x22], #0x8\n"
+ "ldr d20, [x21], #0x8\n"
+ "ldr d24, [x20], #0x8\n"
+ "tbz x14, #0, 146f\n"
+ "ld1 { v8.s }[2], [x12]\n"
+ "ld1 { v12.s }[2], [x23]\n"
+ "ld1 { v16.s }[2], [x22]\n"
+ "ld1 { v20.s }[2], [x21]\n"
+ "ld1 { v24.s }[2], [x20]\n"
+ "b 146f\n"
+ "145:" // Height 5: Partial accumulate: partial_1_0
+ "ldr s8, [x12, #0x0]\n"
+ "mov x24, #0x0\n" // nothing was post-incremented on this path
+ "ldr s12, [x23, #0x0]\n"
+ "ldr s16, [x22, #0x0]\n"
+ "ldr s20, [x21, #0x0]\n"
+ "ldr s24, [x20, #0x0]\n"
+ "146:" // Height 5: Partial accumulate: Done (only x12 is rewound; x20-x23 are recomputed before writeback)
+ "sub x12, x12, x24\n" // undo the post-increments so x12 points at the start of the tile again
+ "b 149f\n"
+ "147:" // Height 5: full accumulate
+ "ldr q8, [x12, #0x0]\n" // v8-v11 = row 0, v12-v15 = row 1, v16-v19 = row 2, v20-v23 = row 3, v24-v27 = row 4
+ "ldr q9, [x12, #0x10]\n"
+ "ldr q10, [x12, #0x20]\n"
+ "ldr q11, [x12, #0x30]\n"
+ "ldr q12, [x23, #0x0]\n"
+ "ldr q13, [x23, #0x10]\n"
+ "ldr q14, [x23, #0x20]\n"
+ "ldr q15, [x23, #0x30]\n"
+ "ldr q16, [x22, #0x0]\n"
+ "ldr q17, [x22, #0x10]\n"
+ "ldr q18, [x22, #0x20]\n"
+ "ldr q19, [x22, #0x30]\n"
+ "ldr q20, [x21, #0x0]\n"
+ "ldr q21, [x21, #0x10]\n"
+ "ldr q22, [x21, #0x20]\n"
+ "ldr q23, [x21, #0x30]\n"
+ "ldr q24, [x20, #0x0]\n"
+ "ldr q25, [x20, #0x10]\n"
+ "ldr q26, [x20, #0x20]\n"
+ "ldr q27, [x20, #0x30]\n"
+ "b 149f\n"
+ "148:" // Height 5: no accumulate
+ "movi v8.4s, #0x0\n" // clear all 20 accumulators
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "149:" // Height 5: setup done
+ "mov x11, #0x0\n" // x11 = string index; compared against num_strings at label 160
+ "150:" // Height 5: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w10, [x20, x11, LSL #0x2]\n" // x10 = string_lengths[x11] (32-bit entries); counted down as A is consumed
+ "tbz %x[flags], #3, 151f\n" // flags bit 3 clear: direct input (151); set: input_ptr is a table indexed by string
+ "ldr x20, [%x[input_ptr], x11, LSL #0x3]\n" // x20 = input_ptr[x11], the base of an array of row pointers
+ "add x20, x20, x19, LSL #3\n" // step over input_offset 8-byte entries
+ "ldr x9, [x20, #0x0]\n" // x9, x26, x24, x23, x22 = A row pointers for rows 0..4
+ "ldr x26, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "cbnz x11, 152f\n" // the initial column offset below is applied to the first string only
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x9, x9, x19\n"
+ "add x26, x26, x19\n"
+ "add x24, x24, x19\n"
+ "add x23, x23, x19\n"
+ "add x22, x22, x19\n"
+ "b 152f\n"
+ "151:" // Height 5: setup direct input
+ "mov x9, %x[input_ptr]\n" // row 0 starts at input_ptr; each further row is input_offset (x19) bytes on
+ "add x26, x9, x19\n"
+ "add x24, x26, x19\n"
+ "add x23, x24, x19\n"
+ "add x22, x23, x19\n"
+ "152:" // Height 5: input setup done
+ "cmp x10, #0x10\n"
+ "blt 155f\n" // fewer than 16 left in this string: skip the 16-at-a-time code entirely
+ "ldr q0, [x9, #0x0]\n" // v0-v4 = 16 bytes from each of the five A rows
+ "ldr q1, [x26, #0x0]\n"
+ "cmp x10, #0x20\n" // flags survive the loads below and are consumed by blt 154f
+ "ldr q2, [x24, #0x0]\n"
+ "ldr q3, [x23, #0x0]\n"
+ "ldr q4, [x22, #0x0]\n"
+ "ldr q6, [x13, #0x0]\n" // v6 = first 16 bytes of B for this step
+ "blt 154f\n" // fewer than 32 left: only the single-iteration tail runs
+ "153:" // Height 5: Multiply loop: Main loop head
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
+ "ldr d7, [x13, #0x10]\n" // each 16-byte B vector is fetched as a d-register low half ...
+ "ldr x19, [x13, #0x18]\n" // ... plus a general-register high half, merged by mov v7.d[1] below
+ "add x9, x9, #0x10\n" // A row pointers are advanced early; the reloads at the loop tail fetch the next 16 bytes
+ "ldr d6, [x13, #0x20]\n"
+ "add x26, x26, #0x10\n"
+ "ldr x28, [x13, #0x28]\n"
+ "add x24, x24, #0x10\n"
+ "mov v7.d[1], x19\n" // NOTE(review): split load + merge is presumably Cortex-A55 scheduling (commit title mentions A55) - confirm
+ "prfm pldl1keep, [x9, #0x80]\n"
+ "ldr x19, [x13, #0x38]\n" // high half of the B vector at 0x30, fetched ahead of its low half
+ "add x23, x23, #0x10\n"
+ "mov v6.d[1], x28\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
+ "ldr d7, [x13, #0x30]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x22, x22, #0x10\n"
+ "ldr d6, [x13, #0x40]\n"
+ "sub x10, x10, #0x10\n" // 16 bytes of every A row are consumed per iteration
+ "mov v7.d[1], x19\n"
+ "ldr x28, [x13, #0x48]\n"
+ "ldr x19, [x13, #0x58]\n"
+ "cmp x10, #0x20\n" // consumed by bge 153b at the bottom; nothing in between writes the flags
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ "mov v6.d[1], x28\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n"
+ "ldr d7, [x13, #0x50]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "ldr d6, [x13, #0x60]\n"
+ "mov v7.d[1], x19\n"
+ "ldr x28, [x13, #0x68]\n"
+ "ldr x19, [x13, #0x78]\n"
+ ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
+ "mov v6.d[1], x28\n"
+ ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n"
+ ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n"
+ "ldr d7, [x13, #0x70]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "ldr d6, [x13, #0x80]\n"
+ "mov v7.d[1], x19\n"
+ "ldr x28, [x13, #0x88]\n"
+ "ldr x19, [x13, #0x98]\n"
+ ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
+ "mov v6.d[1], x28\n"
+ ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n"
+ ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n"
+ "ldr d7, [x13, #0x90]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "ldr d6, [x13, #0xa0]\n"
+ "mov v7.d[1], x19\n"
+ "ldr x28, [x13, #0xa8]\n"
+ "ldr x19, [x13, #0xb8]\n"
+ ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
+ "mov v6.d[1], x28\n"
+ ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n"
+ ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n"
+ "ldr d7, [x13, #0xb0]\n"
+ "ldr d6, [x13, #0xc0]\n"
+ "ldr x28, [x13, #0xc8]\n"
+ "mov v7.d[1], x19\n"
+ "ldr x19, [x13, #0xd8]\n"
+ "ldr x27, [x9, #0x8]\n" // high half of the next row-0 A vector (x9 already advanced); merged into v0 at the loop tail
+ "mov v6.d[1], x28\n"
+ ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n"
+ ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n"
+ "ldr d7, [x13, #0xd0]\n"
+ "ldr d6, [x13, #0xe0]\n"
+ "ldr x28, [x13, #0xe8]\n"
+ "mov v7.d[1], x19\n"
+ "ldr x19, [x13, #0xf8]\n"
+ "ldr x25, [x26, #0x8]\n" // high half of the next row-1 A vector
+ "mov v6.d[1], x28\n"
+ ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n"
+ ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n"
+ "ldr d7, [x13, #0xf0]\n"
+ "ldr x21, [x24, #0x8]\n" // high half of the next row-2 A vector
+ "add x13, x13, #0x100\n" // 16 B vectors (0x100 bytes) are consumed per iteration
+ "ldr d6, [x13, #0x0]\n" // preload the first B vector of the next step (x13 already advanced)
+ "mov v7.d[1], x19\n" // x19 still holds the high half read from the old x13 + 0xf8
+ "ldr x28, [x13, #0x8]\n"
+ "ldr x19, [x23, #0x8]\n" // high half of the next row-3 A vector
+ ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
+ "mov v6.d[1], x28\n"
+ ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n"
+ "ldr d0, [x9, #0x0]\n" // last use of the old v0-v4 is above; now reload them for the next step
+ "ldr d1, [x26, #0x0]\n"
+ "ldr d2, [x24, #0x0]\n"
+ "mov v0.d[1], x27\n"
+ "ldr d3, [x23, #0x0]\n"
+ "mov v1.d[1], x25\n"
+ "ldr d4, [x22, #0x0]\n"
+ "mov v2.d[1], x21\n"
+ "ldr x21, [x22, #0x8]\n" // x21 is reused for row 4 once its row-2 value has been moved into v2
+ "mov v3.d[1], x19\n"
+ "mov v4.d[1], x21\n"
+ "bge 153b\n" // at least 32 still left: another full iteration; otherwise fall into the tail at 154
+ "154:" // Height 5: Multiply loop: Single iteration only (uses the v0-v4 and v6 already loaded; preloads nothing further)
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
+ "ldr q7, [x13, #0x10]\n" // B vectors are fetched with plain 128-bit loads here, unlike the split loads in the main loop
+ "ldr q6, [x13, #0x20]\n"
+ "sub x10, x10, #0x10\n" // 16 more consumed; no cmp follows because label 155 tests x10 directly
+ "add x9, x9, #0x10\n" // leave the A row pointers past the 16 bytes held in v0-v4
+ "add x26, x26, #0x10\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
+ "add x24, x24, #0x10\n"
+ "ldr q7, [x13, #0x30]\n"
+ "add x23, x23, #0x10\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x22, x22, #0x10\n"
+ "ldr q6, [x13, #0x40]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n"
+ "ldr q7, [x13, #0x50]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "ldr q6, [x13, #0x60]\n"
+ ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n"
+ ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n"
+ "ldr q7, [x13, #0x70]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "ldr q6, [x13, #0x80]\n"
+ ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n"
+ ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n"
+ "ldr q7, [x13, #0x90]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "ldr q6, [x13, #0xa0]\n"
+ ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n"
+ ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n"
+ "ldr q7, [x13, #0xb0]\n"
+ "ldr q6, [x13, #0xc0]\n"
+ ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n"
+ ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n"
+ "ldr q7, [x13, #0xd0]\n"
+ "ldr q6, [x13, #0xe0]\n"
+ ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n"
+ ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n"
+ "ldr q7, [x13, #0xf0]\n"
+ "add x13, x13, #0x100\n" // step past the 0x100 bytes of B used by this iteration
+ ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n"
+ "155:" // Height 5: Multiply loop: Main loop skip
+ "cbz x10, 160f\n" // nothing left in this string
+ "cmp x10, #0x4\n"
+ "blt 157f\n" // fewer than 4 left: go straight to the ragged read
+ "156:" // Height 5: Multiply loop: Odd block loop
+ "ldr s0, [x9], #0x4\n" // 4 bytes from each of the five A rows, post-incremented
+ "sub x10, x10, #0x4\n"
+ "ldr s1, [x26], #0x4\n"
+ "cmp x10, #0x4\n" // flags are kept until the bge at the end of the loop body
+ "ldr s2, [x24], #0x4\n"
+ "ldr s3, [x23], #0x4\n"
+ "ldr s4, [x22], #0x4\n"
+ "ldr q6, [x13, #0x0]\n"
+ "ldr q7, [x13, #0x10]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
+ "ldr q6, [x13, #0x20]\n"
+ "ldr q7, [x13, #0x30]\n"
+ "add x13, x13, #0x40\n" // four B vectors (0x40 bytes) per 4-byte step
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
+ "bge 156b\n"
+ "cbz x10, 160f\n" // length was a multiple of 4: no ragged remainder
+ "157:" // Height 5: Multiply loop: Skip odd blocks
+ "tbz x10, #1, 158f\n" // 1-3 bytes remain; bit 1 clear means exactly one
+ "ldr h0, [x9], #0x2\n"
+ "ldr h1, [x26], #0x2\n"
+ "ldr h2, [x24], #0x2\n"
+ "ldr h3, [x23], #0x2\n"
+ "ldr h4, [x22], #0x2\n"
+ "tbz x10, #0, 159f\n" // exactly two: done; three: insert the third byte into lane 2
+ "ld1 { v0.b }[2], [x9]\n"
+ "ld1 { v1.b }[2], [x26]\n"
+ "ld1 { v2.b }[2], [x24]\n"
+ "ld1 { v3.b }[2], [x23]\n"
+ "ld1 { v4.b }[2], [x22]\n"
+ "b 159f\n"
+ "158:" // Height 5: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x9, #0x0]\n"
+ "ldr b1, [x26, #0x0]\n"
+ "ldr b2, [x24, #0x0]\n"
+ "ldr b3, [x23, #0x0]\n"
+ "ldr b4, [x22, #0x0]\n"
+ "159:" // Height 5: Multiply loop: Ragged operand read: Done (the h/b loads clear the rest of each register, so missing bytes are zero)
+ "ldr q6, [x13, #0x0]\n"
+ "ldr q7, [x13, #0x10]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
+ "ldr q6, [x13, #0x20]\n"
+ "ldr q7, [x13, #0x30]\n"
+ "add x13, x13, #0x40\n" // the ragged step still consumes a full 0x40 bytes of B
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
+ "160:" // Height 5: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" // reloaded every pass; x19 is used as scratch elsewhere
+ "add x11, x11, #0x1\n"
+ "cmp x11, x19\n"
+ "bne 150b\n" // more strings to accumulate into the same tile
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" // x20-x23 were reused during the multiply, so the row pointers are rebuilt here
+ "prfm pstl1keep, [x12, #0x0]\n"
+ "cmp x14, #0x10\n" // consumed by bge 169f below; add and prfm leave the flags alone
+ "add x23, x12, x19, LSL #2\n" // x23, x22, x21, x20 = output rows 1..4, each output_offset*4 bytes apart
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "add x22, x23, x19, LSL #2\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "add x21, x22, x19, LSL #2\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "add x20, x21, x19, LSL #2\n"
+ "prfm pstl1keep, [x20, #0x0]\n"
+ "bge 169f\n" // at least 16 columns left: store the whole tile
+ "tbz x14, #3, 164f\n" // fewer than 16: bits 3..0 of x14 select 8/4/2/1-column pieces
+ "st1 { v8.4s }, [x12], #0x10\n"
+ "st1 { v9.4s }, [x12], #0x10\n"
+ "st1 { v12.4s }, [x23], #0x10\n"
+ "st1 { v13.4s }, [x23], #0x10\n"
+ "st1 { v16.4s }, [x22], #0x10\n"
+ "st1 { v17.4s }, [x22], #0x10\n"
+ "st1 { v20.4s }, [x21], #0x10\n"
+ "st1 { v21.4s }, [x21], #0x10\n"
+ "st1 { v24.4s }, [x20], #0x10\n"
+ "st1 { v25.4s }, [x20], #0x10\n"
+ "tbz x14, #2, 162f\n"
+ "st1 { v10.4s }, [x12], #0x10\n"
+ "st1 { v14.4s }, [x23], #0x10\n"
+ "st1 { v18.4s }, [x22], #0x10\n"
+ "st1 { v22.4s }, [x21], #0x10\n"
+ "st1 { v26.4s }, [x20], #0x10\n"
+ "tbz x14, #1, 161f\n"
+ "str d11, [x12], #0x8\n"
+ "str d15, [x23], #0x8\n"
+ "str d19, [x22], #0x8\n"
+ "str d23, [x21], #0x8\n"
+ "str d27, [x20], #0x8\n"
+ "tbz x14, #0, 168f\n"
+ "st1 { v11.s }[2], [x12]\n" // odd final column comes from lane 2, after the two already stored
+ "st1 { v15.s }[2], [x23]\n"
+ "st1 { v19.s }[2], [x22]\n"
+ "st1 { v23.s }[2], [x21]\n"
+ "st1 { v27.s }[2], [x20]\n"
+ "b 168f\n"
+ "161:" // Height 5: Partial direct writeback: partial_1_12
+ "tbz x14, #0, 168f\n"
+ "str s11, [x12, #0x0]\n"
+ "str s15, [x23, #0x0]\n"
+ "str s19, [x22, #0x0]\n"
+ "str s23, [x21, #0x0]\n"
+ "str s27, [x20, #0x0]\n"
+ "b 168f\n"
+ "162:" // Height 5: Partial direct writeback: partial_2_8
+ "tbz x14, #1, 163f\n"
+ "str d10, [x12], #0x8\n"
+ "str d14, [x23], #0x8\n"
+ "str d18, [x22], #0x8\n"
+ "str d22, [x21], #0x8\n"
+ "str d26, [x20], #0x8\n"
+ "tbz x14, #0, 168f\n"
+ "st1 { v10.s }[2], [x12]\n"
+ "st1 { v14.s }[2], [x23]\n"
+ "st1 { v18.s }[2], [x22]\n"
+ "st1 { v22.s }[2], [x21]\n"
+ "st1 { v26.s }[2], [x20]\n"
+ "b 168f\n"
+ "163:" // Height 5: Partial direct writeback: partial_1_8
+ "tbz x14, #0, 168f\n"
+ "str s10, [x12, #0x0]\n"
+ "str s14, [x23, #0x0]\n"
+ "str s18, [x22, #0x0]\n"
+ "str s22, [x21, #0x0]\n"
+ "str s26, [x20, #0x0]\n"
+ "b 168f\n"
+ "164:" // Height 5: Partial direct writeback: partial_4_0
+ "tbz x14, #2, 166f\n"
+ "st1 { v8.4s }, [x12], #0x10\n"
+ "st1 { v12.4s }, [x23], #0x10\n"
+ "st1 { v16.4s }, [x22], #0x10\n"
+ "st1 { v20.4s }, [x21], #0x10\n"
+ "st1 { v24.4s }, [x20], #0x10\n"
+ "tbz x14, #1, 165f\n"
+ "str d9, [x12], #0x8\n"
+ "str d13, [x23], #0x8\n"
+ "str d17, [x22], #0x8\n"
+ "str d21, [x21], #0x8\n"
+ "str d25, [x20], #0x8\n"
+ "tbz x14, #0, 168f\n"
+ "st1 { v9.s }[2], [x12]\n"
+ "st1 { v13.s }[2], [x23]\n"
+ "st1 { v17.s }[2], [x22]\n"
+ "st1 { v21.s }[2], [x21]\n"
+ "st1 { v25.s }[2], [x20]\n"
+ "b 168f\n"
+ "165:" // Height 5: Partial direct writeback: partial_1_4
+ "tbz x14, #0, 168f\n"
+ "str s9, [x12, #0x0]\n"
+ "str s13, [x23, #0x0]\n"
+ "str s17, [x22, #0x0]\n"
+ "str s21, [x21, #0x0]\n"
+ "str s25, [x20, #0x0]\n"
+ "b 168f\n"
+ "166:" // Height 5: Partial direct writeback: partial_2_0
+ "tbz x14, #1, 167f\n"
+ "str d8, [x12], #0x8\n"
+ "str d12, [x23], #0x8\n"
+ "str d16, [x22], #0x8\n"
+ "str d20, [x21], #0x8\n"
+ "str d24, [x20], #0x8\n"
+ "tbz x14, #0, 168f\n"
+ "st1 { v8.s }[2], [x12]\n"
+ "st1 { v12.s }[2], [x23]\n"
+ "st1 { v16.s }[2], [x22]\n"
+ "st1 { v20.s }[2], [x21]\n"
+ "st1 { v24.s }[2], [x20]\n"
+ "b 168f\n"
+ "167:" // Height 5: Partial direct writeback: partial_1_0
+ "str s8, [x12, #0x0]\n"
+ "str s12, [x23, #0x0]\n"
+ "str s16, [x22, #0x0]\n"
+ "str s20, [x21, #0x0]\n"
+ "str s24, [x20, #0x0]\n"
+ "168:" // Height 5: Partial direct writeback: Done (x14 < 16 on this path, so the subs at 170 ends the column loop)
+ "b 170f\n"
+ "169:" // Height 5: Full writeback
+ "str q8, [x12, #0x0]\n"
+ "str q9, [x12, #0x10]\n"
+ "str q10, [x12, #0x20]\n"
+ "str q11, [x12, #0x30]\n"
+ "add x12, x12, #0x40\n" // only x12 advances to the next tile; x20-x23 are re-derived from it when needed
+ "str q12, [x23, #0x0]\n"
+ "str q13, [x23, #0x10]\n"
+ "str q14, [x23, #0x20]\n"
+ "str q15, [x23, #0x30]\n"
+ "str q16, [x22, #0x0]\n"
+ "str q17, [x22, #0x10]\n"
+ "str q18, [x22, #0x20]\n"
+ "str q19, [x22, #0x30]\n"
+ "str q20, [x21, #0x0]\n"
+ "str q21, [x21, #0x10]\n"
+ "str q22, [x21, #0x20]\n"
+ "str q23, [x21, #0x30]\n"
+ "str q24, [x20, #0x0]\n"
+ "str q25, [x20, #0x10]\n"
+ "str q26, [x20, #0x20]\n"
+ "str q27, [x20, #0x30]\n"
+ "170:" // Height 5: Writeback done
+ "subs x14, x14, #0x10\n" // 16 columns accounted for
+ "bgt 138b\n" // columns remain: next tile of the column loop
+ "b 206f\n" // Height 5 finished; label 206 lies outside this excerpt
+ "171:" // Height 6
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x12, %x[output_ptr]\n"
+ "ldr x13, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x20, #0x18\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "madd %x[output_ptr], x19, x20, %x[output_ptr]\n"
+ "172:" // Height 6: Column loop
+ "tbz %x[flags], #0, 182f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x14, #0x10\n"
+ "add x23, x12, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "add x21, x22, x19, LSL #2\n"
+ "add x20, x21, x19, LSL #2\n"
+ "add x19, x20, x19, LSL #2\n"
+ "bge 181f\n"
+ "tbz x14, #3, 176f\n"
+ "ld1 { v8.4s }, [x12], #0x10\n"
+ "ld1 { v12.4s }, [x23], #0x10\n"
+ "ld1 { v16.4s }, [x22], #0x10\n"
+ "ld1 { v9.4s }, [x12], #0x10\n"
+ "ld1 { v13.4s }, [x23], #0x10\n"
+ "ld1 { v17.4s }, [x22], #0x10\n"
+ "ld1 { v20.4s }, [x21], #0x10\n"
+ "ld1 { v24.4s }, [x20], #0x10\n"
+ "ld1 { v28.4s }, [x19], #0x10\n"
+ "ld1 { v21.4s }, [x21], #0x10\n"
+ "ld1 { v25.4s }, [x20], #0x10\n"
+ "ld1 { v29.4s }, [x19], #0x10\n"
+ "tbz x14, #2, 174f\n"
+ "ld1 { v10.4s }, [x12], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "ld1 { v18.4s }, [x22], #0x10\n"
+ "ld1 { v22.4s }, [x21], #0x10\n"
+ "ld1 { v26.4s }, [x20], #0x10\n"
+ "ld1 { v30.4s }, [x19], #0x10\n"
+ "tbz x14, #1, 173f\n"
+ "ldr d11, [x12], #0x8\n"
+ "mov x24, #0x38\n"
+ "ldr d15, [x23], #0x8\n"
+ "ldr d19, [x22], #0x8\n"
+ "ldr d23, [x21], #0x8\n"
+ "ldr d27, [x20], #0x8\n"
+ "ldr d31, [x19], #0x8\n"
+ "tbz x14, #0, 180f\n"
+ "ld1 { v11.s }[2], [x12]\n"
+ "ld1 { v15.s }[2], [x23]\n"
+ "ld1 { v19.s }[2], [x22]\n"
+ "ld1 { v23.s }[2], [x21]\n"
+ "ld1 { v27.s }[2], [x20]\n"
+ "ld1 { v31.s }[2], [x19]\n"
+ "b 180f\n"
+ "173:" // Height 6: Partial accumulate: partial_1_12
+ "mov x24, #0x30\n"
+ "tbz x14, #0, 180f\n"
+ "ldr s11, [x12, #0x0]\n"
+ "ldr s15, [x23, #0x0]\n"
+ "ldr s19, [x22, #0x0]\n"
+ "ldr s23, [x21, #0x0]\n"
+ "ldr s27, [x20, #0x0]\n"
+ "ldr s31, [x19, #0x0]\n"
+ "b 180f\n"
+ "174:" // Height 6: Partial accumulate: partial_2_8
+ "tbz x14, #1, 175f\n"
+ "ldr d10, [x12], #0x8\n"
+ "ldr d14, [x23], #0x8\n"
+ "mov x24, #0x28\n"
+ "ldr d18, [x22], #0x8\n"
+ "ldr d22, [x21], #0x8\n"
+ "ldr d26, [x20], #0x8\n"
+ "ldr d30, [x19], #0x8\n"
+ "tbz x14, #0, 180f\n"
+ "ld1 { v10.s }[2], [x12]\n"
+ "ld1 { v14.s }[2], [x23]\n"
+ "ld1 { v18.s }[2], [x22]\n"
+ "ld1 { v22.s }[2], [x21]\n"
+ "ld1 { v26.s }[2], [x20]\n"
+ "ld1 { v30.s }[2], [x19]\n"
+ "b 180f\n"
+ "175:" // Height 6: Partial accumulate: partial_1_8
+ "mov x24, #0x20\n"
+ "tbz x14, #0, 180f\n"
+ "ldr s10, [x12, #0x0]\n"
+ "ldr s14, [x23, #0x0]\n"
+ "ldr s18, [x22, #0x0]\n"
+ "ldr s22, [x21, #0x0]\n"
+ "ldr s26, [x20, #0x0]\n"
+ "ldr s30, [x19, #0x0]\n"
+ "b 180f\n"
+ "176:" // Height 6: Partial accumulate: partial_4_0
+ "tbz x14, #2, 178f\n"
+ "ld1 { v8.4s }, [x12], #0x10\n"
+ "ld1 { v12.4s }, [x23], #0x10\n"
+ "ld1 { v16.4s }, [x22], #0x10\n"
+ "ld1 { v20.4s }, [x21], #0x10\n"
+ "ld1 { v24.4s }, [x20], #0x10\n"
+ "ld1 { v28.4s }, [x19], #0x10\n"
+ "tbz x14, #1, 177f\n"
+ "ldr d9, [x12], #0x8\n"
+ "mov x24, #0x18\n"
+ "ldr d13, [x23], #0x8\n"
+ "ldr d17, [x22], #0x8\n"
+ "ldr d21, [x21], #0x8\n"
+ "ldr d25, [x20], #0x8\n"
+ "ldr d29, [x19], #0x8\n"
+ "tbz x14, #0, 180f\n"
+ "ld1 { v9.s }[2], [x12]\n"
+ "ld1 { v13.s }[2], [x23]\n"
+ "ld1 { v17.s }[2], [x22]\n"
+ "ld1 { v21.s }[2], [x21]\n"
+ "ld1 { v25.s }[2], [x20]\n"
+ "ld1 { v29.s }[2], [x19]\n"
+ "b 180f\n"
+ "177:" // Height 6: Partial accumulate: partial_1_4
+ "mov x24, #0x10\n"
+ "tbz x14, #0, 180f\n"
+ "ldr s9, [x12, #0x0]\n"
+ "ldr s13, [x23, #0x0]\n"
+ "ldr s17, [x22, #0x0]\n"
+ "ldr s21, [x21, #0x0]\n"
+ "ldr s25, [x20, #0x0]\n"
+ "ldr s29, [x19, #0x0]\n"
+ "b 180f\n"
+ "178:" // Height 6: Partial accumulate: partial_2_0
+ "tbz x14, #1, 179f\n"
+ "ldr d8, [x12], #0x8\n"
+ "ldr d12, [x23], #0x8\n"
+ "mov x24, #0x8\n"
+ "ldr d16, [x22], #0x8\n"
+ "ldr d20, [x21], #0x8\n"
+ "ldr d24, [x20], #0x8\n"
+ "ldr d28, [x19], #0x8\n"
+ "tbz x14, #0, 180f\n"
+ "ld1 { v8.s }[2], [x12]\n"
+ "ld1 { v12.s }[2], [x23]\n"
+ "ld1 { v16.s }[2], [x22]\n"
+ "ld1 { v20.s }[2], [x21]\n"
+ "ld1 { v24.s }[2], [x20]\n"
+ "ld1 { v28.s }[2], [x19]\n"
+ "b 180f\n"
+ "179:" // Height 6: Partial accumulate: partial_1_0
+ "ldr s8, [x12, #0x0]\n"
+ "mov x24, #0x0\n"
+ "ldr s12, [x23, #0x0]\n"
+ "ldr s16, [x22, #0x0]\n"
+ "ldr s20, [x21, #0x0]\n"
+ "ldr s24, [x20, #0x0]\n"
+ "ldr s28, [x19, #0x0]\n"
+ "180:" // Height 6: Partial accumulate: Done
+ "sub x12, x12, x24\n"
+ "b 183f\n"
+ "181:" // Height 6: full accumulate
+ "ldr q8, [x12, #0x0]\n"
+ "ldr q9, [x12, #0x10]\n"
+ "ldr q10, [x12, #0x20]\n"
+ "ldr q11, [x12, #0x30]\n"
+ "ldr q12, [x23, #0x0]\n"
+ "ldr q13, [x23, #0x10]\n"
+ "ldr q14, [x23, #0x20]\n"
+ "ldr q15, [x23, #0x30]\n"
+ "ldr q16, [x22, #0x0]\n"
+ "ldr q17, [x22, #0x10]\n"
+ "ldr q18, [x22, #0x20]\n"
+ "ldr q19, [x22, #0x30]\n"
+ "ldr q20, [x21, #0x0]\n"
+ "ldr q21, [x21, #0x10]\n"
+ "ldr q22, [x21, #0x20]\n"
+ "ldr q23, [x21, #0x30]\n"
+ "ldr q24, [x20, #0x0]\n"
+ "ldr q25, [x20, #0x10]\n"
+ "ldr q26, [x20, #0x20]\n"
+ "ldr q27, [x20, #0x30]\n"
+ "ldr q28, [x19, #0x0]\n"
+ "ldr q29, [x19, #0x10]\n"
+ "ldr q30, [x19, #0x20]\n"
+ "ldr q31, [x19, #0x30]\n"
+ "b 183f\n"
+ "182:" // Height 6: no accumulate
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+ "183:" // Height 6: setup done
+ "mov x11, #0x0\n"
+ "184:" // Height 6: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w10, [x20, x11, LSL #0x2]\n"
+ "tbz %x[flags], #3, 185f\n"
+ "ldr x20, [%x[input_ptr], x11, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x9, [x20, #0x0]\n"
+ "ldr x26, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x20, [x20, #0x28]\n"
+ "cbnz x11, 186f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x9, x9, x19\n"
+ "add x26, x26, x19\n"
+ "add x24, x24, x19\n"
+ "add x23, x23, x19\n"
+ "add x22, x22, x19\n"
+ "add x20, x20, x19\n"
+ "b 186f\n"
+ "185:" // Height 6: setup direct input
+ "mov x9, %x[input_ptr]\n"
+ "add x26, x9, x19\n"
+ "add x24, x26, x19\n"
+ "add x23, x24, x19\n"
+ "add x22, x23, x19\n"
+ "add x20, x22, x19\n"
+ "186:" // Height 6: input setup done
+ "cmp x10, #0x10\n"
+ "blt 189f\n"
+ "ldr q0, [x9, #0x0]\n"
+ "ldr q1, [x26, #0x0]\n"
+ "cmp x10, #0x20\n"
+ "ldr q2, [x24, #0x0]\n"
+ "ldr q3, [x23, #0x0]\n"
+ "ldr q4, [x22, #0x0]\n"
+ "ldr q5, [x20, #0x0]\n"
+ "ldr q6, [x13, #0x0]\n"
+ "blt 188f\n"
+ "187:" // Height 6: Multiply loop: Main loop head
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n"
+ "ldr d7, [x13, #0x10]\n"
+ "add x9, x9, #0x10\n"
+ "ldr x19, [x13, #0x18]\n"
+ "add x26, x26, #0x10\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
+ "add x24, x24, #0x10\n"
+ "ldr d6, [x13, #0x20]\n"
+ "add x23, x23, #0x10\n"
+ "mov v7.d[1], x19\n"
+ "ldr x28, [x13, #0x28]\n"
+ "ldr x19, [x13, #0x38]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ "mov v6.d[1], x28\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n"
+ "ldr d7, [x13, #0x30]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x20, x20, #0x10\n"
+ "ldr d6, [x13, #0x40]\n"
+ "sub x10, x10, #0x10\n"
+ "mov v7.d[1], x19\n"
+ "ldr x28, [x13, #0x48]\n"
+ "ldr x19, [x13, #0x58]\n"
+ "cmp x10, #0x20\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ "mov v6.d[1], x28\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n"
+ ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n"
+ ".inst 0x4fa5e0dc // sdot v28.4s, v6.16b, v5.4b[1]\n"
+ "ldr d7, [x13, #0x50]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "ldr d6, [x13, #0x60]\n"
+ "mov v7.d[1], x19\n"
+ "ldr x28, [x13, #0x68]\n"
+ "ldr x19, [x13, #0x78]\n"
+ ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
+ "mov v6.d[1], x28\n"
+ ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n"
+ ".inst 0x4fa5e0fd // sdot v29.4s, v7.16b, v5.4b[1]\n"
+ ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n"
+ ".inst 0x4fa5e0de // sdot v30.4s, v6.16b, v5.4b[1]\n"
+ "ldr d7, [x13, #0x70]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "ldr d6, [x13, #0x80]\n"
+ "mov v7.d[1], x19\n"
+ "ldr x28, [x13, #0x88]\n"
+ "ldr x19, [x13, #0x98]\n"
+ ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
+ "mov v6.d[1], x28\n"
+ ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n"
+ ".inst 0x4fa5e0ff // sdot v31.4s, v7.16b, v5.4b[1]\n"
+ ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n"
+ ".inst 0x4f85e8dc // sdot v28.4s, v6.16b, v5.4b[2]\n"
+ "ldr d7, [x13, #0x90]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "ldr d6, [x13, #0xa0]\n"
+ "mov v7.d[1], x19\n"
+ "ldr x28, [x13, #0xa8]\n"
+ "ldr x19, [x13, #0xb8]\n"
+ ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
+ "mov v6.d[1], x28\n"
+ ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n"
+ ".inst 0x4f85e8fd // sdot v29.4s, v7.16b, v5.4b[2]\n"
+ ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n"
+ ".inst 0x4f85e8de // sdot v30.4s, v6.16b, v5.4b[2]\n"
+ "ldr d7, [x13, #0xb0]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "ldr d6, [x13, #0xc0]\n"
+ "mov v7.d[1], x19\n"
+ "ldr x28, [x13, #0xc8]\n"
+ "ldr x19, [x13, #0xd8]\n"
+ ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
+ "mov v6.d[1], x28\n"
+ ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n"
+ ".inst 0x4f85e8ff // sdot v31.4s, v7.16b, v5.4b[2]\n"
+ ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n"
+ ".inst 0x4fa5e8dc // sdot v28.4s, v6.16b, v5.4b[3]\n"
+ "ldr d7, [x13, #0xd0]\n"
+ "ldr d6, [x13, #0xe0]\n"
+ "ldr x28, [x13, #0xe8]\n"
+ "mov v7.d[1], x19\n"
+ "ldr x19, [x13, #0xf8]\n"
+ "ldr x27, [x9, #0x8]\n"
+ "mov v6.d[1], x28\n"
+ ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n"
+ ".inst 0x4fa5e8fd // sdot v29.4s, v7.16b, v5.4b[3]\n"
+ ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n"
+ ".inst 0x4fa5e8de // sdot v30.4s, v6.16b, v5.4b[3]\n"
+ "ldr d7, [x13, #0xf0]\n"
+ "ldr x25, [x26, #0x8]\n"
+ "add x13, x13, #0x100\n"
+ "ldr d6, [x13, #0x0]\n"
+ "mov v7.d[1], x19\n"
+ "ldr x28, [x13, #0x8]\n"
+ "ldr x21, [x24, #0x8]\n"
+ ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
+ "mov v6.d[1], x28\n"
+ ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n"
+ ".inst 0x4fa5e8ff // sdot v31.4s, v7.16b, v5.4b[3]\n"
+ "ldr d0, [x9, #0x0]\n"
+ "ldr d1, [x26, #0x0]\n"
+ "ldr d2, [x24, #0x0]\n"
+ "mov v0.d[1], x27\n"
+ "ldr d3, [x23, #0x0]\n"
+ "mov v1.d[1], x25\n"
+ "ldr x19, [x23, #0x8]\n"
+ "mov v2.d[1], x21\n"
+ "ldr d4, [x22, #0x0]\n"
+ "ldr x21, [x22, #0x8]\n"
+ "mov v3.d[1], x19\n"
+ "ldr d5, [x20, #0x0]\n"
+ "ldr x19, [x20, #0x8]\n"
+ "mov v4.d[1], x21\n"
+ "mov v5.d[1], x19\n"
+ "bge 187b\n"
+ "188:" // Height 6: Multiply loop: Single iteration only
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n"
+ "ldr q7, [x13, #0x10]\n"
+ "sub x10, x10, #0x10\n"
+ "ldr q6, [x13, #0x20]\n"
+ "add x9, x9, #0x10\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n"
+ "ldr q7, [x13, #0x30]\n"
+ "add x24, x24, #0x10\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x23, x23, #0x10\n"
+ "ldr q6, [x13, #0x40]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n"
+ ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n"
+ ".inst 0x4fa5e0dc // sdot v28.4s, v6.16b, v5.4b[1]\n"
+ "ldr q7, [x13, #0x50]\n"
+ "add x20, x20, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "ldr q6, [x13, #0x60]\n"
+ ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n"
+ ".inst 0x4fa5e0fd // sdot v29.4s, v7.16b, v5.4b[1]\n"
+ ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n"
+ ".inst 0x4fa5e0de // sdot v30.4s, v6.16b, v5.4b[1]\n"
+ "ldr q7, [x13, #0x70]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "ldr q6, [x13, #0x80]\n"
+ ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n"
+ ".inst 0x4fa5e0ff // sdot v31.4s, v7.16b, v5.4b[1]\n"
+ ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n"
+ ".inst 0x4f85e8dc // sdot v28.4s, v6.16b, v5.4b[2]\n"
+ "ldr q7, [x13, #0x90]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "ldr q6, [x13, #0xa0]\n"
+ ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n"
+ ".inst 0x4f85e8fd // sdot v29.4s, v7.16b, v5.4b[2]\n"
+ ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n"
+ ".inst 0x4f85e8de // sdot v30.4s, v6.16b, v5.4b[2]\n"
+ "ldr q7, [x13, #0xb0]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "ldr q6, [x13, #0xc0]\n"
+ ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n"
+ ".inst 0x4f85e8ff // sdot v31.4s, v7.16b, v5.4b[2]\n"
+ ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n"
+ ".inst 0x4fa5e8dc // sdot v28.4s, v6.16b, v5.4b[3]\n"
+ "ldr q7, [x13, #0xd0]\n"
+ "ldr q6, [x13, #0xe0]\n"
+ ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n"
+ ".inst 0x4fa5e8fd // sdot v29.4s, v7.16b, v5.4b[3]\n"
+ ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n"
+ ".inst 0x4fa5e8de // sdot v30.4s, v6.16b, v5.4b[3]\n"
+ "ldr q7, [x13, #0xf0]\n"
+ "add x13, x13, #0x100\n"
+ ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n"
+ ".inst 0x4fa5e8ff // sdot v31.4s, v7.16b, v5.4b[3]\n"
+ "189:" // Height 6: Multiply loop: Main loop skip
+ "cbz x10, 194f\n"
+ "cmp x10, #0x4\n"
+ "blt 191f\n"
+ "190:" // Height 6: Multiply loop: Odd block loop
+ "ldr s0, [x9], #0x4\n"
+ "sub x10, x10, #0x4\n"
+ "ldr s1, [x26], #0x4\n"
+ "cmp x10, #0x4\n"
+ "ldr s2, [x24], #0x4\n"
+ "ldr s3, [x23], #0x4\n"
+ "ldr s4, [x22], #0x4\n"
+ "ldr s5, [x20], #0x4\n"
+ "ldr q6, [x13, #0x0]\n"
+ "ldr q7, [x13, #0x10]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n"
+ "ldr q6, [x13, #0x20]\n"
+ "ldr q7, [x13, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n"
+ "bge 190b\n"
+ "cbz x10, 194f\n"
+ "191:" // Height 6: Multiply loop: Skip odd blocks
+ "tbz x10, #1, 192f\n"
+ "ldr h0, [x9], #0x2\n"
+ "ldr h1, [x26], #0x2\n"
+ "ldr h2, [x24], #0x2\n"
+ "ldr h3, [x23], #0x2\n"
+ "ldr h4, [x22], #0x2\n"
+ "ldr h5, [x20], #0x2\n"
+ "tbz x10, #0, 193f\n"
+ "ld1 { v0.b }[2], [x9]\n"
+ "ld1 { v1.b }[2], [x26]\n"
+ "ld1 { v2.b }[2], [x24]\n"
+ "ld1 { v3.b }[2], [x23]\n"
+ "ld1 { v4.b }[2], [x22]\n"
+ "ld1 { v5.b }[2], [x20]\n"
+ "b 193f\n"
+ "192:" // Height 6: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x9, #0x0]\n"
+ "ldr b1, [x26, #0x0]\n"
+ "ldr b2, [x24, #0x0]\n"
+ "ldr b3, [x23, #0x0]\n"
+ "ldr b4, [x22, #0x0]\n"
+ "ldr b5, [x20, #0x0]\n"
+ "193:" // Height 6: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x13, #0x0]\n"
+ "ldr q7, [x13, #0x10]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n"
+ "ldr q6, [x13, #0x20]\n"
+ "ldr q7, [x13, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n"
+ "194:" // Height 6: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x11, x11, #0x1\n"
+ "cmp x11, x19\n"
+ "bne 184b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "prfm pstl1keep, [x12, #0x0]\n"
+ "cmp x14, #0x10\n"
+ "add x23, x12, x19, LSL #2\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "add x22, x23, x19, LSL #2\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "add x21, x22, x19, LSL #2\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "add x20, x21, x19, LSL #2\n"
+ "prfm pstl1keep, [x20, #0x0]\n"
+ "add x19, x20, x19, LSL #2\n"
+ "prfm pstl1keep, [x19, #0x0]\n"
+ "bge 203f\n"
+ "tbz x14, #3, 198f\n"
+ "st1 { v8.4s }, [x12], #0x10\n"
+ "st1 { v9.4s }, [x12], #0x10\n"
+ "st1 { v12.4s }, [x23], #0x10\n"
+ "st1 { v13.4s }, [x23], #0x10\n"
+ "st1 { v16.4s }, [x22], #0x10\n"
+ "st1 { v17.4s }, [x22], #0x10\n"
+ "st1 { v20.4s }, [x21], #0x10\n"
+ "st1 { v21.4s }, [x21], #0x10\n"
+ "st1 { v24.4s }, [x20], #0x10\n"
+ "st1 { v25.4s }, [x20], #0x10\n"
+ "st1 { v28.4s }, [x19], #0x10\n"
+ "st1 { v29.4s }, [x19], #0x10\n"
+ "tbz x14, #2, 196f\n"
+ "st1 { v10.4s }, [x12], #0x10\n"
+ "st1 { v14.4s }, [x23], #0x10\n"
+ "st1 { v18.4s }, [x22], #0x10\n"
+ "st1 { v22.4s }, [x21], #0x10\n"
+ "st1 { v26.4s }, [x20], #0x10\n"
+ "st1 { v30.4s }, [x19], #0x10\n"
+ "tbz x14, #1, 195f\n"
+ "str d11, [x12], #0x8\n"
+ "str d15, [x23], #0x8\n"
+ "str d19, [x22], #0x8\n"
+ "str d23, [x21], #0x8\n"
+ "str d27, [x20], #0x8\n"
+ "str d31, [x19], #0x8\n"
+ "tbz x14, #0, 202f\n"
+ "st1 { v11.s }[2], [x12]\n"
+ "st1 { v15.s }[2], [x23]\n"
+ "st1 { v19.s }[2], [x22]\n"
+ "st1 { v23.s }[2], [x21]\n"
+ "st1 { v27.s }[2], [x20]\n"
+ "st1 { v31.s }[2], [x19]\n"
+ "b 202f\n"
+ "195:" // Height 6: Partial direct writeback: partial_1_12
+ "tbz x14, #0, 202f\n"
+ "str s11, [x12, #0x0]\n"
+ "str s15, [x23, #0x0]\n"
+ "str s19, [x22, #0x0]\n"
+ "str s23, [x21, #0x0]\n"
+ "str s27, [x20, #0x0]\n"
+ "str s31, [x19, #0x0]\n"
+ "b 202f\n"
+ "196:" // Height 6: Partial direct writeback: partial_2_8
+ "tbz x14, #1, 197f\n"
+ "str d10, [x12], #0x8\n"
+ "str d14, [x23], #0x8\n"
+ "str d18, [x22], #0x8\n"
+ "str d22, [x21], #0x8\n"
+ "str d26, [x20], #0x8\n"
+ "str d30, [x19], #0x8\n"
+ "tbz x14, #0, 202f\n"
+ "st1 { v10.s }[2], [x12]\n"
+ "st1 { v14.s }[2], [x23]\n"
+ "st1 { v18.s }[2], [x22]\n"
+ "st1 { v22.s }[2], [x21]\n"
+ "st1 { v26.s }[2], [x20]\n"
+ "st1 { v30.s }[2], [x19]\n"
+ "b 202f\n"
+ "197:" // Height 6: Partial direct writeback: partial_1_8
+ "tbz x14, #0, 202f\n"
+ "str s10, [x12, #0x0]\n"
+ "str s14, [x23, #0x0]\n"
+ "str s18, [x22, #0x0]\n"
+ "str s22, [x21, #0x0]\n"
+ "str s26, [x20, #0x0]\n"
+ "str s30, [x19, #0x0]\n"
+ "b 202f\n"
+ "198:" // Height 6: Partial direct writeback: partial_4_0
+ "tbz x14, #2, 200f\n"
+ "st1 { v8.4s }, [x12], #0x10\n"
+ "st1 { v12.4s }, [x23], #0x10\n"
+ "st1 { v16.4s }, [x22], #0x10\n"
+ "st1 { v20.4s }, [x21], #0x10\n"
+ "st1 { v24.4s }, [x20], #0x10\n"
+ "st1 { v28.4s }, [x19], #0x10\n"
+ "tbz x14, #1, 199f\n"
+ "str d9, [x12], #0x8\n"
+ "str d13, [x23], #0x8\n"
+ "str d17, [x22], #0x8\n"
+ "str d21, [x21], #0x8\n"
+ "str d25, [x20], #0x8\n"
+ "str d29, [x19], #0x8\n"
+ "tbz x14, #0, 202f\n"
+ "st1 { v9.s }[2], [x12]\n"
+ "st1 { v13.s }[2], [x23]\n"
+ "st1 { v17.s }[2], [x22]\n"
+ "st1 { v21.s }[2], [x21]\n"
+ "st1 { v25.s }[2], [x20]\n"
+ "st1 { v29.s }[2], [x19]\n"
+ "b 202f\n"
+ "199:" // Height 6: Partial direct writeback: partial_1_4
+ "tbz x14, #0, 202f\n"
+ "str s9, [x12, #0x0]\n"
+ "str s13, [x23, #0x0]\n"
+ "str s17, [x22, #0x0]\n"
+ "str s21, [x21, #0x0]\n"
+ "str s25, [x20, #0x0]\n"
+ "str s29, [x19, #0x0]\n"
+ "b 202f\n"
+ "200:" // Height 6: Partial direct writeback: partial_2_0
+ "tbz x14, #1, 201f\n"
+ "str d8, [x12], #0x8\n"
+ "str d12, [x23], #0x8\n"
+ "str d16, [x22], #0x8\n"
+ "str d20, [x21], #0x8\n"
+ "str d24, [x20], #0x8\n"
+ "str d28, [x19], #0x8\n"
+ "tbz x14, #0, 202f\n"
+ "st1 { v8.s }[2], [x12]\n"
+ "st1 { v12.s }[2], [x23]\n"
+ "st1 { v16.s }[2], [x22]\n"
+ "st1 { v20.s }[2], [x21]\n"
+ "st1 { v24.s }[2], [x20]\n"
+ "st1 { v28.s }[2], [x19]\n"
+ "b 202f\n"
+ "201:" // Height 6: Partial direct writeback: partial_1_0
+ "str s8, [x12, #0x0]\n"
+ "str s12, [x23, #0x0]\n"
+ "str s16, [x22, #0x0]\n"
+ "str s20, [x21, #0x0]\n"
+ "str s24, [x20, #0x0]\n"
+ "str s28, [x19, #0x0]\n"
+ "202:" // Height 6: Partial direct writeback: Done
+ "b 204f\n"
+ "203:" // Height 6: Full writeback
+ "str q8, [x12, #0x0]\n"
+ "str q9, [x12, #0x10]\n"
+ "str q10, [x12, #0x20]\n"
+ "str q11, [x12, #0x30]\n"
+ "add x12, x12, #0x40\n"
+ "str q12, [x23, #0x0]\n"
+ "str q13, [x23, #0x10]\n"
+ "str q14, [x23, #0x20]\n"
+ "str q15, [x23, #0x30]\n"
+ "str q16, [x22, #0x0]\n"
+ "str q17, [x22, #0x10]\n"
+ "str q18, [x22, #0x20]\n"
+ "str q19, [x22, #0x30]\n"
+ "str q20, [x21, #0x0]\n"
+ "str q21, [x21, #0x10]\n"
+ "str q22, [x21, #0x20]\n"
+ "str q23, [x21, #0x30]\n"
+ "str q24, [x20, #0x0]\n"
+ "str q25, [x20, #0x10]\n"
+ "str q26, [x20, #0x20]\n"
+ "str q27, [x20, #0x30]\n"
+ "str q28, [x19, #0x0]\n"
+ "str q29, [x19, #0x10]\n"
+ "str q30, [x19, #0x20]\n"
+ "str q31, [x19, #0x30]\n"
+ "204:" // Height 6: Writeback done
+ "subs x14, x14, #0x10\n"
+ "bgt 172b\n"
+ "subs %x[M], %x[M], #0x6\n"
+ "beq 206f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 205f\n"
+ "add x20, x20, #0x6\n"
+ "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "205:" // Update direct input
+ "mov x19, #0x6\n"
+ "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "b 1b\n"
+ "206:" // Exit
+
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/generic.cpp
index 0423a9de11..3566027a50 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/generic.cpp
@@ -80,392 +80,378 @@ void a64_hybrid_s8s32_dot_6x16 (
"1:" // Row loop
"cmp %x[M], #0x6\n"
- "bge 176f\n"
+ "bge 171f\n"
"cmp %x[M], #0x4\n"
- "bgt 141f\n"
- "beq 106f\n"
+ "bgt 137f\n"
+ "beq 103f\n"
"cmp %x[M], #0x2\n"
- "bgt 71f\n"
- "beq 36f\n"
- "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "tbz %x[flags], #2, 2f\n"
- "ldr x13, [%x[output_ptr], #0x0]\n"
- "add x13, x13, x19, LSL #2\n"
- "b 3f\n"
- "2:" // Height 1: setup direct output
- "mov x13, %x[output_ptr]\n"
- "3:" // Height 1: Column loop
- "tbz %x[flags], #0, 13f\n"
- "cmp x15, #0x10\n"
- "bge 12f\n"
- "tbz x15, #3, 7f\n"
- "ld1 { v8.4s }, [x13], #0x10\n"
- "ld1 { v9.4s }, [x13], #0x10\n"
- "tbz x15, #2, 5f\n"
- "ld1 { v10.4s }, [x13], #0x10\n"
- "tbz x15, #1, 4f\n"
- "mov x19, #0x38\n"
- "ldr d11, [x13], #0x8\n"
- "tbz x15, #0, 11f\n"
- "ld1 { v11.s }[2], [x13]\n"
- "b 11f\n"
- "4:" // Height 1: Partial accumulate: partial_1_12
- "mov x19, #0x30\n"
- "tbz x15, #0, 11f\n"
- "ldr s11, [x13, #0x0]\n"
- "b 11f\n"
- "5:" // Height 1: Partial accumulate: partial_2_8
- "tbz x15, #1, 6f\n"
- "ldr d10, [x13], #0x8\n"
- "mov x19, #0x28\n"
- "tbz x15, #0, 11f\n"
- "ld1 { v10.s }[2], [x13]\n"
- "b 11f\n"
- "6:" // Height 1: Partial accumulate: partial_1_8
- "mov x19, #0x20\n"
- "tbz x15, #0, 11f\n"
- "ldr s10, [x13, #0x0]\n"
- "b 11f\n"
- "7:" // Height 1: Partial accumulate: partial_4_0
- "tbz x15, #2, 9f\n"
- "ld1 { v8.4s }, [x13], #0x10\n"
- "tbz x15, #1, 8f\n"
- "mov x19, #0x18\n"
- "ldr d9, [x13], #0x8\n"
- "tbz x15, #0, 11f\n"
- "ld1 { v9.s }[2], [x13]\n"
- "b 11f\n"
- "8:" // Height 1: Partial accumulate: partial_1_4
- "mov x19, #0x10\n"
- "tbz x15, #0, 11f\n"
- "ldr s9, [x13, #0x0]\n"
- "b 11f\n"
- "9:" // Height 1: Partial accumulate: partial_2_0
- "tbz x15, #1, 10f\n"
- "ldr d8, [x13], #0x8\n"
- "mov x19, #0x8\n"
- "tbz x15, #0, 11f\n"
- "ld1 { v8.s }[2], [x13]\n"
- "b 11f\n"
- "10:" // Height 1: Partial accumulate: partial_1_0
- "mov x19, #0x0\n"
- "ldr s8, [x13, #0x0]\n"
- "11:" // Height 1: Partial accumulate: Done
- "sub x13, x13, x19\n"
- "b 14f\n"
- "12:" // Height 1: full accumulate
- "ldr q8, [x13, #0x0]\n"
- "ldr q9, [x13, #0x10]\n"
- "ldr q10, [x13, #0x20]\n"
- "ldr q11, [x13, #0x30]\n"
- "b 14f\n"
- "13:" // Height 1: no accumulate
+ "bgt 69f\n"
+ "beq 35f\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "2:" // Height 1: Column loop
+ "tbz %x[flags], #0, 12f\n"
+ "cmp x10, #0x10\n"
+ "bge 11f\n"
+ "tbz x10, #3, 6f\n"
+ "ld1 { v8.4s }, [x28], #0x10\n"
+ "ld1 { v9.4s }, [x28], #0x10\n"
+ "tbz x10, #2, 4f\n"
+ "ld1 { v10.4s }, [x28], #0x10\n"
+ "tbz x10, #1, 3f\n"
+ "mov x24, #0x38\n"
+ "ldr d11, [x28], #0x8\n"
+ "tbz x10, #0, 10f\n"
+ "ld1 { v11.s }[2], [x28]\n"
+ "b 10f\n"
+ "3:" // Height 1: Partial accumulate: partial_1_12
+ "mov x24, #0x30\n"
+ "tbz x10, #0, 10f\n"
+ "ldr s11, [x28, #0x0]\n"
+ "b 10f\n"
+ "4:" // Height 1: Partial accumulate: partial_2_8
+ "tbz x10, #1, 5f\n"
+ "ldr d10, [x28], #0x8\n"
+ "mov x24, #0x28\n"
+ "tbz x10, #0, 10f\n"
+ "ld1 { v10.s }[2], [x28]\n"
+ "b 10f\n"
+ "5:" // Height 1: Partial accumulate: partial_1_8
+ "mov x24, #0x20\n"
+ "tbz x10, #0, 10f\n"
+ "ldr s10, [x28, #0x0]\n"
+ "b 10f\n"
+ "6:" // Height 1: Partial accumulate: partial_4_0
+ "tbz x10, #2, 8f\n"
+ "ld1 { v8.4s }, [x28], #0x10\n"
+ "tbz x10, #1, 7f\n"
+ "ldr d9, [x28], #0x8\n"
+ "mov x24, #0x18\n"
+ "tbz x10, #0, 10f\n"
+ "ld1 { v9.s }[2], [x28]\n"
+ "b 10f\n"
+ "7:" // Height 1: Partial accumulate: partial_1_4
+ "mov x24, #0x10\n"
+ "tbz x10, #0, 10f\n"
+ "ldr s9, [x28, #0x0]\n"
+ "b 10f\n"
+ "8:" // Height 1: Partial accumulate: partial_2_0
+ "tbz x10, #1, 9f\n"
+ "ldr d8, [x28], #0x8\n"
+ "mov x24, #0x8\n"
+ "tbz x10, #0, 10f\n"
+ "ld1 { v8.s }[2], [x28]\n"
+ "b 10f\n"
+ "9:" // Height 1: Partial accumulate: partial_1_0
+ "ldr s8, [x28, #0x0]\n"
+ "mov x24, #0x0\n"
+ "10:" // Height 1: Partial accumulate: Done
+ "sub x28, x28, x24\n"
+ "b 13f\n"
+ "11:" // Height 1: full accumulate
+ "ldr q8, [x28, #0x0]\n"
+ "ldr q9, [x28, #0x10]\n"
+ "ldr q10, [x28, #0x20]\n"
+ "ldr q11, [x28, #0x30]\n"
+ "b 13f\n"
+ "12:" // Height 1: no accumulate
"movi v8.4s, #0x0\n"
"movi v9.4s, #0x0\n"
"movi v10.4s, #0x0\n"
"movi v11.4s, #0x0\n"
- "14:" // Height 1: setup done
- "mov x12, #0x0\n"
- "15:" // Height 1: String loop
+ "13:" // Height 1: setup done
+ "mov x27, #0x0\n"
+ "14:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w11, [x20, x12, LSL #0x2]\n"
- "tbz %x[flags], #3, 16f\n"
- "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 15f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x10, [x20, #0x0]\n"
- "cbnz x12, 17f\n"
+ "ldr x25, [x20, #0x0]\n"
+ "cbnz x27, 16f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x10, x10, x19\n"
- "b 17f\n"
- "16:" // Height 1: setup direct input
- "mov x10, %x[input_ptr]\n"
- "17:" // Height 1: input setup done
- "cmp x11, #0x10\n"
- "blt 20f\n"
- "cmp x11, #0x20\n"
+ "add x25, x25, x19\n"
+ "b 16f\n"
+ "15:" // Height 1: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "16:" // Height 1: input setup done
+ "cmp x26, #0x10\n"
"blt 19f\n"
- "18:" // Height 1: Multiply loop: Main loop head
- "ldr q0, [x10, #0x0]\n"
- "ldr q6, [x14, #0x0]\n"
+ "ldr q0, [x25, #0x0]\n"
+ "ldr q6, [x9, #0x0]\n"
+ "cmp x26, #0x20\n"
+ "blt 18f\n"
+ "17:" // Height 1: Multiply loop: Main loop head
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x14, #0x10]\n"
- "ldr q6, [x14, #0x20]\n"
+ "ldr q7, [x9, #0x10]\n"
+ "add x25, x25, #0x10\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "ldr q7, [x14, #0x30]\n"
+ "ldr q6, [x9, #0x20]\n"
+ "sub x26, x26, #0x10\n"
".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- "ldr q6, [x14, #0x40]\n"
- "add x10, x10, #0x10\n"
+ "ldr q7, [x9, #0x30]\n"
+ "cmp x26, #0x20\n"
".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
- "ldr q7, [x14, #0x50]\n"
+ "ldr q6, [x9, #0x40]\n"
+ "ldr q7, [x9, #0x50]\n"
".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
- "ldr q6, [x14, #0x60]\n"
- "sub x11, x11, #0x10\n"
+ "ldr q6, [x9, #0x60]\n"
".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
- "ldr q7, [x14, #0x70]\n"
- "cmp x11, #0x20\n"
+ "ldr q7, [x9, #0x70]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
- "ldr q6, [x14, #0x80]\n"
+ "ldr q6, [x9, #0x80]\n"
".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
- "ldr q7, [x14, #0x90]\n"
+ "ldr q7, [x9, #0x90]\n"
".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
- "ldr q6, [x14, #0xa0]\n"
+ "ldr q6, [x9, #0xa0]\n"
".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
- "ldr q7, [x14, #0xb0]\n"
+ "ldr q7, [x9, #0xb0]\n"
".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
- "ldr q6, [x14, #0xc0]\n"
+ "ldr q6, [x9, #0xc0]\n"
".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
- "ldr q7, [x14, #0xd0]\n"
+ "ldr q7, [x9, #0xd0]\n"
".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
- "ldr q6, [x14, #0xe0]\n"
+ "ldr q6, [x9, #0xe0]\n"
".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
- "ldr q7, [x14, #0xf0]\n"
- "add x14, x14, #0x100\n"
+ "ldr q7, [x9, #0xf0]\n"
+ "add x9, x9, #0x100\n"
".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ "ldr q6, [x9, #0x0]\n"
".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
- "bge 18b\n"
- "19:" // Height 1: Multiply loop: Single iteration only
- "sub x11, x11, #0x10\n"
- "ldr q0, [x10, #0x0]\n"
- "ldr q6, [x14, #0x0]\n"
+ "ldr q0, [x25, #0x0]\n"
+ "bge 17b\n"
+ "18:" // Height 1: Multiply loop: Single iteration only
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x14, #0x10]\n"
- "ldr q6, [x14, #0x20]\n"
+ "ldr q7, [x9, #0x10]\n"
+ "sub x26, x26, #0x10\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "ldr q7, [x14, #0x30]\n"
+ "ldr q6, [x9, #0x20]\n"
+ "add x25, x25, #0x10\n"
".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- "ldr q6, [x14, #0x40]\n"
- "add x10, x10, #0x10\n"
+ "ldr q7, [x9, #0x30]\n"
+ "ldr q6, [x9, #0x40]\n"
".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
- "ldr q7, [x14, #0x50]\n"
+ "ldr q7, [x9, #0x50]\n"
".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
- "ldr q6, [x14, #0x60]\n"
+ "ldr q6, [x9, #0x60]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
- "ldr q7, [x14, #0x70]\n"
+ "ldr q7, [x9, #0x70]\n"
".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
- "ldr q6, [x14, #0x80]\n"
+ "ldr q6, [x9, #0x80]\n"
".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
- "ldr q7, [x14, #0x90]\n"
+ "ldr q7, [x9, #0x90]\n"
".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
- "ldr q6, [x14, #0xa0]\n"
+ "ldr q6, [x9, #0xa0]\n"
".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
- "ldr q7, [x14, #0xb0]\n"
+ "ldr q7, [x9, #0xb0]\n"
".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
- "ldr q6, [x14, #0xc0]\n"
+ "ldr q6, [x9, #0xc0]\n"
".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
- "ldr q7, [x14, #0xd0]\n"
+ "ldr q7, [x9, #0xd0]\n"
".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
- "ldr q6, [x14, #0xe0]\n"
+ "ldr q6, [x9, #0xe0]\n"
".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
- "ldr q7, [x14, #0xf0]\n"
- "add x14, x14, #0x100\n"
+ "ldr q7, [x9, #0xf0]\n"
+ "add x9, x9, #0x100\n"
".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
- "20:" // Height 1: Multiply loop: Main loop skip
- "cbz x11, 25f\n"
- "cmp x11, #0x4\n"
- "blt 22f\n"
- "21:" // Height 1: Multiply loop: Odd block loop
- "ldr s0, [x10], #0x4\n"
- "ldr q6, [x14, #0x0]\n"
+ "19:" // Height 1: Multiply loop: Main loop skip
+ "cbz x26, 24f\n"
+ "cmp x26, #0x4\n"
+ "blt 21f\n"
+ "20:" // Height 1: Multiply loop: Odd block loop
+ "ldr s0, [x25], #0x4\n"
+ "sub x26, x26, #0x4\n"
+ "ldr q6, [x9, #0x0]\n"
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x14, #0x10]\n"
- "ldr q6, [x14, #0x20]\n"
+ "ldr q7, [x9, #0x10]\n"
+ "cmp x26, #0x4\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "ldr q7, [x14, #0x30]\n"
+ "ldr q6, [x9, #0x20]\n"
+ "ldr q7, [x9, #0x30]\n"
".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- "sub x11, x11, #0x4\n"
- "add x14, x14, #0x40\n"
+ "add x9, x9, #0x40\n"
".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- "cmp x11, #0x4\n"
- "bge 21b\n"
- "cbz x11, 25f\n"
- "22:" // Height 1: Multiply loop: Skip odd blocks
- "tbz x11, #1, 23f\n"
- "ldr h0, [x10], #0x2\n"
- "tbz x11, #0, 24f\n"
- "ld1 { v0.b }[2], [x10]\n"
- "b 24f\n"
- "23:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x10, #0x0]\n"
- "24:" // Height 1: Multiply loop: Ragged operand read: Done
- "ldr q6, [x14, #0x0]\n"
+ "bge 20b\n"
+ "cbz x26, 24f\n"
+ "21:" // Height 1: Multiply loop: Skip odd blocks
+ "tbz x26, #1, 22f\n"
+ "ldr h0, [x25], #0x2\n"
+ "tbz x26, #0, 23f\n"
+ "ld1 { v0.b }[2], [x25]\n"
+ "b 23f\n"
+ "22:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x25, #0x0]\n"
+ "23:" // Height 1: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x9, #0x0]\n"
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x14, #0x10]\n"
- "ldr q6, [x14, #0x20]\n"
+ "ldr q7, [x9, #0x10]\n"
+ "ldr q6, [x9, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "ldr q7, [x14, #0x30]\n"
+ "ldr q7, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- "add x14, x14, #0x40\n"
".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
- "25:" // Height 1: Multiply loop: No odd multiplies
+ "24:" // Height 1: Multiply loop: No odd multiplies
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x12, x12, #0x1\n"
- "cmp x12, x19\n"
- "bne 15b\n"
- "prfm pstl1keep, [x13, #0x0]\n"
- "cmp x15, #0x10\n"
- "bge 34f\n"
- "tbz x15, #3, 29f\n"
- "st1 { v8.4s }, [x13], #0x10\n"
- "st1 { v9.4s }, [x13], #0x10\n"
- "tbz x15, #2, 27f\n"
- "st1 { v10.4s }, [x13], #0x10\n"
- "tbz x15, #1, 26f\n"
- "str d11, [x13], #0x8\n"
- "tbz x15, #0, 33f\n"
- "st1 { v11.s }[2], [x13]\n"
- "b 33f\n"
- "26:" // Height 1: Partial direct writeback: partial_1_12
- "tbz x15, #0, 33f\n"
- "str s11, [x13, #0x0]\n"
- "b 33f\n"
- "27:" // Height 1: Partial direct writeback: partial_2_8
- "tbz x15, #1, 28f\n"
- "str d10, [x13], #0x8\n"
- "tbz x15, #0, 33f\n"
- "st1 { v10.s }[2], [x13]\n"
- "b 33f\n"
- "28:" // Height 1: Partial direct writeback: partial_1_8
- "tbz x15, #0, 33f\n"
- "str s10, [x13, #0x0]\n"
- "b 33f\n"
- "29:" // Height 1: Partial direct writeback: partial_4_0
- "tbz x15, #2, 31f\n"
- "st1 { v8.4s }, [x13], #0x10\n"
- "tbz x15, #1, 30f\n"
- "str d9, [x13], #0x8\n"
- "tbz x15, #0, 33f\n"
- "st1 { v9.s }[2], [x13]\n"
- "b 33f\n"
- "30:" // Height 1: Partial direct writeback: partial_1_4
- "tbz x15, #0, 33f\n"
- "str s9, [x13, #0x0]\n"
- "b 33f\n"
- "31:" // Height 1: Partial direct writeback: partial_2_0
- "tbz x15, #1, 32f\n"
- "str d8, [x13], #0x8\n"
- "tbz x15, #0, 33f\n"
- "st1 { v8.s }[2], [x13]\n"
- "b 33f\n"
- "32:" // Height 1: Partial direct writeback: partial_1_0
- "str s8, [x13, #0x0]\n"
- "33:" // Height 1: Partial direct writeback: Done
- "b 35f\n"
- "34:" // Height 1: Full writeback
- "str q8, [x13, #0x0]\n"
- "str q9, [x13, #0x10]\n"
- "str q10, [x13, #0x20]\n"
- "str q11, [x13, #0x30]\n"
- "add x13, x13, #0x40\n"
- "35:" // Height 1: Writeback done
- "subs x15, x15, #0x10\n"
- "bgt 3b\n"
- "b 212f\n"
- "36:" // Height 2
- "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 14b\n"
+ "prfm pstl1keep, [x28, #0x0]\n"
+ "cmp x10, #0x10\n"
+ "bge 33f\n"
+ "tbz x10, #3, 28f\n"
+ "st1 { v8.4s }, [x28], #0x10\n"
+ "st1 { v9.4s }, [x28], #0x10\n"
+ "tbz x10, #2, 26f\n"
+ "st1 { v10.4s }, [x28], #0x10\n"
+ "tbz x10, #1, 25f\n"
+ "str d11, [x28], #0x8\n"
+ "tbz x10, #0, 32f\n"
+ "st1 { v11.s }[2], [x28]\n"
+ "b 32f\n"
+ "25:" // Height 1: Partial direct writeback: partial_1_12
+ "tbz x10, #0, 32f\n"
+ "str s11, [x28, #0x0]\n"
+ "b 32f\n"
+ "26:" // Height 1: Partial direct writeback: partial_2_8
+ "tbz x10, #1, 27f\n"
+ "str d10, [x28], #0x8\n"
+ "tbz x10, #0, 32f\n"
+ "st1 { v10.s }[2], [x28]\n"
+ "b 32f\n"
+ "27:" // Height 1: Partial direct writeback: partial_1_8
+ "tbz x10, #0, 32f\n"
+ "str s10, [x28, #0x0]\n"
+ "b 32f\n"
+ "28:" // Height 1: Partial direct writeback: partial_4_0
+ "tbz x10, #2, 30f\n"
+ "st1 { v8.4s }, [x28], #0x10\n"
+ "tbz x10, #1, 29f\n"
+ "str d9, [x28], #0x8\n"
+ "tbz x10, #0, 32f\n"
+ "st1 { v9.s }[2], [x28]\n"
+ "b 32f\n"
+ "29:" // Height 1: Partial direct writeback: partial_1_4
+ "tbz x10, #0, 32f\n"
+ "str s9, [x28, #0x0]\n"
+ "b 32f\n"
+ "30:" // Height 1: Partial direct writeback: partial_2_0
+ "tbz x10, #1, 31f\n"
+ "str d8, [x28], #0x8\n"
+ "tbz x10, #0, 32f\n"
+ "st1 { v8.s }[2], [x28]\n"
+ "b 32f\n"
+ "31:" // Height 1: Partial direct writeback: partial_1_0
+ "str s8, [x28, #0x0]\n"
+ "32:" // Height 1: Partial direct writeback: Done
+ "b 34f\n"
+ "33:" // Height 1: Full writeback
+ "str q8, [x28, #0x0]\n"
+ "str q9, [x28, #0x10]\n"
+ "str q10, [x28, #0x20]\n"
+ "str q11, [x28, #0x30]\n"
+ "add x28, x28, #0x40\n"
+ "34:" // Height 1: Writeback done
+ "subs x10, x10, #0x10\n"
+ "bgt 2b\n"
+ "b 206f\n"
+ "35:" // Height 2
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "36:" // Height 2: Column loop
+ "tbz %x[flags], #0, 46f\n"
"ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "tbz %x[flags], #2, 37f\n"
- "ldr x13, [%x[output_ptr], #0x0]\n"
- "add x13, x13, x19, LSL #2\n"
- "ldr x9, [%x[output_ptr], #0x8]\n"
- "add x9, x9, x19, LSL #2\n"
- "b 38f\n"
- "37:" // Height 2: setup direct output
- "mov x13, %x[output_ptr]\n"
- "add x9, x13, x19, LSL #2\n"
- "38:" // Height 2: Column loop
- "tbz %x[flags], #0, 48f\n"
- "cmp x15, #0x10\n"
- "bge 47f\n"
- "tbz x15, #3, 42f\n"
- "ld1 { v8.4s }, [x13], #0x10\n"
- "ld1 { v12.4s }, [x9], #0x10\n"
- "ld1 { v9.4s }, [x13], #0x10\n"
- "ld1 { v13.4s }, [x9], #0x10\n"
- "tbz x15, #2, 40f\n"
- "ld1 { v10.4s }, [x13], #0x10\n"
- "ld1 { v14.4s }, [x9], #0x10\n"
- "tbz x15, #1, 39f\n"
- "mov x19, #0x38\n"
- "ldr d11, [x13], #0x8\n"
- "ldr d15, [x9], #0x8\n"
- "tbz x15, #0, 46f\n"
- "ld1 { v11.s }[2], [x13]\n"
- "ld1 { v15.s }[2], [x9]\n"
- "b 46f\n"
- "39:" // Height 2: Partial accumulate: partial_1_12
- "mov x19, #0x30\n"
- "tbz x15, #0, 46f\n"
- "ldr s11, [x13, #0x0]\n"
- "ldr s15, [x9, #0x0]\n"
- "b 46f\n"
- "40:" // Height 2: Partial accumulate: partial_2_8
- "tbz x15, #1, 41f\n"
- "ldr d10, [x13], #0x8\n"
- "ldr d14, [x9], #0x8\n"
- "mov x19, #0x28\n"
- "tbz x15, #0, 46f\n"
- "ld1 { v10.s }[2], [x13]\n"
- "ld1 { v14.s }[2], [x9]\n"
- "b 46f\n"
- "41:" // Height 2: Partial accumulate: partial_1_8
- "mov x19, #0x20\n"
- "tbz x15, #0, 46f\n"
- "ldr s10, [x13, #0x0]\n"
- "ldr s14, [x9, #0x0]\n"
- "b 46f\n"
- "42:" // Height 2: Partial accumulate: partial_4_0
- "tbz x15, #2, 44f\n"
- "ld1 { v8.4s }, [x13], #0x10\n"
- "ld1 { v12.4s }, [x9], #0x10\n"
- "tbz x15, #1, 43f\n"
- "mov x19, #0x18\n"
- "ldr d9, [x13], #0x8\n"
- "ldr d13, [x9], #0x8\n"
- "tbz x15, #0, 46f\n"
- "ld1 { v9.s }[2], [x13]\n"
- "ld1 { v13.s }[2], [x9]\n"
- "b 46f\n"
- "43:" // Height 2: Partial accumulate: partial_1_4
- "mov x19, #0x10\n"
- "tbz x15, #0, 46f\n"
- "ldr s9, [x13, #0x0]\n"
- "ldr s13, [x9, #0x0]\n"
- "b 46f\n"
- "44:" // Height 2: Partial accumulate: partial_2_0
- "tbz x15, #1, 45f\n"
- "ldr d8, [x13], #0x8\n"
- "ldr d12, [x9], #0x8\n"
- "mov x19, #0x8\n"
- "tbz x15, #0, 46f\n"
- "ld1 { v8.s }[2], [x13]\n"
- "ld1 { v12.s }[2], [x9]\n"
- "b 46f\n"
- "45:" // Height 2: Partial accumulate: partial_1_0
- "mov x19, #0x0\n"
- "ldr s8, [x13, #0x0]\n"
- "ldr s12, [x9, #0x0]\n"
- "46:" // Height 2: Partial accumulate: Done
- "sub x13, x13, x19\n"
- "sub x9, x9, x19\n"
- "b 49f\n"
- "47:" // Height 2: full accumulate
- "ldr q8, [x13, #0x0]\n"
- "ldr q9, [x13, #0x10]\n"
- "ldr q10, [x13, #0x20]\n"
- "ldr q11, [x13, #0x30]\n"
- "ldr q12, [x9, #0x0]\n"
- "ldr q13, [x9, #0x10]\n"
- "ldr q14, [x9, #0x20]\n"
- "ldr q15, [x9, #0x30]\n"
- "b 49f\n"
- "48:" // Height 2: no accumulate
+ "cmp x10, #0x10\n"
+ "add x23, x28, x19, LSL #2\n"
+ "bge 45f\n"
+ "tbz x10, #3, 40f\n"
+ "ld1 { v8.4s }, [x28], #0x10\n"
+ "ld1 { v12.4s }, [x23], #0x10\n"
+ "ld1 { v9.4s }, [x28], #0x10\n"
+ "ld1 { v13.4s }, [x23], #0x10\n"
+ "tbz x10, #2, 38f\n"
+ "ld1 { v10.4s }, [x28], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "tbz x10, #1, 37f\n"
+ "mov x24, #0x38\n"
+ "ldr d11, [x28], #0x8\n"
+ "ldr d15, [x23], #0x8\n"
+ "tbz x10, #0, 44f\n"
+ "ld1 { v11.s }[2], [x28]\n"
+ "ld1 { v15.s }[2], [x23]\n"
+ "b 44f\n"
+ "37:" // Height 2: Partial accumulate: partial_1_12
+ "mov x24, #0x30\n"
+ "tbz x10, #0, 44f\n"
+ "ldr s11, [x28, #0x0]\n"
+ "ldr s15, [x23, #0x0]\n"
+ "b 44f\n"
+ "38:" // Height 2: Partial accumulate: partial_2_8
+ "tbz x10, #1, 39f\n"
+ "ldr d10, [x28], #0x8\n"
+ "ldr d14, [x23], #0x8\n"
+ "mov x24, #0x28\n"
+ "tbz x10, #0, 44f\n"
+ "ld1 { v10.s }[2], [x28]\n"
+ "ld1 { v14.s }[2], [x23]\n"
+ "b 44f\n"
+ "39:" // Height 2: Partial accumulate: partial_1_8
+ "mov x24, #0x20\n"
+ "tbz x10, #0, 44f\n"
+ "ldr s10, [x28, #0x0]\n"
+ "ldr s14, [x23, #0x0]\n"
+ "b 44f\n"
+ "40:" // Height 2: Partial accumulate: partial_4_0
+ "tbz x10, #2, 42f\n"
+ "ld1 { v8.4s }, [x28], #0x10\n"
+ "ld1 { v12.4s }, [x23], #0x10\n"
+ "tbz x10, #1, 41f\n"
+ "mov x24, #0x18\n"
+ "ldr d9, [x28], #0x8\n"
+ "ldr d13, [x23], #0x8\n"
+ "tbz x10, #0, 44f\n"
+ "ld1 { v9.s }[2], [x28]\n"
+ "ld1 { v13.s }[2], [x23]\n"
+ "b 44f\n"
+ "41:" // Height 2: Partial accumulate: partial_1_4
+ "mov x24, #0x10\n"
+ "tbz x10, #0, 44f\n"
+ "ldr s9, [x28, #0x0]\n"
+ "ldr s13, [x23, #0x0]\n"
+ "b 44f\n"
+ "42:" // Height 2: Partial accumulate: partial_2_0
+ "tbz x10, #1, 43f\n"
+ "ldr d8, [x28], #0x8\n"
+ "ldr d12, [x23], #0x8\n"
+ "mov x24, #0x8\n"
+ "tbz x10, #0, 44f\n"
+ "ld1 { v8.s }[2], [x28]\n"
+ "ld1 { v12.s }[2], [x23]\n"
+ "b 44f\n"
+ "43:" // Height 2: Partial accumulate: partial_1_0
+ "ldr s8, [x28, #0x0]\n"
+ "mov x24, #0x0\n"
+ "ldr s12, [x23, #0x0]\n"
+ "44:" // Height 2: Partial accumulate: Done
+ "sub x28, x28, x24\n"
+ "b 47f\n"
+ "45:" // Height 2: full accumulate
+ "ldr q8, [x28, #0x0]\n"
+ "ldr q9, [x28, #0x10]\n"
+ "ldr q10, [x28, #0x20]\n"
+ "ldr q11, [x28, #0x30]\n"
+ "ldr q12, [x23, #0x0]\n"
+ "ldr q13, [x23, #0x10]\n"
+ "ldr q14, [x23, #0x20]\n"
+ "ldr q15, [x23, #0x30]\n"
+ "b 47f\n"
+ "46:" // Height 2: no accumulate
"movi v8.4s, #0x0\n"
"movi v9.4s, #0x0\n"
"movi v10.4s, #0x0\n"
@@ -474,405 +460,395 @@ void a64_hybrid_s8s32_dot_6x16 (
"movi v13.4s, #0x0\n"
"movi v14.4s, #0x0\n"
"movi v15.4s, #0x0\n"
- "49:" // Height 2: setup done
- "mov x12, #0x0\n"
- "50:" // Height 2: String loop
+ "47:" // Height 2: setup done
+ "mov x27, #0x0\n"
+ "48:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w11, [x20, x12, LSL #0x2]\n"
- "tbz %x[flags], #3, 51f\n"
- "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 49f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x10, [x20, #0x0]\n"
- "ldr x28, [x20, #0x8]\n"
- "cbnz x12, 52f\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "cbnz x27, 50f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x10, x10, x19\n"
- "add x28, x28, x19\n"
- "b 52f\n"
- "51:" // Height 2: setup direct input
- "mov x10, %x[input_ptr]\n"
- "add x28, x10, x19\n"
- "52:" // Height 2: input setup done
- "cmp x11, #0x10\n"
- "blt 55f\n"
- "cmp x11, #0x20\n"
- "blt 54f\n"
- "53:" // Height 2: Multiply loop: Main loop head
- "ldr q0, [x10, #0x0]\n"
- "ldr q1, [x28, #0x0]\n"
- "ldr q6, [x14, #0x0]\n"
+ "add x25, x25, x19\n"
+ "add x24, x24, x19\n"
+ "b 50f\n"
+ "49:" // Height 2: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19\n"
+ "50:" // Height 2: input setup done
+ "cmp x26, #0x10\n"
+ "blt 53f\n"
+ "ldr q0, [x25, #0x0]\n"
+ "ldr q1, [x24, #0x0]\n"
+ "cmp x26, #0x20\n"
+ "ldr q6, [x9, #0x0]\n"
+ "blt 52f\n"
+ "51:" // Height 2: Multiply loop: Main loop head
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x14, #0x10]\n"
+ "ldr q7, [x9, #0x10]\n"
+ "add x25, x25, #0x10\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr q6, [x14, #0x20]\n"
- "add x10, x10, #0x10\n"
+ "ldr q6, [x9, #0x20]\n"
+ "add x24, x24, #0x10\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
- "add x28, x28, #0x10\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "sub x26, x26, #0x10\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "ldr q7, [x14, #0x30]\n"
+ "ldr q7, [x9, #0x30]\n"
+ "cmp x26, #0x20\n"
".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- "sub x11, x11, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- "ldr q6, [x14, #0x40]\n"
- "cmp x11, #0x20\n"
+ "ldr q6, [x9, #0x40]\n"
".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- "ldr q7, [x14, #0x50]\n"
+ "ldr q7, [x9, #0x50]\n"
".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
- "ldr q6, [x14, #0x60]\n"
+ "ldr q6, [x9, #0x60]\n"
".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
- "ldr q7, [x14, #0x70]\n"
+ "ldr q7, [x9, #0x70]\n"
".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
- "ldr q6, [x14, #0x80]\n"
+ "ldr q6, [x9, #0x80]\n"
".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
- "ldr q7, [x14, #0x90]\n"
+ "ldr q7, [x9, #0x90]\n"
".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
- "ldr q6, [x14, #0xa0]\n"
+ "ldr q6, [x9, #0xa0]\n"
".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
- "ldr q7, [x14, #0xb0]\n"
+ "ldr q7, [x9, #0xb0]\n"
".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
- "ldr q6, [x14, #0xc0]\n"
+ "ldr q6, [x9, #0xc0]\n"
".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
- "ldr q7, [x14, #0xd0]\n"
+ "ldr q7, [x9, #0xd0]\n"
".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
- "ldr q6, [x14, #0xe0]\n"
+ "ldr q6, [x9, #0xe0]\n"
".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
- "ldr q7, [x14, #0xf0]\n"
- "add x14, x14, #0x100\n"
+ "ldr q7, [x9, #0xf0]\n"
+ "add x9, x9, #0x100\n"
".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
+ "ldr q6, [x9, #0x0]\n"
".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ "ldr q0, [x25, #0x0]\n"
".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
- "bge 53b\n"
- "54:" // Height 2: Multiply loop: Single iteration only
- "sub x11, x11, #0x10\n"
- "ldr q0, [x10, #0x0]\n"
- "ldr q1, [x28, #0x0]\n"
- "ldr q6, [x14, #0x0]\n"
+ "ldr q1, [x24, #0x0]\n"
+ "bge 51b\n"
+ "52:" // Height 2: Multiply loop: Single iteration only
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x14, #0x10]\n"
+ "ldr q7, [x9, #0x10]\n"
+ "sub x26, x26, #0x10\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr q6, [x14, #0x20]\n"
- "add x10, x10, #0x10\n"
+ "ldr q6, [x9, #0x20]\n"
+ "add x25, x25, #0x10\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
- "add x28, x28, #0x10\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x24, x24, #0x10\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "ldr q7, [x14, #0x30]\n"
+ "ldr q7, [x9, #0x30]\n"
".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- "ldr q6, [x14, #0x40]\n"
+ "ldr q6, [x9, #0x40]\n"
".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- "ldr q7, [x14, #0x50]\n"
+ "ldr q7, [x9, #0x50]\n"
".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
- "ldr q6, [x14, #0x60]\n"
+ "ldr q6, [x9, #0x60]\n"
".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
- "ldr q7, [x14, #0x70]\n"
+ "ldr q7, [x9, #0x70]\n"
".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
- "ldr q6, [x14, #0x80]\n"
+ "ldr q6, [x9, #0x80]\n"
".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
- "ldr q7, [x14, #0x90]\n"
+ "ldr q7, [x9, #0x90]\n"
".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
- "ldr q6, [x14, #0xa0]\n"
+ "ldr q6, [x9, #0xa0]\n"
".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
- "ldr q7, [x14, #0xb0]\n"
+ "ldr q7, [x9, #0xb0]\n"
".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
- "ldr q6, [x14, #0xc0]\n"
+ "ldr q6, [x9, #0xc0]\n"
".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
- "ldr q7, [x14, #0xd0]\n"
+ "ldr q7, [x9, #0xd0]\n"
".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
- "ldr q6, [x14, #0xe0]\n"
+ "ldr q6, [x9, #0xe0]\n"
".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
- "ldr q7, [x14, #0xf0]\n"
- "add x14, x14, #0x100\n"
+ "ldr q7, [x9, #0xf0]\n"
+ "add x9, x9, #0x100\n"
".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
- "55:" // Height 2: Multiply loop: Main loop skip
- "cbz x11, 60f\n"
- "cmp x11, #0x4\n"
- "blt 57f\n"
- "56:" // Height 2: Multiply loop: Odd block loop
- "ldr s0, [x10], #0x4\n"
- "ldr s1, [x28], #0x4\n"
- "ldr q6, [x14, #0x0]\n"
+ "53:" // Height 2: Multiply loop: Main loop skip
+ "cbz x26, 58f\n"
+ "cmp x26, #0x4\n"
+ "blt 55f\n"
+ "54:" // Height 2: Multiply loop: Odd block loop
+ "ldr s0, [x25], #0x4\n"
+ "sub x26, x26, #0x4\n"
+ "ldr s1, [x24], #0x4\n"
+ "cmp x26, #0x4\n"
+ "ldr q6, [x9, #0x0]\n"
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x14, #0x10]\n"
+ "ldr q7, [x9, #0x10]\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr q6, [x14, #0x20]\n"
- "sub x11, x11, #0x4\n"
+ "ldr q6, [x9, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "cmp x11, #0x4\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "ldr q7, [x14, #0x30]\n"
+ "ldr q7, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- "add x14, x14, #0x40\n"
".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- "bge 56b\n"
- "cbz x11, 60f\n"
- "57:" // Height 2: Multiply loop: Skip odd blocks
- "tbz x11, #1, 58f\n"
- "ldr h0, [x10], #0x2\n"
- "ldr h1, [x28], #0x2\n"
- "tbz x11, #0, 59f\n"
- "ld1 { v0.b }[2], [x10]\n"
- "ld1 { v1.b }[2], [x28]\n"
- "b 59f\n"
- "58:" // Height 2: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x10, #0x0]\n"
- "ldr b1, [x28, #0x0]\n"
- "59:" // Height 2: Multiply loop: Ragged operand read: Done
- "ldr q6, [x14, #0x0]\n"
+ "bge 54b\n"
+ "cbz x26, 58f\n"
+ "55:" // Height 2: Multiply loop: Skip odd blocks
+ "tbz x26, #1, 56f\n"
+ "ldr h0, [x25], #0x2\n"
+ "ldr h1, [x24], #0x2\n"
+ "tbz x26, #0, 57f\n"
+ "ld1 { v0.b }[2], [x25]\n"
+ "ld1 { v1.b }[2], [x24]\n"
+ "b 57f\n"
+ "56:" // Height 2: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x25, #0x0]\n"
+ "ldr b1, [x24, #0x0]\n"
+ "57:" // Height 2: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x9, #0x0]\n"
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x14, #0x10]\n"
+ "ldr q7, [x9, #0x10]\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr q6, [x14, #0x20]\n"
+ "ldr q6, [x9, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "ldr q7, [x14, #0x30]\n"
+ "ldr q7, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- "add x14, x14, #0x40\n"
".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
- "60:" // Height 2: Multiply loop: No odd multiplies
+ "58:" // Height 2: Multiply loop: No odd multiplies
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x12, x12, #0x1\n"
- "cmp x12, x19\n"
- "bne 50b\n"
- "prfm pstl1keep, [x13, #0x0]\n"
- "prfm pstl1keep, [x9, #0x0]\n"
- "cmp x15, #0x10\n"
- "bge 69f\n"
- "tbz x15, #3, 64f\n"
- "st1 { v8.4s }, [x13], #0x10\n"
- "st1 { v9.4s }, [x13], #0x10\n"
- "st1 { v12.4s }, [x9], #0x10\n"
- "st1 { v13.4s }, [x9], #0x10\n"
- "tbz x15, #2, 62f\n"
- "st1 { v10.4s }, [x13], #0x10\n"
- "st1 { v14.4s }, [x9], #0x10\n"
- "tbz x15, #1, 61f\n"
- "str d11, [x13], #0x8\n"
- "str d15, [x9], #0x8\n"
- "tbz x15, #0, 68f\n"
- "st1 { v11.s }[2], [x13]\n"
- "st1 { v15.s }[2], [x9]\n"
- "b 68f\n"
- "61:" // Height 2: Partial direct writeback: partial_1_12
- "tbz x15, #0, 68f\n"
- "str s11, [x13, #0x0]\n"
- "str s15, [x9, #0x0]\n"
- "b 68f\n"
- "62:" // Height 2: Partial direct writeback: partial_2_8
- "tbz x15, #1, 63f\n"
- "str d10, [x13], #0x8\n"
- "str d14, [x9], #0x8\n"
- "tbz x15, #0, 68f\n"
- "st1 { v10.s }[2], [x13]\n"
- "st1 { v14.s }[2], [x9]\n"
- "b 68f\n"
- "63:" // Height 2: Partial direct writeback: partial_1_8
- "tbz x15, #0, 68f\n"
- "str s10, [x13, #0x0]\n"
- "str s14, [x9, #0x0]\n"
- "b 68f\n"
- "64:" // Height 2: Partial direct writeback: partial_4_0
- "tbz x15, #2, 66f\n"
- "st1 { v8.4s }, [x13], #0x10\n"
- "st1 { v12.4s }, [x9], #0x10\n"
- "tbz x15, #1, 65f\n"
- "str d9, [x13], #0x8\n"
- "str d13, [x9], #0x8\n"
- "tbz x15, #0, 68f\n"
- "st1 { v9.s }[2], [x13]\n"
- "st1 { v13.s }[2], [x9]\n"
- "b 68f\n"
- "65:" // Height 2: Partial direct writeback: partial_1_4
- "tbz x15, #0, 68f\n"
- "str s9, [x13, #0x0]\n"
- "str s13, [x9, #0x0]\n"
- "b 68f\n"
- "66:" // Height 2: Partial direct writeback: partial_2_0
- "tbz x15, #1, 67f\n"
- "str d8, [x13], #0x8\n"
- "str d12, [x9], #0x8\n"
- "tbz x15, #0, 68f\n"
- "st1 { v8.s }[2], [x13]\n"
- "st1 { v12.s }[2], [x9]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 48b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "prfm pstl1keep, [x28, #0x0]\n"
+ "cmp x10, #0x10\n"
+ "add x23, x28, x19, LSL #2\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "bge 67f\n"
+ "tbz x10, #3, 62f\n"
+ "st1 { v8.4s }, [x28], #0x10\n"
+ "st1 { v9.4s }, [x28], #0x10\n"
+ "st1 { v12.4s }, [x23], #0x10\n"
+ "st1 { v13.4s }, [x23], #0x10\n"
+ "tbz x10, #2, 60f\n"
+ "st1 { v10.4s }, [x28], #0x10\n"
+ "st1 { v14.4s }, [x23], #0x10\n"
+ "tbz x10, #1, 59f\n"
+ "str d11, [x28], #0x8\n"
+ "str d15, [x23], #0x8\n"
+ "tbz x10, #0, 66f\n"
+ "st1 { v11.s }[2], [x28]\n"
+ "st1 { v15.s }[2], [x23]\n"
+ "b 66f\n"
+ "59:" // Height 2: Partial direct writeback: partial_1_12
+ "tbz x10, #0, 66f\n"
+ "str s11, [x28, #0x0]\n"
+ "str s15, [x23, #0x0]\n"
+ "b 66f\n"
+ "60:" // Height 2: Partial direct writeback: partial_2_8
+ "tbz x10, #1, 61f\n"
+ "str d10, [x28], #0x8\n"
+ "str d14, [x23], #0x8\n"
+ "tbz x10, #0, 66f\n"
+ "st1 { v10.s }[2], [x28]\n"
+ "st1 { v14.s }[2], [x23]\n"
+ "b 66f\n"
+ "61:" // Height 2: Partial direct writeback: partial_1_8
+ "tbz x10, #0, 66f\n"
+ "str s10, [x28, #0x0]\n"
+ "str s14, [x23, #0x0]\n"
+ "b 66f\n"
+ "62:" // Height 2: Partial direct writeback: partial_4_0
+ "tbz x10, #2, 64f\n"
+ "st1 { v8.4s }, [x28], #0x10\n"
+ "st1 { v12.4s }, [x23], #0x10\n"
+ "tbz x10, #1, 63f\n"
+ "str d9, [x28], #0x8\n"
+ "str d13, [x23], #0x8\n"
+ "tbz x10, #0, 66f\n"
+ "st1 { v9.s }[2], [x28]\n"
+ "st1 { v13.s }[2], [x23]\n"
+ "b 66f\n"
+ "63:" // Height 2: Partial direct writeback: partial_1_4
+ "tbz x10, #0, 66f\n"
+ "str s9, [x28, #0x0]\n"
+ "str s13, [x23, #0x0]\n"
+ "b 66f\n"
+ "64:" // Height 2: Partial direct writeback: partial_2_0
+ "tbz x10, #1, 65f\n"
+ "str d8, [x28], #0x8\n"
+ "str d12, [x23], #0x8\n"
+ "tbz x10, #0, 66f\n"
+ "st1 { v8.s }[2], [x28]\n"
+ "st1 { v12.s }[2], [x23]\n"
+ "b 66f\n"
+ "65:" // Height 2: Partial direct writeback: partial_1_0
+ "str s8, [x28, #0x0]\n"
+ "str s12, [x23, #0x0]\n"
+ "66:" // Height 2: Partial direct writeback: Done
"b 68f\n"
- "67:" // Height 2: Partial direct writeback: partial_1_0
- "str s8, [x13, #0x0]\n"
- "str s12, [x9, #0x0]\n"
- "68:" // Height 2: Partial direct writeback: Done
- "b 70f\n"
- "69:" // Height 2: Full writeback
- "str q8, [x13, #0x0]\n"
- "str q9, [x13, #0x10]\n"
- "str q10, [x13, #0x20]\n"
- "str q11, [x13, #0x30]\n"
- "str q12, [x9, #0x0]\n"
- "str q13, [x9, #0x10]\n"
- "str q14, [x9, #0x20]\n"
- "str q15, [x9, #0x30]\n"
- "add x13, x13, #0x40\n"
- "add x9, x9, #0x40\n"
- "70:" // Height 2: Writeback done
- "subs x15, x15, #0x10\n"
- "bgt 38b\n"
- "b 212f\n"
- "71:" // Height 3
- "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "67:" // Height 2: Full writeback
+ "str q8, [x28, #0x0]\n"
+ "str q9, [x28, #0x10]\n"
+ "str q10, [x28, #0x20]\n"
+ "str q11, [x28, #0x30]\n"
+ "add x28, x28, #0x40\n"
+ "str q12, [x23, #0x0]\n"
+ "str q13, [x23, #0x10]\n"
+ "str q14, [x23, #0x20]\n"
+ "str q15, [x23, #0x30]\n"
+ "68:" // Height 2: Writeback done
+ "subs x10, x10, #0x10\n"
+ "bgt 36b\n"
+ "b 206f\n"
+ "69:" // Height 3
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "70:" // Height 3: Column loop
+ "tbz %x[flags], #0, 80f\n"
"ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "tbz %x[flags], #2, 72f\n"
- "ldr x13, [%x[output_ptr], #0x0]\n"
- "add x13, x13, x19, LSL #2\n"
- "ldr x9, [%x[output_ptr], #0x8]\n"
- "ldr x27, [%x[output_ptr], #0x10]\n"
- "add x9, x9, x19, LSL #2\n"
- "add x27, x27, x19, LSL #2\n"
- "b 73f\n"
- "72:" // Height 3: setup direct output
- "mov x13, %x[output_ptr]\n"
- "add x9, x13, x19, LSL #2\n"
- "add x27, x9, x19, LSL #2\n"
- "73:" // Height 3: Column loop
- "tbz %x[flags], #0, 83f\n"
- "cmp x15, #0x10\n"
- "bge 82f\n"
- "tbz x15, #3, 77f\n"
- "ld1 { v8.4s }, [x13], #0x10\n"
- "ld1 { v12.4s }, [x9], #0x10\n"
- "ld1 { v16.4s }, [x27], #0x10\n"
- "ld1 { v9.4s }, [x13], #0x10\n"
- "ld1 { v13.4s }, [x9], #0x10\n"
- "ld1 { v17.4s }, [x27], #0x10\n"
- "tbz x15, #2, 75f\n"
- "ld1 { v10.4s }, [x13], #0x10\n"
- "ld1 { v14.4s }, [x9], #0x10\n"
- "ld1 { v18.4s }, [x27], #0x10\n"
- "tbz x15, #1, 74f\n"
- "mov x19, #0x38\n"
- "ldr d11, [x13], #0x8\n"
- "ldr d15, [x9], #0x8\n"
- "ldr d19, [x27], #0x8\n"
- "tbz x15, #0, 81f\n"
- "ld1 { v11.s }[2], [x13]\n"
- "ld1 { v15.s }[2], [x9]\n"
- "ld1 { v19.s }[2], [x27]\n"
- "b 81f\n"
- "74:" // Height 3: Partial accumulate: partial_1_12
- "mov x19, #0x30\n"
- "tbz x15, #0, 81f\n"
- "ldr s11, [x13, #0x0]\n"
- "ldr s15, [x9, #0x0]\n"
- "ldr s19, [x27, #0x0]\n"
- "b 81f\n"
- "75:" // Height 3: Partial accumulate: partial_2_8
- "tbz x15, #1, 76f\n"
- "ldr d10, [x13], #0x8\n"
- "ldr d14, [x9], #0x8\n"
- "ldr d18, [x27], #0x8\n"
- "mov x19, #0x28\n"
- "tbz x15, #0, 81f\n"
- "ld1 { v10.s }[2], [x13]\n"
- "ld1 { v14.s }[2], [x9]\n"
- "ld1 { v18.s }[2], [x27]\n"
+ "cmp x10, #0x10\n"
+ "add x23, x28, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "bge 79f\n"
+ "tbz x10, #3, 74f\n"
+ "ld1 { v8.4s }, [x28], #0x10\n"
+ "ld1 { v12.4s }, [x23], #0x10\n"
+ "ld1 { v16.4s }, [x22], #0x10\n"
+ "ld1 { v9.4s }, [x28], #0x10\n"
+ "ld1 { v13.4s }, [x23], #0x10\n"
+ "ld1 { v17.4s }, [x22], #0x10\n"
+ "tbz x10, #2, 72f\n"
+ "ld1 { v10.4s }, [x28], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "ld1 { v18.4s }, [x22], #0x10\n"
+ "tbz x10, #1, 71f\n"
+ "mov x24, #0x38\n"
+ "ldr d11, [x28], #0x8\n"
+ "ldr d15, [x23], #0x8\n"
+ "ldr d19, [x22], #0x8\n"
+ "tbz x10, #0, 78f\n"
+ "ld1 { v11.s }[2], [x28]\n"
+ "ld1 { v15.s }[2], [x23]\n"
+ "ld1 { v19.s }[2], [x22]\n"
+ "b 78f\n"
+ "71:" // Height 3: Partial accumulate: partial_1_12
+ "mov x24, #0x30\n"
+ "tbz x10, #0, 78f\n"
+ "ldr s11, [x28, #0x0]\n"
+ "ldr s15, [x23, #0x0]\n"
+ "ldr s19, [x22, #0x0]\n"
+ "b 78f\n"
+ "72:" // Height 3: Partial accumulate: partial_2_8
+ "tbz x10, #1, 73f\n"
+ "ldr d10, [x28], #0x8\n"
+ "ldr d14, [x23], #0x8\n"
+ "mov x24, #0x28\n"
+ "ldr d18, [x22], #0x8\n"
+ "tbz x10, #0, 78f\n"
+ "ld1 { v10.s }[2], [x28]\n"
+ "ld1 { v14.s }[2], [x23]\n"
+ "ld1 { v18.s }[2], [x22]\n"
+ "b 78f\n"
+ "73:" // Height 3: Partial accumulate: partial_1_8
+ "mov x24, #0x20\n"
+ "tbz x10, #0, 78f\n"
+ "ldr s10, [x28, #0x0]\n"
+ "ldr s14, [x23, #0x0]\n"
+ "ldr s18, [x22, #0x0]\n"
+ "b 78f\n"
+ "74:" // Height 3: Partial accumulate: partial_4_0
+ "tbz x10, #2, 76f\n"
+ "ld1 { v8.4s }, [x28], #0x10\n"
+ "ld1 { v12.4s }, [x23], #0x10\n"
+ "ld1 { v16.4s }, [x22], #0x10\n"
+ "tbz x10, #1, 75f\n"
+ "mov x24, #0x18\n"
+ "ldr d9, [x28], #0x8\n"
+ "ldr d13, [x23], #0x8\n"
+ "ldr d17, [x22], #0x8\n"
+ "tbz x10, #0, 78f\n"
+ "ld1 { v9.s }[2], [x28]\n"
+ "ld1 { v13.s }[2], [x23]\n"
+ "ld1 { v17.s }[2], [x22]\n"
+ "b 78f\n"
+ "75:" // Height 3: Partial accumulate: partial_1_4
+ "mov x24, #0x10\n"
+ "tbz x10, #0, 78f\n"
+ "ldr s9, [x28, #0x0]\n"
+ "ldr s13, [x23, #0x0]\n"
+ "ldr s17, [x22, #0x0]\n"
+ "b 78f\n"
+ "76:" // Height 3: Partial accumulate: partial_2_0
+ "tbz x10, #1, 77f\n"
+ "ldr d8, [x28], #0x8\n"
+ "ldr d12, [x23], #0x8\n"
+ "mov x24, #0x8\n"
+ "ldr d16, [x22], #0x8\n"
+ "tbz x10, #0, 78f\n"
+ "ld1 { v8.s }[2], [x28]\n"
+ "ld1 { v12.s }[2], [x23]\n"
+ "ld1 { v16.s }[2], [x22]\n"
+ "b 78f\n"
+ "77:" // Height 3: Partial accumulate: partial_1_0
+ "ldr s8, [x28, #0x0]\n"
+ "mov x24, #0x0\n"
+ "ldr s12, [x23, #0x0]\n"
+ "ldr s16, [x22, #0x0]\n"
+ "78:" // Height 3: Partial accumulate: Done
+ "sub x28, x28, x24\n"
"b 81f\n"
- "76:" // Height 3: Partial accumulate: partial_1_8
- "mov x19, #0x20\n"
- "tbz x15, #0, 81f\n"
- "ldr s10, [x13, #0x0]\n"
- "ldr s14, [x9, #0x0]\n"
- "ldr s18, [x27, #0x0]\n"
+ "79:" // Height 3: full accumulate
+ "ldr q8, [x28, #0x0]\n"
+ "ldr q9, [x28, #0x10]\n"
+ "ldr q10, [x28, #0x20]\n"
+ "ldr q11, [x28, #0x30]\n"
+ "ldr q12, [x23, #0x0]\n"
+ "ldr q13, [x23, #0x10]\n"
+ "ldr q14, [x23, #0x20]\n"
+ "ldr q15, [x23, #0x30]\n"
+ "ldr q16, [x22, #0x0]\n"
+ "ldr q17, [x22, #0x10]\n"
+ "ldr q18, [x22, #0x20]\n"
+ "ldr q19, [x22, #0x30]\n"
"b 81f\n"
- "77:" // Height 3: Partial accumulate: partial_4_0
- "tbz x15, #2, 79f\n"
- "ld1 { v8.4s }, [x13], #0x10\n"
- "ld1 { v12.4s }, [x9], #0x10\n"
- "ld1 { v16.4s }, [x27], #0x10\n"
- "tbz x15, #1, 78f\n"
- "mov x19, #0x18\n"
- "ldr d9, [x13], #0x8\n"
- "ldr d13, [x9], #0x8\n"
- "ldr d17, [x27], #0x8\n"
- "tbz x15, #0, 81f\n"
- "ld1 { v9.s }[2], [x13]\n"
- "ld1 { v13.s }[2], [x9]\n"
- "ld1 { v17.s }[2], [x27]\n"
- "b 81f\n"
- "78:" // Height 3: Partial accumulate: partial_1_4
- "mov x19, #0x10\n"
- "tbz x15, #0, 81f\n"
- "ldr s9, [x13, #0x0]\n"
- "ldr s13, [x9, #0x0]\n"
- "ldr s17, [x27, #0x0]\n"
- "b 81f\n"
- "79:" // Height 3: Partial accumulate: partial_2_0
- "tbz x15, #1, 80f\n"
- "ldr d8, [x13], #0x8\n"
- "ldr d12, [x9], #0x8\n"
- "ldr d16, [x27], #0x8\n"
- "mov x19, #0x8\n"
- "tbz x15, #0, 81f\n"
- "ld1 { v8.s }[2], [x13]\n"
- "ld1 { v12.s }[2], [x9]\n"
- "ld1 { v16.s }[2], [x27]\n"
- "b 81f\n"
- "80:" // Height 3: Partial accumulate: partial_1_0
- "mov x19, #0x0\n"
- "ldr s8, [x13, #0x0]\n"
- "ldr s12, [x9, #0x0]\n"
- "ldr s16, [x27, #0x0]\n"
- "81:" // Height 3: Partial accumulate: Done
- "sub x13, x13, x19\n"
- "sub x9, x9, x19\n"
- "sub x27, x27, x19\n"
- "b 84f\n"
- "82:" // Height 3: full accumulate
- "ldr q8, [x13, #0x0]\n"
- "ldr q9, [x13, #0x10]\n"
- "ldr q10, [x13, #0x20]\n"
- "ldr q11, [x13, #0x30]\n"
- "ldr q12, [x9, #0x0]\n"
- "ldr q13, [x9, #0x10]\n"
- "ldr q14, [x9, #0x20]\n"
- "ldr q15, [x9, #0x30]\n"
- "ldr q16, [x27, #0x0]\n"
- "ldr q17, [x27, #0x10]\n"
- "ldr q18, [x27, #0x20]\n"
- "ldr q19, [x27, #0x30]\n"
- "b 84f\n"
- "83:" // Height 3: no accumulate
+ "80:" // Height 3: no accumulate
"movi v8.4s, #0x0\n"
"movi v9.4s, #0x0\n"
"movi v10.4s, #0x0\n"
@@ -885,504 +861,491 @@ void a64_hybrid_s8s32_dot_6x16 (
"movi v17.4s, #0x0\n"
"movi v18.4s, #0x0\n"
"movi v19.4s, #0x0\n"
- "84:" // Height 3: setup done
- "mov x12, #0x0\n"
- "85:" // Height 3: String loop
+ "81:" // Height 3: setup done
+ "mov x27, #0x0\n"
+ "82:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w11, [x20, x12, LSL #0x2]\n"
- "tbz %x[flags], #3, 86f\n"
- "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 83f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x10, [x20, #0x0]\n"
- "ldr x28, [x20, #0x8]\n"
- "ldr x26, [x20, #0x10]\n"
- "cbnz x12, 87f\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "cbnz x27, 84f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x10, x10, x19\n"
- "add x28, x28, x19\n"
- "add x26, x26, x19\n"
- "b 87f\n"
- "86:" // Height 3: setup direct input
- "mov x10, %x[input_ptr]\n"
- "add x28, x10, x19\n"
- "add x26, x28, x19\n"
- "87:" // Height 3: input setup done
- "cmp x11, #0x10\n"
- "blt 90f\n"
- "cmp x11, #0x20\n"
- "blt 89f\n"
- "88:" // Height 3: Multiply loop: Main loop head
- "ldr q0, [x10, #0x0]\n"
- "ldr q1, [x28, #0x0]\n"
- "ldr q2, [x26, #0x0]\n"
- "ldr q6, [x14, #0x0]\n"
+ "add x25, x25, x19\n"
+ "add x24, x24, x19\n"
+ "add x23, x23, x19\n"
+ "b 84f\n"
+ "83:" // Height 3: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19\n"
+ "add x23, x24, x19\n"
+ "84:" // Height 3: input setup done
+ "cmp x26, #0x10\n"
+ "blt 87f\n"
+ "ldr q0, [x25, #0x0]\n"
+ "ldr q1, [x24, #0x0]\n"
+ "cmp x26, #0x20\n"
+ "ldr q2, [x23, #0x0]\n"
+ "ldr q6, [x9, #0x0]\n"
+ "blt 86f\n"
+ "85:" // Height 3: Multiply loop: Main loop head
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x14, #0x10]\n"
+ "ldr q7, [x9, #0x10]\n"
+ "add x25, x25, #0x10\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "add x10, x10, #0x10\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x24, x24, #0x10\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "ldr q6, [x14, #0x20]\n"
+ "ldr q6, [x9, #0x20]\n"
+ "add x23, x23, #0x10\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "add x28, x28, #0x10\n"
- "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "sub x26, x26, #0x10\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "add x26, x26, #0x10\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "cmp x26, #0x20\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "ldr q7, [x14, #0x30]\n"
+ "ldr q7, [x9, #0x30]\n"
".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- "sub x11, x11, #0x10\n"
".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
- "cmp x11, #0x20\n"
".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- "ldr q6, [x14, #0x40]\n"
+ "ldr q6, [x9, #0x40]\n"
".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- "ldr q7, [x14, #0x50]\n"
+ "ldr q7, [x9, #0x50]\n"
".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
- "ldr q6, [x14, #0x60]\n"
+ "ldr q6, [x9, #0x60]\n"
".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
- "ldr q7, [x14, #0x70]\n"
+ "ldr q7, [x9, #0x70]\n"
".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
- "ldr q6, [x14, #0x80]\n"
+ "ldr q6, [x9, #0x80]\n"
".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
- "ldr q7, [x14, #0x90]\n"
+ "ldr q7, [x9, #0x90]\n"
".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
- "ldr q6, [x14, #0xa0]\n"
+ "ldr q6, [x9, #0xa0]\n"
".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
- "ldr q7, [x14, #0xb0]\n"
+ "ldr q7, [x9, #0xb0]\n"
".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
- "ldr q6, [x14, #0xc0]\n"
+ "ldr q6, [x9, #0xc0]\n"
".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
- "ldr q7, [x14, #0xd0]\n"
+ "ldr q7, [x9, #0xd0]\n"
".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
- "ldr q6, [x14, #0xe0]\n"
+ "ldr q6, [x9, #0xe0]\n"
".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
- "ldr q7, [x14, #0xf0]\n"
- "add x14, x14, #0x100\n"
+ "ldr q7, [x9, #0xf0]\n"
+ "add x9, x9, #0x100\n"
".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
+ "ldr q6, [x9, #0x0]\n"
".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ "ldr q0, [x25, #0x0]\n"
".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
+ "ldr q1, [x24, #0x0]\n"
".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
- "bge 88b\n"
- "89:" // Height 3: Multiply loop: Single iteration only
- "sub x11, x11, #0x10\n"
- "ldr q0, [x10, #0x0]\n"
- "ldr q1, [x28, #0x0]\n"
- "ldr q2, [x26, #0x0]\n"
- "ldr q6, [x14, #0x0]\n"
+ "ldr q2, [x23, #0x0]\n"
+ "bge 85b\n"
+ "86:" // Height 3: Multiply loop: Single iteration only
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x14, #0x10]\n"
+ "ldr q7, [x9, #0x10]\n"
+ "sub x26, x26, #0x10\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "add x10, x10, #0x10\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "add x25, x25, #0x10\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "ldr q6, [x14, #0x20]\n"
+ "add x24, x24, #0x10\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "add x28, x28, #0x10\n"
- "prfm pldl1keep, [x28, #0x80]\n"
+ "ldr q6, [x9, #0x20]\n"
+ "add x23, x23, #0x10\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "add x26, x26, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "ldr q7, [x14, #0x30]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "ldr q7, [x9, #0x30]\n"
".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
- "ldr q6, [x14, #0x40]\n"
+ "ldr q6, [x9, #0x40]\n"
".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- "ldr q7, [x14, #0x50]\n"
+ "ldr q7, [x9, #0x50]\n"
".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
- "ldr q6, [x14, #0x60]\n"
+ "ldr q6, [x9, #0x60]\n"
".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
- "ldr q7, [x14, #0x70]\n"
+ "ldr q7, [x9, #0x70]\n"
".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
- "ldr q6, [x14, #0x80]\n"
+ "ldr q6, [x9, #0x80]\n"
".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
- "ldr q7, [x14, #0x90]\n"
+ "ldr q7, [x9, #0x90]\n"
".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
- "ldr q6, [x14, #0xa0]\n"
+ "ldr q6, [x9, #0xa0]\n"
".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
- "ldr q7, [x14, #0xb0]\n"
+ "ldr q7, [x9, #0xb0]\n"
".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
- "ldr q6, [x14, #0xc0]\n"
+ "ldr q6, [x9, #0xc0]\n"
".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
- "ldr q7, [x14, #0xd0]\n"
+ "ldr q7, [x9, #0xd0]\n"
".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
- "ldr q6, [x14, #0xe0]\n"
+ "ldr q6, [x9, #0xe0]\n"
".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
- "ldr q7, [x14, #0xf0]\n"
- "add x14, x14, #0x100\n"
+ "ldr q7, [x9, #0xf0]\n"
+ "add x9, x9, #0x100\n"
".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
- "90:" // Height 3: Multiply loop: Main loop skip
- "cbz x11, 95f\n"
- "cmp x11, #0x4\n"
- "blt 92f\n"
- "91:" // Height 3: Multiply loop: Odd block loop
- "ldr s0, [x10], #0x4\n"
- "ldr s1, [x28], #0x4\n"
- "ldr s2, [x26], #0x4\n"
- "ldr q6, [x14, #0x0]\n"
+ "87:" // Height 3: Multiply loop: Main loop skip
+ "cbz x26, 92f\n"
+ "cmp x26, #0x4\n"
+ "blt 89f\n"
+ "88:" // Height 3: Multiply loop: Odd block loop
+ "ldr s0, [x25], #0x4\n"
+ "sub x26, x26, #0x4\n"
+ "ldr s1, [x24], #0x4\n"
+ "cmp x26, #0x4\n"
+ "ldr s2, [x23], #0x4\n"
+ "ldr q6, [x9, #0x0]\n"
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x14, #0x10]\n"
+ "ldr q7, [x9, #0x10]\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "sub x11, x11, #0x4\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "ldr q6, [x14, #0x20]\n"
- "cmp x11, #0x4\n"
+ "ldr q6, [x9, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- "ldr q7, [x14, #0x30]\n"
+ "ldr q7, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- "add x14, x14, #0x40\n"
".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- "bge 91b\n"
- "cbz x11, 95f\n"
- "92:" // Height 3: Multiply loop: Skip odd blocks
- "tbz x11, #1, 93f\n"
- "ldr h0, [x10], #0x2\n"
- "ldr h1, [x28], #0x2\n"
- "ldr h2, [x26], #0x2\n"
- "tbz x11, #0, 94f\n"
- "ld1 { v0.b }[2], [x10]\n"
- "ld1 { v1.b }[2], [x28]\n"
- "ld1 { v2.b }[2], [x26]\n"
- "b 94f\n"
- "93:" // Height 3: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x10, #0x0]\n"
- "ldr b1, [x28, #0x0]\n"
- "ldr b2, [x26, #0x0]\n"
- "94:" // Height 3: Multiply loop: Ragged operand read: Done
- "ldr q6, [x14, #0x0]\n"
+ "bge 88b\n"
+ "cbz x26, 92f\n"
+ "89:" // Height 3: Multiply loop: Skip odd blocks
+ "tbz x26, #1, 90f\n"
+ "ldr h0, [x25], #0x2\n"
+ "ldr h1, [x24], #0x2\n"
+ "ldr h2, [x23], #0x2\n"
+ "tbz x26, #0, 91f\n"
+ "ld1 { v0.b }[2], [x25]\n"
+ "ld1 { v1.b }[2], [x24]\n"
+ "ld1 { v2.b }[2], [x23]\n"
+ "b 91f\n"
+ "90:" // Height 3: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x25, #0x0]\n"
+ "ldr b1, [x24, #0x0]\n"
+ "ldr b2, [x23, #0x0]\n"
+ "91:" // Height 3: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x9, #0x0]\n"
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x14, #0x10]\n"
+ "ldr q7, [x9, #0x10]\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "ldr q6, [x14, #0x20]\n"
+ "ldr q6, [x9, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- "ldr q7, [x14, #0x30]\n"
+ "ldr q7, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- "add x14, x14, #0x40\n"
".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
- "95:" // Height 3: Multiply loop: No odd multiplies
+ "92:" // Height 3: Multiply loop: No odd multiplies
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x12, x12, #0x1\n"
- "cmp x12, x19\n"
- "bne 85b\n"
- "prfm pstl1keep, [x13, #0x0]\n"
- "prfm pstl1keep, [x9, #0x0]\n"
- "cmp x15, #0x10\n"
- "prfm pstl1keep, [x27, #0x0]\n"
- "bge 104f\n"
- "tbz x15, #3, 99f\n"
- "st1 { v8.4s }, [x13], #0x10\n"
- "st1 { v9.4s }, [x13], #0x10\n"
- "st1 { v12.4s }, [x9], #0x10\n"
- "st1 { v13.4s }, [x9], #0x10\n"
- "st1 { v16.4s }, [x27], #0x10\n"
- "st1 { v17.4s }, [x27], #0x10\n"
- "tbz x15, #2, 97f\n"
- "st1 { v10.4s }, [x13], #0x10\n"
- "st1 { v14.4s }, [x9], #0x10\n"
- "st1 { v18.4s }, [x27], #0x10\n"
- "tbz x15, #1, 96f\n"
- "str d11, [x13], #0x8\n"
- "str d15, [x9], #0x8\n"
- "str d19, [x27], #0x8\n"
- "tbz x15, #0, 103f\n"
- "st1 { v11.s }[2], [x13]\n"
- "st1 { v15.s }[2], [x9]\n"
- "st1 { v19.s }[2], [x27]\n"
- "b 103f\n"
- "96:" // Height 3: Partial direct writeback: partial_1_12
- "tbz x15, #0, 103f\n"
- "str s11, [x13, #0x0]\n"
- "str s15, [x9, #0x0]\n"
- "str s19, [x27, #0x0]\n"
- "b 103f\n"
- "97:" // Height 3: Partial direct writeback: partial_2_8
- "tbz x15, #1, 98f\n"
- "str d10, [x13], #0x8\n"
- "str d14, [x9], #0x8\n"
- "str d18, [x27], #0x8\n"
- "tbz x15, #0, 103f\n"
- "st1 { v10.s }[2], [x13]\n"
- "st1 { v14.s }[2], [x9]\n"
- "st1 { v18.s }[2], [x27]\n"
- "b 103f\n"
- "98:" // Height 3: Partial direct writeback: partial_1_8
- "tbz x15, #0, 103f\n"
- "str s10, [x13, #0x0]\n"
- "str s14, [x9, #0x0]\n"
- "str s18, [x27, #0x0]\n"
- "b 103f\n"
- "99:" // Height 3: Partial direct writeback: partial_4_0
- "tbz x15, #2, 101f\n"
- "st1 { v8.4s }, [x13], #0x10\n"
- "st1 { v12.4s }, [x9], #0x10\n"
- "st1 { v16.4s }, [x27], #0x10\n"
- "tbz x15, #1, 100f\n"
- "str d9, [x13], #0x8\n"
- "str d13, [x9], #0x8\n"
- "str d17, [x27], #0x8\n"
- "tbz x15, #0, 103f\n"
- "st1 { v9.s }[2], [x13]\n"
- "st1 { v13.s }[2], [x9]\n"
- "st1 { v17.s }[2], [x27]\n"
- "b 103f\n"
- "100:" // Height 3: Partial direct writeback: partial_1_4
- "tbz x15, #0, 103f\n"
- "str s9, [x13, #0x0]\n"
- "str s13, [x9, #0x0]\n"
- "str s17, [x27, #0x0]\n"
- "b 103f\n"
- "101:" // Height 3: Partial direct writeback: partial_2_0
- "tbz x15, #1, 102f\n"
- "str d8, [x13], #0x8\n"
- "str d12, [x9], #0x8\n"
- "str d16, [x27], #0x8\n"
- "tbz x15, #0, 103f\n"
- "st1 { v8.s }[2], [x13]\n"
- "st1 { v12.s }[2], [x9]\n"
- "st1 { v16.s }[2], [x27]\n"
- "b 103f\n"
- "102:" // Height 3: Partial direct writeback: partial_1_0
- "str s8, [x13, #0x0]\n"
- "str s12, [x9, #0x0]\n"
- "str s16, [x27, #0x0]\n"
- "103:" // Height 3: Partial direct writeback: Done
- "b 105f\n"
- "104:" // Height 3: Full writeback
- "str q8, [x13, #0x0]\n"
- "str q9, [x13, #0x10]\n"
- "str q10, [x13, #0x20]\n"
- "str q11, [x13, #0x30]\n"
- "str q12, [x9, #0x0]\n"
- "str q13, [x9, #0x10]\n"
- "str q14, [x9, #0x20]\n"
- "str q15, [x9, #0x30]\n"
- "str q16, [x27, #0x0]\n"
- "str q17, [x27, #0x10]\n"
- "str q18, [x27, #0x20]\n"
- "str q19, [x27, #0x30]\n"
- "add x13, x13, #0x40\n"
- "add x9, x9, #0x40\n"
- "add x27, x27, #0x40\n"
- "105:" // Height 3: Writeback done
- "subs x15, x15, #0x10\n"
- "bgt 73b\n"
- "b 212f\n"
- "106:" // Height 4
- "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 82b\n"
"ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "tbz %x[flags], #2, 107f\n"
- "ldr x13, [%x[output_ptr], #0x0]\n"
- "add x13, x13, x19, LSL #2\n"
- "ldr x9, [%x[output_ptr], #0x8]\n"
- "ldr x27, [%x[output_ptr], #0x10]\n"
- "add x9, x9, x19, LSL #2\n"
- "ldr x25, [%x[output_ptr], #0x18]\n"
- "add x27, x27, x19, LSL #2\n"
- "add x25, x25, x19, LSL #2\n"
- "b 108f\n"
- "107:" // Height 4: setup direct output
- "mov x13, %x[output_ptr]\n"
- "add x9, x13, x19, LSL #2\n"
- "add x27, x9, x19, LSL #2\n"
- "add x25, x27, x19, LSL #2\n"
- "108:" // Height 4: Column loop
- "tbz %x[flags], #0, 118f\n"
- "cmp x15, #0x10\n"
- "bge 117f\n"
- "tbz x15, #3, 112f\n"
- "ld1 { v8.4s }, [x13], #0x10\n"
- "ld1 { v12.4s }, [x9], #0x10\n"
- "ld1 { v16.4s }, [x27], #0x10\n"
- "ld1 { v20.4s }, [x25], #0x10\n"
- "ld1 { v9.4s }, [x13], #0x10\n"
- "ld1 { v13.4s }, [x9], #0x10\n"
- "ld1 { v17.4s }, [x27], #0x10\n"
- "ld1 { v21.4s }, [x25], #0x10\n"
- "tbz x15, #2, 110f\n"
- "ld1 { v10.4s }, [x13], #0x10\n"
- "ld1 { v14.4s }, [x9], #0x10\n"
- "ld1 { v18.4s }, [x27], #0x10\n"
- "ld1 { v22.4s }, [x25], #0x10\n"
- "tbz x15, #1, 109f\n"
- "mov x19, #0x38\n"
- "ldr d11, [x13], #0x8\n"
- "ldr d15, [x9], #0x8\n"
- "ldr d19, [x27], #0x8\n"
- "ldr d23, [x25], #0x8\n"
- "tbz x15, #0, 116f\n"
- "ld1 { v11.s }[2], [x13]\n"
- "ld1 { v15.s }[2], [x9]\n"
- "ld1 { v19.s }[2], [x27]\n"
- "ld1 { v23.s }[2], [x25]\n"
- "b 116f\n"
- "109:" // Height 4: Partial accumulate: partial_1_12
- "mov x19, #0x30\n"
- "tbz x15, #0, 116f\n"
- "ldr s11, [x13, #0x0]\n"
- "ldr s15, [x9, #0x0]\n"
- "ldr s19, [x27, #0x0]\n"
- "ldr s23, [x25, #0x0]\n"
- "b 116f\n"
- "110:" // Height 4: Partial accumulate: partial_2_8
- "tbz x15, #1, 111f\n"
- "ldr d10, [x13], #0x8\n"
- "ldr d14, [x9], #0x8\n"
- "ldr d18, [x27], #0x8\n"
- "ldr d22, [x25], #0x8\n"
- "mov x19, #0x28\n"
- "tbz x15, #0, 116f\n"
- "ld1 { v10.s }[2], [x13]\n"
- "ld1 { v14.s }[2], [x9]\n"
- "ld1 { v18.s }[2], [x27]\n"
- "ld1 { v22.s }[2], [x25]\n"
- "b 116f\n"
- "111:" // Height 4: Partial accumulate: partial_1_8
- "mov x19, #0x20\n"
- "tbz x15, #0, 116f\n"
- "ldr s10, [x13, #0x0]\n"
- "ldr s14, [x9, #0x0]\n"
- "ldr s18, [x27, #0x0]\n"
- "ldr s22, [x25, #0x0]\n"
- "b 116f\n"
- "112:" // Height 4: Partial accumulate: partial_4_0
- "tbz x15, #2, 114f\n"
- "ld1 { v8.4s }, [x13], #0x10\n"
- "ld1 { v12.4s }, [x9], #0x10\n"
- "ld1 { v16.4s }, [x27], #0x10\n"
- "ld1 { v20.4s }, [x25], #0x10\n"
- "tbz x15, #1, 113f\n"
- "mov x19, #0x18\n"
- "ldr d9, [x13], #0x8\n"
- "ldr d13, [x9], #0x8\n"
- "ldr d17, [x27], #0x8\n"
- "ldr d21, [x25], #0x8\n"
- "tbz x15, #0, 116f\n"
- "ld1 { v9.s }[2], [x13]\n"
- "ld1 { v13.s }[2], [x9]\n"
- "ld1 { v17.s }[2], [x27]\n"
- "ld1 { v21.s }[2], [x25]\n"
- "b 116f\n"
- "113:" // Height 4: Partial accumulate: partial_1_4
- "mov x19, #0x10\n"
- "tbz x15, #0, 116f\n"
- "ldr s9, [x13, #0x0]\n"
- "ldr s13, [x9, #0x0]\n"
- "ldr s17, [x27, #0x0]\n"
- "ldr s21, [x25, #0x0]\n"
- "b 116f\n"
- "114:" // Height 4: Partial accumulate: partial_2_0
- "tbz x15, #1, 115f\n"
- "ldr d8, [x13], #0x8\n"
- "ldr d12, [x9], #0x8\n"
- "ldr d16, [x27], #0x8\n"
- "ldr d20, [x25], #0x8\n"
- "mov x19, #0x8\n"
- "tbz x15, #0, 116f\n"
- "ld1 { v8.s }[2], [x13]\n"
- "ld1 { v12.s }[2], [x9]\n"
- "ld1 { v16.s }[2], [x27]\n"
- "ld1 { v20.s }[2], [x25]\n"
- "b 116f\n"
- "115:" // Height 4: Partial accumulate: partial_1_0
- "mov x19, #0x0\n"
- "ldr s8, [x13, #0x0]\n"
- "ldr s12, [x9, #0x0]\n"
- "ldr s16, [x27, #0x0]\n"
- "ldr s20, [x25, #0x0]\n"
- "116:" // Height 4: Partial accumulate: Done
- "sub x13, x13, x19\n"
- "sub x9, x9, x19\n"
- "sub x27, x27, x19\n"
- "sub x25, x25, x19\n"
- "b 119f\n"
- "117:" // Height 4: full accumulate
- "ldr q8, [x13, #0x0]\n"
- "ldr q9, [x13, #0x10]\n"
- "ldr q10, [x13, #0x20]\n"
- "ldr q11, [x13, #0x30]\n"
- "ldr q12, [x9, #0x0]\n"
- "ldr q13, [x9, #0x10]\n"
- "ldr q14, [x9, #0x20]\n"
- "ldr q15, [x9, #0x30]\n"
- "ldr q16, [x27, #0x0]\n"
- "ldr q17, [x27, #0x10]\n"
- "ldr q18, [x27, #0x20]\n"
- "ldr q19, [x27, #0x30]\n"
- "ldr q20, [x25, #0x0]\n"
- "ldr q21, [x25, #0x10]\n"
- "ldr q22, [x25, #0x20]\n"
- "ldr q23, [x25, #0x30]\n"
- "b 119f\n"
- "118:" // Height 4: no accumulate
+ "prfm pstl1keep, [x28, #0x0]\n"
+ "cmp x10, #0x10\n"
+ "add x23, x28, x19, LSL #2\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "add x22, x23, x19, LSL #2\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "bge 101f\n"
+ "tbz x10, #3, 96f\n"
+ "st1 { v8.4s }, [x28], #0x10\n"
+ "st1 { v9.4s }, [x28], #0x10\n"
+ "st1 { v12.4s }, [x23], #0x10\n"
+ "st1 { v13.4s }, [x23], #0x10\n"
+ "st1 { v16.4s }, [x22], #0x10\n"
+ "st1 { v17.4s }, [x22], #0x10\n"
+ "tbz x10, #2, 94f\n"
+ "st1 { v10.4s }, [x28], #0x10\n"
+ "st1 { v14.4s }, [x23], #0x10\n"
+ "st1 { v18.4s }, [x22], #0x10\n"
+ "tbz x10, #1, 93f\n"
+ "str d11, [x28], #0x8\n"
+ "str d15, [x23], #0x8\n"
+ "str d19, [x22], #0x8\n"
+ "tbz x10, #0, 100f\n"
+ "st1 { v11.s }[2], [x28]\n"
+ "st1 { v15.s }[2], [x23]\n"
+ "st1 { v19.s }[2], [x22]\n"
+ "b 100f\n"
+ "93:" // Height 3: Partial direct writeback: partial_1_12
+ "tbz x10, #0, 100f\n"
+ "str s11, [x28, #0x0]\n"
+ "str s15, [x23, #0x0]\n"
+ "str s19, [x22, #0x0]\n"
+ "b 100f\n"
+ "94:" // Height 3: Partial direct writeback: partial_2_8
+ "tbz x10, #1, 95f\n"
+ "str d10, [x28], #0x8\n"
+ "str d14, [x23], #0x8\n"
+ "str d18, [x22], #0x8\n"
+ "tbz x10, #0, 100f\n"
+ "st1 { v10.s }[2], [x28]\n"
+ "st1 { v14.s }[2], [x23]\n"
+ "st1 { v18.s }[2], [x22]\n"
+ "b 100f\n"
+ "95:" // Height 3: Partial direct writeback: partial_1_8
+ "tbz x10, #0, 100f\n"
+ "str s10, [x28, #0x0]\n"
+ "str s14, [x23, #0x0]\n"
+ "str s18, [x22, #0x0]\n"
+ "b 100f\n"
+ "96:" // Height 3: Partial direct writeback: partial_4_0
+ "tbz x10, #2, 98f\n"
+ "st1 { v8.4s }, [x28], #0x10\n"
+ "st1 { v12.4s }, [x23], #0x10\n"
+ "st1 { v16.4s }, [x22], #0x10\n"
+ "tbz x10, #1, 97f\n"
+ "str d9, [x28], #0x8\n"
+ "str d13, [x23], #0x8\n"
+ "str d17, [x22], #0x8\n"
+ "tbz x10, #0, 100f\n"
+ "st1 { v9.s }[2], [x28]\n"
+ "st1 { v13.s }[2], [x23]\n"
+ "st1 { v17.s }[2], [x22]\n"
+ "b 100f\n"
+ "97:" // Height 3: Partial direct writeback: partial_1_4
+ "tbz x10, #0, 100f\n"
+ "str s9, [x28, #0x0]\n"
+ "str s13, [x23, #0x0]\n"
+ "str s17, [x22, #0x0]\n"
+ "b 100f\n"
+ "98:" // Height 3: Partial direct writeback: partial_2_0
+ "tbz x10, #1, 99f\n"
+ "str d8, [x28], #0x8\n"
+ "str d12, [x23], #0x8\n"
+ "str d16, [x22], #0x8\n"
+ "tbz x10, #0, 100f\n"
+ "st1 { v8.s }[2], [x28]\n"
+ "st1 { v12.s }[2], [x23]\n"
+ "st1 { v16.s }[2], [x22]\n"
+ "b 100f\n"
+ "99:" // Height 3: Partial direct writeback: partial_1_0
+ "str s8, [x28, #0x0]\n"
+ "str s12, [x23, #0x0]\n"
+ "str s16, [x22, #0x0]\n"
+ "100:" // Height 3: Partial direct writeback: Done
+ "b 102f\n"
+ "101:" // Height 3: Full writeback
+ "str q8, [x28, #0x0]\n"
+ "str q9, [x28, #0x10]\n"
+ "str q10, [x28, #0x20]\n"
+ "str q11, [x28, #0x30]\n"
+ "add x28, x28, #0x40\n"
+ "str q12, [x23, #0x0]\n"
+ "str q13, [x23, #0x10]\n"
+ "str q14, [x23, #0x20]\n"
+ "str q15, [x23, #0x30]\n"
+ "str q16, [x22, #0x0]\n"
+ "str q17, [x22, #0x10]\n"
+ "str q18, [x22, #0x20]\n"
+ "str q19, [x22, #0x30]\n"
+ "102:" // Height 3: Writeback done
+ "subs x10, x10, #0x10\n"
+ "bgt 70b\n"
+ "b 206f\n"
+ "103:" // Height 4
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "104:" // Height 4: Column loop
+ "tbz %x[flags], #0, 114f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x10, #0x10\n"
+ "add x23, x28, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "add x21, x22, x19, LSL #2\n"
+ "bge 113f\n"
+ "tbz x10, #3, 108f\n"
+ "ld1 { v8.4s }, [x28], #0x10\n"
+ "ld1 { v12.4s }, [x23], #0x10\n"
+ "ld1 { v16.4s }, [x22], #0x10\n"
+ "ld1 { v20.4s }, [x21], #0x10\n"
+ "ld1 { v9.4s }, [x28], #0x10\n"
+ "ld1 { v13.4s }, [x23], #0x10\n"
+ "ld1 { v17.4s }, [x22], #0x10\n"
+ "ld1 { v21.4s }, [x21], #0x10\n"
+ "tbz x10, #2, 106f\n"
+ "ld1 { v10.4s }, [x28], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "ld1 { v18.4s }, [x22], #0x10\n"
+ "ld1 { v22.4s }, [x21], #0x10\n"
+ "tbz x10, #1, 105f\n"
+ "mov x24, #0x38\n"
+ "ldr d11, [x28], #0x8\n"
+ "ldr d15, [x23], #0x8\n"
+ "ldr d19, [x22], #0x8\n"
+ "ldr d23, [x21], #0x8\n"
+ "tbz x10, #0, 112f\n"
+ "ld1 { v11.s }[2], [x28]\n"
+ "ld1 { v15.s }[2], [x23]\n"
+ "ld1 { v19.s }[2], [x22]\n"
+ "ld1 { v23.s }[2], [x21]\n"
+ "b 112f\n"
+ "105:" // Height 4: Partial accumulate: partial_1_12
+ "mov x24, #0x30\n"
+ "tbz x10, #0, 112f\n"
+ "ldr s11, [x28, #0x0]\n"
+ "ldr s15, [x23, #0x0]\n"
+ "ldr s19, [x22, #0x0]\n"
+ "ldr s23, [x21, #0x0]\n"
+ "b 112f\n"
+ "106:" // Height 4: Partial accumulate: partial_2_8
+ "tbz x10, #1, 107f\n"
+ "ldr d10, [x28], #0x8\n"
+ "ldr d14, [x23], #0x8\n"
+ "mov x24, #0x28\n"
+ "ldr d18, [x22], #0x8\n"
+ "ldr d22, [x21], #0x8\n"
+ "tbz x10, #0, 112f\n"
+ "ld1 { v10.s }[2], [x28]\n"
+ "ld1 { v14.s }[2], [x23]\n"
+ "ld1 { v18.s }[2], [x22]\n"
+ "ld1 { v22.s }[2], [x21]\n"
+ "b 112f\n"
+ "107:" // Height 4: Partial accumulate: partial_1_8
+ "mov x24, #0x20\n"
+ "tbz x10, #0, 112f\n"
+ "ldr s10, [x28, #0x0]\n"
+ "ldr s14, [x23, #0x0]\n"
+ "ldr s18, [x22, #0x0]\n"
+ "ldr s22, [x21, #0x0]\n"
+ "b 112f\n"
+ "108:" // Height 4: Partial accumulate: partial_4_0
+ "tbz x10, #2, 110f\n"
+ "ld1 { v8.4s }, [x28], #0x10\n"
+ "ld1 { v12.4s }, [x23], #0x10\n"
+ "ld1 { v16.4s }, [x22], #0x10\n"
+ "ld1 { v20.4s }, [x21], #0x10\n"
+ "tbz x10, #1, 109f\n"
+ "mov x24, #0x18\n"
+ "ldr d9, [x28], #0x8\n"
+ "ldr d13, [x23], #0x8\n"
+ "ldr d17, [x22], #0x8\n"
+ "ldr d21, [x21], #0x8\n"
+ "tbz x10, #0, 112f\n"
+ "ld1 { v9.s }[2], [x28]\n"
+ "ld1 { v13.s }[2], [x23]\n"
+ "ld1 { v17.s }[2], [x22]\n"
+ "ld1 { v21.s }[2], [x21]\n"
+ "b 112f\n"
+ "109:" // Height 4: Partial accumulate: partial_1_4
+ "mov x24, #0x10\n"
+ "tbz x10, #0, 112f\n"
+ "ldr s9, [x28, #0x0]\n"
+ "ldr s13, [x23, #0x0]\n"
+ "ldr s17, [x22, #0x0]\n"
+ "ldr s21, [x21, #0x0]\n"
+ "b 112f\n"
+ "110:" // Height 4: Partial accumulate: partial_2_0
+ "tbz x10, #1, 111f\n"
+ "ldr d8, [x28], #0x8\n"
+ "ldr d12, [x23], #0x8\n"
+ "mov x24, #0x8\n"
+ "ldr d16, [x22], #0x8\n"
+ "ldr d20, [x21], #0x8\n"
+ "tbz x10, #0, 112f\n"
+ "ld1 { v8.s }[2], [x28]\n"
+ "ld1 { v12.s }[2], [x23]\n"
+ "ld1 { v16.s }[2], [x22]\n"
+ "ld1 { v20.s }[2], [x21]\n"
+ "b 112f\n"
+ "111:" // Height 4: Partial accumulate: partial_1_0
+ "ldr s8, [x28, #0x0]\n"
+ "mov x24, #0x0\n"
+ "ldr s12, [x23, #0x0]\n"
+ "ldr s16, [x22, #0x0]\n"
+ "ldr s20, [x21, #0x0]\n"
+ "112:" // Height 4: Partial accumulate: Done
+ "sub x28, x28, x24\n"
+ "b 115f\n"
+ "113:" // Height 4: full accumulate
+ "ldr q8, [x28, #0x0]\n"
+ "ldr q9, [x28, #0x10]\n"
+ "ldr q10, [x28, #0x20]\n"
+ "ldr q11, [x28, #0x30]\n"
+ "ldr q12, [x23, #0x0]\n"
+ "ldr q13, [x23, #0x10]\n"
+ "ldr q14, [x23, #0x20]\n"
+ "ldr q15, [x23, #0x30]\n"
+ "ldr q16, [x22, #0x0]\n"
+ "ldr q17, [x22, #0x10]\n"
+ "ldr q18, [x22, #0x20]\n"
+ "ldr q19, [x22, #0x30]\n"
+ "ldr q20, [x21, #0x0]\n"
+ "ldr q21, [x21, #0x10]\n"
+ "ldr q22, [x21, #0x20]\n"
+ "ldr q23, [x21, #0x30]\n"
+ "b 115f\n"
+ "114:" // Height 4: no accumulate
"movi v8.4s, #0x0\n"
"movi v9.4s, #0x0\n"
"movi v10.4s, #0x0\n"
@@ -1399,220 +1362,220 @@ void a64_hybrid_s8s32_dot_6x16 (
"movi v21.4s, #0x0\n"
"movi v22.4s, #0x0\n"
"movi v23.4s, #0x0\n"
- "119:" // Height 4: setup done
- "mov x12, #0x0\n"
- "120:" // Height 4: String loop
+ "115:" // Height 4: setup done
+ "mov x27, #0x0\n"
+ "116:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w11, [x20, x12, LSL #0x2]\n"
- "tbz %x[flags], #3, 121f\n"
- "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 117f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x10, [x20, #0x0]\n"
- "ldr x28, [x20, #0x8]\n"
- "ldr x26, [x20, #0x10]\n"
- "ldr x24, [x20, #0x18]\n"
- "cbnz x12, 122f\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "ldr x22, [x20, #0x18]\n"
+ "cbnz x27, 118f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x10, x10, x19\n"
- "add x28, x28, x19\n"
- "add x26, x26, x19\n"
+ "add x25, x25, x19\n"
"add x24, x24, x19\n"
- "b 122f\n"
- "121:" // Height 4: setup direct input
- "mov x10, %x[input_ptr]\n"
- "add x28, x10, x19\n"
- "add x26, x28, x19\n"
- "add x24, x26, x19\n"
- "122:" // Height 4: input setup done
- "cmp x11, #0x10\n"
- "blt 125f\n"
- "cmp x11, #0x20\n"
- "blt 124f\n"
- "123:" // Height 4: Multiply loop: Main loop head
- "ldr q0, [x10, #0x0]\n"
- "ldr q1, [x28, #0x0]\n"
- "ldr q2, [x26, #0x0]\n"
- "ldr q3, [x24, #0x0]\n"
- "ldr q6, [x14, #0x0]\n"
+ "add x23, x23, x19\n"
+ "add x22, x22, x19\n"
+ "b 118f\n"
+ "117:" // Height 4: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19\n"
+ "add x23, x24, x19\n"
+ "add x22, x23, x19\n"
+ "118:" // Height 4: input setup done
+ "cmp x26, #0x10\n"
+ "blt 121f\n"
+ "ldr q0, [x25, #0x0]\n"
+ "ldr q1, [x24, #0x0]\n"
+ "cmp x26, #0x20\n"
+ "ldr q2, [x23, #0x0]\n"
+ "ldr q3, [x22, #0x0]\n"
+ "ldr q6, [x9, #0x0]\n"
+ "blt 120f\n"
+ "119:" // Height 4: Multiply loop: Main loop head
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x14, #0x10]\n"
+ "ldr q7, [x9, #0x10]\n"
+ "add x25, x25, #0x10\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "add x10, x10, #0x10\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x24, x24, #0x10\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "add x28, x28, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "add x23, x23, #0x10\n"
".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "ldr q6, [x14, #0x20]\n"
+ "ldr q6, [x9, #0x20]\n"
+ "add x22, x22, #0x10\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "add x26, x26, #0x10\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "sub x26, x26, #0x10\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "add x24, x24, #0x10\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "cmp x26, #0x20\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "sub x11, x11, #0x10\n"
".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- "ldr q7, [x14, #0x30]\n"
+ "ldr q7, [x9, #0x30]\n"
".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- "cmp x11, #0x20\n"
".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
- "ldr q6, [x14, #0x40]\n"
+ "ldr q6, [x9, #0x40]\n"
".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
- "ldr q7, [x14, #0x50]\n"
+ "ldr q7, [x9, #0x50]\n"
".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
- "ldr q6, [x14, #0x60]\n"
+ "ldr q6, [x9, #0x60]\n"
".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
- "ldr q7, [x14, #0x70]\n"
+ "ldr q7, [x9, #0x70]\n"
".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
- "ldr q6, [x14, #0x80]\n"
+ "ldr q6, [x9, #0x80]\n"
".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
- "ldr q7, [x14, #0x90]\n"
+ "ldr q7, [x9, #0x90]\n"
".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
- "ldr q6, [x14, #0xa0]\n"
+ "ldr q6, [x9, #0xa0]\n"
".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
- "ldr q7, [x14, #0xb0]\n"
+ "ldr q7, [x9, #0xb0]\n"
".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
- "ldr q6, [x14, #0xc0]\n"
+ "ldr q6, [x9, #0xc0]\n"
".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
- "ldr q7, [x14, #0xd0]\n"
+ "ldr q7, [x9, #0xd0]\n"
".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
- "ldr q6, [x14, #0xe0]\n"
+ "ldr q6, [x9, #0xe0]\n"
".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
- "ldr q7, [x14, #0xf0]\n"
- "add x14, x14, #0x100\n"
+ "ldr q7, [x9, #0xf0]\n"
+ "add x9, x9, #0x100\n"
".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n"
+ "ldr q6, [x9, #0x0]\n"
".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ "ldr q0, [x25, #0x0]\n"
".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
+ "ldr q1, [x24, #0x0]\n"
".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
+ "ldr q2, [x23, #0x0]\n"
".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n"
- "bge 123b\n"
- "124:" // Height 4: Multiply loop: Single iteration only
- "sub x11, x11, #0x10\n"
- "ldr q0, [x10, #0x0]\n"
- "ldr q1, [x28, #0x0]\n"
- "ldr q2, [x26, #0x0]\n"
- "ldr q3, [x24, #0x0]\n"
- "ldr q6, [x14, #0x0]\n"
+ "ldr q3, [x22, #0x0]\n"
+ "bge 119b\n"
+ "120:" // Height 4: Multiply loop: Single iteration only
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x14, #0x10]\n"
+ "ldr q7, [x9, #0x10]\n"
+ "sub x26, x26, #0x10\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "add x10, x10, #0x10\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "add x25, x25, #0x10\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "add x28, x28, #0x10\n"
+ "add x24, x24, #0x10\n"
".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "ldr q6, [x14, #0x20]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "add x23, x23, #0x10\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "add x26, x26, #0x10\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "ldr q6, [x9, #0x20]\n"
+ "add x22, x22, #0x10\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "add x24, x24, #0x10\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- "ldr q7, [x14, #0x30]\n"
+ "ldr q7, [x9, #0x30]\n"
".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
- "ldr q6, [x14, #0x40]\n"
+ "ldr q6, [x9, #0x40]\n"
".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
- "ldr q7, [x14, #0x50]\n"
+ "ldr q7, [x9, #0x50]\n"
".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
- "ldr q6, [x14, #0x60]\n"
+ "ldr q6, [x9, #0x60]\n"
".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
- "ldr q7, [x14, #0x70]\n"
+ "ldr q7, [x9, #0x70]\n"
".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
- "ldr q6, [x14, #0x80]\n"
+ "ldr q6, [x9, #0x80]\n"
".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
- "ldr q7, [x14, #0x90]\n"
+ "ldr q7, [x9, #0x90]\n"
".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
- "ldr q6, [x14, #0xa0]\n"
+ "ldr q6, [x9, #0xa0]\n"
".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
- "ldr q7, [x14, #0xb0]\n"
+ "ldr q7, [x9, #0xb0]\n"
".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
- "ldr q6, [x14, #0xc0]\n"
+ "ldr q6, [x9, #0xc0]\n"
".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
- "ldr q7, [x14, #0xd0]\n"
+ "ldr q7, [x9, #0xd0]\n"
".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
- "ldr q6, [x14, #0xe0]\n"
+ "ldr q6, [x9, #0xe0]\n"
".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
- "ldr q7, [x14, #0xf0]\n"
- "add x14, x14, #0x100\n"
+ "ldr q7, [x9, #0xf0]\n"
+ "add x9, x9, #0x100\n"
".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
@@ -1621,31 +1584,31 @@ void a64_hybrid_s8s32_dot_6x16 (
".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n"
- "125:" // Height 4: Multiply loop: Main loop skip
- "cbz x11, 130f\n"
- "cmp x11, #0x4\n"
- "blt 127f\n"
- "126:" // Height 4: Multiply loop: Odd block loop
- "ldr s0, [x10], #0x4\n"
- "ldr s1, [x28], #0x4\n"
- "ldr s2, [x26], #0x4\n"
- "ldr s3, [x24], #0x4\n"
- "ldr q6, [x14, #0x0]\n"
+ "121:" // Height 4: Multiply loop: Main loop skip
+ "cbz x26, 126f\n"
+ "cmp x26, #0x4\n"
+ "blt 123f\n"
+ "122:" // Height 4: Multiply loop: Odd block loop
+ "ldr s0, [x25], #0x4\n"
+ "sub x26, x26, #0x4\n"
+ "ldr s1, [x24], #0x4\n"
+ "cmp x26, #0x4\n"
+ "ldr s2, [x23], #0x4\n"
+ "ldr s3, [x22], #0x4\n"
+ "ldr q6, [x9, #0x0]\n"
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x14, #0x10]\n"
+ "ldr q7, [x9, #0x10]\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "sub x11, x11, #0x4\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "cmp x11, #0x4\n"
".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- "ldr q6, [x14, #0x20]\n"
+ "ldr q6, [x9, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- "ldr q7, [x14, #0x30]\n"
+ "ldr q7, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- "add x14, x14, #0x40\n"
".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
@@ -1653,40 +1616,40 @@ void a64_hybrid_s8s32_dot_6x16 (
".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
- "bge 126b\n"
- "cbz x11, 130f\n"
- "127:" // Height 4: Multiply loop: Skip odd blocks
- "tbz x11, #1, 128f\n"
- "ldr h0, [x10], #0x2\n"
- "ldr h1, [x28], #0x2\n"
- "ldr h2, [x26], #0x2\n"
- "ldr h3, [x24], #0x2\n"
- "tbz x11, #0, 129f\n"
- "ld1 { v0.b }[2], [x10]\n"
- "ld1 { v1.b }[2], [x28]\n"
- "ld1 { v2.b }[2], [x26]\n"
- "ld1 { v3.b }[2], [x24]\n"
- "b 129f\n"
- "128:" // Height 4: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x10, #0x0]\n"
- "ldr b1, [x28, #0x0]\n"
- "ldr b2, [x26, #0x0]\n"
- "ldr b3, [x24, #0x0]\n"
- "129:" // Height 4: Multiply loop: Ragged operand read: Done
- "ldr q6, [x14, #0x0]\n"
+ "bge 122b\n"
+ "cbz x26, 126f\n"
+ "123:" // Height 4: Multiply loop: Skip odd blocks
+ "tbz x26, #1, 124f\n"
+ "ldr h0, [x25], #0x2\n"
+ "ldr h1, [x24], #0x2\n"
+ "ldr h2, [x23], #0x2\n"
+ "ldr h3, [x22], #0x2\n"
+ "tbz x26, #0, 125f\n"
+ "ld1 { v0.b }[2], [x25]\n"
+ "ld1 { v1.b }[2], [x24]\n"
+ "ld1 { v2.b }[2], [x23]\n"
+ "ld1 { v3.b }[2], [x22]\n"
+ "b 125f\n"
+ "124:" // Height 4: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x25, #0x0]\n"
+ "ldr b1, [x24, #0x0]\n"
+ "ldr b2, [x23, #0x0]\n"
+ "ldr b3, [x22, #0x0]\n"
+ "125:" // Height 4: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x9, #0x0]\n"
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x14, #0x10]\n"
+ "ldr q7, [x9, #0x10]\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- "ldr q6, [x14, #0x20]\n"
+ "ldr q6, [x9, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- "ldr q7, [x14, #0x30]\n"
+ "ldr q7, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- "add x14, x14, #0x40\n"
".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
@@ -1694,308 +1657,292 @@ void a64_hybrid_s8s32_dot_6x16 (
".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
- "130:" // Height 4: Multiply loop: No odd multiplies
+ "126:" // Height 4: Multiply loop: No odd multiplies
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x12, x12, #0x1\n"
- "cmp x12, x19\n"
- "bne 120b\n"
- "prfm pstl1keep, [x13, #0x0]\n"
- "prfm pstl1keep, [x9, #0x0]\n"
- "cmp x15, #0x10\n"
- "prfm pstl1keep, [x27, #0x0]\n"
- "prfm pstl1keep, [x25, #0x0]\n"
- "bge 139f\n"
- "tbz x15, #3, 134f\n"
- "st1 { v8.4s }, [x13], #0x10\n"
- "st1 { v9.4s }, [x13], #0x10\n"
- "st1 { v12.4s }, [x9], #0x10\n"
- "st1 { v13.4s }, [x9], #0x10\n"
- "st1 { v16.4s }, [x27], #0x10\n"
- "st1 { v17.4s }, [x27], #0x10\n"
- "st1 { v20.4s }, [x25], #0x10\n"
- "st1 { v21.4s }, [x25], #0x10\n"
- "tbz x15, #2, 132f\n"
- "st1 { v10.4s }, [x13], #0x10\n"
- "st1 { v14.4s }, [x9], #0x10\n"
- "st1 { v18.4s }, [x27], #0x10\n"
- "st1 { v22.4s }, [x25], #0x10\n"
- "tbz x15, #1, 131f\n"
- "str d11, [x13], #0x8\n"
- "str d15, [x9], #0x8\n"
- "str d19, [x27], #0x8\n"
- "str d23, [x25], #0x8\n"
- "tbz x15, #0, 138f\n"
- "st1 { v11.s }[2], [x13]\n"
- "st1 { v15.s }[2], [x9]\n"
- "st1 { v19.s }[2], [x27]\n"
- "st1 { v23.s }[2], [x25]\n"
- "b 138f\n"
- "131:" // Height 4: Partial direct writeback: partial_1_12
- "tbz x15, #0, 138f\n"
- "str s11, [x13, #0x0]\n"
- "str s15, [x9, #0x0]\n"
- "str s19, [x27, #0x0]\n"
- "str s23, [x25, #0x0]\n"
- "b 138f\n"
- "132:" // Height 4: Partial direct writeback: partial_2_8
- "tbz x15, #1, 133f\n"
- "str d10, [x13], #0x8\n"
- "str d14, [x9], #0x8\n"
- "str d18, [x27], #0x8\n"
- "str d22, [x25], #0x8\n"
- "tbz x15, #0, 138f\n"
- "st1 { v10.s }[2], [x13]\n"
- "st1 { v14.s }[2], [x9]\n"
- "st1 { v18.s }[2], [x27]\n"
- "st1 { v22.s }[2], [x25]\n"
- "b 138f\n"
- "133:" // Height 4: Partial direct writeback: partial_1_8
- "tbz x15, #0, 138f\n"
- "str s10, [x13, #0x0]\n"
- "str s14, [x9, #0x0]\n"
- "str s18, [x27, #0x0]\n"
- "str s22, [x25, #0x0]\n"
- "b 138f\n"
- "134:" // Height 4: Partial direct writeback: partial_4_0
- "tbz x15, #2, 136f\n"
- "st1 { v8.4s }, [x13], #0x10\n"
- "st1 { v12.4s }, [x9], #0x10\n"
- "st1 { v16.4s }, [x27], #0x10\n"
- "st1 { v20.4s }, [x25], #0x10\n"
- "tbz x15, #1, 135f\n"
- "str d9, [x13], #0x8\n"
- "str d13, [x9], #0x8\n"
- "str d17, [x27], #0x8\n"
- "str d21, [x25], #0x8\n"
- "tbz x15, #0, 138f\n"
- "st1 { v9.s }[2], [x13]\n"
- "st1 { v13.s }[2], [x9]\n"
- "st1 { v17.s }[2], [x27]\n"
- "st1 { v21.s }[2], [x25]\n"
- "b 138f\n"
- "135:" // Height 4: Partial direct writeback: partial_1_4
- "tbz x15, #0, 138f\n"
- "str s9, [x13, #0x0]\n"
- "str s13, [x9, #0x0]\n"
- "str s17, [x27, #0x0]\n"
- "str s21, [x25, #0x0]\n"
- "b 138f\n"
- "136:" // Height 4: Partial direct writeback: partial_2_0
- "tbz x15, #1, 137f\n"
- "str d8, [x13], #0x8\n"
- "str d12, [x9], #0x8\n"
- "str d16, [x27], #0x8\n"
- "str d20, [x25], #0x8\n"
- "tbz x15, #0, 138f\n"
- "st1 { v8.s }[2], [x13]\n"
- "st1 { v12.s }[2], [x9]\n"
- "st1 { v16.s }[2], [x27]\n"
- "st1 { v20.s }[2], [x25]\n"
- "b 138f\n"
- "137:" // Height 4: Partial direct writeback: partial_1_0
- "str s8, [x13, #0x0]\n"
- "str s12, [x9, #0x0]\n"
- "str s16, [x27, #0x0]\n"
- "str s20, [x25, #0x0]\n"
- "138:" // Height 4: Partial direct writeback: Done
- "b 140f\n"
- "139:" // Height 4: Full writeback
- "str q8, [x13, #0x0]\n"
- "str q9, [x13, #0x10]\n"
- "str q10, [x13, #0x20]\n"
- "str q11, [x13, #0x30]\n"
- "str q12, [x9, #0x0]\n"
- "str q13, [x9, #0x10]\n"
- "str q14, [x9, #0x20]\n"
- "str q15, [x9, #0x30]\n"
- "str q16, [x27, #0x0]\n"
- "str q17, [x27, #0x10]\n"
- "str q18, [x27, #0x20]\n"
- "str q19, [x27, #0x30]\n"
- "str q20, [x25, #0x0]\n"
- "str q21, [x25, #0x10]\n"
- "str q22, [x25, #0x20]\n"
- "str q23, [x25, #0x30]\n"
- "add x13, x13, #0x40\n"
- "add x9, x9, #0x40\n"
- "add x27, x27, #0x40\n"
- "add x25, x25, #0x40\n"
- "140:" // Height 4: Writeback done
- "subs x15, x15, #0x10\n"
- "bgt 108b\n"
- "b 212f\n"
- "141:" // Height 5
- "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 116b\n"
"ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "tbz %x[flags], #2, 142f\n"
- "ldr x13, [%x[output_ptr], #0x0]\n"
- "add x13, x13, x19, LSL #2\n"
- "ldr x9, [%x[output_ptr], #0x8]\n"
- "ldr x27, [%x[output_ptr], #0x10]\n"
- "add x9, x9, x19, LSL #2\n"
- "ldr x25, [%x[output_ptr], #0x18]\n"
- "ldr x23, [%x[output_ptr], #0x20]\n"
- "add x27, x27, x19, LSL #2\n"
- "add x25, x25, x19, LSL #2\n"
- "add x23, x23, x19, LSL #2\n"
- "b 143f\n"
- "142:" // Height 5: setup direct output
- "mov x13, %x[output_ptr]\n"
- "add x9, x13, x19, LSL #2\n"
- "add x27, x9, x19, LSL #2\n"
- "add x25, x27, x19, LSL #2\n"
- "add x23, x25, x19, LSL #2\n"
- "143:" // Height 5: Column loop
- "tbz %x[flags], #0, 153f\n"
- "cmp x15, #0x10\n"
- "bge 152f\n"
- "tbz x15, #3, 147f\n"
- "ld1 { v8.4s }, [x13], #0x10\n"
- "ld1 { v12.4s }, [x9], #0x10\n"
- "ld1 { v16.4s }, [x27], #0x10\n"
- "ld1 { v20.4s }, [x25], #0x10\n"
- "ld1 { v24.4s }, [x23], #0x10\n"
- "ld1 { v9.4s }, [x13], #0x10\n"
- "ld1 { v13.4s }, [x9], #0x10\n"
- "ld1 { v17.4s }, [x27], #0x10\n"
- "ld1 { v21.4s }, [x25], #0x10\n"
- "ld1 { v25.4s }, [x23], #0x10\n"
- "tbz x15, #2, 145f\n"
- "ld1 { v10.4s }, [x13], #0x10\n"
- "ld1 { v14.4s }, [x9], #0x10\n"
- "ld1 { v18.4s }, [x27], #0x10\n"
- "ld1 { v22.4s }, [x25], #0x10\n"
- "ld1 { v26.4s }, [x23], #0x10\n"
- "tbz x15, #1, 144f\n"
- "mov x19, #0x38\n"
- "ldr d11, [x13], #0x8\n"
- "ldr d15, [x9], #0x8\n"
- "ldr d19, [x27], #0x8\n"
- "ldr d23, [x25], #0x8\n"
- "ldr d27, [x23], #0x8\n"
- "tbz x15, #0, 151f\n"
- "ld1 { v11.s }[2], [x13]\n"
- "ld1 { v15.s }[2], [x9]\n"
- "ld1 { v19.s }[2], [x27]\n"
- "ld1 { v23.s }[2], [x25]\n"
- "ld1 { v27.s }[2], [x23]\n"
- "b 151f\n"
- "144:" // Height 5: Partial accumulate: partial_1_12
- "mov x19, #0x30\n"
- "tbz x15, #0, 151f\n"
- "ldr s11, [x13, #0x0]\n"
- "ldr s15, [x9, #0x0]\n"
- "ldr s19, [x27, #0x0]\n"
- "ldr s23, [x25, #0x0]\n"
- "ldr s27, [x23, #0x0]\n"
- "b 151f\n"
- "145:" // Height 5: Partial accumulate: partial_2_8
- "tbz x15, #1, 146f\n"
- "ldr d10, [x13], #0x8\n"
- "ldr d14, [x9], #0x8\n"
- "ldr d18, [x27], #0x8\n"
- "ldr d22, [x25], #0x8\n"
- "ldr d26, [x23], #0x8\n"
- "mov x19, #0x28\n"
- "tbz x15, #0, 151f\n"
- "ld1 { v10.s }[2], [x13]\n"
- "ld1 { v14.s }[2], [x9]\n"
- "ld1 { v18.s }[2], [x27]\n"
- "ld1 { v22.s }[2], [x25]\n"
- "ld1 { v26.s }[2], [x23]\n"
- "b 151f\n"
- "146:" // Height 5: Partial accumulate: partial_1_8
- "mov x19, #0x20\n"
- "tbz x15, #0, 151f\n"
- "ldr s10, [x13, #0x0]\n"
- "ldr s14, [x9, #0x0]\n"
- "ldr s18, [x27, #0x0]\n"
- "ldr s22, [x25, #0x0]\n"
- "ldr s26, [x23, #0x0]\n"
- "b 151f\n"
- "147:" // Height 5: Partial accumulate: partial_4_0
- "tbz x15, #2, 149f\n"
- "ld1 { v8.4s }, [x13], #0x10\n"
- "ld1 { v12.4s }, [x9], #0x10\n"
- "ld1 { v16.4s }, [x27], #0x10\n"
- "ld1 { v20.4s }, [x25], #0x10\n"
- "ld1 { v24.4s }, [x23], #0x10\n"
- "tbz x15, #1, 148f\n"
- "mov x19, #0x18\n"
- "ldr d9, [x13], #0x8\n"
- "ldr d13, [x9], #0x8\n"
- "ldr d17, [x27], #0x8\n"
- "ldr d21, [x25], #0x8\n"
- "ldr d25, [x23], #0x8\n"
- "tbz x15, #0, 151f\n"
- "ld1 { v9.s }[2], [x13]\n"
- "ld1 { v13.s }[2], [x9]\n"
- "ld1 { v17.s }[2], [x27]\n"
- "ld1 { v21.s }[2], [x25]\n"
- "ld1 { v25.s }[2], [x23]\n"
- "b 151f\n"
- "148:" // Height 5: Partial accumulate: partial_1_4
- "mov x19, #0x10\n"
- "tbz x15, #0, 151f\n"
- "ldr s9, [x13, #0x0]\n"
- "ldr s13, [x9, #0x0]\n"
- "ldr s17, [x27, #0x0]\n"
- "ldr s21, [x25, #0x0]\n"
- "ldr s25, [x23, #0x0]\n"
- "b 151f\n"
- "149:" // Height 5: Partial accumulate: partial_2_0
- "tbz x15, #1, 150f\n"
- "ldr d8, [x13], #0x8\n"
- "ldr d12, [x9], #0x8\n"
- "ldr d16, [x27], #0x8\n"
- "ldr d20, [x25], #0x8\n"
- "ldr d24, [x23], #0x8\n"
- "mov x19, #0x8\n"
- "tbz x15, #0, 151f\n"
- "ld1 { v8.s }[2], [x13]\n"
- "ld1 { v12.s }[2], [x9]\n"
- "ld1 { v16.s }[2], [x27]\n"
- "ld1 { v20.s }[2], [x25]\n"
- "ld1 { v24.s }[2], [x23]\n"
- "b 151f\n"
- "150:" // Height 5: Partial accumulate: partial_1_0
- "mov x19, #0x0\n"
- "ldr s8, [x13, #0x0]\n"
- "ldr s12, [x9, #0x0]\n"
- "ldr s16, [x27, #0x0]\n"
- "ldr s20, [x25, #0x0]\n"
- "ldr s24, [x23, #0x0]\n"
- "151:" // Height 5: Partial accumulate: Done
- "sub x13, x13, x19\n"
- "sub x9, x9, x19\n"
- "sub x27, x27, x19\n"
- "sub x25, x25, x19\n"
- "sub x23, x23, x19\n"
- "b 154f\n"
- "152:" // Height 5: full accumulate
- "ldr q8, [x13, #0x0]\n"
- "ldr q9, [x13, #0x10]\n"
- "ldr q10, [x13, #0x20]\n"
- "ldr q11, [x13, #0x30]\n"
- "ldr q12, [x9, #0x0]\n"
- "ldr q13, [x9, #0x10]\n"
- "ldr q14, [x9, #0x20]\n"
- "ldr q15, [x9, #0x30]\n"
- "ldr q16, [x27, #0x0]\n"
- "ldr q17, [x27, #0x10]\n"
- "ldr q18, [x27, #0x20]\n"
- "ldr q19, [x27, #0x30]\n"
- "ldr q20, [x25, #0x0]\n"
- "ldr q21, [x25, #0x10]\n"
- "ldr q22, [x25, #0x20]\n"
- "ldr q23, [x25, #0x30]\n"
- "ldr q24, [x23, #0x0]\n"
- "ldr q25, [x23, #0x10]\n"
- "ldr q26, [x23, #0x20]\n"
- "ldr q27, [x23, #0x30]\n"
- "b 154f\n"
- "153:" // Height 5: no accumulate
+ "prfm pstl1keep, [x28, #0x0]\n"
+ "cmp x10, #0x10\n"
+ "add x23, x28, x19, LSL #2\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "add x22, x23, x19, LSL #2\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "add x21, x22, x19, LSL #2\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "bge 135f\n"
+ "tbz x10, #3, 130f\n"
+ "st1 { v8.4s }, [x28], #0x10\n"
+ "st1 { v9.4s }, [x28], #0x10\n"
+ "st1 { v12.4s }, [x23], #0x10\n"
+ "st1 { v13.4s }, [x23], #0x10\n"
+ "st1 { v16.4s }, [x22], #0x10\n"
+ "st1 { v17.4s }, [x22], #0x10\n"
+ "st1 { v20.4s }, [x21], #0x10\n"
+ "st1 { v21.4s }, [x21], #0x10\n"
+ "tbz x10, #2, 128f\n"
+ "st1 { v10.4s }, [x28], #0x10\n"
+ "st1 { v14.4s }, [x23], #0x10\n"
+ "st1 { v18.4s }, [x22], #0x10\n"
+ "st1 { v22.4s }, [x21], #0x10\n"
+ "tbz x10, #1, 127f\n"
+ "str d11, [x28], #0x8\n"
+ "str d15, [x23], #0x8\n"
+ "str d19, [x22], #0x8\n"
+ "str d23, [x21], #0x8\n"
+ "tbz x10, #0, 134f\n"
+ "st1 { v11.s }[2], [x28]\n"
+ "st1 { v15.s }[2], [x23]\n"
+ "st1 { v19.s }[2], [x22]\n"
+ "st1 { v23.s }[2], [x21]\n"
+ "b 134f\n"
+ "127:" // Height 4: Partial direct writeback: partial_1_12
+ "tbz x10, #0, 134f\n"
+ "str s11, [x28, #0x0]\n"
+ "str s15, [x23, #0x0]\n"
+ "str s19, [x22, #0x0]\n"
+ "str s23, [x21, #0x0]\n"
+ "b 134f\n"
+ "128:" // Height 4: Partial direct writeback: partial_2_8
+ "tbz x10, #1, 129f\n"
+ "str d10, [x28], #0x8\n"
+ "str d14, [x23], #0x8\n"
+ "str d18, [x22], #0x8\n"
+ "str d22, [x21], #0x8\n"
+ "tbz x10, #0, 134f\n"
+ "st1 { v10.s }[2], [x28]\n"
+ "st1 { v14.s }[2], [x23]\n"
+ "st1 { v18.s }[2], [x22]\n"
+ "st1 { v22.s }[2], [x21]\n"
+ "b 134f\n"
+ "129:" // Height 4: Partial direct writeback: partial_1_8
+ "tbz x10, #0, 134f\n"
+ "str s10, [x28, #0x0]\n"
+ "str s14, [x23, #0x0]\n"
+ "str s18, [x22, #0x0]\n"
+ "str s22, [x21, #0x0]\n"
+ "b 134f\n"
+ "130:" // Height 4: Partial direct writeback: partial_4_0
+ "tbz x10, #2, 132f\n"
+ "st1 { v8.4s }, [x28], #0x10\n"
+ "st1 { v12.4s }, [x23], #0x10\n"
+ "st1 { v16.4s }, [x22], #0x10\n"
+ "st1 { v20.4s }, [x21], #0x10\n"
+ "tbz x10, #1, 131f\n"
+ "str d9, [x28], #0x8\n"
+ "str d13, [x23], #0x8\n"
+ "str d17, [x22], #0x8\n"
+ "str d21, [x21], #0x8\n"
+ "tbz x10, #0, 134f\n"
+ "st1 { v9.s }[2], [x28]\n"
+ "st1 { v13.s }[2], [x23]\n"
+ "st1 { v17.s }[2], [x22]\n"
+ "st1 { v21.s }[2], [x21]\n"
+ "b 134f\n"
+ "131:" // Height 4: Partial direct writeback: partial_1_4
+ "tbz x10, #0, 134f\n"
+ "str s9, [x28, #0x0]\n"
+ "str s13, [x23, #0x0]\n"
+ "str s17, [x22, #0x0]\n"
+ "str s21, [x21, #0x0]\n"
+ "b 134f\n"
+ "132:" // Height 4: Partial direct writeback: partial_2_0
+ "tbz x10, #1, 133f\n"
+ "str d8, [x28], #0x8\n"
+ "str d12, [x23], #0x8\n"
+ "str d16, [x22], #0x8\n"
+ "str d20, [x21], #0x8\n"
+ "tbz x10, #0, 134f\n"
+ "st1 { v8.s }[2], [x28]\n"
+ "st1 { v12.s }[2], [x23]\n"
+ "st1 { v16.s }[2], [x22]\n"
+ "st1 { v20.s }[2], [x21]\n"
+ "b 134f\n"
+ "133:" // Height 4: Partial direct writeback: partial_1_0
+ "str s8, [x28, #0x0]\n"
+ "str s12, [x23, #0x0]\n"
+ "str s16, [x22, #0x0]\n"
+ "str s20, [x21, #0x0]\n"
+ "134:" // Height 4: Partial direct writeback: Done
+ "b 136f\n"
+ "135:" // Height 4: Full writeback
+ "str q8, [x28, #0x0]\n"
+ "str q9, [x28, #0x10]\n"
+ "str q10, [x28, #0x20]\n"
+ "str q11, [x28, #0x30]\n"
+ "add x28, x28, #0x40\n"
+ "str q12, [x23, #0x0]\n"
+ "str q13, [x23, #0x10]\n"
+ "str q14, [x23, #0x20]\n"
+ "str q15, [x23, #0x30]\n"
+ "str q16, [x22, #0x0]\n"
+ "str q17, [x22, #0x10]\n"
+ "str q18, [x22, #0x20]\n"
+ "str q19, [x22, #0x30]\n"
+ "str q20, [x21, #0x0]\n"
+ "str q21, [x21, #0x10]\n"
+ "str q22, [x21, #0x20]\n"
+ "str q23, [x21, #0x30]\n"
+ "136:" // Height 4: Writeback done
+ "subs x10, x10, #0x10\n"
+ "bgt 104b\n"
+ "b 206f\n"
+ "137:" // Height 5
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "138:" // Height 5: Column loop
+ "tbz %x[flags], #0, 148f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x10, #0x10\n"
+ "add x23, x28, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "add x21, x22, x19, LSL #2\n"
+ "add x20, x21, x19, LSL #2\n"
+ "bge 147f\n"
+ "tbz x10, #3, 142f\n"
+ "ld1 { v8.4s }, [x28], #0x10\n"
+ "ld1 { v12.4s }, [x23], #0x10\n"
+ "ld1 { v16.4s }, [x22], #0x10\n"
+ "ld1 { v20.4s }, [x21], #0x10\n"
+ "ld1 { v24.4s }, [x20], #0x10\n"
+ "ld1 { v9.4s }, [x28], #0x10\n"
+ "ld1 { v13.4s }, [x23], #0x10\n"
+ "ld1 { v17.4s }, [x22], #0x10\n"
+ "ld1 { v21.4s }, [x21], #0x10\n"
+ "ld1 { v25.4s }, [x20], #0x10\n"
+ "tbz x10, #2, 140f\n"
+ "ld1 { v10.4s }, [x28], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "ld1 { v18.4s }, [x22], #0x10\n"
+ "ld1 { v22.4s }, [x21], #0x10\n"
+ "ld1 { v26.4s }, [x20], #0x10\n"
+ "tbz x10, #1, 139f\n"
+ "mov x24, #0x38\n"
+ "ldr d11, [x28], #0x8\n"
+ "ldr d15, [x23], #0x8\n"
+ "ldr d19, [x22], #0x8\n"
+ "ldr d23, [x21], #0x8\n"
+ "ldr d27, [x20], #0x8\n"
+ "tbz x10, #0, 146f\n"
+ "ld1 { v11.s }[2], [x28]\n"
+ "ld1 { v15.s }[2], [x23]\n"
+ "ld1 { v19.s }[2], [x22]\n"
+ "ld1 { v23.s }[2], [x21]\n"
+ "ld1 { v27.s }[2], [x20]\n"
+ "b 146f\n"
+ "139:" // Height 5: Partial accumulate: partial_1_12
+ "mov x24, #0x30\n"
+ "tbz x10, #0, 146f\n"
+ "ldr s11, [x28, #0x0]\n"
+ "ldr s15, [x23, #0x0]\n"
+ "ldr s19, [x22, #0x0]\n"
+ "ldr s23, [x21, #0x0]\n"
+ "ldr s27, [x20, #0x0]\n"
+ "b 146f\n"
+ "140:" // Height 5: Partial accumulate: partial_2_8
+ "tbz x10, #1, 141f\n"
+ "ldr d10, [x28], #0x8\n"
+ "ldr d14, [x23], #0x8\n"
+ "mov x24, #0x28\n"
+ "ldr d18, [x22], #0x8\n"
+ "ldr d22, [x21], #0x8\n"
+ "ldr d26, [x20], #0x8\n"
+ "tbz x10, #0, 146f\n"
+ "ld1 { v10.s }[2], [x28]\n"
+ "ld1 { v14.s }[2], [x23]\n"
+ "ld1 { v18.s }[2], [x22]\n"
+ "ld1 { v22.s }[2], [x21]\n"
+ "ld1 { v26.s }[2], [x20]\n"
+ "b 146f\n"
+ "141:" // Height 5: Partial accumulate: partial_1_8
+ "mov x24, #0x20\n"
+ "tbz x10, #0, 146f\n"
+ "ldr s10, [x28, #0x0]\n"
+ "ldr s14, [x23, #0x0]\n"
+ "ldr s18, [x22, #0x0]\n"
+ "ldr s22, [x21, #0x0]\n"
+ "ldr s26, [x20, #0x0]\n"
+ "b 146f\n"
+ "142:" // Height 5: Partial accumulate: partial_4_0
+ "tbz x10, #2, 144f\n"
+ "ld1 { v8.4s }, [x28], #0x10\n"
+ "ld1 { v12.4s }, [x23], #0x10\n"
+ "ld1 { v16.4s }, [x22], #0x10\n"
+ "ld1 { v20.4s }, [x21], #0x10\n"
+ "ld1 { v24.4s }, [x20], #0x10\n"
+ "tbz x10, #1, 143f\n"
+ "mov x24, #0x18\n"
+ "ldr d9, [x28], #0x8\n"
+ "ldr d13, [x23], #0x8\n"
+ "ldr d17, [x22], #0x8\n"
+ "ldr d21, [x21], #0x8\n"
+ "ldr d25, [x20], #0x8\n"
+ "tbz x10, #0, 146f\n"
+ "ld1 { v9.s }[2], [x28]\n"
+ "ld1 { v13.s }[2], [x23]\n"
+ "ld1 { v17.s }[2], [x22]\n"
+ "ld1 { v21.s }[2], [x21]\n"
+ "ld1 { v25.s }[2], [x20]\n"
+ "b 146f\n"
+ "143:" // Height 5: Partial accumulate: partial_1_4
+ "mov x24, #0x10\n"
+ "tbz x10, #0, 146f\n"
+ "ldr s9, [x28, #0x0]\n"
+ "ldr s13, [x23, #0x0]\n"
+ "ldr s17, [x22, #0x0]\n"
+ "ldr s21, [x21, #0x0]\n"
+ "ldr s25, [x20, #0x0]\n"
+ "b 146f\n"
+ "144:" // Height 5: Partial accumulate: partial_2_0
+ "tbz x10, #1, 145f\n"
+ "ldr d8, [x28], #0x8\n"
+ "ldr d12, [x23], #0x8\n"
+ "mov x24, #0x8\n"
+ "ldr d16, [x22], #0x8\n"
+ "ldr d20, [x21], #0x8\n"
+ "ldr d24, [x20], #0x8\n"
+ "tbz x10, #0, 146f\n"
+ "ld1 { v8.s }[2], [x28]\n"
+ "ld1 { v12.s }[2], [x23]\n"
+ "ld1 { v16.s }[2], [x22]\n"
+ "ld1 { v20.s }[2], [x21]\n"
+ "ld1 { v24.s }[2], [x20]\n"
+ "b 146f\n"
+ "145:" // Height 5: Partial accumulate: partial_1_0
+ "ldr s8, [x28, #0x0]\n"
+ "mov x24, #0x0\n"
+ "ldr s12, [x23, #0x0]\n"
+ "ldr s16, [x22, #0x0]\n"
+ "ldr s20, [x21, #0x0]\n"
+ "ldr s24, [x20, #0x0]\n"
+ "146:" // Height 5: Partial accumulate: Done
+ "sub x28, x28, x24\n"
+ "b 149f\n"
+ "147:" // Height 5: full accumulate
+ "ldr q8, [x28, #0x0]\n"
+ "ldr q9, [x28, #0x10]\n"
+ "ldr q10, [x28, #0x20]\n"
+ "ldr q11, [x28, #0x30]\n"
+ "ldr q12, [x23, #0x0]\n"
+ "ldr q13, [x23, #0x10]\n"
+ "ldr q14, [x23, #0x20]\n"
+ "ldr q15, [x23, #0x30]\n"
+ "ldr q16, [x22, #0x0]\n"
+ "ldr q17, [x22, #0x10]\n"
+ "ldr q18, [x22, #0x20]\n"
+ "ldr q19, [x22, #0x30]\n"
+ "ldr q20, [x21, #0x0]\n"
+ "ldr q21, [x21, #0x10]\n"
+ "ldr q22, [x21, #0x20]\n"
+ "ldr q23, [x21, #0x30]\n"
+ "ldr q24, [x20, #0x0]\n"
+ "ldr q25, [x20, #0x10]\n"
+ "ldr q26, [x20, #0x20]\n"
+ "ldr q27, [x20, #0x30]\n"
+ "b 149f\n"
+ "148:" // Height 5: no accumulate
"movi v8.4s, #0x0\n"
"movi v9.4s, #0x0\n"
"movi v10.4s, #0x0\n"
@@ -2016,260 +1963,260 @@ void a64_hybrid_s8s32_dot_6x16 (
"movi v25.4s, #0x0\n"
"movi v26.4s, #0x0\n"
"movi v27.4s, #0x0\n"
- "154:" // Height 5: setup done
- "mov x12, #0x0\n"
- "155:" // Height 5: String loop
+ "149:" // Height 5: setup done
+ "mov x27, #0x0\n"
+ "150:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w11, [x20, x12, LSL #0x2]\n"
- "tbz %x[flags], #3, 156f\n"
- "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 151f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x10, [x20, #0x0]\n"
- "ldr x28, [x20, #0x8]\n"
- "ldr x26, [x20, #0x10]\n"
- "ldr x24, [x20, #0x18]\n"
- "ldr x22, [x20, #0x20]\n"
- "cbnz x12, 157f\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "ldr x22, [x20, #0x18]\n"
+ "ldr x21, [x20, #0x20]\n"
+ "cbnz x27, 152f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x10, x10, x19\n"
- "add x28, x28, x19\n"
- "add x26, x26, x19\n"
+ "add x25, x25, x19\n"
"add x24, x24, x19\n"
+ "add x23, x23, x19\n"
"add x22, x22, x19\n"
- "b 157f\n"
- "156:" // Height 5: setup direct input
- "mov x10, %x[input_ptr]\n"
- "add x28, x10, x19\n"
- "add x26, x28, x19\n"
- "add x24, x26, x19\n"
- "add x22, x24, x19\n"
- "157:" // Height 5: input setup done
- "cmp x11, #0x10\n"
- "blt 160f\n"
- "cmp x11, #0x20\n"
- "blt 159f\n"
- "158:" // Height 5: Multiply loop: Main loop head
- "ldr q0, [x10, #0x0]\n"
- "ldr q1, [x28, #0x0]\n"
- "ldr q2, [x26, #0x0]\n"
- "ldr q3, [x24, #0x0]\n"
- "ldr q4, [x22, #0x0]\n"
- "ldr q6, [x14, #0x0]\n"
+ "add x21, x21, x19\n"
+ "b 152f\n"
+ "151:" // Height 5: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19\n"
+ "add x23, x24, x19\n"
+ "add x22, x23, x19\n"
+ "add x21, x22, x19\n"
+ "152:" // Height 5: input setup done
+ "cmp x26, #0x10\n"
+ "blt 155f\n"
+ "ldr q0, [x25, #0x0]\n"
+ "ldr q1, [x24, #0x0]\n"
+ "cmp x26, #0x20\n"
+ "ldr q2, [x23, #0x0]\n"
+ "ldr q3, [x22, #0x0]\n"
+ "ldr q4, [x21, #0x0]\n"
+ "ldr q6, [x9, #0x0]\n"
+ "blt 154f\n"
+ "153:" // Height 5: Multiply loop: Main loop head
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x14, #0x10]\n"
+ "ldr q7, [x9, #0x10]\n"
+ "add x25, x25, #0x10\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "add x10, x10, #0x10\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x24, x24, #0x10\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "add x28, x28, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "add x23, x23, #0x10\n"
".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "add x26, x26, #0x10\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "add x22, x22, #0x10\n"
".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "ldr q6, [x14, #0x20]\n"
+ "ldr q6, [x9, #0x20]\n"
+ "add x21, x21, #0x10\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "add x24, x24, #0x10\n"
- "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "sub x26, x26, #0x10\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "add x22, x22, #0x10\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
+ "cmp x26, #0x20\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "sub x11, x11, #0x10\n"
".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- "cmp x11, #0x20\n"
".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
- "ldr q7, [x14, #0x30]\n"
+ "ldr q7, [x9, #0x30]\n"
".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
- "ldr q6, [x14, #0x40]\n"
+ "ldr q6, [x9, #0x40]\n"
".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
- "ldr q7, [x14, #0x50]\n"
+ "ldr q7, [x9, #0x50]\n"
".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n"
- "ldr q6, [x14, #0x60]\n"
+ "ldr q6, [x9, #0x60]\n"
".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n"
- "ldr q7, [x14, #0x70]\n"
+ "ldr q7, [x9, #0x70]\n"
".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n"
- "ldr q6, [x14, #0x80]\n"
+ "ldr q6, [x9, #0x80]\n"
".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n"
- "ldr q7, [x14, #0x90]\n"
+ "ldr q7, [x9, #0x90]\n"
".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n"
- "ldr q6, [x14, #0xa0]\n"
+ "ldr q6, [x9, #0xa0]\n"
".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n"
- "ldr q7, [x14, #0xb0]\n"
+ "ldr q7, [x9, #0xb0]\n"
".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n"
- "ldr q6, [x14, #0xc0]\n"
+ "ldr q6, [x9, #0xc0]\n"
".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n"
- "ldr q7, [x14, #0xd0]\n"
+ "ldr q7, [x9, #0xd0]\n"
".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n"
- "ldr q6, [x14, #0xe0]\n"
+ "ldr q6, [x9, #0xe0]\n"
".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n"
- "ldr q7, [x14, #0xf0]\n"
+ "ldr q7, [x9, #0xf0]\n"
+ "add x9, x9, #0x100\n"
".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
- "add x14, x14, #0x100\n"
".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n"
".inst 0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n"
+ "ldr q6, [x9, #0x0]\n"
".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ "ldr q0, [x25, #0x0]\n"
".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
+ "ldr q1, [x24, #0x0]\n"
".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
+ "ldr q2, [x23, #0x0]\n"
".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n"
+ "ldr q3, [x22, #0x0]\n"
".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n"
- "bge 158b\n"
- "159:" // Height 5: Multiply loop: Single iteration only
- "sub x11, x11, #0x10\n"
- "ldr q0, [x10, #0x0]\n"
- "ldr q1, [x28, #0x0]\n"
- "ldr q2, [x26, #0x0]\n"
- "ldr q3, [x24, #0x0]\n"
- "ldr q4, [x22, #0x0]\n"
- "ldr q6, [x14, #0x0]\n"
+ "ldr q4, [x21, #0x0]\n"
+ "bge 153b\n"
+ "154:" // Height 5: Multiply loop: Single iteration only
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x14, #0x10]\n"
+ "ldr q7, [x9, #0x10]\n"
+ "sub x26, x26, #0x10\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "add x10, x10, #0x10\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "add x25, x25, #0x10\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "add x28, x28, #0x10\n"
+ "add x24, x24, #0x10\n"
".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "add x26, x26, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "add x23, x23, #0x10\n"
".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "ldr q6, [x14, #0x20]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "add x22, x22, #0x10\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "add x24, x24, #0x10\n"
- "prfm pldl1keep, [x24, #0x80]\n"
+ "ldr q6, [x9, #0x20]\n"
+ "add x21, x21, #0x10\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "add x22, x22, #0x10\n"
- ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
"prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
- "ldr q7, [x14, #0x30]\n"
+ "ldr q7, [x9, #0x30]\n"
".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
- "ldr q6, [x14, #0x40]\n"
+ "ldr q6, [x9, #0x40]\n"
".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
- "ldr q7, [x14, #0x50]\n"
+ "ldr q7, [x9, #0x50]\n"
".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n"
- "ldr q6, [x14, #0x60]\n"
+ "ldr q6, [x9, #0x60]\n"
".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n"
- "ldr q7, [x14, #0x70]\n"
+ "ldr q7, [x9, #0x70]\n"
".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n"
- "ldr q6, [x14, #0x80]\n"
+ "ldr q6, [x9, #0x80]\n"
".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n"
- "ldr q7, [x14, #0x90]\n"
+ "ldr q7, [x9, #0x90]\n"
".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n"
- "ldr q6, [x14, #0xa0]\n"
+ "ldr q6, [x9, #0xa0]\n"
".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n"
- "ldr q7, [x14, #0xb0]\n"
+ "ldr q7, [x9, #0xb0]\n"
".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n"
- "ldr q6, [x14, #0xc0]\n"
+ "ldr q6, [x9, #0xc0]\n"
".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n"
- "ldr q7, [x14, #0xd0]\n"
+ "ldr q7, [x9, #0xd0]\n"
".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n"
- "ldr q6, [x14, #0xe0]\n"
+ "ldr q6, [x9, #0xe0]\n"
".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n"
- "ldr q7, [x14, #0xf0]\n"
+ "ldr q7, [x9, #0xf0]\n"
+ "add x9, x9, #0x100\n"
".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
- "add x14, x14, #0x100\n"
".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n"
@@ -2279,34 +2226,34 @@ void a64_hybrid_s8s32_dot_6x16 (
".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n"
".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n"
- "160:" // Height 5: Multiply loop: Main loop skip
- "cbz x11, 165f\n"
- "cmp x11, #0x4\n"
- "blt 162f\n"
- "161:" // Height 5: Multiply loop: Odd block loop
- "ldr s0, [x10], #0x4\n"
- "ldr s1, [x28], #0x4\n"
- "ldr s2, [x26], #0x4\n"
- "ldr s3, [x24], #0x4\n"
- "ldr s4, [x22], #0x4\n"
- "ldr q6, [x14, #0x0]\n"
+ "155:" // Height 5: Multiply loop: Main loop skip
+ "cbz x26, 160f\n"
+ "cmp x26, #0x4\n"
+ "blt 157f\n"
+ "156:" // Height 5: Multiply loop: Odd block loop
+ "ldr s0, [x25], #0x4\n"
+ "sub x26, x26, #0x4\n"
+ "ldr s1, [x24], #0x4\n"
+ "cmp x26, #0x4\n"
+ "ldr s2, [x23], #0x4\n"
+ "ldr s3, [x22], #0x4\n"
+ "ldr s4, [x21], #0x4\n"
+ "ldr q6, [x9, #0x0]\n"
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x14, #0x10]\n"
+ "ldr q7, [x9, #0x10]\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "sub x11, x11, #0x4\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "cmp x11, #0x4\n"
".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
- "ldr q6, [x14, #0x20]\n"
+ "ldr q6, [x9, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
- "ldr q7, [x14, #0x30]\n"
+ "ldr q7, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- "add x14, x14, #0x40\n"
".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
@@ -2316,45 +2263,45 @@ void a64_hybrid_s8s32_dot_6x16 (
".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
- "bge 161b\n"
- "cbz x11, 165f\n"
- "162:" // Height 5: Multiply loop: Skip odd blocks
- "tbz x11, #1, 163f\n"
- "ldr h0, [x10], #0x2\n"
- "ldr h1, [x28], #0x2\n"
- "ldr h2, [x26], #0x2\n"
- "ldr h3, [x24], #0x2\n"
- "ldr h4, [x22], #0x2\n"
- "tbz x11, #0, 164f\n"
- "ld1 { v0.b }[2], [x10]\n"
- "ld1 { v1.b }[2], [x28]\n"
- "ld1 { v2.b }[2], [x26]\n"
- "ld1 { v3.b }[2], [x24]\n"
- "ld1 { v4.b }[2], [x22]\n"
- "b 164f\n"
- "163:" // Height 5: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x10, #0x0]\n"
- "ldr b1, [x28, #0x0]\n"
- "ldr b2, [x26, #0x0]\n"
- "ldr b3, [x24, #0x0]\n"
- "ldr b4, [x22, #0x0]\n"
- "164:" // Height 5: Multiply loop: Ragged operand read: Done
- "ldr q6, [x14, #0x0]\n"
+ "bge 156b\n"
+ "cbz x26, 160f\n"
+ "157:" // Height 5: Multiply loop: Skip odd blocks
+ "tbz x26, #1, 158f\n"
+ "ldr h0, [x25], #0x2\n"
+ "ldr h1, [x24], #0x2\n"
+ "ldr h2, [x23], #0x2\n"
+ "ldr h3, [x22], #0x2\n"
+ "ldr h4, [x21], #0x2\n"
+ "tbz x26, #0, 159f\n"
+ "ld1 { v0.b }[2], [x25]\n"
+ "ld1 { v1.b }[2], [x24]\n"
+ "ld1 { v2.b }[2], [x23]\n"
+ "ld1 { v3.b }[2], [x22]\n"
+ "ld1 { v4.b }[2], [x21]\n"
+ "b 159f\n"
+ "158:" // Height 5: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x25, #0x0]\n"
+ "ldr b1, [x24, #0x0]\n"
+ "ldr b2, [x23, #0x0]\n"
+ "ldr b3, [x22, #0x0]\n"
+ "ldr b4, [x21, #0x0]\n"
+ "159:" // Height 5: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x9, #0x0]\n"
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x14, #0x10]\n"
+ "ldr q7, [x9, #0x10]\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
- "ldr q6, [x14, #0x20]\n"
+ "ldr q6, [x9, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
- "ldr q7, [x14, #0x30]\n"
+ "ldr q7, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- "add x14, x14, #0x40\n"
".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
@@ -2364,356 +2311,338 @@ void a64_hybrid_s8s32_dot_6x16 (
".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
- "165:" // Height 5: Multiply loop: No odd multiplies
+ "160:" // Height 5: Multiply loop: No odd multiplies
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x12, x12, #0x1\n"
- "cmp x12, x19\n"
- "bne 155b\n"
- "prfm pstl1keep, [x13, #0x0]\n"
- "prfm pstl1keep, [x9, #0x0]\n"
- "cmp x15, #0x10\n"
- "prfm pstl1keep, [x27, #0x0]\n"
- "prfm pstl1keep, [x25, #0x0]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 150b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "prfm pstl1keep, [x28, #0x0]\n"
+ "cmp x10, #0x10\n"
+ "add x23, x28, x19, LSL #2\n"
"prfm pstl1keep, [x23, #0x0]\n"
- "bge 174f\n"
- "tbz x15, #3, 169f\n"
- "st1 { v8.4s }, [x13], #0x10\n"
- "st1 { v9.4s }, [x13], #0x10\n"
- "st1 { v12.4s }, [x9], #0x10\n"
- "st1 { v13.4s }, [x9], #0x10\n"
- "st1 { v16.4s }, [x27], #0x10\n"
- "st1 { v17.4s }, [x27], #0x10\n"
- "st1 { v20.4s }, [x25], #0x10\n"
- "st1 { v21.4s }, [x25], #0x10\n"
- "st1 { v24.4s }, [x23], #0x10\n"
- "st1 { v25.4s }, [x23], #0x10\n"
- "tbz x15, #2, 167f\n"
- "st1 { v10.4s }, [x13], #0x10\n"
- "st1 { v14.4s }, [x9], #0x10\n"
- "st1 { v18.4s }, [x27], #0x10\n"
- "st1 { v22.4s }, [x25], #0x10\n"
- "st1 { v26.4s }, [x23], #0x10\n"
- "tbz x15, #1, 166f\n"
- "str d11, [x13], #0x8\n"
- "str d15, [x9], #0x8\n"
- "str d19, [x27], #0x8\n"
- "str d23, [x25], #0x8\n"
- "str d27, [x23], #0x8\n"
- "tbz x15, #0, 173f\n"
- "st1 { v11.s }[2], [x13]\n"
- "st1 { v15.s }[2], [x9]\n"
- "st1 { v19.s }[2], [x27]\n"
- "st1 { v23.s }[2], [x25]\n"
- "st1 { v27.s }[2], [x23]\n"
- "b 173f\n"
- "166:" // Height 5: Partial direct writeback: partial_1_12
- "tbz x15, #0, 173f\n"
- "str s11, [x13, #0x0]\n"
- "str s15, [x9, #0x0]\n"
- "str s19, [x27, #0x0]\n"
- "str s23, [x25, #0x0]\n"
- "str s27, [x23, #0x0]\n"
- "b 173f\n"
- "167:" // Height 5: Partial direct writeback: partial_2_8
- "tbz x15, #1, 168f\n"
- "str d10, [x13], #0x8\n"
- "str d14, [x9], #0x8\n"
- "str d18, [x27], #0x8\n"
- "str d22, [x25], #0x8\n"
- "str d26, [x23], #0x8\n"
- "tbz x15, #0, 173f\n"
- "st1 { v10.s }[2], [x13]\n"
- "st1 { v14.s }[2], [x9]\n"
- "st1 { v18.s }[2], [x27]\n"
- "st1 { v22.s }[2], [x25]\n"
- "st1 { v26.s }[2], [x23]\n"
- "b 173f\n"
- "168:" // Height 5: Partial direct writeback: partial_1_8
- "tbz x15, #0, 173f\n"
- "str s10, [x13, #0x0]\n"
- "str s14, [x9, #0x0]\n"
- "str s18, [x27, #0x0]\n"
- "str s22, [x25, #0x0]\n"
- "str s26, [x23, #0x0]\n"
- "b 173f\n"
- "169:" // Height 5: Partial direct writeback: partial_4_0
- "tbz x15, #2, 171f\n"
- "st1 { v8.4s }, [x13], #0x10\n"
- "st1 { v12.4s }, [x9], #0x10\n"
- "st1 { v16.4s }, [x27], #0x10\n"
- "st1 { v20.4s }, [x25], #0x10\n"
- "st1 { v24.4s }, [x23], #0x10\n"
- "tbz x15, #1, 170f\n"
- "str d9, [x13], #0x8\n"
- "str d13, [x9], #0x8\n"
- "str d17, [x27], #0x8\n"
- "str d21, [x25], #0x8\n"
- "str d25, [x23], #0x8\n"
- "tbz x15, #0, 173f\n"
- "st1 { v9.s }[2], [x13]\n"
- "st1 { v13.s }[2], [x9]\n"
- "st1 { v17.s }[2], [x27]\n"
- "st1 { v21.s }[2], [x25]\n"
- "st1 { v25.s }[2], [x23]\n"
- "b 173f\n"
- "170:" // Height 5: Partial direct writeback: partial_1_4
- "tbz x15, #0, 173f\n"
- "str s9, [x13, #0x0]\n"
- "str s13, [x9, #0x0]\n"
- "str s17, [x27, #0x0]\n"
- "str s21, [x25, #0x0]\n"
- "str s25, [x23, #0x0]\n"
- "b 173f\n"
- "171:" // Height 5: Partial direct writeback: partial_2_0
- "tbz x15, #1, 172f\n"
- "str d8, [x13], #0x8\n"
- "str d12, [x9], #0x8\n"
- "str d16, [x27], #0x8\n"
- "str d20, [x25], #0x8\n"
- "str d24, [x23], #0x8\n"
- "tbz x15, #0, 173f\n"
- "st1 { v8.s }[2], [x13]\n"
- "st1 { v12.s }[2], [x9]\n"
- "st1 { v16.s }[2], [x27]\n"
- "st1 { v20.s }[2], [x25]\n"
- "st1 { v24.s }[2], [x23]\n"
- "b 173f\n"
- "172:" // Height 5: Partial direct writeback: partial_1_0
- "str s8, [x13, #0x0]\n"
- "str s12, [x9, #0x0]\n"
- "str s16, [x27, #0x0]\n"
- "str s20, [x25, #0x0]\n"
- "str s24, [x23, #0x0]\n"
- "173:" // Height 5: Partial direct writeback: Done
- "b 175f\n"
- "174:" // Height 5: Full writeback
- "str q8, [x13, #0x0]\n"
- "str q9, [x13, #0x10]\n"
- "str q10, [x13, #0x20]\n"
- "str q11, [x13, #0x30]\n"
- "str q12, [x9, #0x0]\n"
- "str q13, [x9, #0x10]\n"
- "str q14, [x9, #0x20]\n"
- "str q15, [x9, #0x30]\n"
- "str q16, [x27, #0x0]\n"
- "str q17, [x27, #0x10]\n"
- "str q18, [x27, #0x20]\n"
- "str q19, [x27, #0x30]\n"
- "str q20, [x25, #0x0]\n"
- "str q21, [x25, #0x10]\n"
- "str q22, [x25, #0x20]\n"
- "str q23, [x25, #0x30]\n"
- "str q24, [x23, #0x0]\n"
- "str q25, [x23, #0x10]\n"
- "str q26, [x23, #0x20]\n"
- "str q27, [x23, #0x30]\n"
- "add x13, x13, #0x40\n"
- "add x9, x9, #0x40\n"
- "add x27, x27, #0x40\n"
- "add x25, x25, #0x40\n"
- "add x23, x23, #0x40\n"
- "175:" // Height 5: Writeback done
- "subs x15, x15, #0x10\n"
- "bgt 143b\n"
- "b 212f\n"
- "176:" // Height 6
- "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "add x22, x23, x19, LSL #2\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "add x21, x22, x19, LSL #2\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "add x20, x21, x19, LSL #2\n"
+ "prfm pstl1keep, [x20, #0x0]\n"
+ "bge 169f\n"
+ "tbz x10, #3, 164f\n"
+ "st1 { v8.4s }, [x28], #0x10\n"
+ "st1 { v9.4s }, [x28], #0x10\n"
+ "st1 { v12.4s }, [x23], #0x10\n"
+ "st1 { v13.4s }, [x23], #0x10\n"
+ "st1 { v16.4s }, [x22], #0x10\n"
+ "st1 { v17.4s }, [x22], #0x10\n"
+ "st1 { v20.4s }, [x21], #0x10\n"
+ "st1 { v21.4s }, [x21], #0x10\n"
+ "st1 { v24.4s }, [x20], #0x10\n"
+ "st1 { v25.4s }, [x20], #0x10\n"
+ "tbz x10, #2, 162f\n"
+ "st1 { v10.4s }, [x28], #0x10\n"
+ "st1 { v14.4s }, [x23], #0x10\n"
+ "st1 { v18.4s }, [x22], #0x10\n"
+ "st1 { v22.4s }, [x21], #0x10\n"
+ "st1 { v26.4s }, [x20], #0x10\n"
+ "tbz x10, #1, 161f\n"
+ "str d11, [x28], #0x8\n"
+ "str d15, [x23], #0x8\n"
+ "str d19, [x22], #0x8\n"
+ "str d23, [x21], #0x8\n"
+ "str d27, [x20], #0x8\n"
+ "tbz x10, #0, 168f\n"
+ "st1 { v11.s }[2], [x28]\n"
+ "st1 { v15.s }[2], [x23]\n"
+ "st1 { v19.s }[2], [x22]\n"
+ "st1 { v23.s }[2], [x21]\n"
+ "st1 { v27.s }[2], [x20]\n"
+ "b 168f\n"
+ "161:" // Height 5: Partial direct writeback: partial_1_12
+ "tbz x10, #0, 168f\n"
+ "str s11, [x28, #0x0]\n"
+ "str s15, [x23, #0x0]\n"
+ "str s19, [x22, #0x0]\n"
+ "str s23, [x21, #0x0]\n"
+ "str s27, [x20, #0x0]\n"
+ "b 168f\n"
+ "162:" // Height 5: Partial direct writeback: partial_2_8
+ "tbz x10, #1, 163f\n"
+ "str d10, [x28], #0x8\n"
+ "str d14, [x23], #0x8\n"
+ "str d18, [x22], #0x8\n"
+ "str d22, [x21], #0x8\n"
+ "str d26, [x20], #0x8\n"
+ "tbz x10, #0, 168f\n"
+ "st1 { v10.s }[2], [x28]\n"
+ "st1 { v14.s }[2], [x23]\n"
+ "st1 { v18.s }[2], [x22]\n"
+ "st1 { v22.s }[2], [x21]\n"
+ "st1 { v26.s }[2], [x20]\n"
+ "b 168f\n"
+ "163:" // Height 5: Partial direct writeback: partial_1_8
+ "tbz x10, #0, 168f\n"
+ "str s10, [x28, #0x0]\n"
+ "str s14, [x23, #0x0]\n"
+ "str s18, [x22, #0x0]\n"
+ "str s22, [x21, #0x0]\n"
+ "str s26, [x20, #0x0]\n"
+ "b 168f\n"
+ "164:" // Height 5: Partial direct writeback: partial_4_0
+ "tbz x10, #2, 166f\n"
+ "st1 { v8.4s }, [x28], #0x10\n"
+ "st1 { v12.4s }, [x23], #0x10\n"
+ "st1 { v16.4s }, [x22], #0x10\n"
+ "st1 { v20.4s }, [x21], #0x10\n"
+ "st1 { v24.4s }, [x20], #0x10\n"
+ "tbz x10, #1, 165f\n"
+ "str d9, [x28], #0x8\n"
+ "str d13, [x23], #0x8\n"
+ "str d17, [x22], #0x8\n"
+ "str d21, [x21], #0x8\n"
+ "str d25, [x20], #0x8\n"
+ "tbz x10, #0, 168f\n"
+ "st1 { v9.s }[2], [x28]\n"
+ "st1 { v13.s }[2], [x23]\n"
+ "st1 { v17.s }[2], [x22]\n"
+ "st1 { v21.s }[2], [x21]\n"
+ "st1 { v25.s }[2], [x20]\n"
+ "b 168f\n"
+ "165:" // Height 5: Partial direct writeback: partial_1_4
+ "tbz x10, #0, 168f\n"
+ "str s9, [x28, #0x0]\n"
+ "str s13, [x23, #0x0]\n"
+ "str s17, [x22, #0x0]\n"
+ "str s21, [x21, #0x0]\n"
+ "str s25, [x20, #0x0]\n"
+ "b 168f\n"
+ "166:" // Height 5: Partial direct writeback: partial_2_0
+ "tbz x10, #1, 167f\n"
+ "str d8, [x28], #0x8\n"
+ "str d12, [x23], #0x8\n"
+ "str d16, [x22], #0x8\n"
+ "str d20, [x21], #0x8\n"
+ "str d24, [x20], #0x8\n"
+ "tbz x10, #0, 168f\n"
+ "st1 { v8.s }[2], [x28]\n"
+ "st1 { v12.s }[2], [x23]\n"
+ "st1 { v16.s }[2], [x22]\n"
+ "st1 { v20.s }[2], [x21]\n"
+ "st1 { v24.s }[2], [x20]\n"
+ "b 168f\n"
+ "167:" // Height 5: Partial direct writeback: partial_1_0
+ "str s8, [x28, #0x0]\n"
+ "str s12, [x23, #0x0]\n"
+ "str s16, [x22, #0x0]\n"
+ "str s20, [x21, #0x0]\n"
+ "str s24, [x20, #0x0]\n"
+ "168:" // Height 5: Partial direct writeback: Done
+ "b 170f\n"
+ "169:" // Height 5: Full writeback
+ "str q8, [x28, #0x0]\n"
+ "str q9, [x28, #0x10]\n"
+ "str q10, [x28, #0x20]\n"
+ "str q11, [x28, #0x30]\n"
+ "add x28, x28, #0x40\n"
+ "str q12, [x23, #0x0]\n"
+ "str q13, [x23, #0x10]\n"
+ "str q14, [x23, #0x20]\n"
+ "str q15, [x23, #0x30]\n"
+ "str q16, [x22, #0x0]\n"
+ "str q17, [x22, #0x10]\n"
+ "str q18, [x22, #0x20]\n"
+ "str q19, [x22, #0x30]\n"
+ "str q20, [x21, #0x0]\n"
+ "str q21, [x21, #0x10]\n"
+ "str q22, [x21, #0x20]\n"
+ "str q23, [x21, #0x30]\n"
+ "str q24, [x20, #0x0]\n"
+ "str q25, [x20, #0x10]\n"
+ "str q26, [x20, #0x20]\n"
+ "str q27, [x20, #0x30]\n"
+ "170:" // Height 5: Writeback done
+ "subs x10, x10, #0x10\n"
+ "bgt 138b\n"
+ "b 206f\n"
+ "171:" // Height 6
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x20, #0x18\n"
"ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "tbz %x[flags], #2, 177f\n"
- "ldr x13, [%x[output_ptr], #0x0]\n"
- "add x13, x13, x19, LSL #2\n"
- "ldr x9, [%x[output_ptr], #0x8]\n"
- "ldr x27, [%x[output_ptr], #0x10]\n"
- "add x9, x9, x19, LSL #2\n"
- "ldr x25, [%x[output_ptr], #0x18]\n"
- "ldr x23, [%x[output_ptr], #0x20]\n"
- "add x27, x27, x19, LSL #2\n"
- "ldr x21, [%x[output_ptr], #0x28]\n"
- "add %x[output_ptr], %x[output_ptr], #0x30\n"
- "add x25, x25, x19, LSL #2\n"
- "add x23, x23, x19, LSL #2\n"
- "add x21, x21, x19, LSL #2\n"
- "b 178f\n"
- "177:" // Height 6: setup direct output
- "mov x13, %x[output_ptr]\n"
- "add x9, x13, x19, LSL #2\n"
- "add x27, x9, x19, LSL #2\n"
- "add x25, x27, x19, LSL #2\n"
- "add x23, x25, x19, LSL #2\n"
- "add x21, x23, x19, LSL #2\n"
- "add %x[output_ptr], x21, x19, LSL #2\n"
- "178:" // Height 6: Column loop
- "tbz %x[flags], #0, 188f\n"
- "cmp x15, #0x10\n"
- "bge 187f\n"
- "tbz x15, #3, 182f\n"
- "ld1 { v8.4s }, [x13], #0x10\n"
- "ld1 { v12.4s }, [x9], #0x10\n"
- "ld1 { v16.4s }, [x27], #0x10\n"
- "ld1 { v20.4s }, [x25], #0x10\n"
- "ld1 { v24.4s }, [x23], #0x10\n"
- "ld1 { v28.4s }, [x21], #0x10\n"
- "ld1 { v9.4s }, [x13], #0x10\n"
- "ld1 { v13.4s }, [x9], #0x10\n"
- "ld1 { v17.4s }, [x27], #0x10\n"
- "ld1 { v21.4s }, [x25], #0x10\n"
- "ld1 { v25.4s }, [x23], #0x10\n"
- "ld1 { v29.4s }, [x21], #0x10\n"
- "tbz x15, #2, 180f\n"
- "ld1 { v10.4s }, [x13], #0x10\n"
- "ld1 { v14.4s }, [x9], #0x10\n"
- "ld1 { v18.4s }, [x27], #0x10\n"
- "ld1 { v22.4s }, [x25], #0x10\n"
- "ld1 { v26.4s }, [x23], #0x10\n"
- "ld1 { v30.4s }, [x21], #0x10\n"
- "tbz x15, #1, 179f\n"
- "mov x19, #0x38\n"
- "ldr d11, [x13], #0x8\n"
- "ldr d15, [x9], #0x8\n"
- "ldr d19, [x27], #0x8\n"
- "ldr d23, [x25], #0x8\n"
- "ldr d27, [x23], #0x8\n"
- "ldr d31, [x21], #0x8\n"
- "tbz x15, #0, 186f\n"
- "ld1 { v11.s }[2], [x13]\n"
- "ld1 { v15.s }[2], [x9]\n"
- "ld1 { v19.s }[2], [x27]\n"
- "ld1 { v23.s }[2], [x25]\n"
- "ld1 { v27.s }[2], [x23]\n"
- "ld1 { v31.s }[2], [x21]\n"
- "b 186f\n"
- "179:" // Height 6: Partial accumulate: partial_1_12
- "mov x19, #0x30\n"
- "tbz x15, #0, 186f\n"
- "ldr s11, [x13, #0x0]\n"
- "ldr s15, [x9, #0x0]\n"
- "ldr s19, [x27, #0x0]\n"
- "ldr s23, [x25, #0x0]\n"
- "ldr s27, [x23, #0x0]\n"
- "ldr s31, [x21, #0x0]\n"
- "b 186f\n"
- "180:" // Height 6: Partial accumulate: partial_2_8
- "tbz x15, #1, 181f\n"
- "ldr d10, [x13], #0x8\n"
- "ldr d14, [x9], #0x8\n"
- "ldr d18, [x27], #0x8\n"
- "ldr d22, [x25], #0x8\n"
- "ldr d26, [x23], #0x8\n"
- "ldr d30, [x21], #0x8\n"
- "mov x19, #0x28\n"
- "tbz x15, #0, 186f\n"
- "ld1 { v10.s }[2], [x13]\n"
- "ld1 { v14.s }[2], [x9]\n"
- "ld1 { v18.s }[2], [x27]\n"
- "ld1 { v22.s }[2], [x25]\n"
- "ld1 { v26.s }[2], [x23]\n"
- "ld1 { v30.s }[2], [x21]\n"
- "b 186f\n"
- "181:" // Height 6: Partial accumulate: partial_1_8
- "mov x19, #0x20\n"
- "tbz x15, #0, 186f\n"
- "ldr s10, [x13, #0x0]\n"
- "ldr s14, [x9, #0x0]\n"
- "ldr s18, [x27, #0x0]\n"
- "ldr s22, [x25, #0x0]\n"
- "ldr s26, [x23, #0x0]\n"
- "ldr s30, [x21, #0x0]\n"
- "b 186f\n"
- "182:" // Height 6: Partial accumulate: partial_4_0
- "tbz x15, #2, 184f\n"
- "ld1 { v8.4s }, [x13], #0x10\n"
- "ld1 { v12.4s }, [x9], #0x10\n"
- "ld1 { v16.4s }, [x27], #0x10\n"
- "ld1 { v20.4s }, [x25], #0x10\n"
- "ld1 { v24.4s }, [x23], #0x10\n"
- "ld1 { v28.4s }, [x21], #0x10\n"
- "tbz x15, #1, 183f\n"
- "mov x19, #0x18\n"
- "ldr d9, [x13], #0x8\n"
- "ldr d13, [x9], #0x8\n"
- "ldr d17, [x27], #0x8\n"
- "ldr d21, [x25], #0x8\n"
- "ldr d25, [x23], #0x8\n"
- "ldr d29, [x21], #0x8\n"
- "tbz x15, #0, 186f\n"
- "ld1 { v9.s }[2], [x13]\n"
- "ld1 { v13.s }[2], [x9]\n"
- "ld1 { v17.s }[2], [x27]\n"
- "ld1 { v21.s }[2], [x25]\n"
- "ld1 { v25.s }[2], [x23]\n"
- "ld1 { v29.s }[2], [x21]\n"
- "b 186f\n"
- "183:" // Height 6: Partial accumulate: partial_1_4
- "mov x19, #0x10\n"
- "tbz x15, #0, 186f\n"
- "ldr s9, [x13, #0x0]\n"
- "ldr s13, [x9, #0x0]\n"
- "ldr s17, [x27, #0x0]\n"
- "ldr s21, [x25, #0x0]\n"
- "ldr s25, [x23, #0x0]\n"
- "ldr s29, [x21, #0x0]\n"
- "b 186f\n"
- "184:" // Height 6: Partial accumulate: partial_2_0
- "tbz x15, #1, 185f\n"
- "ldr d8, [x13], #0x8\n"
- "ldr d12, [x9], #0x8\n"
- "ldr d16, [x27], #0x8\n"
- "ldr d20, [x25], #0x8\n"
- "ldr d24, [x23], #0x8\n"
- "ldr d28, [x21], #0x8\n"
- "mov x19, #0x8\n"
- "tbz x15, #0, 186f\n"
- "ld1 { v8.s }[2], [x13]\n"
- "ld1 { v12.s }[2], [x9]\n"
- "ld1 { v16.s }[2], [x27]\n"
- "ld1 { v20.s }[2], [x25]\n"
- "ld1 { v24.s }[2], [x23]\n"
- "ld1 { v28.s }[2], [x21]\n"
- "b 186f\n"
- "185:" // Height 6: Partial accumulate: partial_1_0
- "mov x19, #0x0\n"
- "ldr s8, [x13, #0x0]\n"
- "ldr s12, [x9, #0x0]\n"
- "ldr s16, [x27, #0x0]\n"
- "ldr s20, [x25, #0x0]\n"
- "ldr s24, [x23, #0x0]\n"
- "ldr s28, [x21, #0x0]\n"
- "186:" // Height 6: Partial accumulate: Done
- "sub x13, x13, x19\n"
- "sub x9, x9, x19\n"
- "sub x27, x27, x19\n"
- "sub x25, x25, x19\n"
- "sub x23, x23, x19\n"
- "sub x21, x21, x19\n"
- "b 189f\n"
- "187:" // Height 6: full accumulate
- "ldr q8, [x13, #0x0]\n"
- "ldr q9, [x13, #0x10]\n"
- "ldr q10, [x13, #0x20]\n"
- "ldr q11, [x13, #0x30]\n"
- "ldr q12, [x9, #0x0]\n"
- "ldr q13, [x9, #0x10]\n"
- "ldr q14, [x9, #0x20]\n"
- "ldr q15, [x9, #0x30]\n"
- "ldr q16, [x27, #0x0]\n"
- "ldr q17, [x27, #0x10]\n"
- "ldr q18, [x27, #0x20]\n"
- "ldr q19, [x27, #0x30]\n"
- "ldr q20, [x25, #0x0]\n"
- "ldr q21, [x25, #0x10]\n"
- "ldr q22, [x25, #0x20]\n"
- "ldr q23, [x25, #0x30]\n"
- "ldr q24, [x23, #0x0]\n"
- "ldr q25, [x23, #0x10]\n"
- "ldr q26, [x23, #0x20]\n"
- "ldr q27, [x23, #0x30]\n"
- "ldr q28, [x21, #0x0]\n"
- "ldr q29, [x21, #0x10]\n"
- "ldr q30, [x21, #0x20]\n"
- "ldr q31, [x21, #0x30]\n"
- "b 189f\n"
- "188:" // Height 6: no accumulate
+ "madd %x[output_ptr], x19, x20, %x[output_ptr]\n"
+ "172:" // Height 6: Column loop
+ "tbz %x[flags], #0, 182f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x10, #0x10\n"
+ "add x23, x28, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "add x21, x22, x19, LSL #2\n"
+ "add x20, x21, x19, LSL #2\n"
+ "add x19, x20, x19, LSL #2\n"
+ "bge 181f\n"
+ "tbz x10, #3, 176f\n"
+ "ld1 { v8.4s }, [x28], #0x10\n"
+ "ld1 { v12.4s }, [x23], #0x10\n"
+ "ld1 { v16.4s }, [x22], #0x10\n"
+ "ld1 { v20.4s }, [x21], #0x10\n"
+ "ld1 { v24.4s }, [x20], #0x10\n"
+ "ld1 { v28.4s }, [x19], #0x10\n"
+ "ld1 { v9.4s }, [x28], #0x10\n"
+ "ld1 { v13.4s }, [x23], #0x10\n"
+ "ld1 { v17.4s }, [x22], #0x10\n"
+ "ld1 { v21.4s }, [x21], #0x10\n"
+ "ld1 { v25.4s }, [x20], #0x10\n"
+ "ld1 { v29.4s }, [x19], #0x10\n"
+ "tbz x10, #2, 174f\n"
+ "ld1 { v10.4s }, [x28], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "ld1 { v18.4s }, [x22], #0x10\n"
+ "ld1 { v22.4s }, [x21], #0x10\n"
+ "ld1 { v26.4s }, [x20], #0x10\n"
+ "ld1 { v30.4s }, [x19], #0x10\n"
+ "tbz x10, #1, 173f\n"
+ "mov x24, #0x38\n"
+ "ldr d11, [x28], #0x8\n"
+ "ldr d15, [x23], #0x8\n"
+ "ldr d19, [x22], #0x8\n"
+ "ldr d23, [x21], #0x8\n"
+ "ldr d27, [x20], #0x8\n"
+ "ldr d31, [x19], #0x8\n"
+ "tbz x10, #0, 180f\n"
+ "ld1 { v11.s }[2], [x28]\n"
+ "ld1 { v15.s }[2], [x23]\n"
+ "ld1 { v19.s }[2], [x22]\n"
+ "ld1 { v23.s }[2], [x21]\n"
+ "ld1 { v27.s }[2], [x20]\n"
+ "ld1 { v31.s }[2], [x19]\n"
+ "b 180f\n"
+ "173:" // Height 6: Partial accumulate: partial_1_12
+ "mov x24, #0x30\n"
+ "tbz x10, #0, 180f\n"
+ "ldr s11, [x28, #0x0]\n"
+ "ldr s15, [x23, #0x0]\n"
+ "ldr s19, [x22, #0x0]\n"
+ "ldr s23, [x21, #0x0]\n"
+ "ldr s27, [x20, #0x0]\n"
+ "ldr s31, [x19, #0x0]\n"
+ "b 180f\n"
+ "174:" // Height 6: Partial accumulate: partial_2_8
+ "tbz x10, #1, 175f\n"
+ "ldr d10, [x28], #0x8\n"
+ "ldr d14, [x23], #0x8\n"
+ "mov x24, #0x28\n"
+ "ldr d18, [x22], #0x8\n"
+ "ldr d22, [x21], #0x8\n"
+ "ldr d26, [x20], #0x8\n"
+ "ldr d30, [x19], #0x8\n"
+ "tbz x10, #0, 180f\n"
+ "ld1 { v10.s }[2], [x28]\n"
+ "ld1 { v14.s }[2], [x23]\n"
+ "ld1 { v18.s }[2], [x22]\n"
+ "ld1 { v22.s }[2], [x21]\n"
+ "ld1 { v26.s }[2], [x20]\n"
+ "ld1 { v30.s }[2], [x19]\n"
+ "b 180f\n"
+ "175:" // Height 6: Partial accumulate: partial_1_8
+ "mov x24, #0x20\n"
+ "tbz x10, #0, 180f\n"
+ "ldr s10, [x28, #0x0]\n"
+ "ldr s14, [x23, #0x0]\n"
+ "ldr s18, [x22, #0x0]\n"
+ "ldr s22, [x21, #0x0]\n"
+ "ldr s26, [x20, #0x0]\n"
+ "ldr s30, [x19, #0x0]\n"
+ "b 180f\n"
+ "176:" // Height 6: Partial accumulate: partial_4_0
+ "tbz x10, #2, 178f\n"
+ "ld1 { v8.4s }, [x28], #0x10\n"
+ "ld1 { v12.4s }, [x23], #0x10\n"
+ "ld1 { v16.4s }, [x22], #0x10\n"
+ "ld1 { v20.4s }, [x21], #0x10\n"
+ "ld1 { v24.4s }, [x20], #0x10\n"
+ "ld1 { v28.4s }, [x19], #0x10\n"
+ "tbz x10, #1, 177f\n"
+ "mov x24, #0x18\n"
+ "ldr d9, [x28], #0x8\n"
+ "ldr d13, [x23], #0x8\n"
+ "ldr d17, [x22], #0x8\n"
+ "ldr d21, [x21], #0x8\n"
+ "ldr d25, [x20], #0x8\n"
+ "ldr d29, [x19], #0x8\n"
+ "tbz x10, #0, 180f\n"
+ "ld1 { v9.s }[2], [x28]\n"
+ "ld1 { v13.s }[2], [x23]\n"
+ "ld1 { v17.s }[2], [x22]\n"
+ "ld1 { v21.s }[2], [x21]\n"
+ "ld1 { v25.s }[2], [x20]\n"
+ "ld1 { v29.s }[2], [x19]\n"
+ "b 180f\n"
+ "177:" // Height 6: Partial accumulate: partial_1_4
+ "mov x24, #0x10\n"
+ "tbz x10, #0, 180f\n"
+ "ldr s9, [x28, #0x0]\n"
+ "ldr s13, [x23, #0x0]\n"
+ "ldr s17, [x22, #0x0]\n"
+ "ldr s21, [x21, #0x0]\n"
+ "ldr s25, [x20, #0x0]\n"
+ "ldr s29, [x19, #0x0]\n"
+ "b 180f\n"
+ "178:" // Height 6: Partial accumulate: partial_2_0
+ "tbz x10, #1, 179f\n"
+ "ldr d8, [x28], #0x8\n"
+ "ldr d12, [x23], #0x8\n"
+ "mov x24, #0x8\n"
+ "ldr d16, [x22], #0x8\n"
+ "ldr d20, [x21], #0x8\n"
+ "ldr d24, [x20], #0x8\n"
+ "ldr d28, [x19], #0x8\n"
+ "tbz x10, #0, 180f\n"
+ "ld1 { v8.s }[2], [x28]\n"
+ "ld1 { v12.s }[2], [x23]\n"
+ "ld1 { v16.s }[2], [x22]\n"
+ "ld1 { v20.s }[2], [x21]\n"
+ "ld1 { v24.s }[2], [x20]\n"
+ "ld1 { v28.s }[2], [x19]\n"
+ "b 180f\n"
+ "179:" // Height 6: Partial accumulate: partial_1_0
+ "ldr s8, [x28, #0x0]\n"
+ "mov x24, #0x0\n"
+ "ldr s12, [x23, #0x0]\n"
+ "ldr s16, [x22, #0x0]\n"
+ "ldr s20, [x21, #0x0]\n"
+ "ldr s24, [x20, #0x0]\n"
+ "ldr s28, [x19, #0x0]\n"
+ "180:" // Height 6: Partial accumulate: Done
+ "sub x28, x28, x24\n"
+ "b 183f\n"
+ "181:" // Height 6: full accumulate
+ "ldr q8, [x28, #0x0]\n"
+ "ldr q9, [x28, #0x10]\n"
+ "ldr q10, [x28, #0x20]\n"
+ "ldr q11, [x28, #0x30]\n"
+ "ldr q12, [x23, #0x0]\n"
+ "ldr q13, [x23, #0x10]\n"
+ "ldr q14, [x23, #0x20]\n"
+ "ldr q15, [x23, #0x30]\n"
+ "ldr q16, [x22, #0x0]\n"
+ "ldr q17, [x22, #0x10]\n"
+ "ldr q18, [x22, #0x20]\n"
+ "ldr q19, [x22, #0x30]\n"
+ "ldr q20, [x21, #0x0]\n"
+ "ldr q21, [x21, #0x10]\n"
+ "ldr q22, [x21, #0x20]\n"
+ "ldr q23, [x21, #0x30]\n"
+ "ldr q24, [x20, #0x0]\n"
+ "ldr q25, [x20, #0x10]\n"
+ "ldr q26, [x20, #0x20]\n"
+ "ldr q27, [x20, #0x30]\n"
+ "ldr q28, [x19, #0x0]\n"
+ "ldr q29, [x19, #0x10]\n"
+ "ldr q30, [x19, #0x20]\n"
+ "ldr q31, [x19, #0x30]\n"
+ "b 183f\n"
+ "182:" // Height 6: no accumulate
"movi v8.4s, #0x0\n"
"movi v9.4s, #0x0\n"
"movi v10.4s, #0x0\n"
@@ -2738,299 +2667,299 @@ void a64_hybrid_s8s32_dot_6x16 (
"movi v29.4s, #0x0\n"
"movi v30.4s, #0x0\n"
"movi v31.4s, #0x0\n"
- "189:" // Height 6: setup done
- "mov x12, #0x0\n"
- "190:" // Height 6: String loop
+ "183:" // Height 6: setup done
+ "mov x27, #0x0\n"
+ "184:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w11, [x20, x12, LSL #0x2]\n"
- "tbz %x[flags], #3, 191f\n"
- "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 185f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x10, [x20, #0x0]\n"
- "ldr x28, [x20, #0x8]\n"
- "ldr x26, [x20, #0x10]\n"
- "ldr x24, [x20, #0x18]\n"
- "ldr x22, [x20, #0x20]\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "ldr x22, [x20, #0x18]\n"
+ "ldr x21, [x20, #0x20]\n"
"ldr x20, [x20, #0x28]\n"
- "cbnz x12, 192f\n"
+ "cbnz x27, 186f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x10, x10, x19\n"
- "add x28, x28, x19\n"
- "add x26, x26, x19\n"
+ "add x25, x25, x19\n"
"add x24, x24, x19\n"
+ "add x23, x23, x19\n"
"add x22, x22, x19\n"
+ "add x21, x21, x19\n"
"add x20, x20, x19\n"
- "b 192f\n"
- "191:" // Height 6: setup direct input
- "mov x10, %x[input_ptr]\n"
- "add x28, x10, x19\n"
- "add x26, x28, x19\n"
- "add x24, x26, x19\n"
- "add x22, x24, x19\n"
- "add x20, x22, x19\n"
- "192:" // Height 6: input setup done
- "cmp x11, #0x10\n"
- "blt 195f\n"
- "cmp x11, #0x20\n"
- "blt 194f\n"
- "193:" // Height 6: Multiply loop: Main loop head
- "ldr q0, [x10, #0x0]\n"
- "ldr q1, [x28, #0x0]\n"
- "ldr q2, [x26, #0x0]\n"
- "ldr q3, [x24, #0x0]\n"
- "ldr q4, [x22, #0x0]\n"
+ "b 186f\n"
+ "185:" // Height 6: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19\n"
+ "add x23, x24, x19\n"
+ "add x22, x23, x19\n"
+ "add x21, x22, x19\n"
+ "add x20, x21, x19\n"
+ "186:" // Height 6: input setup done
+ "cmp x26, #0x10\n"
+ "blt 189f\n"
+ "ldr q0, [x25, #0x0]\n"
+ "ldr q1, [x24, #0x0]\n"
+ "cmp x26, #0x20\n"
+ "ldr q2, [x23, #0x0]\n"
+ "ldr q3, [x22, #0x0]\n"
+ "ldr q4, [x21, #0x0]\n"
"ldr q5, [x20, #0x0]\n"
- "ldr q6, [x14, #0x0]\n"
+ "ldr q6, [x9, #0x0]\n"
+ "blt 188f\n"
+ "187:" // Height 6: Multiply loop: Main loop head
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x14, #0x10]\n"
+ "ldr q7, [x9, #0x10]\n"
+ "add x25, x25, #0x10\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "add x10, x10, #0x10\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x24, x24, #0x10\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "add x28, x28, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "add x23, x23, #0x10\n"
".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "add x26, x26, #0x10\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "add x22, x22, #0x10\n"
".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "add x24, x24, #0x10\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "add x21, x21, #0x10\n"
".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "ldr q6, [x14, #0x20]\n"
+ "ldr q6, [x9, #0x20]\n"
+ "add x20, x20, #0x10\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
- "add x22, x22, #0x10\n"
- "prfm pldl1keep, [x22, #0x80]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
+ "sub x26, x26, #0x10\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
- "add x20, x20, #0x10\n"
- ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
"prfm pldl1keep, [x20, #0x80]\n"
- "sub x11, x11, #0x10\n"
+ "cmp x26, #0x20\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
- "cmp x11, #0x20\n"
".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n"
- "ldr q7, [x14, #0x30]\n"
+ "ldr q7, [x9, #0x30]\n"
".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n"
- "ldr q6, [x14, #0x40]\n"
+ "ldr q6, [x9, #0x40]\n"
".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n"
- "ldr q7, [x14, #0x50]\n"
+ "ldr q7, [x9, #0x50]\n"
".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n"
".inst 0x4fa5e0dc // sdot v28.4s, v6.16b, v5.4b[1]\n"
- "ldr q6, [x14, #0x60]\n"
+ "ldr q6, [x9, #0x60]\n"
".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n"
".inst 0x4fa5e0fd // sdot v29.4s, v7.16b, v5.4b[1]\n"
- "ldr q7, [x14, #0x70]\n"
+ "ldr q7, [x9, #0x70]\n"
".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n"
".inst 0x4fa5e0de // sdot v30.4s, v6.16b, v5.4b[1]\n"
- "ldr q6, [x14, #0x80]\n"
+ "ldr q6, [x9, #0x80]\n"
".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n"
".inst 0x4fa5e0ff // sdot v31.4s, v7.16b, v5.4b[1]\n"
- "ldr q7, [x14, #0x90]\n"
+ "ldr q7, [x9, #0x90]\n"
".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n"
".inst 0x4f85e8dc // sdot v28.4s, v6.16b, v5.4b[2]\n"
- "ldr q6, [x14, #0xa0]\n"
+ "ldr q6, [x9, #0xa0]\n"
".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n"
".inst 0x4f85e8fd // sdot v29.4s, v7.16b, v5.4b[2]\n"
- "ldr q7, [x14, #0xb0]\n"
+ "ldr q7, [x9, #0xb0]\n"
".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n"
".inst 0x4f85e8de // sdot v30.4s, v6.16b, v5.4b[2]\n"
- "ldr q6, [x14, #0xc0]\n"
+ "ldr q6, [x9, #0xc0]\n"
".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n"
".inst 0x4f85e8ff // sdot v31.4s, v7.16b, v5.4b[2]\n"
- "ldr q7, [x14, #0xd0]\n"
+ "ldr q7, [x9, #0xd0]\n"
".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n"
".inst 0x4fa5e8dc // sdot v28.4s, v6.16b, v5.4b[3]\n"
- "ldr q6, [x14, #0xe0]\n"
+ "ldr q6, [x9, #0xe0]\n"
".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n"
".inst 0x4fa5e8fd // sdot v29.4s, v7.16b, v5.4b[3]\n"
- "ldr q7, [x14, #0xf0]\n"
+ "ldr q7, [x9, #0xf0]\n"
+ "add x9, x9, #0x100\n"
".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
- "add x14, x14, #0x100\n"
".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n"
".inst 0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n"
".inst 0x4fa5e8de // sdot v30.4s, v6.16b, v5.4b[3]\n"
+ "ldr q6, [x9, #0x0]\n"
".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ "ldr q0, [x25, #0x0]\n"
".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
+ "ldr q1, [x24, #0x0]\n"
".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
+ "ldr q2, [x23, #0x0]\n"
".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n"
+ "ldr q3, [x22, #0x0]\n"
".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n"
+ "ldr q4, [x21, #0x0]\n"
".inst 0x4fa5e8ff // sdot v31.4s, v7.16b, v5.4b[3]\n"
- "bge 193b\n"
- "194:" // Height 6: Multiply loop: Single iteration only
- "sub x11, x11, #0x10\n"
- "ldr q0, [x10, #0x0]\n"
- "ldr q1, [x28, #0x0]\n"
- "ldr q2, [x26, #0x0]\n"
- "ldr q3, [x24, #0x0]\n"
- "ldr q4, [x22, #0x0]\n"
"ldr q5, [x20, #0x0]\n"
- "ldr q6, [x14, #0x0]\n"
+ "bge 187b\n"
+ "188:" // Height 6: Multiply loop: Single iteration only
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x14, #0x10]\n"
+ "ldr q7, [x9, #0x10]\n"
+ "sub x26, x26, #0x10\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "add x10, x10, #0x10\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "add x25, x25, #0x10\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "add x28, x28, #0x10\n"
- ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "add x26, x26, #0x10\n"
- ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
"add x24, x24, #0x10\n"
- ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n"
+ ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
"prfm pldl1keep, [x24, #0x80]\n"
- "ldr q6, [x14, #0x20]\n"
- ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
"add x22, x22, #0x10\n"
+ ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n"
"prfm pldl1keep, [x22, #0x80]\n"
- ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ "add x21, x21, #0x10\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ "ldr q6, [x9, #0x20]\n"
"add x20, x20, #0x10\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
"prfm pldl1keep, [x20, #0x80]\n"
".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n"
- "ldr q7, [x14, #0x30]\n"
+ "ldr q7, [x9, #0x30]\n"
".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n"
- "ldr q6, [x14, #0x40]\n"
+ "ldr q6, [x9, #0x40]\n"
".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n"
- "ldr q7, [x14, #0x50]\n"
+ "ldr q7, [x9, #0x50]\n"
".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n"
".inst 0x4fa5e0dc // sdot v28.4s, v6.16b, v5.4b[1]\n"
- "ldr q6, [x14, #0x60]\n"
+ "ldr q6, [x9, #0x60]\n"
".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n"
".inst 0x4fa5e0fd // sdot v29.4s, v7.16b, v5.4b[1]\n"
- "ldr q7, [x14, #0x70]\n"
+ "ldr q7, [x9, #0x70]\n"
".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n"
".inst 0x4fa5e0de // sdot v30.4s, v6.16b, v5.4b[1]\n"
- "ldr q6, [x14, #0x80]\n"
+ "ldr q6, [x9, #0x80]\n"
".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n"
".inst 0x4fa5e0ff // sdot v31.4s, v7.16b, v5.4b[1]\n"
- "ldr q7, [x14, #0x90]\n"
+ "ldr q7, [x9, #0x90]\n"
".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n"
".inst 0x4f85e8dc // sdot v28.4s, v6.16b, v5.4b[2]\n"
- "ldr q6, [x14, #0xa0]\n"
+ "ldr q6, [x9, #0xa0]\n"
".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n"
".inst 0x4f85e8fd // sdot v29.4s, v7.16b, v5.4b[2]\n"
- "ldr q7, [x14, #0xb0]\n"
+ "ldr q7, [x9, #0xb0]\n"
".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n"
".inst 0x4f85e8de // sdot v30.4s, v6.16b, v5.4b[2]\n"
- "ldr q6, [x14, #0xc0]\n"
+ "ldr q6, [x9, #0xc0]\n"
".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n"
".inst 0x4f85e8ff // sdot v31.4s, v7.16b, v5.4b[2]\n"
- "ldr q7, [x14, #0xd0]\n"
+ "ldr q7, [x9, #0xd0]\n"
".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n"
".inst 0x4fa5e8dc // sdot v28.4s, v6.16b, v5.4b[3]\n"
- "ldr q6, [x14, #0xe0]\n"
+ "ldr q6, [x9, #0xe0]\n"
".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n"
".inst 0x4fa5e8fd // sdot v29.4s, v7.16b, v5.4b[3]\n"
- "ldr q7, [x14, #0xf0]\n"
+ "ldr q7, [x9, #0xf0]\n"
+ "add x9, x9, #0x100\n"
".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
- "add x14, x14, #0x100\n"
".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n"
@@ -3042,37 +2971,37 @@ void a64_hybrid_s8s32_dot_6x16 (
".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n"
".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n"
".inst 0x4fa5e8ff // sdot v31.4s, v7.16b, v5.4b[3]\n"
- "195:" // Height 6: Multiply loop: Main loop skip
- "cbz x11, 200f\n"
- "cmp x11, #0x4\n"
- "blt 197f\n"
- "196:" // Height 6: Multiply loop: Odd block loop
- "ldr s0, [x10], #0x4\n"
- "ldr s1, [x28], #0x4\n"
- "ldr s2, [x26], #0x4\n"
- "ldr s3, [x24], #0x4\n"
- "ldr s4, [x22], #0x4\n"
+ "189:" // Height 6: Multiply loop: Main loop skip
+ "cbz x26, 194f\n"
+ "cmp x26, #0x4\n"
+ "blt 191f\n"
+ "190:" // Height 6: Multiply loop: Odd block loop
+ "ldr s0, [x25], #0x4\n"
+ "sub x26, x26, #0x4\n"
+ "ldr s1, [x24], #0x4\n"
+ "cmp x26, #0x4\n"
+ "ldr s2, [x23], #0x4\n"
+ "ldr s3, [x22], #0x4\n"
+ "ldr s4, [x21], #0x4\n"
"ldr s5, [x20], #0x4\n"
- "ldr q6, [x14, #0x0]\n"
+ "ldr q6, [x9, #0x0]\n"
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x14, #0x10]\n"
+ "ldr q7, [x9, #0x10]\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
- "sub x11, x11, #0x4\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
- "cmp x11, #0x4\n"
".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n"
- "ldr q6, [x14, #0x20]\n"
+ "ldr q6, [x9, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n"
- "ldr q7, [x14, #0x30]\n"
+ "ldr q7, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- "add x14, x14, #0x40\n"
".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
@@ -3084,50 +3013,50 @@ void a64_hybrid_s8s32_dot_6x16 (
".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n"
- "bge 196b\n"
- "cbz x11, 200f\n"
- "197:" // Height 6: Multiply loop: Skip odd blocks
- "tbz x11, #1, 198f\n"
- "ldr h0, [x10], #0x2\n"
- "ldr h1, [x28], #0x2\n"
- "ldr h2, [x26], #0x2\n"
- "ldr h3, [x24], #0x2\n"
- "ldr h4, [x22], #0x2\n"
+ "bge 190b\n"
+ "cbz x26, 194f\n"
+ "191:" // Height 6: Multiply loop: Skip odd blocks
+ "tbz x26, #1, 192f\n"
+ "ldr h0, [x25], #0x2\n"
+ "ldr h1, [x24], #0x2\n"
+ "ldr h2, [x23], #0x2\n"
+ "ldr h3, [x22], #0x2\n"
+ "ldr h4, [x21], #0x2\n"
"ldr h5, [x20], #0x2\n"
- "tbz x11, #0, 199f\n"
- "ld1 { v0.b }[2], [x10]\n"
- "ld1 { v1.b }[2], [x28]\n"
- "ld1 { v2.b }[2], [x26]\n"
- "ld1 { v3.b }[2], [x24]\n"
- "ld1 { v4.b }[2], [x22]\n"
+ "tbz x26, #0, 193f\n"
+ "ld1 { v0.b }[2], [x25]\n"
+ "ld1 { v1.b }[2], [x24]\n"
+ "ld1 { v2.b }[2], [x23]\n"
+ "ld1 { v3.b }[2], [x22]\n"
+ "ld1 { v4.b }[2], [x21]\n"
"ld1 { v5.b }[2], [x20]\n"
- "b 199f\n"
- "198:" // Height 6: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x10, #0x0]\n"
- "ldr b1, [x28, #0x0]\n"
- "ldr b2, [x26, #0x0]\n"
- "ldr b3, [x24, #0x0]\n"
- "ldr b4, [x22, #0x0]\n"
+ "b 193f\n"
+ "192:" // Height 6: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x25, #0x0]\n"
+ "ldr b1, [x24, #0x0]\n"
+ "ldr b2, [x23, #0x0]\n"
+ "ldr b3, [x22, #0x0]\n"
+ "ldr b4, [x21, #0x0]\n"
"ldr b5, [x20, #0x0]\n"
- "199:" // Height 6: Multiply loop: Ragged operand read: Done
- "ldr q6, [x14, #0x0]\n"
+ "193:" // Height 6: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x9, #0x0]\n"
".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x14, #0x10]\n"
+ "ldr q7, [x9, #0x10]\n"
".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n"
- "ldr q6, [x14, #0x20]\n"
+ "ldr q6, [x9, #0x20]\n"
".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n"
- "ldr q7, [x14, #0x30]\n"
+ "ldr q7, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- "add x14, x14, #0x40\n"
".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
@@ -3139,195 +3068,196 @@ void a64_hybrid_s8s32_dot_6x16 (
".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n"
- "200:" // Height 6: Multiply loop: No odd multiplies
+ "194:" // Height 6: Multiply loop: No odd multiplies
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x12, x12, #0x1\n"
- "cmp x12, x19\n"
- "bne 190b\n"
- "prfm pstl1keep, [x13, #0x0]\n"
- "prfm pstl1keep, [x9, #0x0]\n"
- "cmp x15, #0x10\n"
- "prfm pstl1keep, [x27, #0x0]\n"
- "prfm pstl1keep, [x25, #0x0]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 184b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "prfm pstl1keep, [x28, #0x0]\n"
+ "cmp x10, #0x10\n"
+ "add x23, x28, x19, LSL #2\n"
"prfm pstl1keep, [x23, #0x0]\n"
+ "add x22, x23, x19, LSL #2\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "add x21, x22, x19, LSL #2\n"
"prfm pstl1keep, [x21, #0x0]\n"
- "bge 209f\n"
- "tbz x15, #3, 204f\n"
- "st1 { v8.4s }, [x13], #0x10\n"
- "st1 { v9.4s }, [x13], #0x10\n"
- "st1 { v12.4s }, [x9], #0x10\n"
- "st1 { v13.4s }, [x9], #0x10\n"
- "st1 { v16.4s }, [x27], #0x10\n"
- "st1 { v17.4s }, [x27], #0x10\n"
- "st1 { v20.4s }, [x25], #0x10\n"
- "st1 { v21.4s }, [x25], #0x10\n"
- "st1 { v24.4s }, [x23], #0x10\n"
- "st1 { v25.4s }, [x23], #0x10\n"
- "st1 { v28.4s }, [x21], #0x10\n"
- "st1 { v29.4s }, [x21], #0x10\n"
- "tbz x15, #2, 202f\n"
- "st1 { v10.4s }, [x13], #0x10\n"
- "st1 { v14.4s }, [x9], #0x10\n"
- "st1 { v18.4s }, [x27], #0x10\n"
- "st1 { v22.4s }, [x25], #0x10\n"
- "st1 { v26.4s }, [x23], #0x10\n"
- "st1 { v30.4s }, [x21], #0x10\n"
- "tbz x15, #1, 201f\n"
- "str d11, [x13], #0x8\n"
- "str d15, [x9], #0x8\n"
- "str d19, [x27], #0x8\n"
- "str d23, [x25], #0x8\n"
- "str d27, [x23], #0x8\n"
- "str d31, [x21], #0x8\n"
- "tbz x15, #0, 208f\n"
- "st1 { v11.s }[2], [x13]\n"
- "st1 { v15.s }[2], [x9]\n"
- "st1 { v19.s }[2], [x27]\n"
- "st1 { v23.s }[2], [x25]\n"
- "st1 { v27.s }[2], [x23]\n"
- "st1 { v31.s }[2], [x21]\n"
- "b 208f\n"
- "201:" // Height 6: Partial direct writeback: partial_1_12
- "tbz x15, #0, 208f\n"
- "str s11, [x13, #0x0]\n"
- "str s15, [x9, #0x0]\n"
- "str s19, [x27, #0x0]\n"
- "str s23, [x25, #0x0]\n"
- "str s27, [x23, #0x0]\n"
- "str s31, [x21, #0x0]\n"
- "b 208f\n"
- "202:" // Height 6: Partial direct writeback: partial_2_8
- "tbz x15, #1, 203f\n"
- "str d10, [x13], #0x8\n"
- "str d14, [x9], #0x8\n"
- "str d18, [x27], #0x8\n"
- "str d22, [x25], #0x8\n"
- "str d26, [x23], #0x8\n"
- "str d30, [x21], #0x8\n"
- "tbz x15, #0, 208f\n"
- "st1 { v10.s }[2], [x13]\n"
- "st1 { v14.s }[2], [x9]\n"
- "st1 { v18.s }[2], [x27]\n"
- "st1 { v22.s }[2], [x25]\n"
- "st1 { v26.s }[2], [x23]\n"
- "st1 { v30.s }[2], [x21]\n"
- "b 208f\n"
- "203:" // Height 6: Partial direct writeback: partial_1_8
- "tbz x15, #0, 208f\n"
- "str s10, [x13, #0x0]\n"
- "str s14, [x9, #0x0]\n"
- "str s18, [x27, #0x0]\n"
- "str s22, [x25, #0x0]\n"
- "str s26, [x23, #0x0]\n"
- "str s30, [x21, #0x0]\n"
- "b 208f\n"
- "204:" // Height 6: Partial direct writeback: partial_4_0
- "tbz x15, #2, 206f\n"
- "st1 { v8.4s }, [x13], #0x10\n"
- "st1 { v12.4s }, [x9], #0x10\n"
- "st1 { v16.4s }, [x27], #0x10\n"
- "st1 { v20.4s }, [x25], #0x10\n"
- "st1 { v24.4s }, [x23], #0x10\n"
- "st1 { v28.4s }, [x21], #0x10\n"
- "tbz x15, #1, 205f\n"
- "str d9, [x13], #0x8\n"
- "str d13, [x9], #0x8\n"
- "str d17, [x27], #0x8\n"
- "str d21, [x25], #0x8\n"
- "str d25, [x23], #0x8\n"
- "str d29, [x21], #0x8\n"
- "tbz x15, #0, 208f\n"
- "st1 { v9.s }[2], [x13]\n"
- "st1 { v13.s }[2], [x9]\n"
- "st1 { v17.s }[2], [x27]\n"
- "st1 { v21.s }[2], [x25]\n"
- "st1 { v25.s }[2], [x23]\n"
- "st1 { v29.s }[2], [x21]\n"
- "b 208f\n"
- "205:" // Height 6: Partial direct writeback: partial_1_4
- "tbz x15, #0, 208f\n"
- "str s9, [x13, #0x0]\n"
- "str s13, [x9, #0x0]\n"
- "str s17, [x27, #0x0]\n"
- "str s21, [x25, #0x0]\n"
- "str s25, [x23, #0x0]\n"
- "str s29, [x21, #0x0]\n"
- "b 208f\n"
- "206:" // Height 6: Partial direct writeback: partial_2_0
- "tbz x15, #1, 207f\n"
- "str d8, [x13], #0x8\n"
- "str d12, [x9], #0x8\n"
- "str d16, [x27], #0x8\n"
- "str d20, [x25], #0x8\n"
- "str d24, [x23], #0x8\n"
- "str d28, [x21], #0x8\n"
- "tbz x15, #0, 208f\n"
- "st1 { v8.s }[2], [x13]\n"
- "st1 { v12.s }[2], [x9]\n"
- "st1 { v16.s }[2], [x27]\n"
- "st1 { v20.s }[2], [x25]\n"
- "st1 { v24.s }[2], [x23]\n"
- "st1 { v28.s }[2], [x21]\n"
- "b 208f\n"
- "207:" // Height 6: Partial direct writeback: partial_1_0
- "str s8, [x13, #0x0]\n"
- "str s12, [x9, #0x0]\n"
- "str s16, [x27, #0x0]\n"
- "str s20, [x25, #0x0]\n"
- "str s24, [x23, #0x0]\n"
- "str s28, [x21, #0x0]\n"
- "208:" // Height 6: Partial direct writeback: Done
- "b 210f\n"
- "209:" // Height 6: Full writeback
- "str q8, [x13, #0x0]\n"
- "str q9, [x13, #0x10]\n"
- "str q10, [x13, #0x20]\n"
- "str q11, [x13, #0x30]\n"
- "str q12, [x9, #0x0]\n"
- "str q13, [x9, #0x10]\n"
- "str q14, [x9, #0x20]\n"
- "str q15, [x9, #0x30]\n"
- "str q16, [x27, #0x0]\n"
- "str q17, [x27, #0x10]\n"
- "str q18, [x27, #0x20]\n"
- "str q19, [x27, #0x30]\n"
- "str q20, [x25, #0x0]\n"
- "str q21, [x25, #0x10]\n"
- "str q22, [x25, #0x20]\n"
- "str q23, [x25, #0x30]\n"
- "str q24, [x23, #0x0]\n"
- "str q25, [x23, #0x10]\n"
- "str q26, [x23, #0x20]\n"
- "str q27, [x23, #0x30]\n"
- "str q28, [x21, #0x0]\n"
- "str q29, [x21, #0x10]\n"
- "str q30, [x21, #0x20]\n"
- "str q31, [x21, #0x30]\n"
- "add x13, x13, #0x40\n"
- "add x9, x9, #0x40\n"
- "add x27, x27, #0x40\n"
- "add x25, x25, #0x40\n"
- "add x23, x23, #0x40\n"
- "add x21, x21, #0x40\n"
- "210:" // Height 6: Writeback done
- "subs x15, x15, #0x10\n"
- "bgt 178b\n"
+ "add x20, x21, x19, LSL #2\n"
+ "prfm pstl1keep, [x20, #0x0]\n"
+ "add x19, x20, x19, LSL #2\n"
+ "prfm pstl1keep, [x19, #0x0]\n"
+ "bge 203f\n"
+ "tbz x10, #3, 198f\n"
+ "st1 { v8.4s }, [x28], #0x10\n"
+ "st1 { v9.4s }, [x28], #0x10\n"
+ "st1 { v12.4s }, [x23], #0x10\n"
+ "st1 { v13.4s }, [x23], #0x10\n"
+ "st1 { v16.4s }, [x22], #0x10\n"
+ "st1 { v17.4s }, [x22], #0x10\n"
+ "st1 { v20.4s }, [x21], #0x10\n"
+ "st1 { v21.4s }, [x21], #0x10\n"
+ "st1 { v24.4s }, [x20], #0x10\n"
+ "st1 { v25.4s }, [x20], #0x10\n"
+ "st1 { v28.4s }, [x19], #0x10\n"
+ "st1 { v29.4s }, [x19], #0x10\n"
+ "tbz x10, #2, 196f\n"
+ "st1 { v10.4s }, [x28], #0x10\n"
+ "st1 { v14.4s }, [x23], #0x10\n"
+ "st1 { v18.4s }, [x22], #0x10\n"
+ "st1 { v22.4s }, [x21], #0x10\n"
+ "st1 { v26.4s }, [x20], #0x10\n"
+ "st1 { v30.4s }, [x19], #0x10\n"
+ "tbz x10, #1, 195f\n"
+ "str d11, [x28], #0x8\n"
+ "str d15, [x23], #0x8\n"
+ "str d19, [x22], #0x8\n"
+ "str d23, [x21], #0x8\n"
+ "str d27, [x20], #0x8\n"
+ "str d31, [x19], #0x8\n"
+ "tbz x10, #0, 202f\n"
+ "st1 { v11.s }[2], [x28]\n"
+ "st1 { v15.s }[2], [x23]\n"
+ "st1 { v19.s }[2], [x22]\n"
+ "st1 { v23.s }[2], [x21]\n"
+ "st1 { v27.s }[2], [x20]\n"
+ "st1 { v31.s }[2], [x19]\n"
+ "b 202f\n"
+ "195:" // Height 6: Partial direct writeback: partial_1_12
+ "tbz x10, #0, 202f\n"
+ "str s11, [x28, #0x0]\n"
+ "str s15, [x23, #0x0]\n"
+ "str s19, [x22, #0x0]\n"
+ "str s23, [x21, #0x0]\n"
+ "str s27, [x20, #0x0]\n"
+ "str s31, [x19, #0x0]\n"
+ "b 202f\n"
+ "196:" // Height 6: Partial direct writeback: partial_2_8
+ "tbz x10, #1, 197f\n"
+ "str d10, [x28], #0x8\n"
+ "str d14, [x23], #0x8\n"
+ "str d18, [x22], #0x8\n"
+ "str d22, [x21], #0x8\n"
+ "str d26, [x20], #0x8\n"
+ "str d30, [x19], #0x8\n"
+ "tbz x10, #0, 202f\n"
+ "st1 { v10.s }[2], [x28]\n"
+ "st1 { v14.s }[2], [x23]\n"
+ "st1 { v18.s }[2], [x22]\n"
+ "st1 { v22.s }[2], [x21]\n"
+ "st1 { v26.s }[2], [x20]\n"
+ "st1 { v30.s }[2], [x19]\n"
+ "b 202f\n"
+ "197:" // Height 6: Partial direct writeback: partial_1_8
+ "tbz x10, #0, 202f\n"
+ "str s10, [x28, #0x0]\n"
+ "str s14, [x23, #0x0]\n"
+ "str s18, [x22, #0x0]\n"
+ "str s22, [x21, #0x0]\n"
+ "str s26, [x20, #0x0]\n"
+ "str s30, [x19, #0x0]\n"
+ "b 202f\n"
+ "198:" // Height 6: Partial direct writeback: partial_4_0
+ "tbz x10, #2, 200f\n"
+ "st1 { v8.4s }, [x28], #0x10\n"
+ "st1 { v12.4s }, [x23], #0x10\n"
+ "st1 { v16.4s }, [x22], #0x10\n"
+ "st1 { v20.4s }, [x21], #0x10\n"
+ "st1 { v24.4s }, [x20], #0x10\n"
+ "st1 { v28.4s }, [x19], #0x10\n"
+ "tbz x10, #1, 199f\n"
+ "str d9, [x28], #0x8\n"
+ "str d13, [x23], #0x8\n"
+ "str d17, [x22], #0x8\n"
+ "str d21, [x21], #0x8\n"
+ "str d25, [x20], #0x8\n"
+ "str d29, [x19], #0x8\n"
+ "tbz x10, #0, 202f\n"
+ "st1 { v9.s }[2], [x28]\n"
+ "st1 { v13.s }[2], [x23]\n"
+ "st1 { v17.s }[2], [x22]\n"
+ "st1 { v21.s }[2], [x21]\n"
+ "st1 { v25.s }[2], [x20]\n"
+ "st1 { v29.s }[2], [x19]\n"
+ "b 202f\n"
+ "199:" // Height 6: Partial direct writeback: partial_1_4
+ "tbz x10, #0, 202f\n"
+ "str s9, [x28, #0x0]\n"
+ "str s13, [x23, #0x0]\n"
+ "str s17, [x22, #0x0]\n"
+ "str s21, [x21, #0x0]\n"
+ "str s25, [x20, #0x0]\n"
+ "str s29, [x19, #0x0]\n"
+ "b 202f\n"
+ "200:" // Height 6: Partial direct writeback: partial_2_0
+ "tbz x10, #1, 201f\n"
+ "str d8, [x28], #0x8\n"
+ "str d12, [x23], #0x8\n"
+ "str d16, [x22], #0x8\n"
+ "str d20, [x21], #0x8\n"
+ "str d24, [x20], #0x8\n"
+ "str d28, [x19], #0x8\n"
+ "tbz x10, #0, 202f\n"
+ "st1 { v8.s }[2], [x28]\n"
+ "st1 { v12.s }[2], [x23]\n"
+ "st1 { v16.s }[2], [x22]\n"
+ "st1 { v20.s }[2], [x21]\n"
+ "st1 { v24.s }[2], [x20]\n"
+ "st1 { v28.s }[2], [x19]\n"
+ "b 202f\n"
+ "201:" // Height 6: Partial direct writeback: partial_1_0
+ "str s8, [x28, #0x0]\n"
+ "str s12, [x23, #0x0]\n"
+ "str s16, [x22, #0x0]\n"
+ "str s20, [x21, #0x0]\n"
+ "str s24, [x20, #0x0]\n"
+ "str s28, [x19, #0x0]\n"
+ "202:" // Height 6: Partial direct writeback: Done
+ "b 204f\n"
+ "203:" // Height 6: Full writeback
+ "str q8, [x28, #0x0]\n"
+ "str q9, [x28, #0x10]\n"
+ "str q10, [x28, #0x20]\n"
+ "str q11, [x28, #0x30]\n"
+ "add x28, x28, #0x40\n"
+ "str q12, [x23, #0x0]\n"
+ "str q13, [x23, #0x10]\n"
+ "str q14, [x23, #0x20]\n"
+ "str q15, [x23, #0x30]\n"
+ "str q16, [x22, #0x0]\n"
+ "str q17, [x22, #0x10]\n"
+ "str q18, [x22, #0x20]\n"
+ "str q19, [x22, #0x30]\n"
+ "str q20, [x21, #0x0]\n"
+ "str q21, [x21, #0x10]\n"
+ "str q22, [x21, #0x20]\n"
+ "str q23, [x21, #0x30]\n"
+ "str q24, [x20, #0x0]\n"
+ "str q25, [x20, #0x10]\n"
+ "str q26, [x20, #0x20]\n"
+ "str q27, [x20, #0x30]\n"
+ "str q28, [x19, #0x0]\n"
+ "str q29, [x19, #0x10]\n"
+ "str q30, [x19, #0x20]\n"
+ "str q31, [x19, #0x30]\n"
+ "204:" // Height 6: Writeback done
+ "subs x10, x10, #0x10\n"
+ "bgt 172b\n"
"subs %x[M], %x[M], #0x6\n"
- "beq 212f\n"
+ "beq 206f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "tbz %x[flags], #3, 211f\n"
+ "tbz %x[flags], #3, 205f\n"
"add x20, x20, #0x6\n"
"str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
"b 1b\n"
- "211:" // Update direct input
+ "205:" // Update direct input
"mov x19, #0x6\n"
"madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
"b 1b\n"
- "212:" // Exit
+ "206:" // Exit
: [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16.hpp
index acf46205a3..5d9d84815a 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16.hpp
@@ -37,9 +37,9 @@
namespace arm_gemm
{
-
// Actual kernel implementations
void a64_hybrid_u8qa_dot_4x16( ARGLIST );
+void a64_hybrid_u8qa_dot_4x16_a55( ARGLIST );
class cls_a64_hybrid_u8qa_dot_4x16
{
@@ -72,7 +72,8 @@ public:
StdTransformsFixed<operand_type, result_type, 4, 16, 4> transforms = {};
- static PerformanceParameters get_performance_parameters(const CPUInfo *ci) {
+ static PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
switch (ci->get_cpu_model()) {
case CPUModel::A55r1:
return { 7.5301 };
@@ -83,9 +84,15 @@ public:
// Default to the generic kernel
kern_type kernel=a64_hybrid_u8qa_dot_4x16;
-
- cls_a64_hybrid_u8qa_dot_4x16(const CPUInfo *)
+ cls_a64_hybrid_u8qa_dot_4x16(const CPUInfo *ci)
{
+ switch(ci->get_cpu_model()) {
+ default:
+ break;
+ case CPUModel::A55r1:
+ kernel=a64_hybrid_u8qa_dot_4x16_a55;
+ break;
+ }
}
};
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/a55.cpp
new file mode 100644
index 0000000000..690085cb3b
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/a55.cpp
@@ -0,0 +1,2148 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+#include <cstdint>
+
+namespace arm_gemm {
+
+void a64_hybrid_u8qa_dot_4x16_a55 (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<uint8_t> A_arg,
+ size_t M, size_t N, const uint8_t *B_ptr, IndirectOutputArg<uint8_t> output_arg,
+ const Requantize32 *qp, const int32_t *col_bias, unsigned int
+)
+{
+ struct KernelArgs {
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const uint8_t *B_ptr = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ if (qp->c_offset > qp->minval) {
+ flags |= 0x20;
+ }
+ __asm__ __volatile__(
+
+ "1:" // Row loop
+ "cmp %x[M], #0x4\n"
+ "bge 91f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 61f\n"
+ "beq 31f\n"
+ "movi v11.4s, #0x0\n"
+ "movi v15.16b, #0x1\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[col_bias]\n"
+ "bic %x[flags], %x[flags], #0x80000000\n"
+ "mov x28, %x[output_ptr]\n"
+ "2:" // Height 1: Column loop
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "3:" // Height 1: setup done
+ "mov x27, #0x0\n"
+ "4:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 5f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x25, [x20, #0x0]\n"
+ "cbnz x27, 6f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19\n"
+ "b 6f\n"
+ "5:" // Height 1: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "6:" // Height 1: input setup done
+ "cmp x26, #0x10\n"
+ "blt 11f\n"
+ "ldr q0, [x25, #0x0]\n"
+ "ldr q4, [x10, #0x0]\n"
+ "cmp x26, #0x20\n"
+ "blt 9f\n"
+ "7:" // Height 1: Multiply loop: Main loop head
+ ".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n"
+ "ldr d5, [x10, #0x10]\n"
+ "ldr x24, [x10, #0x18]\n"
+ "add x25, x25, #0x10\n"
+ "ldr d6, [x10, #0x20]\n"
+ "ldr x23, [x10, #0x28]\n"
+ "mov v5.d[1], x24\n"
+ "ldr d7, [x10, #0x30]\n"
+ "ldr x19, [x10, #0x38]\n"
+ "mov v6.d[1], x23\n"
+ ".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n"
+ "ldr d8, [x10, #0x40]\n"
+ "mov v7.d[1], x19\n"
+ ".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n"
+ "ldr x23, [x10, #0x48]\n"
+ ".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n"
+ "ldr d9, [x10, #0x50]\n"
+ "ldr x19, [x10, #0x58]\n"
+ "mov v8.d[1], x23\n"
+ "ldr d10, [x10, #0x60]\n"
+ "ldr x23, [x10, #0x68]\n"
+ "mov v9.d[1], x19\n"
+ ".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n"
+ "ldr d4, [x10, #0x70]\n"
+ "mov v10.d[1], x23\n"
+ ".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n"
+ "ldr x19, [x10, #0x78]\n"
+ ".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n"
+ "ldr d5, [x10, #0x80]\n"
+ "ldr x24, [x10, #0x88]\n"
+ "mov v4.d[1], x19\n"
+ "ldr d6, [x10, #0x90]\n"
+ "ldr x23, [x10, #0x98]\n"
+ "mov v5.d[1], x24\n"
+ ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n"
+ "ldr d7, [x10, #0xa0]\n"
+ "mov v6.d[1], x23\n"
+ ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n"
+ "ldr x19, [x10, #0xa8]\n"
+ ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n"
+ "ldr d8, [x10, #0xb0]\n"
+ "ldr x23, [x10, #0xb8]\n"
+ "mov v7.d[1], x19\n"
+ "ldr d9, [x10, #0xc0]\n"
+ "ldr x19, [x10, #0xc8]\n"
+ "mov v8.d[1], x23\n"
+ ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n"
+ "ldr d10, [x10, #0xd0]\n"
+ "mov v9.d[1], x19\n"
+ ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n"
+ "ldr x23, [x10, #0xd8]\n"
+ ".inst 0x6fa0e930 // udot v16.4s, v9.16b, v0.4b[3]\n"
+ "ldr d4, [x10, #0xe0]\n"
+ "ldr x19, [x10, #0xe8]\n"
+ "mov v10.d[1], x23\n"
+ "ldr d5, [x10, #0xf0]\n"
+ "ldr x24, [x10, #0xf8]\n"
+ "add x10, x10, #0x100\n"
+ "mov v4.d[1], x19\n"
+ ".inst 0x6fa0e951 // udot v17.4s, v10.16b, v0.4b[3]\n"
+ "mov v5.d[1], x24\n"
+ ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n"
+ ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n"
+ "tbnz %x[flags], #31, 8f\n"
+ ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
+ "8:" // Height 1: Multiply loop: unique 1: skip row sum
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "sub x26, x26, #0x10\n"
+ "ldr q0, [x25, #0x0]\n"
+ "cmp x26, #0x20\n"
+ "ldr q4, [x10, #0x0]\n"
+ "bge 7b\n"
+ "9:" // Height 1: Multiply loop: Single iteration only
+ ".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n"
+ "ldr q5, [x10, #0x10]\n"
+ "ldr q6, [x10, #0x20]\n"
+ "sub x26, x26, #0x10\n"
+ "ldr q7, [x10, #0x30]\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n"
+ "ldr q8, [x10, #0x40]\n"
+ ".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n"
+ "ldr q9, [x10, #0x50]\n"
+ ".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n"
+ "ldr q10, [x10, #0x60]\n"
+ ".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n"
+ "ldr q4, [x10, #0x70]\n"
+ "ldr q5, [x10, #0x80]\n"
+ ".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n"
+ "ldr q6, [x10, #0x90]\n"
+ ".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n"
+ "ldr q7, [x10, #0xa0]\n"
+ ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n"
+ "ldr q8, [x10, #0xb0]\n"
+ ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n"
+ "ldr q9, [x10, #0xc0]\n"
+ "ldr q10, [x10, #0xd0]\n"
+ ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n"
+ "ldr q4, [x10, #0xe0]\n"
+ ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n"
+ "ldr q5, [x10, #0xf0]\n"
+ ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n"
+ "add x10, x10, #0x100\n"
+ ".inst 0x6fa0e930 // udot v16.4s, v9.16b, v0.4b[3]\n"
+ ".inst 0x6fa0e951 // udot v17.4s, v10.16b, v0.4b[3]\n"
+ ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n"
+ ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n"
+ "tbnz %x[flags], #31, 10f\n"
+ ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
+ "10:" // Height 1: Multiply loop: unique 2: skip row sum
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "11:" // Height 1: Multiply loop: Main loop skip
+ "cbz x26, 18f\n"
+ "cmp x26, #0x4\n"
+ "blt 14f\n"
+ "12:" // Height 1: Multiply loop: Odd block loop
+ "ldr s0, [x25], #0x4\n"
+ "tbnz %x[flags], #31, 13f\n"
+ ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
+ "13:" // Height 1: Multiply loop: unique 3: skip row sum
+ "ldr q6, [x10, #0x0]\n"
+ "sub x26, x26, #0x4\n"
+ "ldr q7, [x10, #0x10]\n"
+ "cmp x26, #0x4\n"
+ "ldr q8, [x10, #0x20]\n"
+ ".inst 0x6f80e0d0 // udot v16.4s, v6.16b, v0.4b[0]\n"
+ "ldr q9, [x10, #0x30]\n"
+ ".inst 0x6f80e0f1 // udot v17.4s, v7.16b, v0.4b[0]\n"
+ "add x10, x10, #0x40\n"
+ ".inst 0x6f80e112 // udot v18.4s, v8.16b, v0.4b[0]\n"
+ ".inst 0x6f80e133 // udot v19.4s, v9.16b, v0.4b[0]\n"
+ "bge 12b\n"
+ "cbz x26, 18f\n"
+ "14:" // Height 1: Multiply loop: Skip odd blocks
+ "tbz x26, #1, 15f\n"
+ "ldr h0, [x25], #0x2\n"
+ "tbz x26, #0, 16f\n"
+ "ld1 { v0.b }[2], [x25]\n"
+ "b 16f\n"
+ "15:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x25, #0x0]\n"
+ "16:" // Height 1: Multiply loop: Ragged operand read: Done
+ "tbnz %x[flags], #31, 17f\n"
+ ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
+ "17:" // Height 1: Multiply loop: unique 4: skip row sum
+ "ldr q10, [x10, #0x0]\n"
+ "ldr q4, [x10, #0x10]\n"
+ "ldr q5, [x10, #0x20]\n"
+ ".inst 0x6f80e150 // udot v16.4s, v10.16b, v0.4b[0]\n"
+ "ldr q6, [x10, #0x30]\n"
+ ".inst 0x6f80e091 // udot v17.4s, v4.16b, v0.4b[0]\n"
+ "add x10, x10, #0x40\n"
+ ".inst 0x6f80e0b2 // udot v18.4s, v5.16b, v0.4b[0]\n"
+ ".inst 0x6f80e0d3 // udot v19.4s, v6.16b, v0.4b[0]\n"
+ "18:" // Height 1: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 4b\n"
+ "prfm pstl1keep, [x28, #0x0]\n"
+ "tbnz %x[flags], #31, 19f\n"
+ "addp v11.4s, v11.4s, v11.4s\n"
+ "add x22, %x[qp], %[b_offset]\n"
+ "ld1r { v1.4s }, [x22]\n"
+ "neg v1.4s, v1.4s\n"
+ "addp v11.4s, v11.4s, v11.4s\n"
+ "mul v11.4s, v11.4s, v1.4s\n"
+ "19:" // Height 1: skip row sum fixup
+ "add v16.4s, v16.4s, v11.4s\n"
+ "add v17.4s, v17.4s, v11.4s\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "add v19.4s, v19.4s, v11.4s\n"
+ "ldr q0, [x9, #0x0]\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "ldr q1, [x9, #0x10]\n"
+ "add x23, %x[qp], %[per_layer_right_shift]\n"
+ "ldr q2, [x9, #0x20]\n"
+ "add x22, %x[qp], %[per_layer_mul]\n"
+ "add v16.4s, v16.4s, v0.4s\n"
+ "ldr q3, [x9, #0x30]\n"
+ "add v17.4s, v17.4s, v1.4s\n"
+ "ld1r { v0.4s }, [x23]\n"
+ "add v18.4s, v18.4s, v2.4s\n"
+ "ld1r { v4.4s }, [x22]\n"
+ "add v19.4s, v19.4s, v3.4s\n"
+ "add x9, x9, #0x40\n"
+ "sqrdmulh v16.4s, v16.4s, v4.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v4.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v4.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v4.4s\n"
+ "tbz %x[flags], #5, 20f\n"
+ "and v4.16b, v16.16b, v0.16b\n"
+ "and v5.16b, v17.16b, v0.16b\n"
+ "and v6.16b, v18.16b, v0.16b\n"
+ "and v7.16b, v19.16b, v0.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v4.4s\n"
+ "sqadd v17.4s, v17.4s, v5.4s\n"
+ "sqadd v18.4s, v18.4s, v6.4s\n"
+ "sqadd v19.4s, v19.4s, v7.4s\n"
+ "20:" // Height 1: no shift correction
+ "srshl v16.4s, v16.4s, v0.4s\n"
+ "srshl v17.4s, v17.4s, v0.4s\n"
+ "srshl v18.4s, v18.4s, v0.4s\n"
+ "srshl v19.4s, v19.4s, v0.4s\n"
+ "add x22, %x[qp], %[c_offset]\n"
+ "add x23, %x[qp], %[minval]\n"
+ "ld1r { v4.4s }, [x22]\n"
+ "add x22, %x[qp], %[maxval]\n"
+ "ld1r { v5.4s }, [x23]\n"
+ "cmp x11, #0x10\n"
+ "ld1r { v6.4s }, [x22]\n"
+ "add v16.4s, v16.4s, v4.4s\n"
+ "add v17.4s, v17.4s, v4.4s\n"
+ "add v18.4s, v18.4s, v4.4s\n"
+ "add v19.4s, v19.4s, v4.4s\n"
+ "smin v16.4s, v16.4s, v6.4s\n"
+ "smin v17.4s, v17.4s, v6.4s\n"
+ "smin v18.4s, v18.4s, v6.4s\n"
+ "smin v19.4s, v19.4s, v6.4s\n"
+ "smax v16.4s, v16.4s, v5.4s\n"
+ "smax v17.4s, v17.4s, v5.4s\n"
+ "smax v18.4s, v18.4s, v5.4s\n"
+ "smax v19.4s, v19.4s, v5.4s\n"
+ "uzp1 v16.8h, v16.8h, v17.8h\n"
+ "uzp1 v17.8h, v18.8h, v19.8h\n"
+ "uzp1 v16.16b, v16.16b, v17.16b\n"
+ "bge 29f\n"
+ "tbz x11, #3, 24f\n"
+ "str d16, [x28], #0x8\n"
+ "tbz x11, #2, 22f\n"
+ "st1 { v16.s }[2], [x28], #0x4\n"
+ "tbz x11, #1, 21f\n"
+ "st1 { v16.h }[6], [x28], #0x2\n"
+ "tbz x11, #0, 28f\n"
+ "st1 { v16.b }[14], [x28]\n"
+ "b 28f\n"
+ "21:" // Height 1: Partial direct writeback: partial_1_12
+ "tbz x11, #0, 28f\n"
+ "st1 { v16.b }[12], [x28]\n"
+ "b 28f\n"
+ "22:" // Height 1: Partial direct writeback: partial_2_8
+ "tbz x11, #1, 23f\n"
+ "st1 { v16.h }[4], [x28], #0x2\n"
+ "tbz x11, #0, 28f\n"
+ "st1 { v16.b }[10], [x28]\n"
+ "b 28f\n"
+ "23:" // Height 1: Partial direct writeback: partial_1_8
+ "tbz x11, #0, 28f\n"
+ "st1 { v16.b }[8], [x28]\n"
+ "b 28f\n"
+ "24:" // Height 1: Partial direct writeback: partial_4_0
+ "tbz x11, #2, 26f\n"
+ "str s16, [x28], #0x4\n"
+ "tbz x11, #1, 25f\n"
+ "st1 { v16.h }[2], [x28], #0x2\n"
+ "tbz x11, #0, 28f\n"
+ "st1 { v16.b }[6], [x28]\n"
+ "b 28f\n"
+ "25:" // Height 1: Partial direct writeback: partial_1_4
+ "tbz x11, #0, 28f\n"
+ "st1 { v16.b }[4], [x28]\n"
+ "b 28f\n"
+ "26:" // Height 1: Partial direct writeback: partial_2_0
+ "tbz x11, #1, 27f\n"
+ "str h16, [x28], #0x2\n"
+ "tbz x11, #0, 28f\n"
+ "st1 { v16.b }[2], [x28]\n"
+ "b 28f\n"
+ "27:" // Height 1: Partial direct writeback: partial_1_0
+ "str b16, [x28, #0x0]\n"
+ "28:" // Height 1: Partial direct writeback: Done
+ "b 30f\n"
+ "29:" // Height 1: Full writeback
+ "str q16, [x28, #0x0]\n"
+ "add x28, x28, #0x10\n"
+ "30:" // Height 1: Writeback done
+ "subs x11, x11, #0x10\n"
+ "bgt 2b\n"
+ "b 122f\n"
+ "31:" // Height 2: code path that handles two input rows and two output rows per pass
+ "movi v11.4s, #0x0\n" // v11: sum of row 0 input bytes, fed by the udot-against-v15 instructions in the multiply loops
+ "movi v12.4s, #0x0\n" // v12: the same for row 1
+ "movi v15.16b, #0x1\n" // every byte is 1, so a udot against v15 simply adds up the other operand's bytes
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" // x11: output columns still to produce; reduced by 16 at the end of each column pass
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" // x10: read cursor into the B operand
+ "mov x9, %x[col_bias]\n" // x9: read cursor into the column bias
+ "bic %x[flags], %x[flags], #0x80000000\n" // clear bit 31: the tbnz #31 tests below then let the row sums accumulate
+ "mov x28, %x[output_ptr]\n" // x28: write cursor for output row 0
+ "32:" // Height 2: Column loop
+ "movi v16.4s, #0x0\n" // v16-v19: 16 32-bit accumulators for row 0
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n" // v20-v23: 16 32-bit accumulators for row 1
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "33:" // Height 2: setup done
+ "mov x27, #0x0\n" // x27: index of the current string, compared against num_strings at the end of the string loop
+ "34:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n" // x26 = string_lengths[x27] (32-bit entries): input bytes per row still to consume for this string
+ "tbz %x[flags], #3, 35f\n" // flags bit 3 clear: input_ptr addresses the data directly; set: it addresses a table of pointers
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" // x20 = input_ptr[x27], one 8-byte entry per string
+ "add x20, x20, x19, LSL #3\n" // step forward by input_offset 8-byte entries
+ "ldr x25, [x20, #0x0]\n" // x25: row 0 input pointer
+ "ldr x22, [x20, #0x8]\n" // x22: row 1 input pointer
+ "cbnz x27, 36f\n" // the initial column offset is applied to the first string only
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19\n"
+ "add x22, x22, x19\n"
+ "b 36f\n"
+ "35:" // Height 2: setup direct input
+ "mov x25, %x[input_ptr]\n" // row 0 starts at input_ptr itself
+ "add x22, x25, x19\n" // row 1 = row 0 + input_offset, which acts as a byte stride on this path
+ "36:" // Height 2: input setup done
+ "cmp x26, #0x10\n" // fewer than 16 input bytes left: bypass the 16-bytes-per-step code entirely
+ "blt 41f\n"
+ "ldr q0, [x25, #0x0]\n" // v0: next 16 input bytes of row 0
+ "ldr q1, [x22, #0x0]\n" // v1: next 16 input bytes of row 1
+ "cmp x26, #0x20\n"
+ "ldr q4, [x10, #0x0]\n" // v4: first 16 bytes of B for this step
+ "blt 39f\n" // fewer than 32 bytes left: do the one remaining 16-byte step at 39
+ "37:" // Height 2: Multiply loop: Main loop head
+ ".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n" // each 32-bit lane += dot product of that lane's 4 B bytes with input bytes 0-3 of row 0
+ ".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n" // same B bytes against row 1
+ "ldr d5, [x10, #0x10]\n" // NOTE(review): here each 16-byte B vector is assembled from ldr d (low half) + ldr x + mov v.d[1] (high half), whereas the single-iteration path at 39 uses plain ldr q; presumably a scheduling choice for the target core - confirm
+ "add x25, x25, #0x10\n"
+ "ldr x24, [x10, #0x18]\n"
+ "add x22, x22, #0x10\n"
+ "ldr d6, [x10, #0x20]\n"
+ "ldr x23, [x10, #0x28]\n"
+ "mov v5.d[1], x24\n"
+ "ldr d7, [x10, #0x30]\n"
+ "ldr x19, [x10, #0x38]\n"
+ "mov v6.d[1], x23\n"
+ ".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n"
+ "ldr d8, [x10, #0x40]\n"
+ "mov v7.d[1], x19\n"
+ ".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n"
+ "ldr x23, [x10, #0x48]\n"
+ ".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0f7 // udot v23.4s, v7.16b, v1.4b[0]\n"
+ "ldr d9, [x10, #0x50]\n"
+ "mov v8.d[1], x23\n"
+ "ldr x19, [x10, #0x58]\n"
+ "ldr d10, [x10, #0x60]\n"
+ ".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n" // input bytes 4-7
+ ".inst 0x6fa1e114 // udot v20.4s, v8.16b, v1.4b[1]\n"
+ "mov v9.d[1], x19\n"
+ "ldr x23, [x10, #0x68]\n"
+ "ldr d4, [x10, #0x70]\n"
+ ".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e135 // udot v21.4s, v9.16b, v1.4b[1]\n"
+ "mov v10.d[1], x23\n"
+ "ldr x19, [x10, #0x78]\n"
+ "ldr d5, [x10, #0x80]\n"
+ ".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e156 // udot v22.4s, v10.16b, v1.4b[1]\n"
+ "mov v4.d[1], x19\n"
+ "ldr x24, [x10, #0x88]\n"
+ "ldr d6, [x10, #0x90]\n"
+ ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n"
+ "mov v5.d[1], x24\n"
+ "ldr x23, [x10, #0x98]\n"
+ "ldr d7, [x10, #0xa0]\n"
+ ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n" // input bytes 8-11
+ ".inst 0x6f81e8b4 // udot v20.4s, v5.16b, v1.4b[2]\n"
+ "mov v6.d[1], x23\n"
+ "ldr x19, [x10, #0xa8]\n"
+ "ldr d8, [x10, #0xb0]\n"
+ ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8d5 // udot v21.4s, v6.16b, v1.4b[2]\n"
+ "mov v7.d[1], x19\n"
+ "ldr x23, [x10, #0xb8]\n"
+ "ldr d9, [x10, #0xc0]\n"
+ ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8f6 // udot v22.4s, v7.16b, v1.4b[2]\n"
+ "mov v8.d[1], x23\n"
+ "ldr x19, [x10, #0xc8]\n"
+ "ldr d10, [x10, #0xd0]\n"
+ ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n"
+ ".inst 0x6f81e917 // udot v23.4s, v8.16b, v1.4b[2]\n"
+ "mov v9.d[1], x19\n"
+ "ldr x23, [x10, #0xd8]\n"
+ "ldr d4, [x10, #0xe0]\n"
+ ".inst 0x6fa0e930 // udot v16.4s, v9.16b, v0.4b[3]\n" // input bytes 12-15
+ ".inst 0x6fa1e934 // udot v20.4s, v9.16b, v1.4b[3]\n"
+ "mov v10.d[1], x23\n"
+ "ldr x19, [x10, #0xe8]\n"
+ "ldr d5, [x10, #0xf0]\n"
+ ".inst 0x6fa0e951 // udot v17.4s, v10.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e955 // udot v21.4s, v10.16b, v1.4b[3]\n"
+ "mov v4.d[1], x19\n"
+ "ldr x24, [x10, #0xf8]\n"
+ "add x10, x10, #0x100\n" // one step consumes 256 bytes of B: 16 vectors of 16 bytes
+ ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e896 // udot v22.4s, v4.16b, v1.4b[3]\n"
+ "mov v5.d[1], x24\n"
+ ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8b7 // udot v23.4s, v5.16b, v1.4b[3]\n"
+ "tbnz %x[flags], #31, 38f\n" // bit 31 set: row sums are not accumulated on this column pass
+ ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" // v15 is all ones, so this adds the 16 row 0 input bytes into v11
+ ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
+ "38:" // Height 2: Multiply loop: unique 5: skip row sum
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "sub x26, x26, #0x10\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "cmp x26, #0x20\n"
+ "ldr q0, [x25, #0x0]\n" // operands for the next step are loaded before the branch; if the loop exits, the code at 39 consumes them
+ "ldr q1, [x22, #0x0]\n"
+ "ldr q4, [x10, #0x0]\n"
+ "bge 37b\n" // stay in the loop while at least 32 input bytes remain
+ "39:" // Height 2: Multiply loop: Single iteration only
+ ".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n" // v0, v1 and v4 were loaded before control reached this label
+ ".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n"
+ "ldr q5, [x10, #0x10]\n" // this path loads each B vector with a single 16-byte ldr q
+ "sub x26, x26, #0x10\n"
+ "ldr q6, [x10, #0x20]\n"
+ "add x25, x25, #0x10\n"
+ "ldr q7, [x10, #0x30]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n"
+ ".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0f7 // udot v23.4s, v7.16b, v1.4b[0]\n"
+ "ldr q8, [x10, #0x40]\n"
+ "ldr q9, [x10, #0x50]\n"
+ "ldr q10, [x10, #0x60]\n"
+ ".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e114 // udot v20.4s, v8.16b, v1.4b[1]\n"
+ ".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e135 // udot v21.4s, v9.16b, v1.4b[1]\n"
+ ".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e156 // udot v22.4s, v10.16b, v1.4b[1]\n"
+ "ldr q4, [x10, #0x70]\n"
+ "ldr q5, [x10, #0x80]\n"
+ "ldr q6, [x10, #0x90]\n"
+ ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n"
+ ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8b4 // udot v20.4s, v5.16b, v1.4b[2]\n"
+ ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8d5 // udot v21.4s, v6.16b, v1.4b[2]\n"
+ "ldr q7, [x10, #0xa0]\n"
+ "ldr q8, [x10, #0xb0]\n"
+ "ldr q9, [x10, #0xc0]\n"
+ ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8f6 // udot v22.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n"
+ ".inst 0x6f81e917 // udot v23.4s, v8.16b, v1.4b[2]\n"
+ ".inst 0x6fa0e930 // udot v16.4s, v9.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e934 // udot v20.4s, v9.16b, v1.4b[3]\n"
+ "ldr q10, [x10, #0xd0]\n"
+ "ldr q4, [x10, #0xe0]\n"
+ "ldr q5, [x10, #0xf0]\n"
+ "add x10, x10, #0x100\n" // same 256 bytes of B per step as the main loop
+ ".inst 0x6fa0e951 // udot v17.4s, v10.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e955 // udot v21.4s, v10.16b, v1.4b[3]\n"
+ ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e896 // udot v22.4s, v4.16b, v1.4b[3]\n"
+ ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8b7 // udot v23.4s, v5.16b, v1.4b[3]\n"
+ "tbnz %x[flags], #31, 40f\n" // bit 31 set: row sums are not accumulated on this column pass
+ ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
+ "40:" // Height 2: Multiply loop: unique 6: skip row sum
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "41:" // Height 2: Multiply loop: Main loop skip
+ "cbz x26, 48f\n" // no input bytes left in this string
+ "cmp x26, #0x4\n"
+ "blt 44f\n" // 1 to 3 bytes left: go straight to the ragged read
+ "42:" // Height 2: Multiply loop: Odd block loop
+ "ldr s0, [x25], #0x4\n" // 4 input bytes per row, post-incrementing the pointer; a scalar load zeroes the rest of the vector register
+ "ldr s1, [x22], #0x4\n"
+ "tbnz %x[flags], #31, 43f\n"
+ ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" // upper 12 bytes of v0 are zero, so only the 4 loaded bytes reach the row sum
+ ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
+ "43:" // Height 2: Multiply loop: unique 7: skip row sum
+ "ldr q6, [x10, #0x0]\n"
+ "sub x26, x26, #0x4\n"
+ "ldr q7, [x10, #0x10]\n"
+ "cmp x26, #0x4\n" // flags for the bge at the bottom; no instruction in between writes the condition flags
+ "ldr q8, [x10, #0x20]\n"
+ ".inst 0x6f80e0d0 // udot v16.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0d4 // udot v20.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f80e0f1 // udot v17.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0f5 // udot v21.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f80e112 // udot v18.4s, v8.16b, v0.4b[0]\n"
+ ".inst 0x6f81e116 // udot v22.4s, v8.16b, v1.4b[0]\n"
+ "ldr q9, [x10, #0x30]\n"
+ "add x10, x10, #0x40\n" // 64 bytes of B per 4 input bytes
+ ".inst 0x6f80e133 // udot v19.4s, v9.16b, v0.4b[0]\n"
+ ".inst 0x6f81e137 // udot v23.4s, v9.16b, v1.4b[0]\n"
+ "bge 42b\n" // another full 4-byte block remains
+ "cbz x26, 48f\n"
+ "44:" // Height 2: Multiply loop: Skip odd blocks
+ "tbz x26, #1, 45f\n" // here x26 is 1, 2 or 3; bit 1 clear means exactly one byte is left
+ "ldr h0, [x25], #0x2\n" // two bytes per row; the scalar load zeroes the rest of the register
+ "ldr h1, [x22], #0x2\n"
+ "tbz x26, #0, 46f\n"
+ "ld1 { v0.b }[2], [x25]\n" // third byte goes into lane 2, other lanes are left as they are
+ "ld1 { v1.b }[2], [x22]\n"
+ "b 46f\n"
+ "45:" // Height 2: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x25, #0x0]\n"
+ "ldr b1, [x22, #0x0]\n"
+ "46:" // Height 2: Multiply loop: Ragged operand read: Done
+ "tbnz %x[flags], #31, 47f\n"
+ ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" // bytes that were not loaded are zero and add nothing
+ ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
+ "47:" // Height 2: Multiply loop: unique 8: skip row sum
+ "ldr q10, [x10, #0x0]\n"
+ "ldr q4, [x10, #0x10]\n"
+ "ldr q5, [x10, #0x20]\n"
+ ".inst 0x6f80e150 // udot v16.4s, v10.16b, v0.4b[0]\n"
+ ".inst 0x6f81e154 // udot v20.4s, v10.16b, v1.4b[0]\n"
+ ".inst 0x6f80e091 // udot v17.4s, v4.16b, v0.4b[0]\n"
+ ".inst 0x6f81e095 // udot v21.4s, v4.16b, v1.4b[0]\n"
+ ".inst 0x6f80e0b2 // udot v18.4s, v5.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0b6 // udot v22.4s, v5.16b, v1.4b[0]\n"
+ "ldr q6, [x10, #0x30]\n"
+ "add x10, x10, #0x40\n" // a whole 64-byte B block is read and skipped even for a 1-3 byte tail; NOTE(review): presumably B is padded to a multiple of 4 along this dimension - confirm against the packing code
+ ".inst 0x6f80e0d3 // udot v19.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0d7 // udot v23.4s, v6.16b, v1.4b[0]\n"
+ "48:" // Height 2: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 34b\n" // keep accumulating until every string has been processed
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "prfm pstl1keep, [x28, #0x0]\n"
+ "add x21, x28, x19\n" // x21: write cursor for output row 1 = row 0 cursor + output_offset; recomputed on every column pass
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "tbnz %x[flags], #31, 49f\n" // bit 31 is set after the first column pass, so v11/v12 are reduced and scaled only once and reused afterwards
+ "addp v11.4s, v11.4s, v11.4s\n" // first of two pairwise adds; after the second, every lane holds the total of the original four lanes
+ "addp v12.4s, v12.4s, v12.4s\n"
+ "add x22, %x[qp], %[b_offset]\n" // x22 is free to reuse as scratch: the row 1 input pointer is reloaded at the top of the string loop
+ "ld1r { v2.4s }, [x22]\n"
+ "neg v2.4s, v2.4s\n"
+ "addp v11.4s, v11.4s, v11.4s\n"
+ "addp v12.4s, v12.4s, v12.4s\n"
+ "mul v11.4s, v11.4s, v2.4s\n" // v11 = -b_offset * (sum of row 0 input bytes), replicated in all lanes
+ "mul v12.4s, v12.4s, v2.4s\n" // v12 = the same for row 1
+ "49:" // Height 2: skip row sum fixup
+ "add v16.4s, v16.4s, v11.4s\n" // add the per-row term to every column of row 0
+ "add v17.4s, v17.4s, v11.4s\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "add v19.4s, v19.4s, v11.4s\n"
+ "add v20.4s, v20.4s, v12.4s\n" // and to every column of row 1
+ "add v21.4s, v21.4s, v12.4s\n"
+ "add v22.4s, v22.4s, v12.4s\n"
+ "add v23.4s, v23.4s, v12.4s\n"
+ "ldr q0, [x9, #0x0]\n" // v0-v3: 16 32-bit column bias values for this column block
+ "orr %x[flags], %x[flags], #0x80000000\n" // set bit 31: later column passes skip row-sum accumulation and the fixup
+ "ldr q1, [x9, #0x10]\n"
+ "add x23, %x[qp], %[per_layer_right_shift]\n"
+ "ldr q2, [x9, #0x20]\n"
+ "add x22, %x[qp], %[per_layer_mul]\n"
+ "add v16.4s, v16.4s, v0.4s\n" // the same bias vector is added to both rows
+ "add v20.4s, v20.4s, v0.4s\n"
+ "add v17.4s, v17.4s, v1.4s\n"
+ "add v21.4s, v21.4s, v1.4s\n"
+ "add v18.4s, v18.4s, v2.4s\n"
+ "add v22.4s, v22.4s, v2.4s\n"
+ "ldr q3, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n" // advance the bias cursor by 16 32-bit values
+ "ld1r { v0.4s }, [x23]\n" // v0: per_layer_right_shift replicated to all lanes (v0 no longer holds bias)
+ "ld1r { v4.4s }, [x22]\n" // v4: per_layer_mul replicated to all lanes
+ "add v19.4s, v19.4s, v3.4s\n"
+ "add v23.4s, v23.4s, v3.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v4.4s\n" // saturating rounding doubling multiply, keeping the high 32 bits of each product
+ "sqrdmulh v17.4s, v17.4s, v4.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v4.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v4.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v4.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v4.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v4.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v4.4s\n"
+ "tbz %x[flags], #5, 50f\n" // flags bit 5 clear: skip the correction below
+ "and v4.16b, v16.16b, v0.16b\n" // the sign bit of (value AND shift) is set only when both are negative
+ "and v5.16b, v17.16b, v0.16b\n"
+ "and v6.16b, v18.16b, v0.16b\n"
+ "and v8.16b, v20.16b, v0.16b\n"
+ "and v9.16b, v21.16b, v0.16b\n"
+ "and v10.16b, v22.16b, v0.16b\n"
+ "and v7.16b, v19.16b, v0.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n" // arithmetic shift by 31 turns that sign bit into -1 or 0
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v4.4s\n" // subtract 1 from those lanes before the rounding shift
+ "sqadd v17.4s, v17.4s, v5.4s\n"
+ "sqadd v18.4s, v18.4s, v6.4s\n"
+ "sqadd v20.4s, v20.4s, v8.4s\n"
+ "sqadd v21.4s, v21.4s, v9.4s\n"
+ "sqadd v22.4s, v22.4s, v10.4s\n"
+ "sqadd v19.4s, v19.4s, v7.4s\n"
+ "and v4.16b, v23.16b, v0.16b\n" // v23 is handled separately, reusing v4 as its temporary
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v4.4s\n"
+ "50:" // Height 2: no shift correction
+ "srshl v16.4s, v16.4s, v0.4s\n" // rounding shift by the signed per-lane amount in v0; NOTE(review): the field is called per_layer_right_shift, so it is presumably stored as a non-positive count - confirm
+ "srshl v17.4s, v17.4s, v0.4s\n"
+ "srshl v18.4s, v18.4s, v0.4s\n"
+ "srshl v19.4s, v19.4s, v0.4s\n"
+ "srshl v20.4s, v20.4s, v0.4s\n"
+ "srshl v21.4s, v21.4s, v0.4s\n"
+ "srshl v22.4s, v22.4s, v0.4s\n"
+ "srshl v23.4s, v23.4s, v0.4s\n"
+ "add x22, %x[qp], %[c_offset]\n"
+ "add x23, %x[qp], %[minval]\n"
+ "ld1r { v4.4s }, [x22]\n" // v4: c_offset
+ "add x22, %x[qp], %[maxval]\n"
+ "ld1r { v5.4s }, [x23]\n" // v5: minval
+ "cmp x11, #0x10\n" // flags for the bge that follows the final uzp1; none of the vector instructions in between write the condition flags
+ "ld1r { v6.4s }, [x22]\n" // v6: maxval
+ "add v16.4s, v16.4s, v4.4s\n"
+ "add v17.4s, v17.4s, v4.4s\n"
+ "add v18.4s, v18.4s, v4.4s\n"
+ "add v19.4s, v19.4s, v4.4s\n"
+ "add v20.4s, v20.4s, v4.4s\n"
+ "add v21.4s, v21.4s, v4.4s\n"
+ "add v22.4s, v22.4s, v4.4s\n"
+ "add v23.4s, v23.4s, v4.4s\n"
+ "smin v16.4s, v16.4s, v6.4s\n" // clamp from above with maxval ...
+ "smin v17.4s, v17.4s, v6.4s\n"
+ "smin v18.4s, v18.4s, v6.4s\n"
+ "smin v19.4s, v19.4s, v6.4s\n"
+ "smin v20.4s, v20.4s, v6.4s\n"
+ "smin v21.4s, v21.4s, v6.4s\n"
+ "smin v22.4s, v22.4s, v6.4s\n"
+ "smin v23.4s, v23.4s, v6.4s\n"
+ "smax v16.4s, v16.4s, v5.4s\n" // ... then from below with minval
+ "smax v17.4s, v17.4s, v5.4s\n"
+ "smax v18.4s, v18.4s, v5.4s\n"
+ "smax v19.4s, v19.4s, v5.4s\n"
+ "smax v20.4s, v20.4s, v5.4s\n"
+ "smax v21.4s, v21.4s, v5.4s\n"
+ "smax v22.4s, v22.4s, v5.4s\n"
+ "smax v23.4s, v23.4s, v5.4s\n"
+ "uzp1 v16.8h, v16.8h, v17.8h\n" // keep the low 16 bits of each 32-bit lane: 8 results per register
+ "uzp1 v17.8h, v18.8h, v19.8h\n"
+ "uzp1 v20.8h, v20.8h, v21.8h\n"
+ "uzp1 v21.8h, v22.8h, v23.8h\n"
+ "uzp1 v16.16b, v16.16b, v17.16b\n" // keep the low byte of each halfword: v16 = 16 output bytes for row 0
+ "uzp1 v20.16b, v20.16b, v21.16b\n" // v20 = 16 output bytes for row 1
+ "bge 59f\n" // uses the flags of the earlier cmp x11, #0x10: at least 16 columns left means a full 16-byte store
+ "tbz x11, #3, 54f\n" // otherwise x11 is below 16 here and its bits 3..0 select 8-, 4-, 2- and 1-byte stores
+ "str d16, [x28], #0x8\n"
+ "str d20, [x21], #0x8\n"
+ "tbz x11, #2, 52f\n"
+ "st1 { v16.s }[2], [x28], #0x4\n"
+ "st1 { v20.s }[2], [x21], #0x4\n"
+ "tbz x11, #1, 51f\n"
+ "st1 { v16.h }[6], [x28], #0x2\n"
+ "st1 { v20.h }[6], [x21], #0x2\n"
+ "tbz x11, #0, 58f\n"
+ "st1 { v16.b }[14], [x28]\n"
+ "st1 { v20.b }[14], [x21]\n"
+ "b 58f\n"
+ "51:" // Height 2: Partial direct writeback: partial_1_12
+ "tbz x11, #0, 58f\n"
+ "st1 { v16.b }[12], [x28]\n"
+ "st1 { v20.b }[12], [x21]\n"
+ "b 58f\n"
+ "52:" // Height 2: Partial direct writeback: partial_2_8
+ "tbz x11, #1, 53f\n"
+ "st1 { v16.h }[4], [x28], #0x2\n"
+ "st1 { v20.h }[4], [x21], #0x2\n"
+ "tbz x11, #0, 58f\n"
+ "st1 { v16.b }[10], [x28]\n"
+ "st1 { v20.b }[10], [x21]\n"
+ "b 58f\n"
+ "53:" // Height 2: Partial direct writeback: partial_1_8
+ "tbz x11, #0, 58f\n"
+ "st1 { v16.b }[8], [x28]\n"
+ "st1 { v20.b }[8], [x21]\n"
+ "b 58f\n"
+ "54:" // Height 2: Partial direct writeback: partial_4_0
+ "tbz x11, #2, 56f\n"
+ "str s16, [x28], #0x4\n"
+ "str s20, [x21], #0x4\n"
+ "tbz x11, #1, 55f\n"
+ "st1 { v16.h }[2], [x28], #0x2\n"
+ "st1 { v20.h }[2], [x21], #0x2\n"
+ "tbz x11, #0, 58f\n"
+ "st1 { v16.b }[6], [x28]\n"
+ "st1 { v20.b }[6], [x21]\n"
+ "b 58f\n"
+ "55:" // Height 2: Partial direct writeback: partial_1_4
+ "tbz x11, #0, 58f\n"
+ "st1 { v16.b }[4], [x28]\n"
+ "st1 { v20.b }[4], [x21]\n"
+ "b 58f\n"
+ "56:" // Height 2: Partial direct writeback: partial_2_0
+ "tbz x11, #1, 57f\n"
+ "str h16, [x28], #0x2\n"
+ "str h20, [x21], #0x2\n"
+ "tbz x11, #0, 58f\n"
+ "st1 { v16.b }[2], [x28]\n"
+ "st1 { v20.b }[2], [x21]\n"
+ "b 58f\n"
+ "57:" // Height 2: Partial direct writeback: partial_1_0
+ "str b16, [x28, #0x0]\n" // reached with bits 3, 2 and 1 of x11 all clear; the loop condition keeps x11 positive, so exactly one byte is left
+ "str b20, [x21, #0x0]\n"
+ "58:" // Height 2: Partial direct writeback: Done
+ "b 60f\n"
+ "59:" // Height 2: Full writeback
+ "str q16, [x28, #0x0]\n"
+ "add x28, x28, #0x10\n"
+ "str q20, [x21, #0x0]\n" // x21 is not advanced: it is rebuilt from x28 + output_offset on the next column pass
+ "60:" // Height 2: Writeback done
+ "subs x11, x11, #0x10\n" // 16 columns finished
+ "bgt 32b\n" // more columns remain: run the column loop again
+ "b 122f\n" // label 122 lies outside this excerpt
+ "61:" // Height 3
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v15.16b, #0x1\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x9, %x[col_bias]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "bic %x[flags], %x[flags], #0x80000000\n"
+ "mov x28, %x[output_ptr]\n"
+ "62:" // Height 3: Column loop
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "63:" // Height 3: setup done
+ "mov x27, #0x0\n"
+ "64:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 65f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x22, [x20, #0x8]\n"
+ "ldr x21, [x20, #0x10]\n"
+ "cbnz x27, 66f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19\n"
+ "add x22, x22, x19\n"
+ "add x21, x21, x19\n"
+ "b 66f\n"
+ "65:" // Height 3: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x22, x25, x19\n"
+ "add x21, x22, x19\n"
+ "66:" // Height 3: input setup done
+ "cmp x26, #0x10\n"
+ "blt 71f\n"
+ "ldr q0, [x25, #0x0]\n"
+ "ldr q1, [x22, #0x0]\n"
+ "cmp x26, #0x20\n"
+ "ldr q2, [x21, #0x0]\n"
+ "ldr q4, [x10, #0x0]\n"
+ "blt 69f\n"
+ "67:" // Height 3: Multiply loop: Main loop head
+ ".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n"
+ ".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n"
+ ".inst 0x6f82e098 // udot v24.4s, v4.16b, v2.4b[0]\n"
+ "ldr d5, [x10, #0x10]\n"
+ "ldr x24, [x10, #0x18]\n"
+ "add x25, x25, #0x10\n"
+ "ldr d6, [x10, #0x20]\n"
+ "add x22, x22, #0x10\n"
+ "ldr x23, [x10, #0x28]\n"
+ "add x21, x21, #0x10\n"
+ "mov v5.d[1], x24\n"
+ "ldr d7, [x10, #0x30]\n"
+ "ldr x19, [x10, #0x38]\n"
+ "mov v6.d[1], x23\n"
+ ".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0b9 // udot v25.4s, v5.16b, v2.4b[0]\n"
+ "mov v7.d[1], x19\n"
+ ".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0da // udot v26.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0f7 // udot v23.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0fb // udot v27.4s, v7.16b, v2.4b[0]\n"
+ "ldr d8, [x10, #0x40]\n"
+ "ldr x23, [x10, #0x48]\n"
+ "ldr d9, [x10, #0x50]\n"
+ "ldr x19, [x10, #0x58]\n"
+ "mov v8.d[1], x23\n"
+ "ldr d10, [x10, #0x60]\n"
+ "ldr x23, [x10, #0x68]\n"
+ "mov v9.d[1], x19\n"
+ ".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e114 // udot v20.4s, v8.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e118 // udot v24.4s, v8.16b, v2.4b[1]\n"
+ "mov v10.d[1], x23\n"
+ ".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e135 // udot v21.4s, v9.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e139 // udot v25.4s, v9.16b, v2.4b[1]\n"
+ ".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e156 // udot v22.4s, v10.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e15a // udot v26.4s, v10.16b, v2.4b[1]\n"
+ "ldr d4, [x10, #0x70]\n"
+ "ldr x19, [x10, #0x78]\n"
+ "ldr d5, [x10, #0x80]\n"
+ "ldr x24, [x10, #0x88]\n"
+ "mov v4.d[1], x19\n"
+ "ldr d6, [x10, #0x90]\n"
+ "ldr x23, [x10, #0x98]\n"
+ "mov v5.d[1], x24\n"
+ ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e09b // udot v27.4s, v4.16b, v2.4b[1]\n"
+ "mov v6.d[1], x23\n"
+ ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8b4 // udot v20.4s, v5.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8b8 // udot v24.4s, v5.16b, v2.4b[2]\n"
+ ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8d5 // udot v21.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8d9 // udot v25.4s, v6.16b, v2.4b[2]\n"
+ "ldr d7, [x10, #0xa0]\n"
+ "ldr x19, [x10, #0xa8]\n"
+ "ldr d8, [x10, #0xb0]\n"
+ "ldr x23, [x10, #0xb8]\n"
+ "mov v7.d[1], x19\n"
+ "ldr d9, [x10, #0xc0]\n"
+ "ldr x19, [x10, #0xc8]\n"
+ "mov v8.d[1], x23\n"
+ ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8f6 // udot v22.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8fa // udot v26.4s, v7.16b, v2.4b[2]\n"
+ "mov v9.d[1], x19\n"
+ ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n"
+ ".inst 0x6f81e917 // udot v23.4s, v8.16b, v1.4b[2]\n"
+ ".inst 0x6f82e91b // udot v27.4s, v8.16b, v2.4b[2]\n"
+ ".inst 0x6fa0e930 // udot v16.4s, v9.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e934 // udot v20.4s, v9.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e938 // udot v24.4s, v9.16b, v2.4b[3]\n"
+ "ldr d10, [x10, #0xd0]\n"
+ "ldr x23, [x10, #0xd8]\n"
+ "ldr d4, [x10, #0xe0]\n"
+ "ldr x19, [x10, #0xe8]\n"
+ "mov v10.d[1], x23\n"
+ "ldr d5, [x10, #0xf0]\n"
+ "ldr x24, [x10, #0xf8]\n"
+ "add x10, x10, #0x100\n"
+ "mov v4.d[1], x19\n"
+ ".inst 0x6fa0e951 // udot v17.4s, v10.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e955 // udot v21.4s, v10.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e959 // udot v25.4s, v10.16b, v2.4b[3]\n"
+ "mov v5.d[1], x24\n"
+ ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e896 // udot v22.4s, v4.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e89a // udot v26.4s, v4.16b, v2.4b[3]\n"
+ ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8b7 // udot v23.4s, v5.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8bb // udot v27.4s, v5.16b, v2.4b[3]\n"
+ "tbnz %x[flags], #31, 68f\n"
+ ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
+ ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
+ "68:" // Height 3: Multiply loop: unique 9: skip row sum
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "sub x26, x26, #0x10\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "cmp x26, #0x20\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
+ "ldr q0, [x25, #0x0]\n"
+ "ldr q1, [x22, #0x0]\n"
+ "ldr q2, [x21, #0x0]\n"
+ "ldr q4, [x10, #0x0]\n"
+ "bge 67b\n"
+ "69:" // Height 3: Multiply loop: Single iteration only
+ ".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n"
+ ".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n"
+ ".inst 0x6f82e098 // udot v24.4s, v4.16b, v2.4b[0]\n"
+ "ldr q5, [x10, #0x10]\n"
+ "ldr q6, [x10, #0x20]\n"
+ "sub x26, x26, #0x10\n"
+ "ldr q7, [x10, #0x30]\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0b9 // udot v25.4s, v5.16b, v2.4b[0]\n"
+ ".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0da // udot v26.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0f7 // udot v23.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0fb // udot v27.4s, v7.16b, v2.4b[0]\n"
+ "ldr q8, [x10, #0x40]\n"
+ "ldr q9, [x10, #0x50]\n"
+ "add x22, x22, #0x10\n"
+ "ldr q10, [x10, #0x60]\n"
+ "add x21, x21, #0x10\n"
+ ".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e114 // udot v20.4s, v8.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e118 // udot v24.4s, v8.16b, v2.4b[1]\n"
+ ".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e135 // udot v21.4s, v9.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e139 // udot v25.4s, v9.16b, v2.4b[1]\n"
+ ".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e156 // udot v22.4s, v10.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e15a // udot v26.4s, v10.16b, v2.4b[1]\n"
+ "ldr q4, [x10, #0x70]\n"
+ "ldr q5, [x10, #0x80]\n"
+ "ldr q6, [x10, #0x90]\n"
+ ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e09b // udot v27.4s, v4.16b, v2.4b[1]\n"
+ ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8b4 // udot v20.4s, v5.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8b8 // udot v24.4s, v5.16b, v2.4b[2]\n"
+ ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8d5 // udot v21.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8d9 // udot v25.4s, v6.16b, v2.4b[2]\n"
+ "ldr q7, [x10, #0xa0]\n"
+ "ldr q8, [x10, #0xb0]\n"
+ "ldr q9, [x10, #0xc0]\n"
+ ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8f6 // udot v22.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8fa // udot v26.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n"
+ ".inst 0x6f81e917 // udot v23.4s, v8.16b, v1.4b[2]\n"
+ ".inst 0x6f82e91b // udot v27.4s, v8.16b, v2.4b[2]\n"
+ ".inst 0x6fa0e930 // udot v16.4s, v9.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e934 // udot v20.4s, v9.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e938 // udot v24.4s, v9.16b, v2.4b[3]\n"
+ "ldr q10, [x10, #0xd0]\n"
+ "ldr q4, [x10, #0xe0]\n"
+ "ldr q5, [x10, #0xf0]\n"
+ "add x10, x10, #0x100\n"
+ ".inst 0x6fa0e951 // udot v17.4s, v10.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e955 // udot v21.4s, v10.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e959 // udot v25.4s, v10.16b, v2.4b[3]\n"
+ ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e896 // udot v22.4s, v4.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e89a // udot v26.4s, v4.16b, v2.4b[3]\n"
+ ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8b7 // udot v23.4s, v5.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8bb // udot v27.4s, v5.16b, v2.4b[3]\n"
+ "tbnz %x[flags], #31, 70f\n"
+ ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
+ ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
+ "70:" // Height 3: Multiply loop: unique 10: skip row sum
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
+ "71:" // Height 3: Multiply loop: Main loop skip
+ "cbz x26, 78f\n"
+ "cmp x26, #0x4\n"
+ "blt 74f\n"
+ "72:" // Height 3: Multiply loop: Odd block loop
+ "ldr s0, [x25], #0x4\n"
+ "ldr s1, [x22], #0x4\n"
+ "ldr s2, [x21], #0x4\n"
+ "tbnz %x[flags], #31, 73f\n"
+ ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
+ ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
+ "73:" // Height 3: Multiply loop: unique 11: skip row sum
+ "ldr q6, [x10, #0x0]\n"
+ "sub x26, x26, #0x4\n"
+ "ldr q7, [x10, #0x10]\n"
+ "cmp x26, #0x4\n"
+ "ldr q8, [x10, #0x20]\n"
+ ".inst 0x6f80e0d0 // udot v16.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0d4 // udot v20.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0d8 // udot v24.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f80e0f1 // udot v17.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0f5 // udot v21.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0f9 // udot v25.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f80e112 // udot v18.4s, v8.16b, v0.4b[0]\n"
+ ".inst 0x6f81e116 // udot v22.4s, v8.16b, v1.4b[0]\n"
+ ".inst 0x6f82e11a // udot v26.4s, v8.16b, v2.4b[0]\n"
+ "ldr q9, [x10, #0x30]\n"
+ "add x10, x10, #0x40\n"
+ ".inst 0x6f80e133 // udot v19.4s, v9.16b, v0.4b[0]\n"
+ ".inst 0x6f81e137 // udot v23.4s, v9.16b, v1.4b[0]\n"
+ ".inst 0x6f82e13b // udot v27.4s, v9.16b, v2.4b[0]\n"
+ "bge 72b\n"
+ "cbz x26, 78f\n"
+ "74:" // Height 3: Multiply loop: Skip odd blocks -- ragged tail, x26 is below 4 on the fall-through path from the loop above
+ "tbz x26, #1, 75f\n" // bit 1 clear: take the single-byte path
+ "ldr h0, [x25], #0x2\n" // 2 bytes per row; the scalar load zeroes the rest of the register
+ "ldr h1, [x22], #0x2\n"
+ "ldr h2, [x21], #0x2\n"
+ "tbz x26, #0, 76f\n" // bit 0 clear: only those 2 bytes remain
+ "ld1 { v0.b }[2], [x25]\n" // third byte inserted into lane 2, other lanes kept
+ "ld1 { v1.b }[2], [x22]\n"
+ "ld1 { v2.b }[2], [x21]\n"
+ "b 76f\n"
+ "75:" // Height 3: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x25, #0x0]\n" // one byte per row, rest of the register zeroed
+ "ldr b1, [x22, #0x0]\n"
+ "ldr b2, [x21, #0x0]\n"
+ "76:" // Height 3: Multiply loop: Ragged operand read: Done
+ "tbnz %x[flags], #31, 77f\n" // flags bit 31 set: skip the row-sum accumulation
+ ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
+ ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
+ "77:" // Height 3: Multiply loop: unique 12: skip row sum
+ "ldr q10, [x10, #0x0]\n" // same four-vector multiply as the odd block loop, using v10/v4/v5/v6
+ "ldr q4, [x10, #0x10]\n"
+ "ldr q5, [x10, #0x20]\n"
+ ".inst 0x6f80e150 // udot v16.4s, v10.16b, v0.4b[0]\n"
+ ".inst 0x6f81e154 // udot v20.4s, v10.16b, v1.4b[0]\n"
+ ".inst 0x6f82e158 // udot v24.4s, v10.16b, v2.4b[0]\n"
+ ".inst 0x6f80e091 // udot v17.4s, v4.16b, v0.4b[0]\n"
+ ".inst 0x6f81e095 // udot v21.4s, v4.16b, v1.4b[0]\n"
+ ".inst 0x6f82e099 // udot v25.4s, v4.16b, v2.4b[0]\n"
+ ".inst 0x6f80e0b2 // udot v18.4s, v5.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0b6 // udot v22.4s, v5.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0ba // udot v26.4s, v5.16b, v2.4b[0]\n"
+ "ldr q6, [x10, #0x30]\n"
+ "add x10, x10, #0x40\n" // x10 still advances by a full 64 bytes for the partial step
+ ".inst 0x6f80e0d3 // udot v19.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0d7 // udot v23.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0db // udot v27.4s, v6.16b, v2.4b[0]\n"
+ "78:" // Height 3: Multiply loop: No odd multiplies -- end of one string; then output row pointers and row-sum finalisation
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" // w19 = num_strings field of the args struct
+ "add x27, x27, #0x1\n" // next string index
+ "cmp x27, x19\n"
+ "bne 64b\n" // more strings: back to label 64
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" // x19 = output_offset, used as the step between output rows
+ "prfm pstl1keep, [x28, #0x0]\n" // prefetch-for-store each output row
+ "add x21, x28, x19\n" // x21 = output row 1
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "add x20, x21, x19\n" // x20 = output row 2
+ "prfm pstl1keep, [x20, #0x0]\n"
+ "tbnz %x[flags], #31, 79f\n" // flags bit 31 set: row-sum terms are already final
+ "addp v11.4s, v11.4s, v11.4s\n" // first pairwise add: lanes become a0+a1, a2+a3, a0+a1, a2+a3
+ "addp v12.4s, v12.4s, v12.4s\n"
+ "addp v13.4s, v13.4s, v13.4s\n"
+ "add x22, %x[qp], %[b_offset]\n" // address of the b_offset field of qp
+ "ld1r { v3.4s }, [x22]\n" // broadcast the 32-bit b_offset to all lanes
+ "neg v3.4s, v3.4s\n" // v3 = -b_offset
+ "addp v11.4s, v11.4s, v11.4s\n" // second pairwise add: every lane now holds the full sum of the original four lanes
+ "addp v12.4s, v12.4s, v12.4s\n"
+ "addp v13.4s, v13.4s, v13.4s\n"
+ "mul v11.4s, v11.4s, v3.4s\n" // per-row correction term = row sum * -b_offset
+ "mul v12.4s, v12.4s, v3.4s\n"
+ "mul v13.4s, v13.4s, v3.4s\n"
+ "79:" // Height 3: skip row sum fixup -- add row terms and the vectors at x9, multiply by per_layer_mul, optional shift correction
+ "add v16.4s, v16.4s, v11.4s\n" // row 0 accumulators get the row 0 term in v11
+ "add v17.4s, v17.4s, v11.4s\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "add v19.4s, v19.4s, v11.4s\n"
+ "add v20.4s, v20.4s, v12.4s\n" // row 1 accumulators get v12
+ "add v21.4s, v21.4s, v12.4s\n"
+ "add v22.4s, v22.4s, v12.4s\n"
+ "add v23.4s, v23.4s, v12.4s\n"
+ "add v24.4s, v24.4s, v13.4s\n" // row 2 accumulators get v13
+ "add v25.4s, v25.4s, v13.4s\n"
+ "add v26.4s, v26.4s, v13.4s\n"
+ "add v27.4s, v27.4s, v13.4s\n"
+ "ldr q0, [x9, #0x0]\n" // first of four 16-byte vectors at x9, added per column below; NOTE(review): presumably the column bias -- confirm where x9 is set for Height 3
+ "orr %x[flags], %x[flags], #0x80000000\n" // set flags bit 31 so the tbnz #31 tests skip row-sum work from here on
+ "ldr q1, [x9, #0x10]\n"
+ "add x23, %x[qp], %[per_layer_right_shift]\n" // address of the per_layer_right_shift field of qp
+ "ldr q2, [x9, #0x20]\n"
+ "add x22, %x[qp], %[per_layer_mul]\n" // address of the per_layer_mul field of qp
+ "add v16.4s, v16.4s, v0.4s\n"
+ "add v20.4s, v20.4s, v0.4s\n"
+ "add v17.4s, v17.4s, v1.4s\n"
+ "add v21.4s, v21.4s, v1.4s\n"
+ "add v18.4s, v18.4s, v2.4s\n"
+ "add v22.4s, v22.4s, v2.4s\n"
+ "add v24.4s, v24.4s, v0.4s\n"
+ "add v25.4s, v25.4s, v1.4s\n"
+ "add v26.4s, v26.4s, v2.4s\n"
+ "ldr q3, [x9, #0x30]\n"
+ "ld1r { v0.4s }, [x23]\n" // v0 = broadcast per_layer_right_shift; its last use as an addend was just above
+ "add x9, x9, #0x40\n" // step x9 to the next column block
+ "ld1r { v4.4s }, [x22]\n" // v4 = broadcast per_layer_mul
+ "add v19.4s, v19.4s, v3.4s\n"
+ "add v23.4s, v23.4s, v3.4s\n"
+ "add v27.4s, v27.4s, v3.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v4.4s\n" // saturating rounding doubling multiply-high by the multiplier
+ "sqrdmulh v17.4s, v17.4s, v4.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v4.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v4.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v4.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v4.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v4.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v4.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v4.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v4.4s\n"
+ "sqrdmulh v26.4s, v26.4s, v4.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v4.4s\n"
+ "tbz %x[flags], #5, 80f\n" // flags bit 5 clear: no shift correction
+ "and v4.16b, v16.16b, v0.16b\n" // sign bit of (value AND shift) is set only when both are negative
+ "and v5.16b, v17.16b, v0.16b\n"
+ "and v6.16b, v18.16b, v0.16b\n"
+ "and v7.16b, v19.16b, v0.16b\n"
+ "and v8.16b, v20.16b, v0.16b\n"
+ "and v9.16b, v21.16b, v0.16b\n"
+ "and v10.16b, v22.16b, v0.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n" // arithmetic shift by 31: -1 where that sign bit was set, else 0
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v4.4s\n" // apply the -1/0 correction before the rounding shift
+ "and v4.16b, v23.16b, v0.16b\n" // v4 is reused for v23 now that v16 has consumed it
+ "sqadd v17.4s, v17.4s, v5.4s\n"
+ "sqadd v18.4s, v18.4s, v6.4s\n"
+ "sqadd v19.4s, v19.4s, v7.4s\n"
+ "sqadd v20.4s, v20.4s, v8.4s\n"
+ "sqadd v21.4s, v21.4s, v9.4s\n"
+ "sqadd v22.4s, v22.4s, v10.4s\n"
+ "and v5.16b, v24.16b, v0.16b\n" // temporaries v5-v8 reused for row 2
+ "and v6.16b, v25.16b, v0.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v7.16b, v26.16b, v0.16b\n"
+ "and v8.16b, v27.16b, v0.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v4.4s\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "sqadd v24.4s, v24.4s, v5.4s\n"
+ "sqadd v25.4s, v25.4s, v6.4s\n"
+ "sqadd v26.4s, v26.4s, v7.4s\n"
+ "sqadd v27.4s, v27.4s, v8.4s\n"
+ "80:" // Height 3: no shift correction -- rounding shift, add c_offset, clamp to [minval, maxval], narrow 32-bit lanes to bytes
+ "srshl v16.4s, v16.4s, v0.4s\n" // rounding shift by the signed per-lane amount in v0 (a negative amount shifts right)
+ "srshl v17.4s, v17.4s, v0.4s\n"
+ "srshl v18.4s, v18.4s, v0.4s\n"
+ "srshl v19.4s, v19.4s, v0.4s\n"
+ "srshl v20.4s, v20.4s, v0.4s\n"
+ "srshl v21.4s, v21.4s, v0.4s\n"
+ "srshl v22.4s, v22.4s, v0.4s\n"
+ "srshl v23.4s, v23.4s, v0.4s\n"
+ "srshl v24.4s, v24.4s, v0.4s\n"
+ "srshl v25.4s, v25.4s, v0.4s\n"
+ "srshl v26.4s, v26.4s, v0.4s\n"
+ "srshl v27.4s, v27.4s, v0.4s\n"
+ "add x22, %x[qp], %[c_offset]\n"
+ "add x23, %x[qp], %[minval]\n"
+ "ld1r { v4.4s }, [x22]\n" // v4 = broadcast c_offset
+ "add x22, %x[qp], %[maxval]\n"
+ "ld1r { v5.4s }, [x23]\n" // v5 = broadcast minval
+ "cmp x11, #0x10\n" // flags consumed by the bge at the end of this block; the vector ops in between leave them alone
+ "ld1r { v6.4s }, [x22]\n" // v6 = broadcast maxval
+ "add v16.4s, v16.4s, v4.4s\n" // add c_offset
+ "add v17.4s, v17.4s, v4.4s\n"
+ "add v18.4s, v18.4s, v4.4s\n"
+ "add v19.4s, v19.4s, v4.4s\n"
+ "add v20.4s, v20.4s, v4.4s\n"
+ "add v21.4s, v21.4s, v4.4s\n"
+ "add v22.4s, v22.4s, v4.4s\n"
+ "add v23.4s, v23.4s, v4.4s\n"
+ "add v24.4s, v24.4s, v4.4s\n"
+ "add v25.4s, v25.4s, v4.4s\n"
+ "smin v16.4s, v16.4s, v6.4s\n" // clamp from above with maxval
+ "smin v17.4s, v17.4s, v6.4s\n"
+ "smin v18.4s, v18.4s, v6.4s\n"
+ "smin v19.4s, v19.4s, v6.4s\n"
+ "smin v20.4s, v20.4s, v6.4s\n"
+ "smin v21.4s, v21.4s, v6.4s\n"
+ "smin v22.4s, v22.4s, v6.4s\n"
+ "smin v23.4s, v23.4s, v6.4s\n"
+ "smin v24.4s, v24.4s, v6.4s\n"
+ "smin v25.4s, v25.4s, v6.4s\n"
+ "smax v16.4s, v16.4s, v5.4s\n" // clamp from below with minval
+ "smax v17.4s, v17.4s, v5.4s\n"
+ "smax v18.4s, v18.4s, v5.4s\n"
+ "smax v19.4s, v19.4s, v5.4s\n"
+ "smax v20.4s, v20.4s, v5.4s\n"
+ "smax v21.4s, v21.4s, v5.4s\n"
+ "smax v22.4s, v22.4s, v5.4s\n"
+ "smax v23.4s, v23.4s, v5.4s\n"
+ "smax v24.4s, v24.4s, v5.4s\n"
+ "smax v25.4s, v25.4s, v5.4s\n"
+ "add v26.4s, v26.4s, v4.4s\n" // v26/v27 get the same offset and clamp, interleaved with the narrowing below
+ "add v27.4s, v27.4s, v4.4s\n"
+ "uzp1 v16.8h, v16.8h, v17.8h\n" // keep the even 16-bit elements, i.e. the low half of each 32-bit lane
+ "uzp1 v17.8h, v18.8h, v19.8h\n"
+ "uzp1 v20.8h, v20.8h, v21.8h\n"
+ "uzp1 v21.8h, v22.8h, v23.8h\n"
+ "uzp1 v24.8h, v24.8h, v25.8h\n"
+ "smin v26.4s, v26.4s, v6.4s\n"
+ "smin v27.4s, v27.4s, v6.4s\n"
+ "uzp1 v16.16b, v16.16b, v17.16b\n" // keep the even bytes: v16 = 16 output bytes of row 0
+ "uzp1 v20.16b, v20.16b, v21.16b\n" // v20 = 16 output bytes of row 1
+ "smax v26.4s, v26.4s, v5.4s\n"
+ "smax v27.4s, v27.4s, v5.4s\n"
+ "uzp1 v25.8h, v26.8h, v27.8h\n"
+ "uzp1 v24.16b, v24.16b, v25.16b\n" // v24 = 16 output bytes of row 2
+ "bge 89f\n" // x11 >= 16 (cmp above): full 16-byte writeback
+ "tbz x11, #3, 84f\n" // partial writeback: bits 3..0 of x11 select 8/4/2/1-byte stores; bit 3 clear means fewer than 8 bytes
+ "str d16, [x28], #0x8\n" // bytes 0-7 of each row, pointers post-incremented
+ "str d20, [x21], #0x8\n"
+ "str d24, [x20], #0x8\n"
+ "tbz x11, #2, 82f\n"
+ "st1 { v16.s }[2], [x28], #0x4\n" // bytes 8-11
+ "st1 { v20.s }[2], [x21], #0x4\n"
+ "st1 { v24.s }[2], [x20], #0x4\n"
+ "tbz x11, #1, 81f\n"
+ "st1 { v16.h }[6], [x28], #0x2\n" // bytes 12-13
+ "st1 { v20.h }[6], [x21], #0x2\n"
+ "st1 { v24.h }[6], [x20], #0x2\n"
+ "tbz x11, #0, 88f\n"
+ "st1 { v16.b }[14], [x28]\n" // byte 14
+ "st1 { v20.b }[14], [x21]\n"
+ "st1 { v24.b }[14], [x20]\n"
+ "b 88f\n"
+ "81:" // Height 3: Partial direct writeback: partial_1_12
+ "tbz x11, #0, 88f\n"
+ "st1 { v16.b }[12], [x28]\n"
+ "st1 { v20.b }[12], [x21]\n"
+ "st1 { v24.b }[12], [x20]\n"
+ "b 88f\n"
+ "82:" // Height 3: Partial direct writeback: partial_2_8
+ "tbz x11, #1, 83f\n"
+ "st1 { v16.h }[4], [x28], #0x2\n"
+ "st1 { v20.h }[4], [x21], #0x2\n"
+ "st1 { v24.h }[4], [x20], #0x2\n"
+ "tbz x11, #0, 88f\n"
+ "st1 { v16.b }[10], [x28]\n"
+ "st1 { v20.b }[10], [x21]\n"
+ "st1 { v24.b }[10], [x20]\n"
+ "b 88f\n"
+ "83:" // Height 3: Partial direct writeback: partial_1_8
+ "tbz x11, #0, 88f\n"
+ "st1 { v16.b }[8], [x28]\n"
+ "st1 { v20.b }[8], [x21]\n"
+ "st1 { v24.b }[8], [x20]\n"
+ "b 88f\n"
+ "84:" // Height 3: Partial direct writeback: partial_4_0
+ "tbz x11, #2, 86f\n"
+ "str s16, [x28], #0x4\n" // bytes 0-3
+ "str s20, [x21], #0x4\n"
+ "str s24, [x20], #0x4\n"
+ "tbz x11, #1, 85f\n"
+ "st1 { v16.h }[2], [x28], #0x2\n"
+ "st1 { v20.h }[2], [x21], #0x2\n"
+ "st1 { v24.h }[2], [x20], #0x2\n"
+ "tbz x11, #0, 88f\n"
+ "st1 { v16.b }[6], [x28]\n"
+ "st1 { v20.b }[6], [x21]\n"
+ "st1 { v24.b }[6], [x20]\n"
+ "b 88f\n"
+ "85:" // Height 3: Partial direct writeback: partial_1_4
+ "tbz x11, #0, 88f\n"
+ "st1 { v16.b }[4], [x28]\n"
+ "st1 { v20.b }[4], [x21]\n"
+ "st1 { v24.b }[4], [x20]\n"
+ "b 88f\n"
+ "86:" // Height 3: Partial direct writeback: partial_2_0
+ "tbz x11, #1, 87f\n"
+ "str h16, [x28], #0x2\n" // bytes 0-1
+ "str h20, [x21], #0x2\n"
+ "str h24, [x20], #0x2\n"
+ "tbz x11, #0, 88f\n"
+ "st1 { v16.b }[2], [x28]\n"
+ "st1 { v20.b }[2], [x21]\n"
+ "st1 { v24.b }[2], [x20]\n"
+ "b 88f\n"
+ "87:" // Height 3: Partial direct writeback: partial_1_0
+ "str b16, [x28, #0x0]\n" // bits 3..1 of x11 are clear on this path; one byte per row is stored without testing bit 0
+ "str b20, [x21, #0x0]\n"
+ "str b24, [x20, #0x0]\n"
+ "88:" // Height 3: Partial direct writeback: Done
+ "b 90f\n"
+ "89:" // Height 3: Full writeback
+ "str q16, [x28, #0x0]\n" // 16 output bytes of row 0
+ "add x28, x28, #0x10\n" // only x28 advances here; x21 and x20 are rebuilt from x28 after the next string loop
+ "str q20, [x21, #0x0]\n"
+ "str q24, [x20, #0x0]\n"
+ "90:" // Height 3: Writeback done
+ "subs x11, x11, #0x10\n" // 16 columns handled
+ "bgt 62b\n" // columns remain: back to label 62
+ "b 122f\n" // Height 3 finished; label 122 is outside this view
+ "91:" // Height 4 -- four rows per pass: row sums in v11-v14, accumulators in v16-v31
+ "movi v11.4s, #0x0\n" // zero the four row-sum accumulators
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.16b, #0x1\n" // every byte is 1, so a udot against v15 sums the bytes of the other operand
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" // x11 = N field of args; counted down by 16 per column block
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" // x10 = B_ptr field of args
+ "mov x9, %x[col_bias]\n" // x9 = col_bias operand
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "bic %x[flags], %x[flags], #0x80000000\n" // clear flags bit 31 so row sums are accumulated until the fixup sets it again
+ "mov x28, %x[output_ptr]\n" // x28 = output row 0 for this pass
+ "mov x19, #0x4\n"
+ "madd %x[output_ptr], x20, x19, %x[output_ptr]\n" // output_ptr += 4 * output_offset
+ "92:" // Height 4: Column loop
+ "movi v16.4s, #0x0\n" // v16-v19: row 0 accumulators
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n" // v20-v23: row 1
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n" // v24-v27: row 2
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n" // v28-v31: row 3
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+ "93:" // Height 4: setup done
+ "mov x27, #0x0\n" // x27 = string index
+ "94:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n" // x26 = string_lengths[x27] (32-bit entries): bytes of K in this string
+ "tbz %x[flags], #3, 95f\n" // flags bit 3 clear: direct input addressing
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n" // indirect: x20 = input_ptr[x27] (8-byte entries)
+ "add x20, x20, x19, LSL #3\n" // step input_offset 8-byte entries into that table
+ "ldr x25, [x20, #0x0]\n" // row pointers for rows 0-3
+ "ldr x22, [x20, #0x8]\n"
+ "ldr x21, [x20, #0x10]\n"
+ "ldr x20, [x20, #0x18]\n"
+ "cbnz x27, 96f\n" // the initial column offset is applied for string 0 only
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19\n"
+ "add x22, x22, x19\n"
+ "add x21, x21, x19\n"
+ "add x20, x20, x19\n"
+ "b 96f\n"
+ "95:" // Height 4: setup direct input
+ "mov x25, %x[input_ptr]\n" // row 0 starts at input_ptr
+ "add x22, x25, x19\n" // following rows are input_offset bytes apart
+ "add x21, x22, x19\n"
+ "add x20, x21, x19\n"
+ "96:" // Height 4: input setup done
+ "cmp x26, #0x10\n"
+ "blt 101f\n" // fewer than 16 bytes: skip the 16-byte main loop
+ "ldr q0, [x25, #0x0]\n" // preload 16 bytes of each row
+ "ldr q1, [x22, #0x0]\n"
+ "cmp x26, #0x20\n" // flags for the blt below; the loads in between do not change them
+ "ldr q2, [x21, #0x0]\n"
+ "ldr q3, [x20, #0x0]\n"
+ "ldr q4, [x10, #0x0]\n" // preload the first 16-byte vector at x10
+ "blt 99f\n" // fewer than 32 bytes: only the single final iteration runs
+ "97:" // Height 4: Multiply loop: Main loop head -- each pass uses the 16 preloaded bytes per row in v0-v3 and 0x100 bytes at x10
+ ".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n"
+ ".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n"
+ ".inst 0x6f82e098 // udot v24.4s, v4.16b, v2.4b[0]\n"
+ ".inst 0x6f83e09c // udot v28.4s, v4.16b, v3.4b[0]\n"
+ "ldr d5, [x10, #0x10]\n" // low 64 bits of the vector at x10+0x10 ...
+ "add x25, x25, #0x10\n" // row pointers advance now; v0-v3 already hold the bytes for this pass
+ "ldr x24, [x10, #0x18]\n" // ... high 64 bits through a general register ...
+ "add x22, x22, #0x10\n"
+ "ldr d6, [x10, #0x20]\n"
+ "add x21, x21, #0x10\n"
+ "ldr x23, [x10, #0x28]\n"
+ "add x20, x20, #0x10\n"
+ "mov v5.d[1], x24\n" // ... merged into the top half. NOTE(review): every 128-bit load from x10 in this loop is split like this, presumably for Cortex-A55 scheduling -- confirm
+ "ldr d7, [x10, #0x30]\n"
+ "ldr x19, [x10, #0x38]\n"
+ "mov v6.d[1], x23\n"
+ ".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0b9 // udot v25.4s, v5.16b, v2.4b[0]\n"
+ "mov v7.d[1], x19\n"
+ ".inst 0x6f83e0bd // udot v29.4s, v5.16b, v3.4b[0]\n"
+ ".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0da // udot v26.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0de // udot v30.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0f7 // udot v23.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0fb // udot v27.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0ff // udot v31.4s, v7.16b, v3.4b[0]\n"
+ "ldr d8, [x10, #0x40]\n" // vectors for input element [1] (bytes 4-7 of each row)
+ "ldr x23, [x10, #0x48]\n"
+ "ldr d9, [x10, #0x50]\n"
+ "ldr x19, [x10, #0x58]\n"
+ "mov v8.d[1], x23\n"
+ "ldr d10, [x10, #0x60]\n"
+ "ldr x23, [x10, #0x68]\n"
+ "mov v9.d[1], x19\n"
+ ".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e114 // udot v20.4s, v8.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e118 // udot v24.4s, v8.16b, v2.4b[1]\n"
+ "mov v10.d[1], x23\n"
+ ".inst 0x6fa3e11c // udot v28.4s, v8.16b, v3.4b[1]\n"
+ ".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e135 // udot v21.4s, v9.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e139 // udot v25.4s, v9.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e13d // udot v29.4s, v9.16b, v3.4b[1]\n"
+ ".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e156 // udot v22.4s, v10.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e15a // udot v26.4s, v10.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e15e // udot v30.4s, v10.16b, v3.4b[1]\n"
+ "ldr d4, [x10, #0x70]\n"
+ "ldr x19, [x10, #0x78]\n"
+ "ldr d5, [x10, #0x80]\n" // from offset 0x80: vectors for input element [2]
+ "ldr x24, [x10, #0x88]\n"
+ "mov v4.d[1], x19\n"
+ "ldr d6, [x10, #0x90]\n"
+ "ldr x23, [x10, #0x98]\n"
+ "mov v5.d[1], x24\n"
+ ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e09b // udot v27.4s, v4.16b, v2.4b[1]\n"
+ "mov v6.d[1], x23\n"
+ ".inst 0x6fa3e09f // udot v31.4s, v4.16b, v3.4b[1]\n"
+ ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8b4 // udot v20.4s, v5.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8b8 // udot v24.4s, v5.16b, v2.4b[2]\n"
+ ".inst 0x6f83e8bc // udot v28.4s, v5.16b, v3.4b[2]\n"
+ ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8d5 // udot v21.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8d9 // udot v25.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x6f83e8dd // udot v29.4s, v6.16b, v3.4b[2]\n"
+ "ldr d7, [x10, #0xa0]\n"
+ "ldr x19, [x10, #0xa8]\n"
+ "ldr d8, [x10, #0xb0]\n"
+ "ldr x23, [x10, #0xb8]\n"
+ "mov v7.d[1], x19\n"
+ "ldr d9, [x10, #0xc0]\n" // from offset 0xc0: vectors for input element [3]
+ "ldr x19, [x10, #0xc8]\n"
+ "mov v8.d[1], x23\n"
+ ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8f6 // udot v22.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8fa // udot v26.4s, v7.16b, v2.4b[2]\n"
+ "mov v9.d[1], x19\n"
+ ".inst 0x6f83e8fe // udot v30.4s, v7.16b, v3.4b[2]\n"
+ ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n"
+ ".inst 0x6f81e917 // udot v23.4s, v8.16b, v1.4b[2]\n"
+ ".inst 0x6f82e91b // udot v27.4s, v8.16b, v2.4b[2]\n"
+ ".inst 0x6f83e91f // udot v31.4s, v8.16b, v3.4b[2]\n"
+ ".inst 0x6fa0e930 // udot v16.4s, v9.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e934 // udot v20.4s, v9.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e938 // udot v24.4s, v9.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e93c // udot v28.4s, v9.16b, v3.4b[3]\n"
+ "ldr d10, [x10, #0xd0]\n"
+ "ldr x23, [x10, #0xd8]\n"
+ "ldr d4, [x10, #0xe0]\n"
+ "ldr x19, [x10, #0xe8]\n"
+ "mov v10.d[1], x23\n"
+ "ldr d5, [x10, #0xf0]\n"
+ "ldr x24, [x10, #0xf8]\n"
+ "add x10, x10, #0x100\n" // all 0x100 bytes for this pass have been loaded
+ "mov v4.d[1], x19\n"
+ ".inst 0x6fa0e951 // udot v17.4s, v10.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e955 // udot v21.4s, v10.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e959 // udot v25.4s, v10.16b, v2.4b[3]\n"
+ "mov v5.d[1], x24\n"
+ ".inst 0x6fa3e95d // udot v29.4s, v10.16b, v3.4b[3]\n"
+ ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e896 // udot v22.4s, v4.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e89a // udot v26.4s, v4.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e89e // udot v30.4s, v4.16b, v3.4b[3]\n"
+ ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8b7 // udot v23.4s, v5.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8bb // udot v27.4s, v5.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e8bf // udot v31.4s, v5.16b, v3.4b[3]\n"
+ "tbnz %x[flags], #31, 98f\n" // flags bit 31 set: skip the row-sum accumulation
+ ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
+ ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
+ ".inst 0x6e8f946e // udot v14.4s, v3.16b, v15.16b\n"
+ "98:" // Height 4: Multiply loop: unique 13: skip row sum
+ "prfm pldl1keep, [x25, #0x80]\n" // prefetch each row 0x80 bytes ahead
+ "sub x26, x26, #0x10\n" // 16 bytes of K consumed
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "cmp x26, #0x20\n" // flags for the bge below; prfm and ldr leave them alone
+ "prfm pldl1keep, [x21, #0x80]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "ldr q0, [x25, #0x0]\n" // preload the next 16 bytes of each row
+ "ldr q1, [x22, #0x0]\n"
+ "ldr q2, [x21, #0x0]\n"
+ "ldr q3, [x20, #0x0]\n"
+ "ldr q4, [x10, #0x0]\n"
+ "bge 97b\n" // at least 32 bytes remain: another main-loop pass
+ "99:" // Height 4: Multiply loop: Single iteration only -- last 16-byte step: same multiplies as the main loop, whole-register loads, no preload of a following step
+ ".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n"
+ ".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n"
+ ".inst 0x6f82e098 // udot v24.4s, v4.16b, v2.4b[0]\n"
+ ".inst 0x6f83e09c // udot v28.4s, v4.16b, v3.4b[0]\n"
+ "ldr q5, [x10, #0x10]\n"
+ "sub x26, x26, #0x10\n" // 16 bytes of K consumed
+ "ldr q6, [x10, #0x20]\n"
+ "add x25, x25, #0x10\n" // row pointers move past the bytes already held in v0-v3
+ "ldr q7, [x10, #0x30]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0b9 // udot v25.4s, v5.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0bd // udot v29.4s, v5.16b, v3.4b[0]\n"
+ ".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0da // udot v26.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0de // udot v30.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0f7 // udot v23.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0fb // udot v27.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0ff // udot v31.4s, v7.16b, v3.4b[0]\n"
+ "ldr q8, [x10, #0x40]\n"
+ "add x21, x21, #0x10\n"
+ "ldr q9, [x10, #0x50]\n"
+ "add x20, x20, #0x10\n"
+ "ldr q10, [x10, #0x60]\n"
+ ".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e114 // udot v20.4s, v8.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e118 // udot v24.4s, v8.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e11c // udot v28.4s, v8.16b, v3.4b[1]\n"
+ ".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e135 // udot v21.4s, v9.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e139 // udot v25.4s, v9.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e13d // udot v29.4s, v9.16b, v3.4b[1]\n"
+ ".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e156 // udot v22.4s, v10.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e15a // udot v26.4s, v10.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e15e // udot v30.4s, v10.16b, v3.4b[1]\n"
+ "ldr q4, [x10, #0x70]\n"
+ "ldr q5, [x10, #0x80]\n"
+ "ldr q6, [x10, #0x90]\n"
+ ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e09b // udot v27.4s, v4.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e09f // udot v31.4s, v4.16b, v3.4b[1]\n"
+ ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8b4 // udot v20.4s, v5.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8b8 // udot v24.4s, v5.16b, v2.4b[2]\n"
+ ".inst 0x6f83e8bc // udot v28.4s, v5.16b, v3.4b[2]\n"
+ ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8d5 // udot v21.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8d9 // udot v25.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x6f83e8dd // udot v29.4s, v6.16b, v3.4b[2]\n"
+ "ldr q7, [x10, #0xa0]\n"
+ "ldr q8, [x10, #0xb0]\n"
+ "ldr q9, [x10, #0xc0]\n"
+ ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8f6 // udot v22.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8fa // udot v26.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x6f83e8fe // udot v30.4s, v7.16b, v3.4b[2]\n"
+ ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n"
+ ".inst 0x6f81e917 // udot v23.4s, v8.16b, v1.4b[2]\n"
+ ".inst 0x6f82e91b // udot v27.4s, v8.16b, v2.4b[2]\n"
+ ".inst 0x6f83e91f // udot v31.4s, v8.16b, v3.4b[2]\n"
+ ".inst 0x6fa0e930 // udot v16.4s, v9.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e934 // udot v20.4s, v9.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e938 // udot v24.4s, v9.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e93c // udot v28.4s, v9.16b, v3.4b[3]\n"
+ "ldr q10, [x10, #0xd0]\n"
+ "ldr q4, [x10, #0xe0]\n"
+ "ldr q5, [x10, #0xf0]\n"
+ "add x10, x10, #0x100\n" // all 0x100 bytes for this step have been loaded
+ ".inst 0x6fa0e951 // udot v17.4s, v10.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e955 // udot v21.4s, v10.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e959 // udot v25.4s, v10.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e95d // udot v29.4s, v10.16b, v3.4b[3]\n"
+ ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e896 // udot v22.4s, v4.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e89a // udot v26.4s, v4.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e89e // udot v30.4s, v4.16b, v3.4b[3]\n"
+ ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8b7 // udot v23.4s, v5.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8bb // udot v27.4s, v5.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e8bf // udot v31.4s, v5.16b, v3.4b[3]\n"
+ "tbnz %x[flags], #31, 100f\n" // flags bit 31 set: skip the row-sum accumulation
+ ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
+ ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
+ ".inst 0x6e8f946e // udot v14.4s, v3.16b, v15.16b\n"
+ "100:" // Height 4: Multiply loop: unique 14: skip row sum
+ "prfm pldl1keep, [x25, #0x80]\n" // prefetch each row 0x80 bytes ahead
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "101:" // Height 4: Multiply loop: Main loop skip
+ "cbz x26, 108f\n" // no bytes left in this string
+ "cmp x26, #0x4\n"
+ "blt 104f\n" // fewer than 4 bytes: straight to the ragged tail
+ "102:" // Height 4: Multiply loop: Odd block loop -- each pass consumes 4 input bytes per row and 0x40 bytes at x10
+ "ldr s0, [x25], #0x4\n" // 4 bytes per row, post-incremented; the scalar load zeroes the rest of the register
+ "ldr s1, [x22], #0x4\n"
+ "ldr s2, [x21], #0x4\n"
+ "ldr s3, [x20], #0x4\n"
+ "tbnz %x[flags], #31, 103f\n" // flags bit 31 set: skip the row-sum accumulation
+ ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
+ ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
+ ".inst 0x6e8f946e // udot v14.4s, v3.16b, v15.16b\n"
+ "103:" // Height 4: Multiply loop: unique 15: skip row sum
+ "ldr q6, [x10, #0x0]\n" // first of four 16-byte vectors read from x10 for this K step
+ "sub x26, x26, #0x4\n" // 4 bytes of K consumed
+ "ldr q7, [x10, #0x10]\n"
+ "cmp x26, #0x4\n" // flags for the bge at the loop bottom; nothing in between changes them
+ "ldr q8, [x10, #0x20]\n"
+ ".inst 0x6f80e0d0 // udot v16.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0d4 // udot v20.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0d8 // udot v24.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0dc // udot v28.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x6f80e0f1 // udot v17.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0f5 // udot v21.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0f9 // udot v25.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0fd // udot v29.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x6f80e112 // udot v18.4s, v8.16b, v0.4b[0]\n"
+ ".inst 0x6f81e116 // udot v22.4s, v8.16b, v1.4b[0]\n"
+ ".inst 0x6f82e11a // udot v26.4s, v8.16b, v2.4b[0]\n"
+ ".inst 0x6f83e11e // udot v30.4s, v8.16b, v3.4b[0]\n"
+ "ldr q9, [x10, #0x30]\n"
+ "add x10, x10, #0x40\n" // step x10 past the 64 bytes just used
+ ".inst 0x6f80e133 // udot v19.4s, v9.16b, v0.4b[0]\n"
+ ".inst 0x6f81e137 // udot v23.4s, v9.16b, v1.4b[0]\n"
+ ".inst 0x6f82e13b // udot v27.4s, v9.16b, v2.4b[0]\n"
+ ".inst 0x6f83e13f // udot v31.4s, v9.16b, v3.4b[0]\n"
+ "bge 102b\n" // at least one more full 4-byte step remains
+ "cbz x26, 108f\n" // nothing left over: skip the ragged tail
+ "104:" // Height 4: Multiply loop: Skip odd blocks -- ragged tail, x26 is 1..3 here (zero and >= 4 were branched away above)
+ "tbz x26, #1, 105f\n" // bit 1 clear: exactly one byte left
+ "ldr h0, [x25], #0x2\n" // 2 bytes per row; the scalar load zeroes the rest of the register
+ "ldr h1, [x22], #0x2\n"
+ "ldr h2, [x21], #0x2\n"
+ "ldr h3, [x20], #0x2\n"
+ "tbz x26, #0, 106f\n" // bit 0 clear: only those 2 bytes remain
+ "ld1 { v0.b }[2], [x25]\n" // third byte inserted into lane 2, other lanes kept
+ "ld1 { v1.b }[2], [x22]\n"
+ "ld1 { v2.b }[2], [x21]\n"
+ "ld1 { v3.b }[2], [x20]\n"
+ "b 106f\n"
+ "105:" // Height 4: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x25, #0x0]\n" // one byte per row, rest of the register zeroed
+ "ldr b1, [x22, #0x0]\n"
+ "ldr b2, [x21, #0x0]\n"
+ "ldr b3, [x20, #0x0]\n"
+ "106:" // Height 4: Multiply loop: Ragged operand read: Done
+ "tbnz %x[flags], #31, 107f\n" // flags bit 31 set: skip the row-sum accumulation
+ ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
+ ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
+ ".inst 0x6e8f946e // udot v14.4s, v3.16b, v15.16b\n"
+ "107:" // Height 4: Multiply loop: unique 16: skip row sum
+ "ldr q10, [x10, #0x0]\n" // same four-vector multiply as the odd block loop, using v10/v4/v5/v6
+ "ldr q4, [x10, #0x10]\n"
+ "ldr q5, [x10, #0x20]\n"
+ ".inst 0x6f80e150 // udot v16.4s, v10.16b, v0.4b[0]\n"
+ ".inst 0x6f81e154 // udot v20.4s, v10.16b, v1.4b[0]\n"
+ ".inst 0x6f82e158 // udot v24.4s, v10.16b, v2.4b[0]\n"
+ ".inst 0x6f83e15c // udot v28.4s, v10.16b, v3.4b[0]\n"
+ ".inst 0x6f80e091 // udot v17.4s, v4.16b, v0.4b[0]\n"
+ ".inst 0x6f81e095 // udot v21.4s, v4.16b, v1.4b[0]\n"
+ ".inst 0x6f82e099 // udot v25.4s, v4.16b, v2.4b[0]\n"
+ ".inst 0x6f83e09d // udot v29.4s, v4.16b, v3.4b[0]\n"
+ ".inst 0x6f80e0b2 // udot v18.4s, v5.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0b6 // udot v22.4s, v5.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0ba // udot v26.4s, v5.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0be // udot v30.4s, v5.16b, v3.4b[0]\n"
+ "ldr q6, [x10, #0x30]\n"
+ "add x10, x10, #0x40\n" // x10 still advances by a full 64 bytes for the partial step
+ ".inst 0x6f80e0d3 // udot v19.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0d7 // udot v23.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0db // udot v27.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0df // udot v31.4s, v6.16b, v3.4b[0]\n"
+ "108:" // Height 4: Multiply loop: No odd multiplies -- end of one string; then output row pointers and row-sum finalisation
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" // w19 = num_strings field of the args struct
+ "add x27, x27, #0x1\n" // next string index
+ "cmp x27, x19\n"
+ "bne 94b\n" // more strings: back to the string loop head
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" // x19 = output_offset, used as the step between output rows
+ "prfm pstl1keep, [x28, #0x0]\n" // prefetch-for-store each output row
+ "add x21, x28, x19\n" // x21 = output row 1
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "add x20, x21, x19\n" // x20 = output row 2
+ "prfm pstl1keep, [x20, #0x0]\n"
+ "add x19, x20, x19\n" // x19 = output row 3 (the offset value itself is no longer needed)
+ "prfm pstl1keep, [x19, #0x0]\n"
+ "tbnz %x[flags], #31, 109f\n" // flags bit 31 set: row-sum terms are already final
+ "addp v11.4s, v11.4s, v11.4s\n" // first pairwise add: lanes become a0+a1, a2+a3, a0+a1, a2+a3
+ "addp v12.4s, v12.4s, v12.4s\n"
+ "addp v13.4s, v13.4s, v13.4s\n"
+ "addp v14.4s, v14.4s, v14.4s\n"
+ "add x22, %x[qp], %[b_offset]\n" // address of the b_offset field of qp
+ "ld1r { v4.4s }, [x22]\n" // broadcast the 32-bit b_offset to all lanes
+ "addp v11.4s, v11.4s, v11.4s\n" // second pairwise add: every lane now holds the full row sum
+ "addp v12.4s, v12.4s, v12.4s\n"
+ "neg v4.4s, v4.4s\n" // v4 = -b_offset
+ "addp v13.4s, v13.4s, v13.4s\n"
+ "addp v14.4s, v14.4s, v14.4s\n"
+ "mul v11.4s, v11.4s, v4.4s\n" // per-row correction term = row sum * -b_offset
+ "mul v12.4s, v12.4s, v4.4s\n"
+ "mul v13.4s, v13.4s, v4.4s\n"
+ "mul v14.4s, v14.4s, v4.4s\n"
+ "109:" // Height 4: skip row sum fixup -- add row terms and column bias, multiply by per_layer_mul, optional shift correction
+ "add v16.4s, v16.4s, v11.4s\n" // row 0 accumulators get the row 0 term in v11
+ "add v17.4s, v17.4s, v11.4s\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "add v19.4s, v19.4s, v11.4s\n"
+ "add v20.4s, v20.4s, v12.4s\n" // row 1 accumulators get v12
+ "add v21.4s, v21.4s, v12.4s\n"
+ "add v22.4s, v22.4s, v12.4s\n"
+ "add v23.4s, v23.4s, v12.4s\n"
+ "add v24.4s, v24.4s, v13.4s\n" // row 2 accumulators get v13
+ "add v25.4s, v25.4s, v13.4s\n"
+ "add v26.4s, v26.4s, v13.4s\n"
+ "add v27.4s, v27.4s, v13.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n" // row 3 accumulators get v14
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "ldr q0, [x9, #0x0]\n" // first of four 16-byte vectors at x9 (set from col_bias in the Height 4 setup), added per column below
+ "orr %x[flags], %x[flags], #0x80000000\n" // set flags bit 31 so the tbnz #31 tests skip row-sum work from here on
+ "ldr q1, [x9, #0x10]\n"
+ "add x23, %x[qp], %[per_layer_right_shift]\n" // address of the per_layer_right_shift field of qp
+ "ldr q2, [x9, #0x20]\n"
+ "add x22, %x[qp], %[per_layer_mul]\n" // address of the per_layer_mul field of qp
+ "add v16.4s, v16.4s, v0.4s\n"
+ "add v20.4s, v20.4s, v0.4s\n"
+ "add v17.4s, v17.4s, v1.4s\n"
+ "add v21.4s, v21.4s, v1.4s\n"
+ "add v18.4s, v18.4s, v2.4s\n"
+ "add v22.4s, v22.4s, v2.4s\n"
+ "add v24.4s, v24.4s, v0.4s\n"
+ "add v25.4s, v25.4s, v1.4s\n"
+ "add v26.4s, v26.4s, v2.4s\n"
+ "add v28.4s, v28.4s, v0.4s\n"
+ "add v29.4s, v29.4s, v1.4s\n"
+ "add v30.4s, v30.4s, v2.4s\n"
+ "ldr q3, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n" // step x9 to the next column block
+ "ld1r { v0.4s }, [x23]\n" // v0 = broadcast per_layer_right_shift; its last use as a bias vector was just above
+ "ld1r { v4.4s }, [x22]\n" // v4 = broadcast per_layer_mul
+ "add v19.4s, v19.4s, v3.4s\n"
+ "add v23.4s, v23.4s, v3.4s\n"
+ "add v27.4s, v27.4s, v3.4s\n"
+ "add v31.4s, v31.4s, v3.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v4.4s\n" // saturating rounding doubling multiply-high by the multiplier
+ "sqrdmulh v17.4s, v17.4s, v4.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v4.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v4.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v4.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v4.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v4.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v4.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v4.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v4.4s\n"
+ "sqrdmulh v26.4s, v26.4s, v4.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v4.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v4.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v4.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v4.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v4.4s\n"
+ "tbz %x[flags], #5, 110f\n" // flags bit 5 clear: no shift correction
+ "and v4.16b, v16.16b, v0.16b\n" // sign bit of (value AND shift) is set only when both are negative
+ "and v5.16b, v17.16b, v0.16b\n"
+ "and v6.16b, v18.16b, v0.16b\n"
+ "and v7.16b, v19.16b, v0.16b\n"
+ "and v8.16b, v20.16b, v0.16b\n"
+ "and v9.16b, v21.16b, v0.16b\n"
+ "and v10.16b, v22.16b, v0.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n" // arithmetic shift by 31: -1 where that sign bit was set, else 0
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v4.4s\n" // apply the -1/0 correction before the rounding shift
+ "and v4.16b, v23.16b, v0.16b\n" // v4 is reused for v23 now that v16 has consumed it
+ "sqadd v17.4s, v17.4s, v5.4s\n"
+ "sqadd v18.4s, v18.4s, v6.4s\n"
+ "sqadd v19.4s, v19.4s, v7.4s\n"
+ "sqadd v20.4s, v20.4s, v8.4s\n"
+ "sqadd v21.4s, v21.4s, v9.4s\n"
+ "sqadd v22.4s, v22.4s, v10.4s\n"
+ "and v5.16b, v24.16b, v0.16b\n" // temporaries v5-v10 reused for v24-v29
+ "and v6.16b, v25.16b, v0.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v7.16b, v26.16b, v0.16b\n"
+ "and v8.16b, v27.16b, v0.16b\n"
+ "and v9.16b, v28.16b, v0.16b\n"
+ "and v10.16b, v29.16b, v0.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v4.4s\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "and v4.16b, v30.16b, v0.16b\n" // v4 reused again for v30 after the v23 sqadd above
+ "sqadd v24.4s, v24.4s, v5.4s\n"
+ "sqadd v25.4s, v25.4s, v6.4s\n"
+ "sqadd v26.4s, v26.4s, v7.4s\n"
+ "and v5.16b, v31.16b, v0.16b\n" // v5 reused for v31 after the v24 sqadd above
+ "sqadd v27.4s, v27.4s, v8.4s\n"
+ "sqadd v28.4s, v28.4s, v9.4s\n"
+ "sqadd v29.4s, v29.4s, v10.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sqadd v30.4s, v30.4s, v4.4s\n"
+ "sqadd v31.4s, v31.4s, v5.4s\n"
+ "110:" // Height 4: no shift correction -- rounding shift, add c_offset, clamp to [minval, maxval], narrow 32-bit lanes to bytes
+ "srshl v16.4s, v16.4s, v0.4s\n" // rounding shift by the signed per-lane amount in v0 (a negative amount shifts right)
+ "srshl v17.4s, v17.4s, v0.4s\n"
+ "srshl v18.4s, v18.4s, v0.4s\n"
+ "srshl v19.4s, v19.4s, v0.4s\n"
+ "srshl v20.4s, v20.4s, v0.4s\n"
+ "srshl v21.4s, v21.4s, v0.4s\n"
+ "srshl v22.4s, v22.4s, v0.4s\n"
+ "srshl v23.4s, v23.4s, v0.4s\n"
+ "srshl v24.4s, v24.4s, v0.4s\n"
+ "srshl v25.4s, v25.4s, v0.4s\n"
+ "srshl v26.4s, v26.4s, v0.4s\n"
+ "srshl v27.4s, v27.4s, v0.4s\n"
+ "srshl v28.4s, v28.4s, v0.4s\n"
+ "srshl v29.4s, v29.4s, v0.4s\n"
+ "srshl v30.4s, v30.4s, v0.4s\n"
+ "srshl v31.4s, v31.4s, v0.4s\n"
+ "add x22, %x[qp], %[c_offset]\n"
+ "add x23, %x[qp], %[minval]\n"
+ "ld1r { v4.4s }, [x22]\n" // v4 = broadcast c_offset
+ "add x22, %x[qp], %[maxval]\n"
+ "ld1r { v5.4s }, [x23]\n" // v5 = broadcast minval
+ "cmp x11, #0x10\n" // flags consumed by the bge at the end of this block; the vector ops in between leave them alone
+ "ld1r { v6.4s }, [x22]\n" // v6 = broadcast maxval
+ "add v16.4s, v16.4s, v4.4s\n" // add c_offset
+ "add v17.4s, v17.4s, v4.4s\n"
+ "add v18.4s, v18.4s, v4.4s\n"
+ "add v19.4s, v19.4s, v4.4s\n"
+ "add v20.4s, v20.4s, v4.4s\n"
+ "add v21.4s, v21.4s, v4.4s\n"
+ "add v22.4s, v22.4s, v4.4s\n"
+ "add v23.4s, v23.4s, v4.4s\n"
+ "add v24.4s, v24.4s, v4.4s\n"
+ "add v25.4s, v25.4s, v4.4s\n"
+ "smin v16.4s, v16.4s, v6.4s\n" // clamp from above with maxval
+ "smin v17.4s, v17.4s, v6.4s\n"
+ "smin v18.4s, v18.4s, v6.4s\n"
+ "smin v19.4s, v19.4s, v6.4s\n"
+ "smin v20.4s, v20.4s, v6.4s\n"
+ "smin v21.4s, v21.4s, v6.4s\n"
+ "smin v22.4s, v22.4s, v6.4s\n"
+ "smin v23.4s, v23.4s, v6.4s\n"
+ "smin v24.4s, v24.4s, v6.4s\n"
+ "smin v25.4s, v25.4s, v6.4s\n"
+ "smax v16.4s, v16.4s, v5.4s\n" // clamp from below with minval
+ "smax v17.4s, v17.4s, v5.4s\n"
+ "smax v18.4s, v18.4s, v5.4s\n"
+ "smax v19.4s, v19.4s, v5.4s\n"
+ "smax v20.4s, v20.4s, v5.4s\n"
+ "smax v21.4s, v21.4s, v5.4s\n"
+ "smax v22.4s, v22.4s, v5.4s\n"
+ "smax v23.4s, v23.4s, v5.4s\n"
+ "smax v24.4s, v24.4s, v5.4s\n"
+ "smax v25.4s, v25.4s, v5.4s\n"
+ "add v26.4s, v26.4s, v4.4s\n" // v26-v31 get the same offset and clamp, interleaved with the narrowing below
+ "add v27.4s, v27.4s, v4.4s\n"
+ "add v28.4s, v28.4s, v4.4s\n"
+ "add v29.4s, v29.4s, v4.4s\n"
+ "add v30.4s, v30.4s, v4.4s\n"
+ "add v31.4s, v31.4s, v4.4s\n"
+ "uzp1 v16.8h, v16.8h, v17.8h\n" // keep the even 16-bit elements, i.e. the low half of each 32-bit lane
+ "uzp1 v17.8h, v18.8h, v19.8h\n"
+ "uzp1 v20.8h, v20.8h, v21.8h\n"
+ "uzp1 v21.8h, v22.8h, v23.8h\n"
+ "smin v26.4s, v26.4s, v6.4s\n"
+ "smin v27.4s, v27.4s, v6.4s\n"
+ "smin v28.4s, v28.4s, v6.4s\n"
+ "smin v29.4s, v29.4s, v6.4s\n"
+ "smin v30.4s, v30.4s, v6.4s\n"
+ "smin v31.4s, v31.4s, v6.4s\n"
+ "uzp1 v24.8h, v24.8h, v25.8h\n"
+ "uzp1 v16.16b, v16.16b, v17.16b\n" // keep the even bytes: v16 = 16 output bytes of row 0
+ "uzp1 v20.16b, v20.16b, v21.16b\n" // v20 = 16 output bytes of row 1
+ "smax v26.4s, v26.4s, v5.4s\n"
+ "smax v27.4s, v27.4s, v5.4s\n"
+ "smax v28.4s, v28.4s, v5.4s\n"
+ "smax v29.4s, v29.4s, v5.4s\n"
+ "smax v30.4s, v30.4s, v5.4s\n"
+ "smax v31.4s, v31.4s, v5.4s\n"
+ "uzp1 v25.8h, v26.8h, v27.8h\n"
+ "uzp1 v28.8h, v28.8h, v29.8h\n"
+ "uzp1 v29.8h, v30.8h, v31.8h\n"
+ "uzp1 v24.16b, v24.16b, v25.16b\n" // v24 = 16 output bytes of row 2
+ "uzp1 v28.16b, v28.16b, v29.16b\n" // v28 = 16 output bytes of row 3
+ "bge 119f\n" // x11 >= 16 (cmp above): branch to label 119, which lies beyond this view
+ "tbz x11, #3, 114f\n"
+ "str d16, [x28], #0x8\n"
+ "str d20, [x21], #0x8\n"
+ "str d24, [x20], #0x8\n"
+ "str d28, [x19], #0x8\n"
+ "tbz x11, #2, 112f\n"
+ "st1 { v16.s }[2], [x28], #0x4\n"
+ "st1 { v20.s }[2], [x21], #0x4\n"
+ "st1 { v24.s }[2], [x20], #0x4\n"
+ "st1 { v28.s }[2], [x19], #0x4\n"
+ "tbz x11, #1, 111f\n"
+ "st1 { v16.h }[6], [x28], #0x2\n"
+ "st1 { v20.h }[6], [x21], #0x2\n"
+ "st1 { v24.h }[6], [x20], #0x2\n"
+ "st1 { v28.h }[6], [x19], #0x2\n"
+ "tbz x11, #0, 118f\n"
+ "st1 { v16.b }[14], [x28]\n"
+ "st1 { v20.b }[14], [x21]\n"
+ "st1 { v24.b }[14], [x20]\n"
+ "st1 { v28.b }[14], [x19]\n"
+ "b 118f\n"
+ "111:" // Height 4: Partial direct writeback: partial_1_12
+ "tbz x11, #0, 118f\n"
+ "st1 { v16.b }[12], [x28]\n"
+ "st1 { v20.b }[12], [x21]\n"
+ "st1 { v24.b }[12], [x20]\n"
+ "st1 { v28.b }[12], [x19]\n"
+ "b 118f\n"
+ "112:" // Height 4: Partial direct writeback: partial_2_8
+ "tbz x11, #1, 113f\n"
+ "st1 { v16.h }[4], [x28], #0x2\n"
+ "st1 { v20.h }[4], [x21], #0x2\n"
+ "st1 { v24.h }[4], [x20], #0x2\n"
+ "st1 { v28.h }[4], [x19], #0x2\n"
+ "tbz x11, #0, 118f\n"
+ "st1 { v16.b }[10], [x28]\n"
+ "st1 { v20.b }[10], [x21]\n"
+ "st1 { v24.b }[10], [x20]\n"
+ "st1 { v28.b }[10], [x19]\n"
+ "b 118f\n"
+ "113:" // Height 4: Partial direct writeback: partial_1_8
+ "tbz x11, #0, 118f\n"
+ "st1 { v16.b }[8], [x28]\n"
+ "st1 { v20.b }[8], [x21]\n"
+ "st1 { v24.b }[8], [x20]\n"
+ "st1 { v28.b }[8], [x19]\n"
+ "b 118f\n"
+ "114:" // Height 4: Partial direct writeback: partial_4_0
+ "tbz x11, #2, 116f\n"
+ "str s16, [x28], #0x4\n"
+ "str s20, [x21], #0x4\n"
+ "str s24, [x20], #0x4\n"
+ "str s28, [x19], #0x4\n"
+ "tbz x11, #1, 115f\n"
+ "st1 { v16.h }[2], [x28], #0x2\n"
+ "st1 { v20.h }[2], [x21], #0x2\n"
+ "st1 { v24.h }[2], [x20], #0x2\n"
+ "st1 { v28.h }[2], [x19], #0x2\n"
+ "tbz x11, #0, 118f\n"
+ "st1 { v16.b }[6], [x28]\n"
+ "st1 { v20.b }[6], [x21]\n"
+ "st1 { v24.b }[6], [x20]\n"
+ "st1 { v28.b }[6], [x19]\n"
+ "b 118f\n"
+ "115:" // Height 4: Partial direct writeback: partial_1_4
+ "tbz x11, #0, 118f\n"
+ "st1 { v16.b }[4], [x28]\n"
+ "st1 { v20.b }[4], [x21]\n"
+ "st1 { v24.b }[4], [x20]\n"
+ "st1 { v28.b }[4], [x19]\n"
+ "b 118f\n"
+ "116:" // Height 4: Partial direct writeback: partial_2_0
+ "tbz x11, #1, 117f\n"
+ "str h16, [x28], #0x2\n"
+ "str h20, [x21], #0x2\n"
+ "str h24, [x20], #0x2\n"
+ "str h28, [x19], #0x2\n"
+ "tbz x11, #0, 118f\n"
+ "st1 { v16.b }[2], [x28]\n"
+ "st1 { v20.b }[2], [x21]\n"
+ "st1 { v24.b }[2], [x20]\n"
+ "st1 { v28.b }[2], [x19]\n"
+ "b 118f\n"
+ "117:" // Height 4: Partial direct writeback: partial_1_0
+ "str b16, [x28, #0x0]\n"
+ "str b20, [x21, #0x0]\n"
+ "str b24, [x20, #0x0]\n"
+ "str b28, [x19, #0x0]\n"
+ "118:" // Height 4: Partial direct writeback: Done
+ "b 120f\n"
+ "119:" // Height 4: Full writeback
+ "str q16, [x28, #0x0]\n"
+ "add x28, x28, #0x10\n"
+ "str q20, [x21, #0x0]\n"
+ "str q24, [x20, #0x0]\n"
+ "str q28, [x19, #0x0]\n"
+ "120:" // Height 4: Writeback done
+ "subs x11, x11, #0x10\n"
+ "bgt 92b\n"
+ "subs %x[M], %x[M], #0x4\n"
+ "beq 122f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 121f\n"
+ "add x20, x20, #0x4\n"
+ "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "121:" // Update direct input
+ "mov x19, #0x4\n"
+ "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "b 1b\n"
+ "122:" // Exit
+
+ : [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/generic.cpp
index 2b80285f57..6e85eec204 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/generic.cpp
@@ -81,216 +81,207 @@ void a64_hybrid_u8qa_dot_4x16 (
"1:" // Row loop
"cmp %x[M], #0x4\n"
- "bge 94f\n"
+ "bge 91f\n"
"cmp %x[M], #0x2\n"
- "bgt 63f\n"
- "beq 32f\n"
+ "bgt 61f\n"
+ "beq 31f\n"
"movi v11.4s, #0x0\n"
- "ldr x12, [%x[args_ptr], %[offsetof_N]]\n"
- "movi v12.4s, #0x0\n"
- "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x10, %x[col_bias]\n"
- "movi v13.4s, #0x0\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "bic %x[flags], %x[flags], #0x80000000\n"
- "movi v14.4s, #0x0\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
"movi v15.16b, #0x1\n"
- "tbz %x[flags], #2, 2f\n"
- "ldr x9, [%x[output_ptr], #0x0]\n"
- "add x9, x9, x19\n"
- "b 3f\n"
- "2:" // Height 1: setup direct output
- "mov x9, %x[output_ptr]\n"
- "3:" // Height 1: Column loop
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x27, %x[col_bias]\n"
+ "bic %x[flags], %x[flags], #0x80000000\n"
+ "mov x26, %x[output_ptr]\n"
+ "2:" // Height 1: Column loop
"movi v16.4s, #0x0\n"
"movi v17.4s, #0x0\n"
"movi v18.4s, #0x0\n"
"movi v19.4s, #0x0\n"
- "4:" // Height 1: setup done
- "mov x28, #0x0\n"
- "5:" // Height 1: String loop
+ "3:" // Height 1: setup done
+ "mov x25, #0x0\n"
+ "4:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w27, [x20, x28, LSL #0x2]\n"
- "tbz %x[flags], #3, 6f\n"
- "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "ldr w24, [x20, x25, LSL #0x2]\n"
+ "tbz %x[flags], #3, 5f\n"
+ "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x26, [x20, #0x0]\n"
- "cbnz x28, 7f\n"
+ "ldr x23, [x20, #0x0]\n"
+ "cbnz x25, 6f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x26, x26, x19\n"
- "b 7f\n"
- "6:" // Height 1: setup direct input
- "mov x26, %x[input_ptr]\n"
- "7:" // Height 1: input setup done
- "cmp x27, #0x10\n"
- "blt 12f\n"
- "cmp x27, #0x20\n"
- "blt 10f\n"
- "8:" // Height 1: Multiply loop: Main loop head
- "ldr q0, [x26, #0x0]\n"
- "ldr q4, [x11, #0x0]\n"
+ "add x23, x23, x19\n"
+ "b 6f\n"
+ "5:" // Height 1: setup direct input
+ "mov x23, %x[input_ptr]\n"
+ "6:" // Height 1: input setup done
+ "cmp x24, #0x10\n"
+ "blt 11f\n"
+ "ldr q0, [x23, #0x0]\n"
+ "ldr q4, [x28, #0x0]\n"
+ "cmp x24, #0x20\n"
+ "blt 9f\n"
+ "7:" // Height 1: Multiply loop: Main loop head
".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n"
- "ldr q5, [x11, #0x10]\n"
- "ldr q6, [x11, #0x20]\n"
+ "ldr q5, [x28, #0x10]\n"
+ "add x23, x23, #0x10\n"
".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n"
- "ldr q7, [x11, #0x30]\n"
+ "ldr q6, [x28, #0x20]\n"
+ "ldr q7, [x28, #0x30]\n"
".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n"
- "ldr q8, [x11, #0x40]\n"
- "ldr q9, [x11, #0x50]\n"
+ "ldr q8, [x28, #0x40]\n"
".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n"
- "ldr q10, [x11, #0x60]\n"
- "ldr q4, [x11, #0x70]\n"
+ "ldr q9, [x28, #0x50]\n"
+ "ldr q10, [x28, #0x60]\n"
".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n"
+ "ldr q4, [x28, #0x70]\n"
+ "ldr q5, [x28, #0x80]\n"
".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n"
- "ldr q5, [x11, #0x80]\n"
- "ldr q6, [x11, #0x90]\n"
".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n"
- "ldr q7, [x11, #0xa0]\n"
+ "ldr q6, [x28, #0x90]\n"
+ "ldr q7, [x28, #0xa0]\n"
".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n"
- "ldr q8, [x11, #0xb0]\n"
- "ldr q9, [x11, #0xc0]\n"
+ "ldr q8, [x28, #0xb0]\n"
".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n"
+ "ldr q9, [x28, #0xc0]\n"
+ "ldr q10, [x28, #0xd0]\n"
".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n"
- "ldr q10, [x11, #0xd0]\n"
- "ldr q4, [x11, #0xe0]\n"
".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n"
- "ldr q5, [x11, #0xf0]\n"
- "add x26, x26, #0x10\n"
+ "ldr q4, [x28, #0xe0]\n"
+ "ldr q5, [x28, #0xf0]\n"
+ ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n"
+ "add x28, x28, #0x100\n"
+ ".inst 0x6fa0e930 // udot v16.4s, v9.16b, v0.4b[3]\n"
+ ".inst 0x6fa0e951 // udot v17.4s, v10.16b, v0.4b[3]\n"
+ ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n"
+ ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n"
+ "tbnz %x[flags], #31, 8f\n"
+ ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
+ "8:" // Height 1: Multiply loop: unique 1: skip row sum
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "sub x24, x24, #0x10\n"
+ "ldr q0, [x23, #0x0]\n"
+ "cmp x24, #0x20\n"
+ "ldr q4, [x28, #0x0]\n"
+ "bge 7b\n"
+ "9:" // Height 1: Multiply loop: Single iteration only
+ ".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n"
+ "ldr q5, [x28, #0x10]\n"
+ "sub x24, x24, #0x10\n"
+ ".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n"
+ "ldr q6, [x28, #0x20]\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x28, #0x30]\n"
+ "ldr q8, [x28, #0x40]\n"
+ ".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n"
+ "ldr q9, [x28, #0x50]\n"
+ ".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n"
+ "ldr q10, [x28, #0x60]\n"
+ "ldr q4, [x28, #0x70]\n"
+ ".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n"
+ "ldr q5, [x28, #0x80]\n"
+ "ldr q6, [x28, #0x90]\n"
+ ".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n"
+ ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n"
+ "ldr q7, [x28, #0xa0]\n"
+ "ldr q8, [x28, #0xb0]\n"
+ ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n"
+ "ldr q9, [x28, #0xc0]\n"
+ ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n"
+ "ldr q10, [x28, #0xd0]\n"
+ "ldr q4, [x28, #0xe0]\n"
+ ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n"
".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n"
+ "ldr q5, [x28, #0xf0]\n"
+ "add x28, x28, #0x100\n"
".inst 0x6fa0e930 // udot v16.4s, v9.16b, v0.4b[3]\n"
- "add x11, x11, #0x100\n"
".inst 0x6fa0e951 // udot v17.4s, v10.16b, v0.4b[3]\n"
".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n"
".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n"
- "tbnz %x[flags], #31, 9f\n"
+ "tbnz %x[flags], #31, 10f\n"
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
- "9:" // Height 1: Multiply loop: unique 1: skip row sum
- "prfm pldl1keep, [x26, #0x80]\n"
- "sub x27, x27, #0x10\n"
- "cmp x27, #0x20\n"
- "bge 8b\n"
- "10:" // Height 1: Multiply loop: Single iteration only
- "sub x27, x27, #0x10\n"
- "ldr q0, [x26, #0x0]\n"
- "ldr q6, [x11, #0x0]\n"
+ "10:" // Height 1: Multiply loop: unique 2: skip row sum
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "11:" // Height 1: Multiply loop: Main loop skip
+ "cbz x24, 18f\n"
+ "cmp x24, #0x4\n"
+ "blt 14f\n"
+ "12:" // Height 1: Multiply loop: Odd block loop
+ "ldr s0, [x23], #0x4\n"
+ "tbnz %x[flags], #31, 13f\n"
+ ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
+ "13:" // Height 1: Multiply loop: unique 3: skip row sum
+ "ldr q6, [x28, #0x0]\n"
".inst 0x6f80e0d0 // udot v16.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x11, #0x10]\n"
- "ldr q8, [x11, #0x20]\n"
+ "ldr q7, [x28, #0x10]\n"
+ "sub x24, x24, #0x4\n"
".inst 0x6f80e0f1 // udot v17.4s, v7.16b, v0.4b[0]\n"
- "ldr q9, [x11, #0x30]\n"
+ "ldr q8, [x28, #0x20]\n"
+ "cmp x24, #0x4\n"
".inst 0x6f80e112 // udot v18.4s, v8.16b, v0.4b[0]\n"
- "ldr q10, [x11, #0x40]\n"
- "ldr q4, [x11, #0x50]\n"
+ "ldr q9, [x28, #0x30]\n"
+ "add x28, x28, #0x40\n"
".inst 0x6f80e133 // udot v19.4s, v9.16b, v0.4b[0]\n"
- "ldr q5, [x11, #0x60]\n"
- "ldr q6, [x11, #0x70]\n"
- ".inst 0x6fa0e150 // udot v16.4s, v10.16b, v0.4b[1]\n"
- ".inst 0x6fa0e091 // udot v17.4s, v4.16b, v0.4b[1]\n"
- "ldr q7, [x11, #0x80]\n"
- "ldr q8, [x11, #0x90]\n"
- ".inst 0x6fa0e0b2 // udot v18.4s, v5.16b, v0.4b[1]\n"
- "ldr q9, [x11, #0xa0]\n"
- ".inst 0x6fa0e0d3 // udot v19.4s, v6.16b, v0.4b[1]\n"
- "ldr q10, [x11, #0xb0]\n"
- "ldr q4, [x11, #0xc0]\n"
- ".inst 0x6f80e8f0 // udot v16.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x6f80e911 // udot v17.4s, v8.16b, v0.4b[2]\n"
- "ldr q5, [x11, #0xd0]\n"
- "ldr q6, [x11, #0xe0]\n"
- ".inst 0x6f80e932 // udot v18.4s, v9.16b, v0.4b[2]\n"
- "ldr q7, [x11, #0xf0]\n"
- "add x26, x26, #0x10\n"
- ".inst 0x6f80e953 // udot v19.4s, v10.16b, v0.4b[2]\n"
- ".inst 0x6fa0e890 // udot v16.4s, v4.16b, v0.4b[3]\n"
- "add x11, x11, #0x100\n"
- ".inst 0x6fa0e8b1 // udot v17.4s, v5.16b, v0.4b[3]\n"
- ".inst 0x6fa0e8d2 // udot v18.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x6fa0e8f3 // udot v19.4s, v7.16b, v0.4b[3]\n"
- "tbnz %x[flags], #31, 11f\n"
+ "bge 12b\n"
+ "cbz x24, 18f\n"
+ "14:" // Height 1: Multiply loop: Skip odd blocks
+ "tbz x24, #1, 15f\n"
+ "ldr h0, [x23], #0x2\n"
+ "tbz x24, #0, 16f\n"
+ "ld1 { v0.b }[2], [x23]\n"
+ "b 16f\n"
+ "15:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x23, #0x0]\n"
+ "16:" // Height 1: Multiply loop: Ragged operand read: Done
+ "tbnz %x[flags], #31, 17f\n"
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
- "11:" // Height 1: Multiply loop: unique 2: skip row sum
- "prfm pldl1keep, [x26, #0x80]\n"
- "12:" // Height 1: Multiply loop: Main loop skip
- "cbz x27, 19f\n"
- "cmp x27, #0x4\n"
- "blt 15f\n"
- "13:" // Height 1: Multiply loop: Odd block loop
- "ldr s0, [x26], #0x4\n"
- "tbnz %x[flags], #31, 14f\n"
- ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
- "14:" // Height 1: Multiply loop: unique 3: skip row sum
- "ldr q8, [x11, #0x0]\n"
- ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q9, [x11, #0x10]\n"
- "ldr q10, [x11, #0x20]\n"
- ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr q4, [x11, #0x30]\n"
- ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- "sub x27, x27, #0x4\n"
- "add x11, x11, #0x40\n"
- ".inst 0x6f80e093 // udot v19.4s, v4.16b, v0.4b[0]\n"
- "cmp x27, #0x4\n"
- "bge 13b\n"
- "cbz x27, 19f\n"
- "15:" // Height 1: Multiply loop: Skip odd blocks
- "tbz x27, #1, 16f\n"
- "ldr h0, [x26], #0x2\n"
- "tbz x27, #0, 17f\n"
- "ld1 { v0.b }[2], [x26]\n"
- "b 17f\n"
- "16:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x26, #0x0]\n"
- "17:" // Height 1: Multiply loop: Ragged operand read: Done
- "tbnz %x[flags], #31, 18f\n"
- ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
- "18:" // Height 1: Multiply loop: unique 4: skip row sum
- "ldr q5, [x11, #0x0]\n"
- ".inst 0x6f80e0b0 // udot v16.4s, v5.16b, v0.4b[0]\n"
- "ldr q6, [x11, #0x10]\n"
- "ldr q7, [x11, #0x20]\n"
- ".inst 0x6f80e0d1 // udot v17.4s, v6.16b, v0.4b[0]\n"
- "ldr q8, [x11, #0x30]\n"
- ".inst 0x6f80e0f2 // udot v18.4s, v7.16b, v0.4b[0]\n"
- "add x11, x11, #0x40\n"
- ".inst 0x6f80e113 // udot v19.4s, v8.16b, v0.4b[0]\n"
- "19:" // Height 1: Multiply loop: No odd multiplies
+ "17:" // Height 1: Multiply loop: unique 4: skip row sum
+ "ldr q10, [x28, #0x0]\n"
+ ".inst 0x6f80e150 // udot v16.4s, v10.16b, v0.4b[0]\n"
+ "ldr q4, [x28, #0x10]\n"
+ "ldr q5, [x28, #0x20]\n"
+ ".inst 0x6f80e091 // udot v17.4s, v4.16b, v0.4b[0]\n"
+ "ldr q6, [x28, #0x30]\n"
+ "add x28, x28, #0x40\n"
+ ".inst 0x6f80e0b2 // udot v18.4s, v5.16b, v0.4b[0]\n"
+ ".inst 0x6f80e0d3 // udot v19.4s, v6.16b, v0.4b[0]\n"
+ "18:" // Height 1: Multiply loop: No odd multiplies
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x28, x28, #0x1\n"
- "cmp x28, x19\n"
- "bne 5b\n"
- "prfm pstl1keep, [x9, #0x0]\n"
- "tbnz %x[flags], #31, 20f\n"
+ "add x25, x25, #0x1\n"
+ "cmp x25, x19\n"
+ "bne 4b\n"
+ "prfm pstl1keep, [x26, #0x0]\n"
+ "tbnz %x[flags], #31, 19f\n"
"addp v11.4s, v11.4s, v11.4s\n"
- "add x19, %x[qp], %[b_offset]\n"
+ "add x22, %x[qp], %[b_offset]\n"
"addp v11.4s, v11.4s, v11.4s\n"
- "ld1r { v1.4s }, [x19]\n"
+ "ld1r { v1.4s }, [x22]\n"
"neg v1.4s, v1.4s\n"
"mul v11.4s, v11.4s, v1.4s\n"
- "20:" // Height 1: skip row sum fixup
+ "19:" // Height 1: skip row sum fixup
"add v16.4s, v16.4s, v11.4s\n"
+ "ldr q0, [x27, #0x0]\n"
"orr %x[flags], %x[flags], #0x80000000\n"
"add v17.4s, v17.4s, v11.4s\n"
- "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x27, #0x10]\n"
+ "add x23, %x[qp], %[per_layer_right_shift]\n"
"add v18.4s, v18.4s, v11.4s\n"
- "ldr q1, [x10, #0x10]\n"
+ "ldr q2, [x27, #0x20]\n"
+ "add x22, %x[qp], %[per_layer_mul]\n"
"add v19.4s, v19.4s, v11.4s\n"
- "ldr q2, [x10, #0x20]\n"
- "ldr q3, [x10, #0x30]\n"
+ "ldr q3, [x27, #0x30]\n"
+ "add x27, x27, #0x40\n"
"add v16.4s, v16.4s, v0.4s\n"
- "add x20, %x[qp], %[per_layer_right_shift]\n"
- "ld1r { v0.4s }, [x20]\n"
+ "ld1r { v0.4s }, [x23]\n"
+ "ld1r { v4.4s }, [x22]\n"
"add v17.4s, v17.4s, v1.4s\n"
- "add x19, %x[qp], %[per_layer_mul]\n"
"add v18.4s, v18.4s, v2.4s\n"
- "ld1r { v4.4s }, [x19]\n"
- "add x10, x10, #0x40\n"
"add v19.4s, v19.4s, v3.4s\n"
"sqrdmulh v16.4s, v16.4s, v4.4s\n"
"sqrdmulh v17.4s, v17.4s, v4.4s\n"
"sqrdmulh v18.4s, v18.4s, v4.4s\n"
"sqrdmulh v19.4s, v19.4s, v4.4s\n"
- "tbz %x[flags], #5, 21f\n"
+ "tbz %x[flags], #5, 20f\n"
"and v4.16b, v16.16b, v0.16b\n"
"sshr v4.4s, v4.4s, #0x1f\n"
"and v5.16b, v17.16b, v0.16b\n"
@@ -303,18 +294,18 @@ void a64_hybrid_u8qa_dot_4x16 (
"sqadd v17.4s, v17.4s, v5.4s\n"
"sqadd v18.4s, v18.4s, v6.4s\n"
"sqadd v19.4s, v19.4s, v7.4s\n"
- "21:" // Height 1: no shift correction
+ "20:" // Height 1: no shift correction
"srshl v16.4s, v16.4s, v0.4s\n"
- "add x19, %x[qp], %[c_offset]\n"
- "ld1r { v4.4s }, [x19]\n"
+ "add x22, %x[qp], %[c_offset]\n"
+ "ld1r { v4.4s }, [x22]\n"
"srshl v17.4s, v17.4s, v0.4s\n"
- "add x19, %x[qp], %[minval]\n"
+ "add x22, %x[qp], %[minval]\n"
"srshl v18.4s, v18.4s, v0.4s\n"
- "ld1r { v5.4s }, [x19]\n"
- "add x19, %x[qp], %[maxval]\n"
+ "ld1r { v5.4s }, [x22]\n"
+ "add x22, %x[qp], %[maxval]\n"
"srshl v19.4s, v19.4s, v0.4s\n"
- "ld1r { v6.4s }, [x19]\n"
- "cmp x12, #0x10\n"
+ "ld1r { v6.4s }, [x22]\n"
+ "cmp x9, #0x10\n"
"add v16.4s, v16.4s, v4.4s\n"
"add v17.4s, v17.4s, v4.4s\n"
"add v18.4s, v18.4s, v4.4s\n"
@@ -330,80 +321,69 @@ void a64_hybrid_u8qa_dot_4x16 (
"smax v19.4s, v19.4s, v5.4s\n"
"uzp1 v17.8h, v18.8h, v19.8h\n"
"uzp1 v16.16b, v16.16b, v17.16b\n"
- "bge 30f\n"
- "tbz x12, #3, 25f\n"
- "str d16, [x9], #0x8\n"
- "tbz x12, #2, 23f\n"
- "st1 { v16.s }[2], [x9], #0x4\n"
- "tbz x12, #1, 22f\n"
- "st1 { v16.h }[6], [x9], #0x2\n"
- "tbz x12, #0, 29f\n"
- "st1 { v16.b }[14], [x9]\n"
- "b 29f\n"
- "22:" // Height 1: Partial direct writeback: partial_1_12
- "tbz x12, #0, 29f\n"
- "st1 { v16.b }[12], [x9]\n"
- "b 29f\n"
- "23:" // Height 1: Partial direct writeback: partial_2_8
- "tbz x12, #1, 24f\n"
- "st1 { v16.h }[4], [x9], #0x2\n"
- "tbz x12, #0, 29f\n"
- "st1 { v16.b }[10], [x9]\n"
- "b 29f\n"
- "24:" // Height 1: Partial direct writeback: partial_1_8
- "tbz x12, #0, 29f\n"
- "st1 { v16.b }[8], [x9]\n"
- "b 29f\n"
- "25:" // Height 1: Partial direct writeback: partial_4_0
- "tbz x12, #2, 27f\n"
- "str s16, [x9], #0x4\n"
- "tbz x12, #1, 26f\n"
- "st1 { v16.h }[2], [x9], #0x2\n"
- "tbz x12, #0, 29f\n"
- "st1 { v16.b }[6], [x9]\n"
- "b 29f\n"
- "26:" // Height 1: Partial direct writeback: partial_1_4
- "tbz x12, #0, 29f\n"
- "st1 { v16.b }[4], [x9]\n"
- "b 29f\n"
- "27:" // Height 1: Partial direct writeback: partial_2_0
- "tbz x12, #1, 28f\n"
- "str h16, [x9], #0x2\n"
- "tbz x12, #0, 29f\n"
- "st1 { v16.b }[2], [x9]\n"
- "b 29f\n"
- "28:" // Height 1: Partial direct writeback: partial_1_0
- "str b16, [x9, #0x0]\n"
- "29:" // Height 1: Partial direct writeback: Done
- "b 31f\n"
- "30:" // Height 1: Full writeback
- "str q16, [x9, #0x0]\n"
- "add x9, x9, #0x10\n"
- "31:" // Height 1: Writeback done
- "subs x12, x12, #0x10\n"
- "bgt 3b\n"
- "b 126f\n"
- "32:" // Height 2
+ "bge 29f\n"
+ "tbz x9, #3, 24f\n"
+ "str d16, [x26], #0x8\n"
+ "tbz x9, #2, 22f\n"
+ "st1 { v16.s }[2], [x26], #0x4\n"
+ "tbz x9, #1, 21f\n"
+ "st1 { v16.h }[6], [x26], #0x2\n"
+ "tbz x9, #0, 28f\n"
+ "st1 { v16.b }[14], [x26]\n"
+ "b 28f\n"
+ "21:" // Height 1: Partial direct writeback: partial_1_12
+ "tbz x9, #0, 28f\n"
+ "st1 { v16.b }[12], [x26]\n"
+ "b 28f\n"
+ "22:" // Height 1: Partial direct writeback: partial_2_8
+ "tbz x9, #1, 23f\n"
+ "st1 { v16.h }[4], [x26], #0x2\n"
+ "tbz x9, #0, 28f\n"
+ "st1 { v16.b }[10], [x26]\n"
+ "b 28f\n"
+ "23:" // Height 1: Partial direct writeback: partial_1_8
+ "tbz x9, #0, 28f\n"
+ "st1 { v16.b }[8], [x26]\n"
+ "b 28f\n"
+ "24:" // Height 1: Partial direct writeback: partial_4_0
+ "tbz x9, #2, 26f\n"
+ "str s16, [x26], #0x4\n"
+ "tbz x9, #1, 25f\n"
+ "st1 { v16.h }[2], [x26], #0x2\n"
+ "tbz x9, #0, 28f\n"
+ "st1 { v16.b }[6], [x26]\n"
+ "b 28f\n"
+ "25:" // Height 1: Partial direct writeback: partial_1_4
+ "tbz x9, #0, 28f\n"
+ "st1 { v16.b }[4], [x26]\n"
+ "b 28f\n"
+ "26:" // Height 1: Partial direct writeback: partial_2_0
+ "tbz x9, #1, 27f\n"
+ "str h16, [x26], #0x2\n"
+ "tbz x9, #0, 28f\n"
+ "st1 { v16.b }[2], [x26]\n"
+ "b 28f\n"
+ "27:" // Height 1: Partial direct writeback: partial_1_0
+ "str b16, [x26, #0x0]\n"
+ "28:" // Height 1: Partial direct writeback: Done
+ "b 30f\n"
+ "29:" // Height 1: Full writeback
+ "str q16, [x26, #0x0]\n"
+ "add x26, x26, #0x10\n"
+ "30:" // Height 1: Writeback done
+ "subs x9, x9, #0x10\n"
+ "bgt 2b\n"
+ "b 122f\n"
+ "31:" // Height 2
"movi v11.4s, #0x0\n"
- "ldr x12, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x10, %x[col_bias]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x27, %x[col_bias]\n"
"movi v12.4s, #0x0\n"
- "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"bic %x[flags], %x[flags], #0x80000000\n"
- "movi v13.4s, #0x0\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "movi v14.4s, #0x0\n"
"movi v15.16b, #0x1\n"
- "tbz %x[flags], #2, 33f\n"
- "ldr x9, [%x[output_ptr], #0x0]\n"
- "ldr x25, [%x[output_ptr], #0x8]\n"
- "add x9, x9, x19\n"
- "add x25, x25, x19\n"
- "b 34f\n"
- "33:" // Height 2: setup direct output
- "mov x9, %x[output_ptr]\n"
- "add x25, x9, x19\n"
- "34:" // Height 2: Column loop
+ "mov x26, %x[output_ptr]\n"
+ "32:" // Height 2: Column loop
"movi v16.4s, #0x0\n"
"movi v17.4s, #0x0\n"
"movi v18.4s, #0x0\n"
@@ -412,71 +392,135 @@ void a64_hybrid_u8qa_dot_4x16 (
"movi v21.4s, #0x0\n"
"movi v22.4s, #0x0\n"
"movi v23.4s, #0x0\n"
- "35:" // Height 2: setup done
- "mov x28, #0x0\n"
- "36:" // Height 2: String loop
+ "33:" // Height 2: setup done
+ "mov x25, #0x0\n"
+ "34:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w27, [x20, x28, LSL #0x2]\n"
- "tbz %x[flags], #3, 37f\n"
- "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "ldr w24, [x20, x25, LSL #0x2]\n"
+ "tbz %x[flags], #3, 35f\n"
+ "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x26, [x20, #0x0]\n"
- "ldr x24, [x20, #0x8]\n"
- "cbnz x28, 38f\n"
+ "ldr x23, [x20, #0x0]\n"
+ "ldr x22, [x20, #0x8]\n"
+ "cbnz x25, 36f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x26, x26, x19\n"
- "add x24, x24, x19\n"
- "b 38f\n"
- "37:" // Height 2: setup direct input
- "mov x26, %x[input_ptr]\n"
- "add x24, x26, x19\n"
- "38:" // Height 2: input setup done
- "cmp x27, #0x10\n"
- "blt 43f\n"
- "cmp x27, #0x20\n"
+ "add x23, x23, x19\n"
+ "add x22, x22, x19\n"
+ "b 36f\n"
+ "35:" // Height 2: setup direct input
+ "mov x23, %x[input_ptr]\n"
+ "add x22, x23, x19\n"
+ "36:" // Height 2: input setup done
+ "cmp x24, #0x10\n"
"blt 41f\n"
- "39:" // Height 2: Multiply loop: Main loop head
- "ldr q0, [x26, #0x0]\n"
- "ldr q1, [x24, #0x0]\n"
- "ldr q4, [x11, #0x0]\n"
+ "ldr q0, [x23, #0x0]\n"
+ "ldr q1, [x22, #0x0]\n"
+ "cmp x24, #0x20\n"
+ "ldr q4, [x28, #0x0]\n"
+ "blt 39f\n"
+ "37:" // Height 2: Multiply loop: Main loop head
".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n"
- "ldr q5, [x11, #0x10]\n"
+ "ldr q5, [x28, #0x10]\n"
+ "add x23, x23, #0x10\n"
".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n"
- "ldr q6, [x11, #0x20]\n"
- "ldr q7, [x11, #0x30]\n"
+ "ldr q6, [x28, #0x20]\n"
+ "add x22, x22, #0x10\n"
".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n"
- "ldr q8, [x11, #0x40]\n"
+ "ldr q7, [x28, #0x30]\n"
".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n"
- "ldr q9, [x11, #0x50]\n"
+ "ldr q8, [x28, #0x40]\n"
".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n"
- "ldr q10, [x11, #0x60]\n"
+ "ldr q9, [x28, #0x50]\n"
".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n"
- "ldr q4, [x11, #0x70]\n"
+ "ldr q10, [x28, #0x60]\n"
+ "ldr q4, [x28, #0x70]\n"
".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n"
- "ldr q5, [x11, #0x80]\n"
".inst 0x6f81e0f7 // udot v23.4s, v7.16b, v1.4b[0]\n"
- "ldr q6, [x11, #0x90]\n"
+ "ldr q5, [x28, #0x80]\n"
".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n"
- "ldr q7, [x11, #0xa0]\n"
+ "ldr q6, [x28, #0x90]\n"
".inst 0x6fa1e114 // udot v20.4s, v8.16b, v1.4b[1]\n"
- "ldr q8, [x11, #0xb0]\n"
+ "ldr q7, [x28, #0xa0]\n"
".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n"
- "add x26, x26, #0x10\n"
+ "ldr q8, [x28, #0xb0]\n"
+ ".inst 0x6fa1e135 // udot v21.4s, v9.16b, v1.4b[1]\n"
+ "ldr q9, [x28, #0xc0]\n"
+ ".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e156 // udot v22.4s, v10.16b, v1.4b[1]\n"
+ "ldr q10, [x28, #0xd0]\n"
+ ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n"
+ "ldr q4, [x28, #0xe0]\n"
+ ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8b4 // udot v20.4s, v5.16b, v1.4b[2]\n"
+ "ldr q5, [x28, #0xf0]\n"
+ "add x28, x28, #0x100\n"
+ ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8d5 // udot v21.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8f6 // udot v22.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n"
+ ".inst 0x6f81e917 // udot v23.4s, v8.16b, v1.4b[2]\n"
+ ".inst 0x6fa0e930 // udot v16.4s, v9.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e934 // udot v20.4s, v9.16b, v1.4b[3]\n"
+ ".inst 0x6fa0e951 // udot v17.4s, v10.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e955 // udot v21.4s, v10.16b, v1.4b[3]\n"
+ ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e896 // udot v22.4s, v4.16b, v1.4b[3]\n"
+ ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8b7 // udot v23.4s, v5.16b, v1.4b[3]\n"
+ "tbnz %x[flags], #31, 38f\n"
+ ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
+ "38:" // Height 2: Multiply loop: unique 5: skip row sum
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "sub x24, x24, #0x10\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "cmp x24, #0x20\n"
+ "ldr q0, [x23, #0x0]\n"
+ "ldr q1, [x22, #0x0]\n"
+ "ldr q4, [x28, #0x0]\n"
+ "bge 37b\n"
+ "39:" // Height 2: Multiply loop: Single iteration only
+ ".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n"
+ "ldr q5, [x28, #0x10]\n"
+ "sub x24, x24, #0x10\n"
+ ".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n"
+ "ldr q6, [x28, #0x20]\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n"
+ "ldr q7, [x28, #0x30]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n"
+ "ldr q8, [x28, #0x40]\n"
+ ".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n"
+ "ldr q9, [x28, #0x50]\n"
+ ".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n"
+ "ldr q10, [x28, #0x60]\n"
+ "ldr q4, [x28, #0x70]\n"
+ ".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0f7 // udot v23.4s, v7.16b, v1.4b[0]\n"
+ "ldr q5, [x28, #0x80]\n"
+ ".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n"
+ "ldr q6, [x28, #0x90]\n"
+ ".inst 0x6fa1e114 // udot v20.4s, v8.16b, v1.4b[1]\n"
+ "ldr q7, [x28, #0xa0]\n"
+ ".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n"
+ "ldr q8, [x28, #0xb0]\n"
".inst 0x6fa1e135 // udot v21.4s, v9.16b, v1.4b[1]\n"
- "ldr q9, [x11, #0xc0]\n"
+ "ldr q9, [x28, #0xc0]\n"
".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n"
- "add x24, x24, #0x10\n"
".inst 0x6fa1e156 // udot v22.4s, v10.16b, v1.4b[1]\n"
- "ldr q10, [x11, #0xd0]\n"
+ "ldr q10, [x28, #0xd0]\n"
".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n"
".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n"
- "ldr q4, [x11, #0xe0]\n"
+ "ldr q4, [x28, #0xe0]\n"
".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n"
".inst 0x6f81e8b4 // udot v20.4s, v5.16b, v1.4b[2]\n"
- "ldr q5, [x11, #0xf0]\n"
+ "ldr q5, [x28, #0xf0]\n"
+ "add x28, x28, #0x100\n"
".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n"
- "add x11, x11, #0x100\n"
".inst 0x6f81e8d5 // udot v21.4s, v6.16b, v1.4b[2]\n"
".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n"
".inst 0x6f81e8f6 // udot v22.4s, v7.16b, v1.4b[2]\n"
@@ -493,171 +537,109 @@ void a64_hybrid_u8qa_dot_4x16 (
"tbnz %x[flags], #31, 40f\n"
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
- "40:" // Height 2: Multiply loop: unique 5: skip row sum
- "prfm pldl1keep, [x26, #0x80]\n"
- "sub x27, x27, #0x10\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "cmp x27, #0x20\n"
- "bge 39b\n"
- "41:" // Height 2: Multiply loop: Single iteration only
- "sub x27, x27, #0x10\n"
- "ldr q0, [x26, #0x0]\n"
- "ldr q1, [x24, #0x0]\n"
- "ldr q6, [x11, #0x0]\n"
+ "40:" // Height 2: Multiply loop: unique 6: skip row sum
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "41:" // Height 2: Multiply loop: Main loop skip
+ "cbz x24, 48f\n"
+ "cmp x24, #0x4\n"
+ "blt 44f\n"
+ "42:" // Height 2: Multiply loop: Odd block loop
+ "ldr s0, [x23], #0x4\n"
+ "ldr s1, [x22], #0x4\n"
+ "tbnz %x[flags], #31, 43f\n"
+ ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
+ "43:" // Height 2: Multiply loop: unique 7: skip row sum
+ "ldr q6, [x28, #0x0]\n"
".inst 0x6f80e0d0 // udot v16.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x11, #0x10]\n"
+ "ldr q7, [x28, #0x10]\n"
+ "sub x24, x24, #0x4\n"
".inst 0x6f81e0d4 // udot v20.4s, v6.16b, v1.4b[0]\n"
- "ldr q8, [x11, #0x20]\n"
- "ldr q9, [x11, #0x30]\n"
+ "ldr q8, [x28, #0x20]\n"
+ "cmp x24, #0x4\n"
".inst 0x6f80e0f1 // udot v17.4s, v7.16b, v0.4b[0]\n"
- "ldr q10, [x11, #0x40]\n"
+ "ldr q9, [x28, #0x30]\n"
+ "add x28, x28, #0x40\n"
".inst 0x6f81e0f5 // udot v21.4s, v7.16b, v1.4b[0]\n"
- "ldr q4, [x11, #0x50]\n"
".inst 0x6f80e112 // udot v18.4s, v8.16b, v0.4b[0]\n"
- "ldr q5, [x11, #0x60]\n"
".inst 0x6f81e116 // udot v22.4s, v8.16b, v1.4b[0]\n"
- "ldr q6, [x11, #0x70]\n"
".inst 0x6f80e133 // udot v19.4s, v9.16b, v0.4b[0]\n"
- "ldr q7, [x11, #0x80]\n"
".inst 0x6f81e137 // udot v23.4s, v9.16b, v1.4b[0]\n"
- "ldr q8, [x11, #0x90]\n"
- ".inst 0x6fa0e150 // udot v16.4s, v10.16b, v0.4b[1]\n"
- "ldr q9, [x11, #0xa0]\n"
- ".inst 0x6fa1e154 // udot v20.4s, v10.16b, v1.4b[1]\n"
- "ldr q10, [x11, #0xb0]\n"
- ".inst 0x6fa0e091 // udot v17.4s, v4.16b, v0.4b[1]\n"
- "add x26, x26, #0x10\n"
- ".inst 0x6fa1e095 // udot v21.4s, v4.16b, v1.4b[1]\n"
- "ldr q4, [x11, #0xc0]\n"
- ".inst 0x6fa0e0b2 // udot v18.4s, v5.16b, v0.4b[1]\n"
- "add x24, x24, #0x10\n"
- ".inst 0x6fa1e0b6 // udot v22.4s, v5.16b, v1.4b[1]\n"
- "ldr q5, [x11, #0xd0]\n"
- ".inst 0x6fa0e0d3 // udot v19.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0d7 // udot v23.4s, v6.16b, v1.4b[1]\n"
- "ldr q6, [x11, #0xe0]\n"
- ".inst 0x6f80e8f0 // udot v16.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x6f81e8f4 // udot v20.4s, v7.16b, v1.4b[2]\n"
- "ldr q7, [x11, #0xf0]\n"
- ".inst 0x6f80e911 // udot v17.4s, v8.16b, v0.4b[2]\n"
- "add x11, x11, #0x100\n"
- ".inst 0x6f81e915 // udot v21.4s, v8.16b, v1.4b[2]\n"
- ".inst 0x6f80e932 // udot v18.4s, v9.16b, v0.4b[2]\n"
- ".inst 0x6f81e936 // udot v22.4s, v9.16b, v1.4b[2]\n"
- ".inst 0x6f80e953 // udot v19.4s, v10.16b, v0.4b[2]\n"
- ".inst 0x6f81e957 // udot v23.4s, v10.16b, v1.4b[2]\n"
- ".inst 0x6fa0e890 // udot v16.4s, v4.16b, v0.4b[3]\n"
- ".inst 0x6fa1e894 // udot v20.4s, v4.16b, v1.4b[3]\n"
- ".inst 0x6fa0e8b1 // udot v17.4s, v5.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8b5 // udot v21.4s, v5.16b, v1.4b[3]\n"
- ".inst 0x6fa0e8d2 // udot v18.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8d6 // udot v22.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x6fa0e8f3 // udot v19.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8f7 // udot v23.4s, v7.16b, v1.4b[3]\n"
- "tbnz %x[flags], #31, 42f\n"
- ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
- ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
- "42:" // Height 2: Multiply loop: unique 6: skip row sum
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "43:" // Height 2: Multiply loop: Main loop skip
- "cbz x27, 50f\n"
- "cmp x27, #0x4\n"
- "blt 46f\n"
- "44:" // Height 2: Multiply loop: Odd block loop
- "ldr s0, [x26], #0x4\n"
- "ldr s1, [x24], #0x4\n"
- "tbnz %x[flags], #31, 45f\n"
+ "bge 42b\n"
+ "cbz x24, 48f\n"
+ "44:" // Height 2: Multiply loop: Skip odd blocks
+ "tbz x24, #1, 45f\n"
+ "ldr h0, [x23], #0x2\n"
+ "ldr h1, [x22], #0x2\n"
+ "tbz x24, #0, 46f\n"
+ "ld1 { v0.b }[2], [x23]\n"
+ "ld1 { v1.b }[2], [x22]\n"
+ "b 46f\n"
+ "45:" // Height 2: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x23, #0x0]\n"
+ "ldr b1, [x22, #0x0]\n"
+ "46:" // Height 2: Multiply loop: Ragged operand read: Done
+ "tbnz %x[flags], #31, 47f\n"
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
- "45:" // Height 2: Multiply loop: unique 7: skip row sum
- "ldr q8, [x11, #0x0]\n"
- ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q9, [x11, #0x10]\n"
- ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr q10, [x11, #0x20]\n"
- "ldr q4, [x11, #0x30]\n"
- ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "sub x27, x27, #0x4\n"
- ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- "cmp x27, #0x4\n"
- ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- "add x11, x11, #0x40\n"
- ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- ".inst 0x6f80e093 // udot v19.4s, v4.16b, v0.4b[0]\n"
- ".inst 0x6f81e097 // udot v23.4s, v4.16b, v1.4b[0]\n"
- "bge 44b\n"
- "cbz x27, 50f\n"
- "46:" // Height 2: Multiply loop: Skip odd blocks
- "tbz x27, #1, 47f\n"
- "ldr h0, [x26], #0x2\n"
- "ldr h1, [x24], #0x2\n"
- "tbz x27, #0, 48f\n"
- "ld1 { v0.b }[2], [x26]\n"
- "ld1 { v1.b }[2], [x24]\n"
- "b 48f\n"
- "47:" // Height 2: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x26, #0x0]\n"
- "ldr b1, [x24, #0x0]\n"
- "48:" // Height 2: Multiply loop: Ragged operand read: Done
- "tbnz %x[flags], #31, 49f\n"
- ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
- ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
- "49:" // Height 2: Multiply loop: unique 8: skip row sum
- "ldr q5, [x11, #0x0]\n"
- ".inst 0x6f80e0b0 // udot v16.4s, v5.16b, v0.4b[0]\n"
- "ldr q6, [x11, #0x10]\n"
- ".inst 0x6f81e0b4 // udot v20.4s, v5.16b, v1.4b[0]\n"
- "ldr q7, [x11, #0x20]\n"
- "ldr q8, [x11, #0x30]\n"
- ".inst 0x6f80e0d1 // udot v17.4s, v6.16b, v0.4b[0]\n"
- "add x11, x11, #0x40\n"
- ".inst 0x6f81e0d5 // udot v21.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f80e0f2 // udot v18.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0f6 // udot v22.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f80e113 // udot v19.4s, v8.16b, v0.4b[0]\n"
- ".inst 0x6f81e117 // udot v23.4s, v8.16b, v1.4b[0]\n"
- "50:" // Height 2: Multiply loop: No odd multiplies
+ "47:" // Height 2: Multiply loop: unique 8: skip row sum
+ "ldr q10, [x28, #0x0]\n"
+ ".inst 0x6f80e150 // udot v16.4s, v10.16b, v0.4b[0]\n"
+ "ldr q4, [x28, #0x10]\n"
+ ".inst 0x6f81e154 // udot v20.4s, v10.16b, v1.4b[0]\n"
+ "ldr q5, [x28, #0x20]\n"
+ "ldr q6, [x28, #0x30]\n"
+ ".inst 0x6f80e091 // udot v17.4s, v4.16b, v0.4b[0]\n"
+ "add x28, x28, #0x40\n"
+ ".inst 0x6f81e095 // udot v21.4s, v4.16b, v1.4b[0]\n"
+ ".inst 0x6f80e0b2 // udot v18.4s, v5.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0b6 // udot v22.4s, v5.16b, v1.4b[0]\n"
+ ".inst 0x6f80e0d3 // udot v19.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0d7 // udot v23.4s, v6.16b, v1.4b[0]\n"
+ "48:" // Height 2: Multiply loop: No odd multiplies
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x28, x28, #0x1\n"
- "cmp x28, x19\n"
- "bne 36b\n"
- "prfm pstl1keep, [x9, #0x0]\n"
- "prfm pstl1keep, [x25, #0x0]\n"
- "tbnz %x[flags], #31, 51f\n"
+ "add x25, x25, #0x1\n"
+ "cmp x25, x19\n"
+ "bne 34b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "prfm pstl1keep, [x26, #0x0]\n"
+ "add x21, x26, x19\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "tbnz %x[flags], #31, 49f\n"
"addp v11.4s, v11.4s, v11.4s\n"
- "add x19, %x[qp], %[b_offset]\n"
- "ld1r { v2.4s }, [x19]\n"
+ "add x22, %x[qp], %[b_offset]\n"
+ "ld1r { v2.4s }, [x22]\n"
"addp v12.4s, v12.4s, v12.4s\n"
"addp v11.4s, v11.4s, v11.4s\n"
"addp v12.4s, v12.4s, v12.4s\n"
"neg v2.4s, v2.4s\n"
"mul v11.4s, v11.4s, v2.4s\n"
"mul v12.4s, v12.4s, v2.4s\n"
- "51:" // Height 2: skip row sum fixup
+ "49:" // Height 2: skip row sum fixup
"add v16.4s, v16.4s, v11.4s\n"
+ "ldr q0, [x27, #0x0]\n"
"orr %x[flags], %x[flags], #0x80000000\n"
"add v17.4s, v17.4s, v11.4s\n"
- "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x27, #0x10]\n"
+ "add x23, %x[qp], %[per_layer_right_shift]\n"
"add v18.4s, v18.4s, v11.4s\n"
- "ldr q1, [x10, #0x10]\n"
+ "ldr q2, [x27, #0x20]\n"
+ "add x22, %x[qp], %[per_layer_mul]\n"
"add v19.4s, v19.4s, v11.4s\n"
- "ldr q2, [x10, #0x20]\n"
+ "ldr q3, [x27, #0x30]\n"
+ "add x27, x27, #0x40\n"
"add v20.4s, v20.4s, v12.4s\n"
- "ldr q3, [x10, #0x30]\n"
+ "ld1r { v4.4s }, [x22]\n"
"add v21.4s, v21.4s, v12.4s\n"
- "add x20, %x[qp], %[per_layer_right_shift]\n"
"add v22.4s, v22.4s, v12.4s\n"
- "add x19, %x[qp], %[per_layer_mul]\n"
- "ld1r { v4.4s }, [x19]\n"
"add v23.4s, v23.4s, v12.4s\n"
- "add x10, x10, #0x40\n"
"add v16.4s, v16.4s, v0.4s\n"
"add v17.4s, v17.4s, v1.4s\n"
"add v18.4s, v18.4s, v2.4s\n"
"add v19.4s, v19.4s, v3.4s\n"
"add v20.4s, v20.4s, v0.4s\n"
- "ld1r { v0.4s }, [x20]\n"
+ "ld1r { v0.4s }, [x23]\n"
"add v21.4s, v21.4s, v1.4s\n"
"add v22.4s, v22.4s, v2.4s\n"
"add v23.4s, v23.4s, v3.4s\n"
@@ -669,7 +651,7 @@ void a64_hybrid_u8qa_dot_4x16 (
"sqrdmulh v21.4s, v21.4s, v4.4s\n"
"sqrdmulh v22.4s, v22.4s, v4.4s\n"
"sqrdmulh v23.4s, v23.4s, v4.4s\n"
- "tbz %x[flags], #5, 52f\n"
+ "tbz %x[flags], #5, 50f\n"
"and v4.16b, v16.16b, v0.16b\n"
"sshr v4.4s, v4.4s, #0x1f\n"
"and v5.16b, v17.16b, v0.16b\n"
@@ -694,18 +676,18 @@ void a64_hybrid_u8qa_dot_4x16 (
"sqadd v21.4s, v21.4s, v9.4s\n"
"sqadd v22.4s, v22.4s, v10.4s\n"
"sqadd v23.4s, v23.4s, v4.4s\n"
- "52:" // Height 2: no shift correction
+ "50:" // Height 2: no shift correction
"srshl v16.4s, v16.4s, v0.4s\n"
- "add x19, %x[qp], %[c_offset]\n"
- "ld1r { v4.4s }, [x19]\n"
+ "add x22, %x[qp], %[c_offset]\n"
+ "ld1r { v4.4s }, [x22]\n"
"srshl v17.4s, v17.4s, v0.4s\n"
- "add x19, %x[qp], %[minval]\n"
+ "add x22, %x[qp], %[minval]\n"
"srshl v18.4s, v18.4s, v0.4s\n"
- "ld1r { v5.4s }, [x19]\n"
- "add x19, %x[qp], %[maxval]\n"
+ "ld1r { v5.4s }, [x22]\n"
+ "add x22, %x[qp], %[maxval]\n"
"srshl v19.4s, v19.4s, v0.4s\n"
- "ld1r { v6.4s }, [x19]\n"
- "cmp x12, #0x10\n"
+ "ld1r { v6.4s }, [x22]\n"
+ "cmp x9, #0x10\n"
"srshl v20.4s, v20.4s, v0.4s\n"
"srshl v21.4s, v21.4s, v0.4s\n"
"srshl v22.4s, v22.4s, v0.4s\n"
@@ -740,100 +722,86 @@ void a64_hybrid_u8qa_dot_4x16 (
"uzp1 v16.16b, v16.16b, v17.16b\n"
"uzp1 v21.8h, v22.8h, v23.8h\n"
"uzp1 v20.16b, v20.16b, v21.16b\n"
- "bge 61f\n"
- "tbz x12, #3, 56f\n"
- "str d16, [x9], #0x8\n"
- "str d20, [x25], #0x8\n"
- "tbz x12, #2, 54f\n"
- "st1 { v16.s }[2], [x9], #0x4\n"
- "st1 { v20.s }[2], [x25], #0x4\n"
- "tbz x12, #1, 53f\n"
- "st1 { v16.h }[6], [x9], #0x2\n"
- "st1 { v20.h }[6], [x25], #0x2\n"
- "tbz x12, #0, 60f\n"
- "st1 { v16.b }[14], [x9]\n"
- "st1 { v20.b }[14], [x25]\n"
- "b 60f\n"
- "53:" // Height 2: Partial direct writeback: partial_1_12
- "tbz x12, #0, 60f\n"
- "st1 { v16.b }[12], [x9]\n"
- "st1 { v20.b }[12], [x25]\n"
- "b 60f\n"
- "54:" // Height 2: Partial direct writeback: partial_2_8
- "tbz x12, #1, 55f\n"
- "st1 { v16.h }[4], [x9], #0x2\n"
- "st1 { v20.h }[4], [x25], #0x2\n"
- "tbz x12, #0, 60f\n"
- "st1 { v16.b }[10], [x9]\n"
- "st1 { v20.b }[10], [x25]\n"
- "b 60f\n"
- "55:" // Height 2: Partial direct writeback: partial_1_8
- "tbz x12, #0, 60f\n"
- "st1 { v16.b }[8], [x9]\n"
- "st1 { v20.b }[8], [x25]\n"
- "b 60f\n"
- "56:" // Height 2: Partial direct writeback: partial_4_0
- "tbz x12, #2, 58f\n"
- "str s16, [x9], #0x4\n"
- "str s20, [x25], #0x4\n"
- "tbz x12, #1, 57f\n"
- "st1 { v16.h }[2], [x9], #0x2\n"
- "st1 { v20.h }[2], [x25], #0x2\n"
- "tbz x12, #0, 60f\n"
- "st1 { v16.b }[6], [x9]\n"
- "st1 { v20.b }[6], [x25]\n"
- "b 60f\n"
- "57:" // Height 2: Partial direct writeback: partial_1_4
- "tbz x12, #0, 60f\n"
- "st1 { v16.b }[4], [x9]\n"
- "st1 { v20.b }[4], [x25]\n"
+ "bge 59f\n"
+ "tbz x9, #3, 54f\n"
+ "str d16, [x26], #0x8\n"
+ "str d20, [x21], #0x8\n"
+ "tbz x9, #2, 52f\n"
+ "st1 { v16.s }[2], [x26], #0x4\n"
+ "st1 { v20.s }[2], [x21], #0x4\n"
+ "tbz x9, #1, 51f\n"
+ "st1 { v16.h }[6], [x26], #0x2\n"
+ "st1 { v20.h }[6], [x21], #0x2\n"
+ "tbz x9, #0, 58f\n"
+ "st1 { v16.b }[14], [x26]\n"
+ "st1 { v20.b }[14], [x21]\n"
+ "b 58f\n"
+ "51:" // Height 2: Partial direct writeback: partial_1_12
+ "tbz x9, #0, 58f\n"
+ "st1 { v16.b }[12], [x26]\n"
+ "st1 { v20.b }[12], [x21]\n"
+ "b 58f\n"
+ "52:" // Height 2: Partial direct writeback: partial_2_8
+ "tbz x9, #1, 53f\n"
+ "st1 { v16.h }[4], [x26], #0x2\n"
+ "st1 { v20.h }[4], [x21], #0x2\n"
+ "tbz x9, #0, 58f\n"
+ "st1 { v16.b }[10], [x26]\n"
+ "st1 { v20.b }[10], [x21]\n"
+ "b 58f\n"
+ "53:" // Height 2: Partial direct writeback: partial_1_8
+ "tbz x9, #0, 58f\n"
+ "st1 { v16.b }[8], [x26]\n"
+ "st1 { v20.b }[8], [x21]\n"
+ "b 58f\n"
+ "54:" // Height 2: Partial direct writeback: partial_4_0
+ "tbz x9, #2, 56f\n"
+ "str s16, [x26], #0x4\n"
+ "str s20, [x21], #0x4\n"
+ "tbz x9, #1, 55f\n"
+ "st1 { v16.h }[2], [x26], #0x2\n"
+ "st1 { v20.h }[2], [x21], #0x2\n"
+ "tbz x9, #0, 58f\n"
+ "st1 { v16.b }[6], [x26]\n"
+ "st1 { v20.b }[6], [x21]\n"
+ "b 58f\n"
+ "55:" // Height 2: Partial direct writeback: partial_1_4
+ "tbz x9, #0, 58f\n"
+ "st1 { v16.b }[4], [x26]\n"
+ "st1 { v20.b }[4], [x21]\n"
+ "b 58f\n"
+ "56:" // Height 2: Partial direct writeback: partial_2_0
+ "tbz x9, #1, 57f\n"
+ "str h16, [x26], #0x2\n"
+ "str h20, [x21], #0x2\n"
+ "tbz x9, #0, 58f\n"
+ "st1 { v16.b }[2], [x26]\n"
+ "st1 { v20.b }[2], [x21]\n"
+ "b 58f\n"
+ "57:" // Height 2: Partial direct writeback: partial_1_0
+ "str b16, [x26, #0x0]\n"
+ "str b20, [x21, #0x0]\n"
+ "58:" // Height 2: Partial direct writeback: Done
"b 60f\n"
- "58:" // Height 2: Partial direct writeback: partial_2_0
- "tbz x12, #1, 59f\n"
- "str h16, [x9], #0x2\n"
- "str h20, [x25], #0x2\n"
- "tbz x12, #0, 60f\n"
- "st1 { v16.b }[2], [x9]\n"
- "st1 { v20.b }[2], [x25]\n"
- "b 60f\n"
- "59:" // Height 2: Partial direct writeback: partial_1_0
- "str b16, [x9, #0x0]\n"
- "str b20, [x25, #0x0]\n"
- "60:" // Height 2: Partial direct writeback: Done
- "b 62f\n"
- "61:" // Height 2: Full writeback
- "str q16, [x9, #0x0]\n"
- "str q20, [x25, #0x0]\n"
- "add x9, x9, #0x10\n"
- "add x25, x25, #0x10\n"
- "62:" // Height 2: Writeback done
- "subs x12, x12, #0x10\n"
- "bgt 34b\n"
- "b 126f\n"
- "63:" // Height 3
+ "59:" // Height 2: Full writeback
+ "str q16, [x26, #0x0]\n"
+ "add x26, x26, #0x10\n"
+ "str q20, [x21, #0x0]\n"
+ "60:" // Height 2: Writeback done
+ "subs x9, x9, #0x10\n"
+ "bgt 32b\n"
+ "b 122f\n"
+ "61:" // Height 3
"movi v11.4s, #0x0\n"
- "ldr x12, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x10, %x[col_bias]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x27, %x[col_bias]\n"
"movi v12.4s, #0x0\n"
- "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"bic %x[flags], %x[flags], #0x80000000\n"
"movi v13.4s, #0x0\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "movi v14.4s, #0x0\n"
+ "mov x26, %x[output_ptr]\n"
"movi v15.16b, #0x1\n"
- "tbz %x[flags], #2, 64f\n"
- "ldr x9, [%x[output_ptr], #0x0]\n"
- "ldr x25, [%x[output_ptr], #0x8]\n"
- "add x9, x9, x19\n"
- "ldr x23, [%x[output_ptr], #0x10]\n"
- "add x25, x25, x19\n"
- "add x23, x23, x19\n"
- "b 65f\n"
- "64:" // Height 3: setup direct output
- "mov x9, %x[output_ptr]\n"
- "add x25, x9, x19\n"
- "add x23, x25, x19\n"
- "65:" // Height 3: Column loop
+ "62:" // Height 3: Column loop
"movi v16.4s, #0x0\n"
"movi v17.4s, #0x0\n"
"movi v18.4s, #0x0\n"
@@ -846,85 +814,169 @@ void a64_hybrid_u8qa_dot_4x16 (
"movi v25.4s, #0x0\n"
"movi v26.4s, #0x0\n"
"movi v27.4s, #0x0\n"
- "66:" // Height 3: setup done
- "mov x28, #0x0\n"
- "67:" // Height 3: String loop
+ "63:" // Height 3: setup done
+ "mov x25, #0x0\n"
+ "64:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w27, [x20, x28, LSL #0x2]\n"
- "tbz %x[flags], #3, 68f\n"
- "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "ldr w24, [x20, x25, LSL #0x2]\n"
+ "tbz %x[flags], #3, 65f\n"
+ "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x26, [x20, #0x0]\n"
- "ldr x24, [x20, #0x8]\n"
- "ldr x22, [x20, #0x10]\n"
- "cbnz x28, 69f\n"
+ "ldr x23, [x20, #0x0]\n"
+ "ldr x22, [x20, #0x8]\n"
+ "ldr x21, [x20, #0x10]\n"
+ "cbnz x25, 66f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x26, x26, x19\n"
- "add x24, x24, x19\n"
+ "add x23, x23, x19\n"
"add x22, x22, x19\n"
- "b 69f\n"
- "68:" // Height 3: setup direct input
- "mov x26, %x[input_ptr]\n"
- "add x24, x26, x19\n"
- "add x22, x24, x19\n"
- "69:" // Height 3: input setup done
- "cmp x27, #0x10\n"
- "blt 74f\n"
- "cmp x27, #0x20\n"
- "blt 72f\n"
- "70:" // Height 3: Multiply loop: Main loop head
- "ldr q0, [x26, #0x0]\n"
- "ldr q1, [x24, #0x0]\n"
- "ldr q2, [x22, #0x0]\n"
- "ldr q4, [x11, #0x0]\n"
+ "add x21, x21, x19\n"
+ "b 66f\n"
+ "65:" // Height 3: setup direct input
+ "mov x23, %x[input_ptr]\n"
+ "add x22, x23, x19\n"
+ "add x21, x22, x19\n"
+ "66:" // Height 3: input setup done
+ "cmp x24, #0x10\n"
+ "blt 71f\n"
+ "ldr q0, [x23, #0x0]\n"
+ "ldr q1, [x22, #0x0]\n"
+ "cmp x24, #0x20\n"
+ "ldr q2, [x21, #0x0]\n"
+ "ldr q4, [x28, #0x0]\n"
+ "blt 69f\n"
+ "67:" // Height 3: Multiply loop: Main loop head
".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n"
- "ldr q5, [x11, #0x10]\n"
+ "ldr q5, [x28, #0x10]\n"
+ "add x23, x23, #0x10\n"
".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n"
- "ldr q6, [x11, #0x20]\n"
+ "ldr q6, [x28, #0x20]\n"
+ "add x22, x22, #0x10\n"
".inst 0x6f82e098 // udot v24.4s, v4.16b, v2.4b[0]\n"
- "ldr q7, [x11, #0x30]\n"
- "ldr q8, [x11, #0x40]\n"
+ "ldr q7, [x28, #0x30]\n"
+ "add x21, x21, #0x10\n"
".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n"
- "ldr q9, [x11, #0x50]\n"
+ "ldr q8, [x28, #0x40]\n"
".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n"
- "ldr q10, [x11, #0x60]\n"
+ "ldr q9, [x28, #0x50]\n"
".inst 0x6f82e0b9 // udot v25.4s, v5.16b, v2.4b[0]\n"
- "ldr q4, [x11, #0x70]\n"
+ "ldr q10, [x28, #0x60]\n"
".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n"
- "ldr q5, [x11, #0x80]\n"
+ "ldr q4, [x28, #0x70]\n"
".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n"
- "add x26, x26, #0x10\n"
+ "ldr q5, [x28, #0x80]\n"
".inst 0x6f82e0da // udot v26.4s, v6.16b, v2.4b[0]\n"
- "ldr q6, [x11, #0x90]\n"
+ "ldr q6, [x28, #0x90]\n"
".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n"
- "add x24, x24, #0x10\n"
".inst 0x6f81e0f7 // udot v23.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0fb // udot v27.4s, v7.16b, v2.4b[0]\n"
+ "ldr q7, [x28, #0xa0]\n"
+ ".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e114 // udot v20.4s, v8.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e118 // udot v24.4s, v8.16b, v2.4b[1]\n"
+ "ldr q8, [x28, #0xb0]\n"
+ ".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e135 // udot v21.4s, v9.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e139 // udot v25.4s, v9.16b, v2.4b[1]\n"
+ "ldr q9, [x28, #0xc0]\n"
+ ".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e156 // udot v22.4s, v10.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e15a // udot v26.4s, v10.16b, v2.4b[1]\n"
+ "ldr q10, [x28, #0xd0]\n"
+ ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e09b // udot v27.4s, v4.16b, v2.4b[1]\n"
+ "ldr q4, [x28, #0xe0]\n"
+ ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8b4 // udot v20.4s, v5.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8b8 // udot v24.4s, v5.16b, v2.4b[2]\n"
+ "ldr q5, [x28, #0xf0]\n"
+ "add x28, x28, #0x100\n"
+ ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8d5 // udot v21.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8d9 // udot v25.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8f6 // udot v22.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8fa // udot v26.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n"
+ ".inst 0x6f81e917 // udot v23.4s, v8.16b, v1.4b[2]\n"
+ ".inst 0x6f82e91b // udot v27.4s, v8.16b, v2.4b[2]\n"
+ ".inst 0x6fa0e930 // udot v16.4s, v9.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e934 // udot v20.4s, v9.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e938 // udot v24.4s, v9.16b, v2.4b[3]\n"
+ ".inst 0x6fa0e951 // udot v17.4s, v10.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e955 // udot v21.4s, v10.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e959 // udot v25.4s, v10.16b, v2.4b[3]\n"
+ ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e896 // udot v22.4s, v4.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e89a // udot v26.4s, v4.16b, v2.4b[3]\n"
+ ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8b7 // udot v23.4s, v5.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8bb // udot v27.4s, v5.16b, v2.4b[3]\n"
+ "tbnz %x[flags], #31, 68f\n"
+ ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
+ ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
+ "68:" // Height 3: Multiply loop: unique 9: skip row sum
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "sub x24, x24, #0x10\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "cmp x24, #0x20\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
+ "ldr q0, [x23, #0x0]\n"
+ "ldr q1, [x22, #0x0]\n"
+ "ldr q2, [x21, #0x0]\n"
+ "ldr q4, [x28, #0x0]\n"
+ "bge 67b\n"
+ "69:" // Height 3: Multiply loop: Single iteration only
+ ".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n"
+ "ldr q5, [x28, #0x10]\n"
+ "sub x24, x24, #0x10\n"
+ ".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n"
+ "ldr q6, [x28, #0x20]\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x6f82e098 // udot v24.4s, v4.16b, v2.4b[0]\n"
+ "ldr q7, [x28, #0x30]\n"
"add x22, x22, #0x10\n"
+ ".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n"
+ "ldr q8, [x28, #0x40]\n"
+ "add x21, x21, #0x10\n"
+ ".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n"
+ "ldr q9, [x28, #0x50]\n"
+ ".inst 0x6f82e0b9 // udot v25.4s, v5.16b, v2.4b[0]\n"
+ "ldr q10, [x28, #0x60]\n"
+ ".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n"
+ "ldr q4, [x28, #0x70]\n"
+ ".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n"
+ "ldr q5, [x28, #0x80]\n"
+ ".inst 0x6f82e0da // udot v26.4s, v6.16b, v2.4b[0]\n"
+ "ldr q6, [x28, #0x90]\n"
+ ".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0f7 // udot v23.4s, v7.16b, v1.4b[0]\n"
".inst 0x6f82e0fb // udot v27.4s, v7.16b, v2.4b[0]\n"
- "ldr q7, [x11, #0xa0]\n"
+ "ldr q7, [x28, #0xa0]\n"
".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n"
".inst 0x6fa1e114 // udot v20.4s, v8.16b, v1.4b[1]\n"
".inst 0x6fa2e118 // udot v24.4s, v8.16b, v2.4b[1]\n"
- "ldr q8, [x11, #0xb0]\n"
+ "ldr q8, [x28, #0xb0]\n"
".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n"
".inst 0x6fa1e135 // udot v21.4s, v9.16b, v1.4b[1]\n"
".inst 0x6fa2e139 // udot v25.4s, v9.16b, v2.4b[1]\n"
- "ldr q9, [x11, #0xc0]\n"
+ "ldr q9, [x28, #0xc0]\n"
".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n"
".inst 0x6fa1e156 // udot v22.4s, v10.16b, v1.4b[1]\n"
".inst 0x6fa2e15a // udot v26.4s, v10.16b, v2.4b[1]\n"
- "ldr q10, [x11, #0xd0]\n"
+ "ldr q10, [x28, #0xd0]\n"
".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n"
".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n"
".inst 0x6fa2e09b // udot v27.4s, v4.16b, v2.4b[1]\n"
- "ldr q4, [x11, #0xe0]\n"
+ "ldr q4, [x28, #0xe0]\n"
".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n"
".inst 0x6f81e8b4 // udot v20.4s, v5.16b, v1.4b[2]\n"
".inst 0x6f82e8b8 // udot v24.4s, v5.16b, v2.4b[2]\n"
- "ldr q5, [x11, #0xf0]\n"
+ "ldr q5, [x28, #0xf0]\n"
+ "add x28, x28, #0x100\n"
".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n"
- "add x11, x11, #0x100\n"
".inst 0x6f81e8d5 // udot v21.4s, v6.16b, v1.4b[2]\n"
".inst 0x6f82e8d9 // udot v25.4s, v6.16b, v2.4b[2]\n"
".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n"
@@ -945,181 +997,100 @@ void a64_hybrid_u8qa_dot_4x16 (
".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n"
".inst 0x6fa1e8b7 // udot v23.4s, v5.16b, v1.4b[3]\n"
".inst 0x6fa2e8bb // udot v27.4s, v5.16b, v2.4b[3]\n"
- "tbnz %x[flags], #31, 71f\n"
+ "tbnz %x[flags], #31, 70f\n"
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
- "71:" // Height 3: Multiply loop: unique 9: skip row sum
- "prfm pldl1keep, [x26, #0x80]\n"
- "sub x27, x27, #0x10\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "cmp x27, #0x20\n"
+ "70:" // Height 3: Multiply loop: unique 10: skip row sum
+ "prfm pldl1keep, [x23, #0x80]\n"
"prfm pldl1keep, [x22, #0x80]\n"
- "bge 70b\n"
- "72:" // Height 3: Multiply loop: Single iteration only
- "sub x27, x27, #0x10\n"
- "ldr q0, [x26, #0x0]\n"
- "ldr q1, [x24, #0x0]\n"
- "ldr q2, [x22, #0x0]\n"
- "ldr q6, [x11, #0x0]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
+ "71:" // Height 3: Multiply loop: Main loop skip
+ "cbz x24, 78f\n"
+ "cmp x24, #0x4\n"
+ "blt 74f\n"
+ "72:" // Height 3: Multiply loop: Odd block loop
+ "ldr s0, [x23], #0x4\n"
+ "ldr s1, [x22], #0x4\n"
+ "ldr s2, [x21], #0x4\n"
+ "tbnz %x[flags], #31, 73f\n"
+ ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
+ ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
+ "73:" // Height 3: Multiply loop: unique 11: skip row sum
+ "ldr q6, [x28, #0x0]\n"
".inst 0x6f80e0d0 // udot v16.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x11, #0x10]\n"
+ "ldr q7, [x28, #0x10]\n"
+ "sub x24, x24, #0x4\n"
".inst 0x6f81e0d4 // udot v20.4s, v6.16b, v1.4b[0]\n"
- "ldr q8, [x11, #0x20]\n"
+ "ldr q8, [x28, #0x20]\n"
+ "cmp x24, #0x4\n"
".inst 0x6f82e0d8 // udot v24.4s, v6.16b, v2.4b[0]\n"
- "ldr q9, [x11, #0x30]\n"
- "ldr q10, [x11, #0x40]\n"
+ "ldr q9, [x28, #0x30]\n"
+ "add x28, x28, #0x40\n"
".inst 0x6f80e0f1 // udot v17.4s, v7.16b, v0.4b[0]\n"
- "ldr q4, [x11, #0x50]\n"
".inst 0x6f81e0f5 // udot v21.4s, v7.16b, v1.4b[0]\n"
- "ldr q5, [x11, #0x60]\n"
".inst 0x6f82e0f9 // udot v25.4s, v7.16b, v2.4b[0]\n"
- "ldr q6, [x11, #0x70]\n"
".inst 0x6f80e112 // udot v18.4s, v8.16b, v0.4b[0]\n"
- "ldr q7, [x11, #0x80]\n"
".inst 0x6f81e116 // udot v22.4s, v8.16b, v1.4b[0]\n"
- "add x26, x26, #0x10\n"
".inst 0x6f82e11a // udot v26.4s, v8.16b, v2.4b[0]\n"
- "ldr q8, [x11, #0x90]\n"
".inst 0x6f80e133 // udot v19.4s, v9.16b, v0.4b[0]\n"
- "add x24, x24, #0x10\n"
".inst 0x6f81e137 // udot v23.4s, v9.16b, v1.4b[0]\n"
- "add x22, x22, #0x10\n"
".inst 0x6f82e13b // udot v27.4s, v9.16b, v2.4b[0]\n"
- "ldr q9, [x11, #0xa0]\n"
- ".inst 0x6fa0e150 // udot v16.4s, v10.16b, v0.4b[1]\n"
- ".inst 0x6fa1e154 // udot v20.4s, v10.16b, v1.4b[1]\n"
- ".inst 0x6fa2e158 // udot v24.4s, v10.16b, v2.4b[1]\n"
- "ldr q10, [x11, #0xb0]\n"
- ".inst 0x6fa0e091 // udot v17.4s, v4.16b, v0.4b[1]\n"
- ".inst 0x6fa1e095 // udot v21.4s, v4.16b, v1.4b[1]\n"
- ".inst 0x6fa2e099 // udot v25.4s, v4.16b, v2.4b[1]\n"
- "ldr q4, [x11, #0xc0]\n"
- ".inst 0x6fa0e0b2 // udot v18.4s, v5.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0b6 // udot v22.4s, v5.16b, v1.4b[1]\n"
- ".inst 0x6fa2e0ba // udot v26.4s, v5.16b, v2.4b[1]\n"
- "ldr q5, [x11, #0xd0]\n"
- ".inst 0x6fa0e0d3 // udot v19.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0d7 // udot v23.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x6fa2e0db // udot v27.4s, v6.16b, v2.4b[1]\n"
- "ldr q6, [x11, #0xe0]\n"
- ".inst 0x6f80e8f0 // udot v16.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x6f81e8f4 // udot v20.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x6f82e8f8 // udot v24.4s, v7.16b, v2.4b[2]\n"
- "ldr q7, [x11, #0xf0]\n"
- ".inst 0x6f80e911 // udot v17.4s, v8.16b, v0.4b[2]\n"
- "add x11, x11, #0x100\n"
- ".inst 0x6f81e915 // udot v21.4s, v8.16b, v1.4b[2]\n"
- ".inst 0x6f82e919 // udot v25.4s, v8.16b, v2.4b[2]\n"
- ".inst 0x6f80e932 // udot v18.4s, v9.16b, v0.4b[2]\n"
- ".inst 0x6f81e936 // udot v22.4s, v9.16b, v1.4b[2]\n"
- ".inst 0x6f82e93a // udot v26.4s, v9.16b, v2.4b[2]\n"
- ".inst 0x6f80e953 // udot v19.4s, v10.16b, v0.4b[2]\n"
- ".inst 0x6f81e957 // udot v23.4s, v10.16b, v1.4b[2]\n"
- ".inst 0x6f82e95b // udot v27.4s, v10.16b, v2.4b[2]\n"
- ".inst 0x6fa0e890 // udot v16.4s, v4.16b, v0.4b[3]\n"
- ".inst 0x6fa1e894 // udot v20.4s, v4.16b, v1.4b[3]\n"
- ".inst 0x6fa2e898 // udot v24.4s, v4.16b, v2.4b[3]\n"
- ".inst 0x6fa0e8b1 // udot v17.4s, v5.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8b5 // udot v21.4s, v5.16b, v1.4b[3]\n"
- ".inst 0x6fa2e8b9 // udot v25.4s, v5.16b, v2.4b[3]\n"
- ".inst 0x6fa0e8d2 // udot v18.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8d6 // udot v22.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x6fa2e8da // udot v26.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x6fa0e8f3 // udot v19.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8f7 // udot v23.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x6fa2e8fb // udot v27.4s, v7.16b, v2.4b[3]\n"
- "tbnz %x[flags], #31, 73f\n"
- ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
- ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
- ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
- "73:" // Height 3: Multiply loop: unique 10: skip row sum
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "74:" // Height 3: Multiply loop: Main loop skip
- "cbz x27, 81f\n"
- "cmp x27, #0x4\n"
- "blt 77f\n"
- "75:" // Height 3: Multiply loop: Odd block loop
- "ldr s0, [x26], #0x4\n"
- "ldr s1, [x24], #0x4\n"
- "ldr s2, [x22], #0x4\n"
- "tbnz %x[flags], #31, 76f\n"
- ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
- ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
- ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
- "76:" // Height 3: Multiply loop: unique 11: skip row sum
- "ldr q8, [x11, #0x0]\n"
- ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q9, [x11, #0x10]\n"
- ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr q10, [x11, #0x20]\n"
- ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr q4, [x11, #0x30]\n"
- "sub x27, x27, #0x4\n"
- ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "cmp x27, #0x4\n"
- ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- "add x11, x11, #0x40\n"
- ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
- ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
- ".inst 0x6f80e093 // udot v19.4s, v4.16b, v0.4b[0]\n"
- ".inst 0x6f81e097 // udot v23.4s, v4.16b, v1.4b[0]\n"
- ".inst 0x6f82e09b // udot v27.4s, v4.16b, v2.4b[0]\n"
- "bge 75b\n"
- "cbz x27, 81f\n"
- "77:" // Height 3: Multiply loop: Skip odd blocks
- "tbz x27, #1, 78f\n"
- "ldr h0, [x26], #0x2\n"
- "ldr h1, [x24], #0x2\n"
- "ldr h2, [x22], #0x2\n"
- "tbz x27, #0, 79f\n"
- "ld1 { v0.b }[2], [x26]\n"
- "ld1 { v1.b }[2], [x24]\n"
- "ld1 { v2.b }[2], [x22]\n"
- "b 79f\n"
- "78:" // Height 3: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x26, #0x0]\n"
- "ldr b1, [x24, #0x0]\n"
- "ldr b2, [x22, #0x0]\n"
- "79:" // Height 3: Multiply loop: Ragged operand read: Done
- "tbnz %x[flags], #31, 80f\n"
+ "bge 72b\n"
+ "cbz x24, 78f\n"
+ "74:" // Height 3: Multiply loop: Skip odd blocks
+ "tbz x24, #1, 75f\n"
+ "ldr h0, [x23], #0x2\n"
+ "ldr h1, [x22], #0x2\n"
+ "ldr h2, [x21], #0x2\n"
+ "tbz x24, #0, 76f\n"
+ "ld1 { v0.b }[2], [x23]\n"
+ "ld1 { v1.b }[2], [x22]\n"
+ "ld1 { v2.b }[2], [x21]\n"
+ "b 76f\n"
+ "75:" // Height 3: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x23, #0x0]\n"
+ "ldr b1, [x22, #0x0]\n"
+ "ldr b2, [x21, #0x0]\n"
+ "76:" // Height 3: Multiply loop: Ragged operand read: Done
+ "tbnz %x[flags], #31, 77f\n"
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
- "80:" // Height 3: Multiply loop: unique 12: skip row sum
- "ldr q5, [x11, #0x0]\n"
- ".inst 0x6f80e0b0 // udot v16.4s, v5.16b, v0.4b[0]\n"
- "ldr q6, [x11, #0x10]\n"
- ".inst 0x6f81e0b4 // udot v20.4s, v5.16b, v1.4b[0]\n"
- "ldr q7, [x11, #0x20]\n"
- ".inst 0x6f82e0b8 // udot v24.4s, v5.16b, v2.4b[0]\n"
- "ldr q8, [x11, #0x30]\n"
- "add x11, x11, #0x40\n"
- ".inst 0x6f80e0d1 // udot v17.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x6f81e0d5 // udot v21.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f82e0d9 // udot v25.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x6f80e0f2 // udot v18.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0f6 // udot v22.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f82e0fa // udot v26.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x6f80e113 // udot v19.4s, v8.16b, v0.4b[0]\n"
- ".inst 0x6f81e117 // udot v23.4s, v8.16b, v1.4b[0]\n"
- ".inst 0x6f82e11b // udot v27.4s, v8.16b, v2.4b[0]\n"
- "81:" // Height 3: Multiply loop: No odd multiplies
+ "77:" // Height 3: Multiply loop: unique 12: skip row sum
+ "ldr q10, [x28, #0x0]\n"
+ ".inst 0x6f80e150 // udot v16.4s, v10.16b, v0.4b[0]\n"
+ "ldr q4, [x28, #0x10]\n"
+ ".inst 0x6f81e154 // udot v20.4s, v10.16b, v1.4b[0]\n"
+ "ldr q5, [x28, #0x20]\n"
+ ".inst 0x6f82e158 // udot v24.4s, v10.16b, v2.4b[0]\n"
+ "ldr q6, [x28, #0x30]\n"
+ "add x28, x28, #0x40\n"
+ ".inst 0x6f80e091 // udot v17.4s, v4.16b, v0.4b[0]\n"
+ ".inst 0x6f81e095 // udot v21.4s, v4.16b, v1.4b[0]\n"
+ ".inst 0x6f82e099 // udot v25.4s, v4.16b, v2.4b[0]\n"
+ ".inst 0x6f80e0b2 // udot v18.4s, v5.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0b6 // udot v22.4s, v5.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0ba // udot v26.4s, v5.16b, v2.4b[0]\n"
+ ".inst 0x6f80e0d3 // udot v19.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0d7 // udot v23.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0db // udot v27.4s, v6.16b, v2.4b[0]\n"
+ "78:" // Height 3: Multiply loop: No odd multiplies
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x28, x28, #0x1\n"
- "cmp x28, x19\n"
- "bne 67b\n"
- "prfm pstl1keep, [x9, #0x0]\n"
- "prfm pstl1keep, [x25, #0x0]\n"
- "prfm pstl1keep, [x23, #0x0]\n"
- "tbnz %x[flags], #31, 82f\n"
+ "add x25, x25, #0x1\n"
+ "cmp x25, x19\n"
+ "bne 64b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "prfm pstl1keep, [x26, #0x0]\n"
+ "add x21, x26, x19\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "add x20, x21, x19\n"
+ "prfm pstl1keep, [x20, #0x0]\n"
+ "tbnz %x[flags], #31, 79f\n"
"addp v11.4s, v11.4s, v11.4s\n"
- "add x19, %x[qp], %[b_offset]\n"
- "ld1r { v3.4s }, [x19]\n"
+ "add x22, %x[qp], %[b_offset]\n"
+ "ld1r { v3.4s }, [x22]\n"
"addp v12.4s, v12.4s, v12.4s\n"
"addp v13.4s, v13.4s, v13.4s\n"
"addp v11.4s, v11.4s, v11.4s\n"
@@ -1129,24 +1100,24 @@ void a64_hybrid_u8qa_dot_4x16 (
"mul v11.4s, v11.4s, v3.4s\n"
"mul v12.4s, v12.4s, v3.4s\n"
"mul v13.4s, v13.4s, v3.4s\n"
- "82:" // Height 3: skip row sum fixup
+ "79:" // Height 3: skip row sum fixup
"add v16.4s, v16.4s, v11.4s\n"
+ "ldr q0, [x27, #0x0]\n"
"orr %x[flags], %x[flags], #0x80000000\n"
"add v17.4s, v17.4s, v11.4s\n"
- "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x27, #0x10]\n"
+ "add x23, %x[qp], %[per_layer_right_shift]\n"
"add v18.4s, v18.4s, v11.4s\n"
- "ldr q1, [x10, #0x10]\n"
+ "ldr q2, [x27, #0x20]\n"
+ "add x22, %x[qp], %[per_layer_mul]\n"
"add v19.4s, v19.4s, v11.4s\n"
- "ldr q2, [x10, #0x20]\n"
+ "ldr q3, [x27, #0x30]\n"
+ "add x27, x27, #0x40\n"
"add v20.4s, v20.4s, v12.4s\n"
- "ldr q3, [x10, #0x30]\n"
+ "ld1r { v4.4s }, [x22]\n"
"add v21.4s, v21.4s, v12.4s\n"
- "add x20, %x[qp], %[per_layer_right_shift]\n"
"add v22.4s, v22.4s, v12.4s\n"
- "add x19, %x[qp], %[per_layer_mul]\n"
- "ld1r { v4.4s }, [x19]\n"
"add v23.4s, v23.4s, v12.4s\n"
- "add x10, x10, #0x40\n"
"add v24.4s, v24.4s, v13.4s\n"
"add v25.4s, v25.4s, v13.4s\n"
"add v26.4s, v26.4s, v13.4s\n"
@@ -1160,7 +1131,7 @@ void a64_hybrid_u8qa_dot_4x16 (
"add v22.4s, v22.4s, v2.4s\n"
"add v23.4s, v23.4s, v3.4s\n"
"add v24.4s, v24.4s, v0.4s\n"
- "ld1r { v0.4s }, [x20]\n"
+ "ld1r { v0.4s }, [x23]\n"
"add v25.4s, v25.4s, v1.4s\n"
"add v26.4s, v26.4s, v2.4s\n"
"add v27.4s, v27.4s, v3.4s\n"
@@ -1176,7 +1147,7 @@ void a64_hybrid_u8qa_dot_4x16 (
"sqrdmulh v25.4s, v25.4s, v4.4s\n"
"sqrdmulh v26.4s, v26.4s, v4.4s\n"
"sqrdmulh v27.4s, v27.4s, v4.4s\n"
- "tbz %x[flags], #5, 83f\n"
+ "tbz %x[flags], #5, 80f\n"
"and v4.16b, v16.16b, v0.16b\n"
"sshr v4.4s, v4.4s, #0x1f\n"
"and v5.16b, v17.16b, v0.16b\n"
@@ -1213,18 +1184,18 @@ void a64_hybrid_u8qa_dot_4x16 (
"sshr v8.4s, v8.4s, #0x1f\n"
"sqadd v26.4s, v26.4s, v7.4s\n"
"sqadd v27.4s, v27.4s, v8.4s\n"
- "83:" // Height 3: no shift correction
+ "80:" // Height 3: no shift correction
"srshl v16.4s, v16.4s, v0.4s\n"
- "add x19, %x[qp], %[c_offset]\n"
- "ld1r { v4.4s }, [x19]\n"
+ "add x22, %x[qp], %[c_offset]\n"
+ "ld1r { v4.4s }, [x22]\n"
"srshl v17.4s, v17.4s, v0.4s\n"
- "add x19, %x[qp], %[minval]\n"
+ "add x22, %x[qp], %[minval]\n"
"srshl v18.4s, v18.4s, v0.4s\n"
- "ld1r { v5.4s }, [x19]\n"
- "add x19, %x[qp], %[maxval]\n"
+ "ld1r { v5.4s }, [x22]\n"
+ "add x22, %x[qp], %[maxval]\n"
"srshl v19.4s, v19.4s, v0.4s\n"
- "ld1r { v6.4s }, [x19]\n"
- "cmp x12, #0x10\n"
+ "ld1r { v6.4s }, [x22]\n"
+ "cmp x9, #0x10\n"
"srshl v20.4s, v20.4s, v0.4s\n"
"srshl v21.4s, v21.4s, v0.4s\n"
"srshl v22.4s, v22.4s, v0.4s\n"
@@ -1278,122 +1249,106 @@ void a64_hybrid_u8qa_dot_4x16 (
"uzp1 v16.16b, v16.16b, v17.16b\n"
"uzp1 v20.16b, v20.16b, v21.16b\n"
"uzp1 v24.16b, v24.16b, v25.16b\n"
- "bge 92f\n"
- "tbz x12, #3, 87f\n"
- "str d16, [x9], #0x8\n"
- "str d20, [x25], #0x8\n"
- "str d24, [x23], #0x8\n"
- "tbz x12, #2, 85f\n"
- "st1 { v16.s }[2], [x9], #0x4\n"
- "st1 { v20.s }[2], [x25], #0x4\n"
- "st1 { v24.s }[2], [x23], #0x4\n"
- "tbz x12, #1, 84f\n"
- "st1 { v16.h }[6], [x9], #0x2\n"
- "st1 { v20.h }[6], [x25], #0x2\n"
- "st1 { v24.h }[6], [x23], #0x2\n"
- "tbz x12, #0, 91f\n"
- "st1 { v16.b }[14], [x9]\n"
- "st1 { v20.b }[14], [x25]\n"
- "st1 { v24.b }[14], [x23]\n"
- "b 91f\n"
- "84:" // Height 3: Partial direct writeback: partial_1_12
- "tbz x12, #0, 91f\n"
- "st1 { v16.b }[12], [x9]\n"
- "st1 { v20.b }[12], [x25]\n"
- "st1 { v24.b }[12], [x23]\n"
- "b 91f\n"
- "85:" // Height 3: Partial direct writeback: partial_2_8
- "tbz x12, #1, 86f\n"
- "st1 { v16.h }[4], [x9], #0x2\n"
- "st1 { v20.h }[4], [x25], #0x2\n"
- "st1 { v24.h }[4], [x23], #0x2\n"
- "tbz x12, #0, 91f\n"
- "st1 { v16.b }[10], [x9]\n"
- "st1 { v20.b }[10], [x25]\n"
- "st1 { v24.b }[10], [x23]\n"
- "b 91f\n"
- "86:" // Height 3: Partial direct writeback: partial_1_8
- "tbz x12, #0, 91f\n"
- "st1 { v16.b }[8], [x9]\n"
- "st1 { v20.b }[8], [x25]\n"
- "st1 { v24.b }[8], [x23]\n"
- "b 91f\n"
- "87:" // Height 3: Partial direct writeback: partial_4_0
- "tbz x12, #2, 89f\n"
- "str s16, [x9], #0x4\n"
- "str s20, [x25], #0x4\n"
- "str s24, [x23], #0x4\n"
- "tbz x12, #1, 88f\n"
- "st1 { v16.h }[2], [x9], #0x2\n"
- "st1 { v20.h }[2], [x25], #0x2\n"
- "st1 { v24.h }[2], [x23], #0x2\n"
- "tbz x12, #0, 91f\n"
- "st1 { v16.b }[6], [x9]\n"
- "st1 { v20.b }[6], [x25]\n"
- "st1 { v24.b }[6], [x23]\n"
- "b 91f\n"
- "88:" // Height 3: Partial direct writeback: partial_1_4
- "tbz x12, #0, 91f\n"
- "st1 { v16.b }[4], [x9]\n"
- "st1 { v20.b }[4], [x25]\n"
- "st1 { v24.b }[4], [x23]\n"
- "b 91f\n"
- "89:" // Height 3: Partial direct writeback: partial_2_0
- "tbz x12, #1, 90f\n"
- "str h16, [x9], #0x2\n"
- "str h20, [x25], #0x2\n"
- "str h24, [x23], #0x2\n"
- "tbz x12, #0, 91f\n"
- "st1 { v16.b }[2], [x9]\n"
- "st1 { v20.b }[2], [x25]\n"
- "st1 { v24.b }[2], [x23]\n"
- "b 91f\n"
- "90:" // Height 3: Partial direct writeback: partial_1_0
- "str b16, [x9, #0x0]\n"
- "str b20, [x25, #0x0]\n"
- "str b24, [x23, #0x0]\n"
- "91:" // Height 3: Partial direct writeback: Done
- "b 93f\n"
- "92:" // Height 3: Full writeback
- "str q16, [x9, #0x0]\n"
- "str q20, [x25, #0x0]\n"
- "str q24, [x23, #0x0]\n"
- "add x9, x9, #0x10\n"
- "add x25, x25, #0x10\n"
- "add x23, x23, #0x10\n"
- "93:" // Height 3: Writeback done
- "subs x12, x12, #0x10\n"
- "bgt 65b\n"
- "b 126f\n"
- "94:" // Height 4
+ "bge 89f\n"
+ "tbz x9, #3, 84f\n"
+ "str d16, [x26], #0x8\n"
+ "str d20, [x21], #0x8\n"
+ "str d24, [x20], #0x8\n"
+ "tbz x9, #2, 82f\n"
+ "st1 { v16.s }[2], [x26], #0x4\n"
+ "st1 { v20.s }[2], [x21], #0x4\n"
+ "st1 { v24.s }[2], [x20], #0x4\n"
+ "tbz x9, #1, 81f\n"
+ "st1 { v16.h }[6], [x26], #0x2\n"
+ "st1 { v20.h }[6], [x21], #0x2\n"
+ "st1 { v24.h }[6], [x20], #0x2\n"
+ "tbz x9, #0, 88f\n"
+ "st1 { v16.b }[14], [x26]\n"
+ "st1 { v20.b }[14], [x21]\n"
+ "st1 { v24.b }[14], [x20]\n"
+ "b 88f\n"
+ "81:" // Height 3: Partial direct writeback: partial_1_12
+ "tbz x9, #0, 88f\n"
+ "st1 { v16.b }[12], [x26]\n"
+ "st1 { v20.b }[12], [x21]\n"
+ "st1 { v24.b }[12], [x20]\n"
+ "b 88f\n"
+ "82:" // Height 3: Partial direct writeback: partial_2_8
+ "tbz x9, #1, 83f\n"
+ "st1 { v16.h }[4], [x26], #0x2\n"
+ "st1 { v20.h }[4], [x21], #0x2\n"
+ "st1 { v24.h }[4], [x20], #0x2\n"
+ "tbz x9, #0, 88f\n"
+ "st1 { v16.b }[10], [x26]\n"
+ "st1 { v20.b }[10], [x21]\n"
+ "st1 { v24.b }[10], [x20]\n"
+ "b 88f\n"
+ "83:" // Height 3: Partial direct writeback: partial_1_8
+ "tbz x9, #0, 88f\n"
+ "st1 { v16.b }[8], [x26]\n"
+ "st1 { v20.b }[8], [x21]\n"
+ "st1 { v24.b }[8], [x20]\n"
+ "b 88f\n"
+ "84:" // Height 3: Partial direct writeback: partial_4_0
+ "tbz x9, #2, 86f\n"
+ "str s16, [x26], #0x4\n"
+ "str s20, [x21], #0x4\n"
+ "str s24, [x20], #0x4\n"
+ "tbz x9, #1, 85f\n"
+ "st1 { v16.h }[2], [x26], #0x2\n"
+ "st1 { v20.h }[2], [x21], #0x2\n"
+ "st1 { v24.h }[2], [x20], #0x2\n"
+ "tbz x9, #0, 88f\n"
+ "st1 { v16.b }[6], [x26]\n"
+ "st1 { v20.b }[6], [x21]\n"
+ "st1 { v24.b }[6], [x20]\n"
+ "b 88f\n"
+ "85:" // Height 3: Partial direct writeback: partial_1_4
+ "tbz x9, #0, 88f\n"
+ "st1 { v16.b }[4], [x26]\n"
+ "st1 { v20.b }[4], [x21]\n"
+ "st1 { v24.b }[4], [x20]\n"
+ "b 88f\n"
+ "86:" // Height 3: Partial direct writeback: partial_2_0
+ "tbz x9, #1, 87f\n"
+ "str h16, [x26], #0x2\n"
+ "str h20, [x21], #0x2\n"
+ "str h24, [x20], #0x2\n"
+ "tbz x9, #0, 88f\n"
+ "st1 { v16.b }[2], [x26]\n"
+ "st1 { v20.b }[2], [x21]\n"
+ "st1 { v24.b }[2], [x20]\n"
+ "b 88f\n"
+ "87:" // Height 3: Partial direct writeback: partial_1_0
+ "str b16, [x26, #0x0]\n"
+ "str b20, [x21, #0x0]\n"
+ "str b24, [x20, #0x0]\n"
+ "88:" // Height 3: Partial direct writeback: Done
+ "b 90f\n"
+ "89:" // Height 3: Full writeback
+ "str q16, [x26, #0x0]\n"
+ "add x26, x26, #0x10\n"
+ "str q20, [x21, #0x0]\n"
+ "str q24, [x20, #0x0]\n"
+ "90:" // Height 3: Writeback done
+ "subs x9, x9, #0x10\n"
+ "bgt 62b\n"
+ "b 122f\n"
+ "91:" // Height 4
"movi v11.4s, #0x0\n"
- "ldr x12, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x10, %x[col_bias]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x27, %x[col_bias]\n"
"movi v12.4s, #0x0\n"
- "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"bic %x[flags], %x[flags], #0x80000000\n"
"movi v13.4s, #0x0\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov x26, %x[output_ptr]\n"
"movi v14.4s, #0x0\n"
+ "mov x19, #0x4\n"
"movi v15.16b, #0x1\n"
- "tbz %x[flags], #2, 95f\n"
- "ldr x9, [%x[output_ptr], #0x0]\n"
- "ldr x25, [%x[output_ptr], #0x8]\n"
- "add x9, x9, x19\n"
- "ldr x23, [%x[output_ptr], #0x10]\n"
- "ldr x21, [%x[output_ptr], #0x18]\n"
- "add x25, x25, x19\n"
- "add %x[output_ptr], %x[output_ptr], #0x20\n"
- "add x23, x23, x19\n"
- "add x21, x21, x19\n"
- "b 96f\n"
- "95:" // Height 4: setup direct output
- "mov x9, %x[output_ptr]\n"
- "add x25, x9, x19\n"
- "add x23, x25, x19\n"
- "add x21, x23, x19\n"
- "add %x[output_ptr], x21, x19\n"
- "96:" // Height 4: Column loop
+ "madd %x[output_ptr], x20, x19, %x[output_ptr]\n"
+ "92:" // Height 4: Column loop
"movi v16.4s, #0x0\n"
"movi v17.4s, #0x0\n"
"movi v18.4s, #0x0\n"
@@ -1410,99 +1365,99 @@ void a64_hybrid_u8qa_dot_4x16 (
"movi v29.4s, #0x0\n"
"movi v30.4s, #0x0\n"
"movi v31.4s, #0x0\n"
- "97:" // Height 4: setup done
- "mov x28, #0x0\n"
- "98:" // Height 4: String loop
+ "93:" // Height 4: setup done
+ "mov x25, #0x0\n"
+ "94:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w27, [x20, x28, LSL #0x2]\n"
- "tbz %x[flags], #3, 99f\n"
- "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "ldr w24, [x20, x25, LSL #0x2]\n"
+ "tbz %x[flags], #3, 95f\n"
+ "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x26, [x20, #0x0]\n"
- "ldr x24, [x20, #0x8]\n"
- "ldr x22, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x0]\n"
+ "ldr x22, [x20, #0x8]\n"
+ "ldr x21, [x20, #0x10]\n"
"ldr x20, [x20, #0x18]\n"
- "cbnz x28, 100f\n"
+ "cbnz x25, 96f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x26, x26, x19\n"
- "add x24, x24, x19\n"
+ "add x23, x23, x19\n"
"add x22, x22, x19\n"
+ "add x21, x21, x19\n"
"add x20, x20, x19\n"
- "b 100f\n"
- "99:" // Height 4: setup direct input
- "mov x26, %x[input_ptr]\n"
- "add x24, x26, x19\n"
- "add x22, x24, x19\n"
- "add x20, x22, x19\n"
- "100:" // Height 4: input setup done
- "cmp x27, #0x10\n"
- "blt 105f\n"
- "cmp x27, #0x20\n"
- "blt 103f\n"
- "101:" // Height 4: Multiply loop: Main loop head
- "ldr q0, [x26, #0x0]\n"
- "ldr q1, [x24, #0x0]\n"
- "ldr q2, [x22, #0x0]\n"
+ "b 96f\n"
+ "95:" // Height 4: setup direct input
+ "mov x23, %x[input_ptr]\n"
+ "add x22, x23, x19\n"
+ "add x21, x22, x19\n"
+ "add x20, x21, x19\n"
+ "96:" // Height 4: input setup done
+ "cmp x24, #0x10\n"
+ "blt 101f\n"
+ "ldr q0, [x23, #0x0]\n"
+ "ldr q1, [x22, #0x0]\n"
+ "cmp x24, #0x20\n"
+ "ldr q2, [x21, #0x0]\n"
"ldr q3, [x20, #0x0]\n"
- "ldr q4, [x11, #0x0]\n"
+ "ldr q4, [x28, #0x0]\n"
+ "blt 99f\n"
+ "97:" // Height 4: Multiply loop: Main loop head
".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n"
- "ldr q5, [x11, #0x10]\n"
+ "ldr q5, [x28, #0x10]\n"
+ "add x23, x23, #0x10\n"
".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n"
- "ldr q6, [x11, #0x20]\n"
+ "ldr q6, [x28, #0x20]\n"
+ "add x22, x22, #0x10\n"
".inst 0x6f82e098 // udot v24.4s, v4.16b, v2.4b[0]\n"
- "ldr q7, [x11, #0x30]\n"
+ "ldr q7, [x28, #0x30]\n"
+ "add x21, x21, #0x10\n"
".inst 0x6f83e09c // udot v28.4s, v4.16b, v3.4b[0]\n"
- "ldr q8, [x11, #0x40]\n"
- "ldr q9, [x11, #0x50]\n"
+ "ldr q8, [x28, #0x40]\n"
+ "add x20, x20, #0x10\n"
".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n"
- "ldr q10, [x11, #0x60]\n"
+ "ldr q9, [x28, #0x50]\n"
".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n"
- "ldr q4, [x11, #0x70]\n"
+ "ldr q10, [x28, #0x60]\n"
".inst 0x6f82e0b9 // udot v25.4s, v5.16b, v2.4b[0]\n"
- "add x26, x26, #0x10\n"
+ "ldr q4, [x28, #0x70]\n"
".inst 0x6f83e0bd // udot v29.4s, v5.16b, v3.4b[0]\n"
- "ldr q5, [x11, #0x80]\n"
+ "ldr q5, [x28, #0x80]\n"
".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n"
- "add x24, x24, #0x10\n"
".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n"
- "add x22, x22, #0x10\n"
".inst 0x6f82e0da // udot v26.4s, v6.16b, v2.4b[0]\n"
- "add x20, x20, #0x10\n"
".inst 0x6f83e0de // udot v30.4s, v6.16b, v3.4b[0]\n"
- "ldr q6, [x11, #0x90]\n"
+ "ldr q6, [x28, #0x90]\n"
".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n"
".inst 0x6f81e0f7 // udot v23.4s, v7.16b, v1.4b[0]\n"
".inst 0x6f82e0fb // udot v27.4s, v7.16b, v2.4b[0]\n"
".inst 0x6f83e0ff // udot v31.4s, v7.16b, v3.4b[0]\n"
- "ldr q7, [x11, #0xa0]\n"
+ "ldr q7, [x28, #0xa0]\n"
".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n"
".inst 0x6fa1e114 // udot v20.4s, v8.16b, v1.4b[1]\n"
".inst 0x6fa2e118 // udot v24.4s, v8.16b, v2.4b[1]\n"
".inst 0x6fa3e11c // udot v28.4s, v8.16b, v3.4b[1]\n"
- "ldr q8, [x11, #0xb0]\n"
+ "ldr q8, [x28, #0xb0]\n"
".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n"
".inst 0x6fa1e135 // udot v21.4s, v9.16b, v1.4b[1]\n"
".inst 0x6fa2e139 // udot v25.4s, v9.16b, v2.4b[1]\n"
".inst 0x6fa3e13d // udot v29.4s, v9.16b, v3.4b[1]\n"
- "ldr q9, [x11, #0xc0]\n"
+ "ldr q9, [x28, #0xc0]\n"
".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n"
".inst 0x6fa1e156 // udot v22.4s, v10.16b, v1.4b[1]\n"
".inst 0x6fa2e15a // udot v26.4s, v10.16b, v2.4b[1]\n"
".inst 0x6fa3e15e // udot v30.4s, v10.16b, v3.4b[1]\n"
- "ldr q10, [x11, #0xd0]\n"
+ "ldr q10, [x28, #0xd0]\n"
".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n"
".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n"
".inst 0x6fa2e09b // udot v27.4s, v4.16b, v2.4b[1]\n"
".inst 0x6fa3e09f // udot v31.4s, v4.16b, v3.4b[1]\n"
- "ldr q4, [x11, #0xe0]\n"
+ "ldr q4, [x28, #0xe0]\n"
".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n"
".inst 0x6f81e8b4 // udot v20.4s, v5.16b, v1.4b[2]\n"
".inst 0x6f82e8b8 // udot v24.4s, v5.16b, v2.4b[2]\n"
".inst 0x6f83e8bc // udot v28.4s, v5.16b, v3.4b[2]\n"
- "ldr q5, [x11, #0xf0]\n"
+ "ldr q5, [x28, #0xf0]\n"
+ "add x28, x28, #0x100\n"
".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n"
- "add x11, x11, #0x100\n"
".inst 0x6f81e8d5 // udot v21.4s, v6.16b, v1.4b[2]\n"
".inst 0x6f82e8d9 // udot v25.4s, v6.16b, v2.4b[2]\n"
".inst 0x6f83e8dd // udot v29.4s, v6.16b, v3.4b[2]\n"
@@ -1530,218 +1485,222 @@ void a64_hybrid_u8qa_dot_4x16 (
".inst 0x6fa1e8b7 // udot v23.4s, v5.16b, v1.4b[3]\n"
".inst 0x6fa2e8bb // udot v27.4s, v5.16b, v2.4b[3]\n"
".inst 0x6fa3e8bf // udot v31.4s, v5.16b, v3.4b[3]\n"
- "tbnz %x[flags], #31, 102f\n"
+ "tbnz %x[flags], #31, 98f\n"
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
".inst 0x6e8f946e // udot v14.4s, v3.16b, v15.16b\n"
- "102:" // Height 4: Multiply loop: unique 13: skip row sum
- "prfm pldl1keep, [x26, #0x80]\n"
- "sub x27, x27, #0x10\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "cmp x27, #0x20\n"
+ "98:" // Height 4: Multiply loop: unique 13: skip row sum
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "sub x24, x24, #0x10\n"
"prfm pldl1keep, [x22, #0x80]\n"
+ "cmp x24, #0x20\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
"prfm pldl1keep, [x20, #0x80]\n"
- "bge 101b\n"
- "103:" // Height 4: Multiply loop: Single iteration only
- "sub x27, x27, #0x10\n"
- "ldr q0, [x26, #0x0]\n"
- "ldr q1, [x24, #0x0]\n"
- "ldr q2, [x22, #0x0]\n"
+ "ldr q0, [x23, #0x0]\n"
+ "ldr q1, [x22, #0x0]\n"
+ "ldr q2, [x21, #0x0]\n"
"ldr q3, [x20, #0x0]\n"
- "ldr q6, [x11, #0x0]\n"
+ "ldr q4, [x28, #0x0]\n"
+ "bge 97b\n"
+ "99:" // Height 4: Multiply loop: Single iteration only
+ ".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n"
+ "ldr q5, [x28, #0x10]\n"
+ "sub x24, x24, #0x10\n"
+ ".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n"
+ "ldr q6, [x28, #0x20]\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x6f82e098 // udot v24.4s, v4.16b, v2.4b[0]\n"
+ "ldr q7, [x28, #0x30]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x6f83e09c // udot v28.4s, v4.16b, v3.4b[0]\n"
+ "ldr q8, [x28, #0x40]\n"
+ "add x21, x21, #0x10\n"
+ ".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n"
+ "ldr q9, [x28, #0x50]\n"
+ "add x20, x20, #0x10\n"
+ ".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n"
+ "ldr q10, [x28, #0x60]\n"
+ ".inst 0x6f82e0b9 // udot v25.4s, v5.16b, v2.4b[0]\n"
+ "ldr q4, [x28, #0x70]\n"
+ ".inst 0x6f83e0bd // udot v29.4s, v5.16b, v3.4b[0]\n"
+ "ldr q5, [x28, #0x80]\n"
+ ".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0da // udot v26.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0de // udot v30.4s, v6.16b, v3.4b[0]\n"
+ "ldr q6, [x28, #0x90]\n"
+ ".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0f7 // udot v23.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0fb // udot v27.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0ff // udot v31.4s, v7.16b, v3.4b[0]\n"
+ "ldr q7, [x28, #0xa0]\n"
+ ".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e114 // udot v20.4s, v8.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e118 // udot v24.4s, v8.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e11c // udot v28.4s, v8.16b, v3.4b[1]\n"
+ "ldr q8, [x28, #0xb0]\n"
+ ".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e135 // udot v21.4s, v9.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e139 // udot v25.4s, v9.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e13d // udot v29.4s, v9.16b, v3.4b[1]\n"
+ "ldr q9, [x28, #0xc0]\n"
+ ".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e156 // udot v22.4s, v10.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e15a // udot v26.4s, v10.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e15e // udot v30.4s, v10.16b, v3.4b[1]\n"
+ "ldr q10, [x28, #0xd0]\n"
+ ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e09b // udot v27.4s, v4.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e09f // udot v31.4s, v4.16b, v3.4b[1]\n"
+ "ldr q4, [x28, #0xe0]\n"
+ ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8b4 // udot v20.4s, v5.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8b8 // udot v24.4s, v5.16b, v2.4b[2]\n"
+ ".inst 0x6f83e8bc // udot v28.4s, v5.16b, v3.4b[2]\n"
+ "ldr q5, [x28, #0xf0]\n"
+ "add x28, x28, #0x100\n"
+ ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8d5 // udot v21.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8d9 // udot v25.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x6f83e8dd // udot v29.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8f6 // udot v22.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8fa // udot v26.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x6f83e8fe // udot v30.4s, v7.16b, v3.4b[2]\n"
+ ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n"
+ ".inst 0x6f81e917 // udot v23.4s, v8.16b, v1.4b[2]\n"
+ ".inst 0x6f82e91b // udot v27.4s, v8.16b, v2.4b[2]\n"
+ ".inst 0x6f83e91f // udot v31.4s, v8.16b, v3.4b[2]\n"
+ ".inst 0x6fa0e930 // udot v16.4s, v9.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e934 // udot v20.4s, v9.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e938 // udot v24.4s, v9.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e93c // udot v28.4s, v9.16b, v3.4b[3]\n"
+ ".inst 0x6fa0e951 // udot v17.4s, v10.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e955 // udot v21.4s, v10.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e959 // udot v25.4s, v10.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e95d // udot v29.4s, v10.16b, v3.4b[3]\n"
+ ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e896 // udot v22.4s, v4.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e89a // udot v26.4s, v4.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e89e // udot v30.4s, v4.16b, v3.4b[3]\n"
+ ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8b7 // udot v23.4s, v5.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8bb // udot v27.4s, v5.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e8bf // udot v31.4s, v5.16b, v3.4b[3]\n"
+ "tbnz %x[flags], #31, 100f\n"
+ ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
+ ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
+ ".inst 0x6e8f946e // udot v14.4s, v3.16b, v15.16b\n"
+ "100:" // Height 4: Multiply loop: unique 14: skip row sum
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "101:" // Height 4: Multiply loop: Main loop skip
+ "cbz x24, 108f\n"
+ "cmp x24, #0x4\n"
+ "blt 104f\n"
+ "102:" // Height 4: Multiply loop: Odd block loop
+ "ldr s0, [x23], #0x4\n"
+ "ldr s1, [x22], #0x4\n"
+ "ldr s2, [x21], #0x4\n"
+ "ldr s3, [x20], #0x4\n"
+ "tbnz %x[flags], #31, 103f\n"
+ ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
+ ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
+ ".inst 0x6e8f946e // udot v14.4s, v3.16b, v15.16b\n"
+ "103:" // Height 4: Multiply loop: unique 15: skip row sum
+ "ldr q6, [x28, #0x0]\n"
".inst 0x6f80e0d0 // udot v16.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x11, #0x10]\n"
+ "ldr q7, [x28, #0x10]\n"
+ "sub x24, x24, #0x4\n"
".inst 0x6f81e0d4 // udot v20.4s, v6.16b, v1.4b[0]\n"
- "ldr q8, [x11, #0x20]\n"
+ "ldr q8, [x28, #0x20]\n"
+ "cmp x24, #0x4\n"
".inst 0x6f82e0d8 // udot v24.4s, v6.16b, v2.4b[0]\n"
- "ldr q9, [x11, #0x30]\n"
+ "ldr q9, [x28, #0x30]\n"
+ "add x28, x28, #0x40\n"
".inst 0x6f83e0dc // udot v28.4s, v6.16b, v3.4b[0]\n"
- "ldr q10, [x11, #0x40]\n"
- "ldr q4, [x11, #0x50]\n"
".inst 0x6f80e0f1 // udot v17.4s, v7.16b, v0.4b[0]\n"
- "ldr q5, [x11, #0x60]\n"
".inst 0x6f81e0f5 // udot v21.4s, v7.16b, v1.4b[0]\n"
- "ldr q6, [x11, #0x70]\n"
".inst 0x6f82e0f9 // udot v25.4s, v7.16b, v2.4b[0]\n"
- "add x26, x26, #0x10\n"
".inst 0x6f83e0fd // udot v29.4s, v7.16b, v3.4b[0]\n"
- "ldr q7, [x11, #0x80]\n"
".inst 0x6f80e112 // udot v18.4s, v8.16b, v0.4b[0]\n"
- "add x24, x24, #0x10\n"
".inst 0x6f81e116 // udot v22.4s, v8.16b, v1.4b[0]\n"
- "add x22, x22, #0x10\n"
".inst 0x6f82e11a // udot v26.4s, v8.16b, v2.4b[0]\n"
- "add x20, x20, #0x10\n"
".inst 0x6f83e11e // udot v30.4s, v8.16b, v3.4b[0]\n"
- "ldr q8, [x11, #0x90]\n"
".inst 0x6f80e133 // udot v19.4s, v9.16b, v0.4b[0]\n"
".inst 0x6f81e137 // udot v23.4s, v9.16b, v1.4b[0]\n"
".inst 0x6f82e13b // udot v27.4s, v9.16b, v2.4b[0]\n"
".inst 0x6f83e13f // udot v31.4s, v9.16b, v3.4b[0]\n"
- "ldr q9, [x11, #0xa0]\n"
- ".inst 0x6fa0e150 // udot v16.4s, v10.16b, v0.4b[1]\n"
- ".inst 0x6fa1e154 // udot v20.4s, v10.16b, v1.4b[1]\n"
- ".inst 0x6fa2e158 // udot v24.4s, v10.16b, v2.4b[1]\n"
- ".inst 0x6fa3e15c // udot v28.4s, v10.16b, v3.4b[1]\n"
- "ldr q10, [x11, #0xb0]\n"
- ".inst 0x6fa0e091 // udot v17.4s, v4.16b, v0.4b[1]\n"
- ".inst 0x6fa1e095 // udot v21.4s, v4.16b, v1.4b[1]\n"
- ".inst 0x6fa2e099 // udot v25.4s, v4.16b, v2.4b[1]\n"
- ".inst 0x6fa3e09d // udot v29.4s, v4.16b, v3.4b[1]\n"
- "ldr q4, [x11, #0xc0]\n"
- ".inst 0x6fa0e0b2 // udot v18.4s, v5.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0b6 // udot v22.4s, v5.16b, v1.4b[1]\n"
- ".inst 0x6fa2e0ba // udot v26.4s, v5.16b, v2.4b[1]\n"
- ".inst 0x6fa3e0be // udot v30.4s, v5.16b, v3.4b[1]\n"
- "ldr q5, [x11, #0xd0]\n"
- ".inst 0x6fa0e0d3 // udot v19.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x6fa1e0d7 // udot v23.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x6fa2e0db // udot v27.4s, v6.16b, v2.4b[1]\n"
- ".inst 0x6fa3e0df // udot v31.4s, v6.16b, v3.4b[1]\n"
- "ldr q6, [x11, #0xe0]\n"
- ".inst 0x6f80e8f0 // udot v16.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x6f81e8f4 // udot v20.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x6f82e8f8 // udot v24.4s, v7.16b, v2.4b[2]\n"
- ".inst 0x6f83e8fc // udot v28.4s, v7.16b, v3.4b[2]\n"
- "ldr q7, [x11, #0xf0]\n"
- ".inst 0x6f80e911 // udot v17.4s, v8.16b, v0.4b[2]\n"
- "add x11, x11, #0x100\n"
- ".inst 0x6f81e915 // udot v21.4s, v8.16b, v1.4b[2]\n"
- ".inst 0x6f82e919 // udot v25.4s, v8.16b, v2.4b[2]\n"
- ".inst 0x6f83e91d // udot v29.4s, v8.16b, v3.4b[2]\n"
- ".inst 0x6f80e932 // udot v18.4s, v9.16b, v0.4b[2]\n"
- ".inst 0x6f81e936 // udot v22.4s, v9.16b, v1.4b[2]\n"
- ".inst 0x6f82e93a // udot v26.4s, v9.16b, v2.4b[2]\n"
- ".inst 0x6f83e93e // udot v30.4s, v9.16b, v3.4b[2]\n"
- ".inst 0x6f80e953 // udot v19.4s, v10.16b, v0.4b[2]\n"
- ".inst 0x6f81e957 // udot v23.4s, v10.16b, v1.4b[2]\n"
- ".inst 0x6f82e95b // udot v27.4s, v10.16b, v2.4b[2]\n"
- ".inst 0x6f83e95f // udot v31.4s, v10.16b, v3.4b[2]\n"
- ".inst 0x6fa0e890 // udot v16.4s, v4.16b, v0.4b[3]\n"
- ".inst 0x6fa1e894 // udot v20.4s, v4.16b, v1.4b[3]\n"
- ".inst 0x6fa2e898 // udot v24.4s, v4.16b, v2.4b[3]\n"
- ".inst 0x6fa3e89c // udot v28.4s, v4.16b, v3.4b[3]\n"
- ".inst 0x6fa0e8b1 // udot v17.4s, v5.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8b5 // udot v21.4s, v5.16b, v1.4b[3]\n"
- ".inst 0x6fa2e8b9 // udot v25.4s, v5.16b, v2.4b[3]\n"
- ".inst 0x6fa3e8bd // udot v29.4s, v5.16b, v3.4b[3]\n"
- ".inst 0x6fa0e8d2 // udot v18.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8d6 // udot v22.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x6fa2e8da // udot v26.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x6fa3e8de // udot v30.4s, v6.16b, v3.4b[3]\n"
- ".inst 0x6fa0e8f3 // udot v19.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x6fa1e8f7 // udot v23.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x6fa2e8fb // udot v27.4s, v7.16b, v2.4b[3]\n"
- ".inst 0x6fa3e8ff // udot v31.4s, v7.16b, v3.4b[3]\n"
- "tbnz %x[flags], #31, 104f\n"
- ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
- ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
- ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
- ".inst 0x6e8f946e // udot v14.4s, v3.16b, v15.16b\n"
- "104:" // Height 4: Multiply loop: unique 14: skip row sum
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
- "105:" // Height 4: Multiply loop: Main loop skip
- "cbz x27, 112f\n"
- "cmp x27, #0x4\n"
- "blt 108f\n"
- "106:" // Height 4: Multiply loop: Odd block loop
- "ldr s0, [x26], #0x4\n"
- "ldr s1, [x24], #0x4\n"
- "ldr s2, [x22], #0x4\n"
- "ldr s3, [x20], #0x4\n"
- "tbnz %x[flags], #31, 107f\n"
- ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
- ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
- ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
- ".inst 0x6e8f946e // udot v14.4s, v3.16b, v15.16b\n"
- "107:" // Height 4: Multiply loop: unique 15: skip row sum
- "ldr q8, [x11, #0x0]\n"
- ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q9, [x11, #0x10]\n"
- ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr q10, [x11, #0x20]\n"
- ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr q4, [x11, #0x30]\n"
- ".inst 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
- "sub x27, x27, #0x4\n"
- "add x11, x11, #0x40\n"
- ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "cmp x27, #0x4\n"
- ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
- ".inst 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
- ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
- ".inst 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
- ".inst 0x6f80e093 // udot v19.4s, v4.16b, v0.4b[0]\n"
- ".inst 0x6f81e097 // udot v23.4s, v4.16b, v1.4b[0]\n"
- ".inst 0x6f82e09b // udot v27.4s, v4.16b, v2.4b[0]\n"
- ".inst 0x6f83e09f // udot v31.4s, v4.16b, v3.4b[0]\n"
- "bge 106b\n"
- "cbz x27, 112f\n"
- "108:" // Height 4: Multiply loop: Skip odd blocks
- "tbz x27, #1, 109f\n"
- "ldr h0, [x26], #0x2\n"
- "ldr h1, [x24], #0x2\n"
- "ldr h2, [x22], #0x2\n"
+ "bge 102b\n"
+ "cbz x24, 108f\n"
+ "104:" // Height 4: Multiply loop: Skip odd blocks
+ "tbz x24, #1, 105f\n"
+ "ldr h0, [x23], #0x2\n"
+ "ldr h1, [x22], #0x2\n"
+ "ldr h2, [x21], #0x2\n"
"ldr h3, [x20], #0x2\n"
- "tbz x27, #0, 110f\n"
- "ld1 { v0.b }[2], [x26]\n"
- "ld1 { v1.b }[2], [x24]\n"
- "ld1 { v2.b }[2], [x22]\n"
+ "tbz x24, #0, 106f\n"
+ "ld1 { v0.b }[2], [x23]\n"
+ "ld1 { v1.b }[2], [x22]\n"
+ "ld1 { v2.b }[2], [x21]\n"
"ld1 { v3.b }[2], [x20]\n"
- "b 110f\n"
- "109:" // Height 4: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x26, #0x0]\n"
- "ldr b1, [x24, #0x0]\n"
- "ldr b2, [x22, #0x0]\n"
+ "b 106f\n"
+ "105:" // Height 4: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x23, #0x0]\n"
+ "ldr b1, [x22, #0x0]\n"
+ "ldr b2, [x21, #0x0]\n"
"ldr b3, [x20, #0x0]\n"
- "110:" // Height 4: Multiply loop: Ragged operand read: Done
- "tbnz %x[flags], #31, 111f\n"
+ "106:" // Height 4: Multiply loop: Ragged operand read: Done
+ "tbnz %x[flags], #31, 107f\n"
".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
".inst 0x6e8f946e // udot v14.4s, v3.16b, v15.16b\n"
- "111:" // Height 4: Multiply loop: unique 16: skip row sum
- "ldr q5, [x11, #0x0]\n"
- ".inst 0x6f80e0b0 // udot v16.4s, v5.16b, v0.4b[0]\n"
- "ldr q6, [x11, #0x10]\n"
- ".inst 0x6f81e0b4 // udot v20.4s, v5.16b, v1.4b[0]\n"
- "ldr q7, [x11, #0x20]\n"
- ".inst 0x6f82e0b8 // udot v24.4s, v5.16b, v2.4b[0]\n"
- "ldr q8, [x11, #0x30]\n"
- ".inst 0x6f83e0bc // udot v28.4s, v5.16b, v3.4b[0]\n"
- "add x11, x11, #0x40\n"
- ".inst 0x6f80e0d1 // udot v17.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x6f81e0d5 // udot v21.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6f82e0d9 // udot v25.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x6f83e0dd // udot v29.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x6f80e0f2 // udot v18.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6f81e0f6 // udot v22.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6f82e0fa // udot v26.4s, v7.16b, v2.4b[0]\n"
- ".inst 0x6f83e0fe // udot v30.4s, v7.16b, v3.4b[0]\n"
- ".inst 0x6f80e113 // udot v19.4s, v8.16b, v0.4b[0]\n"
- ".inst 0x6f81e117 // udot v23.4s, v8.16b, v1.4b[0]\n"
- ".inst 0x6f82e11b // udot v27.4s, v8.16b, v2.4b[0]\n"
- ".inst 0x6f83e11f // udot v31.4s, v8.16b, v3.4b[0]\n"
- "112:" // Height 4: Multiply loop: No odd multiplies
+ "107:" // Height 4: Multiply loop: unique 16: skip row sum
+ "ldr q10, [x28, #0x0]\n"
+ ".inst 0x6f80e150 // udot v16.4s, v10.16b, v0.4b[0]\n"
+ "ldr q4, [x28, #0x10]\n"
+ ".inst 0x6f81e154 // udot v20.4s, v10.16b, v1.4b[0]\n"
+ "ldr q5, [x28, #0x20]\n"
+ ".inst 0x6f82e158 // udot v24.4s, v10.16b, v2.4b[0]\n"
+ "ldr q6, [x28, #0x30]\n"
+ "add x28, x28, #0x40\n"
+ ".inst 0x6f83e15c // udot v28.4s, v10.16b, v3.4b[0]\n"
+ ".inst 0x6f80e091 // udot v17.4s, v4.16b, v0.4b[0]\n"
+ ".inst 0x6f81e095 // udot v21.4s, v4.16b, v1.4b[0]\n"
+ ".inst 0x6f82e099 // udot v25.4s, v4.16b, v2.4b[0]\n"
+ ".inst 0x6f83e09d // udot v29.4s, v4.16b, v3.4b[0]\n"
+ ".inst 0x6f80e0b2 // udot v18.4s, v5.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0b6 // udot v22.4s, v5.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0ba // udot v26.4s, v5.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0be // udot v30.4s, v5.16b, v3.4b[0]\n"
+ ".inst 0x6f80e0d3 // udot v19.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0d7 // udot v23.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0db // udot v27.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0df // udot v31.4s, v6.16b, v3.4b[0]\n"
+ "108:" // Height 4: Multiply loop: No odd multiplies
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x28, x28, #0x1\n"
- "cmp x28, x19\n"
- "bne 98b\n"
- "prfm pstl1keep, [x9, #0x0]\n"
- "prfm pstl1keep, [x25, #0x0]\n"
- "prfm pstl1keep, [x23, #0x0]\n"
+ "add x25, x25, #0x1\n"
+ "cmp x25, x19\n"
+ "bne 94b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "prfm pstl1keep, [x26, #0x0]\n"
+ "add x21, x26, x19\n"
"prfm pstl1keep, [x21, #0x0]\n"
- "tbnz %x[flags], #31, 113f\n"
+ "add x20, x21, x19\n"
+ "prfm pstl1keep, [x20, #0x0]\n"
+ "add x19, x20, x19\n"
+ "prfm pstl1keep, [x19, #0x0]\n"
+ "tbnz %x[flags], #31, 109f\n"
"addp v11.4s, v11.4s, v11.4s\n"
- "add x19, %x[qp], %[b_offset]\n"
- "ld1r { v4.4s }, [x19]\n"
+ "add x22, %x[qp], %[b_offset]\n"
+ "ld1r { v4.4s }, [x22]\n"
"addp v12.4s, v12.4s, v12.4s\n"
"addp v13.4s, v13.4s, v13.4s\n"
"addp v14.4s, v14.4s, v14.4s\n"
@@ -1754,24 +1713,24 @@ void a64_hybrid_u8qa_dot_4x16 (
"mul v12.4s, v12.4s, v4.4s\n"
"mul v13.4s, v13.4s, v4.4s\n"
"mul v14.4s, v14.4s, v4.4s\n"
- "113:" // Height 4: skip row sum fixup
+ "109:" // Height 4: skip row sum fixup
"add v16.4s, v16.4s, v11.4s\n"
+ "ldr q0, [x27, #0x0]\n"
"orr %x[flags], %x[flags], #0x80000000\n"
"add v17.4s, v17.4s, v11.4s\n"
- "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x27, #0x10]\n"
+ "add x23, %x[qp], %[per_layer_right_shift]\n"
"add v18.4s, v18.4s, v11.4s\n"
- "ldr q1, [x10, #0x10]\n"
+ "ldr q2, [x27, #0x20]\n"
+ "add x22, %x[qp], %[per_layer_mul]\n"
"add v19.4s, v19.4s, v11.4s\n"
- "ldr q2, [x10, #0x20]\n"
+ "ldr q3, [x27, #0x30]\n"
+ "add x27, x27, #0x40\n"
"add v20.4s, v20.4s, v12.4s\n"
- "ldr q3, [x10, #0x30]\n"
+ "ld1r { v4.4s }, [x22]\n"
"add v21.4s, v21.4s, v12.4s\n"
- "add x20, %x[qp], %[per_layer_right_shift]\n"
"add v22.4s, v22.4s, v12.4s\n"
- "add x19, %x[qp], %[per_layer_mul]\n"
- "ld1r { v4.4s }, [x19]\n"
"add v23.4s, v23.4s, v12.4s\n"
- "add x10, x10, #0x40\n"
"add v24.4s, v24.4s, v13.4s\n"
"add v25.4s, v25.4s, v13.4s\n"
"add v26.4s, v26.4s, v13.4s\n"
@@ -1793,7 +1752,7 @@ void a64_hybrid_u8qa_dot_4x16 (
"add v26.4s, v26.4s, v2.4s\n"
"add v27.4s, v27.4s, v3.4s\n"
"add v28.4s, v28.4s, v0.4s\n"
- "ld1r { v0.4s }, [x20]\n"
+ "ld1r { v0.4s }, [x23]\n"
"add v29.4s, v29.4s, v1.4s\n"
"add v30.4s, v30.4s, v2.4s\n"
"add v31.4s, v31.4s, v3.4s\n"
@@ -1813,7 +1772,7 @@ void a64_hybrid_u8qa_dot_4x16 (
"sqrdmulh v29.4s, v29.4s, v4.4s\n"
"sqrdmulh v30.4s, v30.4s, v4.4s\n"
"sqrdmulh v31.4s, v31.4s, v4.4s\n"
- "tbz %x[flags], #5, 114f\n"
+ "tbz %x[flags], #5, 110f\n"
"and v4.16b, v16.16b, v0.16b\n"
"sshr v4.4s, v4.4s, #0x1f\n"
"and v5.16b, v17.16b, v0.16b\n"
@@ -1862,18 +1821,18 @@ void a64_hybrid_u8qa_dot_4x16 (
"sqadd v29.4s, v29.4s, v10.4s\n"
"sqadd v30.4s, v30.4s, v4.4s\n"
"sqadd v31.4s, v31.4s, v5.4s\n"
- "114:" // Height 4: no shift correction
+ "110:" // Height 4: no shift correction
"srshl v16.4s, v16.4s, v0.4s\n"
- "add x19, %x[qp], %[c_offset]\n"
- "ld1r { v4.4s }, [x19]\n"
+ "add x22, %x[qp], %[c_offset]\n"
+ "ld1r { v4.4s }, [x22]\n"
"srshl v17.4s, v17.4s, v0.4s\n"
- "add x19, %x[qp], %[minval]\n"
+ "add x22, %x[qp], %[minval]\n"
"srshl v18.4s, v18.4s, v0.4s\n"
- "ld1r { v5.4s }, [x19]\n"
- "add x19, %x[qp], %[maxval]\n"
+ "ld1r { v5.4s }, [x22]\n"
+ "add x22, %x[qp], %[maxval]\n"
"srshl v19.4s, v19.4s, v0.4s\n"
- "ld1r { v6.4s }, [x19]\n"
- "cmp x12, #0x10\n"
+ "ld1r { v6.4s }, [x22]\n"
+ "cmp x9, #0x10\n"
"srshl v20.4s, v20.4s, v0.4s\n"
"srshl v21.4s, v21.4s, v0.4s\n"
"srshl v22.4s, v22.4s, v0.4s\n"
@@ -1946,125 +1905,122 @@ void a64_hybrid_u8qa_dot_4x16 (
"uzp1 v20.16b, v20.16b, v21.16b\n"
"uzp1 v24.16b, v24.16b, v25.16b\n"
"uzp1 v28.16b, v28.16b, v29.16b\n"
- "bge 123f\n"
- "tbz x12, #3, 118f\n"
- "str d16, [x9], #0x8\n"
- "str d20, [x25], #0x8\n"
- "str d24, [x23], #0x8\n"
- "str d28, [x21], #0x8\n"
- "tbz x12, #2, 116f\n"
- "st1 { v16.s }[2], [x9], #0x4\n"
- "st1 { v20.s }[2], [x25], #0x4\n"
- "st1 { v24.s }[2], [x23], #0x4\n"
- "st1 { v28.s }[2], [x21], #0x4\n"
- "tbz x12, #1, 115f\n"
- "st1 { v16.h }[6], [x9], #0x2\n"
- "st1 { v20.h }[6], [x25], #0x2\n"
- "st1 { v24.h }[6], [x23], #0x2\n"
- "st1 { v28.h }[6], [x21], #0x2\n"
- "tbz x12, #0, 122f\n"
- "st1 { v16.b }[14], [x9]\n"
- "st1 { v20.b }[14], [x25]\n"
- "st1 { v24.b }[14], [x23]\n"
- "st1 { v28.b }[14], [x21]\n"
- "b 122f\n"
- "115:" // Height 4: Partial direct writeback: partial_1_12
- "tbz x12, #0, 122f\n"
- "st1 { v16.b }[12], [x9]\n"
- "st1 { v20.b }[12], [x25]\n"
- "st1 { v24.b }[12], [x23]\n"
- "st1 { v28.b }[12], [x21]\n"
- "b 122f\n"
- "116:" // Height 4: Partial direct writeback: partial_2_8
- "tbz x12, #1, 117f\n"
- "st1 { v16.h }[4], [x9], #0x2\n"
- "st1 { v20.h }[4], [x25], #0x2\n"
- "st1 { v24.h }[4], [x23], #0x2\n"
- "st1 { v28.h }[4], [x21], #0x2\n"
- "tbz x12, #0, 122f\n"
- "st1 { v16.b }[10], [x9]\n"
- "st1 { v20.b }[10], [x25]\n"
- "st1 { v24.b }[10], [x23]\n"
- "st1 { v28.b }[10], [x21]\n"
- "b 122f\n"
- "117:" // Height 4: Partial direct writeback: partial_1_8
- "tbz x12, #0, 122f\n"
- "st1 { v16.b }[8], [x9]\n"
- "st1 { v20.b }[8], [x25]\n"
- "st1 { v24.b }[8], [x23]\n"
- "st1 { v28.b }[8], [x21]\n"
- "b 122f\n"
- "118:" // Height 4: Partial direct writeback: partial_4_0
- "tbz x12, #2, 120f\n"
- "str s16, [x9], #0x4\n"
- "str s20, [x25], #0x4\n"
- "str s24, [x23], #0x4\n"
- "str s28, [x21], #0x4\n"
- "tbz x12, #1, 119f\n"
- "st1 { v16.h }[2], [x9], #0x2\n"
- "st1 { v20.h }[2], [x25], #0x2\n"
- "st1 { v24.h }[2], [x23], #0x2\n"
- "st1 { v28.h }[2], [x21], #0x2\n"
- "tbz x12, #0, 122f\n"
- "st1 { v16.b }[6], [x9]\n"
- "st1 { v20.b }[6], [x25]\n"
- "st1 { v24.b }[6], [x23]\n"
- "st1 { v28.b }[6], [x21]\n"
- "b 122f\n"
- "119:" // Height 4: Partial direct writeback: partial_1_4
- "tbz x12, #0, 122f\n"
- "st1 { v16.b }[4], [x9]\n"
- "st1 { v20.b }[4], [x25]\n"
- "st1 { v24.b }[4], [x23]\n"
- "st1 { v28.b }[4], [x21]\n"
- "b 122f\n"
- "120:" // Height 4: Partial direct writeback: partial_2_0
- "tbz x12, #1, 121f\n"
- "str h16, [x9], #0x2\n"
- "str h20, [x25], #0x2\n"
- "str h24, [x23], #0x2\n"
- "str h28, [x21], #0x2\n"
- "tbz x12, #0, 122f\n"
- "st1 { v16.b }[2], [x9]\n"
- "st1 { v20.b }[2], [x25]\n"
- "st1 { v24.b }[2], [x23]\n"
- "st1 { v28.b }[2], [x21]\n"
- "b 122f\n"
- "121:" // Height 4: Partial direct writeback: partial_1_0
- "str b16, [x9, #0x0]\n"
- "str b20, [x25, #0x0]\n"
- "str b24, [x23, #0x0]\n"
- "str b28, [x21, #0x0]\n"
- "122:" // Height 4: Partial direct writeback: Done
- "b 124f\n"
- "123:" // Height 4: Full writeback
- "str q16, [x9, #0x0]\n"
- "str q20, [x25, #0x0]\n"
- "str q24, [x23, #0x0]\n"
- "str q28, [x21, #0x0]\n"
- "add x9, x9, #0x10\n"
- "add x25, x25, #0x10\n"
- "add x23, x23, #0x10\n"
- "add x21, x21, #0x10\n"
- "124:" // Height 4: Writeback done
- "subs x12, x12, #0x10\n"
- "bgt 96b\n"
+ "bge 119f\n"
+ "tbz x9, #3, 114f\n"
+ "str d16, [x26], #0x8\n"
+ "str d20, [x21], #0x8\n"
+ "str d24, [x20], #0x8\n"
+ "str d28, [x19], #0x8\n"
+ "tbz x9, #2, 112f\n"
+ "st1 { v16.s }[2], [x26], #0x4\n"
+ "st1 { v20.s }[2], [x21], #0x4\n"
+ "st1 { v24.s }[2], [x20], #0x4\n"
+ "st1 { v28.s }[2], [x19], #0x4\n"
+ "tbz x9, #1, 111f\n"
+ "st1 { v16.h }[6], [x26], #0x2\n"
+ "st1 { v20.h }[6], [x21], #0x2\n"
+ "st1 { v24.h }[6], [x20], #0x2\n"
+ "st1 { v28.h }[6], [x19], #0x2\n"
+ "tbz x9, #0, 118f\n"
+ "st1 { v16.b }[14], [x26]\n"
+ "st1 { v20.b }[14], [x21]\n"
+ "st1 { v24.b }[14], [x20]\n"
+ "st1 { v28.b }[14], [x19]\n"
+ "b 118f\n"
+ "111:" // Height 4: Partial direct writeback: partial_1_12
+ "tbz x9, #0, 118f\n"
+ "st1 { v16.b }[12], [x26]\n"
+ "st1 { v20.b }[12], [x21]\n"
+ "st1 { v24.b }[12], [x20]\n"
+ "st1 { v28.b }[12], [x19]\n"
+ "b 118f\n"
+ "112:" // Height 4: Partial direct writeback: partial_2_8
+ "tbz x9, #1, 113f\n"
+ "st1 { v16.h }[4], [x26], #0x2\n"
+ "st1 { v20.h }[4], [x21], #0x2\n"
+ "st1 { v24.h }[4], [x20], #0x2\n"
+ "st1 { v28.h }[4], [x19], #0x2\n"
+ "tbz x9, #0, 118f\n"
+ "st1 { v16.b }[10], [x26]\n"
+ "st1 { v20.b }[10], [x21]\n"
+ "st1 { v24.b }[10], [x20]\n"
+ "st1 { v28.b }[10], [x19]\n"
+ "b 118f\n"
+ "113:" // Height 4: Partial direct writeback: partial_1_8
+ "tbz x9, #0, 118f\n"
+ "st1 { v16.b }[8], [x26]\n"
+ "st1 { v20.b }[8], [x21]\n"
+ "st1 { v24.b }[8], [x20]\n"
+ "st1 { v28.b }[8], [x19]\n"
+ "b 118f\n"
+ "114:" // Height 4: Partial direct writeback: partial_4_0
+ "tbz x9, #2, 116f\n"
+ "str s16, [x26], #0x4\n"
+ "str s20, [x21], #0x4\n"
+ "str s24, [x20], #0x4\n"
+ "str s28, [x19], #0x4\n"
+ "tbz x9, #1, 115f\n"
+ "st1 { v16.h }[2], [x26], #0x2\n"
+ "st1 { v20.h }[2], [x21], #0x2\n"
+ "st1 { v24.h }[2], [x20], #0x2\n"
+ "st1 { v28.h }[2], [x19], #0x2\n"
+ "tbz x9, #0, 118f\n"
+ "st1 { v16.b }[6], [x26]\n"
+ "st1 { v20.b }[6], [x21]\n"
+ "st1 { v24.b }[6], [x20]\n"
+ "st1 { v28.b }[6], [x19]\n"
+ "b 118f\n"
+ "115:" // Height 4: Partial direct writeback: partial_1_4
+ "tbz x9, #0, 118f\n"
+ "st1 { v16.b }[4], [x26]\n"
+ "st1 { v20.b }[4], [x21]\n"
+ "st1 { v24.b }[4], [x20]\n"
+ "st1 { v28.b }[4], [x19]\n"
+ "b 118f\n"
+ "116:" // Height 4: Partial direct writeback: partial_2_0
+ "tbz x9, #1, 117f\n"
+ "str h16, [x26], #0x2\n"
+ "str h20, [x21], #0x2\n"
+ "str h24, [x20], #0x2\n"
+ "str h28, [x19], #0x2\n"
+ "tbz x9, #0, 118f\n"
+ "st1 { v16.b }[2], [x26]\n"
+ "st1 { v20.b }[2], [x21]\n"
+ "st1 { v24.b }[2], [x20]\n"
+ "st1 { v28.b }[2], [x19]\n"
+ "b 118f\n"
+ "117:" // Height 4: Partial direct writeback: partial_1_0
+ "str b16, [x26, #0x0]\n"
+ "str b20, [x21, #0x0]\n"
+ "str b24, [x20, #0x0]\n"
+ "str b28, [x19, #0x0]\n"
+ "118:" // Height 4: Partial direct writeback: Done
+ "b 120f\n"
+ "119:" // Height 4: Full writeback
+ "str q16, [x26, #0x0]\n"
+ "add x26, x26, #0x10\n"
+ "str q20, [x21, #0x0]\n"
+ "str q24, [x20, #0x0]\n"
+ "str q28, [x19, #0x0]\n"
+ "120:" // Height 4: Writeback done
+ "subs x9, x9, #0x10\n"
+ "bgt 92b\n"
"subs %x[M], %x[M], #0x4\n"
- "beq 126f\n"
+ "beq 122f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "tbz %x[flags], #3, 125f\n"
+ "tbz %x[flags], #3, 121f\n"
"add x20, x20, #0x4\n"
"str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
"b 1b\n"
- "125:" // Update direct input
+ "121:" // Update direct input
"mov x19, #0x4\n"
"madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
"b 1b\n"
- "126:" // Exit
+ "122:" // Exit
: [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16.hpp
index 58fbdcf2a8..da07fc17a1 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16.hpp
@@ -37,9 +37,9 @@
namespace arm_gemm
{
-
// Actual kernel implementations
void a64_hybrid_u8u32_dot_6x16( ARGLIST );
+void a64_hybrid_u8u32_dot_6x16_a55( ARGLIST );
class cls_a64_hybrid_u8u32_dot_6x16
{
@@ -72,10 +72,11 @@ public:
StdTransformsFixed<operand_type, result_type, 6, 16, 4> transforms = {};
- static PerformanceParameters get_performance_parameters(const CPUInfo *ci) {
+ static PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
switch (ci->get_cpu_model()) {
case CPUModel::A55r1:
- return { 9.5238, 2.0799, 0.2279 };
+ return { 12.667, 2.0799, 0.2279 };
default:
return { 29.6736, 11.4025, 0.5591 };
}
@@ -83,9 +84,15 @@ public:
// Default to the generic kernel
kern_type kernel=a64_hybrid_u8u32_dot_6x16;
-
- cls_a64_hybrid_u8u32_dot_6x16(const CPUInfo *)
+ cls_a64_hybrid_u8u32_dot_6x16(const CPUInfo *ci) // ci is queried for the CPU model to choose which kernel function to use
{
+ switch(ci->get_cpu_model()) {
+ default: // any other CPU model: leave 'kernel' at the generic default it was initialised with
+ break;
+ case CPUModel::A55r1:
+ kernel=a64_hybrid_u8u32_dot_6x16_a55; // A55r1: replace the generic kernel with the _a55 variant
+ break;
+ }
}
};
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/a55.cpp
new file mode 100644
index 0000000000..f131eb2eb5
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/a55.cpp
@@ -0,0 +1,3499 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+
+namespace arm_gemm {
+
+void a64_hybrid_u8u32_dot_6x16_a55 (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<uint8_t> A_arg,
+ size_t M, size_t N, const uint8_t *B_ptr, IndirectOutputArg<uint32_t> output_arg,
+ const uint32_t *, Activation, bool accumulate
+)
+{
+ struct KernelArgs {
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const uint8_t *B_ptr = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ if (accumulate) {
+ flags |= 0x1;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ __asm__ __volatile__(
+
+ "1:" // Row loop
+ "cmp %x[M], #0x6\n"
+ "bge 171f\n"
+ "cmp %x[M], #0x4\n"
+ "bgt 137f\n"
+ "beq 103f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 69f\n"
+ "beq 35f\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x13, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x12, %x[output_ptr]\n"
+ "2:" // Height 1: Column loop
+ "tbz %x[flags], #0, 12f\n"
+ "cmp x14, #0x10\n"
+ "bge 11f\n"
+ "tbz x14, #3, 6f\n"
+ "ld1 { v8.4s }, [x12], #0x10\n"
+ "ld1 { v9.4s }, [x12], #0x10\n"
+ "tbz x14, #2, 4f\n"
+ "ld1 { v10.4s }, [x12], #0x10\n"
+ "tbz x14, #1, 3f\n"
+ "mov x24, #0x38\n"
+ "ldr d11, [x12], #0x8\n"
+ "tbz x14, #0, 10f\n"
+ "ld1 { v11.s }[2], [x12]\n"
+ "b 10f\n"
+ "3:" // Height 1: Partial accumulate: partial_1_12
+ "mov x24, #0x30\n"
+ "tbz x14, #0, 10f\n"
+ "ldr s11, [x12, #0x0]\n"
+ "b 10f\n"
+ "4:" // Height 1: Partial accumulate: partial_2_8
+ "tbz x14, #1, 5f\n"
+ "ldr d10, [x12], #0x8\n"
+ "mov x24, #0x28\n"
+ "tbz x14, #0, 10f\n"
+ "ld1 { v10.s }[2], [x12]\n"
+ "b 10f\n"
+ "5:" // Height 1: Partial accumulate: partial_1_8
+ "mov x24, #0x20\n"
+ "tbz x14, #0, 10f\n"
+ "ldr s10, [x12, #0x0]\n"
+ "b 10f\n"
+ "6:" // Height 1: Partial accumulate: partial_4_0
+ "tbz x14, #2, 8f\n"
+ "ld1 { v8.4s }, [x12], #0x10\n"
+ "tbz x14, #1, 7f\n"
+ "mov x24, #0x18\n"
+ "ldr d9, [x12], #0x8\n"
+ "tbz x14, #0, 10f\n"
+ "ld1 { v9.s }[2], [x12]\n"
+ "b 10f\n"
+ "7:" // Height 1: Partial accumulate: partial_1_4
+ "mov x24, #0x10\n"
+ "tbz x14, #0, 10f\n"
+ "ldr s9, [x12, #0x0]\n"
+ "b 10f\n"
+ "8:" // Height 1: Partial accumulate: partial_2_0
+ "tbz x14, #1, 9f\n"
+ "ldr d8, [x12], #0x8\n"
+ "mov x24, #0x8\n"
+ "tbz x14, #0, 10f\n"
+ "ld1 { v8.s }[2], [x12]\n"
+ "b 10f\n"
+ "9:" // Height 1: Partial accumulate: partial_1_0
+ "ldr s8, [x12, #0x0]\n"
+ "mov x24, #0x0\n"
+ "10:" // Height 1: Partial accumulate: Done
+ "sub x12, x12, x24\n"
+ "b 13f\n"
+ "11:" // Height 1: full accumulate
+ "ldr q8, [x12, #0x0]\n"
+ "ldr q9, [x12, #0x10]\n"
+ "ldr q10, [x12, #0x20]\n"
+ "ldr q11, [x12, #0x30]\n"
+ "b 13f\n"
+ "12:" // Height 1: no accumulate
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "13:" // Height 1: setup done
+ "mov x11, #0x0\n"
+ "14:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w10, [x20, x11, LSL #0x2]\n"
+ "tbz %x[flags], #3, 15f\n"
+ "ldr x20, [%x[input_ptr], x11, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x9, [x20, #0x0]\n"
+ "cbnz x11, 16f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x9, x9, x19\n"
+ "b 16f\n"
+ "15:" // Height 1: setup direct input
+ "mov x9, %x[input_ptr]\n"
+ "16:" // Height 1: input setup done
+ "cmp x10, #0x10\n"
+ "blt 19f\n"
+ "ldr q0, [x9, #0x0]\n"
+ "ldr q6, [x13, #0x0]\n"
+ "cmp x10, #0x20\n"
+ "blt 18f\n"
+ "17:" // Height 1: Multiply loop: Main loop head
+ ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr d7, [x13, #0x10]\n"
+ "ldr x19, [x13, #0x18]\n"
+ "add x9, x9, #0x10\n"
+ "ldr d6, [x13, #0x20]\n"
+ "sub x10, x10, #0x10\n"
+ "ldr x28, [x13, #0x28]\n"
+ "cmp x10, #0x20\n"
+ "mov v7.d[1], x19\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
+ "ldr x19, [x13, #0x38]\n"
+ "mov v6.d[1], x28\n"
+ ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
+ "ldr d7, [x13, #0x30]\n"
+ ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ "ldr d6, [x13, #0x40]\n"
+ "ldr x28, [x13, #0x48]\n"
+ "mov v7.d[1], x19\n"
+ "ldr x19, [x13, #0x58]\n"
+ "ldr x27, [x9, #0x8]\n"
+ "mov v6.d[1], x28\n"
+ ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
+ "ldr d7, [x13, #0x50]\n"
+ ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
+ "ldr d6, [x13, #0x60]\n"
+ "ldr x28, [x13, #0x68]\n"
+ "mov v7.d[1], x19\n"
+ "ldr x19, [x13, #0x78]\n"
+ "mov v6.d[1], x28\n"
+ ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
+ "ldr d7, [x13, #0x70]\n"
+ ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
+ "ldr d6, [x13, #0x80]\n"
+ "ldr x28, [x13, #0x88]\n"
+ "mov v7.d[1], x19\n"
+ "ldr x19, [x13, #0x98]\n"
+ "mov v6.d[1], x28\n"
+ ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
+ "ldr d7, [x13, #0x90]\n"
+ ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
+ "ldr d6, [x13, #0xa0]\n"
+ "ldr x28, [x13, #0xa8]\n"
+ "mov v7.d[1], x19\n"
+ "ldr x19, [x13, #0xb8]\n"
+ "mov v6.d[1], x28\n"
+ ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
+ "ldr d7, [x13, #0xb0]\n"
+ ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
+ "ldr d6, [x13, #0xc0]\n"
+ "ldr x28, [x13, #0xc8]\n"
+ "mov v7.d[1], x19\n"
+ "ldr x19, [x13, #0xd8]\n"
+ "mov v6.d[1], x28\n"
+ ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
+ "ldr d7, [x13, #0xd0]\n"
+ ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
+ "ldr d6, [x13, #0xe0]\n"
+ "ldr x28, [x13, #0xe8]\n"
+ "mov v7.d[1], x19\n"
+ "ldr x19, [x13, #0xf8]\n"
+ "mov v6.d[1], x28\n"
+ ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
+ "ldr d7, [x13, #0xf0]\n"
+ "add x13, x13, #0x100\n"
+ ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
+ "ldr d6, [x13, #0x0]\n"
+ "ldr x28, [x13, #0x8]\n"
+ "mov v7.d[1], x19\n"
+ "mov v6.d[1], x28\n"
+ ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
+ "ldr d0, [x9, #0x0]\n"
+ "mov v0.d[1], x27\n"
+ "bge 17b\n"
+ "18:" // Height 1: Multiply loop: Single iteration only
+ ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x13, #0x10]\n"
+ "ldr q6, [x13, #0x20]\n"
+ "sub x10, x10, #0x10\n"
+ "add x9, x9, #0x10\n"
+ ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
+ ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x13, #0x30]\n"
+ "ldr q6, [x13, #0x40]\n"
+ ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
+ "ldr q7, [x13, #0x50]\n"
+ ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
+ "ldr q6, [x13, #0x60]\n"
+ ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
+ "ldr q7, [x13, #0x70]\n"
+ ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
+ "ldr q6, [x13, #0x80]\n"
+ ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
+ "ldr q7, [x13, #0x90]\n"
+ ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
+ "ldr q6, [x13, #0xa0]\n"
+ ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
+ "ldr q7, [x13, #0xb0]\n"
+ ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
+ "ldr q6, [x13, #0xc0]\n"
+ ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
+ "ldr q7, [x13, #0xd0]\n"
+ ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
+ "ldr q6, [x13, #0xe0]\n"
+ ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
+ "ldr q7, [x13, #0xf0]\n"
+ ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
+ "add x13, x13, #0x100\n"
+ ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
+ "19:" // Height 1: Multiply loop: Main loop skip
+ "cbz x10, 24f\n"
+ "cmp x10, #0x4\n"
+ "blt 21f\n"
+ "20:" // Height 1: Multiply loop: Odd block loop
+ "ldr s0, [x9], #0x4\n"
+ "sub x10, x10, #0x4\n"
+ "ldr q6, [x13, #0x0]\n"
+ "cmp x10, #0x4\n"
+ "ldr q7, [x13, #0x10]\n"
+ ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q6, [x13, #0x20]\n"
+ ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
+ "ldr q7, [x13, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
+ "bge 20b\n"
+ "cbz x10, 24f\n"
+ "21:" // Height 1: Multiply loop: Skip odd blocks
+ "tbz x10, #1, 22f\n"
+ "ldr h0, [x9], #0x2\n"
+ "tbz x10, #0, 23f\n"
+ "ld1 { v0.b }[2], [x9]\n"
+ "b 23f\n"
+ "22:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x9, #0x0]\n"
+ "23:" // Height 1: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x13, #0x0]\n"
+ "ldr q7, [x13, #0x10]\n"
+ ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q6, [x13, #0x20]\n"
+ ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
+ "ldr q7, [x13, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
+ "24:" // Height 1: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x11, x11, #0x1\n"
+ "cmp x11, x19\n"
+ "bne 14b\n"
+ "prfm pstl1keep, [x12, #0x0]\n"
+ "cmp x14, #0x10\n"
+ "bge 33f\n"
+ "tbz x14, #3, 28f\n"
+ "st1 { v8.4s }, [x12], #0x10\n"
+ "st1 { v9.4s }, [x12], #0x10\n"
+ "tbz x14, #2, 26f\n"
+ "st1 { v10.4s }, [x12], #0x10\n"
+ "tbz x14, #1, 25f\n"
+ "str d11, [x12], #0x8\n"
+ "tbz x14, #0, 32f\n"
+ "st1 { v11.s }[2], [x12]\n"
+ "b 32f\n"
+ "25:" // Height 1: Partial direct writeback: partial_1_12
+ "tbz x14, #0, 32f\n"
+ "str s11, [x12, #0x0]\n"
+ "b 32f\n"
+ "26:" // Height 1: Partial direct writeback: partial_2_8
+ "tbz x14, #1, 27f\n"
+ "str d10, [x12], #0x8\n"
+ "tbz x14, #0, 32f\n"
+ "st1 { v10.s }[2], [x12]\n"
+ "b 32f\n"
+ "27:" // Height 1: Partial direct writeback: partial_1_8
+ "tbz x14, #0, 32f\n"
+ "str s10, [x12, #0x0]\n"
+ "b 32f\n"
+ "28:" // Height 1: Partial direct writeback: partial_4_0
+ "tbz x14, #2, 30f\n"
+ "st1 { v8.4s }, [x12], #0x10\n"
+ "tbz x14, #1, 29f\n"
+ "str d9, [x12], #0x8\n"
+ "tbz x14, #0, 32f\n"
+ "st1 { v9.s }[2], [x12]\n"
+ "b 32f\n"
+ "29:" // Height 1: Partial direct writeback: partial_1_4
+ "tbz x14, #0, 32f\n"
+ "str s9, [x12, #0x0]\n"
+ "b 32f\n"
+ "30:" // Height 1: Partial direct writeback: partial_2_0
+ "tbz x14, #1, 31f\n"
+ "str d8, [x12], #0x8\n"
+ "tbz x14, #0, 32f\n"
+ "st1 { v8.s }[2], [x12]\n"
+ "b 32f\n"
+ "31:" // Height 1: Partial direct writeback: partial_1_0
+ "str s8, [x12, #0x0]\n"
+ "32:" // Height 1: Partial direct writeback: Done
+ "b 34f\n"
+ "33:" // Height 1: Full writeback
+ "str q8, [x12, #0x0]\n"
+ "str q9, [x12, #0x10]\n"
+ "str q10, [x12, #0x20]\n"
+ "str q11, [x12, #0x30]\n"
+ "add x12, x12, #0x40\n"
+ "34:" // Height 1: Writeback done
+ "subs x14, x14, #0x10\n"
+ "bgt 2b\n"
+ "b 206f\n"
+ "35:" // Height 2
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x12, %x[output_ptr]\n"
+ "ldr x13, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "36:" // Height 2: Column loop
+ "tbz %x[flags], #0, 46f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x14, #0x10\n"
+ "add x23, x12, x19, LSL #2\n"
+ "bge 45f\n"
+ "tbz x14, #3, 40f\n"
+ "ld1 { v8.4s }, [x12], #0x10\n"
+ "ld1 { v12.4s }, [x23], #0x10\n"
+ "ld1 { v9.4s }, [x12], #0x10\n"
+ "ld1 { v13.4s }, [x23], #0x10\n"
+ "tbz x14, #2, 38f\n"
+ "ld1 { v10.4s }, [x12], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "tbz x14, #1, 37f\n"
+ "mov x24, #0x38\n"
+ "ldr d11, [x12], #0x8\n"
+ "ldr d15, [x23], #0x8\n"
+ "tbz x14, #0, 44f\n"
+ "ld1 { v11.s }[2], [x12]\n"
+ "ld1 { v15.s }[2], [x23]\n"
+ "b 44f\n"
+ "37:" // Height 2: Partial accumulate: partial_1_12
+ "mov x24, #0x30\n"
+ "tbz x14, #0, 44f\n"
+ "ldr s11, [x12, #0x0]\n"
+ "ldr s15, [x23, #0x0]\n"
+ "b 44f\n"
+ "38:" // Height 2: Partial accumulate: partial_2_8
+ "tbz x14, #1, 39f\n"
+ "ldr d10, [x12], #0x8\n"
+ "ldr d14, [x23], #0x8\n"
+ "mov x24, #0x28\n"
+ "tbz x14, #0, 44f\n"
+ "ld1 { v10.s }[2], [x12]\n"
+ "ld1 { v14.s }[2], [x23]\n"
+ "b 44f\n"
+ "39:" // Height 2: Partial accumulate: partial_1_8
+ "mov x24, #0x20\n"
+ "tbz x14, #0, 44f\n"
+ "ldr s10, [x12, #0x0]\n"
+ "ldr s14, [x23, #0x0]\n"
+ "b 44f\n"
+ "40:" // Height 2: Partial accumulate: partial_4_0
+ "tbz x14, #2, 42f\n"
+ "ld1 { v8.4s }, [x12], #0x10\n"
+ "ld1 { v12.4s }, [x23], #0x10\n"
+ "tbz x14, #1, 41f\n"
+ "mov x24, #0x18\n"
+ "ldr d9, [x12], #0x8\n"
+ "ldr d13, [x23], #0x8\n"
+ "tbz x14, #0, 44f\n"
+ "ld1 { v9.s }[2], [x12]\n"
+ "ld1 { v13.s }[2], [x23]\n"
+ "b 44f\n"
+ "41:" // Height 2: Partial accumulate: partial_1_4
+ "mov x24, #0x10\n"
+ "tbz x14, #0, 44f\n"
+ "ldr s9, [x12, #0x0]\n"
+ "ldr s13, [x23, #0x0]\n"
+ "b 44f\n"
+ "42:" // Height 2: Partial accumulate: partial_2_0
+ "tbz x14, #1, 43f\n"
+ "ldr d8, [x12], #0x8\n"
+ "ldr d12, [x23], #0x8\n"
+ "mov x24, #0x8\n"
+ "tbz x14, #0, 44f\n"
+ "ld1 { v8.s }[2], [x12]\n"
+ "ld1 { v12.s }[2], [x23]\n"
+ "b 44f\n"
+ "43:" // Height 2: Partial accumulate: partial_1_0
+ "ldr s8, [x12, #0x0]\n"
+ "mov x24, #0x0\n"
+ "ldr s12, [x23, #0x0]\n"
+ "44:" // Height 2: Partial accumulate: Done
+ "sub x12, x12, x24\n"
+ "b 47f\n"
+ "45:" // Height 2: full accumulate
+ "ldr q8, [x12, #0x0]\n"
+ "ldr q9, [x12, #0x10]\n"
+ "ldr q10, [x12, #0x20]\n"
+ "ldr q11, [x12, #0x30]\n"
+ "ldr q12, [x23, #0x0]\n"
+ "ldr q13, [x23, #0x10]\n"
+ "ldr q14, [x23, #0x20]\n"
+ "ldr q15, [x23, #0x30]\n"
+ "b 47f\n"
+ "46:" // Height 2: no accumulate
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "47:" // Height 2: setup done
+ "mov x11, #0x0\n"
+ "48:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w10, [x20, x11, LSL #0x2]\n"
+ "tbz %x[flags], #3, 49f\n"
+ "ldr x20, [%x[input_ptr], x11, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x9, [x20, #0x0]\n"
+ "ldr x26, [x20, #0x8]\n"
+ "cbnz x11, 50f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x9, x9, x19\n"
+ "add x26, x26, x19\n"
+ "b 50f\n"
+ "49:" // Height 2: setup direct input
+ "mov x9, %x[input_ptr]\n"
+ "add x26, x9, x19\n"
+ "50:" // Height 2: input setup done
+ "cmp x10, #0x10\n"
+ "blt 53f\n"
+ "ldr q0, [x9, #0x0]\n"
+ "ldr q1, [x26, #0x0]\n"
+ "cmp x10, #0x20\n"
+ "ldr q6, [x13, #0x0]\n"
+ "blt 52f\n"
+ "51:" // Height 2: Multiply loop: Main loop head
+ ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
+ "ldr d7, [x13, #0x10]\n"
+ "add x9, x9, #0x10\n"
+ "ldr x19, [x13, #0x18]\n"
+ "add x26, x26, #0x10\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
+ "sub x10, x10, #0x10\n"
+ "ldr d6, [x13, #0x20]\n"
+ "cmp x10, #0x20\n"
+ "mov v7.d[1], x19\n"
+ "ldr x28, [x13, #0x28]\n"
+ "ldr x19, [x13, #0x38]\n"
+ ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
+ "mov v6.d[1], x28\n"
+ "ldr d7, [x13, #0x30]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
+ "mov v7.d[1], x19\n"
+ "ldr d6, [x13, #0x40]\n"
+ "ldr x28, [x13, #0x48]\n"
+ ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
+ "ldr d7, [x13, #0x50]\n"
+ "mov v6.d[1], x28\n"
+ "ldr x19, [x13, #0x58]\n"
+ "ldr x28, [x13, #0x68]\n"
+ ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n"
+ "mov v7.d[1], x19\n"
+ "ldr d6, [x13, #0x60]\n"
+ "ldr x19, [x13, #0x78]\n"
+ ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n"
+ "mov v6.d[1], x28\n"
+ "ldr d7, [x13, #0x70]\n"
+ "ldr x28, [x13, #0x88]\n"
+ ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n"
+ "mov v7.d[1], x19\n"
+ "ldr d6, [x13, #0x80]\n"
+ "ldr x19, [x13, #0x98]\n"
+ ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n"
+ "mov v6.d[1], x28\n"
+ "ldr d7, [x13, #0x90]\n"
+ "ldr x28, [x13, #0xa8]\n"
+ ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n"
+ "mov v7.d[1], x19\n"
+ "ldr d6, [x13, #0xa0]\n"
+ "ldr x19, [x13, #0xb8]\n"
+ ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n"
+ "mov v6.d[1], x28\n"
+ "ldr d7, [x13, #0xb0]\n"
+ "ldr x28, [x13, #0xc8]\n"
+ ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n"
+ "mov v7.d[1], x19\n"
+ "ldr d6, [x13, #0xc0]\n"
+ "ldr x19, [x13, #0xd8]\n"
+ ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n"
+ "mov v6.d[1], x28\n"
+ "ldr d7, [x13, #0xd0]\n"
+ "ldr x28, [x13, #0xe8]\n"
+ ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n"
+ "mov v7.d[1], x19\n"
+ "ldr d6, [x13, #0xe0]\n"
+ "ldr x19, [x13, #0xf8]\n"
+ ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n"
+ "mov v6.d[1], x28\n"
+ "ldr d7, [x13, #0xf0]\n"
+ "ldr x27, [x9, #0x8]\n"
+ "add x13, x13, #0x100\n"
+ ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n"
+ "mov v7.d[1], x19\n"
+ "ldr x25, [x26, #0x8]\n"
+ "ldr d6, [x13, #0x0]\n"
+ ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n"
+ "ldr d0, [x9, #0x0]\n"
+ "ldr d1, [x26, #0x0]\n"
+ "ldr x28, [x13, #0x8]\n"
+ "mov v0.d[1], x27\n"
+ "mov v1.d[1], x25\n"
+ "mov v6.d[1], x28\n"
+ "bge 51b\n"
+ "52:" // Height 2: Multiply loop: Single iteration only
+ ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
+ "ldr q7, [x13, #0x10]\n"
+ "sub x10, x10, #0x10\n"
+ "ldr q6, [x13, #0x20]\n"
+ "add x9, x9, #0x10\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
+ "ldr q7, [x13, #0x30]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "ldr q6, [x13, #0x40]\n"
+ ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
+ "ldr q7, [x13, #0x50]\n"
+ ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n"
+ "ldr q6, [x13, #0x60]\n"
+ ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n"
+ "ldr q7, [x13, #0x70]\n"
+ ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n"
+ "ldr q6, [x13, #0x80]\n"
+ ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n"
+ "ldr q7, [x13, #0x90]\n"
+ ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n"
+ "ldr q6, [x13, #0xa0]\n"
+ ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n"
+ "ldr q7, [x13, #0xb0]\n"
+ ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n"
+ "ldr q6, [x13, #0xc0]\n"
+ ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n"
+ "ldr q7, [x13, #0xd0]\n"
+ ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n"
+ "ldr q6, [x13, #0xe0]\n"
+ ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n"
+ "ldr q7, [x13, #0xf0]\n"
+ "add x13, x13, #0x100\n"
+ ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n"
+ "53:" // Height 2: Multiply loop: Main loop skip
+ "cbz x10, 58f\n"
+ "cmp x10, #0x4\n"
+ "blt 55f\n"
+ "54:" // Height 2: Multiply loop: Odd block loop
+ "ldr s0, [x9], #0x4\n"
+ "sub x10, x10, #0x4\n"
+ "ldr s1, [x26], #0x4\n"
+ "cmp x10, #0x4\n"
+ "ldr q6, [x13, #0x0]\n"
+ "ldr q7, [x13, #0x10]\n"
+ ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
+ "ldr q6, [x13, #0x20]\n"
+ "ldr q7, [x13, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
+ "bge 54b\n"
+ "cbz x10, 58f\n"
+ "55:" // Height 2: Multiply loop: Skip odd blocks
+ "tbz x10, #1, 56f\n"
+ "ldr h0, [x9], #0x2\n"
+ "ldr h1, [x26], #0x2\n"
+ "tbz x10, #0, 57f\n"
+ "ld1 { v0.b }[2], [x9]\n"
+ "ld1 { v1.b }[2], [x26]\n"
+ "b 57f\n"
+ "56:" // Height 2: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x9, #0x0]\n"
+ "ldr b1, [x26, #0x0]\n"
+ "57:" // Height 2: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x13, #0x0]\n"
+ "ldr q7, [x13, #0x10]\n"
+ ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
+ "ldr q6, [x13, #0x20]\n"
+ "ldr q7, [x13, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
+ "58:" // Height 2: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x11, x11, #0x1\n"
+ "cmp x11, x19\n"
+ "bne 48b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "prfm pstl1keep, [x12, #0x0]\n"
+ "cmp x14, #0x10\n"
+ "add x23, x12, x19, LSL #2\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "bge 67f\n"
+ "tbz x14, #3, 62f\n"
+ "st1 { v8.4s }, [x12], #0x10\n"
+ "st1 { v9.4s }, [x12], #0x10\n"
+ "st1 { v12.4s }, [x23], #0x10\n"
+ "st1 { v13.4s }, [x23], #0x10\n"
+ "tbz x14, #2, 60f\n"
+ "st1 { v10.4s }, [x12], #0x10\n"
+ "st1 { v14.4s }, [x23], #0x10\n"
+ "tbz x14, #1, 59f\n"
+ "str d11, [x12], #0x8\n"
+ "str d15, [x23], #0x8\n"
+ "tbz x14, #0, 66f\n"
+ "st1 { v11.s }[2], [x12]\n"
+ "st1 { v15.s }[2], [x23]\n"
+ "b 66f\n"
+ "59:" // Height 2: Partial direct writeback: partial_1_12
+ "tbz x14, #0, 66f\n"
+ "str s11, [x12, #0x0]\n"
+ "str s15, [x23, #0x0]\n"
+ "b 66f\n"
+ "60:" // Height 2: Partial direct writeback: partial_2_8
+ "tbz x14, #1, 61f\n"
+ "str d10, [x12], #0x8\n"
+ "str d14, [x23], #0x8\n"
+ "tbz x14, #0, 66f\n"
+ "st1 { v10.s }[2], [x12]\n"
+ "st1 { v14.s }[2], [x23]\n"
+ "b 66f\n"
+ "61:" // Height 2: Partial direct writeback: partial_1_8
+ "tbz x14, #0, 66f\n"
+ "str s10, [x12, #0x0]\n"
+ "str s14, [x23, #0x0]\n"
+ "b 66f\n"
+ "62:" // Height 2: Partial direct writeback: partial_4_0
+ "tbz x14, #2, 64f\n"
+ "st1 { v8.4s }, [x12], #0x10\n"
+ "st1 { v12.4s }, [x23], #0x10\n"
+ "tbz x14, #1, 63f\n"
+ "str d9, [x12], #0x8\n"
+ "str d13, [x23], #0x8\n"
+ "tbz x14, #0, 66f\n"
+ "st1 { v9.s }[2], [x12]\n"
+ "st1 { v13.s }[2], [x23]\n"
+ "b 66f\n"
+ "63:" // Height 2: Partial direct writeback: partial_1_4
+ "tbz x14, #0, 66f\n"
+ "str s9, [x12, #0x0]\n"
+ "str s13, [x23, #0x0]\n"
+ "b 66f\n"
+ "64:" // Height 2: Partial direct writeback: partial_2_0
+ "tbz x14, #1, 65f\n"
+ "str d8, [x12], #0x8\n"
+ "str d12, [x23], #0x8\n"
+ "tbz x14, #0, 66f\n"
+ "st1 { v8.s }[2], [x12]\n"
+ "st1 { v12.s }[2], [x23]\n"
+ "b 66f\n"
+ "65:" // Height 2: Partial direct writeback: partial_1_0
+ "str s8, [x12, #0x0]\n"
+ "str s12, [x23, #0x0]\n"
+ "66:" // Height 2: Partial direct writeback: Done
+ "b 68f\n"
+ "67:" // Height 2: Full writeback
+ "str q8, [x12, #0x0]\n"
+ "str q9, [x12, #0x10]\n"
+ "str q10, [x12, #0x20]\n"
+ "str q11, [x12, #0x30]\n"
+ "add x12, x12, #0x40\n"
+ "str q12, [x23, #0x0]\n"
+ "str q13, [x23, #0x10]\n"
+ "str q14, [x23, #0x20]\n"
+ "str q15, [x23, #0x30]\n"
+ "68:" // Height 2: Writeback done
+ "subs x14, x14, #0x10\n"
+ "bgt 36b\n"
+ "b 206f\n"
+ "69:" // Height 3: entry of the three-row variant; accumulators are v8-v11 (row 0), v12-v15 (row 1), v16-v19 (row 2)
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n" // x14 = N from the args block; counter for the column loop
+ "mov x12, %x[output_ptr]\n" // x12 = output cursor for row 0
+ "ldr x13, [%x[args_ptr], %[offsetof_B_ptr]]\n" // x13 = read cursor into the B operand
+ "70:" // Height 3: Column loop (x14 is reduced by 0x10 per pass by the "subs"/"bgt 70b" pair at label 102)
+ "tbz %x[flags], #0, 80f\n" // flags bit 0 clear: do not read the output, start from zeroed accumulators
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" // x19 = output_offset; shifted left by 2 below to step from one row to the next
+ "cmp x14, #0x10\n" // flags are consumed by "bge 79f"; the two adds in between do not write flags
+ "add x23, x12, x19, LSL #2\n" // x23 = row 1 output address
+ "add x22, x23, x19, LSL #2\n" // x22 = row 2 output address
+ "bge 79f\n" // at least 16 columns left: load four full q registers per row
+ "tbz x14, #3, 74f\n" // fewer than 16 columns: decode x14 one bit at a time (8, then 4, 2, 1 elements)
+ "ld1 { v8.4s }, [x12], #0x10\n" // bit 3 set: first 8 elements of each row
+ "ld1 { v12.4s }, [x23], #0x10\n"
+ "ld1 { v16.4s }, [x22], #0x10\n"
+ "ld1 { v9.4s }, [x12], #0x10\n"
+ "ld1 { v13.4s }, [x23], #0x10\n"
+ "ld1 { v17.4s }, [x22], #0x10\n"
+ "tbz x14, #2, 72f\n"
+ "ld1 { v10.4s }, [x12], #0x10\n" // bit 2 set: 4 more elements per row
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "ld1 { v18.4s }, [x22], #0x10\n"
+ "tbz x14, #1, 71f\n"
+ "ldr d11, [x12], #0x8\n" // bit 1 set: 2 more elements per row
+ "mov x24, #0x38\n" // x24 = total bytes x12 has been post-incremented on this path (0x10*3 + 0x8); undone at label 78
+ "ldr d15, [x23], #0x8\n"
+ "ldr d19, [x22], #0x8\n"
+ "tbz x14, #0, 78f\n"
+ "ld1 { v11.s }[2], [x12]\n" // bit 0 set: one last element into lane 2, no post-increment
+ "ld1 { v15.s }[2], [x23]\n"
+ "ld1 { v19.s }[2], [x22]\n"
+ "b 78f\n"
+ "71:" // Height 3: Partial accumulate: partial_1_12
+ "mov x24, #0x30\n" // x12 was advanced by three 0x10 post-increments on this path
+ "tbz x14, #0, 78f\n"
+ "ldr s11, [x12, #0x0]\n"
+ "ldr s15, [x23, #0x0]\n"
+ "ldr s19, [x22, #0x0]\n"
+ "b 78f\n"
+ "72:" // Height 3: Partial accumulate: partial_2_8
+ "tbz x14, #1, 73f\n"
+ "ldr d10, [x12], #0x8\n"
+ "ldr d14, [x23], #0x8\n"
+ "mov x24, #0x28\n" // 0x10*2 + 0x8 bytes of post-increment on x12
+ "ldr d18, [x22], #0x8\n"
+ "tbz x14, #0, 78f\n"
+ "ld1 { v10.s }[2], [x12]\n"
+ "ld1 { v14.s }[2], [x23]\n"
+ "ld1 { v18.s }[2], [x22]\n"
+ "b 78f\n"
+ "73:" // Height 3: Partial accumulate: partial_1_8
+ "mov x24, #0x20\n" // two 0x10 post-increments on x12
+ "tbz x14, #0, 78f\n"
+ "ldr s10, [x12, #0x0]\n"
+ "ldr s14, [x23, #0x0]\n"
+ "ldr s18, [x22, #0x0]\n"
+ "b 78f\n"
+ "74:" // Height 3: Partial accumulate: partial_4_0
+ "tbz x14, #2, 76f\n"
+ "ld1 { v8.4s }, [x12], #0x10\n"
+ "ld1 { v12.4s }, [x23], #0x10\n"
+ "ld1 { v16.4s }, [x22], #0x10\n"
+ "tbz x14, #1, 75f\n"
+ "ldr d9, [x12], #0x8\n"
+ "mov x24, #0x18\n" // 0x10 + 0x8 bytes of post-increment on x12
+ "ldr d13, [x23], #0x8\n"
+ "ldr d17, [x22], #0x8\n"
+ "tbz x14, #0, 78f\n"
+ "ld1 { v9.s }[2], [x12]\n"
+ "ld1 { v13.s }[2], [x23]\n"
+ "ld1 { v17.s }[2], [x22]\n"
+ "b 78f\n"
+ "75:" // Height 3: Partial accumulate: partial_1_4
+ "mov x24, #0x10\n" // one 0x10 post-increment on x12
+ "tbz x14, #0, 78f\n"
+ "ldr s9, [x12, #0x0]\n"
+ "ldr s13, [x23, #0x0]\n"
+ "ldr s17, [x22, #0x0]\n"
+ "b 78f\n"
+ "76:" // Height 3: Partial accumulate: partial_2_0
+ "tbz x14, #1, 77f\n"
+ "ldr d8, [x12], #0x8\n"
+ "ldr d12, [x23], #0x8\n"
+ "mov x24, #0x8\n" // one 0x8 post-increment on x12
+ "ldr d16, [x22], #0x8\n"
+ "tbz x14, #0, 78f\n"
+ "ld1 { v8.s }[2], [x12]\n"
+ "ld1 { v12.s }[2], [x23]\n"
+ "ld1 { v16.s }[2], [x22]\n"
+ "b 78f\n"
+ "77:" // Height 3: Partial accumulate: partial_1_0
+ "ldr s8, [x12, #0x0]\n" // no test of bit 0 on this path: a single element per row is loaded unconditionally
+ "mov x24, #0x0\n" // x12 was not advanced on this path
+ "ldr s12, [x23, #0x0]\n"
+ "ldr s16, [x22, #0x0]\n"
+ "78:" // Height 3: Partial accumulate: Done
+ "sub x12, x12, x24\n" // rewind x12 to the start of the row-0 tile; x23/x22 are not rewound because they are recomputed from x12 before writeback
+ "b 81f\n"
+ "79:" // Height 3: full accumulate
+ "ldr q8, [x12, #0x0]\n" // row 0: 16 existing 32-bit values, loaded without moving x12
+ "ldr q9, [x12, #0x10]\n"
+ "ldr q10, [x12, #0x20]\n"
+ "ldr q11, [x12, #0x30]\n"
+ "ldr q12, [x23, #0x0]\n" // row 1
+ "ldr q13, [x23, #0x10]\n"
+ "ldr q14, [x23, #0x20]\n"
+ "ldr q15, [x23, #0x30]\n"
+ "ldr q16, [x22, #0x0]\n" // row 2
+ "ldr q17, [x22, #0x10]\n"
+ "ldr q18, [x22, #0x20]\n"
+ "ldr q19, [x22, #0x30]\n"
+ "b 81f\n"
+ "80:" // Height 3: no accumulate
+ "movi v8.4s, #0x0\n" // clear all twelve accumulators (v8-v19)
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "81:" // Height 3: setup done
+ "mov x11, #0x0\n" // x11 = index of the current string; compared against num_strings at label 92
+ "82:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" // x20 = base of the string_lengths array
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" // x19 = input_offset; used as an entry count (indirect path) or a byte stride (direct path)
+ "ldr w10, [x20, x11, LSL #0x2]\n" // x10 = string_lengths[x11] (32-bit entry); counted down in bytes by the multiply loops
+ "tbz %x[flags], #3, 83f\n" // flags bit 3 clear: direct input; set: input_ptr is a table indexed by string
+ "ldr x20, [%x[input_ptr], x11, LSL #0x3]\n" // x20 = input_ptr[x11], an array of 8-byte row pointers
+ "add x20, x20, x19, LSL #3\n" // skip input_offset entries of that array
+ "ldr x9, [x20, #0x0]\n" // x9 = row 0 input pointer
+ "ldr x26, [x20, #0x8]\n" // x26 = row 1 input pointer
+ "ldr x24, [x20, #0x10]\n" // x24 = row 2 input pointer (x24 is free again: its rewind role ended at label 78)
+ "cbnz x11, 84f\n" // only string 0 gets the initial column offset added
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x9, x9, x19\n"
+ "add x26, x26, x19\n"
+ "add x24, x24, x19\n"
+ "b 84f\n"
+ "83:" // Height 3: setup direct input
+ "mov x9, %x[input_ptr]\n" // row 0 starts at input_ptr itself
+ "add x26, x9, x19\n" // each following row is input_offset bytes further on
+ "add x24, x26, x19\n"
+ "84:" // Height 3: input setup done
+ "cmp x10, #0x10\n"
+ "blt 87f\n" // fewer than 16 bytes in this string: skip the 16-byte loops entirely
+ "ldr q0, [x9, #0x0]\n" // preload 16 bytes of row 0 ...
+ "ldr q1, [x26, #0x0]\n" // ... row 1 ...
+ "cmp x10, #0x20\n" // flags are consumed by "blt 86f"; the loads in between do not write flags
+ "ldr q2, [x24, #0x0]\n" // ... and row 2
+ "ldr q6, [x13, #0x0]\n" // preload the first 16 bytes of B
+ "blt 86f\n" // fewer than 32 bytes: only the single final iteration runs
+ "85:" // Height 3: Multiply loop: Main loop head. One pass consumes 16 bytes of each input row (v0/v1/v2) and 0x100 bytes of B.
+ ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
+ "ldr d7, [x13, #0x10]\n" // B vectors are fetched as two 64-bit halves: "ldr d" for the low half, "ldr x" + "mov v.d[1]" for the high half. NOTE(review): presumably scheduling for an in-order core (commit mentions A55) - confirm
+ "ldr x19, [x13, #0x18]\n"
+ "add x9, x9, #0x10\n" // advance the three input row pointers; v0/v1/v2 already hold the current 16 bytes
+ "ldr d6, [x13, #0x20]\n"
+ "add x26, x26, #0x10\n"
+ "ldr x28, [x13, #0x28]\n"
+ "add x24, x24, #0x10\n"
+ "mov v7.d[1], x19\n"
+ "prfm pldl1keep, [x9, #0x80]\n" // prefetch row 0 input 0x80 bytes ahead of the advanced pointer
+ "ldr x19, [x13, #0x38]\n"
+ "sub x10, x10, #0x10\n" // 16 input bytes per row accounted for
+ "mov v6.d[1], x28\n"
+ ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
+ "ldr d7, [x13, #0x30]\n"
+ "prfm pldl1keep, [x26, #0x80]\n" // prefetch row 1 input
+ "cmp x10, #0x20\n" // loop condition, tested by "bge 85b" at the bottom; nothing in between writes the flags
+ "ldr d6, [x13, #0x40]\n"
+ "mov v7.d[1], x19\n"
+ "ldr x28, [x13, #0x48]\n"
+ "ldr x19, [x13, #0x58]\n"
+ ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
+ "mov v6.d[1], x28\n"
+ ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
+ "ldr d7, [x13, #0x50]\n"
+ ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n"
+ "prfm pldl1keep, [x24, #0x80]\n" // prefetch row 2 input
+ "mov v7.d[1], x19\n"
+ "ldr d6, [x13, #0x60]\n"
+ "ldr x28, [x13, #0x68]\n"
+ ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n"
+ "ldr d7, [x13, #0x70]\n"
+ "mov v6.d[1], x28\n"
+ "ldr x19, [x13, #0x78]\n"
+ "ldr x28, [x13, #0x88]\n"
+ ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n"
+ "mov v7.d[1], x19\n"
+ ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n"
+ "ldr d6, [x13, #0x80]\n"
+ ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n"
+ "ldr d7, [x13, #0x90]\n"
+ "mov v6.d[1], x28\n"
+ "ldr x19, [x13, #0x98]\n"
+ "ldr x28, [x13, #0xa8]\n"
+ ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n"
+ "mov v7.d[1], x19\n"
+ ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n"
+ "ldr d6, [x13, #0xa0]\n"
+ ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n"
+ "ldr d7, [x13, #0xb0]\n"
+ "mov v6.d[1], x28\n"
+ "ldr x19, [x13, #0xb8]\n"
+ "ldr x28, [x13, #0xc8]\n"
+ ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n"
+ "mov v7.d[1], x19\n"
+ ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n"
+ "ldr d6, [x13, #0xc0]\n"
+ ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n"
+ "ldr d7, [x13, #0xd0]\n"
+ "mov v6.d[1], x28\n"
+ "ldr x19, [x13, #0xd8]\n"
+ "ldr x28, [x13, #0xe8]\n"
+ ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n"
+ "mov v7.d[1], x19\n"
+ ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n"
+ "ldr d6, [x13, #0xe0]\n"
+ ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n"
+ "ldr d7, [x13, #0xf0]\n"
+ "mov v6.d[1], x28\n"
+ "ldr x19, [x13, #0xf8]\n"
+ "ldr x27, [x9, #0x8]\n" // start fetching the next pass's input: high half of row 0 (x9 was already advanced above)
+ "add x13, x13, #0x100\n" // B cursor moves past the 0x100 bytes used by this pass
+ ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n"
+ "mov v7.d[1], x19\n"
+ ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n"
+ "ldr x25, [x26, #0x8]\n" // high half of row 1 for the next pass
+ ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n" // last use of v0/v1/v2 in this pass; they are overwritten below
+ "ldr d0, [x9, #0x0]\n" // low halves of the next 16 bytes of each row
+ "ldr d1, [x26, #0x0]\n"
+ "ldr d2, [x24, #0x0]\n"
+ "mov v0.d[1], x27\n"
+ "ldr x21, [x24, #0x8]\n" // high half of row 2
+ "mov v1.d[1], x25\n"
+ "ldr d6, [x13, #0x0]\n" // first B vector of the next pass (x13 already points at it)
+ "ldr x28, [x13, #0x8]\n"
+ "mov v2.d[1], x21\n"
+ "mov v6.d[1], x28\n"
+ "bge 85b\n" // repeat while at least 0x20 bytes remain, per the "cmp x10, #0x20" above
+ "86:" // Height 3: Multiply loop: Single iteration only. Final 16-byte step: same udot sequence as the main loop, but B is read with whole 128-bit loads and no input is fetched for a following pass.
+ ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
+ "ldr q7, [x13, #0x10]\n"
+ "ldr q6, [x13, #0x20]\n"
+ "sub x10, x10, #0x10\n" // what is left in x10 afterwards (0..15 bytes) is handled from label 87 on
+ "add x9, x9, #0x10\n" // step the row pointers past the 16 bytes held in v0/v1/v2
+ "add x26, x26, #0x10\n"
+ ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
+ "add x24, x24, #0x10\n"
+ "ldr q7, [x13, #0x30]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "ldr q6, [x13, #0x40]\n"
+ ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
+ "ldr q7, [x13, #0x50]\n"
+ ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n"
+ "ldr q6, [x13, #0x60]\n"
+ "ldr q7, [x13, #0x70]\n"
+ ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n"
+ "ldr q6, [x13, #0x80]\n"
+ "ldr q7, [x13, #0x90]\n"
+ ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n"
+ "ldr q6, [x13, #0xa0]\n"
+ "ldr q7, [x13, #0xb0]\n"
+ ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n"
+ "ldr q6, [x13, #0xc0]\n"
+ "ldr q7, [x13, #0xd0]\n"
+ ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n"
+ "ldr q6, [x13, #0xe0]\n"
+ "ldr q7, [x13, #0xf0]\n"
+ "add x13, x13, #0x100\n" // B cursor moves past the 0x100 bytes consumed here
+ ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n"
+ "87:" // Height 3: Multiply loop: Main loop skip
+ "cbz x10, 92f\n" // no input bytes left for this string
+ "cmp x10, #0x4\n"
+ "blt 89f\n" // 1..3 bytes left: go to the ragged read
+ "88:" // Height 3: Multiply loop: Odd block loop (4 input bytes per row and 0x40 bytes of B per pass)
+ "ldr s0, [x9], #0x4\n" // 4 bytes of row 0 into lane 0, post-incrementing the row pointer
+ "sub x10, x10, #0x4\n"
+ "ldr s1, [x26], #0x4\n"
+ "cmp x10, #0x4\n" // loop condition, tested by "bge 88b" below; nothing in between writes the flags
+ "ldr s2, [x24], #0x4\n"
+ "ldr q6, [x13, #0x0]\n"
+ "ldr q7, [x13, #0x10]\n"
+ ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
+ "ldr q6, [x13, #0x20]\n"
+ "ldr q7, [x13, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
+ "bge 88b\n"
+ "cbz x10, 92f\n" // length was a multiple of 4: nothing ragged to do
+ "89:" // Height 3: Multiply loop: Skip odd blocks (x10 is 1, 2 or 3 on every path that reaches here)
+ "tbz x10, #1, 90f\n" // bit 1 clear means exactly one byte left
+ "ldr h0, [x9], #0x2\n" // two bytes per row; the scalar load zeroes the remaining bits of the vector register
+ "ldr h1, [x26], #0x2\n"
+ "ldr h2, [x24], #0x2\n"
+ "tbz x10, #0, 91f\n"
+ "ld1 { v0.b }[2], [x9]\n" // third byte goes into byte lane 2; byte 3 keeps the zero left by the "ldr h"
+ "ld1 { v1.b }[2], [x26]\n"
+ "ld1 { v2.b }[2], [x24]\n"
+ "b 91f\n"
+ "90:" // Height 3: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x9, #0x0]\n" // single byte per row; the remaining bits of the register are zeroed by the load
+ "ldr b1, [x26, #0x0]\n"
+ "ldr b2, [x24, #0x0]\n"
+ "91:" // Height 3: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x13, #0x0]\n" // one more 0x40-byte block of B is consumed for the partial group
+ "ldr q7, [x13, #0x10]\n"
+ ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
+ "ldr q6, [x13, #0x20]\n"
+ "ldr q7, [x13, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
+ "92:" // Height 3: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" // x19 = num_strings (32-bit field)
+ "add x11, x11, #0x1\n"
+ "cmp x11, x19\n"
+ "bne 82b\n" // more strings to accumulate into the same tile
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" // x19 = output_offset, reloaded because x19 was reused as scratch
+ "prfm pstl1keep, [x12, #0x0]\n" // prefetch-for-store on each output row
+ "cmp x14, #0x10\n" // flags are consumed by "bge 101f"; the adds and prefetches in between do not write flags
+ "add x23, x12, x19, LSL #2\n" // x23 = row 1 output address, recomputed from x12
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "add x22, x23, x19, LSL #2\n" // x22 = row 2 output address
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "bge 101f\n" // at least 16 columns left: store the full tile
+ "tbz x14, #3, 96f\n" // partial tile: decode x14 one bit at a time (8, then 4, 2, 1 elements), mirroring the partial accumulate loads
+ "st1 { v8.4s }, [x12], #0x10\n"
+ "st1 { v9.4s }, [x12], #0x10\n"
+ "st1 { v12.4s }, [x23], #0x10\n"
+ "st1 { v13.4s }, [x23], #0x10\n"
+ "st1 { v16.4s }, [x22], #0x10\n"
+ "st1 { v17.4s }, [x22], #0x10\n"
+ "tbz x14, #2, 94f\n"
+ "st1 { v10.4s }, [x12], #0x10\n"
+ "st1 { v14.4s }, [x23], #0x10\n"
+ "st1 { v18.4s }, [x22], #0x10\n"
+ "tbz x14, #1, 93f\n"
+ "str d11, [x12], #0x8\n"
+ "str d15, [x23], #0x8\n"
+ "str d19, [x22], #0x8\n"
+ "tbz x14, #0, 100f\n"
+ "st1 { v11.s }[2], [x12]\n"
+ "st1 { v15.s }[2], [x23]\n"
+ "st1 { v19.s }[2], [x22]\n"
+ "b 100f\n"
+ "93:" // Height 3: Partial direct writeback: partial_1_12
+ "tbz x14, #0, 100f\n"
+ "str s11, [x12, #0x0]\n"
+ "str s15, [x23, #0x0]\n"
+ "str s19, [x22, #0x0]\n"
+ "b 100f\n"
+ "94:" // Height 3: Partial direct writeback: partial_2_8
+ "tbz x14, #1, 95f\n"
+ "str d10, [x12], #0x8\n"
+ "str d14, [x23], #0x8\n"
+ "str d18, [x22], #0x8\n"
+ "tbz x14, #0, 100f\n"
+ "st1 { v10.s }[2], [x12]\n"
+ "st1 { v14.s }[2], [x23]\n"
+ "st1 { v18.s }[2], [x22]\n"
+ "b 100f\n"
+ "95:" // Height 3: Partial direct writeback: partial_1_8
+ "tbz x14, #0, 100f\n"
+ "str s10, [x12, #0x0]\n"
+ "str s14, [x23, #0x0]\n"
+ "str s18, [x22, #0x0]\n"
+ "b 100f\n"
+ "96:" // Height 3: Partial direct writeback: partial_4_0
+ "tbz x14, #2, 98f\n"
+ "st1 { v8.4s }, [x12], #0x10\n"
+ "st1 { v12.4s }, [x23], #0x10\n"
+ "st1 { v16.4s }, [x22], #0x10\n"
+ "tbz x14, #1, 97f\n"
+ "str d9, [x12], #0x8\n"
+ "str d13, [x23], #0x8\n"
+ "str d17, [x22], #0x8\n"
+ "tbz x14, #0, 100f\n"
+ "st1 { v9.s }[2], [x12]\n"
+ "st1 { v13.s }[2], [x23]\n"
+ "st1 { v17.s }[2], [x22]\n"
+ "b 100f\n"
+ "97:" // Height 3: Partial direct writeback: partial_1_4
+ "tbz x14, #0, 100f\n"
+ "str s9, [x12, #0x0]\n"
+ "str s13, [x23, #0x0]\n"
+ "str s17, [x22, #0x0]\n"
+ "b 100f\n"
+ "98:" // Height 3: Partial direct writeback: partial_2_0
+ "tbz x14, #1, 99f\n"
+ "str d8, [x12], #0x8\n"
+ "str d12, [x23], #0x8\n"
+ "str d16, [x22], #0x8\n"
+ "tbz x14, #0, 100f\n"
+ "st1 { v8.s }[2], [x12]\n"
+ "st1 { v12.s }[2], [x23]\n"
+ "st1 { v16.s }[2], [x22]\n"
+ "b 100f\n"
+ "99:" // Height 3: Partial direct writeback: partial_1_0
+ "str s8, [x12, #0x0]\n" // no test of bit 0 on this path: a single element per row is stored unconditionally
+ "str s12, [x23, #0x0]\n"
+ "str s16, [x22, #0x0]\n"
+ "100:" // Height 3: Partial direct writeback: Done
+ "b 102f\n" // a partial tile is always the last one: x14 < 0x10 here, so the "subs" at 102 leaves it non-positive
+ "101:" // Height 3: Full writeback
+ "str q8, [x12, #0x0]\n" // row 0: 16 accumulated 32-bit results
+ "str q9, [x12, #0x10]\n"
+ "str q10, [x12, #0x20]\n"
+ "str q11, [x12, #0x30]\n"
+ "add x12, x12, #0x40\n" // move the row-0 cursor to the next tile of 16 columns; x23/x22 are derived from it again on the next pass
+ "str q12, [x23, #0x0]\n" // row 1
+ "str q13, [x23, #0x10]\n"
+ "str q14, [x23, #0x20]\n"
+ "str q15, [x23, #0x30]\n"
+ "str q16, [x22, #0x0]\n" // row 2
+ "str q17, [x22, #0x10]\n"
+ "str q18, [x22, #0x20]\n"
+ "str q19, [x22, #0x30]\n"
+ "102:" // Height 3: Writeback done
+ "subs x14, x14, #0x10\n" // 16 columns finished
+ "bgt 70b\n" // columns remain: run the column loop again
+ "b 206f\n" // otherwise leave via label 206, which lies outside this section
+ "103:" // Height 4
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x12, %x[output_ptr]\n"
+ "ldr x13, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "104:" // Height 4: Column loop
+ "tbz %x[flags], #0, 114f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x14, #0x10\n"
+ "add x23, x12, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "add x21, x22, x19, LSL #2\n"
+ "bge 113f\n"
+ "tbz x14, #3, 108f\n"
+ "ld1 { v8.4s }, [x12], #0x10\n"
+ "ld1 { v12.4s }, [x23], #0x10\n"
+ "ld1 { v16.4s }, [x22], #0x10\n"
+ "ld1 { v9.4s }, [x12], #0x10\n"
+ "ld1 { v13.4s }, [x23], #0x10\n"
+ "ld1 { v17.4s }, [x22], #0x10\n"
+ "ld1 { v20.4s }, [x21], #0x10\n"
+ "ld1 { v21.4s }, [x21], #0x10\n"
+ "tbz x14, #2, 106f\n"
+ "ld1 { v10.4s }, [x12], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "ld1 { v18.4s }, [x22], #0x10\n"
+ "ld1 { v22.4s }, [x21], #0x10\n"
+ "tbz x14, #1, 105f\n"
+ "ldr d11, [x12], #0x8\n"
+ "mov x24, #0x38\n"
+ "ldr d15, [x23], #0x8\n"
+ "ldr d19, [x22], #0x8\n"
+ "ldr d23, [x21], #0x8\n"
+ "tbz x14, #0, 112f\n"
+ "ld1 { v11.s }[2], [x12]\n"
+ "ld1 { v15.s }[2], [x23]\n"
+ "ld1 { v19.s }[2], [x22]\n"
+ "ld1 { v23.s }[2], [x21]\n"
+ "b 112f\n"
+ "105:" // Height 4: Partial accumulate: partial_1_12
+ "mov x24, #0x30\n"
+ "tbz x14, #0, 112f\n"
+ "ldr s11, [x12, #0x0]\n"
+ "ldr s15, [x23, #0x0]\n"
+ "ldr s19, [x22, #0x0]\n"
+ "ldr s23, [x21, #0x0]\n"
+ "b 112f\n"
+ "106:" // Height 4: Partial accumulate: partial_2_8
+ "tbz x14, #1, 107f\n"
+ "ldr d10, [x12], #0x8\n"
+ "ldr d14, [x23], #0x8\n"
+ "mov x24, #0x28\n"
+ "ldr d18, [x22], #0x8\n"
+ "ldr d22, [x21], #0x8\n"
+ "tbz x14, #0, 112f\n"
+ "ld1 { v10.s }[2], [x12]\n"
+ "ld1 { v14.s }[2], [x23]\n"
+ "ld1 { v18.s }[2], [x22]\n"
+ "ld1 { v22.s }[2], [x21]\n"
+ "b 112f\n"
+ "107:" // Height 4: Partial accumulate: partial_1_8
+ "mov x24, #0x20\n"
+ "tbz x14, #0, 112f\n"
+ "ldr s10, [x12, #0x0]\n"
+ "ldr s14, [x23, #0x0]\n"
+ "ldr s18, [x22, #0x0]\n"
+ "ldr s22, [x21, #0x0]\n"
+ "b 112f\n"
+ "108:" // Height 4: Partial accumulate: partial_4_0
+ "tbz x14, #2, 110f\n"
+ "ld1 { v8.4s }, [x12], #0x10\n"
+ "ld1 { v12.4s }, [x23], #0x10\n"
+ "ld1 { v16.4s }, [x22], #0x10\n"
+ "ld1 { v20.4s }, [x21], #0x10\n"
+ "tbz x14, #1, 109f\n"
+ "ldr d9, [x12], #0x8\n"
+ "mov x24, #0x18\n"
+ "ldr d13, [x23], #0x8\n"
+ "ldr d17, [x22], #0x8\n"
+ "ldr d21, [x21], #0x8\n"
+ "tbz x14, #0, 112f\n"
+ "ld1 { v9.s }[2], [x12]\n"
+ "ld1 { v13.s }[2], [x23]\n"
+ "ld1 { v17.s }[2], [x22]\n"
+ "ld1 { v21.s }[2], [x21]\n"
+ "b 112f\n"
+ "109:" // Height 4: Partial accumulate: partial_1_4
+ "mov x24, #0x10\n"
+ "tbz x14, #0, 112f\n"
+ "ldr s9, [x12, #0x0]\n"
+ "ldr s13, [x23, #0x0]\n"
+ "ldr s17, [x22, #0x0]\n"
+ "ldr s21, [x21, #0x0]\n"
+ "b 112f\n"
+ "110:" // Height 4: Partial accumulate: partial_2_0
+ "tbz x14, #1, 111f\n"
+ "ldr d8, [x12], #0x8\n"
+ "ldr d12, [x23], #0x8\n"
+ "mov x24, #0x8\n"
+ "ldr d16, [x22], #0x8\n"
+ "ldr d20, [x21], #0x8\n"
+ "tbz x14, #0, 112f\n"
+ "ld1 { v8.s }[2], [x12]\n"
+ "ld1 { v12.s }[2], [x23]\n"
+ "ld1 { v16.s }[2], [x22]\n"
+ "ld1 { v20.s }[2], [x21]\n"
+ "b 112f\n"
+ "111:" // Height 4: Partial accumulate: partial_1_0
+ "ldr s8, [x12, #0x0]\n"
+ "mov x24, #0x0\n"
+ "ldr s12, [x23, #0x0]\n"
+ "ldr s16, [x22, #0x0]\n"
+ "ldr s20, [x21, #0x0]\n"
+ "112:" // Height 4: Partial accumulate: Done
+ "sub x12, x12, x24\n"
+ "b 115f\n"
+ "113:" // Height 4: full accumulate
+ "ldr q8, [x12, #0x0]\n"
+ "ldr q9, [x12, #0x10]\n"
+ "ldr q10, [x12, #0x20]\n"
+ "ldr q11, [x12, #0x30]\n"
+ "ldr q12, [x23, #0x0]\n"
+ "ldr q13, [x23, #0x10]\n"
+ "ldr q14, [x23, #0x20]\n"
+ "ldr q15, [x23, #0x30]\n"
+ "ldr q16, [x22, #0x0]\n"
+ "ldr q17, [x22, #0x10]\n"
+ "ldr q18, [x22, #0x20]\n"
+ "ldr q19, [x22, #0x30]\n"
+ "ldr q20, [x21, #0x0]\n"
+ "ldr q21, [x21, #0x10]\n"
+ "ldr q22, [x21, #0x20]\n"
+ "ldr q23, [x21, #0x30]\n"
+ "b 115f\n"
+ "114:" // Height 4: no accumulate
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "115:" // Height 4: setup done
+ "mov x11, #0x0\n"
+ "116:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w10, [x20, x11, LSL #0x2]\n"
+ "tbz %x[flags], #3, 117f\n"
+ "ldr x20, [%x[input_ptr], x11, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x9, [x20, #0x0]\n"
+ "ldr x26, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "cbnz x11, 118f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x9, x9, x19\n"
+ "add x26, x26, x19\n"
+ "add x24, x24, x19\n"
+ "add x23, x23, x19\n"
+ "b 118f\n"
+ "117:" // Height 4: setup direct input
+ "mov x9, %x[input_ptr]\n"
+ "add x26, x9, x19\n"
+ "add x24, x26, x19\n"
+ "add x23, x24, x19\n"
+ "118:" // Height 4: input setup done
+ "cmp x10, #0x10\n"
+ "blt 121f\n"
+ "ldr q0, [x9, #0x0]\n"
+ "ldr q1, [x26, #0x0]\n"
+ "cmp x10, #0x20\n"
+ "ldr q2, [x24, #0x0]\n"
+ "ldr q3, [x23, #0x0]\n"
+ "ldr q6, [x13, #0x0]\n"
+ "blt 120f\n"
+ "119:" // Height 4: Multiply loop: Main loop head
+ ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
+ "ldr d7, [x13, #0x10]\n"
+ "add x9, x9, #0x10\n"
+ "ldr x19, [x13, #0x18]\n"
+ "add x26, x26, #0x10\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
+ "add x24, x24, #0x10\n"
+ "ldr d6, [x13, #0x20]\n"
+ "add x23, x23, #0x10\n"
+ "mov v7.d[1], x19\n"
+ "ldr x28, [x13, #0x28]\n"
+ "ldr x19, [x13, #0x38]\n"
+ "sub x10, x10, #0x10\n"
+ ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
+ "mov v6.d[1], x28\n"
+ ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
+ "ldr d7, [x13, #0x30]\n"
+ ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
+ "mov v7.d[1], x19\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "ldr d6, [x13, #0x40]\n"
+ "cmp x10, #0x20\n"
+ ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
+ "ldr x28, [x13, #0x48]\n"
+ "ldr d7, [x13, #0x50]\n"
+ "ldr x19, [x13, #0x58]\n"
+ "mov v6.d[1], x28\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "ldr x28, [x13, #0x68]\n"
+ ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e0d4 // udot v20.4s, v6.16b, v3.4b[1]\n"
+ "mov v7.d[1], x19\n"
+ "ldr d6, [x13, #0x60]\n"
+ "ldr x19, [x13, #0x78]\n"
+ ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n"
+ "mov v6.d[1], x28\n"
+ ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e0f5 // udot v21.4s, v7.16b, v3.4b[1]\n"
+ "ldr d7, [x13, #0x70]\n"
+ ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e0d6 // udot v22.4s, v6.16b, v3.4b[1]\n"
+ "mov v7.d[1], x19\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "ldr d6, [x13, #0x80]\n"
+ ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e0f7 // udot v23.4s, v7.16b, v3.4b[1]\n"
+ "ldr x28, [x13, #0x88]\n"
+ "ldr d7, [x13, #0x90]\n"
+ "ldr x19, [x13, #0x98]\n"
+ "mov v6.d[1], x28\n"
+ "ldr x28, [x13, #0xa8]\n"
+ "ldr x27, [x9, #0x8]\n"
+ ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x6f83e8d4 // udot v20.4s, v6.16b, v3.4b[2]\n"
+ "mov v7.d[1], x19\n"
+ "ldr d6, [x13, #0xa0]\n"
+ "ldr x19, [x13, #0xb8]\n"
+ ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n"
+ "mov v6.d[1], x28\n"
+ ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x6f83e8f5 // udot v21.4s, v7.16b, v3.4b[2]\n"
+ "ldr d7, [x13, #0xb0]\n"
+ ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x6f83e8d6 // udot v22.4s, v6.16b, v3.4b[2]\n"
+ "mov v7.d[1], x19\n"
+ "ldr d6, [x13, #0xc0]\n"
+ "ldr x28, [x13, #0xc8]\n"
+ ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x6f83e8f7 // udot v23.4s, v7.16b, v3.4b[2]\n"
+ "mov v6.d[1], x28\n"
+ "ldr d7, [x13, #0xd0]\n"
+ "ldr x19, [x13, #0xd8]\n"
+ ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e8d4 // udot v20.4s, v6.16b, v3.4b[3]\n"
+ "mov v7.d[1], x19\n"
+ "ldr d6, [x13, #0xe0]\n"
+ "ldr x28, [x13, #0xe8]\n"
+ ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e8f5 // udot v21.4s, v7.16b, v3.4b[3]\n"
+ "mov v6.d[1], x28\n"
+ "ldr d7, [x13, #0xf0]\n"
+ "ldr x19, [x13, #0xf8]\n"
+ "add x13, x13, #0x100\n"
+ ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e8d6 // udot v22.4s, v6.16b, v3.4b[3]\n"
+ "mov v7.d[1], x19\n"
+ "ldr d6, [x13, #0x0]\n"
+ "ldr x28, [x13, #0x8]\n"
+ ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e8f7 // udot v23.4s, v7.16b, v3.4b[3]\n"
+ "mov v6.d[1], x28\n"
+ "ldr d0, [x9, #0x0]\n"
+ "ldr d1, [x26, #0x0]\n"
+ "ldr x25, [x26, #0x8]\n"
+ "mov v0.d[1], x27\n"
+ "ldr d2, [x24, #0x0]\n"
+ "ldr x21, [x24, #0x8]\n"
+ "mov v1.d[1], x25\n"
+ "ldr d3, [x23, #0x0]\n"
+ "ldr x19, [x23, #0x8]\n"
+ "mov v2.d[1], x21\n"
+ "mov v3.d[1], x19\n"
+ "bge 119b\n"
+ "120:" // Height 4: Multiply loop: Single iteration only
+ ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
+ "ldr q7, [x13, #0x10]\n"
+ "sub x10, x10, #0x10\n"
+ "ldr q6, [x13, #0x20]\n"
+ "add x9, x9, #0x10\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
+ "ldr q7, [x13, #0x30]\n"
+ "add x24, x24, #0x10\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x23, x23, #0x10\n"
+ "ldr q6, [x13, #0x40]\n"
+ ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e0d4 // udot v20.4s, v6.16b, v3.4b[1]\n"
+ "ldr q7, [x13, #0x50]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "ldr q6, [x13, #0x60]\n"
+ ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e0f5 // udot v21.4s, v7.16b, v3.4b[1]\n"
+ ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e0d6 // udot v22.4s, v6.16b, v3.4b[1]\n"
+ "ldr q7, [x13, #0x70]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "ldr q6, [x13, #0x80]\n"
+ ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e0f7 // udot v23.4s, v7.16b, v3.4b[1]\n"
+ ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x6f83e8d4 // udot v20.4s, v6.16b, v3.4b[2]\n"
+ "ldr q7, [x13, #0x90]\n"
+ "ldr q6, [x13, #0xa0]\n"
+ ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x6f83e8f5 // udot v21.4s, v7.16b, v3.4b[2]\n"
+ ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x6f83e8d6 // udot v22.4s, v6.16b, v3.4b[2]\n"
+ "ldr q7, [x13, #0xb0]\n"
+ "ldr q6, [x13, #0xc0]\n"
+ ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x6f83e8f7 // udot v23.4s, v7.16b, v3.4b[2]\n"
+ ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e8d4 // udot v20.4s, v6.16b, v3.4b[3]\n"
+ "ldr q7, [x13, #0xd0]\n"
+ "ldr q6, [x13, #0xe0]\n"
+ ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e8f5 // udot v21.4s, v7.16b, v3.4b[3]\n"
+ ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e8d6 // udot v22.4s, v6.16b, v3.4b[3]\n"
+ "ldr q7, [x13, #0xf0]\n"
+ "add x13, x13, #0x100\n"
+ ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e8f7 // udot v23.4s, v7.16b, v3.4b[3]\n"
+ "121:" // Height 4: Multiply loop: Main loop skip
+ "cbz x10, 126f\n"
+ "cmp x10, #0x4\n"
+ "blt 123f\n"
+ "122:" // Height 4: Multiply loop: Odd block loop
+ "ldr s0, [x9], #0x4\n"
+ "sub x10, x10, #0x4\n"
+ "ldr s1, [x26], #0x4\n"
+ "cmp x10, #0x4\n"
+ "ldr s2, [x24], #0x4\n"
+ "ldr s3, [x23], #0x4\n"
+ "ldr q6, [x13, #0x0]\n"
+ "ldr q7, [x13, #0x10]\n"
+ ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
+ "ldr q6, [x13, #0x20]\n"
+ "ldr q7, [x13, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
+ "bge 122b\n"
+ "cbz x10, 126f\n"
+ "123:" // Height 4: Multiply loop: Skip odd blocks
+ "tbz x10, #1, 124f\n"
+ "ldr h0, [x9], #0x2\n"
+ "ldr h1, [x26], #0x2\n"
+ "ldr h2, [x24], #0x2\n"
+ "ldr h3, [x23], #0x2\n"
+ "tbz x10, #0, 125f\n"
+ "ld1 { v0.b }[2], [x9]\n"
+ "ld1 { v1.b }[2], [x26]\n"
+ "ld1 { v2.b }[2], [x24]\n"
+ "ld1 { v3.b }[2], [x23]\n"
+ "b 125f\n"
+ "124:" // Height 4: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x9, #0x0]\n"
+ "ldr b1, [x26, #0x0]\n"
+ "ldr b2, [x24, #0x0]\n"
+ "ldr b3, [x23, #0x0]\n"
+ "125:" // Height 4: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x13, #0x0]\n"
+ "ldr q7, [x13, #0x10]\n"
+ ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
+ "ldr q6, [x13, #0x20]\n"
+ "ldr q7, [x13, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
+ "126:" // Height 4: Multiply loop: No odd multiplies -- one string finished: advance the string index, then write the accumulators back once every string is done
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" // w19 = num_strings field read from the args struct
+ "add x11, x11, #0x1\n" // x11 = index of the next string
+ "cmp x11, x19\n"
+ "bne 116b\n" // strings remain: branch back to label 116 (outside this excerpt; presumably the Height 4 string loop head)
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" // x19 = output_offset; scaled by 4 bytes (LSL #2) when forming row pointers below
+ "prfm pstl1keep, [x12, #0x0]\n" // prefetch-for-store the start of each of the four output rows
+ "cmp x14, #0x10\n" // flags are consumed by "bge 135f" below; the add/prfm in between do not set flags
+ "add x23, x12, x19, LSL #2\n" // x23 = row 1 pointer = x12 + output_offset * 4
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "add x22, x23, x19, LSL #2\n" // x22 = row 2 pointer
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "add x21, x22, x19, LSL #2\n" // x21 = row 3 pointer
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "bge 135f\n" // x14 >= 16: take the full 16-column writeback
+ "tbz x14, #3, 130f\n" // partial writeback: x14 (presumably the remaining column count) is decomposed bit by bit into 8/4/2/1 columns
+ "st1 { v8.4s }, [x12], #0x10\n" // bit 3 set: store columns 0..7 of every row (two 4 x 32-bit vectors per row)
+ "st1 { v9.4s }, [x12], #0x10\n"
+ "st1 { v12.4s }, [x23], #0x10\n"
+ "st1 { v13.4s }, [x23], #0x10\n"
+ "st1 { v16.4s }, [x22], #0x10\n"
+ "st1 { v17.4s }, [x22], #0x10\n"
+ "st1 { v20.4s }, [x21], #0x10\n"
+ "st1 { v21.4s }, [x21], #0x10\n"
+ "tbz x14, #2, 128f\n"
+ "st1 { v10.4s }, [x12], #0x10\n" // bit 2 set: store columns 8..11
+ "st1 { v14.4s }, [x23], #0x10\n"
+ "st1 { v18.4s }, [x22], #0x10\n"
+ "st1 { v22.4s }, [x21], #0x10\n"
+ "tbz x14, #1, 127f\n"
+ "str d11, [x12], #0x8\n" // bit 1 set: store columns 12..13 (low 64 bits of the fourth accumulator of each row)
+ "str d15, [x23], #0x8\n"
+ "str d19, [x22], #0x8\n"
+ "str d23, [x21], #0x8\n"
+ "tbz x14, #0, 134f\n"
+ "st1 { v11.s }[2], [x12]\n" // bit 0 set: store column 14 from 32-bit lane 2
+ "st1 { v15.s }[2], [x23]\n"
+ "st1 { v19.s }[2], [x22]\n"
+ "st1 { v23.s }[2], [x21]\n"
+ "b 134f\n"
+ "127:" // Height 4: Partial direct writeback: partial_1_12
+ "tbz x14, #0, 134f\n"
+ "str s11, [x12, #0x0]\n"
+ "str s15, [x23, #0x0]\n"
+ "str s19, [x22, #0x0]\n"
+ "str s23, [x21, #0x0]\n"
+ "b 134f\n"
+ "128:" // Height 4: Partial direct writeback: partial_2_8
+ "tbz x14, #1, 129f\n"
+ "str d10, [x12], #0x8\n"
+ "str d14, [x23], #0x8\n"
+ "str d18, [x22], #0x8\n"
+ "str d22, [x21], #0x8\n"
+ "tbz x14, #0, 134f\n"
+ "st1 { v10.s }[2], [x12]\n"
+ "st1 { v14.s }[2], [x23]\n"
+ "st1 { v18.s }[2], [x22]\n"
+ "st1 { v22.s }[2], [x21]\n"
+ "b 134f\n"
+ "129:" // Height 4: Partial direct writeback: partial_1_8
+ "tbz x14, #0, 134f\n"
+ "str s10, [x12, #0x0]\n"
+ "str s14, [x23, #0x0]\n"
+ "str s18, [x22, #0x0]\n"
+ "str s22, [x21, #0x0]\n"
+ "b 134f\n"
+ "130:" // Height 4: Partial direct writeback: partial_4_0
+ "tbz x14, #2, 132f\n"
+ "st1 { v8.4s }, [x12], #0x10\n"
+ "st1 { v12.4s }, [x23], #0x10\n"
+ "st1 { v16.4s }, [x22], #0x10\n"
+ "st1 { v20.4s }, [x21], #0x10\n"
+ "tbz x14, #1, 131f\n"
+ "str d9, [x12], #0x8\n"
+ "str d13, [x23], #0x8\n"
+ "str d17, [x22], #0x8\n"
+ "str d21, [x21], #0x8\n"
+ "tbz x14, #0, 134f\n"
+ "st1 { v9.s }[2], [x12]\n"
+ "st1 { v13.s }[2], [x23]\n"
+ "st1 { v17.s }[2], [x22]\n"
+ "st1 { v21.s }[2], [x21]\n"
+ "b 134f\n"
+ "131:" // Height 4: Partial direct writeback: partial_1_4
+ "tbz x14, #0, 134f\n"
+ "str s9, [x12, #0x0]\n"
+ "str s13, [x23, #0x0]\n"
+ "str s17, [x22, #0x0]\n"
+ "str s21, [x21, #0x0]\n"
+ "b 134f\n"
+ "132:" // Height 4: Partial direct writeback: partial_2_0
+ "tbz x14, #1, 133f\n"
+ "str d8, [x12], #0x8\n"
+ "str d12, [x23], #0x8\n"
+ "str d16, [x22], #0x8\n"
+ "str d20, [x21], #0x8\n"
+ "tbz x14, #0, 134f\n"
+ "st1 { v8.s }[2], [x12]\n"
+ "st1 { v12.s }[2], [x23]\n"
+ "st1 { v16.s }[2], [x22]\n"
+ "st1 { v20.s }[2], [x21]\n"
+ "b 134f\n"
+ "133:" // Height 4: Partial direct writeback: partial_1_0
+ "str s8, [x12, #0x0]\n"
+ "str s12, [x23, #0x0]\n"
+ "str s16, [x22, #0x0]\n"
+ "str s20, [x21, #0x0]\n"
+ "134:" // Height 4: Partial direct writeback: Done
+ "b 136f\n"
+ "135:" // Height 4: Full writeback -- store all 16 accumulators (4 rows x 16 columns of 32-bit values)
+ "str q8, [x12, #0x0]\n" // row 0: v8..v11 -> 64 bytes at x12
+ "str q9, [x12, #0x10]\n"
+ "str q10, [x12, #0x20]\n"
+ "str q11, [x12, #0x30]\n"
+ "add x12, x12, #0x40\n" // only x12 is advanced; x23/x22/x21 are re-derived from x12 before each writeback (see the adds after label 126)
+ "str q12, [x23, #0x0]\n" // row 1: v12..v15
+ "str q13, [x23, #0x10]\n"
+ "str q14, [x23, #0x20]\n"
+ "str q15, [x23, #0x30]\n"
+ "str q16, [x22, #0x0]\n" // row 2: v16..v19
+ "str q17, [x22, #0x10]\n"
+ "str q18, [x22, #0x20]\n"
+ "str q19, [x22, #0x30]\n"
+ "str q20, [x21, #0x0]\n" // row 3: v20..v23
+ "str q21, [x21, #0x10]\n"
+ "str q22, [x21, #0x20]\n"
+ "str q23, [x21, #0x30]\n"
+ "136:" // Height 4: Writeback done -- account for the 16 columns just processed and loop or leave
+ "subs x14, x14, #0x10\n" // x14 -= 16 and set flags for the bgt below
+ "bgt 104b\n" // x14 still positive: back to label 104 (outside this excerpt; presumably the Height 4 column loop)
+ "b 206f\n" // otherwise branch forward to label 206 (outside this excerpt; presumably the common exit)
+ "137:" // Height 5 -- entry: load the per-call state into fixed registers
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n" // x14 = N field of the args struct (tested below against 16 and bit by bit, so presumably the output column count)
+ "mov x12, %x[output_ptr]\n" // x12 = output pointer for row 0
+ "ldr x13, [%x[args_ptr], %[offsetof_B_ptr]]\n" // x13 = B_ptr field of the args struct (read as the second operand by the multiply loops)
+ "138:" // Height 5: Column loop -- initialise accumulators v8..v27 (rows 0..4 use v8-11, v12-15, v16-19, v20-23, v24-27)
+ "tbz %x[flags], #0, 148f\n" // flags bit 0 clear: do not read the output, start from zero (label 148)
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" // x19 = output_offset; scaled by 4 bytes (LSL #2) below
+ "cmp x14, #0x10\n" // flags are consumed by "bge 147f" below; the adds in between do not set flags
+ "add x23, x12, x19, LSL #2\n" // x23 = row 1 pointer = x12 + output_offset * 4
+ "add x22, x23, x19, LSL #2\n" // x22 = row 2 pointer
+ "add x21, x22, x19, LSL #2\n" // x21 = row 3 pointer
+ "add x20, x21, x19, LSL #2\n" // x20 = row 4 pointer
+ "bge 147f\n" // x14 >= 16: load a full 16-column tile (label 147)
+ "tbz x14, #3, 142f\n" // partial tile: decompose x14 bit by bit (8, 4, 2, 1 columns)
+ "ld1 { v8.4s }, [x12], #0x10\n" // bit 3 set: load columns 0..7 of every row with post-increment
+ "ld1 { v12.4s }, [x23], #0x10\n"
+ "ld1 { v16.4s }, [x22], #0x10\n"
+ "ld1 { v9.4s }, [x12], #0x10\n"
+ "ld1 { v13.4s }, [x23], #0x10\n"
+ "ld1 { v17.4s }, [x22], #0x10\n"
+ "ld1 { v20.4s }, [x21], #0x10\n"
+ "ld1 { v24.4s }, [x20], #0x10\n"
+ "ld1 { v21.4s }, [x21], #0x10\n"
+ "ld1 { v25.4s }, [x20], #0x10\n"
+ "tbz x14, #2, 140f\n"
+ "ld1 { v10.4s }, [x12], #0x10\n" // bit 2 set: load columns 8..11
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "ld1 { v18.4s }, [x22], #0x10\n"
+ "ld1 { v22.4s }, [x21], #0x10\n"
+ "ld1 { v26.4s }, [x20], #0x10\n"
+ "tbz x14, #1, 139f\n"
+ "ldr d11, [x12], #0x8\n" // bit 1 set: load columns 12..13
+ "mov x24, #0x38\n" // x24 = bytes x12 has been post-incremented by (3 * 0x10 + 0x8); subtracted again at label 146
+ "ldr d15, [x23], #0x8\n"
+ "ldr d19, [x22], #0x8\n"
+ "ldr d23, [x21], #0x8\n"
+ "ldr d27, [x20], #0x8\n"
+ "tbz x14, #0, 146f\n"
+ "ld1 { v11.s }[2], [x12]\n" // bit 0 set: load column 14 into 32-bit lane 2
+ "ld1 { v15.s }[2], [x23]\n"
+ "ld1 { v19.s }[2], [x22]\n"
+ "ld1 { v23.s }[2], [x21]\n"
+ "ld1 { v27.s }[2], [x20]\n"
+ "b 146f\n"
+ "139:" // Height 5: Partial accumulate: partial_1_12 -- 12 columns already loaded, optionally one more
+ "mov x24, #0x30\n" // x24 = bytes x12 has been advanced so far (3 * 0x10); subtracted again at label 146
+ "tbz x14, #0, 146f\n"
+ "ldr s11, [x12, #0x0]\n" // bit 0 set: load column 12 of every row (no post-increment)
+ "ldr s15, [x23, #0x0]\n"
+ "ldr s19, [x22, #0x0]\n"
+ "ldr s23, [x21, #0x0]\n"
+ "ldr s27, [x20, #0x0]\n"
+ "b 146f\n"
+ "140:" // Height 5: Partial accumulate: partial_2_8 -- 8 columns already loaded, bit 2 of x14 clear
+ "tbz x14, #1, 141f\n"
+ "ldr d10, [x12], #0x8\n" // bit 1 set: load columns 8..9
+ "ldr d14, [x23], #0x8\n"
+ "mov x24, #0x28\n" // x24 = bytes x12 has been advanced so far (2 * 0x10 + 0x8)
+ "ldr d18, [x22], #0x8\n"
+ "ldr d22, [x21], #0x8\n"
+ "ldr d26, [x20], #0x8\n"
+ "tbz x14, #0, 146f\n"
+ "ld1 { v10.s }[2], [x12]\n" // bit 0 set: load column 10 into 32-bit lane 2
+ "ld1 { v14.s }[2], [x23]\n"
+ "ld1 { v18.s }[2], [x22]\n"
+ "ld1 { v22.s }[2], [x21]\n"
+ "ld1 { v26.s }[2], [x20]\n"
+ "b 146f\n"
+ "141:" // Height 5: Partial accumulate: partial_1_8 -- 8 columns already loaded, optionally one more
+ "mov x24, #0x20\n" // x24 = bytes x12 has been advanced so far (2 * 0x10)
+ "tbz x14, #0, 146f\n"
+ "ldr s10, [x12, #0x0]\n" // bit 0 set: load column 8 of every row
+ "ldr s14, [x23, #0x0]\n"
+ "ldr s18, [x22, #0x0]\n"
+ "ldr s22, [x21, #0x0]\n"
+ "ldr s26, [x20, #0x0]\n"
+ "b 146f\n"
+ "142:" // Height 5: Partial accumulate: partial_4_0 -- bit 3 of x14 clear, nothing loaded yet
+ "tbz x14, #2, 144f\n"
+ "ld1 { v8.4s }, [x12], #0x10\n" // bit 2 set: load columns 0..3 of every row with post-increment
+ "ld1 { v12.4s }, [x23], #0x10\n"
+ "ld1 { v16.4s }, [x22], #0x10\n"
+ "ld1 { v20.4s }, [x21], #0x10\n"
+ "ld1 { v24.4s }, [x20], #0x10\n"
+ "tbz x14, #1, 143f\n"
+ "ldr d9, [x12], #0x8\n" // bit 1 set: load columns 4..5
+ "mov x24, #0x18\n" // x24 = bytes x12 has been advanced so far (0x10 + 0x8); subtracted again at label 146
+ "ldr d13, [x23], #0x8\n"
+ "ldr d17, [x22], #0x8\n"
+ "ldr d21, [x21], #0x8\n"
+ "ldr d25, [x20], #0x8\n"
+ "tbz x14, #0, 146f\n"
+ "ld1 { v9.s }[2], [x12]\n" // bit 0 set: load column 6 into 32-bit lane 2
+ "ld1 { v13.s }[2], [x23]\n"
+ "ld1 { v17.s }[2], [x22]\n"
+ "ld1 { v21.s }[2], [x21]\n"
+ "ld1 { v25.s }[2], [x20]\n"
+ "b 146f\n"
+ "143:" // Height 5: Partial accumulate: partial_1_4 -- 4 columns already loaded, optionally one more
+ "mov x24, #0x10\n" // x24 = bytes x12 has been advanced so far
+ "tbz x14, #0, 146f\n"
+ "ldr s9, [x12, #0x0]\n" // bit 0 set: load column 4 of every row
+ "ldr s13, [x23, #0x0]\n"
+ "ldr s17, [x22, #0x0]\n"
+ "ldr s21, [x21, #0x0]\n"
+ "ldr s25, [x20, #0x0]\n"
+ "b 146f\n"
+ "144:" // Height 5: Partial accumulate: partial_2_0 -- bits 3 and 2 of x14 clear, nothing loaded yet
+ "tbz x14, #1, 145f\n"
+ "ldr d8, [x12], #0x8\n" // bit 1 set: load columns 0..1
+ "ldr d12, [x23], #0x8\n"
+ "mov x24, #0x8\n" // x24 = bytes x12 has been advanced so far
+ "ldr d16, [x22], #0x8\n"
+ "ldr d20, [x21], #0x8\n"
+ "ldr d24, [x20], #0x8\n"
+ "tbz x14, #0, 146f\n"
+ "ld1 { v8.s }[2], [x12]\n" // bit 0 set: load column 2 into 32-bit lane 2
+ "ld1 { v12.s }[2], [x23]\n"
+ "ld1 { v16.s }[2], [x22]\n"
+ "ld1 { v20.s }[2], [x21]\n"
+ "ld1 { v24.s }[2], [x20]\n"
+ "b 146f\n"
+ "145:" // Height 5: Partial accumulate: partial_1_0 -- bits 3..1 clear; bit 0 is not tested here (presumably x14 == 1 on this path -- TODO confirm)
+ "ldr s8, [x12, #0x0]\n" // load column 0 of every row
+ "mov x24, #0x0\n" // x12 was not advanced on this path, so the rewind at label 146 is a no-op
+ "ldr s12, [x23, #0x0]\n"
+ "ldr s16, [x22, #0x0]\n"
+ "ldr s20, [x21, #0x0]\n"
+ "ldr s24, [x20, #0x0]\n"
+ "146:" // Height 5: Partial accumulate: Done -- every partial path joins here with x24 = bytes x12 was post-incremented by
+ "sub x12, x12, x24\n" // rewind x12 to the start of this column tile; the other row pointers (x23/x22/x21/x20) are not rewound here
+ "b 149f\n" // skip the full-accumulate and zero-init paths
+ "147:" // Height 5: full accumulate -- x14 >= 16: preload all 20 accumulators from the output rows, pointers left unchanged
+ "ldr q8, [x12, #0x0]\n" // row 0 -> v8..v11 (64 bytes at x12)
+ "ldr q9, [x12, #0x10]\n"
+ "ldr q10, [x12, #0x20]\n"
+ "ldr q11, [x12, #0x30]\n"
+ "ldr q12, [x23, #0x0]\n" // row 1 -> v12..v15
+ "ldr q13, [x23, #0x10]\n"
+ "ldr q14, [x23, #0x20]\n"
+ "ldr q15, [x23, #0x30]\n"
+ "ldr q16, [x22, #0x0]\n" // row 2 -> v16..v19
+ "ldr q17, [x22, #0x10]\n"
+ "ldr q18, [x22, #0x20]\n"
+ "ldr q19, [x22, #0x30]\n"
+ "ldr q20, [x21, #0x0]\n" // row 3 -> v20..v23
+ "ldr q21, [x21, #0x10]\n"
+ "ldr q22, [x21, #0x20]\n"
+ "ldr q23, [x21, #0x30]\n"
+ "ldr q24, [x20, #0x0]\n" // row 4 -> v24..v27
+ "ldr q25, [x20, #0x10]\n"
+ "ldr q26, [x20, #0x20]\n"
+ "ldr q27, [x20, #0x30]\n"
+ "b 149f\n" // skip the zero-init path
+ "148:" // Height 5: no accumulate -- flags bit 0 was clear: start all 20 accumulators (v8..v27) from zero
+ "movi v8.4s, #0x0\n" // row 0 accumulators v8..v11
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n" // row 1 accumulators v12..v15
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "movi v16.4s, #0x0\n" // row 2 accumulators v16..v19
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n" // row 3 accumulators v20..v23
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n" // row 4 accumulators v24..v27
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "149:" // Height 5: setup done -- accumulators are ready; start iterating over the input strings
+ "mov x11, #0x0\n" // x11 = string index, starting at 0
+ "150:" // Height 5: String loop -- per string: fetch its length and the five input row pointers
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" // x20 = string_lengths pointer from the args struct
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" // x19 = input_offset from the args struct
+ "ldr w10, [x20, x11, LSL #0x2]\n" // x10 = 32-bit entry x11 of string_lengths (counted down in steps of 16/4 bytes by the multiply loops)
+ "tbz %x[flags], #3, 151f\n" // flags bit 3 clear: direct input (label 151); set: input_ptr is a table of pointers
+ "ldr x20, [%x[input_ptr], x11, LSL #0x3]\n" // x20 = pointer-sized entry x11 of the input_ptr table
+ "add x20, x20, x19, LSL #3\n" // skip input_offset 8-byte entries within that row-pointer table
+ "ldr x9, [x20, #0x0]\n" // x9/x26/x24/x23/x22 = input pointers for rows 0..4 (x24/x23/x22 are reused; they held other values during accumulator setup)
+ "ldr x26, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "cbnz x11, 152f\n" // only the first string (x11 == 0) gets the initial column offset applied
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" // x19 = input_initial_col, added unscaled (byte offset) to every row pointer
+ "add x9, x9, x19\n"
+ "add x26, x26, x19\n"
+ "add x24, x24, x19\n"
+ "add x23, x23, x19\n"
+ "add x22, x22, x19\n"
+ "b 152f\n"
+ "151:" // Height 5: setup direct input -- rows are laid out input_offset bytes apart starting at input_ptr
+ "mov x9, %x[input_ptr]\n" // row 0
+ "add x26, x9, x19\n" // row 1 = row 0 + input_offset (unscaled add, so a byte stride)
+ "add x24, x26, x19\n" // row 2
+ "add x23, x24, x19\n" // row 3
+ "add x22, x23, x19\n" // row 4
+ "152:" // Height 5: input setup done -- pick the multiply path from the string length in x10
+ "cmp x10, #0x10\n"
+ "blt 155f\n" // fewer than 16 bytes: skip the 16-byte main loop entirely
+ "ldr q0, [x9, #0x0]\n" // preload the first 16 input bytes of each row into v0..v4
+ "ldr q1, [x26, #0x0]\n"
+ "cmp x10, #0x20\n" // flags are consumed by "blt 154f" below; the loads in between do not set flags
+ "ldr q2, [x24, #0x0]\n"
+ "ldr q3, [x23, #0x0]\n"
+ "ldr q4, [x22, #0x0]\n"
+ "ldr q6, [x13, #0x0]\n" // preload the first 16 bytes at x13 (the B operand pointer) into v6
+ "blt 154f\n" // fewer than 32 bytes: only the single-iteration tail (label 154); otherwise fall into the main loop
+ "153:" // Height 5: Multiply loop: Main loop head
+ ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n"
+ "ldr d7, [x13, #0x10]\n"
+ "ldr x19, [x13, #0x18]\n"
+ "add x9, x9, #0x10\n"
+ "ldr d6, [x13, #0x20]\n"
+ "add x26, x26, #0x10\n"
+ "ldr x28, [x13, #0x28]\n"
+ "add x24, x24, #0x10\n"
+ "mov v7.d[1], x19\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
+ "ldr x19, [x13, #0x38]\n"
+ "add x23, x23, #0x10\n"
+ "mov v6.d[1], x28\n"
+ ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n"
+ "ldr d7, [x13, #0x30]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x22, x22, #0x10\n"
+ "ldr d6, [x13, #0x40]\n"
+ "sub x10, x10, #0x10\n"
+ "mov v7.d[1], x19\n"
+ "ldr x28, [x13, #0x48]\n"
+ "ldr x19, [x13, #0x58]\n"
+ "cmp x10, #0x20\n"
+ ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
+ "mov v6.d[1], x28\n"
+ ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e0d4 // udot v20.4s, v6.16b, v3.4b[1]\n"
+ ".inst 0x6fa4e0d8 // udot v24.4s, v6.16b, v4.4b[1]\n"
+ "ldr d7, [x13, #0x50]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "ldr d6, [x13, #0x60]\n"
+ "mov v7.d[1], x19\n"
+ "ldr x28, [x13, #0x68]\n"
+ "ldr x19, [x13, #0x78]\n"
+ ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n"
+ "mov v6.d[1], x28\n"
+ ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e0f5 // udot v21.4s, v7.16b, v3.4b[1]\n"
+ ".inst 0x6fa4e0f9 // udot v25.4s, v7.16b, v4.4b[1]\n"
+ ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e0d6 // udot v22.4s, v6.16b, v3.4b[1]\n"
+ ".inst 0x6fa4e0da // udot v26.4s, v6.16b, v4.4b[1]\n"
+ "ldr d7, [x13, #0x70]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "ldr d6, [x13, #0x80]\n"
+ "mov v7.d[1], x19\n"
+ "ldr x28, [x13, #0x88]\n"
+ "ldr x19, [x13, #0x98]\n"
+ ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n"
+ "mov v6.d[1], x28\n"
+ ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e0f7 // udot v23.4s, v7.16b, v3.4b[1]\n"
+ ".inst 0x6fa4e0fb // udot v27.4s, v7.16b, v4.4b[1]\n"
+ ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x6f83e8d4 // udot v20.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x6f84e8d8 // udot v24.4s, v6.16b, v4.4b[2]\n"
+ "ldr d7, [x13, #0x90]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "ldr d6, [x13, #0xa0]\n"
+ "mov v7.d[1], x19\n"
+ "ldr x28, [x13, #0xa8]\n"
+ "ldr x19, [x13, #0xb8]\n"
+ ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n"
+ "mov v6.d[1], x28\n"
+ ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x6f83e8f5 // udot v21.4s, v7.16b, v3.4b[2]\n"
+ ".inst 0x6f84e8f9 // udot v25.4s, v7.16b, v4.4b[2]\n"
+ ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x6f83e8d6 // udot v22.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x6f84e8da // udot v26.4s, v6.16b, v4.4b[2]\n"
+ "ldr d7, [x13, #0xb0]\n"
+ "ldr d6, [x13, #0xc0]\n"
+ "ldr x28, [x13, #0xc8]\n"
+ "mov v7.d[1], x19\n"
+ "ldr x19, [x13, #0xd8]\n"
+ "ldr x27, [x9, #0x8]\n"
+ "mov v6.d[1], x28\n"
+ ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x6f83e8f7 // udot v23.4s, v7.16b, v3.4b[2]\n"
+ ".inst 0x6f84e8fb // udot v27.4s, v7.16b, v4.4b[2]\n"
+ ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e8d4 // udot v20.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x6fa4e8d8 // udot v24.4s, v6.16b, v4.4b[3]\n"
+ "ldr d7, [x13, #0xd0]\n"
+ "ldr d6, [x13, #0xe0]\n"
+ "ldr x28, [x13, #0xe8]\n"
+ "mov v7.d[1], x19\n"
+ "ldr x19, [x13, #0xf8]\n"
+ "ldr x25, [x26, #0x8]\n"
+ "mov v6.d[1], x28\n"
+ ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e8f5 // udot v21.4s, v7.16b, v3.4b[3]\n"
+ ".inst 0x6fa4e8f9 // udot v25.4s, v7.16b, v4.4b[3]\n"
+ ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e8d6 // udot v22.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x6fa4e8da // udot v26.4s, v6.16b, v4.4b[3]\n"
+ "ldr d7, [x13, #0xf0]\n"
+ "ldr x21, [x24, #0x8]\n"
+ "add x13, x13, #0x100\n"
+ "ldr d6, [x13, #0x0]\n"
+ "mov v7.d[1], x19\n"
+ "ldr x28, [x13, #0x8]\n"
+ "ldr x19, [x23, #0x8]\n"
+ ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n"
+ "mov v6.d[1], x28\n"
+ ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e8f7 // udot v23.4s, v7.16b, v3.4b[3]\n"
+ ".inst 0x6fa4e8fb // udot v27.4s, v7.16b, v4.4b[3]\n"
+ "ldr d0, [x9, #0x0]\n"
+ "ldr d1, [x26, #0x0]\n"
+ "ldr d2, [x24, #0x0]\n"
+ "mov v0.d[1], x27\n"
+ "ldr d3, [x23, #0x0]\n"
+ "mov v1.d[1], x25\n"
+ "ldr d4, [x22, #0x0]\n"
+ "mov v2.d[1], x21\n"
+ "ldr x21, [x22, #0x8]\n"
+ "mov v3.d[1], x19\n"
+ "mov v4.d[1], x21\n"
+ "bge 153b\n"
+ "154:" // Height 5: Multiply loop: Single iteration only
+ ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n"
+ "ldr q7, [x13, #0x10]\n"
+ "ldr q6, [x13, #0x20]\n"
+ "sub x10, x10, #0x10\n"
+ "add x9, x9, #0x10\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
+ "add x24, x24, #0x10\n"
+ "ldr q7, [x13, #0x30]\n"
+ "add x23, x23, #0x10\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x22, x22, #0x10\n"
+ "ldr q6, [x13, #0x40]\n"
+ ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e0d4 // udot v20.4s, v6.16b, v3.4b[1]\n"
+ ".inst 0x6fa4e0d8 // udot v24.4s, v6.16b, v4.4b[1]\n"
+ "ldr q7, [x13, #0x50]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "ldr q6, [x13, #0x60]\n"
+ ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e0f5 // udot v21.4s, v7.16b, v3.4b[1]\n"
+ ".inst 0x6fa4e0f9 // udot v25.4s, v7.16b, v4.4b[1]\n"
+ ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e0d6 // udot v22.4s, v6.16b, v3.4b[1]\n"
+ ".inst 0x6fa4e0da // udot v26.4s, v6.16b, v4.4b[1]\n"
+ "ldr q7, [x13, #0x70]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "ldr q6, [x13, #0x80]\n"
+ ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e0f7 // udot v23.4s, v7.16b, v3.4b[1]\n"
+ ".inst 0x6fa4e0fb // udot v27.4s, v7.16b, v4.4b[1]\n"
+ ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x6f83e8d4 // udot v20.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x6f84e8d8 // udot v24.4s, v6.16b, v4.4b[2]\n"
+ "ldr q7, [x13, #0x90]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "ldr q6, [x13, #0xa0]\n"
+ ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x6f83e8f5 // udot v21.4s, v7.16b, v3.4b[2]\n"
+ ".inst 0x6f84e8f9 // udot v25.4s, v7.16b, v4.4b[2]\n"
+ ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x6f83e8d6 // udot v22.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x6f84e8da // udot v26.4s, v6.16b, v4.4b[2]\n"
+ "ldr q7, [x13, #0xb0]\n"
+ "ldr q6, [x13, #0xc0]\n"
+ ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x6f83e8f7 // udot v23.4s, v7.16b, v3.4b[2]\n"
+ ".inst 0x6f84e8fb // udot v27.4s, v7.16b, v4.4b[2]\n"
+ ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e8d4 // udot v20.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x6fa4e8d8 // udot v24.4s, v6.16b, v4.4b[3]\n"
+ "ldr q7, [x13, #0xd0]\n"
+ "ldr q6, [x13, #0xe0]\n"
+ ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e8f5 // udot v21.4s, v7.16b, v3.4b[3]\n"
+ ".inst 0x6fa4e8f9 // udot v25.4s, v7.16b, v4.4b[3]\n"
+ ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e8d6 // udot v22.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x6fa4e8da // udot v26.4s, v6.16b, v4.4b[3]\n"
+ "ldr q7, [x13, #0xf0]\n"
+ "add x13, x13, #0x100\n"
+ ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e8f7 // udot v23.4s, v7.16b, v3.4b[3]\n"
+ ".inst 0x6fa4e8fb // udot v27.4s, v7.16b, v4.4b[3]\n"
+ "155:" // Height 5: Multiply loop: Main loop skip
+ "cbz x10, 160f\n"
+ "cmp x10, #0x4\n"
+ "blt 157f\n"
+ "156:" // Height 5: Multiply loop: Odd block loop
+ "ldr s0, [x9], #0x4\n"
+ "sub x10, x10, #0x4\n"
+ "ldr s1, [x26], #0x4\n"
+ "cmp x10, #0x4\n"
+ "ldr s2, [x24], #0x4\n"
+ "ldr s3, [x23], #0x4\n"
+ "ldr s4, [x22], #0x4\n"
+ "ldr q6, [x13, #0x0]\n"
+ "ldr q7, [x13, #0x10]\n"
+ ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n"
+ "ldr q6, [x13, #0x20]\n"
+ "ldr q7, [x13, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n"
+ "bge 156b\n"
+ "cbz x10, 160f\n"
+ "157:" // Height 5: Multiply loop: Skip odd blocks (reached with 1..3 input bytes left in x10)
+ "tbz x10, #1, 158f\n" // bit 1 of x10 clear: exactly one byte left, read it at 158
+ "ldr h0, [x9], #0x2\n" // bit 1 set: read 2 bytes from each of the 5 rows (post-increment); scalar ldr zeroes the rest of the register
+ "ldr h1, [x26], #0x2\n"
+ "ldr h2, [x24], #0x2\n"
+ "ldr h3, [x23], #0x2\n"
+ "ldr h4, [x22], #0x2\n"
+ "tbz x10, #0, 159f\n" // bit 0 clear: only 2 bytes were left, nothing more to read
+ "ld1 { v0.b }[2], [x9]\n" // 3 bytes left: insert the third byte of each row into byte lane 2
+ "ld1 { v1.b }[2], [x26]\n"
+ "ld1 { v2.b }[2], [x24]\n"
+ "ld1 { v3.b }[2], [x23]\n"
+ "ld1 { v4.b }[2], [x22]\n"
+ "b 159f\n"
+ "158:" // Height 5: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x9, #0x0]\n" // single remaining byte per row; pointers are not advanced here
+ "ldr b1, [x26, #0x0]\n"
+ "ldr b2, [x24, #0x0]\n"
+ "ldr b3, [x23, #0x0]\n"
+ "ldr b4, [x22, #0x0]\n"
+ "159:" // Height 5: Multiply loop: Ragged operand read: Done (v0-v4 hold the partial 4-byte group in element 0)
+ "ldr q6, [x13, #0x0]\n"
+ "ldr q7, [x13, #0x10]\n"
+ ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n"
+ "ldr q6, [x13, #0x20]\n"
+ "ldr q7, [x13, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n"
+ "160:" // Height 5: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" // w19 = num_strings field of the args struct
+ "add x11, x11, #0x1\n" // x11 = index of the next string
+ "cmp x11, x19\n"
+ "bne 150b\n" // strings remain: branch back to label 150 (defined earlier in this asm block)
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" // x19 = output_offset field; shifted left by 2 below to step from one output row to the next
+ "prfm pstl1keep, [x12, #0x0]\n" // prefetch-for-store hint on output row 0
+ "cmp x14, #0x10\n" // flags are consumed by the bge at the end of this sequence
+ "add x23, x12, x19, LSL #2\n" // x23 = output row 1
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "add x22, x23, x19, LSL #2\n" // x22 = output row 2
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "add x21, x22, x19, LSL #2\n" // x21 = output row 3
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "add x20, x21, x19, LSL #2\n" // x20 = output row 4
+ "prfm pstl1keep, [x20, #0x0]\n"
+ "bge 169f\n" // x14 >= 16: store all 16 accumulators per row at 169; otherwise fall into the bit-by-bit partial writeback
+ "tbz x14, #3, 164f\n"
+ "st1 { v8.4s }, [x12], #0x10\n"
+ "st1 { v9.4s }, [x12], #0x10\n"
+ "st1 { v12.4s }, [x23], #0x10\n"
+ "st1 { v13.4s }, [x23], #0x10\n"
+ "st1 { v16.4s }, [x22], #0x10\n"
+ "st1 { v17.4s }, [x22], #0x10\n"
+ "st1 { v20.4s }, [x21], #0x10\n"
+ "st1 { v21.4s }, [x21], #0x10\n"
+ "st1 { v24.4s }, [x20], #0x10\n"
+ "st1 { v25.4s }, [x20], #0x10\n"
+ "tbz x14, #2, 162f\n"
+ "st1 { v10.4s }, [x12], #0x10\n"
+ "st1 { v14.4s }, [x23], #0x10\n"
+ "st1 { v18.4s }, [x22], #0x10\n"
+ "st1 { v22.4s }, [x21], #0x10\n"
+ "st1 { v26.4s }, [x20], #0x10\n"
+ "tbz x14, #1, 161f\n"
+ "str d11, [x12], #0x8\n"
+ "str d15, [x23], #0x8\n"
+ "str d19, [x22], #0x8\n"
+ "str d23, [x21], #0x8\n"
+ "str d27, [x20], #0x8\n"
+ "tbz x14, #0, 168f\n"
+ "st1 { v11.s }[2], [x12]\n"
+ "st1 { v15.s }[2], [x23]\n"
+ "st1 { v19.s }[2], [x22]\n"
+ "st1 { v23.s }[2], [x21]\n"
+ "st1 { v27.s }[2], [x20]\n"
+ "b 168f\n"
+ "161:" // Height 5: Partial direct writeback: partial_1_12
+ "tbz x14, #0, 168f\n"
+ "str s11, [x12, #0x0]\n"
+ "str s15, [x23, #0x0]\n"
+ "str s19, [x22, #0x0]\n"
+ "str s23, [x21, #0x0]\n"
+ "str s27, [x20, #0x0]\n"
+ "b 168f\n"
+ "162:" // Height 5: Partial direct writeback: partial_2_8
+ "tbz x14, #1, 163f\n"
+ "str d10, [x12], #0x8\n"
+ "str d14, [x23], #0x8\n"
+ "str d18, [x22], #0x8\n"
+ "str d22, [x21], #0x8\n"
+ "str d26, [x20], #0x8\n"
+ "tbz x14, #0, 168f\n"
+ "st1 { v10.s }[2], [x12]\n"
+ "st1 { v14.s }[2], [x23]\n"
+ "st1 { v18.s }[2], [x22]\n"
+ "st1 { v22.s }[2], [x21]\n"
+ "st1 { v26.s }[2], [x20]\n"
+ "b 168f\n"
+ "163:" // Height 5: Partial direct writeback: partial_1_8
+ "tbz x14, #0, 168f\n"
+ "str s10, [x12, #0x0]\n"
+ "str s14, [x23, #0x0]\n"
+ "str s18, [x22, #0x0]\n"
+ "str s22, [x21, #0x0]\n"
+ "str s26, [x20, #0x0]\n"
+ "b 168f\n"
+ "164:" // Height 5: Partial direct writeback: partial_4_0
+ "tbz x14, #2, 166f\n"
+ "st1 { v8.4s }, [x12], #0x10\n"
+ "st1 { v12.4s }, [x23], #0x10\n"
+ "st1 { v16.4s }, [x22], #0x10\n"
+ "st1 { v20.4s }, [x21], #0x10\n"
+ "st1 { v24.4s }, [x20], #0x10\n"
+ "tbz x14, #1, 165f\n"
+ "str d9, [x12], #0x8\n"
+ "str d13, [x23], #0x8\n"
+ "str d17, [x22], #0x8\n"
+ "str d21, [x21], #0x8\n"
+ "str d25, [x20], #0x8\n"
+ "tbz x14, #0, 168f\n"
+ "st1 { v9.s }[2], [x12]\n"
+ "st1 { v13.s }[2], [x23]\n"
+ "st1 { v17.s }[2], [x22]\n"
+ "st1 { v21.s }[2], [x21]\n"
+ "st1 { v25.s }[2], [x20]\n"
+ "b 168f\n"
+ "165:" // Height 5: Partial direct writeback: partial_1_4
+ "tbz x14, #0, 168f\n"
+ "str s9, [x12, #0x0]\n"
+ "str s13, [x23, #0x0]\n"
+ "str s17, [x22, #0x0]\n"
+ "str s21, [x21, #0x0]\n"
+ "str s25, [x20, #0x0]\n"
+ "b 168f\n"
+ "166:" // Height 5: Partial direct writeback: partial_2_0
+ "tbz x14, #1, 167f\n"
+ "str d8, [x12], #0x8\n"
+ "str d12, [x23], #0x8\n"
+ "str d16, [x22], #0x8\n"
+ "str d20, [x21], #0x8\n"
+ "str d24, [x20], #0x8\n"
+ "tbz x14, #0, 168f\n"
+ "st1 { v8.s }[2], [x12]\n"
+ "st1 { v12.s }[2], [x23]\n"
+ "st1 { v16.s }[2], [x22]\n"
+ "st1 { v20.s }[2], [x21]\n"
+ "st1 { v24.s }[2], [x20]\n"
+ "b 168f\n"
+ "167:" // Height 5: Partial direct writeback: partial_1_0
+ "str s8, [x12, #0x0]\n"
+ "str s12, [x23, #0x0]\n"
+ "str s16, [x22, #0x0]\n"
+ "str s20, [x21, #0x0]\n"
+ "str s24, [x20, #0x0]\n"
+ "168:" // Height 5: Partial direct writeback: Done
+ "b 170f\n"
+ "169:" // Height 5: Full writeback
+ "str q8, [x12, #0x0]\n"
+ "str q9, [x12, #0x10]\n"
+ "str q10, [x12, #0x20]\n"
+ "str q11, [x12, #0x30]\n"
+ "add x12, x12, #0x40\n"
+ "str q12, [x23, #0x0]\n"
+ "str q13, [x23, #0x10]\n"
+ "str q14, [x23, #0x20]\n"
+ "str q15, [x23, #0x30]\n"
+ "str q16, [x22, #0x0]\n"
+ "str q17, [x22, #0x10]\n"
+ "str q18, [x22, #0x20]\n"
+ "str q19, [x22, #0x30]\n"
+ "str q20, [x21, #0x0]\n"
+ "str q21, [x21, #0x10]\n"
+ "str q22, [x21, #0x20]\n"
+ "str q23, [x21, #0x30]\n"
+ "str q24, [x20, #0x0]\n"
+ "str q25, [x20, #0x10]\n"
+ "str q26, [x20, #0x20]\n"
+ "str q27, [x20, #0x30]\n"
+ "170:" // Height 5: Writeback done
+ "subs x14, x14, #0x10\n" // this pass covered up to 16 output columns; sets flags for the bgt below
+ "bgt 138b\n" // columns remain: branch back to label 138 (earlier in this asm block; presumably the Height 5 column loop)
+ "b 206f\n" // all columns done: jump forward to label 206 (outside this view; presumably the common exit)
+ "171:" // Height 6: entry for the variant that works on six output rows at once
+ "ldr x14, [%x[args_ptr], %[offsetof_N]]\n" // x14 = N field of args; tested against 16 and by bit in the column loop below
+ "mov x12, %x[output_ptr]\n" // x12 = working pointer to output row 0
+ "ldr x13, [%x[args_ptr], %[offsetof_B_ptr]]\n" // x13 = B_ptr field of args; streamed into v6/v7 by the multiply loops
+ "mov x20, #0x18\n" // 0x18 = 24 = 6 rows * 4 (rows are output_offset << 2 apart, see the LSL #2 adds below)
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "madd %x[output_ptr], x19, x20, %x[output_ptr]\n" // output_ptr += output_offset * 24, i.e. step the operand past the six rows handled here
+ "172:" // Height 6: Column loop
+ "tbz %x[flags], #0, 182f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x14, #0x10\n"
+ "add x23, x12, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "add x21, x22, x19, LSL #2\n"
+ "add x20, x21, x19, LSL #2\n"
+ "add x19, x20, x19, LSL #2\n"
+ "bge 181f\n"
+ "tbz x14, #3, 176f\n"
+ "ld1 { v8.4s }, [x12], #0x10\n"
+ "ld1 { v12.4s }, [x23], #0x10\n"
+ "ld1 { v16.4s }, [x22], #0x10\n"
+ "ld1 { v9.4s }, [x12], #0x10\n"
+ "ld1 { v13.4s }, [x23], #0x10\n"
+ "ld1 { v17.4s }, [x22], #0x10\n"
+ "ld1 { v20.4s }, [x21], #0x10\n"
+ "ld1 { v24.4s }, [x20], #0x10\n"
+ "ld1 { v28.4s }, [x19], #0x10\n"
+ "ld1 { v21.4s }, [x21], #0x10\n"
+ "ld1 { v25.4s }, [x20], #0x10\n"
+ "ld1 { v29.4s }, [x19], #0x10\n"
+ "tbz x14, #2, 174f\n"
+ "ld1 { v10.4s }, [x12], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "ld1 { v18.4s }, [x22], #0x10\n"
+ "ld1 { v22.4s }, [x21], #0x10\n"
+ "ld1 { v26.4s }, [x20], #0x10\n"
+ "ld1 { v30.4s }, [x19], #0x10\n"
+ "tbz x14, #1, 173f\n"
+ "ldr d11, [x12], #0x8\n"
+ "mov x24, #0x38\n"
+ "ldr d15, [x23], #0x8\n"
+ "ldr d19, [x22], #0x8\n"
+ "ldr d23, [x21], #0x8\n"
+ "ldr d27, [x20], #0x8\n"
+ "ldr d31, [x19], #0x8\n"
+ "tbz x14, #0, 180f\n"
+ "ld1 { v11.s }[2], [x12]\n"
+ "ld1 { v15.s }[2], [x23]\n"
+ "ld1 { v19.s }[2], [x22]\n"
+ "ld1 { v23.s }[2], [x21]\n"
+ "ld1 { v27.s }[2], [x20]\n"
+ "ld1 { v31.s }[2], [x19]\n"
+ "b 180f\n"
+ "173:" // Height 6: Partial accumulate: partial_1_12
+ "mov x24, #0x30\n"
+ "tbz x14, #0, 180f\n"
+ "ldr s11, [x12, #0x0]\n"
+ "ldr s15, [x23, #0x0]\n"
+ "ldr s19, [x22, #0x0]\n"
+ "ldr s23, [x21, #0x0]\n"
+ "ldr s27, [x20, #0x0]\n"
+ "ldr s31, [x19, #0x0]\n"
+ "b 180f\n"
+ "174:" // Height 6: Partial accumulate: partial_2_8
+ "tbz x14, #1, 175f\n"
+ "ldr d10, [x12], #0x8\n"
+ "ldr d14, [x23], #0x8\n"
+ "mov x24, #0x28\n"
+ "ldr d18, [x22], #0x8\n"
+ "ldr d22, [x21], #0x8\n"
+ "ldr d26, [x20], #0x8\n"
+ "ldr d30, [x19], #0x8\n"
+ "tbz x14, #0, 180f\n"
+ "ld1 { v10.s }[2], [x12]\n"
+ "ld1 { v14.s }[2], [x23]\n"
+ "ld1 { v18.s }[2], [x22]\n"
+ "ld1 { v22.s }[2], [x21]\n"
+ "ld1 { v26.s }[2], [x20]\n"
+ "ld1 { v30.s }[2], [x19]\n"
+ "b 180f\n"
+ "175:" // Height 6: Partial accumulate: partial_1_8
+ "mov x24, #0x20\n"
+ "tbz x14, #0, 180f\n"
+ "ldr s10, [x12, #0x0]\n"
+ "ldr s14, [x23, #0x0]\n"
+ "ldr s18, [x22, #0x0]\n"
+ "ldr s22, [x21, #0x0]\n"
+ "ldr s26, [x20, #0x0]\n"
+ "ldr s30, [x19, #0x0]\n"
+ "b 180f\n"
+ "176:" // Height 6: Partial accumulate: partial_4_0
+ "tbz x14, #2, 178f\n"
+ "ld1 { v8.4s }, [x12], #0x10\n"
+ "ld1 { v12.4s }, [x23], #0x10\n"
+ "ld1 { v16.4s }, [x22], #0x10\n"
+ "ld1 { v20.4s }, [x21], #0x10\n"
+ "ld1 { v24.4s }, [x20], #0x10\n"
+ "ld1 { v28.4s }, [x19], #0x10\n"
+ "tbz x14, #1, 177f\n"
+ "ldr d9, [x12], #0x8\n"
+ "mov x24, #0x18\n"
+ "ldr d13, [x23], #0x8\n"
+ "ldr d17, [x22], #0x8\n"
+ "ldr d21, [x21], #0x8\n"
+ "ldr d25, [x20], #0x8\n"
+ "ldr d29, [x19], #0x8\n"
+ "tbz x14, #0, 180f\n"
+ "ld1 { v9.s }[2], [x12]\n"
+ "ld1 { v13.s }[2], [x23]\n"
+ "ld1 { v17.s }[2], [x22]\n"
+ "ld1 { v21.s }[2], [x21]\n"
+ "ld1 { v25.s }[2], [x20]\n"
+ "ld1 { v29.s }[2], [x19]\n"
+ "b 180f\n"
+ "177:" // Height 6: Partial accumulate: partial_1_4
+ "mov x24, #0x10\n"
+ "tbz x14, #0, 180f\n"
+ "ldr s9, [x12, #0x0]\n"
+ "ldr s13, [x23, #0x0]\n"
+ "ldr s17, [x22, #0x0]\n"
+ "ldr s21, [x21, #0x0]\n"
+ "ldr s25, [x20, #0x0]\n"
+ "ldr s29, [x19, #0x0]\n"
+ "b 180f\n"
+ "178:" // Height 6: Partial accumulate: partial_2_0
+ "tbz x14, #1, 179f\n"
+ "ldr d8, [x12], #0x8\n"
+ "ldr d12, [x23], #0x8\n"
+ "mov x24, #0x8\n"
+ "ldr d16, [x22], #0x8\n"
+ "ldr d20, [x21], #0x8\n"
+ "ldr d24, [x20], #0x8\n"
+ "ldr d28, [x19], #0x8\n"
+ "tbz x14, #0, 180f\n"
+ "ld1 { v8.s }[2], [x12]\n"
+ "ld1 { v12.s }[2], [x23]\n"
+ "ld1 { v16.s }[2], [x22]\n"
+ "ld1 { v20.s }[2], [x21]\n"
+ "ld1 { v24.s }[2], [x20]\n"
+ "ld1 { v28.s }[2], [x19]\n"
+ "b 180f\n"
+ "179:" // Height 6: Partial accumulate: partial_1_0
+ "ldr s8, [x12, #0x0]\n"
+ "mov x24, #0x0\n"
+ "ldr s12, [x23, #0x0]\n"
+ "ldr s16, [x22, #0x0]\n"
+ "ldr s20, [x21, #0x0]\n"
+ "ldr s24, [x20, #0x0]\n"
+ "ldr s28, [x19, #0x0]\n"
+ "180:" // Height 6: Partial accumulate: Done
+ "sub x12, x12, x24\n" // x24 = number of bytes x12 was post-incremented by on whichever partial path ran; rewind x12 to the start of row 0
+ "b 183f\n" // skip the full-accumulate loads and the zero-initialisation path
+ "181:" // Height 6: full accumulate
+ "ldr q8, [x12, #0x0]\n"
+ "ldr q9, [x12, #0x10]\n"
+ "ldr q10, [x12, #0x20]\n"
+ "ldr q11, [x12, #0x30]\n"
+ "ldr q12, [x23, #0x0]\n"
+ "ldr q13, [x23, #0x10]\n"
+ "ldr q14, [x23, #0x20]\n"
+ "ldr q15, [x23, #0x30]\n"
+ "ldr q16, [x22, #0x0]\n"
+ "ldr q17, [x22, #0x10]\n"
+ "ldr q18, [x22, #0x20]\n"
+ "ldr q19, [x22, #0x30]\n"
+ "ldr q20, [x21, #0x0]\n"
+ "ldr q21, [x21, #0x10]\n"
+ "ldr q22, [x21, #0x20]\n"
+ "ldr q23, [x21, #0x30]\n"
+ "ldr q24, [x20, #0x0]\n"
+ "ldr q25, [x20, #0x10]\n"
+ "ldr q26, [x20, #0x20]\n"
+ "ldr q27, [x20, #0x30]\n"
+ "ldr q28, [x19, #0x0]\n"
+ "ldr q29, [x19, #0x10]\n"
+ "ldr q30, [x19, #0x20]\n"
+ "ldr q31, [x19, #0x30]\n"
+ "b 183f\n"
+ "182:" // Height 6: no accumulate
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+ "183:" // Height 6: setup done
+ "mov x11, #0x0\n"
+ "184:" // Height 6: String loop (x11 = current string index, zeroed at label 183)
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" // x20 = string_lengths field of args (base of an array of 32-bit entries)
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" // x19 = input_offset field; used differently by the two paths below
+ "ldr w10, [x20, x11, LSL #0x2]\n" // w10 = string_lengths[x11]; counted down in bytes by the multiply loops
+ "tbz %x[flags], #3, 185f\n" // flags bit 3 clear: take the direct-input path at 185
+ "ldr x20, [%x[input_ptr], x11, LSL #0x3]\n" // flags bit 3 set: x20 = input_ptr[x11] (8-byte entries)
+ "add x20, x20, x19, LSL #3\n" // skip input_offset 8-byte entries within that table
+ "ldr x9, [x20, #0x0]\n" // load six consecutive 8-byte row pointers: x9, x26, x24, x23, x22, x20
+ "ldr x26, [x20, #0x8]\n"
+ "ldr x24, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x20, [x20, #0x28]\n" // last load overwrites the table pointer with the row 5 pointer
+ "cbnz x11, 186f\n" // only the first string (x11 == 0) gets the initial column adjustment
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" // x19 = input_initial_col field, added as a byte offset to every row
+ "add x9, x9, x19\n"
+ "add x26, x26, x19\n"
+ "add x24, x24, x19\n"
+ "add x23, x23, x19\n"
+ "add x22, x22, x19\n"
+ "add x20, x20, x19\n"
+ "b 186f\n"
+ "185:" // Height 6: setup direct input
+ "mov x9, %x[input_ptr]\n" // row 0 starts at input_ptr
+ "add x26, x9, x19\n" // each following row is input_offset bytes after the previous one (no scaling applied)
+ "add x24, x26, x19\n"
+ "add x23, x24, x19\n"
+ "add x22, x23, x19\n"
+ "add x20, x22, x19\n"
+ "186:" // Height 6: input setup done
+ "cmp x10, #0x10\n"
+ "blt 189f\n" // fewer than 16 bytes in this string: skip the 16-byte blocks entirely
+ "ldr q0, [x9, #0x0]\n" // preload 16 bytes from each of the six input rows into v0-v5
+ "ldr q1, [x26, #0x0]\n"
+ "cmp x10, #0x20\n" // flags are consumed by the blt at the end of this sequence
+ "ldr q2, [x24, #0x0]\n"
+ "ldr q3, [x23, #0x0]\n"
+ "ldr q4, [x22, #0x0]\n"
+ "ldr q5, [x20, #0x0]\n"
+ "ldr q6, [x13, #0x0]\n" // preload the first 16 bytes of the x13 (B_ptr) stream
+ "blt 188f\n" // fewer than 32 bytes: only the single-iteration block at 188 runs; the loop at 187 also reloads the next 16 bytes per row
+ "187:" // Height 6: Multiply loop: Main loop head
+ ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x6f85e0dc // udot v28.4s, v6.16b, v5.4b[0]\n"
+ "ldr d7, [x13, #0x10]\n"
+ "add x9, x9, #0x10\n"
+ "ldr x19, [x13, #0x18]\n"
+ "add x26, x26, #0x10\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
+ "add x24, x24, #0x10\n"
+ "ldr d6, [x13, #0x20]\n"
+ "add x23, x23, #0x10\n"
+ "mov v7.d[1], x19\n"
+ "ldr x28, [x13, #0x28]\n"
+ "ldr x19, [x13, #0x38]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
+ "mov v6.d[1], x28\n"
+ ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x6f85e0fd // udot v29.4s, v7.16b, v5.4b[0]\n"
+ ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x6f85e0de // udot v30.4s, v6.16b, v5.4b[0]\n"
+ "ldr d7, [x13, #0x30]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x20, x20, #0x10\n"
+ "ldr d6, [x13, #0x40]\n"
+ "sub x10, x10, #0x10\n"
+ "mov v7.d[1], x19\n"
+ "ldr x28, [x13, #0x48]\n"
+ "ldr x19, [x13, #0x58]\n"
+ "cmp x10, #0x20\n"
+ ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
+ "mov v6.d[1], x28\n"
+ ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x6f85e0ff // udot v31.4s, v7.16b, v5.4b[0]\n"
+ ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e0d4 // udot v20.4s, v6.16b, v3.4b[1]\n"
+ ".inst 0x6fa4e0d8 // udot v24.4s, v6.16b, v4.4b[1]\n"
+ ".inst 0x6fa5e0dc // udot v28.4s, v6.16b, v5.4b[1]\n"
+ "ldr d7, [x13, #0x50]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "ldr d6, [x13, #0x60]\n"
+ "mov v7.d[1], x19\n"
+ "ldr x28, [x13, #0x68]\n"
+ "ldr x19, [x13, #0x78]\n"
+ ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n"
+ "mov v6.d[1], x28\n"
+ ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e0f5 // udot v21.4s, v7.16b, v3.4b[1]\n"
+ ".inst 0x6fa4e0f9 // udot v25.4s, v7.16b, v4.4b[1]\n"
+ ".inst 0x6fa5e0fd // udot v29.4s, v7.16b, v5.4b[1]\n"
+ ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e0d6 // udot v22.4s, v6.16b, v3.4b[1]\n"
+ ".inst 0x6fa4e0da // udot v26.4s, v6.16b, v4.4b[1]\n"
+ ".inst 0x6fa5e0de // udot v30.4s, v6.16b, v5.4b[1]\n"
+ "ldr d7, [x13, #0x70]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "ldr d6, [x13, #0x80]\n"
+ "mov v7.d[1], x19\n"
+ "ldr x28, [x13, #0x88]\n"
+ "ldr x19, [x13, #0x98]\n"
+ ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n"
+ "mov v6.d[1], x28\n"
+ ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e0f7 // udot v23.4s, v7.16b, v3.4b[1]\n"
+ ".inst 0x6fa4e0fb // udot v27.4s, v7.16b, v4.4b[1]\n"
+ ".inst 0x6fa5e0ff // udot v31.4s, v7.16b, v5.4b[1]\n"
+ ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x6f83e8d4 // udot v20.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x6f84e8d8 // udot v24.4s, v6.16b, v4.4b[2]\n"
+ ".inst 0x6f85e8dc // udot v28.4s, v6.16b, v5.4b[2]\n"
+ "ldr d7, [x13, #0x90]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "ldr d6, [x13, #0xa0]\n"
+ "mov v7.d[1], x19\n"
+ "ldr x28, [x13, #0xa8]\n"
+ "ldr x19, [x13, #0xb8]\n"
+ ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n"
+ "mov v6.d[1], x28\n"
+ ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x6f83e8f5 // udot v21.4s, v7.16b, v3.4b[2]\n"
+ ".inst 0x6f84e8f9 // udot v25.4s, v7.16b, v4.4b[2]\n"
+ ".inst 0x6f85e8fd // udot v29.4s, v7.16b, v5.4b[2]\n"
+ ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x6f83e8d6 // udot v22.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x6f84e8da // udot v26.4s, v6.16b, v4.4b[2]\n"
+ ".inst 0x6f85e8de // udot v30.4s, v6.16b, v5.4b[2]\n"
+ "ldr d7, [x13, #0xb0]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "ldr d6, [x13, #0xc0]\n"
+ "mov v7.d[1], x19\n"
+ "ldr x28, [x13, #0xc8]\n"
+ "ldr x19, [x13, #0xd8]\n"
+ ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n"
+ "mov v6.d[1], x28\n"
+ ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x6f83e8f7 // udot v23.4s, v7.16b, v3.4b[2]\n"
+ ".inst 0x6f84e8fb // udot v27.4s, v7.16b, v4.4b[2]\n"
+ ".inst 0x6f85e8ff // udot v31.4s, v7.16b, v5.4b[2]\n"
+ ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e8d4 // udot v20.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x6fa4e8d8 // udot v24.4s, v6.16b, v4.4b[3]\n"
+ ".inst 0x6fa5e8dc // udot v28.4s, v6.16b, v5.4b[3]\n"
+ "ldr d7, [x13, #0xd0]\n"
+ "ldr d6, [x13, #0xe0]\n"
+ "ldr x28, [x13, #0xe8]\n"
+ "mov v7.d[1], x19\n"
+ "ldr x19, [x13, #0xf8]\n"
+ "ldr x27, [x9, #0x8]\n"
+ "mov v6.d[1], x28\n"
+ ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e8f5 // udot v21.4s, v7.16b, v3.4b[3]\n"
+ ".inst 0x6fa4e8f9 // udot v25.4s, v7.16b, v4.4b[3]\n"
+ ".inst 0x6fa5e8fd // udot v29.4s, v7.16b, v5.4b[3]\n"
+ ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e8d6 // udot v22.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x6fa4e8da // udot v26.4s, v6.16b, v4.4b[3]\n"
+ ".inst 0x6fa5e8de // udot v30.4s, v6.16b, v5.4b[3]\n"
+ "ldr d7, [x13, #0xf0]\n"
+ "ldr x25, [x26, #0x8]\n"
+ "add x13, x13, #0x100\n"
+ "ldr d6, [x13, #0x0]\n"
+ "mov v7.d[1], x19\n"
+ "ldr x28, [x13, #0x8]\n"
+ "ldr x21, [x24, #0x8]\n"
+ ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n"
+ "mov v6.d[1], x28\n"
+ ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e8f7 // udot v23.4s, v7.16b, v3.4b[3]\n"
+ ".inst 0x6fa4e8fb // udot v27.4s, v7.16b, v4.4b[3]\n"
+ ".inst 0x6fa5e8ff // udot v31.4s, v7.16b, v5.4b[3]\n"
+ "ldr d0, [x9, #0x0]\n"
+ "ldr d1, [x26, #0x0]\n"
+ "ldr d2, [x24, #0x0]\n"
+ "mov v0.d[1], x27\n"
+ "ldr d3, [x23, #0x0]\n"
+ "mov v1.d[1], x25\n"
+ "ldr x19, [x23, #0x8]\n"
+ "mov v2.d[1], x21\n"
+ "ldr d4, [x22, #0x0]\n"
+ "ldr x21, [x22, #0x8]\n"
+ "mov v3.d[1], x19\n"
+ "ldr d5, [x20, #0x0]\n"
+ "ldr x19, [x20, #0x8]\n"
+ "mov v4.d[1], x21\n"
+ "mov v5.d[1], x19\n"
+ "bge 187b\n"
+ "188:" // Height 6: Multiply loop: Single iteration only
+ ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x6f85e0dc // udot v28.4s, v6.16b, v5.4b[0]\n"
+ "ldr q7, [x13, #0x10]\n"
+ "sub x10, x10, #0x10\n"
+ "ldr q6, [x13, #0x20]\n"
+ "add x9, x9, #0x10\n"
+ "prfm pldl1keep, [x9, #0x80]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x6f85e0fd // udot v29.4s, v7.16b, v5.4b[0]\n"
+ ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x6f85e0de // udot v30.4s, v6.16b, v5.4b[0]\n"
+ "ldr q7, [x13, #0x30]\n"
+ "add x24, x24, #0x10\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x23, x23, #0x10\n"
+ "ldr q6, [x13, #0x40]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x6f85e0ff // udot v31.4s, v7.16b, v5.4b[0]\n"
+ ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e0d4 // udot v20.4s, v6.16b, v3.4b[1]\n"
+ ".inst 0x6fa4e0d8 // udot v24.4s, v6.16b, v4.4b[1]\n"
+ ".inst 0x6fa5e0dc // udot v28.4s, v6.16b, v5.4b[1]\n"
+ "ldr q7, [x13, #0x50]\n"
+ "add x20, x20, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "ldr q6, [x13, #0x60]\n"
+ ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e0f5 // udot v21.4s, v7.16b, v3.4b[1]\n"
+ ".inst 0x6fa4e0f9 // udot v25.4s, v7.16b, v4.4b[1]\n"
+ ".inst 0x6fa5e0fd // udot v29.4s, v7.16b, v5.4b[1]\n"
+ ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e0d6 // udot v22.4s, v6.16b, v3.4b[1]\n"
+ ".inst 0x6fa4e0da // udot v26.4s, v6.16b, v4.4b[1]\n"
+ ".inst 0x6fa5e0de // udot v30.4s, v6.16b, v5.4b[1]\n"
+ "ldr q7, [x13, #0x70]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "ldr q6, [x13, #0x80]\n"
+ ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e0f7 // udot v23.4s, v7.16b, v3.4b[1]\n"
+ ".inst 0x6fa4e0fb // udot v27.4s, v7.16b, v4.4b[1]\n"
+ ".inst 0x6fa5e0ff // udot v31.4s, v7.16b, v5.4b[1]\n"
+ ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x6f83e8d4 // udot v20.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x6f84e8d8 // udot v24.4s, v6.16b, v4.4b[2]\n"
+ ".inst 0x6f85e8dc // udot v28.4s, v6.16b, v5.4b[2]\n"
+ "ldr q7, [x13, #0x90]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "ldr q6, [x13, #0xa0]\n"
+ ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x6f83e8f5 // udot v21.4s, v7.16b, v3.4b[2]\n"
+ ".inst 0x6f84e8f9 // udot v25.4s, v7.16b, v4.4b[2]\n"
+ ".inst 0x6f85e8fd // udot v29.4s, v7.16b, v5.4b[2]\n"
+ ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x6f83e8d6 // udot v22.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x6f84e8da // udot v26.4s, v6.16b, v4.4b[2]\n"
+ ".inst 0x6f85e8de // udot v30.4s, v6.16b, v5.4b[2]\n"
+ "ldr q7, [x13, #0xb0]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "ldr q6, [x13, #0xc0]\n"
+ ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x6f83e8f7 // udot v23.4s, v7.16b, v3.4b[2]\n"
+ ".inst 0x6f84e8fb // udot v27.4s, v7.16b, v4.4b[2]\n"
+ ".inst 0x6f85e8ff // udot v31.4s, v7.16b, v5.4b[2]\n"
+ ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e8d4 // udot v20.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x6fa4e8d8 // udot v24.4s, v6.16b, v4.4b[3]\n"
+ ".inst 0x6fa5e8dc // udot v28.4s, v6.16b, v5.4b[3]\n"
+ "ldr q7, [x13, #0xd0]\n"
+ "ldr q6, [x13, #0xe0]\n"
+ ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e8f5 // udot v21.4s, v7.16b, v3.4b[3]\n"
+ ".inst 0x6fa4e8f9 // udot v25.4s, v7.16b, v4.4b[3]\n"
+ ".inst 0x6fa5e8fd // udot v29.4s, v7.16b, v5.4b[3]\n"
+ ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e8d6 // udot v22.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x6fa4e8da // udot v26.4s, v6.16b, v4.4b[3]\n"
+ ".inst 0x6fa5e8de // udot v30.4s, v6.16b, v5.4b[3]\n"
+ "ldr q7, [x13, #0xf0]\n"
+ "add x13, x13, #0x100\n"
+ ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e8f7 // udot v23.4s, v7.16b, v3.4b[3]\n"
+ ".inst 0x6fa4e8fb // udot v27.4s, v7.16b, v4.4b[3]\n"
+ ".inst 0x6fa5e8ff // udot v31.4s, v7.16b, v5.4b[3]\n"
+ "189:" // Height 6: Multiply loop: Main loop skip
+ "cbz x10, 194f\n"
+ "cmp x10, #0x4\n"
+ "blt 191f\n"
+ "190:" // Height 6: Multiply loop: Odd block loop
+ "ldr s0, [x9], #0x4\n"
+ "sub x10, x10, #0x4\n"
+ "ldr s1, [x26], #0x4\n"
+ "cmp x10, #0x4\n"
+ "ldr s2, [x24], #0x4\n"
+ "ldr s3, [x23], #0x4\n"
+ "ldr s4, [x22], #0x4\n"
+ "ldr s5, [x20], #0x4\n"
+ "ldr q6, [x13, #0x0]\n"
+ "ldr q7, [x13, #0x10]\n"
+ ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x6f85e0dc // udot v28.4s, v6.16b, v5.4b[0]\n"
+ ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x6f85e0fd // udot v29.4s, v7.16b, v5.4b[0]\n"
+ "ldr q6, [x13, #0x20]\n"
+ "ldr q7, [x13, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x6f85e0de // udot v30.4s, v6.16b, v5.4b[0]\n"
+ ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x6f85e0ff // udot v31.4s, v7.16b, v5.4b[0]\n"
+ "bge 190b\n"
+ "cbz x10, 194f\n"
+ "191:" // Height 6: Multiply loop: Skip odd blocks (reached with 1..3 input bytes left in x10)
+ "tbz x10, #1, 192f\n" // bit 1 of x10 clear: exactly one byte left, read it at 192
+ "ldr h0, [x9], #0x2\n" // bit 1 set: read 2 bytes from each of the 6 rows (post-increment); scalar ldr zeroes the rest of the register
+ "ldr h1, [x26], #0x2\n"
+ "ldr h2, [x24], #0x2\n"
+ "ldr h3, [x23], #0x2\n"
+ "ldr h4, [x22], #0x2\n"
+ "ldr h5, [x20], #0x2\n"
+ "tbz x10, #0, 193f\n" // bit 0 clear: only 2 bytes were left, nothing more to read
+ "ld1 { v0.b }[2], [x9]\n" // 3 bytes left: insert the third byte of each row into byte lane 2
+ "ld1 { v1.b }[2], [x26]\n"
+ "ld1 { v2.b }[2], [x24]\n"
+ "ld1 { v3.b }[2], [x23]\n"
+ "ld1 { v4.b }[2], [x22]\n"
+ "ld1 { v5.b }[2], [x20]\n"
+ "b 193f\n"
+ "192:" // Height 6: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x9, #0x0]\n" // single remaining byte per row; pointers are not advanced here
+ "ldr b1, [x26, #0x0]\n"
+ "ldr b2, [x24, #0x0]\n"
+ "ldr b3, [x23, #0x0]\n"
+ "ldr b4, [x22, #0x0]\n"
+ "ldr b5, [x20, #0x0]\n"
+ "193:" // Height 6: Multiply loop: Ragged operand read: Done (v0-v5 hold the partial 4-byte group in element 0)
+ "ldr q6, [x13, #0x0]\n"
+ "ldr q7, [x13, #0x10]\n"
+ ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x6f85e0dc // udot v28.4s, v6.16b, v5.4b[0]\n"
+ ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x6f85e0fd // udot v29.4s, v7.16b, v5.4b[0]\n"
+ "ldr q6, [x13, #0x20]\n"
+ "ldr q7, [x13, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x6f85e0de // udot v30.4s, v6.16b, v5.4b[0]\n"
+ ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x6f85e0ff // udot v31.4s, v7.16b, v5.4b[0]\n"
+ "194:" // Height 6: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x11, x11, #0x1\n"
+ "cmp x11, x19\n"
+ "bne 184b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "prfm pstl1keep, [x12, #0x0]\n"
+ "cmp x14, #0x10\n"
+ "add x23, x12, x19, LSL #2\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "add x22, x23, x19, LSL #2\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "add x21, x22, x19, LSL #2\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "add x20, x21, x19, LSL #2\n"
+ "prfm pstl1keep, [x20, #0x0]\n"
+ "add x19, x20, x19, LSL #2\n"
+ "prfm pstl1keep, [x19, #0x0]\n"
+ "bge 203f\n"
+ "tbz x14, #3, 198f\n"
+ "st1 { v8.4s }, [x12], #0x10\n"
+ "st1 { v9.4s }, [x12], #0x10\n"
+ "st1 { v12.4s }, [x23], #0x10\n"
+ "st1 { v13.4s }, [x23], #0x10\n"
+ "st1 { v16.4s }, [x22], #0x10\n"
+ "st1 { v17.4s }, [x22], #0x10\n"
+ "st1 { v20.4s }, [x21], #0x10\n"
+ "st1 { v21.4s }, [x21], #0x10\n"
+ "st1 { v24.4s }, [x20], #0x10\n"
+ "st1 { v25.4s }, [x20], #0x10\n"
+ "st1 { v28.4s }, [x19], #0x10\n"
+ "st1 { v29.4s }, [x19], #0x10\n"
+ "tbz x14, #2, 196f\n"
+ "st1 { v10.4s }, [x12], #0x10\n"
+ "st1 { v14.4s }, [x23], #0x10\n"
+ "st1 { v18.4s }, [x22], #0x10\n"
+ "st1 { v22.4s }, [x21], #0x10\n"
+ "st1 { v26.4s }, [x20], #0x10\n"
+ "st1 { v30.4s }, [x19], #0x10\n"
+ "tbz x14, #1, 195f\n"
+ "str d11, [x12], #0x8\n"
+ "str d15, [x23], #0x8\n"
+ "str d19, [x22], #0x8\n"
+ "str d23, [x21], #0x8\n"
+ "str d27, [x20], #0x8\n"
+ "str d31, [x19], #0x8\n"
+ "tbz x14, #0, 202f\n"
+ "st1 { v11.s }[2], [x12]\n"
+ "st1 { v15.s }[2], [x23]\n"
+ "st1 { v19.s }[2], [x22]\n"
+ "st1 { v23.s }[2], [x21]\n"
+ "st1 { v27.s }[2], [x20]\n"
+ "st1 { v31.s }[2], [x19]\n"
+ "b 202f\n"
+ "195:" // Height 6: Partial direct writeback: partial_1_12
+ "tbz x14, #0, 202f\n"
+ "str s11, [x12, #0x0]\n"
+ "str s15, [x23, #0x0]\n"
+ "str s19, [x22, #0x0]\n"
+ "str s23, [x21, #0x0]\n"
+ "str s27, [x20, #0x0]\n"
+ "str s31, [x19, #0x0]\n"
+ "b 202f\n"
+ "196:" // Height 6: Partial direct writeback: partial_2_8
+ "tbz x14, #1, 197f\n"
+ "str d10, [x12], #0x8\n"
+ "str d14, [x23], #0x8\n"
+ "str d18, [x22], #0x8\n"
+ "str d22, [x21], #0x8\n"
+ "str d26, [x20], #0x8\n"
+ "str d30, [x19], #0x8\n"
+ "tbz x14, #0, 202f\n"
+ "st1 { v10.s }[2], [x12]\n"
+ "st1 { v14.s }[2], [x23]\n"
+ "st1 { v18.s }[2], [x22]\n"
+ "st1 { v22.s }[2], [x21]\n"
+ "st1 { v26.s }[2], [x20]\n"
+ "st1 { v30.s }[2], [x19]\n"
+ "b 202f\n"
+ "197:" // Height 6: Partial direct writeback: partial_1_8
+ "tbz x14, #0, 202f\n"
+ "str s10, [x12, #0x0]\n"
+ "str s14, [x23, #0x0]\n"
+ "str s18, [x22, #0x0]\n"
+ "str s22, [x21, #0x0]\n"
+ "str s26, [x20, #0x0]\n"
+ "str s30, [x19, #0x0]\n"
+ "b 202f\n"
+ "198:" // Height 6: Partial direct writeback: partial_4_0
+ "tbz x14, #2, 200f\n"
+ "st1 { v8.4s }, [x12], #0x10\n"
+ "st1 { v12.4s }, [x23], #0x10\n"
+ "st1 { v16.4s }, [x22], #0x10\n"
+ "st1 { v20.4s }, [x21], #0x10\n"
+ "st1 { v24.4s }, [x20], #0x10\n"
+ "st1 { v28.4s }, [x19], #0x10\n"
+ "tbz x14, #1, 199f\n"
+ "str d9, [x12], #0x8\n"
+ "str d13, [x23], #0x8\n"
+ "str d17, [x22], #0x8\n"
+ "str d21, [x21], #0x8\n"
+ "str d25, [x20], #0x8\n"
+ "str d29, [x19], #0x8\n"
+ "tbz x14, #0, 202f\n"
+ "st1 { v9.s }[2], [x12]\n"
+ "st1 { v13.s }[2], [x23]\n"
+ "st1 { v17.s }[2], [x22]\n"
+ "st1 { v21.s }[2], [x21]\n"
+ "st1 { v25.s }[2], [x20]\n"
+ "st1 { v29.s }[2], [x19]\n"
+ "b 202f\n"
+ "199:" // Height 6: Partial direct writeback: partial_1_4
+ "tbz x14, #0, 202f\n"
+ "str s9, [x12, #0x0]\n"
+ "str s13, [x23, #0x0]\n"
+ "str s17, [x22, #0x0]\n"
+ "str s21, [x21, #0x0]\n"
+ "str s25, [x20, #0x0]\n"
+ "str s29, [x19, #0x0]\n"
+ "b 202f\n"
+ "200:" // Height 6: Partial direct writeback: partial_2_0
+ "tbz x14, #1, 201f\n"
+ "str d8, [x12], #0x8\n"
+ "str d12, [x23], #0x8\n"
+ "str d16, [x22], #0x8\n"
+ "str d20, [x21], #0x8\n"
+ "str d24, [x20], #0x8\n"
+ "str d28, [x19], #0x8\n"
+ "tbz x14, #0, 202f\n"
+ "st1 { v8.s }[2], [x12]\n"
+ "st1 { v12.s }[2], [x23]\n"
+ "st1 { v16.s }[2], [x22]\n"
+ "st1 { v20.s }[2], [x21]\n"
+ "st1 { v24.s }[2], [x20]\n"
+ "st1 { v28.s }[2], [x19]\n"
+ "b 202f\n"
+ "201:" // Height 6: Partial direct writeback: partial_1_0
+ "str s8, [x12, #0x0]\n"
+ "str s12, [x23, #0x0]\n"
+ "str s16, [x22, #0x0]\n"
+ "str s20, [x21, #0x0]\n"
+ "str s24, [x20, #0x0]\n"
+ "str s28, [x19, #0x0]\n"
+ "202:" // Height 6: Partial direct writeback: Done
+ "b 204f\n"
+ "203:" // Height 6: Full writeback
+ "str q8, [x12, #0x0]\n"
+ "str q9, [x12, #0x10]\n"
+ "str q10, [x12, #0x20]\n"
+ "str q11, [x12, #0x30]\n"
+ "add x12, x12, #0x40\n"
+ "str q12, [x23, #0x0]\n"
+ "str q13, [x23, #0x10]\n"
+ "str q14, [x23, #0x20]\n"
+ "str q15, [x23, #0x30]\n"
+ "str q16, [x22, #0x0]\n"
+ "str q17, [x22, #0x10]\n"
+ "str q18, [x22, #0x20]\n"
+ "str q19, [x22, #0x30]\n"
+ "str q20, [x21, #0x0]\n"
+ "str q21, [x21, #0x10]\n"
+ "str q22, [x21, #0x20]\n"
+ "str q23, [x21, #0x30]\n"
+ "str q24, [x20, #0x0]\n"
+ "str q25, [x20, #0x10]\n"
+ "str q26, [x20, #0x20]\n"
+ "str q27, [x20, #0x30]\n"
+ "str q28, [x19, #0x0]\n"
+ "str q29, [x19, #0x10]\n"
+ "str q30, [x19, #0x20]\n"
+ "str q31, [x19, #0x30]\n"
+ "204:" // Height 6: Writeback done
+ "subs x14, x14, #0x10\n"
+ "bgt 172b\n"
+ "subs %x[M], %x[M], #0x6\n"
+ "beq 206f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 205f\n"
+ "add x20, x20, #0x6\n"
+ "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "205:" // Update direct input
+ "mov x19, #0x6\n"
+ "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "b 1b\n"
+ "206:" // Exit
+
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/generic.cpp
index 89aac84cc5..ba57ad493a 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/generic.cpp
@@ -80,392 +80,378 @@ void a64_hybrid_u8u32_dot_6x16 (
"1:" // Row loop
"cmp %x[M], #0x6\n"
- "bge 176f\n"
+ "bge 171f\n"
"cmp %x[M], #0x4\n"
- "bgt 141f\n"
- "beq 106f\n"
+ "bgt 137f\n"
+ "beq 103f\n"
"cmp %x[M], #0x2\n"
- "bgt 71f\n"
- "beq 36f\n"
- "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "tbz %x[flags], #2, 2f\n"
- "ldr x13, [%x[output_ptr], #0x0]\n"
- "add x13, x13, x19, LSL #2\n"
- "b 3f\n"
- "2:" // Height 1: setup direct output
- "mov x13, %x[output_ptr]\n"
- "3:" // Height 1: Column loop
- "tbz %x[flags], #0, 13f\n"
- "cmp x15, #0x10\n"
- "bge 12f\n"
- "tbz x15, #3, 7f\n"
- "ld1 { v8.4s }, [x13], #0x10\n"
- "ld1 { v9.4s }, [x13], #0x10\n"
- "tbz x15, #2, 5f\n"
- "ld1 { v10.4s }, [x13], #0x10\n"
- "tbz x15, #1, 4f\n"
- "mov x19, #0x38\n"
- "ldr d11, [x13], #0x8\n"
- "tbz x15, #0, 11f\n"
- "ld1 { v11.s }[2], [x13]\n"
- "b 11f\n"
- "4:" // Height 1: Partial accumulate: partial_1_12
- "mov x19, #0x30\n"
- "tbz x15, #0, 11f\n"
- "ldr s11, [x13, #0x0]\n"
- "b 11f\n"
- "5:" // Height 1: Partial accumulate: partial_2_8
- "tbz x15, #1, 6f\n"
- "ldr d10, [x13], #0x8\n"
- "mov x19, #0x28\n"
- "tbz x15, #0, 11f\n"
- "ld1 { v10.s }[2], [x13]\n"
- "b 11f\n"
- "6:" // Height 1: Partial accumulate: partial_1_8
- "mov x19, #0x20\n"
- "tbz x15, #0, 11f\n"
- "ldr s10, [x13, #0x0]\n"
- "b 11f\n"
- "7:" // Height 1: Partial accumulate: partial_4_0
- "tbz x15, #2, 9f\n"
- "ld1 { v8.4s }, [x13], #0x10\n"
- "tbz x15, #1, 8f\n"
- "mov x19, #0x18\n"
- "ldr d9, [x13], #0x8\n"
- "tbz x15, #0, 11f\n"
- "ld1 { v9.s }[2], [x13]\n"
- "b 11f\n"
- "8:" // Height 1: Partial accumulate: partial_1_4
- "mov x19, #0x10\n"
- "tbz x15, #0, 11f\n"
- "ldr s9, [x13, #0x0]\n"
- "b 11f\n"
- "9:" // Height 1: Partial accumulate: partial_2_0
- "tbz x15, #1, 10f\n"
- "ldr d8, [x13], #0x8\n"
- "mov x19, #0x8\n"
- "tbz x15, #0, 11f\n"
- "ld1 { v8.s }[2], [x13]\n"
- "b 11f\n"
- "10:" // Height 1: Partial accumulate: partial_1_0
- "mov x19, #0x0\n"
- "ldr s8, [x13, #0x0]\n"
- "11:" // Height 1: Partial accumulate: Done
- "sub x13, x13, x19\n"
- "b 14f\n"
- "12:" // Height 1: full accumulate
- "ldr q8, [x13, #0x0]\n"
- "ldr q9, [x13, #0x10]\n"
- "ldr q10, [x13, #0x20]\n"
- "ldr q11, [x13, #0x30]\n"
- "b 14f\n"
- "13:" // Height 1: no accumulate
+ "bgt 69f\n"
+ "beq 35f\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "2:" // Height 1: Column loop
+ "tbz %x[flags], #0, 12f\n"
+ "cmp x10, #0x10\n"
+ "bge 11f\n"
+ "tbz x10, #3, 6f\n"
+ "ld1 { v8.4s }, [x28], #0x10\n"
+ "ld1 { v9.4s }, [x28], #0x10\n"
+ "tbz x10, #2, 4f\n"
+ "ld1 { v10.4s }, [x28], #0x10\n"
+ "tbz x10, #1, 3f\n"
+ "mov x24, #0x38\n"
+ "ldr d11, [x28], #0x8\n"
+ "tbz x10, #0, 10f\n"
+ "ld1 { v11.s }[2], [x28]\n"
+ "b 10f\n"
+ "3:" // Height 1: Partial accumulate: partial_1_12
+ "mov x24, #0x30\n"
+ "tbz x10, #0, 10f\n"
+ "ldr s11, [x28, #0x0]\n"
+ "b 10f\n"
+ "4:" // Height 1: Partial accumulate: partial_2_8
+ "tbz x10, #1, 5f\n"
+ "ldr d10, [x28], #0x8\n"
+ "mov x24, #0x28\n"
+ "tbz x10, #0, 10f\n"
+ "ld1 { v10.s }[2], [x28]\n"
+ "b 10f\n"
+ "5:" // Height 1: Partial accumulate: partial_1_8
+ "mov x24, #0x20\n"
+ "tbz x10, #0, 10f\n"
+ "ldr s10, [x28, #0x0]\n"
+ "b 10f\n"
+ "6:" // Height 1: Partial accumulate: partial_4_0
+ "tbz x10, #2, 8f\n"
+ "ld1 { v8.4s }, [x28], #0x10\n"
+ "tbz x10, #1, 7f\n"
+ "ldr d9, [x28], #0x8\n"
+ "mov x24, #0x18\n"
+ "tbz x10, #0, 10f\n"
+ "ld1 { v9.s }[2], [x28]\n"
+ "b 10f\n"
+ "7:" // Height 1: Partial accumulate: partial_1_4
+ "mov x24, #0x10\n"
+ "tbz x10, #0, 10f\n"
+ "ldr s9, [x28, #0x0]\n"
+ "b 10f\n"
+ "8:" // Height 1: Partial accumulate: partial_2_0
+ "tbz x10, #1, 9f\n"
+ "ldr d8, [x28], #0x8\n"
+ "mov x24, #0x8\n"
+ "tbz x10, #0, 10f\n"
+ "ld1 { v8.s }[2], [x28]\n"
+ "b 10f\n"
+ "9:" // Height 1: Partial accumulate: partial_1_0
+ "ldr s8, [x28, #0x0]\n"
+ "mov x24, #0x0\n"
+ "10:" // Height 1: Partial accumulate: Done
+ "sub x28, x28, x24\n"
+ "b 13f\n"
+ "11:" // Height 1: full accumulate
+ "ldr q8, [x28, #0x0]\n"
+ "ldr q9, [x28, #0x10]\n"
+ "ldr q10, [x28, #0x20]\n"
+ "ldr q11, [x28, #0x30]\n"
+ "b 13f\n"
+ "12:" // Height 1: no accumulate
"movi v8.4s, #0x0\n"
"movi v9.4s, #0x0\n"
"movi v10.4s, #0x0\n"
"movi v11.4s, #0x0\n"
- "14:" // Height 1: setup done
- "mov x12, #0x0\n"
- "15:" // Height 1: String loop
+ "13:" // Height 1: setup done
+ "mov x27, #0x0\n"
+ "14:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w11, [x20, x12, LSL #0x2]\n"
- "tbz %x[flags], #3, 16f\n"
- "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 15f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x10, [x20, #0x0]\n"
- "cbnz x12, 17f\n"
+ "ldr x25, [x20, #0x0]\n"
+ "cbnz x27, 16f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x10, x10, x19\n"
- "b 17f\n"
- "16:" // Height 1: setup direct input
- "mov x10, %x[input_ptr]\n"
- "17:" // Height 1: input setup done
- "cmp x11, #0x10\n"
- "blt 20f\n"
- "cmp x11, #0x20\n"
+ "add x25, x25, x19\n"
+ "b 16f\n"
+ "15:" // Height 1: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "16:" // Height 1: input setup done
+ "cmp x26, #0x10\n"
"blt 19f\n"
- "18:" // Height 1: Multiply loop: Main loop head
- "ldr q0, [x10, #0x0]\n"
- "ldr q6, [x14, #0x0]\n"
+ "ldr q0, [x25, #0x0]\n"
+ "ldr q6, [x9, #0x0]\n"
+ "cmp x26, #0x20\n"
+ "blt 18f\n"
+ "17:" // Height 1: Multiply loop: Main loop head
".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x14, #0x10]\n"
- "ldr q6, [x14, #0x20]\n"
+ "ldr q7, [x9, #0x10]\n"
+ "add x25, x25, #0x10\n"
".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- "ldr q7, [x14, #0x30]\n"
+ "ldr q6, [x9, #0x20]\n"
+ "sub x26, x26, #0x10\n"
".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- "ldr q6, [x14, #0x40]\n"
- "add x10, x10, #0x10\n"
+ "ldr q7, [x9, #0x30]\n"
+ "cmp x26, #0x20\n"
".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
- "ldr q7, [x14, #0x50]\n"
+ "ldr q6, [x9, #0x40]\n"
+ "ldr q7, [x9, #0x50]\n"
".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
- "ldr q6, [x14, #0x60]\n"
- "sub x11, x11, #0x10\n"
+ "ldr q6, [x9, #0x60]\n"
".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
- "ldr q7, [x14, #0x70]\n"
- "cmp x11, #0x20\n"
+ "ldr q7, [x9, #0x70]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
- "ldr q6, [x14, #0x80]\n"
+ "ldr q6, [x9, #0x80]\n"
".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
- "ldr q7, [x14, #0x90]\n"
+ "ldr q7, [x9, #0x90]\n"
".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
- "ldr q6, [x14, #0xa0]\n"
+ "ldr q6, [x9, #0xa0]\n"
".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
- "ldr q7, [x14, #0xb0]\n"
+ "ldr q7, [x9, #0xb0]\n"
".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
- "ldr q6, [x14, #0xc0]\n"
+ "ldr q6, [x9, #0xc0]\n"
".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
- "ldr q7, [x14, #0xd0]\n"
+ "ldr q7, [x9, #0xd0]\n"
".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
- "ldr q6, [x14, #0xe0]\n"
+ "ldr q6, [x9, #0xe0]\n"
".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
- "ldr q7, [x14, #0xf0]\n"
- "add x14, x14, #0x100\n"
+ "ldr q7, [x9, #0xf0]\n"
+ "add x9, x9, #0x100\n"
".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
+ "ldr q6, [x9, #0x0]\n"
".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
- "bge 18b\n"
- "19:" // Height 1: Multiply loop: Single iteration only
- "sub x11, x11, #0x10\n"
- "ldr q0, [x10, #0x0]\n"
- "ldr q6, [x14, #0x0]\n"
+ "ldr q0, [x25, #0x0]\n"
+ "bge 17b\n"
+ "18:" // Height 1: Multiply loop: Single iteration only
".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x14, #0x10]\n"
- "ldr q6, [x14, #0x20]\n"
+ "ldr q7, [x9, #0x10]\n"
+ "sub x26, x26, #0x10\n"
".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- "ldr q7, [x14, #0x30]\n"
+ "ldr q6, [x9, #0x20]\n"
+ "add x25, x25, #0x10\n"
".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- "ldr q6, [x14, #0x40]\n"
- "add x10, x10, #0x10\n"
+ "ldr q7, [x9, #0x30]\n"
+ "ldr q6, [x9, #0x40]\n"
".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
- "ldr q7, [x14, #0x50]\n"
+ "ldr q7, [x9, #0x50]\n"
".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
- "ldr q6, [x14, #0x60]\n"
+ "ldr q6, [x9, #0x60]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
- "ldr q7, [x14, #0x70]\n"
+ "ldr q7, [x9, #0x70]\n"
".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
- "ldr q6, [x14, #0x80]\n"
+ "ldr q6, [x9, #0x80]\n"
".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
- "ldr q7, [x14, #0x90]\n"
+ "ldr q7, [x9, #0x90]\n"
".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
- "ldr q6, [x14, #0xa0]\n"
+ "ldr q6, [x9, #0xa0]\n"
".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
- "ldr q7, [x14, #0xb0]\n"
+ "ldr q7, [x9, #0xb0]\n"
".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
- "ldr q6, [x14, #0xc0]\n"
+ "ldr q6, [x9, #0xc0]\n"
".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
- "ldr q7, [x14, #0xd0]\n"
+ "ldr q7, [x9, #0xd0]\n"
".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
- "ldr q6, [x14, #0xe0]\n"
+ "ldr q6, [x9, #0xe0]\n"
".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
- "ldr q7, [x14, #0xf0]\n"
- "add x14, x14, #0x100\n"
+ "ldr q7, [x9, #0xf0]\n"
+ "add x9, x9, #0x100\n"
".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
- "20:" // Height 1: Multiply loop: Main loop skip
- "cbz x11, 25f\n"
- "cmp x11, #0x4\n"
- "blt 22f\n"
- "21:" // Height 1: Multiply loop: Odd block loop
- "ldr s0, [x10], #0x4\n"
- "ldr q6, [x14, #0x0]\n"
+ "19:" // Height 1: Multiply loop: Main loop skip
+ "cbz x26, 24f\n"
+ "cmp x26, #0x4\n"
+ "blt 21f\n"
+ "20:" // Height 1: Multiply loop: Odd block loop
+ "ldr s0, [x25], #0x4\n"
+ "sub x26, x26, #0x4\n"
+ "ldr q6, [x9, #0x0]\n"
".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x14, #0x10]\n"
- "ldr q6, [x14, #0x20]\n"
+ "ldr q7, [x9, #0x10]\n"
+ "cmp x26, #0x4\n"
".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- "ldr q7, [x14, #0x30]\n"
+ "ldr q6, [x9, #0x20]\n"
+ "ldr q7, [x9, #0x30]\n"
".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- "sub x11, x11, #0x4\n"
- "add x14, x14, #0x40\n"
+ "add x9, x9, #0x40\n"
".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
- "cmp x11, #0x4\n"
- "bge 21b\n"
- "cbz x11, 25f\n"
- "22:" // Height 1: Multiply loop: Skip odd blocks
- "tbz x11, #1, 23f\n"
- "ldr h0, [x10], #0x2\n"
- "tbz x11, #0, 24f\n"
- "ld1 { v0.b }[2], [x10]\n"
- "b 24f\n"
- "23:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x10, #0x0]\n"
- "24:" // Height 1: Multiply loop: Ragged operand read: Done
- "ldr q6, [x14, #0x0]\n"
+ "bge 20b\n"
+ "cbz x26, 24f\n"
+ "21:" // Height 1: Multiply loop: Skip odd blocks
+ "tbz x26, #1, 22f\n"
+ "ldr h0, [x25], #0x2\n"
+ "tbz x26, #0, 23f\n"
+ "ld1 { v0.b }[2], [x25]\n"
+ "b 23f\n"
+ "22:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x25, #0x0]\n"
+ "23:" // Height 1: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x9, #0x0]\n"
".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x14, #0x10]\n"
- "ldr q6, [x14, #0x20]\n"
+ "ldr q7, [x9, #0x10]\n"
+ "ldr q6, [x9, #0x20]\n"
".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- "ldr q7, [x14, #0x30]\n"
+ "ldr q7, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- "add x14, x14, #0x40\n"
".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
- "25:" // Height 1: Multiply loop: No odd multiplies
+ "24:" // Height 1: Multiply loop: No odd multiplies
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x12, x12, #0x1\n"
- "cmp x12, x19\n"
- "bne 15b\n"
- "prfm pstl1keep, [x13, #0x0]\n"
- "cmp x15, #0x10\n"
- "bge 34f\n"
- "tbz x15, #3, 29f\n"
- "st1 { v8.4s }, [x13], #0x10\n"
- "st1 { v9.4s }, [x13], #0x10\n"
- "tbz x15, #2, 27f\n"
- "st1 { v10.4s }, [x13], #0x10\n"
- "tbz x15, #1, 26f\n"
- "str d11, [x13], #0x8\n"
- "tbz x15, #0, 33f\n"
- "st1 { v11.s }[2], [x13]\n"
- "b 33f\n"
- "26:" // Height 1: Partial direct writeback: partial_1_12
- "tbz x15, #0, 33f\n"
- "str s11, [x13, #0x0]\n"
- "b 33f\n"
- "27:" // Height 1: Partial direct writeback: partial_2_8
- "tbz x15, #1, 28f\n"
- "str d10, [x13], #0x8\n"
- "tbz x15, #0, 33f\n"
- "st1 { v10.s }[2], [x13]\n"
- "b 33f\n"
- "28:" // Height 1: Partial direct writeback: partial_1_8
- "tbz x15, #0, 33f\n"
- "str s10, [x13, #0x0]\n"
- "b 33f\n"
- "29:" // Height 1: Partial direct writeback: partial_4_0
- "tbz x15, #2, 31f\n"
- "st1 { v8.4s }, [x13], #0x10\n"
- "tbz x15, #1, 30f\n"
- "str d9, [x13], #0x8\n"
- "tbz x15, #0, 33f\n"
- "st1 { v9.s }[2], [x13]\n"
- "b 33f\n"
- "30:" // Height 1: Partial direct writeback: partial_1_4
- "tbz x15, #0, 33f\n"
- "str s9, [x13, #0x0]\n"
- "b 33f\n"
- "31:" // Height 1: Partial direct writeback: partial_2_0
- "tbz x15, #1, 32f\n"
- "str d8, [x13], #0x8\n"
- "tbz x15, #0, 33f\n"
- "st1 { v8.s }[2], [x13]\n"
- "b 33f\n"
- "32:" // Height 1: Partial direct writeback: partial_1_0
- "str s8, [x13, #0x0]\n"
- "33:" // Height 1: Partial direct writeback: Done
- "b 35f\n"
- "34:" // Height 1: Full writeback
- "str q8, [x13, #0x0]\n"
- "str q9, [x13, #0x10]\n"
- "str q10, [x13, #0x20]\n"
- "str q11, [x13, #0x30]\n"
- "add x13, x13, #0x40\n"
- "35:" // Height 1: Writeback done
- "subs x15, x15, #0x10\n"
- "bgt 3b\n"
- "b 212f\n"
- "36:" // Height 2
- "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 14b\n"
+ "prfm pstl1keep, [x28, #0x0]\n"
+ "cmp x10, #0x10\n"
+ "bge 33f\n"
+ "tbz x10, #3, 28f\n"
+ "st1 { v8.4s }, [x28], #0x10\n"
+ "st1 { v9.4s }, [x28], #0x10\n"
+ "tbz x10, #2, 26f\n"
+ "st1 { v10.4s }, [x28], #0x10\n"
+ "tbz x10, #1, 25f\n"
+ "str d11, [x28], #0x8\n"
+ "tbz x10, #0, 32f\n"
+ "st1 { v11.s }[2], [x28]\n"
+ "b 32f\n"
+ "25:" // Height 1: Partial direct writeback: partial_1_12
+ "tbz x10, #0, 32f\n"
+ "str s11, [x28, #0x0]\n"
+ "b 32f\n"
+ "26:" // Height 1: Partial direct writeback: partial_2_8
+ "tbz x10, #1, 27f\n"
+ "str d10, [x28], #0x8\n"
+ "tbz x10, #0, 32f\n"
+ "st1 { v10.s }[2], [x28]\n"
+ "b 32f\n"
+ "27:" // Height 1: Partial direct writeback: partial_1_8
+ "tbz x10, #0, 32f\n"
+ "str s10, [x28, #0x0]\n"
+ "b 32f\n"
+ "28:" // Height 1: Partial direct writeback: partial_4_0
+ "tbz x10, #2, 30f\n"
+ "st1 { v8.4s }, [x28], #0x10\n"
+ "tbz x10, #1, 29f\n"
+ "str d9, [x28], #0x8\n"
+ "tbz x10, #0, 32f\n"
+ "st1 { v9.s }[2], [x28]\n"
+ "b 32f\n"
+ "29:" // Height 1: Partial direct writeback: partial_1_4
+ "tbz x10, #0, 32f\n"
+ "str s9, [x28, #0x0]\n"
+ "b 32f\n"
+ "30:" // Height 1: Partial direct writeback: partial_2_0
+ "tbz x10, #1, 31f\n"
+ "str d8, [x28], #0x8\n"
+ "tbz x10, #0, 32f\n"
+ "st1 { v8.s }[2], [x28]\n"
+ "b 32f\n"
+ "31:" // Height 1: Partial direct writeback: partial_1_0
+ "str s8, [x28, #0x0]\n"
+ "32:" // Height 1: Partial direct writeback: Done
+ "b 34f\n"
+ "33:" // Height 1: Full writeback
+ "str q8, [x28, #0x0]\n"
+ "str q9, [x28, #0x10]\n"
+ "str q10, [x28, #0x20]\n"
+ "str q11, [x28, #0x30]\n"
+ "add x28, x28, #0x40\n"
+ "34:" // Height 1: Writeback done
+ "subs x10, x10, #0x10\n"
+ "bgt 2b\n"
+ "b 206f\n"
+ "35:" // Height 2
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "36:" // Height 2: Column loop
+ "tbz %x[flags], #0, 46f\n"
"ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "tbz %x[flags], #2, 37f\n"
- "ldr x13, [%x[output_ptr], #0x0]\n"
- "add x13, x13, x19, LSL #2\n"
- "ldr x9, [%x[output_ptr], #0x8]\n"
- "add x9, x9, x19, LSL #2\n"
- "b 38f\n"
- "37:" // Height 2: setup direct output
- "mov x13, %x[output_ptr]\n"
- "add x9, x13, x19, LSL #2\n"
- "38:" // Height 2: Column loop
- "tbz %x[flags], #0, 48f\n"
- "cmp x15, #0x10\n"
- "bge 47f\n"
- "tbz x15, #3, 42f\n"
- "ld1 { v8.4s }, [x13], #0x10\n"
- "ld1 { v12.4s }, [x9], #0x10\n"
- "ld1 { v9.4s }, [x13], #0x10\n"
- "ld1 { v13.4s }, [x9], #0x10\n"
- "tbz x15, #2, 40f\n"
- "ld1 { v10.4s }, [x13], #0x10\n"
- "ld1 { v14.4s }, [x9], #0x10\n"
- "tbz x15, #1, 39f\n"
- "mov x19, #0x38\n"
- "ldr d11, [x13], #0x8\n"
- "ldr d15, [x9], #0x8\n"
- "tbz x15, #0, 46f\n"
- "ld1 { v11.s }[2], [x13]\n"
- "ld1 { v15.s }[2], [x9]\n"
- "b 46f\n"
- "39:" // Height 2: Partial accumulate: partial_1_12
- "mov x19, #0x30\n"
- "tbz x15, #0, 46f\n"
- "ldr s11, [x13, #0x0]\n"
- "ldr s15, [x9, #0x0]\n"
- "b 46f\n"
- "40:" // Height 2: Partial accumulate: partial_2_8
- "tbz x15, #1, 41f\n"
- "ldr d10, [x13], #0x8\n"
- "ldr d14, [x9], #0x8\n"
- "mov x19, #0x28\n"
- "tbz x15, #0, 46f\n"
- "ld1 { v10.s }[2], [x13]\n"
- "ld1 { v14.s }[2], [x9]\n"
- "b 46f\n"
- "41:" // Height 2: Partial accumulate: partial_1_8
- "mov x19, #0x20\n"
- "tbz x15, #0, 46f\n"
- "ldr s10, [x13, #0x0]\n"
- "ldr s14, [x9, #0x0]\n"
- "b 46f\n"
- "42:" // Height 2: Partial accumulate: partial_4_0
- "tbz x15, #2, 44f\n"
- "ld1 { v8.4s }, [x13], #0x10\n"
- "ld1 { v12.4s }, [x9], #0x10\n"
- "tbz x15, #1, 43f\n"
- "mov x19, #0x18\n"
- "ldr d9, [x13], #0x8\n"
- "ldr d13, [x9], #0x8\n"
- "tbz x15, #0, 46f\n"
- "ld1 { v9.s }[2], [x13]\n"
- "ld1 { v13.s }[2], [x9]\n"
- "b 46f\n"
- "43:" // Height 2: Partial accumulate: partial_1_4
- "mov x19, #0x10\n"
- "tbz x15, #0, 46f\n"
- "ldr s9, [x13, #0x0]\n"
- "ldr s13, [x9, #0x0]\n"
- "b 46f\n"
- "44:" // Height 2: Partial accumulate: partial_2_0
- "tbz x15, #1, 45f\n"
- "ldr d8, [x13], #0x8\n"
- "ldr d12, [x9], #0x8\n"
- "mov x19, #0x8\n"
- "tbz x15, #0, 46f\n"
- "ld1 { v8.s }[2], [x13]\n"
- "ld1 { v12.s }[2], [x9]\n"
- "b 46f\n"
- "45:" // Height 2: Partial accumulate: partial_1_0
- "mov x19, #0x0\n"
- "ldr s8, [x13, #0x0]\n"
- "ldr s12, [x9, #0x0]\n"
- "46:" // Height 2: Partial accumulate: Done
- "sub x13, x13, x19\n"
- "sub x9, x9, x19\n"
- "b 49f\n"
- "47:" // Height 2: full accumulate
- "ldr q8, [x13, #0x0]\n"
- "ldr q9, [x13, #0x10]\n"
- "ldr q10, [x13, #0x20]\n"
- "ldr q11, [x13, #0x30]\n"
- "ldr q12, [x9, #0x0]\n"
- "ldr q13, [x9, #0x10]\n"
- "ldr q14, [x9, #0x20]\n"
- "ldr q15, [x9, #0x30]\n"
- "b 49f\n"
- "48:" // Height 2: no accumulate
+ "cmp x10, #0x10\n"
+ "add x23, x28, x19, LSL #2\n"
+ "bge 45f\n"
+ "tbz x10, #3, 40f\n"
+ "ld1 { v8.4s }, [x28], #0x10\n"
+ "ld1 { v12.4s }, [x23], #0x10\n"
+ "ld1 { v9.4s }, [x28], #0x10\n"
+ "ld1 { v13.4s }, [x23], #0x10\n"
+ "tbz x10, #2, 38f\n"
+ "ld1 { v10.4s }, [x28], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "tbz x10, #1, 37f\n"
+ "mov x24, #0x38\n"
+ "ldr d11, [x28], #0x8\n"
+ "ldr d15, [x23], #0x8\n"
+ "tbz x10, #0, 44f\n"
+ "ld1 { v11.s }[2], [x28]\n"
+ "ld1 { v15.s }[2], [x23]\n"
+ "b 44f\n"
+ "37:" // Height 2: Partial accumulate: partial_1_12
+ "mov x24, #0x30\n"
+ "tbz x10, #0, 44f\n"
+ "ldr s11, [x28, #0x0]\n"
+ "ldr s15, [x23, #0x0]\n"
+ "b 44f\n"
+ "38:" // Height 2: Partial accumulate: partial_2_8
+ "tbz x10, #1, 39f\n"
+ "ldr d10, [x28], #0x8\n"
+ "ldr d14, [x23], #0x8\n"
+ "mov x24, #0x28\n"
+ "tbz x10, #0, 44f\n"
+ "ld1 { v10.s }[2], [x28]\n"
+ "ld1 { v14.s }[2], [x23]\n"
+ "b 44f\n"
+ "39:" // Height 2: Partial accumulate: partial_1_8
+ "mov x24, #0x20\n"
+ "tbz x10, #0, 44f\n"
+ "ldr s10, [x28, #0x0]\n"
+ "ldr s14, [x23, #0x0]\n"
+ "b 44f\n"
+ "40:" // Height 2: Partial accumulate: partial_4_0
+ "tbz x10, #2, 42f\n"
+ "ld1 { v8.4s }, [x28], #0x10\n"
+ "ld1 { v12.4s }, [x23], #0x10\n"
+ "tbz x10, #1, 41f\n"
+ "mov x24, #0x18\n"
+ "ldr d9, [x28], #0x8\n"
+ "ldr d13, [x23], #0x8\n"
+ "tbz x10, #0, 44f\n"
+ "ld1 { v9.s }[2], [x28]\n"
+ "ld1 { v13.s }[2], [x23]\n"
+ "b 44f\n"
+ "41:" // Height 2: Partial accumulate: partial_1_4
+ "mov x24, #0x10\n"
+ "tbz x10, #0, 44f\n"
+ "ldr s9, [x28, #0x0]\n"
+ "ldr s13, [x23, #0x0]\n"
+ "b 44f\n"
+ "42:" // Height 2: Partial accumulate: partial_2_0
+ "tbz x10, #1, 43f\n"
+ "ldr d8, [x28], #0x8\n"
+ "ldr d12, [x23], #0x8\n"
+ "mov x24, #0x8\n"
+ "tbz x10, #0, 44f\n"
+ "ld1 { v8.s }[2], [x28]\n"
+ "ld1 { v12.s }[2], [x23]\n"
+ "b 44f\n"
+ "43:" // Height 2: Partial accumulate: partial_1_0
+ "ldr s8, [x28, #0x0]\n"
+ "mov x24, #0x0\n"
+ "ldr s12, [x23, #0x0]\n"
+ "44:" // Height 2: Partial accumulate: Done
+ "sub x28, x28, x24\n"
+ "b 47f\n"
+ "45:" // Height 2: full accumulate
+ "ldr q8, [x28, #0x0]\n"
+ "ldr q9, [x28, #0x10]\n"
+ "ldr q10, [x28, #0x20]\n"
+ "ldr q11, [x28, #0x30]\n"
+ "ldr q12, [x23, #0x0]\n"
+ "ldr q13, [x23, #0x10]\n"
+ "ldr q14, [x23, #0x20]\n"
+ "ldr q15, [x23, #0x30]\n"
+ "b 47f\n"
+ "46:" // Height 2: no accumulate
"movi v8.4s, #0x0\n"
"movi v9.4s, #0x0\n"
"movi v10.4s, #0x0\n"
@@ -474,405 +460,395 @@ void a64_hybrid_u8u32_dot_6x16 (
"movi v13.4s, #0x0\n"
"movi v14.4s, #0x0\n"
"movi v15.4s, #0x0\n"
- "49:" // Height 2: setup done
- "mov x12, #0x0\n"
- "50:" // Height 2: String loop
+ "47:" // Height 2: setup done
+ "mov x27, #0x0\n"
+ "48:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w11, [x20, x12, LSL #0x2]\n"
- "tbz %x[flags], #3, 51f\n"
- "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 49f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x10, [x20, #0x0]\n"
- "ldr x28, [x20, #0x8]\n"
- "cbnz x12, 52f\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "cbnz x27, 50f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x10, x10, x19\n"
- "add x28, x28, x19\n"
- "b 52f\n"
- "51:" // Height 2: setup direct input
- "mov x10, %x[input_ptr]\n"
- "add x28, x10, x19\n"
- "52:" // Height 2: input setup done
- "cmp x11, #0x10\n"
- "blt 55f\n"
- "cmp x11, #0x20\n"
- "blt 54f\n"
- "53:" // Height 2: Multiply loop: Main loop head
- "ldr q0, [x10, #0x0]\n"
- "ldr q1, [x28, #0x0]\n"
- "ldr q6, [x14, #0x0]\n"
+ "add x25, x25, x19\n"
+ "add x24, x24, x19\n"
+ "b 50f\n"
+ "49:" // Height 2: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19\n"
+ "50:" // Height 2: input setup done
+ "cmp x26, #0x10\n"
+ "blt 53f\n"
+ "ldr q0, [x25, #0x0]\n"
+ "ldr q1, [x24, #0x0]\n"
+ "cmp x26, #0x20\n"
+ "ldr q6, [x9, #0x0]\n"
+ "blt 52f\n"
+ "51:" // Height 2: Multiply loop: Main loop head
".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x14, #0x10]\n"
+ "ldr q7, [x9, #0x10]\n"
+ "add x25, x25, #0x10\n"
".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr q6, [x14, #0x20]\n"
- "add x10, x10, #0x10\n"
+ "ldr q6, [x9, #0x20]\n"
+ "add x24, x24, #0x10\n"
".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
- "add x28, x28, #0x10\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "sub x26, x26, #0x10\n"
".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "ldr q7, [x14, #0x30]\n"
+ "ldr q7, [x9, #0x30]\n"
+ "cmp x26, #0x20\n"
".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- "sub x11, x11, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
- "ldr q6, [x14, #0x40]\n"
- "cmp x11, #0x20\n"
+ "ldr q6, [x9, #0x40]\n"
".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
- "ldr q7, [x14, #0x50]\n"
+ "ldr q7, [x9, #0x50]\n"
".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n"
- "ldr q6, [x14, #0x60]\n"
+ "ldr q6, [x9, #0x60]\n"
".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n"
- "ldr q7, [x14, #0x70]\n"
+ "ldr q7, [x9, #0x70]\n"
".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n"
- "ldr q6, [x14, #0x80]\n"
+ "ldr q6, [x9, #0x80]\n"
".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n"
- "ldr q7, [x14, #0x90]\n"
+ "ldr q7, [x9, #0x90]\n"
".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n"
- "ldr q6, [x14, #0xa0]\n"
+ "ldr q6, [x9, #0xa0]\n"
".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n"
- "ldr q7, [x14, #0xb0]\n"
+ "ldr q7, [x9, #0xb0]\n"
".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n"
- "ldr q6, [x14, #0xc0]\n"
+ "ldr q6, [x9, #0xc0]\n"
".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n"
- "ldr q7, [x14, #0xd0]\n"
+ "ldr q7, [x9, #0xd0]\n"
".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n"
- "ldr q6, [x14, #0xe0]\n"
+ "ldr q6, [x9, #0xe0]\n"
".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n"
- "ldr q7, [x14, #0xf0]\n"
- "add x14, x14, #0x100\n"
+ "ldr q7, [x9, #0xf0]\n"
+ "add x9, x9, #0x100\n"
".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n"
+ "ldr q6, [x9, #0x0]\n"
".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
+ "ldr q0, [x25, #0x0]\n"
".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n"
- "bge 53b\n"
- "54:" // Height 2: Multiply loop: Single iteration only
- "sub x11, x11, #0x10\n"
- "ldr q0, [x10, #0x0]\n"
- "ldr q1, [x28, #0x0]\n"
- "ldr q6, [x14, #0x0]\n"
+ "ldr q1, [x24, #0x0]\n"
+ "bge 51b\n"
+ "52:" // Height 2: Multiply loop: Single iteration only
".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x14, #0x10]\n"
+ "ldr q7, [x9, #0x10]\n"
+ "sub x26, x26, #0x10\n"
".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr q6, [x14, #0x20]\n"
- "add x10, x10, #0x10\n"
+ "ldr q6, [x9, #0x20]\n"
+ "add x25, x25, #0x10\n"
".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
- "add x28, x28, #0x10\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x24, x24, #0x10\n"
".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "ldr q7, [x14, #0x30]\n"
+ "ldr q7, [x9, #0x30]\n"
".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
- "ldr q6, [x14, #0x40]\n"
+ "ldr q6, [x9, #0x40]\n"
".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
- "ldr q7, [x14, #0x50]\n"
+ "ldr q7, [x9, #0x50]\n"
".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n"
- "ldr q6, [x14, #0x60]\n"
+ "ldr q6, [x9, #0x60]\n"
".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n"
- "ldr q7, [x14, #0x70]\n"
+ "ldr q7, [x9, #0x70]\n"
".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n"
- "ldr q6, [x14, #0x80]\n"
+ "ldr q6, [x9, #0x80]\n"
".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n"
- "ldr q7, [x14, #0x90]\n"
+ "ldr q7, [x9, #0x90]\n"
".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n"
- "ldr q6, [x14, #0xa0]\n"
+ "ldr q6, [x9, #0xa0]\n"
".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n"
- "ldr q7, [x14, #0xb0]\n"
+ "ldr q7, [x9, #0xb0]\n"
".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n"
- "ldr q6, [x14, #0xc0]\n"
+ "ldr q6, [x9, #0xc0]\n"
".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n"
- "ldr q7, [x14, #0xd0]\n"
+ "ldr q7, [x9, #0xd0]\n"
".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n"
- "ldr q6, [x14, #0xe0]\n"
+ "ldr q6, [x9, #0xe0]\n"
".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n"
- "ldr q7, [x14, #0xf0]\n"
- "add x14, x14, #0x100\n"
+ "ldr q7, [x9, #0xf0]\n"
+ "add x9, x9, #0x100\n"
".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n"
".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n"
- "55:" // Height 2: Multiply loop: Main loop skip
- "cbz x11, 60f\n"
- "cmp x11, #0x4\n"
- "blt 57f\n"
- "56:" // Height 2: Multiply loop: Odd block loop
- "ldr s0, [x10], #0x4\n"
- "ldr s1, [x28], #0x4\n"
- "ldr q6, [x14, #0x0]\n"
+ "53:" // Height 2: Multiply loop: Main loop skip
+ "cbz x26, 58f\n"
+ "cmp x26, #0x4\n"
+ "blt 55f\n"
+ "54:" // Height 2: Multiply loop: Odd block loop
+ "ldr s0, [x25], #0x4\n"
+ "sub x26, x26, #0x4\n"
+ "ldr s1, [x24], #0x4\n"
+ "cmp x26, #0x4\n"
+ "ldr q6, [x9, #0x0]\n"
".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x14, #0x10]\n"
+ "ldr q7, [x9, #0x10]\n"
".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr q6, [x14, #0x20]\n"
- "sub x11, x11, #0x4\n"
+ "ldr q6, [x9, #0x20]\n"
".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- "cmp x11, #0x4\n"
".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
- "ldr q7, [x14, #0x30]\n"
+ "ldr q7, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- "add x14, x14, #0x40\n"
".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
- "bge 56b\n"
- "cbz x11, 60f\n"
- "57:" // Height 2: Multiply loop: Skip odd blocks
- "tbz x11, #1, 58f\n"
- "ldr h0, [x10], #0x2\n"
- "ldr h1, [x28], #0x2\n"
- "tbz x11, #0, 59f\n"
- "ld1 { v0.b }[2], [x10]\n"
- "ld1 { v1.b }[2], [x28]\n"
- "b 59f\n"
- "58:" // Height 2: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x10, #0x0]\n"
- "ldr b1, [x28, #0x0]\n"
- "59:" // Height 2: Multiply loop: Ragged operand read: Done
- "ldr q6, [x14, #0x0]\n"
+ "bge 54b\n"
+ "cbz x26, 58f\n"
+ "55:" // Height 2: Multiply loop: Skip odd blocks
+ "tbz x26, #1, 56f\n"
+ "ldr h0, [x25], #0x2\n"
+ "ldr h1, [x24], #0x2\n"
+ "tbz x26, #0, 57f\n"
+ "ld1 { v0.b }[2], [x25]\n"
+ "ld1 { v1.b }[2], [x24]\n"
+ "b 57f\n"
+ "56:" // Height 2: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x25, #0x0]\n"
+ "ldr b1, [x24, #0x0]\n"
+ "57:" // Height 2: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x9, #0x0]\n"
".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x14, #0x10]\n"
+ "ldr q7, [x9, #0x10]\n"
".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
- "ldr q6, [x14, #0x20]\n"
+ "ldr q6, [x9, #0x20]\n"
".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
- "ldr q7, [x14, #0x30]\n"
+ "ldr q7, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- "add x14, x14, #0x40\n"
".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
- "60:" // Height 2: Multiply loop: No odd multiplies
+ "58:" // Height 2: Multiply loop: No odd multiplies
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x12, x12, #0x1\n"
- "cmp x12, x19\n"
- "bne 50b\n"
- "prfm pstl1keep, [x13, #0x0]\n"
- "prfm pstl1keep, [x9, #0x0]\n"
- "cmp x15, #0x10\n"
- "bge 69f\n"
- "tbz x15, #3, 64f\n"
- "st1 { v8.4s }, [x13], #0x10\n"
- "st1 { v9.4s }, [x13], #0x10\n"
- "st1 { v12.4s }, [x9], #0x10\n"
- "st1 { v13.4s }, [x9], #0x10\n"
- "tbz x15, #2, 62f\n"
- "st1 { v10.4s }, [x13], #0x10\n"
- "st1 { v14.4s }, [x9], #0x10\n"
- "tbz x15, #1, 61f\n"
- "str d11, [x13], #0x8\n"
- "str d15, [x9], #0x8\n"
- "tbz x15, #0, 68f\n"
- "st1 { v11.s }[2], [x13]\n"
- "st1 { v15.s }[2], [x9]\n"
- "b 68f\n"
- "61:" // Height 2: Partial direct writeback: partial_1_12
- "tbz x15, #0, 68f\n"
- "str s11, [x13, #0x0]\n"
- "str s15, [x9, #0x0]\n"
- "b 68f\n"
- "62:" // Height 2: Partial direct writeback: partial_2_8
- "tbz x15, #1, 63f\n"
- "str d10, [x13], #0x8\n"
- "str d14, [x9], #0x8\n"
- "tbz x15, #0, 68f\n"
- "st1 { v10.s }[2], [x13]\n"
- "st1 { v14.s }[2], [x9]\n"
- "b 68f\n"
- "63:" // Height 2: Partial direct writeback: partial_1_8
- "tbz x15, #0, 68f\n"
- "str s10, [x13, #0x0]\n"
- "str s14, [x9, #0x0]\n"
- "b 68f\n"
- "64:" // Height 2: Partial direct writeback: partial_4_0
- "tbz x15, #2, 66f\n"
- "st1 { v8.4s }, [x13], #0x10\n"
- "st1 { v12.4s }, [x9], #0x10\n"
- "tbz x15, #1, 65f\n"
- "str d9, [x13], #0x8\n"
- "str d13, [x9], #0x8\n"
- "tbz x15, #0, 68f\n"
- "st1 { v9.s }[2], [x13]\n"
- "st1 { v13.s }[2], [x9]\n"
- "b 68f\n"
- "65:" // Height 2: Partial direct writeback: partial_1_4
- "tbz x15, #0, 68f\n"
- "str s9, [x13, #0x0]\n"
- "str s13, [x9, #0x0]\n"
- "b 68f\n"
- "66:" // Height 2: Partial direct writeback: partial_2_0
- "tbz x15, #1, 67f\n"
- "str d8, [x13], #0x8\n"
- "str d12, [x9], #0x8\n"
- "tbz x15, #0, 68f\n"
- "st1 { v8.s }[2], [x13]\n"
- "st1 { v12.s }[2], [x9]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 48b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "prfm pstl1keep, [x28, #0x0]\n"
+ "cmp x10, #0x10\n"
+ "add x23, x28, x19, LSL #2\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "bge 67f\n"
+ "tbz x10, #3, 62f\n"
+ "st1 { v8.4s }, [x28], #0x10\n"
+ "st1 { v9.4s }, [x28], #0x10\n"
+ "st1 { v12.4s }, [x23], #0x10\n"
+ "st1 { v13.4s }, [x23], #0x10\n"
+ "tbz x10, #2, 60f\n"
+ "st1 { v10.4s }, [x28], #0x10\n"
+ "st1 { v14.4s }, [x23], #0x10\n"
+ "tbz x10, #1, 59f\n"
+ "str d11, [x28], #0x8\n"
+ "str d15, [x23], #0x8\n"
+ "tbz x10, #0, 66f\n"
+ "st1 { v11.s }[2], [x28]\n"
+ "st1 { v15.s }[2], [x23]\n"
+ "b 66f\n"
+ "59:" // Height 2: Partial direct writeback: partial_1_12
+ "tbz x10, #0, 66f\n"
+ "str s11, [x28, #0x0]\n"
+ "str s15, [x23, #0x0]\n"
+ "b 66f\n"
+ "60:" // Height 2: Partial direct writeback: partial_2_8
+ "tbz x10, #1, 61f\n"
+ "str d10, [x28], #0x8\n"
+ "str d14, [x23], #0x8\n"
+ "tbz x10, #0, 66f\n"
+ "st1 { v10.s }[2], [x28]\n"
+ "st1 { v14.s }[2], [x23]\n"
+ "b 66f\n"
+ "61:" // Height 2: Partial direct writeback: partial_1_8
+ "tbz x10, #0, 66f\n"
+ "str s10, [x28, #0x0]\n"
+ "str s14, [x23, #0x0]\n"
+ "b 66f\n"
+ "62:" // Height 2: Partial direct writeback: partial_4_0
+ "tbz x10, #2, 64f\n"
+ "st1 { v8.4s }, [x28], #0x10\n"
+ "st1 { v12.4s }, [x23], #0x10\n"
+ "tbz x10, #1, 63f\n"
+ "str d9, [x28], #0x8\n"
+ "str d13, [x23], #0x8\n"
+ "tbz x10, #0, 66f\n"
+ "st1 { v9.s }[2], [x28]\n"
+ "st1 { v13.s }[2], [x23]\n"
+ "b 66f\n"
+ "63:" // Height 2: Partial direct writeback: partial_1_4
+ "tbz x10, #0, 66f\n"
+ "str s9, [x28, #0x0]\n"
+ "str s13, [x23, #0x0]\n"
+ "b 66f\n"
+ "64:" // Height 2: Partial direct writeback: partial_2_0
+ "tbz x10, #1, 65f\n"
+ "str d8, [x28], #0x8\n"
+ "str d12, [x23], #0x8\n"
+ "tbz x10, #0, 66f\n"
+ "st1 { v8.s }[2], [x28]\n"
+ "st1 { v12.s }[2], [x23]\n"
+ "b 66f\n"
+ "65:" // Height 2: Partial direct writeback: partial_1_0
+ "str s8, [x28, #0x0]\n"
+ "str s12, [x23, #0x0]\n"
+ "66:" // Height 2: Partial direct writeback: Done
"b 68f\n"
- "67:" // Height 2: Partial direct writeback: partial_1_0
- "str s8, [x13, #0x0]\n"
- "str s12, [x9, #0x0]\n"
- "68:" // Height 2: Partial direct writeback: Done
- "b 70f\n"
- "69:" // Height 2: Full writeback
- "str q8, [x13, #0x0]\n"
- "str q9, [x13, #0x10]\n"
- "str q10, [x13, #0x20]\n"
- "str q11, [x13, #0x30]\n"
- "str q12, [x9, #0x0]\n"
- "str q13, [x9, #0x10]\n"
- "str q14, [x9, #0x20]\n"
- "str q15, [x9, #0x30]\n"
- "add x13, x13, #0x40\n"
- "add x9, x9, #0x40\n"
- "70:" // Height 2: Writeback done
- "subs x15, x15, #0x10\n"
- "bgt 38b\n"
- "b 212f\n"
- "71:" // Height 3
- "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "67:" // Height 2: Full writeback
+ "str q8, [x28, #0x0]\n"
+ "str q9, [x28, #0x10]\n"
+ "str q10, [x28, #0x20]\n"
+ "str q11, [x28, #0x30]\n"
+ "add x28, x28, #0x40\n"
+ "str q12, [x23, #0x0]\n"
+ "str q13, [x23, #0x10]\n"
+ "str q14, [x23, #0x20]\n"
+ "str q15, [x23, #0x30]\n"
+ "68:" // Height 2: Writeback done
+ "subs x10, x10, #0x10\n"
+ "bgt 36b\n"
+ "b 206f\n"
+ "69:" // Height 3
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "70:" // Height 3: Column loop
+ "tbz %x[flags], #0, 80f\n"
"ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "tbz %x[flags], #2, 72f\n"
- "ldr x13, [%x[output_ptr], #0x0]\n"
- "add x13, x13, x19, LSL #2\n"
- "ldr x9, [%x[output_ptr], #0x8]\n"
- "ldr x27, [%x[output_ptr], #0x10]\n"
- "add x9, x9, x19, LSL #2\n"
- "add x27, x27, x19, LSL #2\n"
- "b 73f\n"
- "72:" // Height 3: setup direct output
- "mov x13, %x[output_ptr]\n"
- "add x9, x13, x19, LSL #2\n"
- "add x27, x9, x19, LSL #2\n"
- "73:" // Height 3: Column loop
- "tbz %x[flags], #0, 83f\n"
- "cmp x15, #0x10\n"
- "bge 82f\n"
- "tbz x15, #3, 77f\n"
- "ld1 { v8.4s }, [x13], #0x10\n"
- "ld1 { v12.4s }, [x9], #0x10\n"
- "ld1 { v16.4s }, [x27], #0x10\n"
- "ld1 { v9.4s }, [x13], #0x10\n"
- "ld1 { v13.4s }, [x9], #0x10\n"
- "ld1 { v17.4s }, [x27], #0x10\n"
- "tbz x15, #2, 75f\n"
- "ld1 { v10.4s }, [x13], #0x10\n"
- "ld1 { v14.4s }, [x9], #0x10\n"
- "ld1 { v18.4s }, [x27], #0x10\n"
- "tbz x15, #1, 74f\n"
- "mov x19, #0x38\n"
- "ldr d11, [x13], #0x8\n"
- "ldr d15, [x9], #0x8\n"
- "ldr d19, [x27], #0x8\n"
- "tbz x15, #0, 81f\n"
- "ld1 { v11.s }[2], [x13]\n"
- "ld1 { v15.s }[2], [x9]\n"
- "ld1 { v19.s }[2], [x27]\n"
- "b 81f\n"
- "74:" // Height 3: Partial accumulate: partial_1_12
- "mov x19, #0x30\n"
- "tbz x15, #0, 81f\n"
- "ldr s11, [x13, #0x0]\n"
- "ldr s15, [x9, #0x0]\n"
- "ldr s19, [x27, #0x0]\n"
- "b 81f\n"
- "75:" // Height 3: Partial accumulate: partial_2_8
- "tbz x15, #1, 76f\n"
- "ldr d10, [x13], #0x8\n"
- "ldr d14, [x9], #0x8\n"
- "ldr d18, [x27], #0x8\n"
- "mov x19, #0x28\n"
- "tbz x15, #0, 81f\n"
- "ld1 { v10.s }[2], [x13]\n"
- "ld1 { v14.s }[2], [x9]\n"
- "ld1 { v18.s }[2], [x27]\n"
+ "cmp x10, #0x10\n"
+ "add x23, x28, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "bge 79f\n"
+ "tbz x10, #3, 74f\n"
+ "ld1 { v8.4s }, [x28], #0x10\n"
+ "ld1 { v12.4s }, [x23], #0x10\n"
+ "ld1 { v16.4s }, [x22], #0x10\n"
+ "ld1 { v9.4s }, [x28], #0x10\n"
+ "ld1 { v13.4s }, [x23], #0x10\n"
+ "ld1 { v17.4s }, [x22], #0x10\n"
+ "tbz x10, #2, 72f\n"
+ "ld1 { v10.4s }, [x28], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "ld1 { v18.4s }, [x22], #0x10\n"
+ "tbz x10, #1, 71f\n"
+ "mov x24, #0x38\n"
+ "ldr d11, [x28], #0x8\n"
+ "ldr d15, [x23], #0x8\n"
+ "ldr d19, [x22], #0x8\n"
+ "tbz x10, #0, 78f\n"
+ "ld1 { v11.s }[2], [x28]\n"
+ "ld1 { v15.s }[2], [x23]\n"
+ "ld1 { v19.s }[2], [x22]\n"
+ "b 78f\n"
+ "71:" // Height 3: Partial accumulate: partial_1_12
+ "mov x24, #0x30\n"
+ "tbz x10, #0, 78f\n"
+ "ldr s11, [x28, #0x0]\n"
+ "ldr s15, [x23, #0x0]\n"
+ "ldr s19, [x22, #0x0]\n"
+ "b 78f\n"
+ "72:" // Height 3: Partial accumulate: partial_2_8
+ "tbz x10, #1, 73f\n"
+ "ldr d10, [x28], #0x8\n"
+ "ldr d14, [x23], #0x8\n"
+ "mov x24, #0x28\n"
+ "ldr d18, [x22], #0x8\n"
+ "tbz x10, #0, 78f\n"
+ "ld1 { v10.s }[2], [x28]\n"
+ "ld1 { v14.s }[2], [x23]\n"
+ "ld1 { v18.s }[2], [x22]\n"
+ "b 78f\n"
+ "73:" // Height 3: Partial accumulate: partial_1_8
+ "mov x24, #0x20\n"
+ "tbz x10, #0, 78f\n"
+ "ldr s10, [x28, #0x0]\n"
+ "ldr s14, [x23, #0x0]\n"
+ "ldr s18, [x22, #0x0]\n"
+ "b 78f\n"
+ "74:" // Height 3: Partial accumulate: partial_4_0
+ "tbz x10, #2, 76f\n"
+ "ld1 { v8.4s }, [x28], #0x10\n"
+ "ld1 { v12.4s }, [x23], #0x10\n"
+ "ld1 { v16.4s }, [x22], #0x10\n"
+ "tbz x10, #1, 75f\n"
+ "mov x24, #0x18\n"
+ "ldr d9, [x28], #0x8\n"
+ "ldr d13, [x23], #0x8\n"
+ "ldr d17, [x22], #0x8\n"
+ "tbz x10, #0, 78f\n"
+ "ld1 { v9.s }[2], [x28]\n"
+ "ld1 { v13.s }[2], [x23]\n"
+ "ld1 { v17.s }[2], [x22]\n"
+ "b 78f\n"
+ "75:" // Height 3: Partial accumulate: partial_1_4
+ "mov x24, #0x10\n"
+ "tbz x10, #0, 78f\n"
+ "ldr s9, [x28, #0x0]\n"
+ "ldr s13, [x23, #0x0]\n"
+ "ldr s17, [x22, #0x0]\n"
+ "b 78f\n"
+ "76:" // Height 3: Partial accumulate: partial_2_0
+ "tbz x10, #1, 77f\n"
+ "ldr d8, [x28], #0x8\n"
+ "ldr d12, [x23], #0x8\n"
+ "mov x24, #0x8\n"
+ "ldr d16, [x22], #0x8\n"
+ "tbz x10, #0, 78f\n"
+ "ld1 { v8.s }[2], [x28]\n"
+ "ld1 { v12.s }[2], [x23]\n"
+ "ld1 { v16.s }[2], [x22]\n"
+ "b 78f\n"
+ "77:" // Height 3: Partial accumulate: partial_1_0
+ "ldr s8, [x28, #0x0]\n"
+ "mov x24, #0x0\n"
+ "ldr s12, [x23, #0x0]\n"
+ "ldr s16, [x22, #0x0]\n"
+ "78:" // Height 3: Partial accumulate: Done
+ "sub x28, x28, x24\n"
"b 81f\n"
- "76:" // Height 3: Partial accumulate: partial_1_8
- "mov x19, #0x20\n"
- "tbz x15, #0, 81f\n"
- "ldr s10, [x13, #0x0]\n"
- "ldr s14, [x9, #0x0]\n"
- "ldr s18, [x27, #0x0]\n"
+ "79:" // Height 3: full accumulate
+ "ldr q8, [x28, #0x0]\n"
+ "ldr q9, [x28, #0x10]\n"
+ "ldr q10, [x28, #0x20]\n"
+ "ldr q11, [x28, #0x30]\n"
+ "ldr q12, [x23, #0x0]\n"
+ "ldr q13, [x23, #0x10]\n"
+ "ldr q14, [x23, #0x20]\n"
+ "ldr q15, [x23, #0x30]\n"
+ "ldr q16, [x22, #0x0]\n"
+ "ldr q17, [x22, #0x10]\n"
+ "ldr q18, [x22, #0x20]\n"
+ "ldr q19, [x22, #0x30]\n"
"b 81f\n"
- "77:" // Height 3: Partial accumulate: partial_4_0
- "tbz x15, #2, 79f\n"
- "ld1 { v8.4s }, [x13], #0x10\n"
- "ld1 { v12.4s }, [x9], #0x10\n"
- "ld1 { v16.4s }, [x27], #0x10\n"
- "tbz x15, #1, 78f\n"
- "mov x19, #0x18\n"
- "ldr d9, [x13], #0x8\n"
- "ldr d13, [x9], #0x8\n"
- "ldr d17, [x27], #0x8\n"
- "tbz x15, #0, 81f\n"
- "ld1 { v9.s }[2], [x13]\n"
- "ld1 { v13.s }[2], [x9]\n"
- "ld1 { v17.s }[2], [x27]\n"
- "b 81f\n"
- "78:" // Height 3: Partial accumulate: partial_1_4
- "mov x19, #0x10\n"
- "tbz x15, #0, 81f\n"
- "ldr s9, [x13, #0x0]\n"
- "ldr s13, [x9, #0x0]\n"
- "ldr s17, [x27, #0x0]\n"
- "b 81f\n"
- "79:" // Height 3: Partial accumulate: partial_2_0
- "tbz x15, #1, 80f\n"
- "ldr d8, [x13], #0x8\n"
- "ldr d12, [x9], #0x8\n"
- "ldr d16, [x27], #0x8\n"
- "mov x19, #0x8\n"
- "tbz x15, #0, 81f\n"
- "ld1 { v8.s }[2], [x13]\n"
- "ld1 { v12.s }[2], [x9]\n"
- "ld1 { v16.s }[2], [x27]\n"
- "b 81f\n"
- "80:" // Height 3: Partial accumulate: partial_1_0
- "mov x19, #0x0\n"
- "ldr s8, [x13, #0x0]\n"
- "ldr s12, [x9, #0x0]\n"
- "ldr s16, [x27, #0x0]\n"
- "81:" // Height 3: Partial accumulate: Done
- "sub x13, x13, x19\n"
- "sub x9, x9, x19\n"
- "sub x27, x27, x19\n"
- "b 84f\n"
- "82:" // Height 3: full accumulate
- "ldr q8, [x13, #0x0]\n"
- "ldr q9, [x13, #0x10]\n"
- "ldr q10, [x13, #0x20]\n"
- "ldr q11, [x13, #0x30]\n"
- "ldr q12, [x9, #0x0]\n"
- "ldr q13, [x9, #0x10]\n"
- "ldr q14, [x9, #0x20]\n"
- "ldr q15, [x9, #0x30]\n"
- "ldr q16, [x27, #0x0]\n"
- "ldr q17, [x27, #0x10]\n"
- "ldr q18, [x27, #0x20]\n"
- "ldr q19, [x27, #0x30]\n"
- "b 84f\n"
- "83:" // Height 3: no accumulate
+ "80:" // Height 3: no accumulate
"movi v8.4s, #0x0\n"
"movi v9.4s, #0x0\n"
"movi v10.4s, #0x0\n"
@@ -885,504 +861,491 @@ void a64_hybrid_u8u32_dot_6x16 (
"movi v17.4s, #0x0\n"
"movi v18.4s, #0x0\n"
"movi v19.4s, #0x0\n"
- "84:" // Height 3: setup done
- "mov x12, #0x0\n"
- "85:" // Height 3: String loop
+ "81:" // Height 3: setup done
+ "mov x27, #0x0\n"
+ "82:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w11, [x20, x12, LSL #0x2]\n"
- "tbz %x[flags], #3, 86f\n"
- "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 83f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x10, [x20, #0x0]\n"
- "ldr x28, [x20, #0x8]\n"
- "ldr x26, [x20, #0x10]\n"
- "cbnz x12, 87f\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "cbnz x27, 84f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x10, x10, x19\n"
- "add x28, x28, x19\n"
- "add x26, x26, x19\n"
- "b 87f\n"
- "86:" // Height 3: setup direct input
- "mov x10, %x[input_ptr]\n"
- "add x28, x10, x19\n"
- "add x26, x28, x19\n"
- "87:" // Height 3: input setup done
- "cmp x11, #0x10\n"
- "blt 90f\n"
- "cmp x11, #0x20\n"
- "blt 89f\n"
- "88:" // Height 3: Multiply loop: Main loop head
- "ldr q0, [x10, #0x0]\n"
- "ldr q1, [x28, #0x0]\n"
- "ldr q2, [x26, #0x0]\n"
- "ldr q6, [x14, #0x0]\n"
+ "add x25, x25, x19\n"
+ "add x24, x24, x19\n"
+ "add x23, x23, x19\n"
+ "b 84f\n"
+ "83:" // Height 3: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19\n"
+ "add x23, x24, x19\n"
+ "84:" // Height 3: input setup done
+ "cmp x26, #0x10\n"
+ "blt 87f\n"
+ "ldr q0, [x25, #0x0]\n"
+ "ldr q1, [x24, #0x0]\n"
+ "cmp x26, #0x20\n"
+ "ldr q2, [x23, #0x0]\n"
+ "ldr q6, [x9, #0x0]\n"
+ "blt 86f\n"
+ "85:" // Height 3: Multiply loop: Main loop head
".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x14, #0x10]\n"
+ "ldr q7, [x9, #0x10]\n"
+ "add x25, x25, #0x10\n"
".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
- "add x10, x10, #0x10\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x24, x24, #0x10\n"
".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
- "ldr q6, [x14, #0x20]\n"
+ "ldr q6, [x9, #0x20]\n"
+ "add x23, x23, #0x10\n"
".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- "add x28, x28, #0x10\n"
- "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "sub x26, x26, #0x10\n"
".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
- "add x26, x26, #0x10\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "cmp x26, #0x20\n"
".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "ldr q7, [x14, #0x30]\n"
+ "ldr q7, [x9, #0x30]\n"
".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- "sub x11, x11, #0x10\n"
".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
- "cmp x11, #0x20\n"
".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
- "ldr q6, [x14, #0x40]\n"
+ "ldr q6, [x9, #0x40]\n"
".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
- "ldr q7, [x14, #0x50]\n"
+ "ldr q7, [x9, #0x50]\n"
".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n"
".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n"
- "ldr q6, [x14, #0x60]\n"
+ "ldr q6, [x9, #0x60]\n"
".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n"
".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n"
- "ldr q7, [x14, #0x70]\n"
+ "ldr q7, [x9, #0x70]\n"
".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n"
".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n"
- "ldr q6, [x14, #0x80]\n"
+ "ldr q6, [x9, #0x80]\n"
".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n"
".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n"
- "ldr q7, [x14, #0x90]\n"
+ "ldr q7, [x9, #0x90]\n"
".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n"
".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n"
- "ldr q6, [x14, #0xa0]\n"
+ "ldr q6, [x9, #0xa0]\n"
".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n"
".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n"
- "ldr q7, [x14, #0xb0]\n"
+ "ldr q7, [x9, #0xb0]\n"
".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n"
".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n"
- "ldr q6, [x14, #0xc0]\n"
+ "ldr q6, [x9, #0xc0]\n"
".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n"
".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n"
- "ldr q7, [x14, #0xd0]\n"
+ "ldr q7, [x9, #0xd0]\n"
".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n"
".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n"
- "ldr q6, [x14, #0xe0]\n"
+ "ldr q6, [x9, #0xe0]\n"
".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n"
".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n"
- "ldr q7, [x14, #0xf0]\n"
- "add x14, x14, #0x100\n"
+ "ldr q7, [x9, #0xf0]\n"
+ "add x9, x9, #0x100\n"
".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n"
".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n"
+ "ldr q6, [x9, #0x0]\n"
".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
+ "ldr q0, [x25, #0x0]\n"
".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n"
+ "ldr q1, [x24, #0x0]\n"
".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n"
- "bge 88b\n"
- "89:" // Height 3: Multiply loop: Single iteration only
- "sub x11, x11, #0x10\n"
- "ldr q0, [x10, #0x0]\n"
- "ldr q1, [x28, #0x0]\n"
- "ldr q2, [x26, #0x0]\n"
- "ldr q6, [x14, #0x0]\n"
+ "ldr q2, [x23, #0x0]\n"
+ "bge 85b\n"
+ "86:" // Height 3: Multiply loop: Single iteration only
".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x14, #0x10]\n"
+ "ldr q7, [x9, #0x10]\n"
+ "sub x26, x26, #0x10\n"
".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
- "add x10, x10, #0x10\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "add x25, x25, #0x10\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
- "ldr q6, [x14, #0x20]\n"
+ "add x24, x24, #0x10\n"
".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- "add x28, x28, #0x10\n"
- "prfm pldl1keep, [x28, #0x80]\n"
+ "ldr q6, [x9, #0x20]\n"
+ "add x23, x23, #0x10\n"
".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
- "add x26, x26, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "ldr q7, [x14, #0x30]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "ldr q7, [x9, #0x30]\n"
".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
- "ldr q6, [x14, #0x40]\n"
+ "ldr q6, [x9, #0x40]\n"
".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
- "ldr q7, [x14, #0x50]\n"
+ "ldr q7, [x9, #0x50]\n"
".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n"
".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n"
- "ldr q6, [x14, #0x60]\n"
+ "ldr q6, [x9, #0x60]\n"
".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n"
".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n"
- "ldr q7, [x14, #0x70]\n"
+ "ldr q7, [x9, #0x70]\n"
".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n"
".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n"
- "ldr q6, [x14, #0x80]\n"
+ "ldr q6, [x9, #0x80]\n"
".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n"
".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n"
- "ldr q7, [x14, #0x90]\n"
+ "ldr q7, [x9, #0x90]\n"
".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n"
".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n"
- "ldr q6, [x14, #0xa0]\n"
+ "ldr q6, [x9, #0xa0]\n"
".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n"
".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n"
- "ldr q7, [x14, #0xb0]\n"
+ "ldr q7, [x9, #0xb0]\n"
".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n"
".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n"
- "ldr q6, [x14, #0xc0]\n"
+ "ldr q6, [x9, #0xc0]\n"
".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n"
".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n"
- "ldr q7, [x14, #0xd0]\n"
+ "ldr q7, [x9, #0xd0]\n"
".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n"
".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n"
- "ldr q6, [x14, #0xe0]\n"
+ "ldr q6, [x9, #0xe0]\n"
".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n"
".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n"
- "ldr q7, [x14, #0xf0]\n"
- "add x14, x14, #0x100\n"
+ "ldr q7, [x9, #0xf0]\n"
+ "add x9, x9, #0x100\n"
".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n"
".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n"
".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n"
".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n"
- "90:" // Height 3: Multiply loop: Main loop skip
- "cbz x11, 95f\n"
- "cmp x11, #0x4\n"
- "blt 92f\n"
- "91:" // Height 3: Multiply loop: Odd block loop
- "ldr s0, [x10], #0x4\n"
- "ldr s1, [x28], #0x4\n"
- "ldr s2, [x26], #0x4\n"
- "ldr q6, [x14, #0x0]\n"
+ "87:" // Height 3: Multiply loop: Main loop skip
+ "cbz x26, 92f\n"
+ "cmp x26, #0x4\n"
+ "blt 89f\n"
+ "88:" // Height 3: Multiply loop: Odd block loop
+ "ldr s0, [x25], #0x4\n"
+ "sub x26, x26, #0x4\n"
+ "ldr s1, [x24], #0x4\n"
+ "cmp x26, #0x4\n"
+ "ldr s2, [x23], #0x4\n"
+ "ldr q6, [x9, #0x0]\n"
".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x14, #0x10]\n"
+ "ldr q7, [x9, #0x10]\n"
".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
- "sub x11, x11, #0x4\n"
".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
- "ldr q6, [x14, #0x20]\n"
- "cmp x11, #0x4\n"
+ "ldr q6, [x9, #0x20]\n"
".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
- "ldr q7, [x14, #0x30]\n"
+ "ldr q7, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- "add x14, x14, #0x40\n"
".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
- "bge 91b\n"
- "cbz x11, 95f\n"
- "92:" // Height 3: Multiply loop: Skip odd blocks
- "tbz x11, #1, 93f\n"
- "ldr h0, [x10], #0x2\n"
- "ldr h1, [x28], #0x2\n"
- "ldr h2, [x26], #0x2\n"
- "tbz x11, #0, 94f\n"
- "ld1 { v0.b }[2], [x10]\n"
- "ld1 { v1.b }[2], [x28]\n"
- "ld1 { v2.b }[2], [x26]\n"
- "b 94f\n"
- "93:" // Height 3: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x10, #0x0]\n"
- "ldr b1, [x28, #0x0]\n"
- "ldr b2, [x26, #0x0]\n"
- "94:" // Height 3: Multiply loop: Ragged operand read: Done
- "ldr q6, [x14, #0x0]\n"
+ "bge 88b\n"
+ "cbz x26, 92f\n"
+ "89:" // Height 3: Multiply loop: Skip odd blocks
+ "tbz x26, #1, 90f\n"
+ "ldr h0, [x25], #0x2\n"
+ "ldr h1, [x24], #0x2\n"
+ "ldr h2, [x23], #0x2\n"
+ "tbz x26, #0, 91f\n"
+ "ld1 { v0.b }[2], [x25]\n"
+ "ld1 { v1.b }[2], [x24]\n"
+ "ld1 { v2.b }[2], [x23]\n"
+ "b 91f\n"
+ "90:" // Height 3: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x25, #0x0]\n"
+ "ldr b1, [x24, #0x0]\n"
+ "ldr b2, [x23, #0x0]\n"
+ "91:" // Height 3: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x9, #0x0]\n"
".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x14, #0x10]\n"
+ "ldr q7, [x9, #0x10]\n"
".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
- "ldr q6, [x14, #0x20]\n"
+ "ldr q6, [x9, #0x20]\n"
".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
- "ldr q7, [x14, #0x30]\n"
+ "ldr q7, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- "add x14, x14, #0x40\n"
".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
- "95:" // Height 3: Multiply loop: No odd multiplies
+ "92:" // Height 3: Multiply loop: No odd multiplies
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x12, x12, #0x1\n"
- "cmp x12, x19\n"
- "bne 85b\n"
- "prfm pstl1keep, [x13, #0x0]\n"
- "prfm pstl1keep, [x9, #0x0]\n"
- "cmp x15, #0x10\n"
- "prfm pstl1keep, [x27, #0x0]\n"
- "bge 104f\n"
- "tbz x15, #3, 99f\n"
- "st1 { v8.4s }, [x13], #0x10\n"
- "st1 { v9.4s }, [x13], #0x10\n"
- "st1 { v12.4s }, [x9], #0x10\n"
- "st1 { v13.4s }, [x9], #0x10\n"
- "st1 { v16.4s }, [x27], #0x10\n"
- "st1 { v17.4s }, [x27], #0x10\n"
- "tbz x15, #2, 97f\n"
- "st1 { v10.4s }, [x13], #0x10\n"
- "st1 { v14.4s }, [x9], #0x10\n"
- "st1 { v18.4s }, [x27], #0x10\n"
- "tbz x15, #1, 96f\n"
- "str d11, [x13], #0x8\n"
- "str d15, [x9], #0x8\n"
- "str d19, [x27], #0x8\n"
- "tbz x15, #0, 103f\n"
- "st1 { v11.s }[2], [x13]\n"
- "st1 { v15.s }[2], [x9]\n"
- "st1 { v19.s }[2], [x27]\n"
- "b 103f\n"
- "96:" // Height 3: Partial direct writeback: partial_1_12
- "tbz x15, #0, 103f\n"
- "str s11, [x13, #0x0]\n"
- "str s15, [x9, #0x0]\n"
- "str s19, [x27, #0x0]\n"
- "b 103f\n"
- "97:" // Height 3: Partial direct writeback: partial_2_8
- "tbz x15, #1, 98f\n"
- "str d10, [x13], #0x8\n"
- "str d14, [x9], #0x8\n"
- "str d18, [x27], #0x8\n"
- "tbz x15, #0, 103f\n"
- "st1 { v10.s }[2], [x13]\n"
- "st1 { v14.s }[2], [x9]\n"
- "st1 { v18.s }[2], [x27]\n"
- "b 103f\n"
- "98:" // Height 3: Partial direct writeback: partial_1_8
- "tbz x15, #0, 103f\n"
- "str s10, [x13, #0x0]\n"
- "str s14, [x9, #0x0]\n"
- "str s18, [x27, #0x0]\n"
- "b 103f\n"
- "99:" // Height 3: Partial direct writeback: partial_4_0
- "tbz x15, #2, 101f\n"
- "st1 { v8.4s }, [x13], #0x10\n"
- "st1 { v12.4s }, [x9], #0x10\n"
- "st1 { v16.4s }, [x27], #0x10\n"
- "tbz x15, #1, 100f\n"
- "str d9, [x13], #0x8\n"
- "str d13, [x9], #0x8\n"
- "str d17, [x27], #0x8\n"
- "tbz x15, #0, 103f\n"
- "st1 { v9.s }[2], [x13]\n"
- "st1 { v13.s }[2], [x9]\n"
- "st1 { v17.s }[2], [x27]\n"
- "b 103f\n"
- "100:" // Height 3: Partial direct writeback: partial_1_4
- "tbz x15, #0, 103f\n"
- "str s9, [x13, #0x0]\n"
- "str s13, [x9, #0x0]\n"
- "str s17, [x27, #0x0]\n"
- "b 103f\n"
- "101:" // Height 3: Partial direct writeback: partial_2_0
- "tbz x15, #1, 102f\n"
- "str d8, [x13], #0x8\n"
- "str d12, [x9], #0x8\n"
- "str d16, [x27], #0x8\n"
- "tbz x15, #0, 103f\n"
- "st1 { v8.s }[2], [x13]\n"
- "st1 { v12.s }[2], [x9]\n"
- "st1 { v16.s }[2], [x27]\n"
- "b 103f\n"
- "102:" // Height 3: Partial direct writeback: partial_1_0
- "str s8, [x13, #0x0]\n"
- "str s12, [x9, #0x0]\n"
- "str s16, [x27, #0x0]\n"
- "103:" // Height 3: Partial direct writeback: Done
- "b 105f\n"
- "104:" // Height 3: Full writeback
- "str q8, [x13, #0x0]\n"
- "str q9, [x13, #0x10]\n"
- "str q10, [x13, #0x20]\n"
- "str q11, [x13, #0x30]\n"
- "str q12, [x9, #0x0]\n"
- "str q13, [x9, #0x10]\n"
- "str q14, [x9, #0x20]\n"
- "str q15, [x9, #0x30]\n"
- "str q16, [x27, #0x0]\n"
- "str q17, [x27, #0x10]\n"
- "str q18, [x27, #0x20]\n"
- "str q19, [x27, #0x30]\n"
- "add x13, x13, #0x40\n"
- "add x9, x9, #0x40\n"
- "add x27, x27, #0x40\n"
- "105:" // Height 3: Writeback done
- "subs x15, x15, #0x10\n"
- "bgt 73b\n"
- "b 212f\n"
- "106:" // Height 4
- "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 82b\n"
"ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "tbz %x[flags], #2, 107f\n"
- "ldr x13, [%x[output_ptr], #0x0]\n"
- "add x13, x13, x19, LSL #2\n"
- "ldr x9, [%x[output_ptr], #0x8]\n"
- "ldr x27, [%x[output_ptr], #0x10]\n"
- "add x9, x9, x19, LSL #2\n"
- "ldr x25, [%x[output_ptr], #0x18]\n"
- "add x27, x27, x19, LSL #2\n"
- "add x25, x25, x19, LSL #2\n"
- "b 108f\n"
- "107:" // Height 4: setup direct output
- "mov x13, %x[output_ptr]\n"
- "add x9, x13, x19, LSL #2\n"
- "add x27, x9, x19, LSL #2\n"
- "add x25, x27, x19, LSL #2\n"
- "108:" // Height 4: Column loop
- "tbz %x[flags], #0, 118f\n"
- "cmp x15, #0x10\n"
- "bge 117f\n"
- "tbz x15, #3, 112f\n"
- "ld1 { v8.4s }, [x13], #0x10\n"
- "ld1 { v12.4s }, [x9], #0x10\n"
- "ld1 { v16.4s }, [x27], #0x10\n"
- "ld1 { v20.4s }, [x25], #0x10\n"
- "ld1 { v9.4s }, [x13], #0x10\n"
- "ld1 { v13.4s }, [x9], #0x10\n"
- "ld1 { v17.4s }, [x27], #0x10\n"
- "ld1 { v21.4s }, [x25], #0x10\n"
- "tbz x15, #2, 110f\n"
- "ld1 { v10.4s }, [x13], #0x10\n"
- "ld1 { v14.4s }, [x9], #0x10\n"
- "ld1 { v18.4s }, [x27], #0x10\n"
- "ld1 { v22.4s }, [x25], #0x10\n"
- "tbz x15, #1, 109f\n"
- "mov x19, #0x38\n"
- "ldr d11, [x13], #0x8\n"
- "ldr d15, [x9], #0x8\n"
- "ldr d19, [x27], #0x8\n"
- "ldr d23, [x25], #0x8\n"
- "tbz x15, #0, 116f\n"
- "ld1 { v11.s }[2], [x13]\n"
- "ld1 { v15.s }[2], [x9]\n"
- "ld1 { v19.s }[2], [x27]\n"
- "ld1 { v23.s }[2], [x25]\n"
- "b 116f\n"
- "109:" // Height 4: Partial accumulate: partial_1_12
- "mov x19, #0x30\n"
- "tbz x15, #0, 116f\n"
- "ldr s11, [x13, #0x0]\n"
- "ldr s15, [x9, #0x0]\n"
- "ldr s19, [x27, #0x0]\n"
- "ldr s23, [x25, #0x0]\n"
- "b 116f\n"
- "110:" // Height 4: Partial accumulate: partial_2_8
- "tbz x15, #1, 111f\n"
- "ldr d10, [x13], #0x8\n"
- "ldr d14, [x9], #0x8\n"
- "ldr d18, [x27], #0x8\n"
- "ldr d22, [x25], #0x8\n"
- "mov x19, #0x28\n"
- "tbz x15, #0, 116f\n"
- "ld1 { v10.s }[2], [x13]\n"
- "ld1 { v14.s }[2], [x9]\n"
- "ld1 { v18.s }[2], [x27]\n"
- "ld1 { v22.s }[2], [x25]\n"
- "b 116f\n"
- "111:" // Height 4: Partial accumulate: partial_1_8
- "mov x19, #0x20\n"
- "tbz x15, #0, 116f\n"
- "ldr s10, [x13, #0x0]\n"
- "ldr s14, [x9, #0x0]\n"
- "ldr s18, [x27, #0x0]\n"
- "ldr s22, [x25, #0x0]\n"
- "b 116f\n"
- "112:" // Height 4: Partial accumulate: partial_4_0
- "tbz x15, #2, 114f\n"
- "ld1 { v8.4s }, [x13], #0x10\n"
- "ld1 { v12.4s }, [x9], #0x10\n"
- "ld1 { v16.4s }, [x27], #0x10\n"
- "ld1 { v20.4s }, [x25], #0x10\n"
- "tbz x15, #1, 113f\n"
- "mov x19, #0x18\n"
- "ldr d9, [x13], #0x8\n"
- "ldr d13, [x9], #0x8\n"
- "ldr d17, [x27], #0x8\n"
- "ldr d21, [x25], #0x8\n"
- "tbz x15, #0, 116f\n"
- "ld1 { v9.s }[2], [x13]\n"
- "ld1 { v13.s }[2], [x9]\n"
- "ld1 { v17.s }[2], [x27]\n"
- "ld1 { v21.s }[2], [x25]\n"
- "b 116f\n"
- "113:" // Height 4: Partial accumulate: partial_1_4
- "mov x19, #0x10\n"
- "tbz x15, #0, 116f\n"
- "ldr s9, [x13, #0x0]\n"
- "ldr s13, [x9, #0x0]\n"
- "ldr s17, [x27, #0x0]\n"
- "ldr s21, [x25, #0x0]\n"
- "b 116f\n"
- "114:" // Height 4: Partial accumulate: partial_2_0
- "tbz x15, #1, 115f\n"
- "ldr d8, [x13], #0x8\n"
- "ldr d12, [x9], #0x8\n"
- "ldr d16, [x27], #0x8\n"
- "ldr d20, [x25], #0x8\n"
- "mov x19, #0x8\n"
- "tbz x15, #0, 116f\n"
- "ld1 { v8.s }[2], [x13]\n"
- "ld1 { v12.s }[2], [x9]\n"
- "ld1 { v16.s }[2], [x27]\n"
- "ld1 { v20.s }[2], [x25]\n"
- "b 116f\n"
- "115:" // Height 4: Partial accumulate: partial_1_0
- "mov x19, #0x0\n"
- "ldr s8, [x13, #0x0]\n"
- "ldr s12, [x9, #0x0]\n"
- "ldr s16, [x27, #0x0]\n"
- "ldr s20, [x25, #0x0]\n"
- "116:" // Height 4: Partial accumulate: Done
- "sub x13, x13, x19\n"
- "sub x9, x9, x19\n"
- "sub x27, x27, x19\n"
- "sub x25, x25, x19\n"
- "b 119f\n"
- "117:" // Height 4: full accumulate
- "ldr q8, [x13, #0x0]\n"
- "ldr q9, [x13, #0x10]\n"
- "ldr q10, [x13, #0x20]\n"
- "ldr q11, [x13, #0x30]\n"
- "ldr q12, [x9, #0x0]\n"
- "ldr q13, [x9, #0x10]\n"
- "ldr q14, [x9, #0x20]\n"
- "ldr q15, [x9, #0x30]\n"
- "ldr q16, [x27, #0x0]\n"
- "ldr q17, [x27, #0x10]\n"
- "ldr q18, [x27, #0x20]\n"
- "ldr q19, [x27, #0x30]\n"
- "ldr q20, [x25, #0x0]\n"
- "ldr q21, [x25, #0x10]\n"
- "ldr q22, [x25, #0x20]\n"
- "ldr q23, [x25, #0x30]\n"
- "b 119f\n"
- "118:" // Height 4: no accumulate
+ "prfm pstl1keep, [x28, #0x0]\n"
+ "cmp x10, #0x10\n"
+ "add x23, x28, x19, LSL #2\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "add x22, x23, x19, LSL #2\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "bge 101f\n"
+ "tbz x10, #3, 96f\n"
+ "st1 { v8.4s }, [x28], #0x10\n"
+ "st1 { v9.4s }, [x28], #0x10\n"
+ "st1 { v12.4s }, [x23], #0x10\n"
+ "st1 { v13.4s }, [x23], #0x10\n"
+ "st1 { v16.4s }, [x22], #0x10\n"
+ "st1 { v17.4s }, [x22], #0x10\n"
+ "tbz x10, #2, 94f\n"
+ "st1 { v10.4s }, [x28], #0x10\n"
+ "st1 { v14.4s }, [x23], #0x10\n"
+ "st1 { v18.4s }, [x22], #0x10\n"
+ "tbz x10, #1, 93f\n"
+ "str d11, [x28], #0x8\n"
+ "str d15, [x23], #0x8\n"
+ "str d19, [x22], #0x8\n"
+ "tbz x10, #0, 100f\n"
+ "st1 { v11.s }[2], [x28]\n"
+ "st1 { v15.s }[2], [x23]\n"
+ "st1 { v19.s }[2], [x22]\n"
+ "b 100f\n"
+ "93:" // Height 3: Partial direct writeback: partial_1_12
+ "tbz x10, #0, 100f\n"
+ "str s11, [x28, #0x0]\n"
+ "str s15, [x23, #0x0]\n"
+ "str s19, [x22, #0x0]\n"
+ "b 100f\n"
+ "94:" // Height 3: Partial direct writeback: partial_2_8
+ "tbz x10, #1, 95f\n"
+ "str d10, [x28], #0x8\n"
+ "str d14, [x23], #0x8\n"
+ "str d18, [x22], #0x8\n"
+ "tbz x10, #0, 100f\n"
+ "st1 { v10.s }[2], [x28]\n"
+ "st1 { v14.s }[2], [x23]\n"
+ "st1 { v18.s }[2], [x22]\n"
+ "b 100f\n"
+ "95:" // Height 3: Partial direct writeback: partial_1_8
+ "tbz x10, #0, 100f\n"
+ "str s10, [x28, #0x0]\n"
+ "str s14, [x23, #0x0]\n"
+ "str s18, [x22, #0x0]\n"
+ "b 100f\n"
+ "96:" // Height 3: Partial direct writeback: partial_4_0
+ "tbz x10, #2, 98f\n"
+ "st1 { v8.4s }, [x28], #0x10\n"
+ "st1 { v12.4s }, [x23], #0x10\n"
+ "st1 { v16.4s }, [x22], #0x10\n"
+ "tbz x10, #1, 97f\n"
+ "str d9, [x28], #0x8\n"
+ "str d13, [x23], #0x8\n"
+ "str d17, [x22], #0x8\n"
+ "tbz x10, #0, 100f\n"
+ "st1 { v9.s }[2], [x28]\n"
+ "st1 { v13.s }[2], [x23]\n"
+ "st1 { v17.s }[2], [x22]\n"
+ "b 100f\n"
+ "97:" // Height 3: Partial direct writeback: partial_1_4
+ "tbz x10, #0, 100f\n"
+ "str s9, [x28, #0x0]\n"
+ "str s13, [x23, #0x0]\n"
+ "str s17, [x22, #0x0]\n"
+ "b 100f\n"
+ "98:" // Height 3: Partial direct writeback: partial_2_0
+ "tbz x10, #1, 99f\n"
+ "str d8, [x28], #0x8\n"
+ "str d12, [x23], #0x8\n"
+ "str d16, [x22], #0x8\n"
+ "tbz x10, #0, 100f\n"
+ "st1 { v8.s }[2], [x28]\n"
+ "st1 { v12.s }[2], [x23]\n"
+ "st1 { v16.s }[2], [x22]\n"
+ "b 100f\n"
+ "99:" // Height 3: Partial direct writeback: partial_1_0
+ "str s8, [x28, #0x0]\n"
+ "str s12, [x23, #0x0]\n"
+ "str s16, [x22, #0x0]\n"
+ "100:" // Height 3: Partial direct writeback: Done
+ "b 102f\n"
+ "101:" // Height 3: Full writeback
+ "str q8, [x28, #0x0]\n"
+ "str q9, [x28, #0x10]\n"
+ "str q10, [x28, #0x20]\n"
+ "str q11, [x28, #0x30]\n"
+ "add x28, x28, #0x40\n"
+ "str q12, [x23, #0x0]\n"
+ "str q13, [x23, #0x10]\n"
+ "str q14, [x23, #0x20]\n"
+ "str q15, [x23, #0x30]\n"
+ "str q16, [x22, #0x0]\n"
+ "str q17, [x22, #0x10]\n"
+ "str q18, [x22, #0x20]\n"
+ "str q19, [x22, #0x30]\n"
+ "102:" // Height 3: Writeback done
+ "subs x10, x10, #0x10\n"
+ "bgt 70b\n"
+ "b 206f\n"
+ "103:" // Height 4
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "104:" // Height 4: Column loop
+ "tbz %x[flags], #0, 114f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x10, #0x10\n"
+ "add x23, x28, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "add x21, x22, x19, LSL #2\n"
+ "bge 113f\n"
+ "tbz x10, #3, 108f\n"
+ "ld1 { v8.4s }, [x28], #0x10\n"
+ "ld1 { v12.4s }, [x23], #0x10\n"
+ "ld1 { v16.4s }, [x22], #0x10\n"
+ "ld1 { v20.4s }, [x21], #0x10\n"
+ "ld1 { v9.4s }, [x28], #0x10\n"
+ "ld1 { v13.4s }, [x23], #0x10\n"
+ "ld1 { v17.4s }, [x22], #0x10\n"
+ "ld1 { v21.4s }, [x21], #0x10\n"
+ "tbz x10, #2, 106f\n"
+ "ld1 { v10.4s }, [x28], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "ld1 { v18.4s }, [x22], #0x10\n"
+ "ld1 { v22.4s }, [x21], #0x10\n"
+ "tbz x10, #1, 105f\n"
+ "mov x24, #0x38\n"
+ "ldr d11, [x28], #0x8\n"
+ "ldr d15, [x23], #0x8\n"
+ "ldr d19, [x22], #0x8\n"
+ "ldr d23, [x21], #0x8\n"
+ "tbz x10, #0, 112f\n"
+ "ld1 { v11.s }[2], [x28]\n"
+ "ld1 { v15.s }[2], [x23]\n"
+ "ld1 { v19.s }[2], [x22]\n"
+ "ld1 { v23.s }[2], [x21]\n"
+ "b 112f\n"
+ "105:" // Height 4: Partial accumulate: partial_1_12
+ "mov x24, #0x30\n"
+ "tbz x10, #0, 112f\n"
+ "ldr s11, [x28, #0x0]\n"
+ "ldr s15, [x23, #0x0]\n"
+ "ldr s19, [x22, #0x0]\n"
+ "ldr s23, [x21, #0x0]\n"
+ "b 112f\n"
+ "106:" // Height 4: Partial accumulate: partial_2_8
+ "tbz x10, #1, 107f\n"
+ "ldr d10, [x28], #0x8\n"
+ "ldr d14, [x23], #0x8\n"
+ "mov x24, #0x28\n"
+ "ldr d18, [x22], #0x8\n"
+ "ldr d22, [x21], #0x8\n"
+ "tbz x10, #0, 112f\n"
+ "ld1 { v10.s }[2], [x28]\n"
+ "ld1 { v14.s }[2], [x23]\n"
+ "ld1 { v18.s }[2], [x22]\n"
+ "ld1 { v22.s }[2], [x21]\n"
+ "b 112f\n"
+ "107:" // Height 4: Partial accumulate: partial_1_8
+ "mov x24, #0x20\n"
+ "tbz x10, #0, 112f\n"
+ "ldr s10, [x28, #0x0]\n"
+ "ldr s14, [x23, #0x0]\n"
+ "ldr s18, [x22, #0x0]\n"
+ "ldr s22, [x21, #0x0]\n"
+ "b 112f\n"
+ "108:" // Height 4: Partial accumulate: partial_4_0
+ "tbz x10, #2, 110f\n"
+ "ld1 { v8.4s }, [x28], #0x10\n"
+ "ld1 { v12.4s }, [x23], #0x10\n"
+ "ld1 { v16.4s }, [x22], #0x10\n"
+ "ld1 { v20.4s }, [x21], #0x10\n"
+ "tbz x10, #1, 109f\n"
+ "mov x24, #0x18\n"
+ "ldr d9, [x28], #0x8\n"
+ "ldr d13, [x23], #0x8\n"
+ "ldr d17, [x22], #0x8\n"
+ "ldr d21, [x21], #0x8\n"
+ "tbz x10, #0, 112f\n"
+ "ld1 { v9.s }[2], [x28]\n"
+ "ld1 { v13.s }[2], [x23]\n"
+ "ld1 { v17.s }[2], [x22]\n"
+ "ld1 { v21.s }[2], [x21]\n"
+ "b 112f\n"
+ "109:" // Height 4: Partial accumulate: partial_1_4
+ "mov x24, #0x10\n"
+ "tbz x10, #0, 112f\n"
+ "ldr s9, [x28, #0x0]\n"
+ "ldr s13, [x23, #0x0]\n"
+ "ldr s17, [x22, #0x0]\n"
+ "ldr s21, [x21, #0x0]\n"
+ "b 112f\n"
+ "110:" // Height 4: Partial accumulate: partial_2_0
+ "tbz x10, #1, 111f\n"
+ "ldr d8, [x28], #0x8\n"
+ "ldr d12, [x23], #0x8\n"
+ "mov x24, #0x8\n"
+ "ldr d16, [x22], #0x8\n"
+ "ldr d20, [x21], #0x8\n"
+ "tbz x10, #0, 112f\n"
+ "ld1 { v8.s }[2], [x28]\n"
+ "ld1 { v12.s }[2], [x23]\n"
+ "ld1 { v16.s }[2], [x22]\n"
+ "ld1 { v20.s }[2], [x21]\n"
+ "b 112f\n"
+ "111:" // Height 4: Partial accumulate: partial_1_0
+ "ldr s8, [x28, #0x0]\n"
+ "mov x24, #0x0\n"
+ "ldr s12, [x23, #0x0]\n"
+ "ldr s16, [x22, #0x0]\n"
+ "ldr s20, [x21, #0x0]\n"
+ "112:" // Height 4: Partial accumulate: Done
+ "sub x28, x28, x24\n"
+ "b 115f\n"
+ "113:" // Height 4: full accumulate
+ "ldr q8, [x28, #0x0]\n"
+ "ldr q9, [x28, #0x10]\n"
+ "ldr q10, [x28, #0x20]\n"
+ "ldr q11, [x28, #0x30]\n"
+ "ldr q12, [x23, #0x0]\n"
+ "ldr q13, [x23, #0x10]\n"
+ "ldr q14, [x23, #0x20]\n"
+ "ldr q15, [x23, #0x30]\n"
+ "ldr q16, [x22, #0x0]\n"
+ "ldr q17, [x22, #0x10]\n"
+ "ldr q18, [x22, #0x20]\n"
+ "ldr q19, [x22, #0x30]\n"
+ "ldr q20, [x21, #0x0]\n"
+ "ldr q21, [x21, #0x10]\n"
+ "ldr q22, [x21, #0x20]\n"
+ "ldr q23, [x21, #0x30]\n"
+ "b 115f\n"
+ "114:" // Height 4: no accumulate
"movi v8.4s, #0x0\n"
"movi v9.4s, #0x0\n"
"movi v10.4s, #0x0\n"
@@ -1399,220 +1362,220 @@ void a64_hybrid_u8u32_dot_6x16 (
"movi v21.4s, #0x0\n"
"movi v22.4s, #0x0\n"
"movi v23.4s, #0x0\n"
- "119:" // Height 4: setup done
- "mov x12, #0x0\n"
- "120:" // Height 4: String loop
+ "115:" // Height 4: setup done
+ "mov x27, #0x0\n"
+ "116:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w11, [x20, x12, LSL #0x2]\n"
- "tbz %x[flags], #3, 121f\n"
- "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 117f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x10, [x20, #0x0]\n"
- "ldr x28, [x20, #0x8]\n"
- "ldr x26, [x20, #0x10]\n"
- "ldr x24, [x20, #0x18]\n"
- "cbnz x12, 122f\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "ldr x22, [x20, #0x18]\n"
+ "cbnz x27, 118f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x10, x10, x19\n"
- "add x28, x28, x19\n"
- "add x26, x26, x19\n"
+ "add x25, x25, x19\n"
"add x24, x24, x19\n"
- "b 122f\n"
- "121:" // Height 4: setup direct input
- "mov x10, %x[input_ptr]\n"
- "add x28, x10, x19\n"
- "add x26, x28, x19\n"
- "add x24, x26, x19\n"
- "122:" // Height 4: input setup done
- "cmp x11, #0x10\n"
- "blt 125f\n"
- "cmp x11, #0x20\n"
- "blt 124f\n"
- "123:" // Height 4: Multiply loop: Main loop head
- "ldr q0, [x10, #0x0]\n"
- "ldr q1, [x28, #0x0]\n"
- "ldr q2, [x26, #0x0]\n"
- "ldr q3, [x24, #0x0]\n"
- "ldr q6, [x14, #0x0]\n"
+ "add x23, x23, x19\n"
+ "add x22, x22, x19\n"
+ "b 118f\n"
+ "117:" // Height 4: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19\n"
+ "add x23, x24, x19\n"
+ "add x22, x23, x19\n"
+ "118:" // Height 4: input setup done
+ "cmp x26, #0x10\n"
+ "blt 121f\n"
+ "ldr q0, [x25, #0x0]\n"
+ "ldr q1, [x24, #0x0]\n"
+ "cmp x26, #0x20\n"
+ "ldr q2, [x23, #0x0]\n"
+ "ldr q3, [x22, #0x0]\n"
+ "ldr q6, [x9, #0x0]\n"
+ "blt 120f\n"
+ "119:" // Height 4: Multiply loop: Main loop head
".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x14, #0x10]\n"
+ "ldr q7, [x9, #0x10]\n"
+ "add x25, x25, #0x10\n"
".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
- "add x10, x10, #0x10\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x24, x24, #0x10\n"
".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
- "add x28, x28, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "add x23, x23, #0x10\n"
".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "ldr q6, [x14, #0x20]\n"
+ "ldr q6, [x9, #0x20]\n"
+ "add x22, x22, #0x10\n"
".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- "add x26, x26, #0x10\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "sub x26, x26, #0x10\n"
".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
- "add x24, x24, #0x10\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "cmp x26, #0x20\n"
".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "sub x11, x11, #0x10\n"
".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
- "ldr q7, [x14, #0x30]\n"
+ "ldr q7, [x9, #0x30]\n"
".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- "cmp x11, #0x20\n"
".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
- "ldr q6, [x14, #0x40]\n"
+ "ldr q6, [x9, #0x40]\n"
".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
- "ldr q7, [x14, #0x50]\n"
+ "ldr q7, [x9, #0x50]\n"
".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n"
".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n"
".inst 0x6fa3e0d4 // udot v20.4s, v6.16b, v3.4b[1]\n"
- "ldr q6, [x14, #0x60]\n"
+ "ldr q6, [x9, #0x60]\n"
".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n"
".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n"
".inst 0x6fa3e0f5 // udot v21.4s, v7.16b, v3.4b[1]\n"
- "ldr q7, [x14, #0x70]\n"
+ "ldr q7, [x9, #0x70]\n"
".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n"
".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n"
".inst 0x6fa3e0d6 // udot v22.4s, v6.16b, v3.4b[1]\n"
- "ldr q6, [x14, #0x80]\n"
+ "ldr q6, [x9, #0x80]\n"
".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n"
".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n"
".inst 0x6fa3e0f7 // udot v23.4s, v7.16b, v3.4b[1]\n"
- "ldr q7, [x14, #0x90]\n"
+ "ldr q7, [x9, #0x90]\n"
".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n"
".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n"
".inst 0x6f83e8d4 // udot v20.4s, v6.16b, v3.4b[2]\n"
- "ldr q6, [x14, #0xa0]\n"
+ "ldr q6, [x9, #0xa0]\n"
".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n"
".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n"
".inst 0x6f83e8f5 // udot v21.4s, v7.16b, v3.4b[2]\n"
- "ldr q7, [x14, #0xb0]\n"
+ "ldr q7, [x9, #0xb0]\n"
".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n"
".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n"
".inst 0x6f83e8d6 // udot v22.4s, v6.16b, v3.4b[2]\n"
- "ldr q6, [x14, #0xc0]\n"
+ "ldr q6, [x9, #0xc0]\n"
".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n"
".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n"
".inst 0x6f83e8f7 // udot v23.4s, v7.16b, v3.4b[2]\n"
- "ldr q7, [x14, #0xd0]\n"
+ "ldr q7, [x9, #0xd0]\n"
".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n"
".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n"
".inst 0x6fa3e8d4 // udot v20.4s, v6.16b, v3.4b[3]\n"
- "ldr q6, [x14, #0xe0]\n"
+ "ldr q6, [x9, #0xe0]\n"
".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n"
".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n"
".inst 0x6fa3e8f5 // udot v21.4s, v7.16b, v3.4b[3]\n"
- "ldr q7, [x14, #0xf0]\n"
- "add x14, x14, #0x100\n"
+ "ldr q7, [x9, #0xf0]\n"
+ "add x9, x9, #0x100\n"
".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n"
".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n"
".inst 0x6fa3e8d6 // udot v22.4s, v6.16b, v3.4b[3]\n"
+ "ldr q6, [x9, #0x0]\n"
".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
+ "ldr q0, [x25, #0x0]\n"
".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n"
+ "ldr q1, [x24, #0x0]\n"
".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n"
+ "ldr q2, [x23, #0x0]\n"
".inst 0x6fa3e8f7 // udot v23.4s, v7.16b, v3.4b[3]\n"
- "bge 123b\n"
- "124:" // Height 4: Multiply loop: Single iteration only
- "sub x11, x11, #0x10\n"
- "ldr q0, [x10, #0x0]\n"
- "ldr q1, [x28, #0x0]\n"
- "ldr q2, [x26, #0x0]\n"
- "ldr q3, [x24, #0x0]\n"
- "ldr q6, [x14, #0x0]\n"
+ "ldr q3, [x22, #0x0]\n"
+ "bge 119b\n"
+ "120:" // Height 4: Multiply loop: Single iteration only
".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x14, #0x10]\n"
+ "ldr q7, [x9, #0x10]\n"
+ "sub x26, x26, #0x10\n"
".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
- "add x10, x10, #0x10\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "add x25, x25, #0x10\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
- "add x28, x28, #0x10\n"
+ "add x24, x24, #0x10\n"
".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "ldr q6, [x14, #0x20]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "add x23, x23, #0x10\n"
".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- "add x26, x26, #0x10\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "ldr q6, [x9, #0x20]\n"
+ "add x22, x22, #0x10\n"
".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
- "add x24, x24, #0x10\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
- "ldr q7, [x14, #0x30]\n"
+ "ldr q7, [x9, #0x30]\n"
".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
- "ldr q6, [x14, #0x40]\n"
+ "ldr q6, [x9, #0x40]\n"
".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
- "ldr q7, [x14, #0x50]\n"
+ "ldr q7, [x9, #0x50]\n"
".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n"
".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n"
".inst 0x6fa3e0d4 // udot v20.4s, v6.16b, v3.4b[1]\n"
- "ldr q6, [x14, #0x60]\n"
+ "ldr q6, [x9, #0x60]\n"
".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n"
".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n"
".inst 0x6fa3e0f5 // udot v21.4s, v7.16b, v3.4b[1]\n"
- "ldr q7, [x14, #0x70]\n"
+ "ldr q7, [x9, #0x70]\n"
".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n"
".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n"
".inst 0x6fa3e0d6 // udot v22.4s, v6.16b, v3.4b[1]\n"
- "ldr q6, [x14, #0x80]\n"
+ "ldr q6, [x9, #0x80]\n"
".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n"
".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n"
".inst 0x6fa3e0f7 // udot v23.4s, v7.16b, v3.4b[1]\n"
- "ldr q7, [x14, #0x90]\n"
+ "ldr q7, [x9, #0x90]\n"
".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n"
".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n"
".inst 0x6f83e8d4 // udot v20.4s, v6.16b, v3.4b[2]\n"
- "ldr q6, [x14, #0xa0]\n"
+ "ldr q6, [x9, #0xa0]\n"
".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n"
".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n"
".inst 0x6f83e8f5 // udot v21.4s, v7.16b, v3.4b[2]\n"
- "ldr q7, [x14, #0xb0]\n"
+ "ldr q7, [x9, #0xb0]\n"
".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n"
".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n"
".inst 0x6f83e8d6 // udot v22.4s, v6.16b, v3.4b[2]\n"
- "ldr q6, [x14, #0xc0]\n"
+ "ldr q6, [x9, #0xc0]\n"
".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n"
".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n"
".inst 0x6f83e8f7 // udot v23.4s, v7.16b, v3.4b[2]\n"
- "ldr q7, [x14, #0xd0]\n"
+ "ldr q7, [x9, #0xd0]\n"
".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n"
".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n"
".inst 0x6fa3e8d4 // udot v20.4s, v6.16b, v3.4b[3]\n"
- "ldr q6, [x14, #0xe0]\n"
+ "ldr q6, [x9, #0xe0]\n"
".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n"
".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n"
".inst 0x6fa3e8f5 // udot v21.4s, v7.16b, v3.4b[3]\n"
- "ldr q7, [x14, #0xf0]\n"
- "add x14, x14, #0x100\n"
+ "ldr q7, [x9, #0xf0]\n"
+ "add x9, x9, #0x100\n"
".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n"
".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n"
@@ -1621,31 +1584,31 @@ void a64_hybrid_u8u32_dot_6x16 (
".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n"
".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n"
".inst 0x6fa3e8f7 // udot v23.4s, v7.16b, v3.4b[3]\n"
- "125:" // Height 4: Multiply loop: Main loop skip
- "cbz x11, 130f\n"
- "cmp x11, #0x4\n"
- "blt 127f\n"
- "126:" // Height 4: Multiply loop: Odd block loop
- "ldr s0, [x10], #0x4\n"
- "ldr s1, [x28], #0x4\n"
- "ldr s2, [x26], #0x4\n"
- "ldr s3, [x24], #0x4\n"
- "ldr q6, [x14, #0x0]\n"
+ "121:" // Height 4: Multiply loop: Main loop skip
+ "cbz x26, 126f\n"
+ "cmp x26, #0x4\n"
+ "blt 123f\n"
+ "122:" // Height 4: Multiply loop: Odd block loop
+ "ldr s0, [x25], #0x4\n"
+ "sub x26, x26, #0x4\n"
+ "ldr s1, [x24], #0x4\n"
+ "cmp x26, #0x4\n"
+ "ldr s2, [x23], #0x4\n"
+ "ldr s3, [x22], #0x4\n"
+ "ldr q6, [x9, #0x0]\n"
".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x14, #0x10]\n"
+ "ldr q7, [x9, #0x10]\n"
".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
- "sub x11, x11, #0x4\n"
".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
- "cmp x11, #0x4\n"
".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
- "ldr q6, [x14, #0x20]\n"
+ "ldr q6, [x9, #0x20]\n"
".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
- "ldr q7, [x14, #0x30]\n"
+ "ldr q7, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- "add x14, x14, #0x40\n"
".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
@@ -1653,40 +1616,40 @@ void a64_hybrid_u8u32_dot_6x16 (
".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
- "bge 126b\n"
- "cbz x11, 130f\n"
- "127:" // Height 4: Multiply loop: Skip odd blocks
- "tbz x11, #1, 128f\n"
- "ldr h0, [x10], #0x2\n"
- "ldr h1, [x28], #0x2\n"
- "ldr h2, [x26], #0x2\n"
- "ldr h3, [x24], #0x2\n"
- "tbz x11, #0, 129f\n"
- "ld1 { v0.b }[2], [x10]\n"
- "ld1 { v1.b }[2], [x28]\n"
- "ld1 { v2.b }[2], [x26]\n"
- "ld1 { v3.b }[2], [x24]\n"
- "b 129f\n"
- "128:" // Height 4: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x10, #0x0]\n"
- "ldr b1, [x28, #0x0]\n"
- "ldr b2, [x26, #0x0]\n"
- "ldr b3, [x24, #0x0]\n"
- "129:" // Height 4: Multiply loop: Ragged operand read: Done
- "ldr q6, [x14, #0x0]\n"
+ "bge 122b\n"
+ "cbz x26, 126f\n"
+ "123:" // Height 4: Multiply loop: Skip odd blocks
+ "tbz x26, #1, 124f\n"
+ "ldr h0, [x25], #0x2\n"
+ "ldr h1, [x24], #0x2\n"
+ "ldr h2, [x23], #0x2\n"
+ "ldr h3, [x22], #0x2\n"
+ "tbz x26, #0, 125f\n"
+ "ld1 { v0.b }[2], [x25]\n"
+ "ld1 { v1.b }[2], [x24]\n"
+ "ld1 { v2.b }[2], [x23]\n"
+ "ld1 { v3.b }[2], [x22]\n"
+ "b 125f\n"
+ "124:" // Height 4: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x25, #0x0]\n"
+ "ldr b1, [x24, #0x0]\n"
+ "ldr b2, [x23, #0x0]\n"
+ "ldr b3, [x22, #0x0]\n"
+ "125:" // Height 4: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x9, #0x0]\n"
".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x14, #0x10]\n"
+ "ldr q7, [x9, #0x10]\n"
".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
- "ldr q6, [x14, #0x20]\n"
+ "ldr q6, [x9, #0x20]\n"
".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
- "ldr q7, [x14, #0x30]\n"
+ "ldr q7, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- "add x14, x14, #0x40\n"
".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
@@ -1694,308 +1657,292 @@ void a64_hybrid_u8u32_dot_6x16 (
".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
- "130:" // Height 4: Multiply loop: No odd multiplies
+ "126:" // Height 4: Multiply loop: No odd multiplies
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x12, x12, #0x1\n"
- "cmp x12, x19\n"
- "bne 120b\n"
- "prfm pstl1keep, [x13, #0x0]\n"
- "prfm pstl1keep, [x9, #0x0]\n"
- "cmp x15, #0x10\n"
- "prfm pstl1keep, [x27, #0x0]\n"
- "prfm pstl1keep, [x25, #0x0]\n"
- "bge 139f\n"
- "tbz x15, #3, 134f\n"
- "st1 { v8.4s }, [x13], #0x10\n"
- "st1 { v9.4s }, [x13], #0x10\n"
- "st1 { v12.4s }, [x9], #0x10\n"
- "st1 { v13.4s }, [x9], #0x10\n"
- "st1 { v16.4s }, [x27], #0x10\n"
- "st1 { v17.4s }, [x27], #0x10\n"
- "st1 { v20.4s }, [x25], #0x10\n"
- "st1 { v21.4s }, [x25], #0x10\n"
- "tbz x15, #2, 132f\n"
- "st1 { v10.4s }, [x13], #0x10\n"
- "st1 { v14.4s }, [x9], #0x10\n"
- "st1 { v18.4s }, [x27], #0x10\n"
- "st1 { v22.4s }, [x25], #0x10\n"
- "tbz x15, #1, 131f\n"
- "str d11, [x13], #0x8\n"
- "str d15, [x9], #0x8\n"
- "str d19, [x27], #0x8\n"
- "str d23, [x25], #0x8\n"
- "tbz x15, #0, 138f\n"
- "st1 { v11.s }[2], [x13]\n"
- "st1 { v15.s }[2], [x9]\n"
- "st1 { v19.s }[2], [x27]\n"
- "st1 { v23.s }[2], [x25]\n"
- "b 138f\n"
- "131:" // Height 4: Partial direct writeback: partial_1_12
- "tbz x15, #0, 138f\n"
- "str s11, [x13, #0x0]\n"
- "str s15, [x9, #0x0]\n"
- "str s19, [x27, #0x0]\n"
- "str s23, [x25, #0x0]\n"
- "b 138f\n"
- "132:" // Height 4: Partial direct writeback: partial_2_8
- "tbz x15, #1, 133f\n"
- "str d10, [x13], #0x8\n"
- "str d14, [x9], #0x8\n"
- "str d18, [x27], #0x8\n"
- "str d22, [x25], #0x8\n"
- "tbz x15, #0, 138f\n"
- "st1 { v10.s }[2], [x13]\n"
- "st1 { v14.s }[2], [x9]\n"
- "st1 { v18.s }[2], [x27]\n"
- "st1 { v22.s }[2], [x25]\n"
- "b 138f\n"
- "133:" // Height 4: Partial direct writeback: partial_1_8
- "tbz x15, #0, 138f\n"
- "str s10, [x13, #0x0]\n"
- "str s14, [x9, #0x0]\n"
- "str s18, [x27, #0x0]\n"
- "str s22, [x25, #0x0]\n"
- "b 138f\n"
- "134:" // Height 4: Partial direct writeback: partial_4_0
- "tbz x15, #2, 136f\n"
- "st1 { v8.4s }, [x13], #0x10\n"
- "st1 { v12.4s }, [x9], #0x10\n"
- "st1 { v16.4s }, [x27], #0x10\n"
- "st1 { v20.4s }, [x25], #0x10\n"
- "tbz x15, #1, 135f\n"
- "str d9, [x13], #0x8\n"
- "str d13, [x9], #0x8\n"
- "str d17, [x27], #0x8\n"
- "str d21, [x25], #0x8\n"
- "tbz x15, #0, 138f\n"
- "st1 { v9.s }[2], [x13]\n"
- "st1 { v13.s }[2], [x9]\n"
- "st1 { v17.s }[2], [x27]\n"
- "st1 { v21.s }[2], [x25]\n"
- "b 138f\n"
- "135:" // Height 4: Partial direct writeback: partial_1_4
- "tbz x15, #0, 138f\n"
- "str s9, [x13, #0x0]\n"
- "str s13, [x9, #0x0]\n"
- "str s17, [x27, #0x0]\n"
- "str s21, [x25, #0x0]\n"
- "b 138f\n"
- "136:" // Height 4: Partial direct writeback: partial_2_0
- "tbz x15, #1, 137f\n"
- "str d8, [x13], #0x8\n"
- "str d12, [x9], #0x8\n"
- "str d16, [x27], #0x8\n"
- "str d20, [x25], #0x8\n"
- "tbz x15, #0, 138f\n"
- "st1 { v8.s }[2], [x13]\n"
- "st1 { v12.s }[2], [x9]\n"
- "st1 { v16.s }[2], [x27]\n"
- "st1 { v20.s }[2], [x25]\n"
- "b 138f\n"
- "137:" // Height 4: Partial direct writeback: partial_1_0
- "str s8, [x13, #0x0]\n"
- "str s12, [x9, #0x0]\n"
- "str s16, [x27, #0x0]\n"
- "str s20, [x25, #0x0]\n"
- "138:" // Height 4: Partial direct writeback: Done
- "b 140f\n"
- "139:" // Height 4: Full writeback
- "str q8, [x13, #0x0]\n"
- "str q9, [x13, #0x10]\n"
- "str q10, [x13, #0x20]\n"
- "str q11, [x13, #0x30]\n"
- "str q12, [x9, #0x0]\n"
- "str q13, [x9, #0x10]\n"
- "str q14, [x9, #0x20]\n"
- "str q15, [x9, #0x30]\n"
- "str q16, [x27, #0x0]\n"
- "str q17, [x27, #0x10]\n"
- "str q18, [x27, #0x20]\n"
- "str q19, [x27, #0x30]\n"
- "str q20, [x25, #0x0]\n"
- "str q21, [x25, #0x10]\n"
- "str q22, [x25, #0x20]\n"
- "str q23, [x25, #0x30]\n"
- "add x13, x13, #0x40\n"
- "add x9, x9, #0x40\n"
- "add x27, x27, #0x40\n"
- "add x25, x25, #0x40\n"
- "140:" // Height 4: Writeback done
- "subs x15, x15, #0x10\n"
- "bgt 108b\n"
- "b 212f\n"
- "141:" // Height 5
- "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 116b\n"
"ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "tbz %x[flags], #2, 142f\n"
- "ldr x13, [%x[output_ptr], #0x0]\n"
- "add x13, x13, x19, LSL #2\n"
- "ldr x9, [%x[output_ptr], #0x8]\n"
- "ldr x27, [%x[output_ptr], #0x10]\n"
- "add x9, x9, x19, LSL #2\n"
- "ldr x25, [%x[output_ptr], #0x18]\n"
- "ldr x23, [%x[output_ptr], #0x20]\n"
- "add x27, x27, x19, LSL #2\n"
- "add x25, x25, x19, LSL #2\n"
- "add x23, x23, x19, LSL #2\n"
- "b 143f\n"
- "142:" // Height 5: setup direct output
- "mov x13, %x[output_ptr]\n"
- "add x9, x13, x19, LSL #2\n"
- "add x27, x9, x19, LSL #2\n"
- "add x25, x27, x19, LSL #2\n"
- "add x23, x25, x19, LSL #2\n"
- "143:" // Height 5: Column loop
- "tbz %x[flags], #0, 153f\n"
- "cmp x15, #0x10\n"
- "bge 152f\n"
- "tbz x15, #3, 147f\n"
- "ld1 { v8.4s }, [x13], #0x10\n"
- "ld1 { v12.4s }, [x9], #0x10\n"
- "ld1 { v16.4s }, [x27], #0x10\n"
- "ld1 { v20.4s }, [x25], #0x10\n"
- "ld1 { v24.4s }, [x23], #0x10\n"
- "ld1 { v9.4s }, [x13], #0x10\n"
- "ld1 { v13.4s }, [x9], #0x10\n"
- "ld1 { v17.4s }, [x27], #0x10\n"
- "ld1 { v21.4s }, [x25], #0x10\n"
- "ld1 { v25.4s }, [x23], #0x10\n"
- "tbz x15, #2, 145f\n"
- "ld1 { v10.4s }, [x13], #0x10\n"
- "ld1 { v14.4s }, [x9], #0x10\n"
- "ld1 { v18.4s }, [x27], #0x10\n"
- "ld1 { v22.4s }, [x25], #0x10\n"
- "ld1 { v26.4s }, [x23], #0x10\n"
- "tbz x15, #1, 144f\n"
- "mov x19, #0x38\n"
- "ldr d11, [x13], #0x8\n"
- "ldr d15, [x9], #0x8\n"
- "ldr d19, [x27], #0x8\n"
- "ldr d23, [x25], #0x8\n"
- "ldr d27, [x23], #0x8\n"
- "tbz x15, #0, 151f\n"
- "ld1 { v11.s }[2], [x13]\n"
- "ld1 { v15.s }[2], [x9]\n"
- "ld1 { v19.s }[2], [x27]\n"
- "ld1 { v23.s }[2], [x25]\n"
- "ld1 { v27.s }[2], [x23]\n"
- "b 151f\n"
- "144:" // Height 5: Partial accumulate: partial_1_12
- "mov x19, #0x30\n"
- "tbz x15, #0, 151f\n"
- "ldr s11, [x13, #0x0]\n"
- "ldr s15, [x9, #0x0]\n"
- "ldr s19, [x27, #0x0]\n"
- "ldr s23, [x25, #0x0]\n"
- "ldr s27, [x23, #0x0]\n"
- "b 151f\n"
- "145:" // Height 5: Partial accumulate: partial_2_8
- "tbz x15, #1, 146f\n"
- "ldr d10, [x13], #0x8\n"
- "ldr d14, [x9], #0x8\n"
- "ldr d18, [x27], #0x8\n"
- "ldr d22, [x25], #0x8\n"
- "ldr d26, [x23], #0x8\n"
- "mov x19, #0x28\n"
- "tbz x15, #0, 151f\n"
- "ld1 { v10.s }[2], [x13]\n"
- "ld1 { v14.s }[2], [x9]\n"
- "ld1 { v18.s }[2], [x27]\n"
- "ld1 { v22.s }[2], [x25]\n"
- "ld1 { v26.s }[2], [x23]\n"
- "b 151f\n"
- "146:" // Height 5: Partial accumulate: partial_1_8
- "mov x19, #0x20\n"
- "tbz x15, #0, 151f\n"
- "ldr s10, [x13, #0x0]\n"
- "ldr s14, [x9, #0x0]\n"
- "ldr s18, [x27, #0x0]\n"
- "ldr s22, [x25, #0x0]\n"
- "ldr s26, [x23, #0x0]\n"
- "b 151f\n"
- "147:" // Height 5: Partial accumulate: partial_4_0
- "tbz x15, #2, 149f\n"
- "ld1 { v8.4s }, [x13], #0x10\n"
- "ld1 { v12.4s }, [x9], #0x10\n"
- "ld1 { v16.4s }, [x27], #0x10\n"
- "ld1 { v20.4s }, [x25], #0x10\n"
- "ld1 { v24.4s }, [x23], #0x10\n"
- "tbz x15, #1, 148f\n"
- "mov x19, #0x18\n"
- "ldr d9, [x13], #0x8\n"
- "ldr d13, [x9], #0x8\n"
- "ldr d17, [x27], #0x8\n"
- "ldr d21, [x25], #0x8\n"
- "ldr d25, [x23], #0x8\n"
- "tbz x15, #0, 151f\n"
- "ld1 { v9.s }[2], [x13]\n"
- "ld1 { v13.s }[2], [x9]\n"
- "ld1 { v17.s }[2], [x27]\n"
- "ld1 { v21.s }[2], [x25]\n"
- "ld1 { v25.s }[2], [x23]\n"
- "b 151f\n"
- "148:" // Height 5: Partial accumulate: partial_1_4
- "mov x19, #0x10\n"
- "tbz x15, #0, 151f\n"
- "ldr s9, [x13, #0x0]\n"
- "ldr s13, [x9, #0x0]\n"
- "ldr s17, [x27, #0x0]\n"
- "ldr s21, [x25, #0x0]\n"
- "ldr s25, [x23, #0x0]\n"
- "b 151f\n"
- "149:" // Height 5: Partial accumulate: partial_2_0
- "tbz x15, #1, 150f\n"
- "ldr d8, [x13], #0x8\n"
- "ldr d12, [x9], #0x8\n"
- "ldr d16, [x27], #0x8\n"
- "ldr d20, [x25], #0x8\n"
- "ldr d24, [x23], #0x8\n"
- "mov x19, #0x8\n"
- "tbz x15, #0, 151f\n"
- "ld1 { v8.s }[2], [x13]\n"
- "ld1 { v12.s }[2], [x9]\n"
- "ld1 { v16.s }[2], [x27]\n"
- "ld1 { v20.s }[2], [x25]\n"
- "ld1 { v24.s }[2], [x23]\n"
- "b 151f\n"
- "150:" // Height 5: Partial accumulate: partial_1_0
- "mov x19, #0x0\n"
- "ldr s8, [x13, #0x0]\n"
- "ldr s12, [x9, #0x0]\n"
- "ldr s16, [x27, #0x0]\n"
- "ldr s20, [x25, #0x0]\n"
- "ldr s24, [x23, #0x0]\n"
- "151:" // Height 5: Partial accumulate: Done
- "sub x13, x13, x19\n"
- "sub x9, x9, x19\n"
- "sub x27, x27, x19\n"
- "sub x25, x25, x19\n"
- "sub x23, x23, x19\n"
- "b 154f\n"
- "152:" // Height 5: full accumulate
- "ldr q8, [x13, #0x0]\n"
- "ldr q9, [x13, #0x10]\n"
- "ldr q10, [x13, #0x20]\n"
- "ldr q11, [x13, #0x30]\n"
- "ldr q12, [x9, #0x0]\n"
- "ldr q13, [x9, #0x10]\n"
- "ldr q14, [x9, #0x20]\n"
- "ldr q15, [x9, #0x30]\n"
- "ldr q16, [x27, #0x0]\n"
- "ldr q17, [x27, #0x10]\n"
- "ldr q18, [x27, #0x20]\n"
- "ldr q19, [x27, #0x30]\n"
- "ldr q20, [x25, #0x0]\n"
- "ldr q21, [x25, #0x10]\n"
- "ldr q22, [x25, #0x20]\n"
- "ldr q23, [x25, #0x30]\n"
- "ldr q24, [x23, #0x0]\n"
- "ldr q25, [x23, #0x10]\n"
- "ldr q26, [x23, #0x20]\n"
- "ldr q27, [x23, #0x30]\n"
- "b 154f\n"
- "153:" // Height 5: no accumulate
+ "prfm pstl1keep, [x28, #0x0]\n"
+ "cmp x10, #0x10\n"
+ "add x23, x28, x19, LSL #2\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "add x22, x23, x19, LSL #2\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "add x21, x22, x19, LSL #2\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "bge 135f\n"
+ "tbz x10, #3, 130f\n"
+ "st1 { v8.4s }, [x28], #0x10\n"
+ "st1 { v9.4s }, [x28], #0x10\n"
+ "st1 { v12.4s }, [x23], #0x10\n"
+ "st1 { v13.4s }, [x23], #0x10\n"
+ "st1 { v16.4s }, [x22], #0x10\n"
+ "st1 { v17.4s }, [x22], #0x10\n"
+ "st1 { v20.4s }, [x21], #0x10\n"
+ "st1 { v21.4s }, [x21], #0x10\n"
+ "tbz x10, #2, 128f\n"
+ "st1 { v10.4s }, [x28], #0x10\n"
+ "st1 { v14.4s }, [x23], #0x10\n"
+ "st1 { v18.4s }, [x22], #0x10\n"
+ "st1 { v22.4s }, [x21], #0x10\n"
+ "tbz x10, #1, 127f\n"
+ "str d11, [x28], #0x8\n"
+ "str d15, [x23], #0x8\n"
+ "str d19, [x22], #0x8\n"
+ "str d23, [x21], #0x8\n"
+ "tbz x10, #0, 134f\n"
+ "st1 { v11.s }[2], [x28]\n"
+ "st1 { v15.s }[2], [x23]\n"
+ "st1 { v19.s }[2], [x22]\n"
+ "st1 { v23.s }[2], [x21]\n"
+ "b 134f\n"
+ "127:" // Height 4: Partial direct writeback: partial_1_12
+ "tbz x10, #0, 134f\n"
+ "str s11, [x28, #0x0]\n"
+ "str s15, [x23, #0x0]\n"
+ "str s19, [x22, #0x0]\n"
+ "str s23, [x21, #0x0]\n"
+ "b 134f\n"
+ "128:" // Height 4: Partial direct writeback: partial_2_8
+ "tbz x10, #1, 129f\n"
+ "str d10, [x28], #0x8\n"
+ "str d14, [x23], #0x8\n"
+ "str d18, [x22], #0x8\n"
+ "str d22, [x21], #0x8\n"
+ "tbz x10, #0, 134f\n"
+ "st1 { v10.s }[2], [x28]\n"
+ "st1 { v14.s }[2], [x23]\n"
+ "st1 { v18.s }[2], [x22]\n"
+ "st1 { v22.s }[2], [x21]\n"
+ "b 134f\n"
+ "129:" // Height 4: Partial direct writeback: partial_1_8
+ "tbz x10, #0, 134f\n"
+ "str s10, [x28, #0x0]\n"
+ "str s14, [x23, #0x0]\n"
+ "str s18, [x22, #0x0]\n"
+ "str s22, [x21, #0x0]\n"
+ "b 134f\n"
+ "130:" // Height 4: Partial direct writeback: partial_4_0
+ "tbz x10, #2, 132f\n"
+ "st1 { v8.4s }, [x28], #0x10\n"
+ "st1 { v12.4s }, [x23], #0x10\n"
+ "st1 { v16.4s }, [x22], #0x10\n"
+ "st1 { v20.4s }, [x21], #0x10\n"
+ "tbz x10, #1, 131f\n"
+ "str d9, [x28], #0x8\n"
+ "str d13, [x23], #0x8\n"
+ "str d17, [x22], #0x8\n"
+ "str d21, [x21], #0x8\n"
+ "tbz x10, #0, 134f\n"
+ "st1 { v9.s }[2], [x28]\n"
+ "st1 { v13.s }[2], [x23]\n"
+ "st1 { v17.s }[2], [x22]\n"
+ "st1 { v21.s }[2], [x21]\n"
+ "b 134f\n"
+ "131:" // Height 4: Partial direct writeback: partial_1_4
+ "tbz x10, #0, 134f\n"
+ "str s9, [x28, #0x0]\n"
+ "str s13, [x23, #0x0]\n"
+ "str s17, [x22, #0x0]\n"
+ "str s21, [x21, #0x0]\n"
+ "b 134f\n"
+ "132:" // Height 4: Partial direct writeback: partial_2_0
+ "tbz x10, #1, 133f\n"
+ "str d8, [x28], #0x8\n"
+ "str d12, [x23], #0x8\n"
+ "str d16, [x22], #0x8\n"
+ "str d20, [x21], #0x8\n"
+ "tbz x10, #0, 134f\n"
+ "st1 { v8.s }[2], [x28]\n"
+ "st1 { v12.s }[2], [x23]\n"
+ "st1 { v16.s }[2], [x22]\n"
+ "st1 { v20.s }[2], [x21]\n"
+ "b 134f\n"
+ "133:" // Height 4: Partial direct writeback: partial_1_0
+ "str s8, [x28, #0x0]\n"
+ "str s12, [x23, #0x0]\n"
+ "str s16, [x22, #0x0]\n"
+ "str s20, [x21, #0x0]\n"
+ "134:" // Height 4: Partial direct writeback: Done
+ "b 136f\n"
+ "135:" // Height 4: Full writeback
+ "str q8, [x28, #0x0]\n"
+ "str q9, [x28, #0x10]\n"
+ "str q10, [x28, #0x20]\n"
+ "str q11, [x28, #0x30]\n"
+ "add x28, x28, #0x40\n"
+ "str q12, [x23, #0x0]\n"
+ "str q13, [x23, #0x10]\n"
+ "str q14, [x23, #0x20]\n"
+ "str q15, [x23, #0x30]\n"
+ "str q16, [x22, #0x0]\n"
+ "str q17, [x22, #0x10]\n"
+ "str q18, [x22, #0x20]\n"
+ "str q19, [x22, #0x30]\n"
+ "str q20, [x21, #0x0]\n"
+ "str q21, [x21, #0x10]\n"
+ "str q22, [x21, #0x20]\n"
+ "str q23, [x21, #0x30]\n"
+ "136:" // Height 4: Writeback done
+ "subs x10, x10, #0x10\n"
+ "bgt 104b\n"
+ "b 206f\n"
+ "137:" // Height 5
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "138:" // Height 5: Column loop
+ "tbz %x[flags], #0, 148f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x10, #0x10\n"
+ "add x23, x28, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "add x21, x22, x19, LSL #2\n"
+ "add x20, x21, x19, LSL #2\n"
+ "bge 147f\n"
+ "tbz x10, #3, 142f\n"
+ "ld1 { v8.4s }, [x28], #0x10\n"
+ "ld1 { v12.4s }, [x23], #0x10\n"
+ "ld1 { v16.4s }, [x22], #0x10\n"
+ "ld1 { v20.4s }, [x21], #0x10\n"
+ "ld1 { v24.4s }, [x20], #0x10\n"
+ "ld1 { v9.4s }, [x28], #0x10\n"
+ "ld1 { v13.4s }, [x23], #0x10\n"
+ "ld1 { v17.4s }, [x22], #0x10\n"
+ "ld1 { v21.4s }, [x21], #0x10\n"
+ "ld1 { v25.4s }, [x20], #0x10\n"
+ "tbz x10, #2, 140f\n"
+ "ld1 { v10.4s }, [x28], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "ld1 { v18.4s }, [x22], #0x10\n"
+ "ld1 { v22.4s }, [x21], #0x10\n"
+ "ld1 { v26.4s }, [x20], #0x10\n"
+ "tbz x10, #1, 139f\n"
+ "mov x24, #0x38\n"
+ "ldr d11, [x28], #0x8\n"
+ "ldr d15, [x23], #0x8\n"
+ "ldr d19, [x22], #0x8\n"
+ "ldr d23, [x21], #0x8\n"
+ "ldr d27, [x20], #0x8\n"
+ "tbz x10, #0, 146f\n"
+ "ld1 { v11.s }[2], [x28]\n"
+ "ld1 { v15.s }[2], [x23]\n"
+ "ld1 { v19.s }[2], [x22]\n"
+ "ld1 { v23.s }[2], [x21]\n"
+ "ld1 { v27.s }[2], [x20]\n"
+ "b 146f\n"
+ "139:" // Height 5: Partial accumulate: partial_1_12
+ "mov x24, #0x30\n"
+ "tbz x10, #0, 146f\n"
+ "ldr s11, [x28, #0x0]\n"
+ "ldr s15, [x23, #0x0]\n"
+ "ldr s19, [x22, #0x0]\n"
+ "ldr s23, [x21, #0x0]\n"
+ "ldr s27, [x20, #0x0]\n"
+ "b 146f\n"
+ "140:" // Height 5: Partial accumulate: partial_2_8
+ "tbz x10, #1, 141f\n"
+ "ldr d10, [x28], #0x8\n"
+ "ldr d14, [x23], #0x8\n"
+ "mov x24, #0x28\n"
+ "ldr d18, [x22], #0x8\n"
+ "ldr d22, [x21], #0x8\n"
+ "ldr d26, [x20], #0x8\n"
+ "tbz x10, #0, 146f\n"
+ "ld1 { v10.s }[2], [x28]\n"
+ "ld1 { v14.s }[2], [x23]\n"
+ "ld1 { v18.s }[2], [x22]\n"
+ "ld1 { v22.s }[2], [x21]\n"
+ "ld1 { v26.s }[2], [x20]\n"
+ "b 146f\n"
+ "141:" // Height 5: Partial accumulate: partial_1_8
+ "mov x24, #0x20\n"
+ "tbz x10, #0, 146f\n"
+ "ldr s10, [x28, #0x0]\n"
+ "ldr s14, [x23, #0x0]\n"
+ "ldr s18, [x22, #0x0]\n"
+ "ldr s22, [x21, #0x0]\n"
+ "ldr s26, [x20, #0x0]\n"
+ "b 146f\n"
+ "142:" // Height 5: Partial accumulate: partial_4_0
+ "tbz x10, #2, 144f\n"
+ "ld1 { v8.4s }, [x28], #0x10\n"
+ "ld1 { v12.4s }, [x23], #0x10\n"
+ "ld1 { v16.4s }, [x22], #0x10\n"
+ "ld1 { v20.4s }, [x21], #0x10\n"
+ "ld1 { v24.4s }, [x20], #0x10\n"
+ "tbz x10, #1, 143f\n"
+ "mov x24, #0x18\n"
+ "ldr d9, [x28], #0x8\n"
+ "ldr d13, [x23], #0x8\n"
+ "ldr d17, [x22], #0x8\n"
+ "ldr d21, [x21], #0x8\n"
+ "ldr d25, [x20], #0x8\n"
+ "tbz x10, #0, 146f\n"
+ "ld1 { v9.s }[2], [x28]\n"
+ "ld1 { v13.s }[2], [x23]\n"
+ "ld1 { v17.s }[2], [x22]\n"
+ "ld1 { v21.s }[2], [x21]\n"
+ "ld1 { v25.s }[2], [x20]\n"
+ "b 146f\n"
+ "143:" // Height 5: Partial accumulate: partial_1_4
+ "mov x24, #0x10\n"
+ "tbz x10, #0, 146f\n"
+ "ldr s9, [x28, #0x0]\n"
+ "ldr s13, [x23, #0x0]\n"
+ "ldr s17, [x22, #0x0]\n"
+ "ldr s21, [x21, #0x0]\n"
+ "ldr s25, [x20, #0x0]\n"
+ "b 146f\n"
+ "144:" // Height 5: Partial accumulate: partial_2_0
+ "tbz x10, #1, 145f\n"
+ "ldr d8, [x28], #0x8\n"
+ "ldr d12, [x23], #0x8\n"
+ "mov x24, #0x8\n"
+ "ldr d16, [x22], #0x8\n"
+ "ldr d20, [x21], #0x8\n"
+ "ldr d24, [x20], #0x8\n"
+ "tbz x10, #0, 146f\n"
+ "ld1 { v8.s }[2], [x28]\n"
+ "ld1 { v12.s }[2], [x23]\n"
+ "ld1 { v16.s }[2], [x22]\n"
+ "ld1 { v20.s }[2], [x21]\n"
+ "ld1 { v24.s }[2], [x20]\n"
+ "b 146f\n"
+ "145:" // Height 5: Partial accumulate: partial_1_0
+ "ldr s8, [x28, #0x0]\n"
+ "mov x24, #0x0\n"
+ "ldr s12, [x23, #0x0]\n"
+ "ldr s16, [x22, #0x0]\n"
+ "ldr s20, [x21, #0x0]\n"
+ "ldr s24, [x20, #0x0]\n"
+ "146:" // Height 5: Partial accumulate: Done
+ "sub x28, x28, x24\n"
+ "b 149f\n"
+ "147:" // Height 5: full accumulate
+ "ldr q8, [x28, #0x0]\n"
+ "ldr q9, [x28, #0x10]\n"
+ "ldr q10, [x28, #0x20]\n"
+ "ldr q11, [x28, #0x30]\n"
+ "ldr q12, [x23, #0x0]\n"
+ "ldr q13, [x23, #0x10]\n"
+ "ldr q14, [x23, #0x20]\n"
+ "ldr q15, [x23, #0x30]\n"
+ "ldr q16, [x22, #0x0]\n"
+ "ldr q17, [x22, #0x10]\n"
+ "ldr q18, [x22, #0x20]\n"
+ "ldr q19, [x22, #0x30]\n"
+ "ldr q20, [x21, #0x0]\n"
+ "ldr q21, [x21, #0x10]\n"
+ "ldr q22, [x21, #0x20]\n"
+ "ldr q23, [x21, #0x30]\n"
+ "ldr q24, [x20, #0x0]\n"
+ "ldr q25, [x20, #0x10]\n"
+ "ldr q26, [x20, #0x20]\n"
+ "ldr q27, [x20, #0x30]\n"
+ "b 149f\n"
+ "148:" // Height 5: no accumulate
"movi v8.4s, #0x0\n"
"movi v9.4s, #0x0\n"
"movi v10.4s, #0x0\n"
@@ -2016,260 +1963,260 @@ void a64_hybrid_u8u32_dot_6x16 (
"movi v25.4s, #0x0\n"
"movi v26.4s, #0x0\n"
"movi v27.4s, #0x0\n"
- "154:" // Height 5: setup done
- "mov x12, #0x0\n"
- "155:" // Height 5: String loop
+ "149:" // Height 5: setup done
+ "mov x27, #0x0\n"
+ "150:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w11, [x20, x12, LSL #0x2]\n"
- "tbz %x[flags], #3, 156f\n"
- "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 151f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x10, [x20, #0x0]\n"
- "ldr x28, [x20, #0x8]\n"
- "ldr x26, [x20, #0x10]\n"
- "ldr x24, [x20, #0x18]\n"
- "ldr x22, [x20, #0x20]\n"
- "cbnz x12, 157f\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "ldr x22, [x20, #0x18]\n"
+ "ldr x21, [x20, #0x20]\n"
+ "cbnz x27, 152f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x10, x10, x19\n"
- "add x28, x28, x19\n"
- "add x26, x26, x19\n"
+ "add x25, x25, x19\n"
"add x24, x24, x19\n"
+ "add x23, x23, x19\n"
"add x22, x22, x19\n"
- "b 157f\n"
- "156:" // Height 5: setup direct input
- "mov x10, %x[input_ptr]\n"
- "add x28, x10, x19\n"
- "add x26, x28, x19\n"
- "add x24, x26, x19\n"
- "add x22, x24, x19\n"
- "157:" // Height 5: input setup done
- "cmp x11, #0x10\n"
- "blt 160f\n"
- "cmp x11, #0x20\n"
- "blt 159f\n"
- "158:" // Height 5: Multiply loop: Main loop head
- "ldr q0, [x10, #0x0]\n"
- "ldr q1, [x28, #0x0]\n"
- "ldr q2, [x26, #0x0]\n"
- "ldr q3, [x24, #0x0]\n"
- "ldr q4, [x22, #0x0]\n"
- "ldr q6, [x14, #0x0]\n"
+ "add x21, x21, x19\n"
+ "b 152f\n"
+ "151:" // Height 5: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19\n"
+ "add x23, x24, x19\n"
+ "add x22, x23, x19\n"
+ "add x21, x22, x19\n"
+ "152:" // Height 5: input setup done
+ "cmp x26, #0x10\n"
+ "blt 155f\n"
+ "ldr q0, [x25, #0x0]\n"
+ "ldr q1, [x24, #0x0]\n"
+ "cmp x26, #0x20\n"
+ "ldr q2, [x23, #0x0]\n"
+ "ldr q3, [x22, #0x0]\n"
+ "ldr q4, [x21, #0x0]\n"
+ "ldr q6, [x9, #0x0]\n"
+ "blt 154f\n"
+ "153:" // Height 5: Multiply loop: Main loop head
".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x14, #0x10]\n"
+ "ldr q7, [x9, #0x10]\n"
+ "add x25, x25, #0x10\n"
".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
- "add x10, x10, #0x10\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x24, x24, #0x10\n"
".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
- "add x28, x28, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "add x23, x23, #0x10\n"
".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "add x26, x26, #0x10\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "add x22, x22, #0x10\n"
".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "ldr q6, [x14, #0x20]\n"
+ "ldr q6, [x9, #0x20]\n"
+ "add x21, x21, #0x10\n"
".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- "add x24, x24, #0x10\n"
- "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "sub x26, x26, #0x10\n"
".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
- "add x22, x22, #0x10\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
+ "cmp x26, #0x20\n"
".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "sub x11, x11, #0x10\n"
".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
- "cmp x11, #0x20\n"
".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n"
- "ldr q7, [x14, #0x30]\n"
+ "ldr q7, [x9, #0x30]\n"
".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n"
- "ldr q6, [x14, #0x40]\n"
+ "ldr q6, [x9, #0x40]\n"
".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n"
- "ldr q7, [x14, #0x50]\n"
+ "ldr q7, [x9, #0x50]\n"
".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n"
".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n"
".inst 0x6fa3e0d4 // udot v20.4s, v6.16b, v3.4b[1]\n"
".inst 0x6fa4e0d8 // udot v24.4s, v6.16b, v4.4b[1]\n"
- "ldr q6, [x14, #0x60]\n"
+ "ldr q6, [x9, #0x60]\n"
".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n"
".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n"
".inst 0x6fa3e0f5 // udot v21.4s, v7.16b, v3.4b[1]\n"
".inst 0x6fa4e0f9 // udot v25.4s, v7.16b, v4.4b[1]\n"
- "ldr q7, [x14, #0x70]\n"
+ "ldr q7, [x9, #0x70]\n"
".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n"
".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n"
".inst 0x6fa3e0d6 // udot v22.4s, v6.16b, v3.4b[1]\n"
".inst 0x6fa4e0da // udot v26.4s, v6.16b, v4.4b[1]\n"
- "ldr q6, [x14, #0x80]\n"
+ "ldr q6, [x9, #0x80]\n"
".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n"
".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n"
".inst 0x6fa3e0f7 // udot v23.4s, v7.16b, v3.4b[1]\n"
".inst 0x6fa4e0fb // udot v27.4s, v7.16b, v4.4b[1]\n"
- "ldr q7, [x14, #0x90]\n"
+ "ldr q7, [x9, #0x90]\n"
".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n"
".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n"
".inst 0x6f83e8d4 // udot v20.4s, v6.16b, v3.4b[2]\n"
".inst 0x6f84e8d8 // udot v24.4s, v6.16b, v4.4b[2]\n"
- "ldr q6, [x14, #0xa0]\n"
+ "ldr q6, [x9, #0xa0]\n"
".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n"
".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n"
".inst 0x6f83e8f5 // udot v21.4s, v7.16b, v3.4b[2]\n"
".inst 0x6f84e8f9 // udot v25.4s, v7.16b, v4.4b[2]\n"
- "ldr q7, [x14, #0xb0]\n"
+ "ldr q7, [x9, #0xb0]\n"
".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n"
".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n"
".inst 0x6f83e8d6 // udot v22.4s, v6.16b, v3.4b[2]\n"
".inst 0x6f84e8da // udot v26.4s, v6.16b, v4.4b[2]\n"
- "ldr q6, [x14, #0xc0]\n"
+ "ldr q6, [x9, #0xc0]\n"
".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n"
".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n"
".inst 0x6f83e8f7 // udot v23.4s, v7.16b, v3.4b[2]\n"
".inst 0x6f84e8fb // udot v27.4s, v7.16b, v4.4b[2]\n"
- "ldr q7, [x14, #0xd0]\n"
+ "ldr q7, [x9, #0xd0]\n"
".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n"
".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n"
".inst 0x6fa3e8d4 // udot v20.4s, v6.16b, v3.4b[3]\n"
".inst 0x6fa4e8d8 // udot v24.4s, v6.16b, v4.4b[3]\n"
- "ldr q6, [x14, #0xe0]\n"
+ "ldr q6, [x9, #0xe0]\n"
".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n"
".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n"
".inst 0x6fa3e8f5 // udot v21.4s, v7.16b, v3.4b[3]\n"
".inst 0x6fa4e8f9 // udot v25.4s, v7.16b, v4.4b[3]\n"
- "ldr q7, [x14, #0xf0]\n"
+ "ldr q7, [x9, #0xf0]\n"
+ "add x9, x9, #0x100\n"
".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
- "add x14, x14, #0x100\n"
".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n"
".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n"
".inst 0x6fa3e8d6 // udot v22.4s, v6.16b, v3.4b[3]\n"
".inst 0x6fa4e8da // udot v26.4s, v6.16b, v4.4b[3]\n"
+ "ldr q6, [x9, #0x0]\n"
".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
+ "ldr q0, [x25, #0x0]\n"
".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n"
+ "ldr q1, [x24, #0x0]\n"
".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n"
+ "ldr q2, [x23, #0x0]\n"
".inst 0x6fa3e8f7 // udot v23.4s, v7.16b, v3.4b[3]\n"
+ "ldr q3, [x22, #0x0]\n"
".inst 0x6fa4e8fb // udot v27.4s, v7.16b, v4.4b[3]\n"
- "bge 158b\n"
- "159:" // Height 5: Multiply loop: Single iteration only
- "sub x11, x11, #0x10\n"
- "ldr q0, [x10, #0x0]\n"
- "ldr q1, [x28, #0x0]\n"
- "ldr q2, [x26, #0x0]\n"
- "ldr q3, [x24, #0x0]\n"
- "ldr q4, [x22, #0x0]\n"
- "ldr q6, [x14, #0x0]\n"
+ "ldr q4, [x21, #0x0]\n"
+ "bge 153b\n"
+ "154:" // Height 5: Multiply loop: Single iteration only
".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x14, #0x10]\n"
+ "ldr q7, [x9, #0x10]\n"
+ "sub x26, x26, #0x10\n"
".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
- "add x10, x10, #0x10\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "add x25, x25, #0x10\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
- "add x28, x28, #0x10\n"
+ "add x24, x24, #0x10\n"
".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "add x26, x26, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "add x23, x23, #0x10\n"
".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "ldr q6, [x14, #0x20]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "add x22, x22, #0x10\n"
".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- "add x24, x24, #0x10\n"
- "prfm pldl1keep, [x24, #0x80]\n"
+ "ldr q6, [x9, #0x20]\n"
+ "add x21, x21, #0x10\n"
".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
- "add x22, x22, #0x10\n"
- ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
"prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n"
- "ldr q7, [x14, #0x30]\n"
+ "ldr q7, [x9, #0x30]\n"
".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n"
- "ldr q6, [x14, #0x40]\n"
+ "ldr q6, [x9, #0x40]\n"
".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n"
- "ldr q7, [x14, #0x50]\n"
+ "ldr q7, [x9, #0x50]\n"
".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n"
".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n"
".inst 0x6fa3e0d4 // udot v20.4s, v6.16b, v3.4b[1]\n"
".inst 0x6fa4e0d8 // udot v24.4s, v6.16b, v4.4b[1]\n"
- "ldr q6, [x14, #0x60]\n"
+ "ldr q6, [x9, #0x60]\n"
".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n"
".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n"
".inst 0x6fa3e0f5 // udot v21.4s, v7.16b, v3.4b[1]\n"
".inst 0x6fa4e0f9 // udot v25.4s, v7.16b, v4.4b[1]\n"
- "ldr q7, [x14, #0x70]\n"
+ "ldr q7, [x9, #0x70]\n"
".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n"
".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n"
".inst 0x6fa3e0d6 // udot v22.4s, v6.16b, v3.4b[1]\n"
".inst 0x6fa4e0da // udot v26.4s, v6.16b, v4.4b[1]\n"
- "ldr q6, [x14, #0x80]\n"
+ "ldr q6, [x9, #0x80]\n"
".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n"
".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n"
".inst 0x6fa3e0f7 // udot v23.4s, v7.16b, v3.4b[1]\n"
".inst 0x6fa4e0fb // udot v27.4s, v7.16b, v4.4b[1]\n"
- "ldr q7, [x14, #0x90]\n"
+ "ldr q7, [x9, #0x90]\n"
".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n"
".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n"
".inst 0x6f83e8d4 // udot v20.4s, v6.16b, v3.4b[2]\n"
".inst 0x6f84e8d8 // udot v24.4s, v6.16b, v4.4b[2]\n"
- "ldr q6, [x14, #0xa0]\n"
+ "ldr q6, [x9, #0xa0]\n"
".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n"
".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n"
".inst 0x6f83e8f5 // udot v21.4s, v7.16b, v3.4b[2]\n"
".inst 0x6f84e8f9 // udot v25.4s, v7.16b, v4.4b[2]\n"
- "ldr q7, [x14, #0xb0]\n"
+ "ldr q7, [x9, #0xb0]\n"
".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n"
".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n"
".inst 0x6f83e8d6 // udot v22.4s, v6.16b, v3.4b[2]\n"
".inst 0x6f84e8da // udot v26.4s, v6.16b, v4.4b[2]\n"
- "ldr q6, [x14, #0xc0]\n"
+ "ldr q6, [x9, #0xc0]\n"
".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n"
".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n"
".inst 0x6f83e8f7 // udot v23.4s, v7.16b, v3.4b[2]\n"
".inst 0x6f84e8fb // udot v27.4s, v7.16b, v4.4b[2]\n"
- "ldr q7, [x14, #0xd0]\n"
+ "ldr q7, [x9, #0xd0]\n"
".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n"
".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n"
".inst 0x6fa3e8d4 // udot v20.4s, v6.16b, v3.4b[3]\n"
".inst 0x6fa4e8d8 // udot v24.4s, v6.16b, v4.4b[3]\n"
- "ldr q6, [x14, #0xe0]\n"
+ "ldr q6, [x9, #0xe0]\n"
".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n"
".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n"
".inst 0x6fa3e8f5 // udot v21.4s, v7.16b, v3.4b[3]\n"
".inst 0x6fa4e8f9 // udot v25.4s, v7.16b, v4.4b[3]\n"
- "ldr q7, [x14, #0xf0]\n"
+ "ldr q7, [x9, #0xf0]\n"
+ "add x9, x9, #0x100\n"
".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
- "add x14, x14, #0x100\n"
".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n"
".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n"
".inst 0x6fa3e8d6 // udot v22.4s, v6.16b, v3.4b[3]\n"
@@ -2279,34 +2226,34 @@ void a64_hybrid_u8u32_dot_6x16 (
".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n"
".inst 0x6fa3e8f7 // udot v23.4s, v7.16b, v3.4b[3]\n"
".inst 0x6fa4e8fb // udot v27.4s, v7.16b, v4.4b[3]\n"
- "160:" // Height 5: Multiply loop: Main loop skip
- "cbz x11, 165f\n"
- "cmp x11, #0x4\n"
- "blt 162f\n"
- "161:" // Height 5: Multiply loop: Odd block loop
- "ldr s0, [x10], #0x4\n"
- "ldr s1, [x28], #0x4\n"
- "ldr s2, [x26], #0x4\n"
- "ldr s3, [x24], #0x4\n"
- "ldr s4, [x22], #0x4\n"
- "ldr q6, [x14, #0x0]\n"
+ "155:" // Height 5: Multiply loop: Main loop skip
+ "cbz x26, 160f\n"
+ "cmp x26, #0x4\n"
+ "blt 157f\n"
+ "156:" // Height 5: Multiply loop: Odd block loop
+ "ldr s0, [x25], #0x4\n"
+ "sub x26, x26, #0x4\n"
+ "ldr s1, [x24], #0x4\n"
+ "cmp x26, #0x4\n"
+ "ldr s2, [x23], #0x4\n"
+ "ldr s3, [x22], #0x4\n"
+ "ldr s4, [x21], #0x4\n"
+ "ldr q6, [x9, #0x0]\n"
".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x14, #0x10]\n"
+ "ldr q7, [x9, #0x10]\n"
".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
- "sub x11, x11, #0x4\n"
".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
- "cmp x11, #0x4\n"
".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n"
- "ldr q6, [x14, #0x20]\n"
+ "ldr q6, [x9, #0x20]\n"
".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n"
- "ldr q7, [x14, #0x30]\n"
+ "ldr q7, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- "add x14, x14, #0x40\n"
".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
@@ -2316,45 +2263,45 @@ void a64_hybrid_u8u32_dot_6x16 (
".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n"
- "bge 161b\n"
- "cbz x11, 165f\n"
- "162:" // Height 5: Multiply loop: Skip odd blocks
- "tbz x11, #1, 163f\n"
- "ldr h0, [x10], #0x2\n"
- "ldr h1, [x28], #0x2\n"
- "ldr h2, [x26], #0x2\n"
- "ldr h3, [x24], #0x2\n"
- "ldr h4, [x22], #0x2\n"
- "tbz x11, #0, 164f\n"
- "ld1 { v0.b }[2], [x10]\n"
- "ld1 { v1.b }[2], [x28]\n"
- "ld1 { v2.b }[2], [x26]\n"
- "ld1 { v3.b }[2], [x24]\n"
- "ld1 { v4.b }[2], [x22]\n"
- "b 164f\n"
- "163:" // Height 5: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x10, #0x0]\n"
- "ldr b1, [x28, #0x0]\n"
- "ldr b2, [x26, #0x0]\n"
- "ldr b3, [x24, #0x0]\n"
- "ldr b4, [x22, #0x0]\n"
- "164:" // Height 5: Multiply loop: Ragged operand read: Done
- "ldr q6, [x14, #0x0]\n"
+ "bge 156b\n"
+ "cbz x26, 160f\n"
+ "157:" // Height 5: Multiply loop: Skip odd blocks
+ "tbz x26, #1, 158f\n"
+ "ldr h0, [x25], #0x2\n"
+ "ldr h1, [x24], #0x2\n"
+ "ldr h2, [x23], #0x2\n"
+ "ldr h3, [x22], #0x2\n"
+ "ldr h4, [x21], #0x2\n"
+ "tbz x26, #0, 159f\n"
+ "ld1 { v0.b }[2], [x25]\n"
+ "ld1 { v1.b }[2], [x24]\n"
+ "ld1 { v2.b }[2], [x23]\n"
+ "ld1 { v3.b }[2], [x22]\n"
+ "ld1 { v4.b }[2], [x21]\n"
+ "b 159f\n"
+ "158:" // Height 5: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x25, #0x0]\n"
+ "ldr b1, [x24, #0x0]\n"
+ "ldr b2, [x23, #0x0]\n"
+ "ldr b3, [x22, #0x0]\n"
+ "ldr b4, [x21, #0x0]\n"
+ "159:" // Height 5: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x9, #0x0]\n"
".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x14, #0x10]\n"
+ "ldr q7, [x9, #0x10]\n"
".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n"
- "ldr q6, [x14, #0x20]\n"
+ "ldr q6, [x9, #0x20]\n"
".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n"
- "ldr q7, [x14, #0x30]\n"
+ "ldr q7, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- "add x14, x14, #0x40\n"
".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
@@ -2364,356 +2311,338 @@ void a64_hybrid_u8u32_dot_6x16 (
".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n"
- "165:" // Height 5: Multiply loop: No odd multiplies
+ "160:" // Height 5: Multiply loop: No odd multiplies
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x12, x12, #0x1\n"
- "cmp x12, x19\n"
- "bne 155b\n"
- "prfm pstl1keep, [x13, #0x0]\n"
- "prfm pstl1keep, [x9, #0x0]\n"
- "cmp x15, #0x10\n"
- "prfm pstl1keep, [x27, #0x0]\n"
- "prfm pstl1keep, [x25, #0x0]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 150b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "prfm pstl1keep, [x28, #0x0]\n"
+ "cmp x10, #0x10\n"
+ "add x23, x28, x19, LSL #2\n"
"prfm pstl1keep, [x23, #0x0]\n"
- "bge 174f\n"
- "tbz x15, #3, 169f\n"
- "st1 { v8.4s }, [x13], #0x10\n"
- "st1 { v9.4s }, [x13], #0x10\n"
- "st1 { v12.4s }, [x9], #0x10\n"
- "st1 { v13.4s }, [x9], #0x10\n"
- "st1 { v16.4s }, [x27], #0x10\n"
- "st1 { v17.4s }, [x27], #0x10\n"
- "st1 { v20.4s }, [x25], #0x10\n"
- "st1 { v21.4s }, [x25], #0x10\n"
- "st1 { v24.4s }, [x23], #0x10\n"
- "st1 { v25.4s }, [x23], #0x10\n"
- "tbz x15, #2, 167f\n"
- "st1 { v10.4s }, [x13], #0x10\n"
- "st1 { v14.4s }, [x9], #0x10\n"
- "st1 { v18.4s }, [x27], #0x10\n"
- "st1 { v22.4s }, [x25], #0x10\n"
- "st1 { v26.4s }, [x23], #0x10\n"
- "tbz x15, #1, 166f\n"
- "str d11, [x13], #0x8\n"
- "str d15, [x9], #0x8\n"
- "str d19, [x27], #0x8\n"
- "str d23, [x25], #0x8\n"
- "str d27, [x23], #0x8\n"
- "tbz x15, #0, 173f\n"
- "st1 { v11.s }[2], [x13]\n"
- "st1 { v15.s }[2], [x9]\n"
- "st1 { v19.s }[2], [x27]\n"
- "st1 { v23.s }[2], [x25]\n"
- "st1 { v27.s }[2], [x23]\n"
- "b 173f\n"
- "166:" // Height 5: Partial direct writeback: partial_1_12
- "tbz x15, #0, 173f\n"
- "str s11, [x13, #0x0]\n"
- "str s15, [x9, #0x0]\n"
- "str s19, [x27, #0x0]\n"
- "str s23, [x25, #0x0]\n"
- "str s27, [x23, #0x0]\n"
- "b 173f\n"
- "167:" // Height 5: Partial direct writeback: partial_2_8
- "tbz x15, #1, 168f\n"
- "str d10, [x13], #0x8\n"
- "str d14, [x9], #0x8\n"
- "str d18, [x27], #0x8\n"
- "str d22, [x25], #0x8\n"
- "str d26, [x23], #0x8\n"
- "tbz x15, #0, 173f\n"
- "st1 { v10.s }[2], [x13]\n"
- "st1 { v14.s }[2], [x9]\n"
- "st1 { v18.s }[2], [x27]\n"
- "st1 { v22.s }[2], [x25]\n"
- "st1 { v26.s }[2], [x23]\n"
- "b 173f\n"
- "168:" // Height 5: Partial direct writeback: partial_1_8
- "tbz x15, #0, 173f\n"
- "str s10, [x13, #0x0]\n"
- "str s14, [x9, #0x0]\n"
- "str s18, [x27, #0x0]\n"
- "str s22, [x25, #0x0]\n"
- "str s26, [x23, #0x0]\n"
- "b 173f\n"
- "169:" // Height 5: Partial direct writeback: partial_4_0
- "tbz x15, #2, 171f\n"
- "st1 { v8.4s }, [x13], #0x10\n"
- "st1 { v12.4s }, [x9], #0x10\n"
- "st1 { v16.4s }, [x27], #0x10\n"
- "st1 { v20.4s }, [x25], #0x10\n"
- "st1 { v24.4s }, [x23], #0x10\n"
- "tbz x15, #1, 170f\n"
- "str d9, [x13], #0x8\n"
- "str d13, [x9], #0x8\n"
- "str d17, [x27], #0x8\n"
- "str d21, [x25], #0x8\n"
- "str d25, [x23], #0x8\n"
- "tbz x15, #0, 173f\n"
- "st1 { v9.s }[2], [x13]\n"
- "st1 { v13.s }[2], [x9]\n"
- "st1 { v17.s }[2], [x27]\n"
- "st1 { v21.s }[2], [x25]\n"
- "st1 { v25.s }[2], [x23]\n"
- "b 173f\n"
- "170:" // Height 5: Partial direct writeback: partial_1_4
- "tbz x15, #0, 173f\n"
- "str s9, [x13, #0x0]\n"
- "str s13, [x9, #0x0]\n"
- "str s17, [x27, #0x0]\n"
- "str s21, [x25, #0x0]\n"
- "str s25, [x23, #0x0]\n"
- "b 173f\n"
- "171:" // Height 5: Partial direct writeback: partial_2_0
- "tbz x15, #1, 172f\n"
- "str d8, [x13], #0x8\n"
- "str d12, [x9], #0x8\n"
- "str d16, [x27], #0x8\n"
- "str d20, [x25], #0x8\n"
- "str d24, [x23], #0x8\n"
- "tbz x15, #0, 173f\n"
- "st1 { v8.s }[2], [x13]\n"
- "st1 { v12.s }[2], [x9]\n"
- "st1 { v16.s }[2], [x27]\n"
- "st1 { v20.s }[2], [x25]\n"
- "st1 { v24.s }[2], [x23]\n"
- "b 173f\n"
- "172:" // Height 5: Partial direct writeback: partial_1_0
- "str s8, [x13, #0x0]\n"
- "str s12, [x9, #0x0]\n"
- "str s16, [x27, #0x0]\n"
- "str s20, [x25, #0x0]\n"
- "str s24, [x23, #0x0]\n"
- "173:" // Height 5: Partial direct writeback: Done
- "b 175f\n"
- "174:" // Height 5: Full writeback
- "str q8, [x13, #0x0]\n"
- "str q9, [x13, #0x10]\n"
- "str q10, [x13, #0x20]\n"
- "str q11, [x13, #0x30]\n"
- "str q12, [x9, #0x0]\n"
- "str q13, [x9, #0x10]\n"
- "str q14, [x9, #0x20]\n"
- "str q15, [x9, #0x30]\n"
- "str q16, [x27, #0x0]\n"
- "str q17, [x27, #0x10]\n"
- "str q18, [x27, #0x20]\n"
- "str q19, [x27, #0x30]\n"
- "str q20, [x25, #0x0]\n"
- "str q21, [x25, #0x10]\n"
- "str q22, [x25, #0x20]\n"
- "str q23, [x25, #0x30]\n"
- "str q24, [x23, #0x0]\n"
- "str q25, [x23, #0x10]\n"
- "str q26, [x23, #0x20]\n"
- "str q27, [x23, #0x30]\n"
- "add x13, x13, #0x40\n"
- "add x9, x9, #0x40\n"
- "add x27, x27, #0x40\n"
- "add x25, x25, #0x40\n"
- "add x23, x23, #0x40\n"
- "175:" // Height 5: Writeback done
- "subs x15, x15, #0x10\n"
- "bgt 143b\n"
- "b 212f\n"
- "176:" // Height 6
- "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "add x22, x23, x19, LSL #2\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "add x21, x22, x19, LSL #2\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "add x20, x21, x19, LSL #2\n"
+ "prfm pstl1keep, [x20, #0x0]\n"
+ "bge 169f\n"
+ "tbz x10, #3, 164f\n"
+ "st1 { v8.4s }, [x28], #0x10\n"
+ "st1 { v9.4s }, [x28], #0x10\n"
+ "st1 { v12.4s }, [x23], #0x10\n"
+ "st1 { v13.4s }, [x23], #0x10\n"
+ "st1 { v16.4s }, [x22], #0x10\n"
+ "st1 { v17.4s }, [x22], #0x10\n"
+ "st1 { v20.4s }, [x21], #0x10\n"
+ "st1 { v21.4s }, [x21], #0x10\n"
+ "st1 { v24.4s }, [x20], #0x10\n"
+ "st1 { v25.4s }, [x20], #0x10\n"
+ "tbz x10, #2, 162f\n"
+ "st1 { v10.4s }, [x28], #0x10\n"
+ "st1 { v14.4s }, [x23], #0x10\n"
+ "st1 { v18.4s }, [x22], #0x10\n"
+ "st1 { v22.4s }, [x21], #0x10\n"
+ "st1 { v26.4s }, [x20], #0x10\n"
+ "tbz x10, #1, 161f\n"
+ "str d11, [x28], #0x8\n"
+ "str d15, [x23], #0x8\n"
+ "str d19, [x22], #0x8\n"
+ "str d23, [x21], #0x8\n"
+ "str d27, [x20], #0x8\n"
+ "tbz x10, #0, 168f\n"
+ "st1 { v11.s }[2], [x28]\n"
+ "st1 { v15.s }[2], [x23]\n"
+ "st1 { v19.s }[2], [x22]\n"
+ "st1 { v23.s }[2], [x21]\n"
+ "st1 { v27.s }[2], [x20]\n"
+ "b 168f\n"
+ "161:" // Height 5: Partial direct writeback: partial_1_12
+ "tbz x10, #0, 168f\n"
+ "str s11, [x28, #0x0]\n"
+ "str s15, [x23, #0x0]\n"
+ "str s19, [x22, #0x0]\n"
+ "str s23, [x21, #0x0]\n"
+ "str s27, [x20, #0x0]\n"
+ "b 168f\n"
+ "162:" // Height 5: Partial direct writeback: partial_2_8
+ "tbz x10, #1, 163f\n"
+ "str d10, [x28], #0x8\n"
+ "str d14, [x23], #0x8\n"
+ "str d18, [x22], #0x8\n"
+ "str d22, [x21], #0x8\n"
+ "str d26, [x20], #0x8\n"
+ "tbz x10, #0, 168f\n"
+ "st1 { v10.s }[2], [x28]\n"
+ "st1 { v14.s }[2], [x23]\n"
+ "st1 { v18.s }[2], [x22]\n"
+ "st1 { v22.s }[2], [x21]\n"
+ "st1 { v26.s }[2], [x20]\n"
+ "b 168f\n"
+ "163:" // Height 5: Partial direct writeback: partial_1_8
+ "tbz x10, #0, 168f\n"
+ "str s10, [x28, #0x0]\n"
+ "str s14, [x23, #0x0]\n"
+ "str s18, [x22, #0x0]\n"
+ "str s22, [x21, #0x0]\n"
+ "str s26, [x20, #0x0]\n"
+ "b 168f\n"
+ "164:" // Height 5: Partial direct writeback: partial_4_0
+ "tbz x10, #2, 166f\n"
+ "st1 { v8.4s }, [x28], #0x10\n"
+ "st1 { v12.4s }, [x23], #0x10\n"
+ "st1 { v16.4s }, [x22], #0x10\n"
+ "st1 { v20.4s }, [x21], #0x10\n"
+ "st1 { v24.4s }, [x20], #0x10\n"
+ "tbz x10, #1, 165f\n"
+ "str d9, [x28], #0x8\n"
+ "str d13, [x23], #0x8\n"
+ "str d17, [x22], #0x8\n"
+ "str d21, [x21], #0x8\n"
+ "str d25, [x20], #0x8\n"
+ "tbz x10, #0, 168f\n"
+ "st1 { v9.s }[2], [x28]\n"
+ "st1 { v13.s }[2], [x23]\n"
+ "st1 { v17.s }[2], [x22]\n"
+ "st1 { v21.s }[2], [x21]\n"
+ "st1 { v25.s }[2], [x20]\n"
+ "b 168f\n"
+ "165:" // Height 5: Partial direct writeback: partial_1_4
+ "tbz x10, #0, 168f\n"
+ "str s9, [x28, #0x0]\n"
+ "str s13, [x23, #0x0]\n"
+ "str s17, [x22, #0x0]\n"
+ "str s21, [x21, #0x0]\n"
+ "str s25, [x20, #0x0]\n"
+ "b 168f\n"
+ "166:" // Height 5: Partial direct writeback: partial_2_0
+ "tbz x10, #1, 167f\n"
+ "str d8, [x28], #0x8\n"
+ "str d12, [x23], #0x8\n"
+ "str d16, [x22], #0x8\n"
+ "str d20, [x21], #0x8\n"
+ "str d24, [x20], #0x8\n"
+ "tbz x10, #0, 168f\n"
+ "st1 { v8.s }[2], [x28]\n"
+ "st1 { v12.s }[2], [x23]\n"
+ "st1 { v16.s }[2], [x22]\n"
+ "st1 { v20.s }[2], [x21]\n"
+ "st1 { v24.s }[2], [x20]\n"
+ "b 168f\n"
+ "167:" // Height 5: Partial direct writeback: partial_1_0
+ "str s8, [x28, #0x0]\n"
+ "str s12, [x23, #0x0]\n"
+ "str s16, [x22, #0x0]\n"
+ "str s20, [x21, #0x0]\n"
+ "str s24, [x20, #0x0]\n"
+ "168:" // Height 5: Partial direct writeback: Done
+ "b 170f\n"
+ "169:" // Height 5: Full writeback
+ "str q8, [x28, #0x0]\n"
+ "str q9, [x28, #0x10]\n"
+ "str q10, [x28, #0x20]\n"
+ "str q11, [x28, #0x30]\n"
+ "add x28, x28, #0x40\n"
+ "str q12, [x23, #0x0]\n"
+ "str q13, [x23, #0x10]\n"
+ "str q14, [x23, #0x20]\n"
+ "str q15, [x23, #0x30]\n"
+ "str q16, [x22, #0x0]\n"
+ "str q17, [x22, #0x10]\n"
+ "str q18, [x22, #0x20]\n"
+ "str q19, [x22, #0x30]\n"
+ "str q20, [x21, #0x0]\n"
+ "str q21, [x21, #0x10]\n"
+ "str q22, [x21, #0x20]\n"
+ "str q23, [x21, #0x30]\n"
+ "str q24, [x20, #0x0]\n"
+ "str q25, [x20, #0x10]\n"
+ "str q26, [x20, #0x20]\n"
+ "str q27, [x20, #0x30]\n"
+ "170:" // Height 5: Writeback done
+ "subs x10, x10, #0x10\n"
+ "bgt 138b\n"
+ "b 206f\n"
+ "171:" // Height 6
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x20, #0x18\n"
"ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "tbz %x[flags], #2, 177f\n"
- "ldr x13, [%x[output_ptr], #0x0]\n"
- "add x13, x13, x19, LSL #2\n"
- "ldr x9, [%x[output_ptr], #0x8]\n"
- "ldr x27, [%x[output_ptr], #0x10]\n"
- "add x9, x9, x19, LSL #2\n"
- "ldr x25, [%x[output_ptr], #0x18]\n"
- "ldr x23, [%x[output_ptr], #0x20]\n"
- "add x27, x27, x19, LSL #2\n"
- "ldr x21, [%x[output_ptr], #0x28]\n"
- "add %x[output_ptr], %x[output_ptr], #0x30\n"
- "add x25, x25, x19, LSL #2\n"
- "add x23, x23, x19, LSL #2\n"
- "add x21, x21, x19, LSL #2\n"
- "b 178f\n"
- "177:" // Height 6: setup direct output
- "mov x13, %x[output_ptr]\n"
- "add x9, x13, x19, LSL #2\n"
- "add x27, x9, x19, LSL #2\n"
- "add x25, x27, x19, LSL #2\n"
- "add x23, x25, x19, LSL #2\n"
- "add x21, x23, x19, LSL #2\n"
- "add %x[output_ptr], x21, x19, LSL #2\n"
- "178:" // Height 6: Column loop
- "tbz %x[flags], #0, 188f\n"
- "cmp x15, #0x10\n"
- "bge 187f\n"
- "tbz x15, #3, 182f\n"
- "ld1 { v8.4s }, [x13], #0x10\n"
- "ld1 { v12.4s }, [x9], #0x10\n"
- "ld1 { v16.4s }, [x27], #0x10\n"
- "ld1 { v20.4s }, [x25], #0x10\n"
- "ld1 { v24.4s }, [x23], #0x10\n"
- "ld1 { v28.4s }, [x21], #0x10\n"
- "ld1 { v9.4s }, [x13], #0x10\n"
- "ld1 { v13.4s }, [x9], #0x10\n"
- "ld1 { v17.4s }, [x27], #0x10\n"
- "ld1 { v21.4s }, [x25], #0x10\n"
- "ld1 { v25.4s }, [x23], #0x10\n"
- "ld1 { v29.4s }, [x21], #0x10\n"
- "tbz x15, #2, 180f\n"
- "ld1 { v10.4s }, [x13], #0x10\n"
- "ld1 { v14.4s }, [x9], #0x10\n"
- "ld1 { v18.4s }, [x27], #0x10\n"
- "ld1 { v22.4s }, [x25], #0x10\n"
- "ld1 { v26.4s }, [x23], #0x10\n"
- "ld1 { v30.4s }, [x21], #0x10\n"
- "tbz x15, #1, 179f\n"
- "mov x19, #0x38\n"
- "ldr d11, [x13], #0x8\n"
- "ldr d15, [x9], #0x8\n"
- "ldr d19, [x27], #0x8\n"
- "ldr d23, [x25], #0x8\n"
- "ldr d27, [x23], #0x8\n"
- "ldr d31, [x21], #0x8\n"
- "tbz x15, #0, 186f\n"
- "ld1 { v11.s }[2], [x13]\n"
- "ld1 { v15.s }[2], [x9]\n"
- "ld1 { v19.s }[2], [x27]\n"
- "ld1 { v23.s }[2], [x25]\n"
- "ld1 { v27.s }[2], [x23]\n"
- "ld1 { v31.s }[2], [x21]\n"
- "b 186f\n"
- "179:" // Height 6: Partial accumulate: partial_1_12
- "mov x19, #0x30\n"
- "tbz x15, #0, 186f\n"
- "ldr s11, [x13, #0x0]\n"
- "ldr s15, [x9, #0x0]\n"
- "ldr s19, [x27, #0x0]\n"
- "ldr s23, [x25, #0x0]\n"
- "ldr s27, [x23, #0x0]\n"
- "ldr s31, [x21, #0x0]\n"
- "b 186f\n"
- "180:" // Height 6: Partial accumulate: partial_2_8
- "tbz x15, #1, 181f\n"
- "ldr d10, [x13], #0x8\n"
- "ldr d14, [x9], #0x8\n"
- "ldr d18, [x27], #0x8\n"
- "ldr d22, [x25], #0x8\n"
- "ldr d26, [x23], #0x8\n"
- "ldr d30, [x21], #0x8\n"
- "mov x19, #0x28\n"
- "tbz x15, #0, 186f\n"
- "ld1 { v10.s }[2], [x13]\n"
- "ld1 { v14.s }[2], [x9]\n"
- "ld1 { v18.s }[2], [x27]\n"
- "ld1 { v22.s }[2], [x25]\n"
- "ld1 { v26.s }[2], [x23]\n"
- "ld1 { v30.s }[2], [x21]\n"
- "b 186f\n"
- "181:" // Height 6: Partial accumulate: partial_1_8
- "mov x19, #0x20\n"
- "tbz x15, #0, 186f\n"
- "ldr s10, [x13, #0x0]\n"
- "ldr s14, [x9, #0x0]\n"
- "ldr s18, [x27, #0x0]\n"
- "ldr s22, [x25, #0x0]\n"
- "ldr s26, [x23, #0x0]\n"
- "ldr s30, [x21, #0x0]\n"
- "b 186f\n"
- "182:" // Height 6: Partial accumulate: partial_4_0
- "tbz x15, #2, 184f\n"
- "ld1 { v8.4s }, [x13], #0x10\n"
- "ld1 { v12.4s }, [x9], #0x10\n"
- "ld1 { v16.4s }, [x27], #0x10\n"
- "ld1 { v20.4s }, [x25], #0x10\n"
- "ld1 { v24.4s }, [x23], #0x10\n"
- "ld1 { v28.4s }, [x21], #0x10\n"
- "tbz x15, #1, 183f\n"
- "mov x19, #0x18\n"
- "ldr d9, [x13], #0x8\n"
- "ldr d13, [x9], #0x8\n"
- "ldr d17, [x27], #0x8\n"
- "ldr d21, [x25], #0x8\n"
- "ldr d25, [x23], #0x8\n"
- "ldr d29, [x21], #0x8\n"
- "tbz x15, #0, 186f\n"
- "ld1 { v9.s }[2], [x13]\n"
- "ld1 { v13.s }[2], [x9]\n"
- "ld1 { v17.s }[2], [x27]\n"
- "ld1 { v21.s }[2], [x25]\n"
- "ld1 { v25.s }[2], [x23]\n"
- "ld1 { v29.s }[2], [x21]\n"
- "b 186f\n"
- "183:" // Height 6: Partial accumulate: partial_1_4
- "mov x19, #0x10\n"
- "tbz x15, #0, 186f\n"
- "ldr s9, [x13, #0x0]\n"
- "ldr s13, [x9, #0x0]\n"
- "ldr s17, [x27, #0x0]\n"
- "ldr s21, [x25, #0x0]\n"
- "ldr s25, [x23, #0x0]\n"
- "ldr s29, [x21, #0x0]\n"
- "b 186f\n"
- "184:" // Height 6: Partial accumulate: partial_2_0
- "tbz x15, #1, 185f\n"
- "ldr d8, [x13], #0x8\n"
- "ldr d12, [x9], #0x8\n"
- "ldr d16, [x27], #0x8\n"
- "ldr d20, [x25], #0x8\n"
- "ldr d24, [x23], #0x8\n"
- "ldr d28, [x21], #0x8\n"
- "mov x19, #0x8\n"
- "tbz x15, #0, 186f\n"
- "ld1 { v8.s }[2], [x13]\n"
- "ld1 { v12.s }[2], [x9]\n"
- "ld1 { v16.s }[2], [x27]\n"
- "ld1 { v20.s }[2], [x25]\n"
- "ld1 { v24.s }[2], [x23]\n"
- "ld1 { v28.s }[2], [x21]\n"
- "b 186f\n"
- "185:" // Height 6: Partial accumulate: partial_1_0
- "mov x19, #0x0\n"
- "ldr s8, [x13, #0x0]\n"
- "ldr s12, [x9, #0x0]\n"
- "ldr s16, [x27, #0x0]\n"
- "ldr s20, [x25, #0x0]\n"
- "ldr s24, [x23, #0x0]\n"
- "ldr s28, [x21, #0x0]\n"
- "186:" // Height 6: Partial accumulate: Done
- "sub x13, x13, x19\n"
- "sub x9, x9, x19\n"
- "sub x27, x27, x19\n"
- "sub x25, x25, x19\n"
- "sub x23, x23, x19\n"
- "sub x21, x21, x19\n"
- "b 189f\n"
- "187:" // Height 6: full accumulate
- "ldr q8, [x13, #0x0]\n"
- "ldr q9, [x13, #0x10]\n"
- "ldr q10, [x13, #0x20]\n"
- "ldr q11, [x13, #0x30]\n"
- "ldr q12, [x9, #0x0]\n"
- "ldr q13, [x9, #0x10]\n"
- "ldr q14, [x9, #0x20]\n"
- "ldr q15, [x9, #0x30]\n"
- "ldr q16, [x27, #0x0]\n"
- "ldr q17, [x27, #0x10]\n"
- "ldr q18, [x27, #0x20]\n"
- "ldr q19, [x27, #0x30]\n"
- "ldr q20, [x25, #0x0]\n"
- "ldr q21, [x25, #0x10]\n"
- "ldr q22, [x25, #0x20]\n"
- "ldr q23, [x25, #0x30]\n"
- "ldr q24, [x23, #0x0]\n"
- "ldr q25, [x23, #0x10]\n"
- "ldr q26, [x23, #0x20]\n"
- "ldr q27, [x23, #0x30]\n"
- "ldr q28, [x21, #0x0]\n"
- "ldr q29, [x21, #0x10]\n"
- "ldr q30, [x21, #0x20]\n"
- "ldr q31, [x21, #0x30]\n"
- "b 189f\n"
- "188:" // Height 6: no accumulate
+ "madd %x[output_ptr], x19, x20, %x[output_ptr]\n"
+ "172:" // Height 6: Column loop
+ "tbz %x[flags], #0, 182f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x10, #0x10\n"
+ "add x23, x28, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "add x21, x22, x19, LSL #2\n"
+ "add x20, x21, x19, LSL #2\n"
+ "add x19, x20, x19, LSL #2\n"
+ "bge 181f\n"
+ "tbz x10, #3, 176f\n"
+ "ld1 { v8.4s }, [x28], #0x10\n"
+ "ld1 { v12.4s }, [x23], #0x10\n"
+ "ld1 { v16.4s }, [x22], #0x10\n"
+ "ld1 { v20.4s }, [x21], #0x10\n"
+ "ld1 { v24.4s }, [x20], #0x10\n"
+ "ld1 { v28.4s }, [x19], #0x10\n"
+ "ld1 { v9.4s }, [x28], #0x10\n"
+ "ld1 { v13.4s }, [x23], #0x10\n"
+ "ld1 { v17.4s }, [x22], #0x10\n"
+ "ld1 { v21.4s }, [x21], #0x10\n"
+ "ld1 { v25.4s }, [x20], #0x10\n"
+ "ld1 { v29.4s }, [x19], #0x10\n"
+ "tbz x10, #2, 174f\n"
+ "ld1 { v10.4s }, [x28], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "ld1 { v18.4s }, [x22], #0x10\n"
+ "ld1 { v22.4s }, [x21], #0x10\n"
+ "ld1 { v26.4s }, [x20], #0x10\n"
+ "ld1 { v30.4s }, [x19], #0x10\n"
+ "tbz x10, #1, 173f\n"
+ "mov x24, #0x38\n"
+ "ldr d11, [x28], #0x8\n"
+ "ldr d15, [x23], #0x8\n"
+ "ldr d19, [x22], #0x8\n"
+ "ldr d23, [x21], #0x8\n"
+ "ldr d27, [x20], #0x8\n"
+ "ldr d31, [x19], #0x8\n"
+ "tbz x10, #0, 180f\n"
+ "ld1 { v11.s }[2], [x28]\n"
+ "ld1 { v15.s }[2], [x23]\n"
+ "ld1 { v19.s }[2], [x22]\n"
+ "ld1 { v23.s }[2], [x21]\n"
+ "ld1 { v27.s }[2], [x20]\n"
+ "ld1 { v31.s }[2], [x19]\n"
+ "b 180f\n"
+ "173:" // Height 6: Partial accumulate: partial_1_12
+ "mov x24, #0x30\n"
+ "tbz x10, #0, 180f\n"
+ "ldr s11, [x28, #0x0]\n"
+ "ldr s15, [x23, #0x0]\n"
+ "ldr s19, [x22, #0x0]\n"
+ "ldr s23, [x21, #0x0]\n"
+ "ldr s27, [x20, #0x0]\n"
+ "ldr s31, [x19, #0x0]\n"
+ "b 180f\n"
+ "174:" // Height 6: Partial accumulate: partial_2_8
+ "tbz x10, #1, 175f\n"
+ "ldr d10, [x28], #0x8\n"
+ "ldr d14, [x23], #0x8\n"
+ "mov x24, #0x28\n"
+ "ldr d18, [x22], #0x8\n"
+ "ldr d22, [x21], #0x8\n"
+ "ldr d26, [x20], #0x8\n"
+ "ldr d30, [x19], #0x8\n"
+ "tbz x10, #0, 180f\n"
+ "ld1 { v10.s }[2], [x28]\n"
+ "ld1 { v14.s }[2], [x23]\n"
+ "ld1 { v18.s }[2], [x22]\n"
+ "ld1 { v22.s }[2], [x21]\n"
+ "ld1 { v26.s }[2], [x20]\n"
+ "ld1 { v30.s }[2], [x19]\n"
+ "b 180f\n"
+ "175:" // Height 6: Partial accumulate: partial_1_8
+ "mov x24, #0x20\n"
+ "tbz x10, #0, 180f\n"
+ "ldr s10, [x28, #0x0]\n"
+ "ldr s14, [x23, #0x0]\n"
+ "ldr s18, [x22, #0x0]\n"
+ "ldr s22, [x21, #0x0]\n"
+ "ldr s26, [x20, #0x0]\n"
+ "ldr s30, [x19, #0x0]\n"
+ "b 180f\n"
+ "176:" // Height 6: Partial accumulate: partial_4_0
+ "tbz x10, #2, 178f\n"
+ "ld1 { v8.4s }, [x28], #0x10\n"
+ "ld1 { v12.4s }, [x23], #0x10\n"
+ "ld1 { v16.4s }, [x22], #0x10\n"
+ "ld1 { v20.4s }, [x21], #0x10\n"
+ "ld1 { v24.4s }, [x20], #0x10\n"
+ "ld1 { v28.4s }, [x19], #0x10\n"
+ "tbz x10, #1, 177f\n"
+ "mov x24, #0x18\n"
+ "ldr d9, [x28], #0x8\n"
+ "ldr d13, [x23], #0x8\n"
+ "ldr d17, [x22], #0x8\n"
+ "ldr d21, [x21], #0x8\n"
+ "ldr d25, [x20], #0x8\n"
+ "ldr d29, [x19], #0x8\n"
+ "tbz x10, #0, 180f\n"
+ "ld1 { v9.s }[2], [x28]\n"
+ "ld1 { v13.s }[2], [x23]\n"
+ "ld1 { v17.s }[2], [x22]\n"
+ "ld1 { v21.s }[2], [x21]\n"
+ "ld1 { v25.s }[2], [x20]\n"
+ "ld1 { v29.s }[2], [x19]\n"
+ "b 180f\n"
+ "177:" // Height 6: Partial accumulate: partial_1_4
+ "mov x24, #0x10\n"
+ "tbz x10, #0, 180f\n"
+ "ldr s9, [x28, #0x0]\n"
+ "ldr s13, [x23, #0x0]\n"
+ "ldr s17, [x22, #0x0]\n"
+ "ldr s21, [x21, #0x0]\n"
+ "ldr s25, [x20, #0x0]\n"
+ "ldr s29, [x19, #0x0]\n"
+ "b 180f\n"
+ "178:" // Height 6: Partial accumulate: partial_2_0
+ "tbz x10, #1, 179f\n"
+ "ldr d8, [x28], #0x8\n"
+ "ldr d12, [x23], #0x8\n"
+ "mov x24, #0x8\n"
+ "ldr d16, [x22], #0x8\n"
+ "ldr d20, [x21], #0x8\n"
+ "ldr d24, [x20], #0x8\n"
+ "ldr d28, [x19], #0x8\n"
+ "tbz x10, #0, 180f\n"
+ "ld1 { v8.s }[2], [x28]\n"
+ "ld1 { v12.s }[2], [x23]\n"
+ "ld1 { v16.s }[2], [x22]\n"
+ "ld1 { v20.s }[2], [x21]\n"
+ "ld1 { v24.s }[2], [x20]\n"
+ "ld1 { v28.s }[2], [x19]\n"
+ "b 180f\n"
+ "179:" // Height 6: Partial accumulate: partial_1_0
+ "ldr s8, [x28, #0x0]\n"
+ "mov x24, #0x0\n"
+ "ldr s12, [x23, #0x0]\n"
+ "ldr s16, [x22, #0x0]\n"
+ "ldr s20, [x21, #0x0]\n"
+ "ldr s24, [x20, #0x0]\n"
+ "ldr s28, [x19, #0x0]\n"
+ "180:" // Height 6: Partial accumulate: Done
+ "sub x28, x28, x24\n"
+ "b 183f\n"
+ "181:" // Height 6: full accumulate
+ "ldr q8, [x28, #0x0]\n"
+ "ldr q9, [x28, #0x10]\n"
+ "ldr q10, [x28, #0x20]\n"
+ "ldr q11, [x28, #0x30]\n"
+ "ldr q12, [x23, #0x0]\n"
+ "ldr q13, [x23, #0x10]\n"
+ "ldr q14, [x23, #0x20]\n"
+ "ldr q15, [x23, #0x30]\n"
+ "ldr q16, [x22, #0x0]\n"
+ "ldr q17, [x22, #0x10]\n"
+ "ldr q18, [x22, #0x20]\n"
+ "ldr q19, [x22, #0x30]\n"
+ "ldr q20, [x21, #0x0]\n"
+ "ldr q21, [x21, #0x10]\n"
+ "ldr q22, [x21, #0x20]\n"
+ "ldr q23, [x21, #0x30]\n"
+ "ldr q24, [x20, #0x0]\n"
+ "ldr q25, [x20, #0x10]\n"
+ "ldr q26, [x20, #0x20]\n"
+ "ldr q27, [x20, #0x30]\n"
+ "ldr q28, [x19, #0x0]\n"
+ "ldr q29, [x19, #0x10]\n"
+ "ldr q30, [x19, #0x20]\n"
+ "ldr q31, [x19, #0x30]\n"
+ "b 183f\n"
+ "182:" // Height 6: no accumulate
"movi v8.4s, #0x0\n"
"movi v9.4s, #0x0\n"
"movi v10.4s, #0x0\n"
@@ -2738,299 +2667,299 @@ void a64_hybrid_u8u32_dot_6x16 (
"movi v29.4s, #0x0\n"
"movi v30.4s, #0x0\n"
"movi v31.4s, #0x0\n"
- "189:" // Height 6: setup done
- "mov x12, #0x0\n"
- "190:" // Height 6: String loop
+ "183:" // Height 6: setup done
+ "mov x27, #0x0\n"
+ "184:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w11, [x20, x12, LSL #0x2]\n"
- "tbz %x[flags], #3, 191f\n"
- "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 185f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x10, [x20, #0x0]\n"
- "ldr x28, [x20, #0x8]\n"
- "ldr x26, [x20, #0x10]\n"
- "ldr x24, [x20, #0x18]\n"
- "ldr x22, [x20, #0x20]\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "ldr x22, [x20, #0x18]\n"
+ "ldr x21, [x20, #0x20]\n"
"ldr x20, [x20, #0x28]\n"
- "cbnz x12, 192f\n"
+ "cbnz x27, 186f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x10, x10, x19\n"
- "add x28, x28, x19\n"
- "add x26, x26, x19\n"
+ "add x25, x25, x19\n"
"add x24, x24, x19\n"
+ "add x23, x23, x19\n"
"add x22, x22, x19\n"
+ "add x21, x21, x19\n"
"add x20, x20, x19\n"
- "b 192f\n"
- "191:" // Height 6: setup direct input
- "mov x10, %x[input_ptr]\n"
- "add x28, x10, x19\n"
- "add x26, x28, x19\n"
- "add x24, x26, x19\n"
- "add x22, x24, x19\n"
- "add x20, x22, x19\n"
- "192:" // Height 6: input setup done
- "cmp x11, #0x10\n"
- "blt 195f\n"
- "cmp x11, #0x20\n"
- "blt 194f\n"
- "193:" // Height 6: Multiply loop: Main loop head
- "ldr q0, [x10, #0x0]\n"
- "ldr q1, [x28, #0x0]\n"
- "ldr q2, [x26, #0x0]\n"
- "ldr q3, [x24, #0x0]\n"
- "ldr q4, [x22, #0x0]\n"
+ "b 186f\n"
+ "185:" // Height 6: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19\n"
+ "add x23, x24, x19\n"
+ "add x22, x23, x19\n"
+ "add x21, x22, x19\n"
+ "add x20, x21, x19\n"
+ "186:" // Height 6: input setup done
+ "cmp x26, #0x10\n"
+ "blt 189f\n"
+ "ldr q0, [x25, #0x0]\n"
+ "ldr q1, [x24, #0x0]\n"
+ "cmp x26, #0x20\n"
+ "ldr q2, [x23, #0x0]\n"
+ "ldr q3, [x22, #0x0]\n"
+ "ldr q4, [x21, #0x0]\n"
"ldr q5, [x20, #0x0]\n"
- "ldr q6, [x14, #0x0]\n"
+ "ldr q6, [x9, #0x0]\n"
+ "blt 188f\n"
+ "187:" // Height 6: Multiply loop: Main loop head
".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x14, #0x10]\n"
+ "ldr q7, [x9, #0x10]\n"
+ "add x25, x25, #0x10\n"
".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
- "add x10, x10, #0x10\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x24, x24, #0x10\n"
".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
- "add x28, x28, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "add x23, x23, #0x10\n"
".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "add x26, x26, #0x10\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "add x22, x22, #0x10\n"
".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "add x24, x24, #0x10\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "add x21, x21, #0x10\n"
".inst 0x6f85e0dc // udot v28.4s, v6.16b, v5.4b[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "ldr q6, [x14, #0x20]\n"
+ "ldr q6, [x9, #0x20]\n"
+ "add x20, x20, #0x10\n"
".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
- "add x22, x22, #0x10\n"
- "prfm pldl1keep, [x22, #0x80]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
+ "sub x26, x26, #0x10\n"
".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
- "add x20, x20, #0x10\n"
- ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
"prfm pldl1keep, [x20, #0x80]\n"
- "sub x11, x11, #0x10\n"
+ "cmp x26, #0x20\n"
+ ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
- "cmp x11, #0x20\n"
".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n"
".inst 0x6f85e0fd // udot v29.4s, v7.16b, v5.4b[0]\n"
- "ldr q7, [x14, #0x30]\n"
+ "ldr q7, [x9, #0x30]\n"
".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n"
".inst 0x6f85e0de // udot v30.4s, v6.16b, v5.4b[0]\n"
- "ldr q6, [x14, #0x40]\n"
+ "ldr q6, [x9, #0x40]\n"
".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n"
".inst 0x6f85e0ff // udot v31.4s, v7.16b, v5.4b[0]\n"
- "ldr q7, [x14, #0x50]\n"
+ "ldr q7, [x9, #0x50]\n"
".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n"
".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n"
".inst 0x6fa3e0d4 // udot v20.4s, v6.16b, v3.4b[1]\n"
".inst 0x6fa4e0d8 // udot v24.4s, v6.16b, v4.4b[1]\n"
".inst 0x6fa5e0dc // udot v28.4s, v6.16b, v5.4b[1]\n"
- "ldr q6, [x14, #0x60]\n"
+ "ldr q6, [x9, #0x60]\n"
".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n"
".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n"
".inst 0x6fa3e0f5 // udot v21.4s, v7.16b, v3.4b[1]\n"
".inst 0x6fa4e0f9 // udot v25.4s, v7.16b, v4.4b[1]\n"
".inst 0x6fa5e0fd // udot v29.4s, v7.16b, v5.4b[1]\n"
- "ldr q7, [x14, #0x70]\n"
+ "ldr q7, [x9, #0x70]\n"
".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n"
".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n"
".inst 0x6fa3e0d6 // udot v22.4s, v6.16b, v3.4b[1]\n"
".inst 0x6fa4e0da // udot v26.4s, v6.16b, v4.4b[1]\n"
".inst 0x6fa5e0de // udot v30.4s, v6.16b, v5.4b[1]\n"
- "ldr q6, [x14, #0x80]\n"
+ "ldr q6, [x9, #0x80]\n"
".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n"
".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n"
".inst 0x6fa3e0f7 // udot v23.4s, v7.16b, v3.4b[1]\n"
".inst 0x6fa4e0fb // udot v27.4s, v7.16b, v4.4b[1]\n"
".inst 0x6fa5e0ff // udot v31.4s, v7.16b, v5.4b[1]\n"
- "ldr q7, [x14, #0x90]\n"
+ "ldr q7, [x9, #0x90]\n"
".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n"
".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n"
".inst 0x6f83e8d4 // udot v20.4s, v6.16b, v3.4b[2]\n"
".inst 0x6f84e8d8 // udot v24.4s, v6.16b, v4.4b[2]\n"
".inst 0x6f85e8dc // udot v28.4s, v6.16b, v5.4b[2]\n"
- "ldr q6, [x14, #0xa0]\n"
+ "ldr q6, [x9, #0xa0]\n"
".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n"
".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n"
".inst 0x6f83e8f5 // udot v21.4s, v7.16b, v3.4b[2]\n"
".inst 0x6f84e8f9 // udot v25.4s, v7.16b, v4.4b[2]\n"
".inst 0x6f85e8fd // udot v29.4s, v7.16b, v5.4b[2]\n"
- "ldr q7, [x14, #0xb0]\n"
+ "ldr q7, [x9, #0xb0]\n"
".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n"
".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n"
".inst 0x6f83e8d6 // udot v22.4s, v6.16b, v3.4b[2]\n"
".inst 0x6f84e8da // udot v26.4s, v6.16b, v4.4b[2]\n"
".inst 0x6f85e8de // udot v30.4s, v6.16b, v5.4b[2]\n"
- "ldr q6, [x14, #0xc0]\n"
+ "ldr q6, [x9, #0xc0]\n"
".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n"
".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n"
".inst 0x6f83e8f7 // udot v23.4s, v7.16b, v3.4b[2]\n"
".inst 0x6f84e8fb // udot v27.4s, v7.16b, v4.4b[2]\n"
".inst 0x6f85e8ff // udot v31.4s, v7.16b, v5.4b[2]\n"
- "ldr q7, [x14, #0xd0]\n"
+ "ldr q7, [x9, #0xd0]\n"
".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n"
".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n"
".inst 0x6fa3e8d4 // udot v20.4s, v6.16b, v3.4b[3]\n"
".inst 0x6fa4e8d8 // udot v24.4s, v6.16b, v4.4b[3]\n"
".inst 0x6fa5e8dc // udot v28.4s, v6.16b, v5.4b[3]\n"
- "ldr q6, [x14, #0xe0]\n"
+ "ldr q6, [x9, #0xe0]\n"
".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n"
".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n"
".inst 0x6fa3e8f5 // udot v21.4s, v7.16b, v3.4b[3]\n"
".inst 0x6fa4e8f9 // udot v25.4s, v7.16b, v4.4b[3]\n"
".inst 0x6fa5e8fd // udot v29.4s, v7.16b, v5.4b[3]\n"
- "ldr q7, [x14, #0xf0]\n"
+ "ldr q7, [x9, #0xf0]\n"
+ "add x9, x9, #0x100\n"
".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
- "add x14, x14, #0x100\n"
".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n"
".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n"
".inst 0x6fa3e8d6 // udot v22.4s, v6.16b, v3.4b[3]\n"
".inst 0x6fa4e8da // udot v26.4s, v6.16b, v4.4b[3]\n"
".inst 0x6fa5e8de // udot v30.4s, v6.16b, v5.4b[3]\n"
+ "ldr q6, [x9, #0x0]\n"
".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
+ "ldr q0, [x25, #0x0]\n"
".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n"
+ "ldr q1, [x24, #0x0]\n"
".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n"
+ "ldr q2, [x23, #0x0]\n"
".inst 0x6fa3e8f7 // udot v23.4s, v7.16b, v3.4b[3]\n"
+ "ldr q3, [x22, #0x0]\n"
".inst 0x6fa4e8fb // udot v27.4s, v7.16b, v4.4b[3]\n"
+ "ldr q4, [x21, #0x0]\n"
".inst 0x6fa5e8ff // udot v31.4s, v7.16b, v5.4b[3]\n"
- "bge 193b\n"
- "194:" // Height 6: Multiply loop: Single iteration only
- "sub x11, x11, #0x10\n"
- "ldr q0, [x10, #0x0]\n"
- "ldr q1, [x28, #0x0]\n"
- "ldr q2, [x26, #0x0]\n"
- "ldr q3, [x24, #0x0]\n"
- "ldr q4, [x22, #0x0]\n"
"ldr q5, [x20, #0x0]\n"
- "ldr q6, [x14, #0x0]\n"
+ "bge 187b\n"
+ "188:" // Height 6: Multiply loop: Single iteration only
".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x14, #0x10]\n"
+ "ldr q7, [x9, #0x10]\n"
+ "sub x26, x26, #0x10\n"
".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
- "add x10, x10, #0x10\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "add x25, x25, #0x10\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
- "add x28, x28, #0x10\n"
- ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "add x26, x26, #0x10\n"
- ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
"add x24, x24, #0x10\n"
- ".inst 0x6f85e0dc // udot v28.4s, v6.16b, v5.4b[0]\n"
+ ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
"prfm pldl1keep, [x24, #0x80]\n"
- "ldr q6, [x14, #0x20]\n"
- ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
"add x22, x22, #0x10\n"
+ ".inst 0x6f85e0dc // udot v28.4s, v6.16b, v5.4b[0]\n"
"prfm pldl1keep, [x22, #0x80]\n"
- ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
+ "add x21, x21, #0x10\n"
+ ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
+ "ldr q6, [x9, #0x20]\n"
"add x20, x20, #0x10\n"
+ ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
"prfm pldl1keep, [x20, #0x80]\n"
".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n"
".inst 0x6f85e0fd // udot v29.4s, v7.16b, v5.4b[0]\n"
- "ldr q7, [x14, #0x30]\n"
+ "ldr q7, [x9, #0x30]\n"
".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n"
".inst 0x6f85e0de // udot v30.4s, v6.16b, v5.4b[0]\n"
- "ldr q6, [x14, #0x40]\n"
+ "ldr q6, [x9, #0x40]\n"
".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n"
".inst 0x6f85e0ff // udot v31.4s, v7.16b, v5.4b[0]\n"
- "ldr q7, [x14, #0x50]\n"
+ "ldr q7, [x9, #0x50]\n"
".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n"
".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n"
".inst 0x6fa3e0d4 // udot v20.4s, v6.16b, v3.4b[1]\n"
".inst 0x6fa4e0d8 // udot v24.4s, v6.16b, v4.4b[1]\n"
".inst 0x6fa5e0dc // udot v28.4s, v6.16b, v5.4b[1]\n"
- "ldr q6, [x14, #0x60]\n"
+ "ldr q6, [x9, #0x60]\n"
".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n"
".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n"
".inst 0x6fa3e0f5 // udot v21.4s, v7.16b, v3.4b[1]\n"
".inst 0x6fa4e0f9 // udot v25.4s, v7.16b, v4.4b[1]\n"
".inst 0x6fa5e0fd // udot v29.4s, v7.16b, v5.4b[1]\n"
- "ldr q7, [x14, #0x70]\n"
+ "ldr q7, [x9, #0x70]\n"
".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n"
".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n"
".inst 0x6fa3e0d6 // udot v22.4s, v6.16b, v3.4b[1]\n"
".inst 0x6fa4e0da // udot v26.4s, v6.16b, v4.4b[1]\n"
".inst 0x6fa5e0de // udot v30.4s, v6.16b, v5.4b[1]\n"
- "ldr q6, [x14, #0x80]\n"
+ "ldr q6, [x9, #0x80]\n"
".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n"
".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n"
".inst 0x6fa3e0f7 // udot v23.4s, v7.16b, v3.4b[1]\n"
".inst 0x6fa4e0fb // udot v27.4s, v7.16b, v4.4b[1]\n"
".inst 0x6fa5e0ff // udot v31.4s, v7.16b, v5.4b[1]\n"
- "ldr q7, [x14, #0x90]\n"
+ "ldr q7, [x9, #0x90]\n"
".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n"
".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n"
".inst 0x6f83e8d4 // udot v20.4s, v6.16b, v3.4b[2]\n"
".inst 0x6f84e8d8 // udot v24.4s, v6.16b, v4.4b[2]\n"
".inst 0x6f85e8dc // udot v28.4s, v6.16b, v5.4b[2]\n"
- "ldr q6, [x14, #0xa0]\n"
+ "ldr q6, [x9, #0xa0]\n"
".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n"
".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n"
".inst 0x6f83e8f5 // udot v21.4s, v7.16b, v3.4b[2]\n"
".inst 0x6f84e8f9 // udot v25.4s, v7.16b, v4.4b[2]\n"
".inst 0x6f85e8fd // udot v29.4s, v7.16b, v5.4b[2]\n"
- "ldr q7, [x14, #0xb0]\n"
+ "ldr q7, [x9, #0xb0]\n"
".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n"
".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n"
".inst 0x6f83e8d6 // udot v22.4s, v6.16b, v3.4b[2]\n"
".inst 0x6f84e8da // udot v26.4s, v6.16b, v4.4b[2]\n"
".inst 0x6f85e8de // udot v30.4s, v6.16b, v5.4b[2]\n"
- "ldr q6, [x14, #0xc0]\n"
+ "ldr q6, [x9, #0xc0]\n"
".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n"
".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n"
".inst 0x6f83e8f7 // udot v23.4s, v7.16b, v3.4b[2]\n"
".inst 0x6f84e8fb // udot v27.4s, v7.16b, v4.4b[2]\n"
".inst 0x6f85e8ff // udot v31.4s, v7.16b, v5.4b[2]\n"
- "ldr q7, [x14, #0xd0]\n"
+ "ldr q7, [x9, #0xd0]\n"
".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n"
".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n"
".inst 0x6fa3e8d4 // udot v20.4s, v6.16b, v3.4b[3]\n"
".inst 0x6fa4e8d8 // udot v24.4s, v6.16b, v4.4b[3]\n"
".inst 0x6fa5e8dc // udot v28.4s, v6.16b, v5.4b[3]\n"
- "ldr q6, [x14, #0xe0]\n"
+ "ldr q6, [x9, #0xe0]\n"
".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n"
".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n"
".inst 0x6fa3e8f5 // udot v21.4s, v7.16b, v3.4b[3]\n"
".inst 0x6fa4e8f9 // udot v25.4s, v7.16b, v4.4b[3]\n"
".inst 0x6fa5e8fd // udot v29.4s, v7.16b, v5.4b[3]\n"
- "ldr q7, [x14, #0xf0]\n"
+ "ldr q7, [x9, #0xf0]\n"
+ "add x9, x9, #0x100\n"
".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
- "add x14, x14, #0x100\n"
".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n"
".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n"
".inst 0x6fa3e8d6 // udot v22.4s, v6.16b, v3.4b[3]\n"
@@ -3042,37 +2971,37 @@ void a64_hybrid_u8u32_dot_6x16 (
".inst 0x6fa3e8f7 // udot v23.4s, v7.16b, v3.4b[3]\n"
".inst 0x6fa4e8fb // udot v27.4s, v7.16b, v4.4b[3]\n"
".inst 0x6fa5e8ff // udot v31.4s, v7.16b, v5.4b[3]\n"
- "195:" // Height 6: Multiply loop: Main loop skip
- "cbz x11, 200f\n"
- "cmp x11, #0x4\n"
- "blt 197f\n"
- "196:" // Height 6: Multiply loop: Odd block loop
- "ldr s0, [x10], #0x4\n"
- "ldr s1, [x28], #0x4\n"
- "ldr s2, [x26], #0x4\n"
- "ldr s3, [x24], #0x4\n"
- "ldr s4, [x22], #0x4\n"
+ "189:" // Height 6: Multiply loop: Main loop skip
+ "cbz x26, 194f\n"
+ "cmp x26, #0x4\n"
+ "blt 191f\n"
+ "190:" // Height 6: Multiply loop: Odd block loop
+ "ldr s0, [x25], #0x4\n"
+ "sub x26, x26, #0x4\n"
+ "ldr s1, [x24], #0x4\n"
+ "cmp x26, #0x4\n"
+ "ldr s2, [x23], #0x4\n"
+ "ldr s3, [x22], #0x4\n"
+ "ldr s4, [x21], #0x4\n"
"ldr s5, [x20], #0x4\n"
- "ldr q6, [x14, #0x0]\n"
+ "ldr q6, [x9, #0x0]\n"
".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x14, #0x10]\n"
+ "ldr q7, [x9, #0x10]\n"
".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
- "sub x11, x11, #0x4\n"
".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
- "cmp x11, #0x4\n"
".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n"
".inst 0x6f85e0dc // udot v28.4s, v6.16b, v5.4b[0]\n"
- "ldr q6, [x14, #0x20]\n"
+ "ldr q6, [x9, #0x20]\n"
".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n"
".inst 0x6f85e0fd // udot v29.4s, v7.16b, v5.4b[0]\n"
- "ldr q7, [x14, #0x30]\n"
+ "ldr q7, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- "add x14, x14, #0x40\n"
".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
@@ -3084,50 +3013,50 @@ void a64_hybrid_u8u32_dot_6x16 (
".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n"
".inst 0x6f85e0ff // udot v31.4s, v7.16b, v5.4b[0]\n"
- "bge 196b\n"
- "cbz x11, 200f\n"
- "197:" // Height 6: Multiply loop: Skip odd blocks
- "tbz x11, #1, 198f\n"
- "ldr h0, [x10], #0x2\n"
- "ldr h1, [x28], #0x2\n"
- "ldr h2, [x26], #0x2\n"
- "ldr h3, [x24], #0x2\n"
- "ldr h4, [x22], #0x2\n"
+ "bge 190b\n"
+ "cbz x26, 194f\n"
+ "191:" // Height 6: Multiply loop: Skip odd blocks
+ "tbz x26, #1, 192f\n"
+ "ldr h0, [x25], #0x2\n"
+ "ldr h1, [x24], #0x2\n"
+ "ldr h2, [x23], #0x2\n"
+ "ldr h3, [x22], #0x2\n"
+ "ldr h4, [x21], #0x2\n"
"ldr h5, [x20], #0x2\n"
- "tbz x11, #0, 199f\n"
- "ld1 { v0.b }[2], [x10]\n"
- "ld1 { v1.b }[2], [x28]\n"
- "ld1 { v2.b }[2], [x26]\n"
- "ld1 { v3.b }[2], [x24]\n"
- "ld1 { v4.b }[2], [x22]\n"
+ "tbz x26, #0, 193f\n"
+ "ld1 { v0.b }[2], [x25]\n"
+ "ld1 { v1.b }[2], [x24]\n"
+ "ld1 { v2.b }[2], [x23]\n"
+ "ld1 { v3.b }[2], [x22]\n"
+ "ld1 { v4.b }[2], [x21]\n"
"ld1 { v5.b }[2], [x20]\n"
- "b 199f\n"
- "198:" // Height 6: Multiply loop: Ragged operand read: partial_1_0
- "ldr b0, [x10, #0x0]\n"
- "ldr b1, [x28, #0x0]\n"
- "ldr b2, [x26, #0x0]\n"
- "ldr b3, [x24, #0x0]\n"
- "ldr b4, [x22, #0x0]\n"
+ "b 193f\n"
+ "192:" // Height 6: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x25, #0x0]\n"
+ "ldr b1, [x24, #0x0]\n"
+ "ldr b2, [x23, #0x0]\n"
+ "ldr b3, [x22, #0x0]\n"
+ "ldr b4, [x21, #0x0]\n"
"ldr b5, [x20, #0x0]\n"
- "199:" // Height 6: Multiply loop: Ragged operand read: Done
- "ldr q6, [x14, #0x0]\n"
+ "193:" // Height 6: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x9, #0x0]\n"
".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
- "ldr q7, [x14, #0x10]\n"
+ "ldr q7, [x9, #0x10]\n"
".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n"
".inst 0x6f85e0dc // udot v28.4s, v6.16b, v5.4b[0]\n"
- "ldr q6, [x14, #0x20]\n"
+ "ldr q6, [x9, #0x20]\n"
".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n"
".inst 0x6f85e0fd // udot v29.4s, v7.16b, v5.4b[0]\n"
- "ldr q7, [x14, #0x30]\n"
+ "ldr q7, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- "add x14, x14, #0x40\n"
".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
@@ -3139,195 +3068,196 @@ void a64_hybrid_u8u32_dot_6x16 (
".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n"
".inst 0x6f85e0ff // udot v31.4s, v7.16b, v5.4b[0]\n"
- "200:" // Height 6: Multiply loop: No odd multiplies
+ "194:" // Height 6: Multiply loop: No odd multiplies
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "add x12, x12, #0x1\n"
- "cmp x12, x19\n"
- "bne 190b\n"
- "prfm pstl1keep, [x13, #0x0]\n"
- "prfm pstl1keep, [x9, #0x0]\n"
- "cmp x15, #0x10\n"
- "prfm pstl1keep, [x27, #0x0]\n"
- "prfm pstl1keep, [x25, #0x0]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 184b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "prfm pstl1keep, [x28, #0x0]\n"
+ "cmp x10, #0x10\n"
+ "add x23, x28, x19, LSL #2\n"
"prfm pstl1keep, [x23, #0x0]\n"
+ "add x22, x23, x19, LSL #2\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "add x21, x22, x19, LSL #2\n"
"prfm pstl1keep, [x21, #0x0]\n"
- "bge 209f\n"
- "tbz x15, #3, 204f\n"
- "st1 { v8.4s }, [x13], #0x10\n"
- "st1 { v9.4s }, [x13], #0x10\n"
- "st1 { v12.4s }, [x9], #0x10\n"
- "st1 { v13.4s }, [x9], #0x10\n"
- "st1 { v16.4s }, [x27], #0x10\n"
- "st1 { v17.4s }, [x27], #0x10\n"
- "st1 { v20.4s }, [x25], #0x10\n"
- "st1 { v21.4s }, [x25], #0x10\n"
- "st1 { v24.4s }, [x23], #0x10\n"
- "st1 { v25.4s }, [x23], #0x10\n"
- "st1 { v28.4s }, [x21], #0x10\n"
- "st1 { v29.4s }, [x21], #0x10\n"
- "tbz x15, #2, 202f\n"
- "st1 { v10.4s }, [x13], #0x10\n"
- "st1 { v14.4s }, [x9], #0x10\n"
- "st1 { v18.4s }, [x27], #0x10\n"
- "st1 { v22.4s }, [x25], #0x10\n"
- "st1 { v26.4s }, [x23], #0x10\n"
- "st1 { v30.4s }, [x21], #0x10\n"
- "tbz x15, #1, 201f\n"
- "str d11, [x13], #0x8\n"
- "str d15, [x9], #0x8\n"
- "str d19, [x27], #0x8\n"
- "str d23, [x25], #0x8\n"
- "str d27, [x23], #0x8\n"
- "str d31, [x21], #0x8\n"
- "tbz x15, #0, 208f\n"
- "st1 { v11.s }[2], [x13]\n"
- "st1 { v15.s }[2], [x9]\n"
- "st1 { v19.s }[2], [x27]\n"
- "st1 { v23.s }[2], [x25]\n"
- "st1 { v27.s }[2], [x23]\n"
- "st1 { v31.s }[2], [x21]\n"
- "b 208f\n"
- "201:" // Height 6: Partial direct writeback: partial_1_12
- "tbz x15, #0, 208f\n"
- "str s11, [x13, #0x0]\n"
- "str s15, [x9, #0x0]\n"
- "str s19, [x27, #0x0]\n"
- "str s23, [x25, #0x0]\n"
- "str s27, [x23, #0x0]\n"
- "str s31, [x21, #0x0]\n"
- "b 208f\n"
- "202:" // Height 6: Partial direct writeback: partial_2_8
- "tbz x15, #1, 203f\n"
- "str d10, [x13], #0x8\n"
- "str d14, [x9], #0x8\n"
- "str d18, [x27], #0x8\n"
- "str d22, [x25], #0x8\n"
- "str d26, [x23], #0x8\n"
- "str d30, [x21], #0x8\n"
- "tbz x15, #0, 208f\n"
- "st1 { v10.s }[2], [x13]\n"
- "st1 { v14.s }[2], [x9]\n"
- "st1 { v18.s }[2], [x27]\n"
- "st1 { v22.s }[2], [x25]\n"
- "st1 { v26.s }[2], [x23]\n"
- "st1 { v30.s }[2], [x21]\n"
- "b 208f\n"
- "203:" // Height 6: Partial direct writeback: partial_1_8
- "tbz x15, #0, 208f\n"
- "str s10, [x13, #0x0]\n"
- "str s14, [x9, #0x0]\n"
- "str s18, [x27, #0x0]\n"
- "str s22, [x25, #0x0]\n"
- "str s26, [x23, #0x0]\n"
- "str s30, [x21, #0x0]\n"
- "b 208f\n"
- "204:" // Height 6: Partial direct writeback: partial_4_0
- "tbz x15, #2, 206f\n"
- "st1 { v8.4s }, [x13], #0x10\n"
- "st1 { v12.4s }, [x9], #0x10\n"
- "st1 { v16.4s }, [x27], #0x10\n"
- "st1 { v20.4s }, [x25], #0x10\n"
- "st1 { v24.4s }, [x23], #0x10\n"
- "st1 { v28.4s }, [x21], #0x10\n"
- "tbz x15, #1, 205f\n"
- "str d9, [x13], #0x8\n"
- "str d13, [x9], #0x8\n"
- "str d17, [x27], #0x8\n"
- "str d21, [x25], #0x8\n"
- "str d25, [x23], #0x8\n"
- "str d29, [x21], #0x8\n"
- "tbz x15, #0, 208f\n"
- "st1 { v9.s }[2], [x13]\n"
- "st1 { v13.s }[2], [x9]\n"
- "st1 { v17.s }[2], [x27]\n"
- "st1 { v21.s }[2], [x25]\n"
- "st1 { v25.s }[2], [x23]\n"
- "st1 { v29.s }[2], [x21]\n"
- "b 208f\n"
- "205:" // Height 6: Partial direct writeback: partial_1_4
- "tbz x15, #0, 208f\n"
- "str s9, [x13, #0x0]\n"
- "str s13, [x9, #0x0]\n"
- "str s17, [x27, #0x0]\n"
- "str s21, [x25, #0x0]\n"
- "str s25, [x23, #0x0]\n"
- "str s29, [x21, #0x0]\n"
- "b 208f\n"
- "206:" // Height 6: Partial direct writeback: partial_2_0
- "tbz x15, #1, 207f\n"
- "str d8, [x13], #0x8\n"
- "str d12, [x9], #0x8\n"
- "str d16, [x27], #0x8\n"
- "str d20, [x25], #0x8\n"
- "str d24, [x23], #0x8\n"
- "str d28, [x21], #0x8\n"
- "tbz x15, #0, 208f\n"
- "st1 { v8.s }[2], [x13]\n"
- "st1 { v12.s }[2], [x9]\n"
- "st1 { v16.s }[2], [x27]\n"
- "st1 { v20.s }[2], [x25]\n"
- "st1 { v24.s }[2], [x23]\n"
- "st1 { v28.s }[2], [x21]\n"
- "b 208f\n"
- "207:" // Height 6: Partial direct writeback: partial_1_0
- "str s8, [x13, #0x0]\n"
- "str s12, [x9, #0x0]\n"
- "str s16, [x27, #0x0]\n"
- "str s20, [x25, #0x0]\n"
- "str s24, [x23, #0x0]\n"
- "str s28, [x21, #0x0]\n"
- "208:" // Height 6: Partial direct writeback: Done
- "b 210f\n"
- "209:" // Height 6: Full writeback
- "str q8, [x13, #0x0]\n"
- "str q9, [x13, #0x10]\n"
- "str q10, [x13, #0x20]\n"
- "str q11, [x13, #0x30]\n"
- "str q12, [x9, #0x0]\n"
- "str q13, [x9, #0x10]\n"
- "str q14, [x9, #0x20]\n"
- "str q15, [x9, #0x30]\n"
- "str q16, [x27, #0x0]\n"
- "str q17, [x27, #0x10]\n"
- "str q18, [x27, #0x20]\n"
- "str q19, [x27, #0x30]\n"
- "str q20, [x25, #0x0]\n"
- "str q21, [x25, #0x10]\n"
- "str q22, [x25, #0x20]\n"
- "str q23, [x25, #0x30]\n"
- "str q24, [x23, #0x0]\n"
- "str q25, [x23, #0x10]\n"
- "str q26, [x23, #0x20]\n"
- "str q27, [x23, #0x30]\n"
- "str q28, [x21, #0x0]\n"
- "str q29, [x21, #0x10]\n"
- "str q30, [x21, #0x20]\n"
- "str q31, [x21, #0x30]\n"
- "add x13, x13, #0x40\n"
- "add x9, x9, #0x40\n"
- "add x27, x27, #0x40\n"
- "add x25, x25, #0x40\n"
- "add x23, x23, #0x40\n"
- "add x21, x21, #0x40\n"
- "210:" // Height 6: Writeback done
- "subs x15, x15, #0x10\n"
- "bgt 178b\n"
+ "add x20, x21, x19, LSL #2\n"
+ "prfm pstl1keep, [x20, #0x0]\n"
+ "add x19, x20, x19, LSL #2\n"
+ "prfm pstl1keep, [x19, #0x0]\n"
+ "bge 203f\n"
+ "tbz x10, #3, 198f\n"
+ "st1 { v8.4s }, [x28], #0x10\n"
+ "st1 { v9.4s }, [x28], #0x10\n"
+ "st1 { v12.4s }, [x23], #0x10\n"
+ "st1 { v13.4s }, [x23], #0x10\n"
+ "st1 { v16.4s }, [x22], #0x10\n"
+ "st1 { v17.4s }, [x22], #0x10\n"
+ "st1 { v20.4s }, [x21], #0x10\n"
+ "st1 { v21.4s }, [x21], #0x10\n"
+ "st1 { v24.4s }, [x20], #0x10\n"
+ "st1 { v25.4s }, [x20], #0x10\n"
+ "st1 { v28.4s }, [x19], #0x10\n"
+ "st1 { v29.4s }, [x19], #0x10\n"
+ "tbz x10, #2, 196f\n"
+ "st1 { v10.4s }, [x28], #0x10\n"
+ "st1 { v14.4s }, [x23], #0x10\n"
+ "st1 { v18.4s }, [x22], #0x10\n"
+ "st1 { v22.4s }, [x21], #0x10\n"
+ "st1 { v26.4s }, [x20], #0x10\n"
+ "st1 { v30.4s }, [x19], #0x10\n"
+ "tbz x10, #1, 195f\n"
+ "str d11, [x28], #0x8\n"
+ "str d15, [x23], #0x8\n"
+ "str d19, [x22], #0x8\n"
+ "str d23, [x21], #0x8\n"
+ "str d27, [x20], #0x8\n"
+ "str d31, [x19], #0x8\n"
+ "tbz x10, #0, 202f\n"
+ "st1 { v11.s }[2], [x28]\n"
+ "st1 { v15.s }[2], [x23]\n"
+ "st1 { v19.s }[2], [x22]\n"
+ "st1 { v23.s }[2], [x21]\n"
+ "st1 { v27.s }[2], [x20]\n"
+ "st1 { v31.s }[2], [x19]\n"
+ "b 202f\n"
+ "195:" // Height 6: Partial direct writeback: partial_1_12
+ "tbz x10, #0, 202f\n"
+ "str s11, [x28, #0x0]\n"
+ "str s15, [x23, #0x0]\n"
+ "str s19, [x22, #0x0]\n"
+ "str s23, [x21, #0x0]\n"
+ "str s27, [x20, #0x0]\n"
+ "str s31, [x19, #0x0]\n"
+ "b 202f\n"
+ "196:" // Height 6: Partial direct writeback: partial_2_8
+ "tbz x10, #1, 197f\n"
+ "str d10, [x28], #0x8\n"
+ "str d14, [x23], #0x8\n"
+ "str d18, [x22], #0x8\n"
+ "str d22, [x21], #0x8\n"
+ "str d26, [x20], #0x8\n"
+ "str d30, [x19], #0x8\n"
+ "tbz x10, #0, 202f\n"
+ "st1 { v10.s }[2], [x28]\n"
+ "st1 { v14.s }[2], [x23]\n"
+ "st1 { v18.s }[2], [x22]\n"
+ "st1 { v22.s }[2], [x21]\n"
+ "st1 { v26.s }[2], [x20]\n"
+ "st1 { v30.s }[2], [x19]\n"
+ "b 202f\n"
+ "197:" // Height 6: Partial direct writeback: partial_1_8
+ "tbz x10, #0, 202f\n"
+ "str s10, [x28, #0x0]\n"
+ "str s14, [x23, #0x0]\n"
+ "str s18, [x22, #0x0]\n"
+ "str s22, [x21, #0x0]\n"
+ "str s26, [x20, #0x0]\n"
+ "str s30, [x19, #0x0]\n"
+ "b 202f\n"
+ "198:" // Height 6: Partial direct writeback: partial_4_0
+ "tbz x10, #2, 200f\n"
+ "st1 { v8.4s }, [x28], #0x10\n"
+ "st1 { v12.4s }, [x23], #0x10\n"
+ "st1 { v16.4s }, [x22], #0x10\n"
+ "st1 { v20.4s }, [x21], #0x10\n"
+ "st1 { v24.4s }, [x20], #0x10\n"
+ "st1 { v28.4s }, [x19], #0x10\n"
+ "tbz x10, #1, 199f\n"
+ "str d9, [x28], #0x8\n"
+ "str d13, [x23], #0x8\n"
+ "str d17, [x22], #0x8\n"
+ "str d21, [x21], #0x8\n"
+ "str d25, [x20], #0x8\n"
+ "str d29, [x19], #0x8\n"
+ "tbz x10, #0, 202f\n"
+ "st1 { v9.s }[2], [x28]\n"
+ "st1 { v13.s }[2], [x23]\n"
+ "st1 { v17.s }[2], [x22]\n"
+ "st1 { v21.s }[2], [x21]\n"
+ "st1 { v25.s }[2], [x20]\n"
+ "st1 { v29.s }[2], [x19]\n"
+ "b 202f\n"
+ "199:" // Height 6: Partial direct writeback: partial_1_4
+ "tbz x10, #0, 202f\n"
+ "str s9, [x28, #0x0]\n"
+ "str s13, [x23, #0x0]\n"
+ "str s17, [x22, #0x0]\n"
+ "str s21, [x21, #0x0]\n"
+ "str s25, [x20, #0x0]\n"
+ "str s29, [x19, #0x0]\n"
+ "b 202f\n"
+ "200:" // Height 6: Partial direct writeback: partial_2_0
+ "tbz x10, #1, 201f\n"
+ "str d8, [x28], #0x8\n"
+ "str d12, [x23], #0x8\n"
+ "str d16, [x22], #0x8\n"
+ "str d20, [x21], #0x8\n"
+ "str d24, [x20], #0x8\n"
+ "str d28, [x19], #0x8\n"
+ "tbz x10, #0, 202f\n"
+ "st1 { v8.s }[2], [x28]\n"
+ "st1 { v12.s }[2], [x23]\n"
+ "st1 { v16.s }[2], [x22]\n"
+ "st1 { v20.s }[2], [x21]\n"
+ "st1 { v24.s }[2], [x20]\n"
+ "st1 { v28.s }[2], [x19]\n"
+ "b 202f\n"
+ "201:" // Height 6: Partial direct writeback: partial_1_0
+ "str s8, [x28, #0x0]\n"
+ "str s12, [x23, #0x0]\n"
+ "str s16, [x22, #0x0]\n"
+ "str s20, [x21, #0x0]\n"
+ "str s24, [x20, #0x0]\n"
+ "str s28, [x19, #0x0]\n"
+ "202:" // Height 6: Partial direct writeback: Done
+ "b 204f\n"
+ "203:" // Height 6: Full writeback
+ "str q8, [x28, #0x0]\n"
+ "str q9, [x28, #0x10]\n"
+ "str q10, [x28, #0x20]\n"
+ "str q11, [x28, #0x30]\n"
+ "add x28, x28, #0x40\n"
+ "str q12, [x23, #0x0]\n"
+ "str q13, [x23, #0x10]\n"
+ "str q14, [x23, #0x20]\n"
+ "str q15, [x23, #0x30]\n"
+ "str q16, [x22, #0x0]\n"
+ "str q17, [x22, #0x10]\n"
+ "str q18, [x22, #0x20]\n"
+ "str q19, [x22, #0x30]\n"
+ "str q20, [x21, #0x0]\n"
+ "str q21, [x21, #0x10]\n"
+ "str q22, [x21, #0x20]\n"
+ "str q23, [x21, #0x30]\n"
+ "str q24, [x20, #0x0]\n"
+ "str q25, [x20, #0x10]\n"
+ "str q26, [x20, #0x20]\n"
+ "str q27, [x20, #0x30]\n"
+ "str q28, [x19, #0x0]\n"
+ "str q29, [x19, #0x10]\n"
+ "str q30, [x19, #0x20]\n"
+ "str q31, [x19, #0x30]\n"
+ "204:" // Height 6: Writeback done
+ "subs x10, x10, #0x10\n"
+ "bgt 172b\n"
"subs %x[M], %x[M], #0x6\n"
- "beq 212f\n"
+ "beq 206f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "tbz %x[flags], #3, 211f\n"
+ "tbz %x[flags], #3, 205f\n"
"add x20, x20, #0x6\n"
"str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
"b 1b\n"
- "211:" // Update direct input
+ "205:" // Update direct input
"mov x19, #0x6\n"
"madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
"b 1b\n"
- "212:" // Exit
+ "206:" // Exit
: [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL.hpp
index 63fca129ba..066bff4602 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL.hpp
@@ -37,7 +37,6 @@
namespace arm_gemm
{
-
// Actual kernel implementations
void sve_hybrid_bf16fp32_dot_6x4VL( ARGLIST );
@@ -74,7 +73,6 @@ public:
// Default to the generic kernel
kern_type kernel=sve_hybrid_bf16fp32_dot_6x4VL;
-
cls_sve_hybrid_bf16fp32_dot_6x4VL(const CPUInfo *)
{
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL/generic.cpp
index f5445e72e9..1233a98531 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL/generic.cpp
@@ -28,6 +28,7 @@
#include "../../bfloat.hpp"
#include <cassert>
+#include <limits>
namespace arm_gemm {
@@ -95,164 +96,158 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"ptrue p5.b\n"
"1:" // Row loop
"cmp %x[M], #0x6\n"
- "bge 71f\n"
+ "bge 66f\n"
"cmp %x[M], #0x4\n"
- "bgt 57f\n"
- "beq 43f\n"
+ "bgt 53f\n"
+ "beq 40f\n"
"cmp %x[M], #0x2\n"
- "bgt 29f\n"
- "beq 15f\n"
- "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x14, %x[bias]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "tbz %x[flags], #2, 2f\n"
- "ldr x13, [%x[output_ptr], #0x0]\n"
- "add x13, x13, x19, LSL #2\n"
- "b 3f\n"
- "2:" // Height 1: setup direct output
- "mov x13, %x[output_ptr]\n"
- "3:" // Height 1: Column loop
+ "bgt 27f\n"
+ "beq 14f\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[bias]\n"
+ "mov x28, %x[output_ptr]\n"
+ "2:" // Height 1: Column loop
"mov x19, #0x0\n"
- "whilelt p4.s, x19, x16\n"
+ "whilelt p4.s, x19, x11\n"
"incw x19\n"
- "whilelt p3.s, x19, x16\n"
+ "whilelt p3.s, x19, x11\n"
"incw x19\n"
- "whilelt p2.s, x19, x16\n"
+ "whilelt p2.s, x19, x11\n"
"incw x19\n"
- "whilelt p1.s, x19, x16\n"
- "cbz x14, 4f\n"
- "ld1w { z8.s }, p5/Z, [x14]\n"
- "ld1w { z9.s }, p5/Z, [x14, #1, MUL VL]\n"
- "ld1w { z10.s }, p5/Z, [x14, #2, MUL VL]\n"
- "ld1w { z11.s }, p5/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
- "b 6f\n"
- "4:" // Height 1: no bias
- "tbz %x[flags], #0, 5f\n"
- "ld1w { z8.s }, p4/Z, [x13]\n"
- "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
- "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
- "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
- "b 6f\n"
- "5:" // Height 1: no accumulate
+ "whilelt p1.s, x19, x11\n"
+ "cbz x9, 3f\n"
+ "ld1w { z8.s }, p5/Z, [x9]\n"
+ "ld1w { z9.s }, p5/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z10.s }, p5/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z11.s }, p5/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "b 5f\n"
+ "3:" // Height 1: no bias
+ "tbz %x[flags], #0, 4f\n"
+ "ld1w { z8.s }, p4/Z, [x28]\n"
+ "ld1w { z9.s }, p3/Z, [x28, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1w { z11.s }, p1/Z, [x28, #3, MUL VL]\n"
+ "b 5f\n"
+ "4:" // Height 1: no accumulate
"mov z8.b, #0x0\n"
"mov z9.b, #0x0\n"
"mov z10.b, #0x0\n"
"mov z11.b, #0x0\n"
- "6:" // Height 1: setup done
- "mov x12, #0x0\n"
- "7:" // Height 1: String loop
+ "5:" // Height 1: setup done
+ "mov x27, #0x0\n"
+ "6:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w11, [x20, x12, LSL #0x2]\n"
- "tbz %x[flags], #3, 8f\n"
- "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 7f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x10, [x20, #0x0]\n"
- "cbnz x12, 9f\n"
+ "ldr x25, [x20, #0x0]\n"
+ "cbnz x27, 8f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x10, x10, x19, LSL #1\n"
- "b 9f\n"
- "8:" // Height 1: setup direct input
- "mov x10, %x[input_ptr]\n"
- "9:" // Height 1: input setup done
- "cmp x11, #0x8\n"
- "ble 11f\n"
- "10:" // Height 1: Multiply loop: Main loop head
- "ld1h { z6.h }, p5/Z, [x15]\n"
- "whilelt p0.h, XZR, x11\n"
- "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
- "sub x11, x11, #0x8\n"
- "ld1rqh { z0.h }, p0/Z, [x10]\n"
+ "add x25, x25, x19, LSL #1\n"
+ "b 8f\n"
+ "7:" // Height 1: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "8:" // Height 1: input setup done
+ "cmp x26, #0x8\n"
+ "ble 10f\n"
+ "9:" // Height 1: Multiply loop: Main loop head
+ "ld1h { z6.h }, p5/Z, [x10]\n"
+ "whilelt p0.h, XZR, x26\n"
+ "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "sub x26, x26, #0x8\n"
+ "ld1rqh { z0.h }, p0/Z, [x25]\n"
".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
- "add x10, x10, #0x10\n"
+ "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "add x25, x25, #0x10\n"
".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
- "cmp x11, #0x8\n"
+ "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "cmp x26, #0x8\n"
".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n"
- "addvl x15, x15, #16\n"
+ "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #16\n"
".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n"
".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n"
".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n"
".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n"
".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n"
".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n"
".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n"
".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n"
".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n"
".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n"
- "bgt 10b\n"
- "11:" // Height 1: Multiply loop: Single iteration only
- "ld1h { z6.h }, p5/Z, [x15]\n"
- "whilelt p0.h, XZR, x11\n"
- "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
- "subs x11, x11, #0x2\n"
- "ld1rqh { z0.h }, p0/Z, [x10]\n"
+ "bgt 9b\n"
+ "10:" // Height 1: Multiply loop: Single iteration only
+ "ld1h { z6.h }, p5/Z, [x10]\n"
+ "whilelt p0.h, XZR, x26\n"
+ "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "subs x26, x26, #0x2\n"
+ "ld1rqh { z0.h }, p0/Z, [x25]\n"
".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
- "add x10, x10, #0x10\n"
+ "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "add x25, x25, #0x10\n"
".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
+ "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n"
".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n"
- "ble 12f\n"
- "ld1h { z6.h }, p5/Z, [x15]\n"
+ "ble 11f\n"
+ "ld1h { z6.h }, p5/Z, [x10]\n"
".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
- "subs x11, x11, #0x2\n"
+ "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "subs x26, x26, #0x2\n"
".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
- "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n"
- "addvl x15, x15, #4\n"
+ "addvl x10, x10, #4\n"
".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n"
- "ble 12f\n"
- "ld1h { z6.h }, p5/Z, [x15]\n"
+ "ble 11f\n"
+ "ld1h { z6.h }, p5/Z, [x10]\n"
".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
- "subs x11, x11, #0x2\n"
+ "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "subs x26, x26, #0x2\n"
".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
- "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n"
- "addvl x15, x15, #4\n"
+ "addvl x10, x10, #4\n"
".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n"
- "ble 12f\n"
- "ld1h { z6.h }, p5/Z, [x15]\n"
+ "ble 11f\n"
+ "ld1h { z6.h }, p5/Z, [x10]\n"
".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
- "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
+ "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n"
".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n"
- "12:" // Height 1: Multiply loop: multiply skip
- "prfm pldl1keep, [x10, #0x80]\n"
- "add x12, x12, #0x1\n"
+ "11:" // Height 1: Multiply loop: multiply skip
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x27, x27, #0x1\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x12, x19\n"
- "bne 7b\n"
- "tbz %x[flags], #1, 13f\n"
+ "cmp x27, x19\n"
+ "bne 6b\n"
+ "tbz %x[flags], #1, 12f\n"
"add x19, %x[args_ptr], %[offset_min]\n"
"ld1rw { z1.s }, p5/Z, [x19]\n"
"add x19, %x[args_ptr], %[offset_max]\n"
@@ -265,63 +260,56 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"fmax z9.s, p5/M, z9.s, z1.s\n"
"fmax z10.s, p5/M, z10.s, z1.s\n"
"fmax z11.s, p5/M, z11.s, z1.s\n"
- "13:" // Height 1: No activation
- "st1w { z8.s }, p4, [x13]\n"
- "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
- "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
- "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
- "addvl x13, x13, #4\n"
- "14:" // Height 1: Writeback done
- "decw x16, ALL, MUL #4\n"
- "cmp x16, XZR\n"
- "bgt 3b\n"
- "b 86f\n"
- "15:" // Height 2
- "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x14, %x[bias]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "tbz %x[flags], #2, 16f\n"
- "ldr x13, [%x[output_ptr], #0x0]\n"
- "add x13, x13, x19, LSL #2\n"
- "ldr x9, [%x[output_ptr], #0x8]\n"
- "add x9, x9, x19, LSL #2\n"
- "b 17f\n"
- "16:" // Height 2: setup direct output
- "mov x13, %x[output_ptr]\n"
- "add x9, x13, x19, LSL #2\n"
- "17:" // Height 2: Column loop
+ "12:" // Height 1: No activation
+ "st1w { z8.s }, p4, [x28]\n"
+ "st1w { z9.s }, p3, [x28, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x28, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "13:" // Height 1: Writeback done
+ "decw x11, ALL, MUL #4\n"
+ "cmp x11, XZR\n"
+ "bgt 2b\n"
+ "b 80f\n"
+ "14:" // Height 2
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x9, %x[bias]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "15:" // Height 2: Column loop
"mov x19, #0x0\n"
- "whilelt p4.s, x19, x16\n"
+ "whilelt p4.s, x19, x11\n"
"incw x19\n"
- "whilelt p3.s, x19, x16\n"
+ "whilelt p3.s, x19, x11\n"
"incw x19\n"
- "whilelt p2.s, x19, x16\n"
+ "whilelt p2.s, x19, x11\n"
"incw x19\n"
- "whilelt p1.s, x19, x16\n"
- "cbz x14, 18f\n"
- "ld1w { z8.s }, p5/Z, [x14]\n"
+ "whilelt p1.s, x19, x11\n"
+ "cbz x9, 16f\n"
+ "ld1w { z8.s }, p5/Z, [x9]\n"
"mov z12.d, z8.d\n"
- "ld1w { z9.s }, p5/Z, [x14, #1, MUL VL]\n"
- "ld1w { z10.s }, p5/Z, [x14, #2, MUL VL]\n"
+ "ld1w { z9.s }, p5/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z10.s }, p5/Z, [x9, #2, MUL VL]\n"
"mov z13.d, z9.d\n"
- "ld1w { z11.s }, p5/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
+ "ld1w { z11.s }, p5/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
"mov z14.d, z10.d\n"
"mov z15.d, z11.d\n"
- "b 20f\n"
- "18:" // Height 2: no bias
- "tbz %x[flags], #0, 19f\n"
- "ld1w { z8.s }, p4/Z, [x13]\n"
- "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
- "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
- "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
- "ld1w { z12.s }, p4/Z, [x9]\n"
- "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n"
- "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n"
- "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n"
- "b 20f\n"
- "19:" // Height 2: no accumulate
+ "b 18f\n"
+ "16:" // Height 2: no bias
+ "tbz %x[flags], #0, 17f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ld1w { z8.s }, p4/Z, [x28]\n"
+ "add x24, x28, x19, LSL #2\n"
+ "ld1w { z9.s }, p3/Z, [x28, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1w { z11.s }, p1/Z, [x28, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x24]\n"
+ "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
+ "b 18f\n"
+ "17:" // Height 2: no accumulate
"mov z8.b, #0x0\n"
"mov z9.b, #0x0\n"
"mov z10.b, #0x0\n"
@@ -330,160 +318,162 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"mov z13.b, #0x0\n"
"mov z14.b, #0x0\n"
"mov z15.b, #0x0\n"
- "20:" // Height 2: setup done
- "mov x12, #0x0\n"
- "21:" // Height 2: String loop
+ "18:" // Height 2: setup done
+ "mov x27, #0x0\n"
+ "19:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w11, [x20, x12, LSL #0x2]\n"
- "tbz %x[flags], #3, 22f\n"
- "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 20f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x10, [x20, #0x0]\n"
- "ldr x28, [x20, #0x8]\n"
- "cbnz x12, 23f\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "cbnz x27, 21f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x10, x10, x19, LSL #1\n"
- "add x28, x28, x19, LSL #1\n"
- "b 23f\n"
- "22:" // Height 2: setup direct input
- "mov x10, %x[input_ptr]\n"
- "add x28, x10, x19, LSL #1\n"
- "23:" // Height 2: input setup done
- "cmp x11, #0x8\n"
- "ble 25f\n"
- "24:" // Height 2: Multiply loop: Main loop head
- "ld1h { z6.h }, p5/Z, [x15]\n"
- "whilelt p0.h, XZR, x11\n"
- "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
- "sub x11, x11, #0x8\n"
- "ld1rqh { z0.h }, p0/Z, [x10]\n"
+ "add x25, x25, x19, LSL #1\n"
+ "add x24, x24, x19, LSL #1\n"
+ "b 21f\n"
+ "20:" // Height 2: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19, LSL #1\n"
+ "21:" // Height 2: input setup done
+ "cmp x26, #0x8\n"
+ "ble 23f\n"
+ "22:" // Height 2: Multiply loop: Main loop head
+ "ld1h { z6.h }, p5/Z, [x10]\n"
+ "whilelt p0.h, XZR, x26\n"
+ "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "sub x26, x26, #0x8\n"
+ "ld1rqh { z0.h }, p0/Z, [x25]\n"
".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n"
- "ld1rqh { z1.h }, p0/Z, [x28]\n"
- "add x10, x10, #0x10\n"
+ "ld1rqh { z1.h }, p0/Z, [x24]\n"
+ "add x25, x25, #0x10\n"
".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
- "add x28, x28, #0x10\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x24, x24, #0x10\n"
".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
- "cmp x11, #0x8\n"
+ "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "cmp x26, #0x8\n"
".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n"
".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n"
".inst 0x646140ef // bfdot z15.s, z7.h, z1.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n"
".inst 0x646940cc // bfdot z12.s, z6.h, z1.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n"
".inst 0x646940ed // bfdot z13.s, z7.h, z1.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n"
- "addvl x15, x15, #16\n"
+ "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #16\n"
".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n"
".inst 0x646940ce // bfdot z14.s, z6.h, z1.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n"
".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n"
".inst 0x646940ef // bfdot z15.s, z7.h, z1.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n"
".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n"
".inst 0x647140cc // bfdot z12.s, z6.h, z1.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n"
".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n"
".inst 0x647140ed // bfdot z13.s, z7.h, z1.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n"
".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n"
".inst 0x647140ce // bfdot z14.s, z6.h, z1.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n"
".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n"
".inst 0x647140ef // bfdot z15.s, z7.h, z1.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n"
".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n"
".inst 0x647940cc // bfdot z12.s, z6.h, z1.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n"
".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n"
".inst 0x647940ed // bfdot z13.s, z7.h, z1.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n"
".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n"
".inst 0x647940ce // bfdot z14.s, z6.h, z1.h[3]\n"
".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n"
".inst 0x647940ef // bfdot z15.s, z7.h, z1.h[3]\n"
- "bgt 24b\n"
- "25:" // Height 2: Multiply loop: Single iteration only
- "ld1h { z6.h }, p5/Z, [x15]\n"
- "whilelt p0.h, XZR, x11\n"
- "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
- "subs x11, x11, #0x2\n"
- "ld1rqh { z0.h }, p0/Z, [x10]\n"
+ "bgt 22b\n"
+ "23:" // Height 2: Multiply loop: Single iteration only
+ "ld1h { z6.h }, p5/Z, [x10]\n"
+ "whilelt p0.h, XZR, x26\n"
+ "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "subs x26, x26, #0x2\n"
+ "ld1rqh { z0.h }, p0/Z, [x25]\n"
".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n"
- "ld1rqh { z1.h }, p0/Z, [x28]\n"
- "add x10, x10, #0x10\n"
+ "ld1rqh { z1.h }, p0/Z, [x24]\n"
+ "add x25, x25, #0x10\n"
".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n"
- "add x28, x28, #0x10\n"
+ "add x24, x24, #0x10\n"
".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
+ "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n"
".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n"
".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n"
".inst 0x646140ef // bfdot z15.s, z7.h, z1.h[0]\n"
- "ble 26f\n"
- "ld1h { z6.h }, p5/Z, [x15]\n"
+ "ble 24f\n"
+ "ld1h { z6.h }, p5/Z, [x10]\n"
".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
- "subs x11, x11, #0x2\n"
+ "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "subs x26, x26, #0x2\n"
".inst 0x646940cc // bfdot z12.s, z6.h, z1.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n"
".inst 0x646940ed // bfdot z13.s, z7.h, z1.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
+ "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n"
".inst 0x646940ce // bfdot z14.s, z6.h, z1.h[1]\n"
".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n"
".inst 0x646940ef // bfdot z15.s, z7.h, z1.h[1]\n"
- "ble 26f\n"
- "ld1h { z6.h }, p5/Z, [x15]\n"
+ "ble 24f\n"
+ "ld1h { z6.h }, p5/Z, [x10]\n"
".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
- "subs x11, x11, #0x2\n"
+ "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "subs x26, x26, #0x2\n"
".inst 0x647140cc // bfdot z12.s, z6.h, z1.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n"
".inst 0x647140ed // bfdot z13.s, z7.h, z1.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
+ "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n"
".inst 0x647140ce // bfdot z14.s, z6.h, z1.h[2]\n"
".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n"
".inst 0x647140ef // bfdot z15.s, z7.h, z1.h[2]\n"
- "ble 26f\n"
- "ld1h { z6.h }, p5/Z, [x15]\n"
+ "ble 24f\n"
+ "ld1h { z6.h }, p5/Z, [x10]\n"
".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
".inst 0x647940cc // bfdot z12.s, z6.h, z1.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n"
".inst 0x647940ed // bfdot z13.s, z7.h, z1.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
+ "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n"
".inst 0x647940ce // bfdot z14.s, z6.h, z1.h[3]\n"
".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n"
".inst 0x647940ef // bfdot z15.s, z7.h, z1.h[3]\n"
- "26:" // Height 2: Multiply loop: multiply skip
- "prfm pldl1keep, [x10, #0x80]\n"
- "add x12, x12, #0x1\n"
- "prfm pldl1keep, [x28, #0x80]\n"
+ "24:" // Height 2: Multiply loop: multiply skip
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x27, x27, #0x1\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x12, x19\n"
- "bne 21b\n"
- "tbz %x[flags], #1, 27f\n"
+ "cmp x27, x19\n"
+ "bne 19b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x28, x19, LSL #2\n"
+ "tbz %x[flags], #1, 25f\n"
"add x19, %x[args_ptr], %[offset_min]\n"
"ld1rw { z1.s }, p5/Z, [x19]\n"
"add x19, %x[args_ptr], %[offset_max]\n"
@@ -504,79 +494,69 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"fmax z13.s, p5/M, z13.s, z1.s\n"
"fmax z14.s, p5/M, z14.s, z1.s\n"
"fmax z15.s, p5/M, z15.s, z1.s\n"
- "27:" // Height 2: No activation
- "st1w { z8.s }, p4, [x13]\n"
- "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
- "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
- "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
- "addvl x13, x13, #4\n"
- "st1w { z12.s }, p4, [x9]\n"
- "st1w { z13.s }, p3, [x9, #1, MUL VL]\n"
- "st1w { z14.s }, p2, [x9, #2, MUL VL]\n"
- "st1w { z15.s }, p1, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
- "28:" // Height 2: Writeback done
- "decw x16, ALL, MUL #4\n"
- "cmp x16, XZR\n"
- "bgt 17b\n"
- "b 86f\n"
- "29:" // Height 3
- "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x14, %x[bias]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "tbz %x[flags], #2, 30f\n"
- "ldr x13, [%x[output_ptr], #0x0]\n"
- "add x13, x13, x19, LSL #2\n"
- "ldr x9, [%x[output_ptr], #0x8]\n"
- "ldr x27, [%x[output_ptr], #0x10]\n"
- "add x9, x9, x19, LSL #2\n"
- "add x27, x27, x19, LSL #2\n"
- "b 31f\n"
- "30:" // Height 3: setup direct output
- "mov x13, %x[output_ptr]\n"
- "add x9, x13, x19, LSL #2\n"
- "add x27, x9, x19, LSL #2\n"
- "31:" // Height 3: Column loop
+ "25:" // Height 2: No activation
+ "st1w { z8.s }, p4, [x28]\n"
+ "st1w { z9.s }, p3, [x28, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x28, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "st1w { z12.s }, p4, [x24]\n"
+ "st1w { z13.s }, p3, [x24, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x24, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x24, #3, MUL VL]\n"
+ "26:" // Height 2: Writeback done
+ "decw x11, ALL, MUL #4\n"
+ "cmp x11, XZR\n"
+ "bgt 15b\n"
+ "b 80f\n"
+ "27:" // Height 3
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x9, %x[bias]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "28:" // Height 3: Column loop
"mov x19, #0x0\n"
- "whilelt p4.s, x19, x16\n"
+ "whilelt p4.s, x19, x11\n"
"incw x19\n"
- "whilelt p3.s, x19, x16\n"
+ "whilelt p3.s, x19, x11\n"
"incw x19\n"
- "whilelt p2.s, x19, x16\n"
+ "whilelt p2.s, x19, x11\n"
"incw x19\n"
- "whilelt p1.s, x19, x16\n"
- "cbz x14, 32f\n"
- "ld1w { z8.s }, p5/Z, [x14]\n"
+ "whilelt p1.s, x19, x11\n"
+ "cbz x9, 29f\n"
+ "ld1w { z8.s }, p5/Z, [x9]\n"
"mov z12.d, z8.d\n"
- "ld1w { z9.s }, p5/Z, [x14, #1, MUL VL]\n"
+ "ld1w { z9.s }, p5/Z, [x9, #1, MUL VL]\n"
"mov z16.d, z8.d\n"
- "ld1w { z10.s }, p5/Z, [x14, #2, MUL VL]\n"
- "ld1w { z11.s }, p5/Z, [x14, #3, MUL VL]\n"
+ "ld1w { z10.s }, p5/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z11.s }, p5/Z, [x9, #3, MUL VL]\n"
"mov z13.d, z9.d\n"
- "addvl x14, x14, #4\n"
+ "addvl x9, x9, #4\n"
"mov z17.d, z9.d\n"
"mov z14.d, z10.d\n"
"mov z15.d, z11.d\n"
"mov z18.d, z10.d\n"
"mov z19.d, z11.d\n"
- "b 34f\n"
- "32:" // Height 3: no bias
- "tbz %x[flags], #0, 33f\n"
- "ld1w { z8.s }, p4/Z, [x13]\n"
- "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
- "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
- "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
- "ld1w { z12.s }, p4/Z, [x9]\n"
- "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n"
- "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n"
- "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n"
- "ld1w { z16.s }, p4/Z, [x27]\n"
- "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n"
- "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n"
- "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n"
- "b 34f\n"
- "33:" // Height 3: no accumulate
+ "b 31f\n"
+ "29:" // Height 3: no bias
+ "tbz %x[flags], #0, 30f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ld1w { z8.s }, p4/Z, [x28]\n"
+ "add x24, x28, x19, LSL #2\n"
+ "ld1w { z9.s }, p3/Z, [x28, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x28, #2, MUL VL]\n"
+ "add x23, x24, x19, LSL #2\n"
+ "ld1w { z11.s }, p1/Z, [x28, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x24]\n"
+ "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x23]\n"
+ "ld1w { z17.s }, p3/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "b 31f\n"
+ "30:" // Height 3: no accumulate
"mov z8.b, #0x0\n"
"mov z9.b, #0x0\n"
"mov z10.b, #0x0\n"
@@ -589,201 +569,204 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"mov z17.b, #0x0\n"
"mov z18.b, #0x0\n"
"mov z19.b, #0x0\n"
- "34:" // Height 3: setup done
- "mov x12, #0x0\n"
- "35:" // Height 3: String loop
+ "31:" // Height 3: setup done
+ "mov x27, #0x0\n"
+ "32:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w11, [x20, x12, LSL #0x2]\n"
- "tbz %x[flags], #3, 36f\n"
- "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 33f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x10, [x20, #0x0]\n"
- "ldr x28, [x20, #0x8]\n"
- "ldr x26, [x20, #0x10]\n"
- "cbnz x12, 37f\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "cbnz x27, 34f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x10, x10, x19, LSL #1\n"
- "add x28, x28, x19, LSL #1\n"
- "add x26, x26, x19, LSL #1\n"
- "b 37f\n"
- "36:" // Height 3: setup direct input
- "mov x10, %x[input_ptr]\n"
- "add x28, x10, x19, LSL #1\n"
- "add x26, x28, x19, LSL #1\n"
- "37:" // Height 3: input setup done
- "cmp x11, #0x8\n"
- "ble 39f\n"
- "38:" // Height 3: Multiply loop: Main loop head
- "ld1h { z6.h }, p5/Z, [x15]\n"
- "whilelt p0.h, XZR, x11\n"
- "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
- "sub x11, x11, #0x8\n"
- "ld1rqh { z0.h }, p0/Z, [x10]\n"
+ "add x25, x25, x19, LSL #1\n"
+ "add x24, x24, x19, LSL #1\n"
+ "add x23, x23, x19, LSL #1\n"
+ "b 34f\n"
+ "33:" // Height 3: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19, LSL #1\n"
+ "add x23, x24, x19, LSL #1\n"
+ "34:" // Height 3: input setup done
+ "cmp x26, #0x8\n"
+ "ble 36f\n"
+ "35:" // Height 3: Multiply loop: Main loop head
+ "ld1h { z6.h }, p5/Z, [x10]\n"
+ "whilelt p0.h, XZR, x26\n"
+ "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "sub x26, x26, #0x8\n"
+ "ld1rqh { z0.h }, p0/Z, [x25]\n"
".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n"
- "ld1rqh { z1.h }, p0/Z, [x28]\n"
- "add x10, x10, #0x10\n"
+ "ld1rqh { z1.h }, p0/Z, [x24]\n"
+ "add x25, x25, #0x10\n"
".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n"
- "ld1rqh { z2.h }, p0/Z, [x26]\n"
- "add x28, x28, #0x10\n"
+ "ld1rqh { z2.h }, p0/Z, [x23]\n"
+ "add x24, x24, #0x10\n"
".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
- "add x26, x26, #0x10\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x23, x23, #0x10\n"
".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
- "cmp x11, #0x8\n"
+ "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "cmp x26, #0x8\n"
".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n"
".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n"
".inst 0x646240d2 // bfdot z18.s, z6.h, z2.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n"
".inst 0x646140ef // bfdot z15.s, z7.h, z1.h[0]\n"
".inst 0x646240f3 // bfdot z19.s, z7.h, z2.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n"
".inst 0x646940cc // bfdot z12.s, z6.h, z1.h[1]\n"
".inst 0x646a40d0 // bfdot z16.s, z6.h, z2.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n"
".inst 0x646940ed // bfdot z13.s, z7.h, z1.h[1]\n"
".inst 0x646a40f1 // bfdot z17.s, z7.h, z2.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n"
- "addvl x15, x15, #16\n"
+ "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #16\n"
".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n"
".inst 0x646940ce // bfdot z14.s, z6.h, z1.h[1]\n"
".inst 0x646a40d2 // bfdot z18.s, z6.h, z2.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n"
".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n"
".inst 0x646940ef // bfdot z15.s, z7.h, z1.h[1]\n"
".inst 0x646a40f3 // bfdot z19.s, z7.h, z2.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n"
".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n"
".inst 0x647140cc // bfdot z12.s, z6.h, z1.h[2]\n"
".inst 0x647240d0 // bfdot z16.s, z6.h, z2.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n"
".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n"
".inst 0x647140ed // bfdot z13.s, z7.h, z1.h[2]\n"
".inst 0x647240f1 // bfdot z17.s, z7.h, z2.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n"
".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n"
".inst 0x647140ce // bfdot z14.s, z6.h, z1.h[2]\n"
".inst 0x647240d2 // bfdot z18.s, z6.h, z2.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n"
".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n"
".inst 0x647140ef // bfdot z15.s, z7.h, z1.h[2]\n"
".inst 0x647240f3 // bfdot z19.s, z7.h, z2.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n"
".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n"
".inst 0x647940cc // bfdot z12.s, z6.h, z1.h[3]\n"
".inst 0x647a40d0 // bfdot z16.s, z6.h, z2.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n"
".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n"
".inst 0x647940ed // bfdot z13.s, z7.h, z1.h[3]\n"
".inst 0x647a40f1 // bfdot z17.s, z7.h, z2.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n"
".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n"
".inst 0x647940ce // bfdot z14.s, z6.h, z1.h[3]\n"
".inst 0x647a40d2 // bfdot z18.s, z6.h, z2.h[3]\n"
".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n"
".inst 0x647940ef // bfdot z15.s, z7.h, z1.h[3]\n"
".inst 0x647a40f3 // bfdot z19.s, z7.h, z2.h[3]\n"
- "bgt 38b\n"
- "39:" // Height 3: Multiply loop: Single iteration only
- "ld1h { z6.h }, p5/Z, [x15]\n"
- "whilelt p0.h, XZR, x11\n"
- "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
- "subs x11, x11, #0x2\n"
- "ld1rqh { z0.h }, p0/Z, [x10]\n"
+ "bgt 35b\n"
+ "36:" // Height 3: Multiply loop: Single iteration only
+ "ld1h { z6.h }, p5/Z, [x10]\n"
+ "whilelt p0.h, XZR, x26\n"
+ "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "subs x26, x26, #0x2\n"
+ "ld1rqh { z0.h }, p0/Z, [x25]\n"
".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n"
- "ld1rqh { z1.h }, p0/Z, [x28]\n"
- "add x10, x10, #0x10\n"
+ "ld1rqh { z1.h }, p0/Z, [x24]\n"
+ "add x25, x25, #0x10\n"
".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n"
- "ld1rqh { z2.h }, p0/Z, [x26]\n"
- "add x28, x28, #0x10\n"
+ "ld1rqh { z2.h }, p0/Z, [x23]\n"
+ "add x24, x24, #0x10\n"
".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n"
- "add x26, x26, #0x10\n"
+ "add x23, x23, #0x10\n"
".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n"
".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
+ "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n"
".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n"
".inst 0x646240d2 // bfdot z18.s, z6.h, z2.h[0]\n"
".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n"
".inst 0x646140ef // bfdot z15.s, z7.h, z1.h[0]\n"
".inst 0x646240f3 // bfdot z19.s, z7.h, z2.h[0]\n"
- "ble 40f\n"
- "ld1h { z6.h }, p5/Z, [x15]\n"
+ "ble 37f\n"
+ "ld1h { z6.h }, p5/Z, [x10]\n"
".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
- "subs x11, x11, #0x2\n"
+ "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "subs x26, x26, #0x2\n"
".inst 0x646940cc // bfdot z12.s, z6.h, z1.h[1]\n"
".inst 0x646a40d0 // bfdot z16.s, z6.h, z2.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n"
".inst 0x646940ed // bfdot z13.s, z7.h, z1.h[1]\n"
".inst 0x646a40f1 // bfdot z17.s, z7.h, z2.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
+ "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n"
".inst 0x646940ce // bfdot z14.s, z6.h, z1.h[1]\n"
".inst 0x646a40d2 // bfdot z18.s, z6.h, z2.h[1]\n"
".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n"
".inst 0x646940ef // bfdot z15.s, z7.h, z1.h[1]\n"
".inst 0x646a40f3 // bfdot z19.s, z7.h, z2.h[1]\n"
- "ble 40f\n"
- "ld1h { z6.h }, p5/Z, [x15]\n"
+ "ble 37f\n"
+ "ld1h { z6.h }, p5/Z, [x10]\n"
".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
- "subs x11, x11, #0x2\n"
+ "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "subs x26, x26, #0x2\n"
".inst 0x647140cc // bfdot z12.s, z6.h, z1.h[2]\n"
".inst 0x647240d0 // bfdot z16.s, z6.h, z2.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n"
".inst 0x647140ed // bfdot z13.s, z7.h, z1.h[2]\n"
".inst 0x647240f1 // bfdot z17.s, z7.h, z2.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
+ "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n"
".inst 0x647140ce // bfdot z14.s, z6.h, z1.h[2]\n"
".inst 0x647240d2 // bfdot z18.s, z6.h, z2.h[2]\n"
".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n"
".inst 0x647140ef // bfdot z15.s, z7.h, z1.h[2]\n"
".inst 0x647240f3 // bfdot z19.s, z7.h, z2.h[2]\n"
- "ble 40f\n"
- "ld1h { z6.h }, p5/Z, [x15]\n"
+ "ble 37f\n"
+ "ld1h { z6.h }, p5/Z, [x10]\n"
".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
".inst 0x647940cc // bfdot z12.s, z6.h, z1.h[3]\n"
".inst 0x647a40d0 // bfdot z16.s, z6.h, z2.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n"
".inst 0x647940ed // bfdot z13.s, z7.h, z1.h[3]\n"
".inst 0x647a40f1 // bfdot z17.s, z7.h, z2.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
+ "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n"
".inst 0x647940ce // bfdot z14.s, z6.h, z1.h[3]\n"
".inst 0x647a40d2 // bfdot z18.s, z6.h, z2.h[3]\n"
".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n"
".inst 0x647940ef // bfdot z15.s, z7.h, z1.h[3]\n"
".inst 0x647a40f3 // bfdot z19.s, z7.h, z2.h[3]\n"
- "40:" // Height 3: Multiply loop: multiply skip
- "prfm pldl1keep, [x10, #0x80]\n"
- "add x12, x12, #0x1\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "37:" // Height 3: Multiply loop: multiply skip
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x27, x27, #0x1\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x12, x19\n"
- "bne 35b\n"
- "tbz %x[flags], #1, 41f\n"
+ "cmp x27, x19\n"
+ "bne 32b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x28, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "tbz %x[flags], #1, 38f\n"
"add x19, %x[args_ptr], %[offset_min]\n"
"ld1rw { z1.s }, p5/Z, [x19]\n"
"add x19, %x[args_ptr], %[offset_max]\n"
@@ -812,65 +795,48 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"fmax z17.s, p5/M, z17.s, z1.s\n"
"fmax z18.s, p5/M, z18.s, z1.s\n"
"fmax z19.s, p5/M, z19.s, z1.s\n"
- "41:" // Height 3: No activation
- "st1w { z8.s }, p4, [x13]\n"
- "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
- "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
- "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
- "addvl x13, x13, #4\n"
- "st1w { z12.s }, p4, [x9]\n"
- "st1w { z13.s }, p3, [x9, #1, MUL VL]\n"
- "st1w { z14.s }, p2, [x9, #2, MUL VL]\n"
- "st1w { z15.s }, p1, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
- "st1w { z16.s }, p4, [x27]\n"
- "st1w { z17.s }, p3, [x27, #1, MUL VL]\n"
- "st1w { z18.s }, p2, [x27, #2, MUL VL]\n"
- "st1w { z19.s }, p1, [x27, #3, MUL VL]\n"
- "addvl x27, x27, #4\n"
- "42:" // Height 3: Writeback done
- "decw x16, ALL, MUL #4\n"
- "cmp x16, XZR\n"
- "bgt 31b\n"
- "b 86f\n"
- "43:" // Height 4
- "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x14, %x[bias]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "tbz %x[flags], #2, 44f\n"
- "ldr x13, [%x[output_ptr], #0x0]\n"
- "add x13, x13, x19, LSL #2\n"
- "ldr x9, [%x[output_ptr], #0x8]\n"
- "ldr x27, [%x[output_ptr], #0x10]\n"
- "add x9, x9, x19, LSL #2\n"
- "ldr x25, [%x[output_ptr], #0x18]\n"
- "add x27, x27, x19, LSL #2\n"
- "add x25, x25, x19, LSL #2\n"
- "b 45f\n"
- "44:" // Height 4: setup direct output
- "mov x13, %x[output_ptr]\n"
- "add x9, x13, x19, LSL #2\n"
- "add x27, x9, x19, LSL #2\n"
- "add x25, x27, x19, LSL #2\n"
- "45:" // Height 4: Column loop
+ "38:" // Height 3: No activation
+ "st1w { z8.s }, p4, [x28]\n"
+ "st1w { z9.s }, p3, [x28, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x28, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "st1w { z12.s }, p4, [x24]\n"
+ "st1w { z13.s }, p3, [x24, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x24, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x24, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x23]\n"
+ "st1w { z17.s }, p3, [x23, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x23, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x23, #3, MUL VL]\n"
+ "39:" // Height 3: Writeback done
+ "decw x11, ALL, MUL #4\n"
+ "cmp x11, XZR\n"
+ "bgt 28b\n"
+ "b 80f\n"
+ "40:" // Height 4
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x9, %x[bias]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "41:" // Height 4: Column loop
"mov x19, #0x0\n"
- "whilelt p4.s, x19, x16\n"
+ "whilelt p4.s, x19, x11\n"
"incw x19\n"
- "whilelt p3.s, x19, x16\n"
+ "whilelt p3.s, x19, x11\n"
"incw x19\n"
- "whilelt p2.s, x19, x16\n"
+ "whilelt p2.s, x19, x11\n"
"incw x19\n"
- "whilelt p1.s, x19, x16\n"
- "cbz x14, 46f\n"
- "ld1w { z8.s }, p5/Z, [x14]\n"
+ "whilelt p1.s, x19, x11\n"
+ "cbz x9, 42f\n"
+ "ld1w { z8.s }, p5/Z, [x9]\n"
"mov z12.d, z8.d\n"
- "ld1w { z9.s }, p5/Z, [x14, #1, MUL VL]\n"
+ "ld1w { z9.s }, p5/Z, [x9, #1, MUL VL]\n"
"mov z16.d, z8.d\n"
- "ld1w { z10.s }, p5/Z, [x14, #2, MUL VL]\n"
+ "ld1w { z10.s }, p5/Z, [x9, #2, MUL VL]\n"
"mov z20.d, z8.d\n"
- "ld1w { z11.s }, p5/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
+ "ld1w { z11.s }, p5/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
"mov z13.d, z9.d\n"
"mov z17.d, z9.d\n"
"mov z14.d, z10.d\n"
@@ -880,27 +846,31 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"mov z21.d, z9.d\n"
"mov z22.d, z10.d\n"
"mov z23.d, z11.d\n"
- "b 48f\n"
- "46:" // Height 4: no bias
- "tbz %x[flags], #0, 47f\n"
- "ld1w { z8.s }, p4/Z, [x13]\n"
- "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
- "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
- "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
- "ld1w { z12.s }, p4/Z, [x9]\n"
- "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n"
- "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n"
- "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n"
- "ld1w { z16.s }, p4/Z, [x27]\n"
- "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n"
- "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n"
- "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n"
- "ld1w { z20.s }, p4/Z, [x25]\n"
- "ld1w { z21.s }, p3/Z, [x25, #1, MUL VL]\n"
- "ld1w { z22.s }, p2/Z, [x25, #2, MUL VL]\n"
- "ld1w { z23.s }, p1/Z, [x25, #3, MUL VL]\n"
- "b 48f\n"
- "47:" // Height 4: no accumulate
+ "b 44f\n"
+ "42:" // Height 4: no bias
+ "tbz %x[flags], #0, 43f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ld1w { z8.s }, p4/Z, [x28]\n"
+ "add x24, x28, x19, LSL #2\n"
+ "ld1w { z9.s }, p3/Z, [x28, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x28, #2, MUL VL]\n"
+ "add x23, x24, x19, LSL #2\n"
+ "ld1w { z11.s }, p1/Z, [x28, #3, MUL VL]\n"
+ "add x22, x23, x19, LSL #2\n"
+ "ld1w { z12.s }, p4/Z, [x24]\n"
+ "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x23]\n"
+ "ld1w { z17.s }, p3/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x22]\n"
+ "ld1w { z21.s }, p3/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z23.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "b 44f\n"
+ "43:" // Height 4: no accumulate
"mov z8.b, #0x0\n"
"mov z9.b, #0x0\n"
"mov z10.b, #0x0\n"
@@ -917,123 +887,123 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"mov z21.b, #0x0\n"
"mov z22.b, #0x0\n"
"mov z23.b, #0x0\n"
- "48:" // Height 4: setup done
- "mov x12, #0x0\n"
- "49:" // Height 4: String loop
+ "44:" // Height 4: setup done
+ "mov x27, #0x0\n"
+ "45:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w11, [x20, x12, LSL #0x2]\n"
- "tbz %x[flags], #3, 50f\n"
- "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 46f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x10, [x20, #0x0]\n"
- "ldr x28, [x20, #0x8]\n"
- "ldr x26, [x20, #0x10]\n"
- "ldr x24, [x20, #0x18]\n"
- "cbnz x12, 51f\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "ldr x22, [x20, #0x18]\n"
+ "cbnz x27, 47f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x10, x10, x19, LSL #1\n"
- "add x28, x28, x19, LSL #1\n"
- "add x26, x26, x19, LSL #1\n"
+ "add x25, x25, x19, LSL #1\n"
"add x24, x24, x19, LSL #1\n"
- "b 51f\n"
- "50:" // Height 4: setup direct input
- "mov x10, %x[input_ptr]\n"
- "add x28, x10, x19, LSL #1\n"
- "add x26, x28, x19, LSL #1\n"
- "add x24, x26, x19, LSL #1\n"
- "51:" // Height 4: input setup done
- "cmp x11, #0x8\n"
- "ble 53f\n"
- "52:" // Height 4: Multiply loop: Main loop head
- "ld1h { z6.h }, p5/Z, [x15]\n"
- "whilelt p0.h, XZR, x11\n"
- "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
- "sub x11, x11, #0x8\n"
- "ld1rqh { z0.h }, p0/Z, [x10]\n"
+ "add x23, x23, x19, LSL #1\n"
+ "add x22, x22, x19, LSL #1\n"
+ "b 47f\n"
+ "46:" // Height 4: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19, LSL #1\n"
+ "add x23, x24, x19, LSL #1\n"
+ "add x22, x23, x19, LSL #1\n"
+ "47:" // Height 4: input setup done
+ "cmp x26, #0x8\n"
+ "ble 49f\n"
+ "48:" // Height 4: Multiply loop: Main loop head
+ "ld1h { z6.h }, p5/Z, [x10]\n"
+ "whilelt p0.h, XZR, x26\n"
+ "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "sub x26, x26, #0x8\n"
+ "ld1rqh { z0.h }, p0/Z, [x25]\n"
".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n"
- "ld1rqh { z1.h }, p0/Z, [x28]\n"
- "add x10, x10, #0x10\n"
+ "ld1rqh { z1.h }, p0/Z, [x24]\n"
+ "add x25, x25, #0x10\n"
".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n"
- "ld1rqh { z2.h }, p0/Z, [x26]\n"
- "add x28, x28, #0x10\n"
+ "ld1rqh { z2.h }, p0/Z, [x23]\n"
+ "add x24, x24, #0x10\n"
".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n"
- "ld1rqh { z3.h }, p0/Z, [x24]\n"
- "add x26, x26, #0x10\n"
+ "ld1rqh { z3.h }, p0/Z, [x22]\n"
+ "add x23, x23, #0x10\n"
".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
- "add x24, x24, #0x10\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x22, x22, #0x10\n"
".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "cmp x11, #0x8\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "cmp x26, #0x8\n"
".inst 0x646340d4 // bfdot z20.s, z6.h, z3.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
".inst 0x646340f5 // bfdot z21.s, z7.h, z3.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n"
".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n"
".inst 0x646240d2 // bfdot z18.s, z6.h, z2.h[0]\n"
".inst 0x646340d6 // bfdot z22.s, z6.h, z3.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n"
".inst 0x646140ef // bfdot z15.s, z7.h, z1.h[0]\n"
".inst 0x646240f3 // bfdot z19.s, z7.h, z2.h[0]\n"
".inst 0x646340f7 // bfdot z23.s, z7.h, z3.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n"
".inst 0x646940cc // bfdot z12.s, z6.h, z1.h[1]\n"
".inst 0x646a40d0 // bfdot z16.s, z6.h, z2.h[1]\n"
".inst 0x646b40d4 // bfdot z20.s, z6.h, z3.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n"
".inst 0x646940ed // bfdot z13.s, z7.h, z1.h[1]\n"
".inst 0x646a40f1 // bfdot z17.s, z7.h, z2.h[1]\n"
".inst 0x646b40f5 // bfdot z21.s, z7.h, z3.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n"
- "addvl x15, x15, #16\n"
+ "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #16\n"
".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n"
".inst 0x646940ce // bfdot z14.s, z6.h, z1.h[1]\n"
".inst 0x646a40d2 // bfdot z18.s, z6.h, z2.h[1]\n"
".inst 0x646b40d6 // bfdot z22.s, z6.h, z3.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n"
".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n"
".inst 0x646940ef // bfdot z15.s, z7.h, z1.h[1]\n"
".inst 0x646a40f3 // bfdot z19.s, z7.h, z2.h[1]\n"
".inst 0x646b40f7 // bfdot z23.s, z7.h, z3.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n"
".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n"
".inst 0x647140cc // bfdot z12.s, z6.h, z1.h[2]\n"
".inst 0x647240d0 // bfdot z16.s, z6.h, z2.h[2]\n"
".inst 0x647340d4 // bfdot z20.s, z6.h, z3.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n"
".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n"
".inst 0x647140ed // bfdot z13.s, z7.h, z1.h[2]\n"
".inst 0x647240f1 // bfdot z17.s, z7.h, z2.h[2]\n"
".inst 0x647340f5 // bfdot z21.s, z7.h, z3.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n"
".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n"
".inst 0x647140ce // bfdot z14.s, z6.h, z1.h[2]\n"
".inst 0x647240d2 // bfdot z18.s, z6.h, z2.h[2]\n"
".inst 0x647340d6 // bfdot z22.s, z6.h, z3.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n"
".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n"
".inst 0x647140ef // bfdot z15.s, z7.h, z1.h[2]\n"
".inst 0x647240f3 // bfdot z19.s, z7.h, z2.h[2]\n"
".inst 0x647340f7 // bfdot z23.s, z7.h, z3.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n"
".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n"
".inst 0x647940cc // bfdot z12.s, z6.h, z1.h[3]\n"
".inst 0x647a40d0 // bfdot z16.s, z6.h, z2.h[3]\n"
".inst 0x647b40d4 // bfdot z20.s, z6.h, z3.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n"
".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n"
".inst 0x647940ed // bfdot z13.s, z7.h, z1.h[3]\n"
".inst 0x647a40f1 // bfdot z17.s, z7.h, z2.h[3]\n"
".inst 0x647b40f5 // bfdot z21.s, z7.h, z3.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n"
".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n"
".inst 0x647940ce // bfdot z14.s, z6.h, z1.h[3]\n"
".inst 0x647a40d2 // bfdot z18.s, z6.h, z2.h[3]\n"
@@ -1042,31 +1012,31 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
".inst 0x647940ef // bfdot z15.s, z7.h, z1.h[3]\n"
".inst 0x647a40f3 // bfdot z19.s, z7.h, z2.h[3]\n"
".inst 0x647b40f7 // bfdot z23.s, z7.h, z3.h[3]\n"
- "bgt 52b\n"
- "53:" // Height 4: Multiply loop: Single iteration only
- "ld1h { z6.h }, p5/Z, [x15]\n"
- "whilelt p0.h, XZR, x11\n"
- "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
- "subs x11, x11, #0x2\n"
- "ld1rqh { z0.h }, p0/Z, [x10]\n"
+ "bgt 48b\n"
+ "49:" // Height 4: Multiply loop: Single iteration only
+ "ld1h { z6.h }, p5/Z, [x10]\n"
+ "whilelt p0.h, XZR, x26\n"
+ "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "subs x26, x26, #0x2\n"
+ "ld1rqh { z0.h }, p0/Z, [x25]\n"
".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n"
- "ld1rqh { z1.h }, p0/Z, [x28]\n"
- "add x10, x10, #0x10\n"
+ "ld1rqh { z1.h }, p0/Z, [x24]\n"
+ "add x25, x25, #0x10\n"
".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n"
- "ld1rqh { z2.h }, p0/Z, [x26]\n"
- "add x28, x28, #0x10\n"
+ "ld1rqh { z2.h }, p0/Z, [x23]\n"
+ "add x24, x24, #0x10\n"
".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n"
- "ld1rqh { z3.h }, p0/Z, [x24]\n"
- "add x26, x26, #0x10\n"
+ "ld1rqh { z3.h }, p0/Z, [x22]\n"
+ "add x23, x23, #0x10\n"
".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n"
- "add x24, x24, #0x10\n"
+ "add x22, x22, #0x10\n"
".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n"
".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n"
".inst 0x646340d4 // bfdot z20.s, z6.h, z3.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
".inst 0x646340f5 // bfdot z21.s, z7.h, z3.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
+ "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n"
".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n"
".inst 0x646240d2 // bfdot z18.s, z6.h, z2.h[0]\n"
@@ -1075,21 +1045,21 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
".inst 0x646140ef // bfdot z15.s, z7.h, z1.h[0]\n"
".inst 0x646240f3 // bfdot z19.s, z7.h, z2.h[0]\n"
".inst 0x646340f7 // bfdot z23.s, z7.h, z3.h[0]\n"
- "ble 54f\n"
- "ld1h { z6.h }, p5/Z, [x15]\n"
+ "ble 50f\n"
+ "ld1h { z6.h }, p5/Z, [x10]\n"
".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
- "subs x11, x11, #0x2\n"
+ "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "subs x26, x26, #0x2\n"
".inst 0x646940cc // bfdot z12.s, z6.h, z1.h[1]\n"
".inst 0x646a40d0 // bfdot z16.s, z6.h, z2.h[1]\n"
".inst 0x646b40d4 // bfdot z20.s, z6.h, z3.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n"
".inst 0x646940ed // bfdot z13.s, z7.h, z1.h[1]\n"
".inst 0x646a40f1 // bfdot z17.s, z7.h, z2.h[1]\n"
".inst 0x646b40f5 // bfdot z21.s, z7.h, z3.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
+ "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n"
".inst 0x646940ce // bfdot z14.s, z6.h, z1.h[1]\n"
".inst 0x646a40d2 // bfdot z18.s, z6.h, z2.h[1]\n"
@@ -1098,21 +1068,21 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
".inst 0x646940ef // bfdot z15.s, z7.h, z1.h[1]\n"
".inst 0x646a40f3 // bfdot z19.s, z7.h, z2.h[1]\n"
".inst 0x646b40f7 // bfdot z23.s, z7.h, z3.h[1]\n"
- "ble 54f\n"
- "ld1h { z6.h }, p5/Z, [x15]\n"
+ "ble 50f\n"
+ "ld1h { z6.h }, p5/Z, [x10]\n"
".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
- "subs x11, x11, #0x2\n"
+ "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "subs x26, x26, #0x2\n"
".inst 0x647140cc // bfdot z12.s, z6.h, z1.h[2]\n"
".inst 0x647240d0 // bfdot z16.s, z6.h, z2.h[2]\n"
".inst 0x647340d4 // bfdot z20.s, z6.h, z3.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n"
".inst 0x647140ed // bfdot z13.s, z7.h, z1.h[2]\n"
".inst 0x647240f1 // bfdot z17.s, z7.h, z2.h[2]\n"
".inst 0x647340f5 // bfdot z21.s, z7.h, z3.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
+ "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n"
".inst 0x647140ce // bfdot z14.s, z6.h, z1.h[2]\n"
".inst 0x647240d2 // bfdot z18.s, z6.h, z2.h[2]\n"
@@ -1121,20 +1091,20 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
".inst 0x647140ef // bfdot z15.s, z7.h, z1.h[2]\n"
".inst 0x647240f3 // bfdot z19.s, z7.h, z2.h[2]\n"
".inst 0x647340f7 // bfdot z23.s, z7.h, z3.h[2]\n"
- "ble 54f\n"
- "ld1h { z6.h }, p5/Z, [x15]\n"
+ "ble 50f\n"
+ "ld1h { z6.h }, p5/Z, [x10]\n"
".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
".inst 0x647940cc // bfdot z12.s, z6.h, z1.h[3]\n"
".inst 0x647a40d0 // bfdot z16.s, z6.h, z2.h[3]\n"
".inst 0x647b40d4 // bfdot z20.s, z6.h, z3.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n"
".inst 0x647940ed // bfdot z13.s, z7.h, z1.h[3]\n"
".inst 0x647a40f1 // bfdot z17.s, z7.h, z2.h[3]\n"
".inst 0x647b40f5 // bfdot z21.s, z7.h, z3.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
+ "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n"
".inst 0x647940ce // bfdot z14.s, z6.h, z1.h[3]\n"
".inst 0x647a40d2 // bfdot z18.s, z6.h, z2.h[3]\n"
@@ -1143,16 +1113,20 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
".inst 0x647940ef // bfdot z15.s, z7.h, z1.h[3]\n"
".inst 0x647a40f3 // bfdot z19.s, z7.h, z2.h[3]\n"
".inst 0x647b40f7 // bfdot z23.s, z7.h, z3.h[3]\n"
- "54:" // Height 4: Multiply loop: multiply skip
- "prfm pldl1keep, [x10, #0x80]\n"
- "add x12, x12, #0x1\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "50:" // Height 4: Multiply loop: multiply skip
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x27, x27, #0x1\n"
"prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x12, x19\n"
- "bne 49b\n"
- "tbz %x[flags], #1, 55f\n"
+ "cmp x27, x19\n"
+ "bne 45b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x28, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "tbz %x[flags], #1, 51f\n"
"add x19, %x[args_ptr], %[offset_min]\n"
"ld1rw { z1.s }, p5/Z, [x19]\n"
"add x19, %x[args_ptr], %[offset_max]\n"
@@ -1189,73 +1163,52 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"fmax z21.s, p5/M, z21.s, z1.s\n"
"fmax z22.s, p5/M, z22.s, z1.s\n"
"fmax z23.s, p5/M, z23.s, z1.s\n"
- "55:" // Height 4: No activation
- "st1w { z8.s }, p4, [x13]\n"
- "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
- "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
- "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
- "addvl x13, x13, #4\n"
- "st1w { z12.s }, p4, [x9]\n"
- "st1w { z13.s }, p3, [x9, #1, MUL VL]\n"
- "st1w { z14.s }, p2, [x9, #2, MUL VL]\n"
- "st1w { z15.s }, p1, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
- "st1w { z16.s }, p4, [x27]\n"
- "st1w { z17.s }, p3, [x27, #1, MUL VL]\n"
- "st1w { z18.s }, p2, [x27, #2, MUL VL]\n"
- "st1w { z19.s }, p1, [x27, #3, MUL VL]\n"
- "addvl x27, x27, #4\n"
- "st1w { z20.s }, p4, [x25]\n"
- "st1w { z21.s }, p3, [x25, #1, MUL VL]\n"
- "st1w { z22.s }, p2, [x25, #2, MUL VL]\n"
- "st1w { z23.s }, p1, [x25, #3, MUL VL]\n"
- "addvl x25, x25, #4\n"
- "56:" // Height 4: Writeback done
- "decw x16, ALL, MUL #4\n"
- "cmp x16, XZR\n"
- "bgt 45b\n"
- "b 86f\n"
- "57:" // Height 5
- "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x14, %x[bias]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "tbz %x[flags], #2, 58f\n"
- "ldr x13, [%x[output_ptr], #0x0]\n"
- "add x13, x13, x19, LSL #2\n"
- "ldr x9, [%x[output_ptr], #0x8]\n"
- "ldr x27, [%x[output_ptr], #0x10]\n"
- "add x9, x9, x19, LSL #2\n"
- "ldr x25, [%x[output_ptr], #0x18]\n"
- "ldr x23, [%x[output_ptr], #0x20]\n"
- "add x27, x27, x19, LSL #2\n"
- "add x25, x25, x19, LSL #2\n"
- "add x23, x23, x19, LSL #2\n"
- "b 59f\n"
- "58:" // Height 5: setup direct output
- "mov x13, %x[output_ptr]\n"
- "add x9, x13, x19, LSL #2\n"
- "add x27, x9, x19, LSL #2\n"
- "add x25, x27, x19, LSL #2\n"
- "add x23, x25, x19, LSL #2\n"
- "59:" // Height 5: Column loop
+ "51:" // Height 4: No activation
+ "st1w { z8.s }, p4, [x28]\n"
+ "st1w { z9.s }, p3, [x28, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x28, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "st1w { z12.s }, p4, [x24]\n"
+ "st1w { z13.s }, p3, [x24, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x24, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x24, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x23]\n"
+ "st1w { z17.s }, p3, [x23, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x23, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x23, #3, MUL VL]\n"
+ "st1w { z20.s }, p4, [x22]\n"
+ "st1w { z21.s }, p3, [x22, #1, MUL VL]\n"
+ "st1w { z22.s }, p2, [x22, #2, MUL VL]\n"
+ "st1w { z23.s }, p1, [x22, #3, MUL VL]\n"
+ "52:" // Height 4: Writeback done
+ "decw x11, ALL, MUL #4\n"
+ "cmp x11, XZR\n"
+ "bgt 41b\n"
+ "b 80f\n"
+ "53:" // Height 5
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x9, %x[bias]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "54:" // Height 5: Column loop
"mov x19, #0x0\n"
- "whilelt p4.s, x19, x16\n"
+ "whilelt p4.s, x19, x11\n"
"incw x19\n"
- "whilelt p3.s, x19, x16\n"
+ "whilelt p3.s, x19, x11\n"
"incw x19\n"
- "whilelt p2.s, x19, x16\n"
+ "whilelt p2.s, x19, x11\n"
"incw x19\n"
- "whilelt p1.s, x19, x16\n"
- "cbz x14, 60f\n"
- "ld1w { z8.s }, p5/Z, [x14]\n"
+ "whilelt p1.s, x19, x11\n"
+ "cbz x9, 55f\n"
+ "ld1w { z8.s }, p5/Z, [x9]\n"
"mov z12.d, z8.d\n"
- "ld1w { z9.s }, p5/Z, [x14, #1, MUL VL]\n"
+ "ld1w { z9.s }, p5/Z, [x9, #1, MUL VL]\n"
"mov z16.d, z8.d\n"
- "ld1w { z10.s }, p5/Z, [x14, #2, MUL VL]\n"
+ "ld1w { z10.s }, p5/Z, [x9, #2, MUL VL]\n"
"mov z20.d, z8.d\n"
- "ld1w { z11.s }, p5/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
+ "ld1w { z11.s }, p5/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
"mov z13.d, z9.d\n"
"mov z17.d, z9.d\n"
"mov z14.d, z10.d\n"
@@ -1269,31 +1222,36 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"mov z25.d, z9.d\n"
"mov z26.d, z10.d\n"
"mov z27.d, z11.d\n"
- "b 62f\n"
- "60:" // Height 5: no bias
- "tbz %x[flags], #0, 61f\n"
- "ld1w { z8.s }, p4/Z, [x13]\n"
- "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
- "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
- "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
- "ld1w { z12.s }, p4/Z, [x9]\n"
- "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n"
- "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n"
- "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n"
- "ld1w { z16.s }, p4/Z, [x27]\n"
- "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n"
- "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n"
- "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n"
- "ld1w { z20.s }, p4/Z, [x25]\n"
- "ld1w { z21.s }, p3/Z, [x25, #1, MUL VL]\n"
- "ld1w { z22.s }, p2/Z, [x25, #2, MUL VL]\n"
- "ld1w { z23.s }, p1/Z, [x25, #3, MUL VL]\n"
- "ld1w { z24.s }, p4/Z, [x23]\n"
- "ld1w { z25.s }, p3/Z, [x23, #1, MUL VL]\n"
- "ld1w { z26.s }, p2/Z, [x23, #2, MUL VL]\n"
- "ld1w { z27.s }, p1/Z, [x23, #3, MUL VL]\n"
- "b 62f\n"
- "61:" // Height 5: no accumulate
+ "b 57f\n"
+ "55:" // Height 5: no bias
+ "tbz %x[flags], #0, 56f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ld1w { z8.s }, p4/Z, [x28]\n"
+ "add x24, x28, x19, LSL #2\n"
+ "ld1w { z9.s }, p3/Z, [x28, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x28, #2, MUL VL]\n"
+ "add x23, x24, x19, LSL #2\n"
+ "ld1w { z11.s }, p1/Z, [x28, #3, MUL VL]\n"
+ "add x22, x23, x19, LSL #2\n"
+ "ld1w { z12.s }, p4/Z, [x24]\n"
+ "add x21, x22, x19, LSL #2\n"
+ "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x23]\n"
+ "ld1w { z17.s }, p3/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x22]\n"
+ "ld1w { z21.s }, p3/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z23.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z24.s }, p4/Z, [x21]\n"
+ "ld1w { z25.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z26.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z27.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "b 57f\n"
+ "56:" // Height 5: no accumulate
"mov z8.b, #0x0\n"
"mov z9.b, #0x0\n"
"mov z10.b, #0x0\n"
@@ -1314,143 +1272,143 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"mov z25.b, #0x0\n"
"mov z26.b, #0x0\n"
"mov z27.b, #0x0\n"
- "62:" // Height 5: setup done
- "mov x12, #0x0\n"
- "63:" // Height 5: String loop
+ "57:" // Height 5: setup done
+ "mov x27, #0x0\n"
+ "58:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w11, [x20, x12, LSL #0x2]\n"
- "tbz %x[flags], #3, 64f\n"
- "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 59f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x10, [x20, #0x0]\n"
- "ldr x28, [x20, #0x8]\n"
- "ldr x26, [x20, #0x10]\n"
- "ldr x24, [x20, #0x18]\n"
- "ldr x22, [x20, #0x20]\n"
- "cbnz x12, 65f\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "ldr x22, [x20, #0x18]\n"
+ "ldr x21, [x20, #0x20]\n"
+ "cbnz x27, 60f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x10, x10, x19, LSL #1\n"
- "add x28, x28, x19, LSL #1\n"
- "add x26, x26, x19, LSL #1\n"
+ "add x25, x25, x19, LSL #1\n"
"add x24, x24, x19, LSL #1\n"
+ "add x23, x23, x19, LSL #1\n"
"add x22, x22, x19, LSL #1\n"
- "b 65f\n"
- "64:" // Height 5: setup direct input
- "mov x10, %x[input_ptr]\n"
- "add x28, x10, x19, LSL #1\n"
- "add x26, x28, x19, LSL #1\n"
- "add x24, x26, x19, LSL #1\n"
- "add x22, x24, x19, LSL #1\n"
- "65:" // Height 5: input setup done
- "cmp x11, #0x8\n"
- "ble 67f\n"
- "66:" // Height 5: Multiply loop: Main loop head
- "ld1h { z6.h }, p5/Z, [x15]\n"
- "whilelt p0.h, XZR, x11\n"
- "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
- "sub x11, x11, #0x8\n"
- "ld1rqh { z0.h }, p0/Z, [x10]\n"
+ "add x21, x21, x19, LSL #1\n"
+ "b 60f\n"
+ "59:" // Height 5: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19, LSL #1\n"
+ "add x23, x24, x19, LSL #1\n"
+ "add x22, x23, x19, LSL #1\n"
+ "add x21, x22, x19, LSL #1\n"
+ "60:" // Height 5: input setup done
+ "cmp x26, #0x8\n"
+ "ble 62f\n"
+ "61:" // Height 5: Multiply loop: Main loop head
+ "ld1h { z6.h }, p5/Z, [x10]\n"
+ "whilelt p0.h, XZR, x26\n"
+ "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "sub x26, x26, #0x8\n"
+ "ld1rqh { z0.h }, p0/Z, [x25]\n"
".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n"
- "ld1rqh { z1.h }, p0/Z, [x28]\n"
- "add x10, x10, #0x10\n"
+ "ld1rqh { z1.h }, p0/Z, [x24]\n"
+ "add x25, x25, #0x10\n"
".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n"
- "ld1rqh { z2.h }, p0/Z, [x26]\n"
- "add x28, x28, #0x10\n"
+ "ld1rqh { z2.h }, p0/Z, [x23]\n"
+ "add x24, x24, #0x10\n"
".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n"
- "ld1rqh { z3.h }, p0/Z, [x24]\n"
- "add x26, x26, #0x10\n"
+ "ld1rqh { z3.h }, p0/Z, [x22]\n"
+ "add x23, x23, #0x10\n"
".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n"
- "ld1rqh { z4.h }, p0/Z, [x22]\n"
- "add x24, x24, #0x10\n"
- ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "ld1rqh { z4.h }, p0/Z, [x21]\n"
"add x22, x22, #0x10\n"
+ ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x21, x21, #0x10\n"
".inst 0x646340d4 // bfdot z20.s, z6.h, z3.h[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "cmp x11, #0x8\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "cmp x26, #0x8\n"
".inst 0x646440d8 // bfdot z24.s, z6.h, z4.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
".inst 0x646340f5 // bfdot z21.s, z7.h, z3.h[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
".inst 0x646440f9 // bfdot z25.s, z7.h, z4.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n"
".inst 0x646240d2 // bfdot z18.s, z6.h, z2.h[0]\n"
".inst 0x646340d6 // bfdot z22.s, z6.h, z3.h[0]\n"
".inst 0x646440da // bfdot z26.s, z6.h, z4.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n"
".inst 0x646140ef // bfdot z15.s, z7.h, z1.h[0]\n"
".inst 0x646240f3 // bfdot z19.s, z7.h, z2.h[0]\n"
".inst 0x646340f7 // bfdot z23.s, z7.h, z3.h[0]\n"
".inst 0x646440fb // bfdot z27.s, z7.h, z4.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n"
".inst 0x646940cc // bfdot z12.s, z6.h, z1.h[1]\n"
".inst 0x646a40d0 // bfdot z16.s, z6.h, z2.h[1]\n"
".inst 0x646b40d4 // bfdot z20.s, z6.h, z3.h[1]\n"
".inst 0x646c40d8 // bfdot z24.s, z6.h, z4.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n"
".inst 0x646940ed // bfdot z13.s, z7.h, z1.h[1]\n"
".inst 0x646a40f1 // bfdot z17.s, z7.h, z2.h[1]\n"
".inst 0x646b40f5 // bfdot z21.s, z7.h, z3.h[1]\n"
".inst 0x646c40f9 // bfdot z25.s, z7.h, z4.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n"
- "addvl x15, x15, #16\n"
+ "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #16\n"
".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n"
".inst 0x646940ce // bfdot z14.s, z6.h, z1.h[1]\n"
".inst 0x646a40d2 // bfdot z18.s, z6.h, z2.h[1]\n"
".inst 0x646b40d6 // bfdot z22.s, z6.h, z3.h[1]\n"
".inst 0x646c40da // bfdot z26.s, z6.h, z4.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n"
".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n"
".inst 0x646940ef // bfdot z15.s, z7.h, z1.h[1]\n"
".inst 0x646a40f3 // bfdot z19.s, z7.h, z2.h[1]\n"
".inst 0x646b40f7 // bfdot z23.s, z7.h, z3.h[1]\n"
".inst 0x646c40fb // bfdot z27.s, z7.h, z4.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n"
".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n"
".inst 0x647140cc // bfdot z12.s, z6.h, z1.h[2]\n"
".inst 0x647240d0 // bfdot z16.s, z6.h, z2.h[2]\n"
".inst 0x647340d4 // bfdot z20.s, z6.h, z3.h[2]\n"
".inst 0x647440d8 // bfdot z24.s, z6.h, z4.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n"
".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n"
".inst 0x647140ed // bfdot z13.s, z7.h, z1.h[2]\n"
".inst 0x647240f1 // bfdot z17.s, z7.h, z2.h[2]\n"
".inst 0x647340f5 // bfdot z21.s, z7.h, z3.h[2]\n"
".inst 0x647440f9 // bfdot z25.s, z7.h, z4.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n"
".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n"
".inst 0x647140ce // bfdot z14.s, z6.h, z1.h[2]\n"
".inst 0x647240d2 // bfdot z18.s, z6.h, z2.h[2]\n"
".inst 0x647340d6 // bfdot z22.s, z6.h, z3.h[2]\n"
".inst 0x647440da // bfdot z26.s, z6.h, z4.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n"
".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n"
".inst 0x647140ef // bfdot z15.s, z7.h, z1.h[2]\n"
".inst 0x647240f3 // bfdot z19.s, z7.h, z2.h[2]\n"
".inst 0x647340f7 // bfdot z23.s, z7.h, z3.h[2]\n"
".inst 0x647440fb // bfdot z27.s, z7.h, z4.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n"
".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n"
".inst 0x647940cc // bfdot z12.s, z6.h, z1.h[3]\n"
".inst 0x647a40d0 // bfdot z16.s, z6.h, z2.h[3]\n"
".inst 0x647b40d4 // bfdot z20.s, z6.h, z3.h[3]\n"
".inst 0x647c40d8 // bfdot z24.s, z6.h, z4.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n"
".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n"
".inst 0x647940ed // bfdot z13.s, z7.h, z1.h[3]\n"
".inst 0x647a40f1 // bfdot z17.s, z7.h, z2.h[3]\n"
".inst 0x647b40f5 // bfdot z21.s, z7.h, z3.h[3]\n"
".inst 0x647c40f9 // bfdot z25.s, z7.h, z4.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n"
".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n"
".inst 0x647940ce // bfdot z14.s, z6.h, z1.h[3]\n"
".inst 0x647a40d2 // bfdot z18.s, z6.h, z2.h[3]\n"
@@ -1461,35 +1419,35 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
".inst 0x647a40f3 // bfdot z19.s, z7.h, z2.h[3]\n"
".inst 0x647b40f7 // bfdot z23.s, z7.h, z3.h[3]\n"
".inst 0x647c40fb // bfdot z27.s, z7.h, z4.h[3]\n"
- "bgt 66b\n"
- "67:" // Height 5: Multiply loop: Single iteration only
- "ld1h { z6.h }, p5/Z, [x15]\n"
- "whilelt p0.h, XZR, x11\n"
- "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
- "subs x11, x11, #0x2\n"
- "ld1rqh { z0.h }, p0/Z, [x10]\n"
+ "bgt 61b\n"
+ "62:" // Height 5: Multiply loop: Single iteration only
+ "ld1h { z6.h }, p5/Z, [x10]\n"
+ "whilelt p0.h, XZR, x26\n"
+ "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "subs x26, x26, #0x2\n"
+ "ld1rqh { z0.h }, p0/Z, [x25]\n"
".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n"
- "ld1rqh { z1.h }, p0/Z, [x28]\n"
- "add x10, x10, #0x10\n"
+ "ld1rqh { z1.h }, p0/Z, [x24]\n"
+ "add x25, x25, #0x10\n"
".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n"
- "ld1rqh { z2.h }, p0/Z, [x26]\n"
- "add x28, x28, #0x10\n"
+ "ld1rqh { z2.h }, p0/Z, [x23]\n"
+ "add x24, x24, #0x10\n"
".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n"
- "ld1rqh { z3.h }, p0/Z, [x24]\n"
- "add x26, x26, #0x10\n"
+ "ld1rqh { z3.h }, p0/Z, [x22]\n"
+ "add x23, x23, #0x10\n"
".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n"
- "ld1rqh { z4.h }, p0/Z, [x22]\n"
- "add x24, x24, #0x10\n"
- ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n"
+ "ld1rqh { z4.h }, p0/Z, [x21]\n"
"add x22, x22, #0x10\n"
+ ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n"
+ "add x21, x21, #0x10\n"
".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n"
".inst 0x646340d4 // bfdot z20.s, z6.h, z3.h[0]\n"
".inst 0x646440d8 // bfdot z24.s, z6.h, z4.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
".inst 0x646340f5 // bfdot z21.s, z7.h, z3.h[0]\n"
".inst 0x646440f9 // bfdot z25.s, z7.h, z4.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
+ "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n"
".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n"
".inst 0x646240d2 // bfdot z18.s, z6.h, z2.h[0]\n"
@@ -1500,23 +1458,23 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
".inst 0x646240f3 // bfdot z19.s, z7.h, z2.h[0]\n"
".inst 0x646340f7 // bfdot z23.s, z7.h, z3.h[0]\n"
".inst 0x646440fb // bfdot z27.s, z7.h, z4.h[0]\n"
- "ble 68f\n"
- "ld1h { z6.h }, p5/Z, [x15]\n"
+ "ble 63f\n"
+ "ld1h { z6.h }, p5/Z, [x10]\n"
".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
- "subs x11, x11, #0x2\n"
+ "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "subs x26, x26, #0x2\n"
".inst 0x646940cc // bfdot z12.s, z6.h, z1.h[1]\n"
".inst 0x646a40d0 // bfdot z16.s, z6.h, z2.h[1]\n"
".inst 0x646b40d4 // bfdot z20.s, z6.h, z3.h[1]\n"
".inst 0x646c40d8 // bfdot z24.s, z6.h, z4.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n"
".inst 0x646940ed // bfdot z13.s, z7.h, z1.h[1]\n"
".inst 0x646a40f1 // bfdot z17.s, z7.h, z2.h[1]\n"
".inst 0x646b40f5 // bfdot z21.s, z7.h, z3.h[1]\n"
".inst 0x646c40f9 // bfdot z25.s, z7.h, z4.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
+ "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n"
".inst 0x646940ce // bfdot z14.s, z6.h, z1.h[1]\n"
".inst 0x646a40d2 // bfdot z18.s, z6.h, z2.h[1]\n"
@@ -1527,23 +1485,23 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
".inst 0x646a40f3 // bfdot z19.s, z7.h, z2.h[1]\n"
".inst 0x646b40f7 // bfdot z23.s, z7.h, z3.h[1]\n"
".inst 0x646c40fb // bfdot z27.s, z7.h, z4.h[1]\n"
- "ble 68f\n"
- "ld1h { z6.h }, p5/Z, [x15]\n"
+ "ble 63f\n"
+ "ld1h { z6.h }, p5/Z, [x10]\n"
".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
- "subs x11, x11, #0x2\n"
+ "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "subs x26, x26, #0x2\n"
".inst 0x647140cc // bfdot z12.s, z6.h, z1.h[2]\n"
".inst 0x647240d0 // bfdot z16.s, z6.h, z2.h[2]\n"
".inst 0x647340d4 // bfdot z20.s, z6.h, z3.h[2]\n"
".inst 0x647440d8 // bfdot z24.s, z6.h, z4.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n"
".inst 0x647140ed // bfdot z13.s, z7.h, z1.h[2]\n"
".inst 0x647240f1 // bfdot z17.s, z7.h, z2.h[2]\n"
".inst 0x647340f5 // bfdot z21.s, z7.h, z3.h[2]\n"
".inst 0x647440f9 // bfdot z25.s, z7.h, z4.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
+ "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n"
".inst 0x647140ce // bfdot z14.s, z6.h, z1.h[2]\n"
".inst 0x647240d2 // bfdot z18.s, z6.h, z2.h[2]\n"
@@ -1554,22 +1512,22 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
".inst 0x647240f3 // bfdot z19.s, z7.h, z2.h[2]\n"
".inst 0x647340f7 // bfdot z23.s, z7.h, z3.h[2]\n"
".inst 0x647440fb // bfdot z27.s, z7.h, z4.h[2]\n"
- "ble 68f\n"
- "ld1h { z6.h }, p5/Z, [x15]\n"
+ "ble 63f\n"
+ "ld1h { z6.h }, p5/Z, [x10]\n"
".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
".inst 0x647940cc // bfdot z12.s, z6.h, z1.h[3]\n"
".inst 0x647a40d0 // bfdot z16.s, z6.h, z2.h[3]\n"
".inst 0x647b40d4 // bfdot z20.s, z6.h, z3.h[3]\n"
".inst 0x647c40d8 // bfdot z24.s, z6.h, z4.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n"
".inst 0x647940ed // bfdot z13.s, z7.h, z1.h[3]\n"
".inst 0x647a40f1 // bfdot z17.s, z7.h, z2.h[3]\n"
".inst 0x647b40f5 // bfdot z21.s, z7.h, z3.h[3]\n"
".inst 0x647c40f9 // bfdot z25.s, z7.h, z4.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
+ "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n"
".inst 0x647940ce // bfdot z14.s, z6.h, z1.h[3]\n"
".inst 0x647a40d2 // bfdot z18.s, z6.h, z2.h[3]\n"
@@ -1580,17 +1538,22 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
".inst 0x647a40f3 // bfdot z19.s, z7.h, z2.h[3]\n"
".inst 0x647b40f7 // bfdot z23.s, z7.h, z3.h[3]\n"
".inst 0x647c40fb // bfdot z27.s, z7.h, z4.h[3]\n"
- "68:" // Height 5: Multiply loop: multiply skip
- "prfm pldl1keep, [x10, #0x80]\n"
- "add x12, x12, #0x1\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "63:" // Height 5: Multiply loop: multiply skip
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x27, x27, #0x1\n"
"prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
"prfm pldl1keep, [x22, #0x80]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x12, x19\n"
- "bne 63b\n"
- "tbz %x[flags], #1, 69f\n"
+ "cmp x27, x19\n"
+ "bne 58b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x28, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "add x21, x22, x19, LSL #2\n"
+ "tbz %x[flags], #1, 64f\n"
"add x19, %x[args_ptr], %[offset_min]\n"
"ld1rw { z1.s }, p5/Z, [x19]\n"
"add x19, %x[args_ptr], %[offset_max]\n"
@@ -1635,83 +1598,59 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"fmax z25.s, p5/M, z25.s, z1.s\n"
"fmax z26.s, p5/M, z26.s, z1.s\n"
"fmax z27.s, p5/M, z27.s, z1.s\n"
- "69:" // Height 5: No activation
- "st1w { z8.s }, p4, [x13]\n"
- "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
- "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
- "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
- "addvl x13, x13, #4\n"
- "st1w { z12.s }, p4, [x9]\n"
- "st1w { z13.s }, p3, [x9, #1, MUL VL]\n"
- "st1w { z14.s }, p2, [x9, #2, MUL VL]\n"
- "st1w { z15.s }, p1, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
- "st1w { z16.s }, p4, [x27]\n"
- "st1w { z17.s }, p3, [x27, #1, MUL VL]\n"
- "st1w { z18.s }, p2, [x27, #2, MUL VL]\n"
- "st1w { z19.s }, p1, [x27, #3, MUL VL]\n"
- "addvl x27, x27, #4\n"
- "st1w { z20.s }, p4, [x25]\n"
- "st1w { z21.s }, p3, [x25, #1, MUL VL]\n"
- "st1w { z22.s }, p2, [x25, #2, MUL VL]\n"
- "st1w { z23.s }, p1, [x25, #3, MUL VL]\n"
- "addvl x25, x25, #4\n"
- "st1w { z24.s }, p4, [x23]\n"
- "st1w { z25.s }, p3, [x23, #1, MUL VL]\n"
- "st1w { z26.s }, p2, [x23, #2, MUL VL]\n"
- "st1w { z27.s }, p1, [x23, #3, MUL VL]\n"
- "addvl x23, x23, #4\n"
- "70:" // Height 5: Writeback done
- "decw x16, ALL, MUL #4\n"
- "cmp x16, XZR\n"
- "bgt 59b\n"
- "b 86f\n"
- "71:" // Height 6
- "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x14, %x[bias]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "tbz %x[flags], #2, 72f\n"
- "ldr x13, [%x[output_ptr], #0x0]\n"
- "add x13, x13, x19, LSL #2\n"
- "ldr x9, [%x[output_ptr], #0x8]\n"
- "ldr x27, [%x[output_ptr], #0x10]\n"
- "add x9, x9, x19, LSL #2\n"
- "ldr x25, [%x[output_ptr], #0x18]\n"
- "ldr x23, [%x[output_ptr], #0x20]\n"
- "add x27, x27, x19, LSL #2\n"
- "ldr x21, [%x[output_ptr], #0x28]\n"
- "add %x[output_ptr], %x[output_ptr], #0x30\n"
- "add x25, x25, x19, LSL #2\n"
- "add x23, x23, x19, LSL #2\n"
- "add x21, x21, x19, LSL #2\n"
- "b 73f\n"
- "72:" // Height 6: setup direct output
- "mov x13, %x[output_ptr]\n"
- "add x9, x13, x19, LSL #2\n"
- "add x27, x9, x19, LSL #2\n"
- "add x25, x27, x19, LSL #2\n"
- "add x23, x25, x19, LSL #2\n"
- "add x21, x23, x19, LSL #2\n"
- "add %x[output_ptr], x21, x19, LSL #2\n"
- "73:" // Height 6: Column loop
+ "64:" // Height 5: No activation
+ "st1w { z8.s }, p4, [x28]\n"
+ "st1w { z9.s }, p3, [x28, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x28, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "st1w { z12.s }, p4, [x24]\n"
+ "st1w { z13.s }, p3, [x24, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x24, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x24, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x23]\n"
+ "st1w { z17.s }, p3, [x23, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x23, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x23, #3, MUL VL]\n"
+ "st1w { z20.s }, p4, [x22]\n"
+ "st1w { z21.s }, p3, [x22, #1, MUL VL]\n"
+ "st1w { z22.s }, p2, [x22, #2, MUL VL]\n"
+ "st1w { z23.s }, p1, [x22, #3, MUL VL]\n"
+ "st1w { z24.s }, p4, [x21]\n"
+ "st1w { z25.s }, p3, [x21, #1, MUL VL]\n"
+ "st1w { z26.s }, p2, [x21, #2, MUL VL]\n"
+ "st1w { z27.s }, p1, [x21, #3, MUL VL]\n"
+ "65:" // Height 5: Writeback done
+ "decw x11, ALL, MUL #4\n"
+ "cmp x11, XZR\n"
+ "bgt 54b\n"
+ "b 80f\n"
+ "66:" // Height 6
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x9, %x[bias]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov x19, #0x18\n"
+ "madd %x[output_ptr], x20, x19, %x[output_ptr]\n"
+ "67:" // Height 6: Column loop
"mov x19, #0x0\n"
- "whilelt p4.s, x19, x16\n"
+ "whilelt p4.s, x19, x11\n"
"incw x19\n"
- "whilelt p3.s, x19, x16\n"
+ "whilelt p3.s, x19, x11\n"
"incw x19\n"
- "whilelt p2.s, x19, x16\n"
+ "whilelt p2.s, x19, x11\n"
"incw x19\n"
- "whilelt p1.s, x19, x16\n"
- "cbz x14, 74f\n"
- "ld1w { z8.s }, p5/Z, [x14]\n"
+ "whilelt p1.s, x19, x11\n"
+ "cbz x9, 68f\n"
+ "ld1w { z8.s }, p5/Z, [x9]\n"
"mov z12.d, z8.d\n"
- "ld1w { z9.s }, p5/Z, [x14, #1, MUL VL]\n"
+ "ld1w { z9.s }, p5/Z, [x9, #1, MUL VL]\n"
"mov z16.d, z8.d\n"
- "ld1w { z10.s }, p5/Z, [x14, #2, MUL VL]\n"
+ "ld1w { z10.s }, p5/Z, [x9, #2, MUL VL]\n"
"mov z20.d, z8.d\n"
- "ld1w { z11.s }, p5/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
+ "ld1w { z11.s }, p5/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
"mov z13.d, z9.d\n"
"mov z17.d, z9.d\n"
"mov z14.d, z10.d\n"
@@ -1729,35 +1668,41 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"mov z29.d, z9.d\n"
"mov z30.d, z10.d\n"
"mov z31.d, z11.d\n"
- "b 76f\n"
- "74:" // Height 6: no bias
- "tbz %x[flags], #0, 75f\n"
- "ld1w { z8.s }, p4/Z, [x13]\n"
- "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
- "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
- "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
- "ld1w { z12.s }, p4/Z, [x9]\n"
- "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n"
- "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n"
- "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n"
- "ld1w { z16.s }, p4/Z, [x27]\n"
- "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n"
- "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n"
- "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n"
- "ld1w { z20.s }, p4/Z, [x25]\n"
- "ld1w { z21.s }, p3/Z, [x25, #1, MUL VL]\n"
- "ld1w { z22.s }, p2/Z, [x25, #2, MUL VL]\n"
- "ld1w { z23.s }, p1/Z, [x25, #3, MUL VL]\n"
- "ld1w { z24.s }, p4/Z, [x23]\n"
- "ld1w { z25.s }, p3/Z, [x23, #1, MUL VL]\n"
- "ld1w { z26.s }, p2/Z, [x23, #2, MUL VL]\n"
- "ld1w { z27.s }, p1/Z, [x23, #3, MUL VL]\n"
- "ld1w { z28.s }, p4/Z, [x21]\n"
- "ld1w { z29.s }, p3/Z, [x21, #1, MUL VL]\n"
- "ld1w { z30.s }, p2/Z, [x21, #2, MUL VL]\n"
- "ld1w { z31.s }, p1/Z, [x21, #3, MUL VL]\n"
- "b 76f\n"
- "75:" // Height 6: no accumulate
+ "b 70f\n"
+ "68:" // Height 6: no bias
+ "tbz %x[flags], #0, 69f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ld1w { z8.s }, p4/Z, [x28]\n"
+ "add x24, x28, x19, LSL #2\n"
+ "ld1w { z9.s }, p3/Z, [x28, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x28, #2, MUL VL]\n"
+ "add x23, x24, x19, LSL #2\n"
+ "ld1w { z11.s }, p1/Z, [x28, #3, MUL VL]\n"
+ "add x22, x23, x19, LSL #2\n"
+ "ld1w { z12.s }, p4/Z, [x24]\n"
+ "add x21, x22, x19, LSL #2\n"
+ "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
+ "add x20, x21, x19, LSL #2\n"
+ "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x23]\n"
+ "ld1w { z17.s }, p3/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x22]\n"
+ "ld1w { z21.s }, p3/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z23.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z24.s }, p4/Z, [x21]\n"
+ "ld1w { z25.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z26.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z27.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z28.s }, p4/Z, [x20]\n"
+ "ld1w { z29.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z30.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z31.s }, p1/Z, [x20, #3, MUL VL]\n"
+ "b 70f\n"
+ "69:" // Height 6: no accumulate
"mov z8.b, #0x0\n"
"mov z9.b, #0x0\n"
"mov z10.b, #0x0\n"
@@ -1782,77 +1727,77 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"mov z29.b, #0x0\n"
"mov z30.b, #0x0\n"
"mov z31.b, #0x0\n"
- "76:" // Height 6: setup done
- "mov x12, #0x0\n"
- "77:" // Height 6: String loop
+ "70:" // Height 6: setup done
+ "mov x27, #0x0\n"
+ "71:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w11, [x20, x12, LSL #0x2]\n"
- "tbz %x[flags], #3, 78f\n"
- "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 72f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x10, [x20, #0x0]\n"
- "ldr x28, [x20, #0x8]\n"
- "ldr x26, [x20, #0x10]\n"
- "ldr x24, [x20, #0x18]\n"
- "ldr x22, [x20, #0x20]\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "ldr x22, [x20, #0x18]\n"
+ "ldr x21, [x20, #0x20]\n"
"ldr x20, [x20, #0x28]\n"
- "cbnz x12, 79f\n"
+ "cbnz x27, 73f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x10, x10, x19, LSL #1\n"
- "add x28, x28, x19, LSL #1\n"
- "add x26, x26, x19, LSL #1\n"
+ "add x25, x25, x19, LSL #1\n"
"add x24, x24, x19, LSL #1\n"
+ "add x23, x23, x19, LSL #1\n"
"add x22, x22, x19, LSL #1\n"
+ "add x21, x21, x19, LSL #1\n"
"add x20, x20, x19, LSL #1\n"
- "b 79f\n"
- "78:" // Height 6: setup direct input
- "mov x10, %x[input_ptr]\n"
- "add x28, x10, x19, LSL #1\n"
- "add x26, x28, x19, LSL #1\n"
- "add x24, x26, x19, LSL #1\n"
- "add x22, x24, x19, LSL #1\n"
- "add x20, x22, x19, LSL #1\n"
- "79:" // Height 6: input setup done
- "cmp x11, #0x8\n"
- "ble 81f\n"
- "80:" // Height 6: Multiply loop: Main loop head
- "ld1h { z6.h }, p5/Z, [x15]\n"
- "whilelt p0.h, XZR, x11\n"
- "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
- "sub x11, x11, #0x8\n"
- "ld1rqh { z0.h }, p0/Z, [x10]\n"
+ "b 73f\n"
+ "72:" // Height 6: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19, LSL #1\n"
+ "add x23, x24, x19, LSL #1\n"
+ "add x22, x23, x19, LSL #1\n"
+ "add x21, x22, x19, LSL #1\n"
+ "add x20, x21, x19, LSL #1\n"
+ "73:" // Height 6: input setup done
+ "cmp x26, #0x8\n"
+ "ble 75f\n"
+ "74:" // Height 6: Multiply loop: Main loop head
+ "ld1h { z6.h }, p5/Z, [x10]\n"
+ "whilelt p0.h, XZR, x26\n"
+ "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "sub x26, x26, #0x8\n"
+ "ld1rqh { z0.h }, p0/Z, [x25]\n"
".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n"
- "ld1rqh { z1.h }, p0/Z, [x28]\n"
- "add x10, x10, #0x10\n"
+ "ld1rqh { z1.h }, p0/Z, [x24]\n"
+ "add x25, x25, #0x10\n"
".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n"
- "ld1rqh { z2.h }, p0/Z, [x26]\n"
- "add x28, x28, #0x10\n"
+ "ld1rqh { z2.h }, p0/Z, [x23]\n"
+ "add x24, x24, #0x10\n"
".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n"
- "ld1rqh { z3.h }, p0/Z, [x24]\n"
- "add x26, x26, #0x10\n"
+ "ld1rqh { z3.h }, p0/Z, [x22]\n"
+ "add x23, x23, #0x10\n"
".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n"
- "ld1rqh { z4.h }, p0/Z, [x22]\n"
- "add x24, x24, #0x10\n"
+ "ld1rqh { z4.h }, p0/Z, [x21]\n"
+ "add x22, x22, #0x10\n"
".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n"
"ld1rqh { z5.h }, p0/Z, [x20]\n"
- "add x22, x22, #0x10\n"
+ "add x21, x21, #0x10\n"
".inst 0x646340d4 // bfdot z20.s, z6.h, z3.h[0]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
"add x20, x20, #0x10\n"
".inst 0x646440d8 // bfdot z24.s, z6.h, z4.h[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "cmp x11, #0x8\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "cmp x26, #0x8\n"
".inst 0x646540dc // bfdot z28.s, z6.h, z5.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
".inst 0x646340f5 // bfdot z21.s, z7.h, z3.h[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- ".inst 0x646440f9 // bfdot z25.s, z7.h, z4.h[0]\n"
"prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x646440f9 // bfdot z25.s, z7.h, z4.h[0]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
".inst 0x646540fd // bfdot z29.s, z7.h, z5.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n"
"prfm pldl1keep, [x20, #0x80]\n"
".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n"
@@ -1860,85 +1805,85 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
".inst 0x646340d6 // bfdot z22.s, z6.h, z3.h[0]\n"
".inst 0x646440da // bfdot z26.s, z6.h, z4.h[0]\n"
".inst 0x646540de // bfdot z30.s, z6.h, z5.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n"
".inst 0x646140ef // bfdot z15.s, z7.h, z1.h[0]\n"
".inst 0x646240f3 // bfdot z19.s, z7.h, z2.h[0]\n"
".inst 0x646340f7 // bfdot z23.s, z7.h, z3.h[0]\n"
".inst 0x646440fb // bfdot z27.s, z7.h, z4.h[0]\n"
".inst 0x646540ff // bfdot z31.s, z7.h, z5.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n"
".inst 0x646940cc // bfdot z12.s, z6.h, z1.h[1]\n"
".inst 0x646a40d0 // bfdot z16.s, z6.h, z2.h[1]\n"
".inst 0x646b40d4 // bfdot z20.s, z6.h, z3.h[1]\n"
".inst 0x646c40d8 // bfdot z24.s, z6.h, z4.h[1]\n"
".inst 0x646d40dc // bfdot z28.s, z6.h, z5.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n"
".inst 0x646940ed // bfdot z13.s, z7.h, z1.h[1]\n"
".inst 0x646a40f1 // bfdot z17.s, z7.h, z2.h[1]\n"
".inst 0x646b40f5 // bfdot z21.s, z7.h, z3.h[1]\n"
".inst 0x646c40f9 // bfdot z25.s, z7.h, z4.h[1]\n"
".inst 0x646d40fd // bfdot z29.s, z7.h, z5.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n"
- "addvl x15, x15, #16\n"
+ "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #16\n"
".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n"
".inst 0x646940ce // bfdot z14.s, z6.h, z1.h[1]\n"
".inst 0x646a40d2 // bfdot z18.s, z6.h, z2.h[1]\n"
".inst 0x646b40d6 // bfdot z22.s, z6.h, z3.h[1]\n"
".inst 0x646c40da // bfdot z26.s, z6.h, z4.h[1]\n"
".inst 0x646d40de // bfdot z30.s, z6.h, z5.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n"
".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n"
".inst 0x646940ef // bfdot z15.s, z7.h, z1.h[1]\n"
".inst 0x646a40f3 // bfdot z19.s, z7.h, z2.h[1]\n"
".inst 0x646b40f7 // bfdot z23.s, z7.h, z3.h[1]\n"
".inst 0x646c40fb // bfdot z27.s, z7.h, z4.h[1]\n"
".inst 0x646d40ff // bfdot z31.s, z7.h, z5.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n"
".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n"
".inst 0x647140cc // bfdot z12.s, z6.h, z1.h[2]\n"
".inst 0x647240d0 // bfdot z16.s, z6.h, z2.h[2]\n"
".inst 0x647340d4 // bfdot z20.s, z6.h, z3.h[2]\n"
".inst 0x647440d8 // bfdot z24.s, z6.h, z4.h[2]\n"
".inst 0x647540dc // bfdot z28.s, z6.h, z5.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n"
".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n"
".inst 0x647140ed // bfdot z13.s, z7.h, z1.h[2]\n"
".inst 0x647240f1 // bfdot z17.s, z7.h, z2.h[2]\n"
".inst 0x647340f5 // bfdot z21.s, z7.h, z3.h[2]\n"
".inst 0x647440f9 // bfdot z25.s, z7.h, z4.h[2]\n"
".inst 0x647540fd // bfdot z29.s, z7.h, z5.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n"
".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n"
".inst 0x647140ce // bfdot z14.s, z6.h, z1.h[2]\n"
".inst 0x647240d2 // bfdot z18.s, z6.h, z2.h[2]\n"
".inst 0x647340d6 // bfdot z22.s, z6.h, z3.h[2]\n"
".inst 0x647440da // bfdot z26.s, z6.h, z4.h[2]\n"
".inst 0x647540de // bfdot z30.s, z6.h, z5.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n"
".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n"
".inst 0x647140ef // bfdot z15.s, z7.h, z1.h[2]\n"
".inst 0x647240f3 // bfdot z19.s, z7.h, z2.h[2]\n"
".inst 0x647340f7 // bfdot z23.s, z7.h, z3.h[2]\n"
".inst 0x647440fb // bfdot z27.s, z7.h, z4.h[2]\n"
".inst 0x647540ff // bfdot z31.s, z7.h, z5.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n"
".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n"
".inst 0x647940cc // bfdot z12.s, z6.h, z1.h[3]\n"
".inst 0x647a40d0 // bfdot z16.s, z6.h, z2.h[3]\n"
".inst 0x647b40d4 // bfdot z20.s, z6.h, z3.h[3]\n"
".inst 0x647c40d8 // bfdot z24.s, z6.h, z4.h[3]\n"
".inst 0x647d40dc // bfdot z28.s, z6.h, z5.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n"
".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n"
".inst 0x647940ed // bfdot z13.s, z7.h, z1.h[3]\n"
".inst 0x647a40f1 // bfdot z17.s, z7.h, z2.h[3]\n"
".inst 0x647b40f5 // bfdot z21.s, z7.h, z3.h[3]\n"
".inst 0x647c40f9 // bfdot z25.s, z7.h, z4.h[3]\n"
".inst 0x647d40fd // bfdot z29.s, z7.h, z5.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n"
".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n"
".inst 0x647940ce // bfdot z14.s, z6.h, z1.h[3]\n"
".inst 0x647a40d2 // bfdot z18.s, z6.h, z2.h[3]\n"
@@ -1951,39 +1896,39 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
".inst 0x647b40f7 // bfdot z23.s, z7.h, z3.h[3]\n"
".inst 0x647c40fb // bfdot z27.s, z7.h, z4.h[3]\n"
".inst 0x647d40ff // bfdot z31.s, z7.h, z5.h[3]\n"
- "bgt 80b\n"
- "81:" // Height 6: Multiply loop: Single iteration only
- "ld1h { z6.h }, p5/Z, [x15]\n"
- "whilelt p0.h, XZR, x11\n"
- "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
- "subs x11, x11, #0x2\n"
- "ld1rqh { z0.h }, p0/Z, [x10]\n"
+ "bgt 74b\n"
+ "75:" // Height 6: Multiply loop: Single iteration only
+ "ld1h { z6.h }, p5/Z, [x10]\n"
+ "whilelt p0.h, XZR, x26\n"
+ "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "subs x26, x26, #0x2\n"
+ "ld1rqh { z0.h }, p0/Z, [x25]\n"
".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n"
- "ld1rqh { z1.h }, p0/Z, [x28]\n"
- "add x10, x10, #0x10\n"
+ "ld1rqh { z1.h }, p0/Z, [x24]\n"
+ "add x25, x25, #0x10\n"
".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n"
- "ld1rqh { z2.h }, p0/Z, [x26]\n"
- "add x28, x28, #0x10\n"
+ "ld1rqh { z2.h }, p0/Z, [x23]\n"
+ "add x24, x24, #0x10\n"
".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n"
- "ld1rqh { z3.h }, p0/Z, [x24]\n"
- "add x26, x26, #0x10\n"
+ "ld1rqh { z3.h }, p0/Z, [x22]\n"
+ "add x23, x23, #0x10\n"
".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n"
- "ld1rqh { z4.h }, p0/Z, [x22]\n"
- "add x24, x24, #0x10\n"
+ "ld1rqh { z4.h }, p0/Z, [x21]\n"
+ "add x22, x22, #0x10\n"
".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n"
"ld1rqh { z5.h }, p0/Z, [x20]\n"
- "add x22, x22, #0x10\n"
+ "add x21, x21, #0x10\n"
".inst 0x646340d4 // bfdot z20.s, z6.h, z3.h[0]\n"
"add x20, x20, #0x10\n"
".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n"
".inst 0x646440d8 // bfdot z24.s, z6.h, z4.h[0]\n"
".inst 0x646540dc // bfdot z28.s, z6.h, z5.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
".inst 0x646340f5 // bfdot z21.s, z7.h, z3.h[0]\n"
".inst 0x646440f9 // bfdot z25.s, z7.h, z4.h[0]\n"
".inst 0x646540fd // bfdot z29.s, z7.h, z5.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
+ "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n"
".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n"
".inst 0x646240d2 // bfdot z18.s, z6.h, z2.h[0]\n"
@@ -1996,25 +1941,25 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
".inst 0x646340f7 // bfdot z23.s, z7.h, z3.h[0]\n"
".inst 0x646440fb // bfdot z27.s, z7.h, z4.h[0]\n"
".inst 0x646540ff // bfdot z31.s, z7.h, z5.h[0]\n"
- "ble 82f\n"
- "ld1h { z6.h }, p5/Z, [x15]\n"
+ "ble 76f\n"
+ "ld1h { z6.h }, p5/Z, [x10]\n"
".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
- "subs x11, x11, #0x2\n"
+ "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "subs x26, x26, #0x2\n"
".inst 0x646940cc // bfdot z12.s, z6.h, z1.h[1]\n"
".inst 0x646a40d0 // bfdot z16.s, z6.h, z2.h[1]\n"
".inst 0x646b40d4 // bfdot z20.s, z6.h, z3.h[1]\n"
".inst 0x646c40d8 // bfdot z24.s, z6.h, z4.h[1]\n"
".inst 0x646d40dc // bfdot z28.s, z6.h, z5.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n"
".inst 0x646940ed // bfdot z13.s, z7.h, z1.h[1]\n"
".inst 0x646a40f1 // bfdot z17.s, z7.h, z2.h[1]\n"
".inst 0x646b40f5 // bfdot z21.s, z7.h, z3.h[1]\n"
".inst 0x646c40f9 // bfdot z25.s, z7.h, z4.h[1]\n"
".inst 0x646d40fd // bfdot z29.s, z7.h, z5.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
+ "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n"
".inst 0x646940ce // bfdot z14.s, z6.h, z1.h[1]\n"
".inst 0x646a40d2 // bfdot z18.s, z6.h, z2.h[1]\n"
@@ -2027,25 +1972,25 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
".inst 0x646b40f7 // bfdot z23.s, z7.h, z3.h[1]\n"
".inst 0x646c40fb // bfdot z27.s, z7.h, z4.h[1]\n"
".inst 0x646d40ff // bfdot z31.s, z7.h, z5.h[1]\n"
- "ble 82f\n"
- "ld1h { z6.h }, p5/Z, [x15]\n"
+ "ble 76f\n"
+ "ld1h { z6.h }, p5/Z, [x10]\n"
".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
- "subs x11, x11, #0x2\n"
+ "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "subs x26, x26, #0x2\n"
".inst 0x647140cc // bfdot z12.s, z6.h, z1.h[2]\n"
".inst 0x647240d0 // bfdot z16.s, z6.h, z2.h[2]\n"
".inst 0x647340d4 // bfdot z20.s, z6.h, z3.h[2]\n"
".inst 0x647440d8 // bfdot z24.s, z6.h, z4.h[2]\n"
".inst 0x647540dc // bfdot z28.s, z6.h, z5.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n"
".inst 0x647140ed // bfdot z13.s, z7.h, z1.h[2]\n"
".inst 0x647240f1 // bfdot z17.s, z7.h, z2.h[2]\n"
".inst 0x647340f5 // bfdot z21.s, z7.h, z3.h[2]\n"
".inst 0x647440f9 // bfdot z25.s, z7.h, z4.h[2]\n"
".inst 0x647540fd // bfdot z29.s, z7.h, z5.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
+ "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n"
".inst 0x647140ce // bfdot z14.s, z6.h, z1.h[2]\n"
".inst 0x647240d2 // bfdot z18.s, z6.h, z2.h[2]\n"
@@ -2058,24 +2003,24 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
".inst 0x647340f7 // bfdot z23.s, z7.h, z3.h[2]\n"
".inst 0x647440fb // bfdot z27.s, z7.h, z4.h[2]\n"
".inst 0x647540ff // bfdot z31.s, z7.h, z5.h[2]\n"
- "ble 82f\n"
- "ld1h { z6.h }, p5/Z, [x15]\n"
+ "ble 76f\n"
+ "ld1h { z6.h }, p5/Z, [x10]\n"
".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
".inst 0x647940cc // bfdot z12.s, z6.h, z1.h[3]\n"
".inst 0x647a40d0 // bfdot z16.s, z6.h, z2.h[3]\n"
".inst 0x647b40d4 // bfdot z20.s, z6.h, z3.h[3]\n"
".inst 0x647c40d8 // bfdot z24.s, z6.h, z4.h[3]\n"
".inst 0x647d40dc // bfdot z28.s, z6.h, z5.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n"
".inst 0x647940ed // bfdot z13.s, z7.h, z1.h[3]\n"
".inst 0x647a40f1 // bfdot z17.s, z7.h, z2.h[3]\n"
".inst 0x647b40f5 // bfdot z21.s, z7.h, z3.h[3]\n"
".inst 0x647c40f9 // bfdot z25.s, z7.h, z4.h[3]\n"
".inst 0x647d40fd // bfdot z29.s, z7.h, z5.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
+ "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n"
".inst 0x647940ce // bfdot z14.s, z6.h, z1.h[3]\n"
".inst 0x647a40d2 // bfdot z18.s, z6.h, z2.h[3]\n"
@@ -2088,18 +2033,24 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
".inst 0x647b40f7 // bfdot z23.s, z7.h, z3.h[3]\n"
".inst 0x647c40fb // bfdot z27.s, z7.h, z4.h[3]\n"
".inst 0x647d40ff // bfdot z31.s, z7.h, z5.h[3]\n"
- "82:" // Height 6: Multiply loop: multiply skip
- "prfm pldl1keep, [x10, #0x80]\n"
- "add x12, x12, #0x1\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "76:" // Height 6: Multiply loop: multiply skip
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x27, x27, #0x1\n"
"prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
"prfm pldl1keep, [x22, #0x80]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
"prfm pldl1keep, [x20, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x12, x19\n"
- "bne 77b\n"
- "tbz %x[flags], #1, 83f\n"
+ "cmp x27, x19\n"
+ "bne 71b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x28, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "add x21, x22, x19, LSL #2\n"
+ "add x20, x21, x19, LSL #2\n"
+ "tbz %x[flags], #1, 77f\n"
"add x19, %x[args_ptr], %[offset_min]\n"
"ld1rw { z1.s }, p5/Z, [x19]\n"
"add x19, %x[args_ptr], %[offset_max]\n"
@@ -2152,57 +2103,52 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"fmax z29.s, p5/M, z29.s, z1.s\n"
"fmax z30.s, p5/M, z30.s, z1.s\n"
"fmax z31.s, p5/M, z31.s, z1.s\n"
- "83:" // Height 6: No activation
- "st1w { z8.s }, p4, [x13]\n"
- "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
- "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
- "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
- "addvl x13, x13, #4\n"
- "st1w { z12.s }, p4, [x9]\n"
- "st1w { z13.s }, p3, [x9, #1, MUL VL]\n"
- "st1w { z14.s }, p2, [x9, #2, MUL VL]\n"
- "st1w { z15.s }, p1, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
- "st1w { z16.s }, p4, [x27]\n"
- "st1w { z17.s }, p3, [x27, #1, MUL VL]\n"
- "st1w { z18.s }, p2, [x27, #2, MUL VL]\n"
- "st1w { z19.s }, p1, [x27, #3, MUL VL]\n"
- "addvl x27, x27, #4\n"
- "st1w { z20.s }, p4, [x25]\n"
- "st1w { z21.s }, p3, [x25, #1, MUL VL]\n"
- "st1w { z22.s }, p2, [x25, #2, MUL VL]\n"
- "st1w { z23.s }, p1, [x25, #3, MUL VL]\n"
- "addvl x25, x25, #4\n"
- "st1w { z24.s }, p4, [x23]\n"
- "st1w { z25.s }, p3, [x23, #1, MUL VL]\n"
- "st1w { z26.s }, p2, [x23, #2, MUL VL]\n"
- "st1w { z27.s }, p1, [x23, #3, MUL VL]\n"
- "addvl x23, x23, #4\n"
- "st1w { z28.s }, p4, [x21]\n"
- "st1w { z29.s }, p3, [x21, #1, MUL VL]\n"
- "st1w { z30.s }, p2, [x21, #2, MUL VL]\n"
- "st1w { z31.s }, p1, [x21, #3, MUL VL]\n"
- "addvl x21, x21, #4\n"
- "84:" // Height 6: Writeback done
- "decw x16, ALL, MUL #4\n"
- "cmp x16, XZR\n"
- "bgt 73b\n"
+ "77:" // Height 6: No activation
+ "st1w { z8.s }, p4, [x28]\n"
+ "st1w { z9.s }, p3, [x28, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x28, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "st1w { z12.s }, p4, [x24]\n"
+ "st1w { z13.s }, p3, [x24, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x24, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x24, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x23]\n"
+ "st1w { z17.s }, p3, [x23, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x23, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x23, #3, MUL VL]\n"
+ "st1w { z20.s }, p4, [x22]\n"
+ "st1w { z21.s }, p3, [x22, #1, MUL VL]\n"
+ "st1w { z22.s }, p2, [x22, #2, MUL VL]\n"
+ "st1w { z23.s }, p1, [x22, #3, MUL VL]\n"
+ "st1w { z24.s }, p4, [x21]\n"
+ "st1w { z25.s }, p3, [x21, #1, MUL VL]\n"
+ "st1w { z26.s }, p2, [x21, #2, MUL VL]\n"
+ "st1w { z27.s }, p1, [x21, #3, MUL VL]\n"
+ "st1w { z28.s }, p4, [x20]\n"
+ "st1w { z29.s }, p3, [x20, #1, MUL VL]\n"
+ "st1w { z30.s }, p2, [x20, #2, MUL VL]\n"
+ "st1w { z31.s }, p1, [x20, #3, MUL VL]\n"
+ "78:" // Height 6: Writeback done
+ "decw x11, ALL, MUL #4\n"
+ "cmp x11, XZR\n"
+ "bgt 67b\n"
"subs %x[M], %x[M], #0x6\n"
- "beq 86f\n"
+ "beq 80f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "tbz %x[flags], #3, 85f\n"
+ "tbz %x[flags], #3, 79f\n"
"add x20, x20, #0x6\n"
"str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
"b 1b\n"
- "85:" // Update direct input
+ "79:" // Update direct input
"mov x19, #0xc\n"
"madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
"b 1b\n"
- "86:" // Exit
+ "80:" // Exit
: [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL.hpp
index aa74ce9a73..5c8563952f 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL.hpp
@@ -36,7 +36,6 @@
namespace arm_gemm
{
-
// Actual kernel implementations
void sve_hybrid_fp16_mla_6x4VL( ARGLIST );
@@ -73,7 +72,6 @@ public:
// Default to the generic kernel
kern_type kernel=sve_hybrid_fp16_mla_6x4VL;
-
cls_sve_hybrid_fp16_mla_6x4VL(const CPUInfo *)
{
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/generic.cpp
index bb42dc0e04..7cc03bbfb5 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/generic.cpp
@@ -27,6 +27,7 @@
#include "../../utils.hpp"
#include <cassert>
+#include <limits>
namespace arm_gemm {
@@ -94,241 +95,235 @@ void sve_hybrid_fp16_mla_6x4VL (
"ptrue p5.b\n"
"1:" // Row loop
"cmp %x[M], #0x6\n"
- "bge 71f\n"
+ "bge 66f\n"
"cmp %x[M], #0x4\n"
- "bgt 57f\n"
- "beq 43f\n"
+ "bgt 53f\n"
+ "beq 40f\n"
"cmp %x[M], #0x2\n"
- "bgt 29f\n"
- "beq 15f\n"
- "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x14, %x[bias]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "tbz %x[flags], #2, 2f\n"
- "ldr x13, [%x[output_ptr], #0x0]\n"
- "add x13, x13, x19, LSL #1\n"
- "b 3f\n"
- "2:" // Height 1: setup direct output
- "mov x13, %x[output_ptr]\n"
- "3:" // Height 1: Column loop
+ "bgt 27f\n"
+ "beq 14f\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[bias]\n"
+ "mov x28, %x[output_ptr]\n"
+ "2:" // Height 1: Column loop
"mov x19, #0x0\n"
- "whilelt p4.h, x19, x16\n"
+ "whilelt p4.h, x19, x11\n"
"inch x19\n"
- "whilelt p3.h, x19, x16\n"
+ "whilelt p3.h, x19, x11\n"
"inch x19\n"
- "whilelt p2.h, x19, x16\n"
+ "whilelt p2.h, x19, x11\n"
"inch x19\n"
- "whilelt p1.h, x19, x16\n"
- "cbz x14, 4f\n"
- "ld1h { z8.h }, p5/Z, [x14]\n"
- "ld1h { z9.h }, p5/Z, [x14, #1, MUL VL]\n"
- "ld1h { z10.h }, p5/Z, [x14, #2, MUL VL]\n"
- "ld1h { z11.h }, p5/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
- "b 6f\n"
- "4:" // Height 1: no bias
- "tbz %x[flags], #0, 5f\n"
- "ld1h { z8.h }, p4/Z, [x13]\n"
- "ld1h { z9.h }, p3/Z, [x13, #1, MUL VL]\n"
- "ld1h { z10.h }, p2/Z, [x13, #2, MUL VL]\n"
- "ld1h { z11.h }, p1/Z, [x13, #3, MUL VL]\n"
- "b 6f\n"
- "5:" // Height 1: no accumulate
+ "whilelt p1.h, x19, x11\n"
+ "cbz x9, 3f\n"
+ "ld1h { z8.h }, p5/Z, [x9]\n"
+ "ld1h { z9.h }, p5/Z, [x9, #1, MUL VL]\n"
+ "ld1h { z10.h }, p5/Z, [x9, #2, MUL VL]\n"
+ "ld1h { z11.h }, p5/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "b 5f\n"
+ "3:" // Height 1: no bias
+ "tbz %x[flags], #0, 4f\n"
+ "ld1h { z8.h }, p4/Z, [x28]\n"
+ "ld1h { z9.h }, p3/Z, [x28, #1, MUL VL]\n"
+ "ld1h { z10.h }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1h { z11.h }, p1/Z, [x28, #3, MUL VL]\n"
+ "b 5f\n"
+ "4:" // Height 1: no accumulate
"mov z8.b, #0x0\n"
"mov z9.b, #0x0\n"
"mov z10.b, #0x0\n"
"mov z11.b, #0x0\n"
- "6:" // Height 1: setup done
- "mov x12, #0x0\n"
- "7:" // Height 1: String loop
+ "5:" // Height 1: setup done
+ "mov x27, #0x0\n"
+ "6:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w11, [x20, x12, LSL #0x2]\n"
- "tbz %x[flags], #3, 8f\n"
- "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 7f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x10, [x20, #0x0]\n"
- "cbnz x12, 9f\n"
+ "ldr x25, [x20, #0x0]\n"
+ "cbnz x27, 8f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x10, x10, x19, LSL #1\n"
- "b 9f\n"
- "8:" // Height 1: setup direct input
- "mov x10, %x[input_ptr]\n"
- "9:" // Height 1: input setup done
- "cmp x11, #0x8\n"
- "ble 11f\n"
- "10:" // Height 1: Multiply loop: Main loop head
- "ld1h { z6.h }, p5/Z, [x15]\n"
- "whilelt p0.h, XZR, x11\n"
- "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
- "sub x11, x11, #0x8\n"
- "ld1rqh { z0.h }, p0/Z, [x10]\n"
+ "add x25, x25, x19, LSL #1\n"
+ "b 8f\n"
+ "7:" // Height 1: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "8:" // Height 1: input setup done
+ "cmp x26, #0x8\n"
+ "ble 10f\n"
+ "9:" // Height 1: Multiply loop: Main loop head
+ "ld1h { z6.h }, p5/Z, [x10]\n"
+ "whilelt p0.h, XZR, x26\n"
+ "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "sub x26, x26, #0x8\n"
+ "ld1rqh { z0.h }, p0/Z, [x25]\n"
"fmla z8.h, z6.h, z0.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
- "add x10, x10, #0x10\n"
+ "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "add x25, x25, #0x10\n"
"fmla z9.h, z7.h, z0.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
- "cmp x11, #0x8\n"
+ "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "cmp x26, #0x8\n"
"fmla z10.h, z6.h, z0.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
"fmla z11.h, z7.h, z0.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
"fmla z8.h, z6.h, z0.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
"fmla z9.h, z7.h, z0.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n"
- "addvl x15, x15, #16\n"
+ "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #16\n"
"fmla z10.h, z6.h, z0.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n"
"fmla z11.h, z7.h, z0.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n"
"fmla z8.h, z6.h, z0.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n"
"fmla z9.h, z7.h, z0.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n"
"fmla z10.h, z6.h, z0.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n"
"fmla z11.h, z7.h, z0.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n"
"fmla z8.h, z6.h, z0.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n"
"fmla z9.h, z7.h, z0.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n"
"fmla z10.h, z6.h, z0.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x15]\n"
+ "ld1h { z6.h }, p5/Z, [x10]\n"
"fmla z11.h, z7.h, z0.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
"fmla z8.h, z6.h, z0.h[4]\n"
- "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.h, z7.h, z0.h[4]\n"
- "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
"fmla z10.h, z6.h, z0.h[4]\n"
- "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
"fmla z11.h, z7.h, z0.h[4]\n"
- "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
"fmla z8.h, z6.h, z0.h[5]\n"
- "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
"fmla z9.h, z7.h, z0.h[5]\n"
- "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n"
- "addvl x15, x15, #16\n"
+ "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #16\n"
"fmla z10.h, z6.h, z0.h[5]\n"
- "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n"
"fmla z11.h, z7.h, z0.h[5]\n"
- "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n"
"fmla z8.h, z6.h, z0.h[6]\n"
- "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n"
"fmla z9.h, z7.h, z0.h[6]\n"
- "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n"
"fmla z10.h, z6.h, z0.h[6]\n"
- "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n"
"fmla z11.h, z7.h, z0.h[6]\n"
- "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n"
"fmla z8.h, z6.h, z0.h[7]\n"
- "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n"
"fmla z9.h, z7.h, z0.h[7]\n"
- "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n"
"fmla z10.h, z6.h, z0.h[7]\n"
"fmla z11.h, z7.h, z0.h[7]\n"
- "bgt 10b\n"
- "11:" // Height 1: Multiply loop: Single iteration only
- "ld1h { z6.h }, p5/Z, [x15]\n"
- "whilelt p0.h, XZR, x11\n"
- "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
- "subs x11, x11, #0x1\n"
- "ld1rqh { z0.h }, p0/Z, [x10]\n"
+ "bgt 9b\n"
+ "10:" // Height 1: Multiply loop: Single iteration only
+ "ld1h { z6.h }, p5/Z, [x10]\n"
+ "whilelt p0.h, XZR, x26\n"
+ "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "subs x26, x26, #0x1\n"
+ "ld1rqh { z0.h }, p0/Z, [x25]\n"
"fmla z8.h, z6.h, z0.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
- "add x10, x10, #0x10\n"
+ "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "add x25, x25, #0x10\n"
"fmla z9.h, z7.h, z0.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
+ "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
"fmla z10.h, z6.h, z0.h[0]\n"
"fmla z11.h, z7.h, z0.h[0]\n"
- "ble 12f\n"
- "ld1h { z6.h }, p5/Z, [x15]\n"
+ "ble 11f\n"
+ "ld1h { z6.h }, p5/Z, [x10]\n"
"fmla z8.h, z6.h, z0.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
- "subs x11, x11, #0x1\n"
+ "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "subs x26, x26, #0x1\n"
"fmla z9.h, z7.h, z0.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
- "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
"fmla z10.h, z6.h, z0.h[1]\n"
- "addvl x15, x15, #4\n"
+ "addvl x10, x10, #4\n"
"fmla z11.h, z7.h, z0.h[1]\n"
- "ble 12f\n"
- "ld1h { z6.h }, p5/Z, [x15]\n"
+ "ble 11f\n"
+ "ld1h { z6.h }, p5/Z, [x10]\n"
"fmla z8.h, z6.h, z0.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
- "subs x11, x11, #0x1\n"
+ "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "subs x26, x26, #0x1\n"
"fmla z9.h, z7.h, z0.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
- "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
"fmla z10.h, z6.h, z0.h[2]\n"
- "addvl x15, x15, #4\n"
+ "addvl x10, x10, #4\n"
"fmla z11.h, z7.h, z0.h[2]\n"
- "ble 12f\n"
- "ld1h { z6.h }, p5/Z, [x15]\n"
+ "ble 11f\n"
+ "ld1h { z6.h }, p5/Z, [x10]\n"
"fmla z8.h, z6.h, z0.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
- "subs x11, x11, #0x1\n"
+ "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "subs x26, x26, #0x1\n"
"fmla z9.h, z7.h, z0.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
- "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
"fmla z10.h, z6.h, z0.h[3]\n"
- "addvl x15, x15, #4\n"
+ "addvl x10, x10, #4\n"
"fmla z11.h, z7.h, z0.h[3]\n"
- "ble 12f\n"
- "ld1h { z6.h }, p5/Z, [x15]\n"
+ "ble 11f\n"
+ "ld1h { z6.h }, p5/Z, [x10]\n"
"fmla z8.h, z6.h, z0.h[4]\n"
- "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
- "subs x11, x11, #0x1\n"
+ "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "subs x26, x26, #0x1\n"
"fmla z9.h, z7.h, z0.h[4]\n"
- "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
- "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
"fmla z10.h, z6.h, z0.h[4]\n"
- "addvl x15, x15, #4\n"
+ "addvl x10, x10, #4\n"
"fmla z11.h, z7.h, z0.h[4]\n"
- "ble 12f\n"
- "ld1h { z6.h }, p5/Z, [x15]\n"
+ "ble 11f\n"
+ "ld1h { z6.h }, p5/Z, [x10]\n"
"fmla z8.h, z6.h, z0.h[5]\n"
- "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
- "subs x11, x11, #0x1\n"
+ "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "subs x26, x26, #0x1\n"
"fmla z9.h, z7.h, z0.h[5]\n"
- "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
- "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
"fmla z10.h, z6.h, z0.h[5]\n"
- "addvl x15, x15, #4\n"
+ "addvl x10, x10, #4\n"
"fmla z11.h, z7.h, z0.h[5]\n"
- "ble 12f\n"
- "ld1h { z6.h }, p5/Z, [x15]\n"
+ "ble 11f\n"
+ "ld1h { z6.h }, p5/Z, [x10]\n"
"fmla z8.h, z6.h, z0.h[6]\n"
- "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
- "subs x11, x11, #0x1\n"
+ "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "subs x26, x26, #0x1\n"
"fmla z9.h, z7.h, z0.h[6]\n"
- "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
- "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
"fmla z10.h, z6.h, z0.h[6]\n"
- "addvl x15, x15, #4\n"
+ "addvl x10, x10, #4\n"
"fmla z11.h, z7.h, z0.h[6]\n"
- "ble 12f\n"
- "ld1h { z6.h }, p5/Z, [x15]\n"
+ "ble 11f\n"
+ "ld1h { z6.h }, p5/Z, [x10]\n"
"fmla z8.h, z6.h, z0.h[7]\n"
- "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
- "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.h, z7.h, z0.h[7]\n"
- "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
+ "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
"fmla z10.h, z6.h, z0.h[7]\n"
"fmla z11.h, z7.h, z0.h[7]\n"
- "12:" // Height 1: Multiply loop: multiply skip
- "prfm pldl1keep, [x10, #0x80]\n"
- "add x12, x12, #0x1\n"
+ "11:" // Height 1: Multiply loop: multiply skip
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x27, x27, #0x1\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x12, x19\n"
- "bne 7b\n"
- "tbz %x[flags], #1, 13f\n"
+ "cmp x27, x19\n"
+ "bne 6b\n"
+ "tbz %x[flags], #1, 12f\n"
"add x19, %x[args_ptr], %[offset_min]\n"
"ld1rh { z1.h }, p5/Z, [x19]\n"
"add x19, %x[args_ptr], %[offset_max]\n"
@@ -341,63 +336,56 @@ void sve_hybrid_fp16_mla_6x4VL (
"fmax z9.h, p5/M, z9.h, z1.h\n"
"fmax z10.h, p5/M, z10.h, z1.h\n"
"fmax z11.h, p5/M, z11.h, z1.h\n"
- "13:" // Height 1: No activation
- "st1h { z8.h }, p4, [x13]\n"
- "st1h { z9.h }, p3, [x13, #1, MUL VL]\n"
- "st1h { z10.h }, p2, [x13, #2, MUL VL]\n"
- "st1h { z11.h }, p1, [x13, #3, MUL VL]\n"
- "addvl x13, x13, #4\n"
- "14:" // Height 1: Writeback done
- "dech x16, ALL, MUL #4\n"
- "cmp x16, XZR\n"
- "bgt 3b\n"
- "b 86f\n"
- "15:" // Height 2
- "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x14, %x[bias]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "tbz %x[flags], #2, 16f\n"
- "ldr x13, [%x[output_ptr], #0x0]\n"
- "add x13, x13, x19, LSL #1\n"
- "ldr x9, [%x[output_ptr], #0x8]\n"
- "add x9, x9, x19, LSL #1\n"
- "b 17f\n"
- "16:" // Height 2: setup direct output
- "mov x13, %x[output_ptr]\n"
- "add x9, x13, x19, LSL #1\n"
- "17:" // Height 2: Column loop
+ "12:" // Height 1: No activation
+ "st1h { z8.h }, p4, [x28]\n"
+ "st1h { z9.h }, p3, [x28, #1, MUL VL]\n"
+ "st1h { z10.h }, p2, [x28, #2, MUL VL]\n"
+ "st1h { z11.h }, p1, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "13:" // Height 1: Writeback done
+ "dech x11, ALL, MUL #4\n"
+ "cmp x11, XZR\n"
+ "bgt 2b\n"
+ "b 80f\n"
+ "14:" // Height 2
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x9, %x[bias]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "15:" // Height 2: Column loop
"mov x19, #0x0\n"
- "whilelt p4.h, x19, x16\n"
+ "whilelt p4.h, x19, x11\n"
"inch x19\n"
- "whilelt p3.h, x19, x16\n"
+ "whilelt p3.h, x19, x11\n"
"inch x19\n"
- "whilelt p2.h, x19, x16\n"
+ "whilelt p2.h, x19, x11\n"
"inch x19\n"
- "whilelt p1.h, x19, x16\n"
- "cbz x14, 18f\n"
- "ld1h { z8.h }, p5/Z, [x14]\n"
+ "whilelt p1.h, x19, x11\n"
+ "cbz x9, 16f\n"
+ "ld1h { z8.h }, p5/Z, [x9]\n"
"mov z12.d, z8.d\n"
- "ld1h { z9.h }, p5/Z, [x14, #1, MUL VL]\n"
- "ld1h { z10.h }, p5/Z, [x14, #2, MUL VL]\n"
+ "ld1h { z9.h }, p5/Z, [x9, #1, MUL VL]\n"
+ "ld1h { z10.h }, p5/Z, [x9, #2, MUL VL]\n"
"mov z13.d, z9.d\n"
- "ld1h { z11.h }, p5/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
+ "ld1h { z11.h }, p5/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
"mov z14.d, z10.d\n"
"mov z15.d, z11.d\n"
- "b 20f\n"
- "18:" // Height 2: no bias
- "tbz %x[flags], #0, 19f\n"
- "ld1h { z8.h }, p4/Z, [x13]\n"
- "ld1h { z9.h }, p3/Z, [x13, #1, MUL VL]\n"
- "ld1h { z10.h }, p2/Z, [x13, #2, MUL VL]\n"
- "ld1h { z11.h }, p1/Z, [x13, #3, MUL VL]\n"
- "ld1h { z12.h }, p4/Z, [x9]\n"
- "ld1h { z13.h }, p3/Z, [x9, #1, MUL VL]\n"
- "ld1h { z14.h }, p2/Z, [x9, #2, MUL VL]\n"
- "ld1h { z15.h }, p1/Z, [x9, #3, MUL VL]\n"
- "b 20f\n"
- "19:" // Height 2: no accumulate
+ "b 18f\n"
+ "16:" // Height 2: no bias
+ "tbz %x[flags], #0, 17f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ld1h { z8.h }, p4/Z, [x28]\n"
+ "add x24, x28, x19, LSL #1\n"
+ "ld1h { z9.h }, p3/Z, [x28, #1, MUL VL]\n"
+ "ld1h { z10.h }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1h { z11.h }, p1/Z, [x28, #3, MUL VL]\n"
+ "ld1h { z12.h }, p4/Z, [x24]\n"
+ "ld1h { z13.h }, p3/Z, [x24, #1, MUL VL]\n"
+ "ld1h { z14.h }, p2/Z, [x24, #2, MUL VL]\n"
+ "ld1h { z15.h }, p1/Z, [x24, #3, MUL VL]\n"
+ "b 18f\n"
+ "17:" // Height 2: no accumulate
"mov z8.b, #0x0\n"
"mov z9.b, #0x0\n"
"mov z10.b, #0x0\n"
@@ -406,269 +394,271 @@ void sve_hybrid_fp16_mla_6x4VL (
"mov z13.b, #0x0\n"
"mov z14.b, #0x0\n"
"mov z15.b, #0x0\n"
- "20:" // Height 2: setup done
- "mov x12, #0x0\n"
- "21:" // Height 2: String loop
+ "18:" // Height 2: setup done
+ "mov x27, #0x0\n"
+ "19:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w11, [x20, x12, LSL #0x2]\n"
- "tbz %x[flags], #3, 22f\n"
- "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 20f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x10, [x20, #0x0]\n"
- "ldr x28, [x20, #0x8]\n"
- "cbnz x12, 23f\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "cbnz x27, 21f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x10, x10, x19, LSL #1\n"
- "add x28, x28, x19, LSL #1\n"
- "b 23f\n"
- "22:" // Height 2: setup direct input
- "mov x10, %x[input_ptr]\n"
- "add x28, x10, x19, LSL #1\n"
- "23:" // Height 2: input setup done
- "cmp x11, #0x8\n"
- "ble 25f\n"
- "24:" // Height 2: Multiply loop: Main loop head
- "ld1h { z6.h }, p5/Z, [x15]\n"
- "whilelt p0.h, XZR, x11\n"
- "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
- "sub x11, x11, #0x8\n"
- "ld1rqh { z0.h }, p0/Z, [x10]\n"
+ "add x25, x25, x19, LSL #1\n"
+ "add x24, x24, x19, LSL #1\n"
+ "b 21f\n"
+ "20:" // Height 2: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19, LSL #1\n"
+ "21:" // Height 2: input setup done
+ "cmp x26, #0x8\n"
+ "ble 23f\n"
+ "22:" // Height 2: Multiply loop: Main loop head
+ "ld1h { z6.h }, p5/Z, [x10]\n"
+ "whilelt p0.h, XZR, x26\n"
+ "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "sub x26, x26, #0x8\n"
+ "ld1rqh { z0.h }, p0/Z, [x25]\n"
"fmla z8.h, z6.h, z0.h[0]\n"
- "ld1rqh { z1.h }, p0/Z, [x28]\n"
- "add x10, x10, #0x10\n"
+ "ld1rqh { z1.h }, p0/Z, [x24]\n"
+ "add x25, x25, #0x10\n"
"fmla z9.h, z7.h, z0.h[0]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
- "add x28, x28, #0x10\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x24, x24, #0x10\n"
"fmla z12.h, z6.h, z1.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
- "cmp x11, #0x8\n"
+ "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "cmp x26, #0x8\n"
"fmla z13.h, z7.h, z1.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
"fmla z10.h, z6.h, z0.h[0]\n"
"fmla z14.h, z6.h, z1.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
"fmla z11.h, z7.h, z0.h[0]\n"
"fmla z15.h, z7.h, z1.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
"fmla z8.h, z6.h, z0.h[1]\n"
"fmla z12.h, z6.h, z1.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
"fmla z9.h, z7.h, z0.h[1]\n"
"fmla z13.h, z7.h, z1.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n"
- "addvl x15, x15, #16\n"
+ "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #16\n"
"fmla z10.h, z6.h, z0.h[1]\n"
"fmla z14.h, z6.h, z1.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n"
"fmla z11.h, z7.h, z0.h[1]\n"
"fmla z15.h, z7.h, z1.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n"
"fmla z8.h, z6.h, z0.h[2]\n"
"fmla z12.h, z6.h, z1.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n"
"fmla z9.h, z7.h, z0.h[2]\n"
"fmla z13.h, z7.h, z1.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n"
"fmla z10.h, z6.h, z0.h[2]\n"
"fmla z14.h, z6.h, z1.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n"
"fmla z11.h, z7.h, z0.h[2]\n"
"fmla z15.h, z7.h, z1.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n"
"fmla z8.h, z6.h, z0.h[3]\n"
"fmla z12.h, z6.h, z1.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n"
"fmla z9.h, z7.h, z0.h[3]\n"
"fmla z13.h, z7.h, z1.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n"
"fmla z10.h, z6.h, z0.h[3]\n"
"fmla z14.h, z6.h, z1.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x15]\n"
+ "ld1h { z6.h }, p5/Z, [x10]\n"
"fmla z11.h, z7.h, z0.h[3]\n"
"fmla z15.h, z7.h, z1.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
"fmla z8.h, z6.h, z0.h[4]\n"
"fmla z12.h, z6.h, z1.h[4]\n"
- "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.h, z7.h, z0.h[4]\n"
"fmla z13.h, z7.h, z1.h[4]\n"
- "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
"fmla z10.h, z6.h, z0.h[4]\n"
"fmla z14.h, z6.h, z1.h[4]\n"
- "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
"fmla z11.h, z7.h, z0.h[4]\n"
"fmla z15.h, z7.h, z1.h[4]\n"
- "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
"fmla z8.h, z6.h, z0.h[5]\n"
"fmla z12.h, z6.h, z1.h[5]\n"
- "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
"fmla z9.h, z7.h, z0.h[5]\n"
"fmla z13.h, z7.h, z1.h[5]\n"
- "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n"
- "addvl x15, x15, #16\n"
+ "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #16\n"
"fmla z10.h, z6.h, z0.h[5]\n"
"fmla z14.h, z6.h, z1.h[5]\n"
- "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n"
"fmla z11.h, z7.h, z0.h[5]\n"
"fmla z15.h, z7.h, z1.h[5]\n"
- "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n"
"fmla z8.h, z6.h, z0.h[6]\n"
"fmla z12.h, z6.h, z1.h[6]\n"
- "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n"
"fmla z9.h, z7.h, z0.h[6]\n"
"fmla z13.h, z7.h, z1.h[6]\n"
- "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n"
"fmla z10.h, z6.h, z0.h[6]\n"
"fmla z14.h, z6.h, z1.h[6]\n"
- "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n"
"fmla z11.h, z7.h, z0.h[6]\n"
"fmla z15.h, z7.h, z1.h[6]\n"
- "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n"
"fmla z8.h, z6.h, z0.h[7]\n"
"fmla z12.h, z6.h, z1.h[7]\n"
- "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n"
"fmla z9.h, z7.h, z0.h[7]\n"
"fmla z13.h, z7.h, z1.h[7]\n"
- "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n"
"fmla z10.h, z6.h, z0.h[7]\n"
"fmla z14.h, z6.h, z1.h[7]\n"
"fmla z11.h, z7.h, z0.h[7]\n"
"fmla z15.h, z7.h, z1.h[7]\n"
- "bgt 24b\n"
- "25:" // Height 2: Multiply loop: Single iteration only
- "ld1h { z6.h }, p5/Z, [x15]\n"
- "whilelt p0.h, XZR, x11\n"
- "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
- "subs x11, x11, #0x1\n"
- "ld1rqh { z0.h }, p0/Z, [x10]\n"
+ "bgt 22b\n"
+ "23:" // Height 2: Multiply loop: Single iteration only
+ "ld1h { z6.h }, p5/Z, [x10]\n"
+ "whilelt p0.h, XZR, x26\n"
+ "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "subs x26, x26, #0x1\n"
+ "ld1rqh { z0.h }, p0/Z, [x25]\n"
"fmla z8.h, z6.h, z0.h[0]\n"
- "ld1rqh { z1.h }, p0/Z, [x28]\n"
- "add x10, x10, #0x10\n"
+ "ld1rqh { z1.h }, p0/Z, [x24]\n"
+ "add x25, x25, #0x10\n"
"fmla z9.h, z7.h, z0.h[0]\n"
- "add x28, x28, #0x10\n"
+ "add x24, x24, #0x10\n"
"fmla z12.h, z6.h, z1.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z13.h, z7.h, z1.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
+ "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
"fmla z10.h, z6.h, z0.h[0]\n"
"fmla z14.h, z6.h, z1.h[0]\n"
"fmla z11.h, z7.h, z0.h[0]\n"
"fmla z15.h, z7.h, z1.h[0]\n"
- "ble 26f\n"
- "ld1h { z6.h }, p5/Z, [x15]\n"
+ "ble 24f\n"
+ "ld1h { z6.h }, p5/Z, [x10]\n"
"fmla z8.h, z6.h, z0.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
- "subs x11, x11, #0x1\n"
+ "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "subs x26, x26, #0x1\n"
"fmla z12.h, z6.h, z1.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.h, z7.h, z0.h[1]\n"
"fmla z13.h, z7.h, z1.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
+ "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
"fmla z10.h, z6.h, z0.h[1]\n"
"fmla z14.h, z6.h, z1.h[1]\n"
"fmla z11.h, z7.h, z0.h[1]\n"
"fmla z15.h, z7.h, z1.h[1]\n"
- "ble 26f\n"
- "ld1h { z6.h }, p5/Z, [x15]\n"
+ "ble 24f\n"
+ "ld1h { z6.h }, p5/Z, [x10]\n"
"fmla z8.h, z6.h, z0.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
- "subs x11, x11, #0x1\n"
+ "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "subs x26, x26, #0x1\n"
"fmla z12.h, z6.h, z1.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.h, z7.h, z0.h[2]\n"
"fmla z13.h, z7.h, z1.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
+ "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
"fmla z10.h, z6.h, z0.h[2]\n"
"fmla z14.h, z6.h, z1.h[2]\n"
"fmla z11.h, z7.h, z0.h[2]\n"
"fmla z15.h, z7.h, z1.h[2]\n"
- "ble 26f\n"
- "ld1h { z6.h }, p5/Z, [x15]\n"
+ "ble 24f\n"
+ "ld1h { z6.h }, p5/Z, [x10]\n"
"fmla z8.h, z6.h, z0.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
- "subs x11, x11, #0x1\n"
+ "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "subs x26, x26, #0x1\n"
"fmla z12.h, z6.h, z1.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.h, z7.h, z0.h[3]\n"
"fmla z13.h, z7.h, z1.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
+ "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
"fmla z10.h, z6.h, z0.h[3]\n"
"fmla z14.h, z6.h, z1.h[3]\n"
"fmla z11.h, z7.h, z0.h[3]\n"
"fmla z15.h, z7.h, z1.h[3]\n"
- "ble 26f\n"
- "ld1h { z6.h }, p5/Z, [x15]\n"
+ "ble 24f\n"
+ "ld1h { z6.h }, p5/Z, [x10]\n"
"fmla z8.h, z6.h, z0.h[4]\n"
- "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
- "subs x11, x11, #0x1\n"
+ "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "subs x26, x26, #0x1\n"
"fmla z12.h, z6.h, z1.h[4]\n"
- "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.h, z7.h, z0.h[4]\n"
"fmla z13.h, z7.h, z1.h[4]\n"
- "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
+ "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
"fmla z10.h, z6.h, z0.h[4]\n"
"fmla z14.h, z6.h, z1.h[4]\n"
"fmla z11.h, z7.h, z0.h[4]\n"
"fmla z15.h, z7.h, z1.h[4]\n"
- "ble 26f\n"
- "ld1h { z6.h }, p5/Z, [x15]\n"
+ "ble 24f\n"
+ "ld1h { z6.h }, p5/Z, [x10]\n"
"fmla z8.h, z6.h, z0.h[5]\n"
- "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
- "subs x11, x11, #0x1\n"
+ "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "subs x26, x26, #0x1\n"
"fmla z12.h, z6.h, z1.h[5]\n"
- "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.h, z7.h, z0.h[5]\n"
"fmla z13.h, z7.h, z1.h[5]\n"
- "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
+ "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
"fmla z10.h, z6.h, z0.h[5]\n"
"fmla z14.h, z6.h, z1.h[5]\n"
"fmla z11.h, z7.h, z0.h[5]\n"
"fmla z15.h, z7.h, z1.h[5]\n"
- "ble 26f\n"
- "ld1h { z6.h }, p5/Z, [x15]\n"
+ "ble 24f\n"
+ "ld1h { z6.h }, p5/Z, [x10]\n"
"fmla z8.h, z6.h, z0.h[6]\n"
- "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
- "subs x11, x11, #0x1\n"
+ "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "subs x26, x26, #0x1\n"
"fmla z12.h, z6.h, z1.h[6]\n"
- "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.h, z7.h, z0.h[6]\n"
"fmla z13.h, z7.h, z1.h[6]\n"
- "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
+ "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
"fmla z10.h, z6.h, z0.h[6]\n"
"fmla z14.h, z6.h, z1.h[6]\n"
"fmla z11.h, z7.h, z0.h[6]\n"
"fmla z15.h, z7.h, z1.h[6]\n"
- "ble 26f\n"
- "ld1h { z6.h }, p5/Z, [x15]\n"
+ "ble 24f\n"
+ "ld1h { z6.h }, p5/Z, [x10]\n"
"fmla z8.h, z6.h, z0.h[7]\n"
- "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
"fmla z12.h, z6.h, z1.h[7]\n"
- "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.h, z7.h, z0.h[7]\n"
"fmla z13.h, z7.h, z1.h[7]\n"
- "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
+ "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
"fmla z10.h, z6.h, z0.h[7]\n"
"fmla z14.h, z6.h, z1.h[7]\n"
"fmla z11.h, z7.h, z0.h[7]\n"
"fmla z15.h, z7.h, z1.h[7]\n"
- "26:" // Height 2: Multiply loop: multiply skip
- "prfm pldl1keep, [x10, #0x80]\n"
- "add x12, x12, #0x1\n"
- "prfm pldl1keep, [x28, #0x80]\n"
+ "24:" // Height 2: Multiply loop: multiply skip
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x27, x27, #0x1\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x12, x19\n"
- "bne 21b\n"
- "tbz %x[flags], #1, 27f\n"
+ "cmp x27, x19\n"
+ "bne 19b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x28, x19, LSL #1\n"
+ "tbz %x[flags], #1, 25f\n"
"add x19, %x[args_ptr], %[offset_min]\n"
"ld1rh { z1.h }, p5/Z, [x19]\n"
"add x19, %x[args_ptr], %[offset_max]\n"
@@ -689,79 +679,69 @@ void sve_hybrid_fp16_mla_6x4VL (
"fmax z13.h, p5/M, z13.h, z1.h\n"
"fmax z14.h, p5/M, z14.h, z1.h\n"
"fmax z15.h, p5/M, z15.h, z1.h\n"
- "27:" // Height 2: No activation
- "st1h { z8.h }, p4, [x13]\n"
- "st1h { z9.h }, p3, [x13, #1, MUL VL]\n"
- "st1h { z10.h }, p2, [x13, #2, MUL VL]\n"
- "st1h { z11.h }, p1, [x13, #3, MUL VL]\n"
- "addvl x13, x13, #4\n"
- "st1h { z12.h }, p4, [x9]\n"
- "st1h { z13.h }, p3, [x9, #1, MUL VL]\n"
- "st1h { z14.h }, p2, [x9, #2, MUL VL]\n"
- "st1h { z15.h }, p1, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
- "28:" // Height 2: Writeback done
- "dech x16, ALL, MUL #4\n"
- "cmp x16, XZR\n"
- "bgt 17b\n"
- "b 86f\n"
- "29:" // Height 3
- "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x14, %x[bias]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "tbz %x[flags], #2, 30f\n"
- "ldr x13, [%x[output_ptr], #0x0]\n"
- "add x13, x13, x19, LSL #1\n"
- "ldr x9, [%x[output_ptr], #0x8]\n"
- "ldr x27, [%x[output_ptr], #0x10]\n"
- "add x9, x9, x19, LSL #1\n"
- "add x27, x27, x19, LSL #1\n"
- "b 31f\n"
- "30:" // Height 3: setup direct output
- "mov x13, %x[output_ptr]\n"
- "add x9, x13, x19, LSL #1\n"
- "add x27, x9, x19, LSL #1\n"
- "31:" // Height 3: Column loop
+ "25:" // Height 2: No activation
+ "st1h { z8.h }, p4, [x28]\n"
+ "st1h { z9.h }, p3, [x28, #1, MUL VL]\n"
+ "st1h { z10.h }, p2, [x28, #2, MUL VL]\n"
+ "st1h { z11.h }, p1, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "st1h { z12.h }, p4, [x24]\n"
+ "st1h { z13.h }, p3, [x24, #1, MUL VL]\n"
+ "st1h { z14.h }, p2, [x24, #2, MUL VL]\n"
+ "st1h { z15.h }, p1, [x24, #3, MUL VL]\n"
+ "26:" // Height 2: Writeback done
+ "dech x11, ALL, MUL #4\n"
+ "cmp x11, XZR\n"
+ "bgt 15b\n"
+ "b 80f\n"
+ "27:" // Height 3
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x9, %x[bias]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "28:" // Height 3: Column loop
"mov x19, #0x0\n"
- "whilelt p4.h, x19, x16\n"
+ "whilelt p4.h, x19, x11\n"
"inch x19\n"
- "whilelt p3.h, x19, x16\n"
+ "whilelt p3.h, x19, x11\n"
"inch x19\n"
- "whilelt p2.h, x19, x16\n"
+ "whilelt p2.h, x19, x11\n"
"inch x19\n"
- "whilelt p1.h, x19, x16\n"
- "cbz x14, 32f\n"
- "ld1h { z8.h }, p5/Z, [x14]\n"
+ "whilelt p1.h, x19, x11\n"
+ "cbz x9, 29f\n"
+ "ld1h { z8.h }, p5/Z, [x9]\n"
"mov z12.d, z8.d\n"
- "ld1h { z9.h }, p5/Z, [x14, #1, MUL VL]\n"
+ "ld1h { z9.h }, p5/Z, [x9, #1, MUL VL]\n"
"mov z16.d, z8.d\n"
- "ld1h { z10.h }, p5/Z, [x14, #2, MUL VL]\n"
- "ld1h { z11.h }, p5/Z, [x14, #3, MUL VL]\n"
+ "ld1h { z10.h }, p5/Z, [x9, #2, MUL VL]\n"
+ "ld1h { z11.h }, p5/Z, [x9, #3, MUL VL]\n"
"mov z13.d, z9.d\n"
- "addvl x14, x14, #4\n"
+ "addvl x9, x9, #4\n"
"mov z17.d, z9.d\n"
"mov z14.d, z10.d\n"
"mov z15.d, z11.d\n"
"mov z18.d, z10.d\n"
"mov z19.d, z11.d\n"
- "b 34f\n"
- "32:" // Height 3: no bias
- "tbz %x[flags], #0, 33f\n"
- "ld1h { z8.h }, p4/Z, [x13]\n"
- "ld1h { z9.h }, p3/Z, [x13, #1, MUL VL]\n"
- "ld1h { z10.h }, p2/Z, [x13, #2, MUL VL]\n"
- "ld1h { z11.h }, p1/Z, [x13, #3, MUL VL]\n"
- "ld1h { z12.h }, p4/Z, [x9]\n"
- "ld1h { z13.h }, p3/Z, [x9, #1, MUL VL]\n"
- "ld1h { z14.h }, p2/Z, [x9, #2, MUL VL]\n"
- "ld1h { z15.h }, p1/Z, [x9, #3, MUL VL]\n"
- "ld1h { z16.h }, p4/Z, [x27]\n"
- "ld1h { z17.h }, p3/Z, [x27, #1, MUL VL]\n"
- "ld1h { z18.h }, p2/Z, [x27, #2, MUL VL]\n"
- "ld1h { z19.h }, p1/Z, [x27, #3, MUL VL]\n"
- "b 34f\n"
- "33:" // Height 3: no accumulate
+ "b 31f\n"
+ "29:" // Height 3: no bias
+ "tbz %x[flags], #0, 30f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ld1h { z8.h }, p4/Z, [x28]\n"
+ "add x24, x28, x19, LSL #1\n"
+ "ld1h { z9.h }, p3/Z, [x28, #1, MUL VL]\n"
+ "ld1h { z10.h }, p2/Z, [x28, #2, MUL VL]\n"
+ "add x23, x24, x19, LSL #1\n"
+ "ld1h { z11.h }, p1/Z, [x28, #3, MUL VL]\n"
+ "ld1h { z12.h }, p4/Z, [x24]\n"
+ "ld1h { z13.h }, p3/Z, [x24, #1, MUL VL]\n"
+ "ld1h { z14.h }, p2/Z, [x24, #2, MUL VL]\n"
+ "ld1h { z15.h }, p1/Z, [x24, #3, MUL VL]\n"
+ "ld1h { z16.h }, p4/Z, [x23]\n"
+ "ld1h { z17.h }, p3/Z, [x23, #1, MUL VL]\n"
+ "ld1h { z18.h }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1h { z19.h }, p1/Z, [x23, #3, MUL VL]\n"
+ "b 31f\n"
+ "30:" // Height 3: no accumulate
"mov z8.b, #0x0\n"
"mov z9.b, #0x0\n"
"mov z10.b, #0x0\n"
@@ -774,342 +754,345 @@ void sve_hybrid_fp16_mla_6x4VL (
"mov z17.b, #0x0\n"
"mov z18.b, #0x0\n"
"mov z19.b, #0x0\n"
- "34:" // Height 3: setup done
- "mov x12, #0x0\n"
- "35:" // Height 3: String loop
+ "31:" // Height 3: setup done
+ "mov x27, #0x0\n"
+ "32:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w11, [x20, x12, LSL #0x2]\n"
- "tbz %x[flags], #3, 36f\n"
- "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 33f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x10, [x20, #0x0]\n"
- "ldr x28, [x20, #0x8]\n"
- "ldr x26, [x20, #0x10]\n"
- "cbnz x12, 37f\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "cbnz x27, 34f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x10, x10, x19, LSL #1\n"
- "add x28, x28, x19, LSL #1\n"
- "add x26, x26, x19, LSL #1\n"
- "b 37f\n"
- "36:" // Height 3: setup direct input
- "mov x10, %x[input_ptr]\n"
- "add x28, x10, x19, LSL #1\n"
- "add x26, x28, x19, LSL #1\n"
- "37:" // Height 3: input setup done
- "cmp x11, #0x8\n"
- "ble 39f\n"
- "38:" // Height 3: Multiply loop: Main loop head
- "ld1h { z6.h }, p5/Z, [x15]\n"
- "whilelt p0.h, XZR, x11\n"
- "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
- "sub x11, x11, #0x8\n"
- "ld1rqh { z0.h }, p0/Z, [x10]\n"
+ "add x25, x25, x19, LSL #1\n"
+ "add x24, x24, x19, LSL #1\n"
+ "add x23, x23, x19, LSL #1\n"
+ "b 34f\n"
+ "33:" // Height 3: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19, LSL #1\n"
+ "add x23, x24, x19, LSL #1\n"
+ "34:" // Height 3: input setup done
+ "cmp x26, #0x8\n"
+ "ble 36f\n"
+ "35:" // Height 3: Multiply loop: Main loop head
+ "ld1h { z6.h }, p5/Z, [x10]\n"
+ "whilelt p0.h, XZR, x26\n"
+ "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "sub x26, x26, #0x8\n"
+ "ld1rqh { z0.h }, p0/Z, [x25]\n"
"fmla z8.h, z6.h, z0.h[0]\n"
- "ld1rqh { z1.h }, p0/Z, [x28]\n"
- "add x10, x10, #0x10\n"
+ "ld1rqh { z1.h }, p0/Z, [x24]\n"
+ "add x25, x25, #0x10\n"
"fmla z9.h, z7.h, z0.h[0]\n"
- "ld1rqh { z2.h }, p0/Z, [x26]\n"
- "add x28, x28, #0x10\n"
+ "ld1rqh { z2.h }, p0/Z, [x23]\n"
+ "add x24, x24, #0x10\n"
"fmla z12.h, z6.h, z1.h[0]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
- "add x26, x26, #0x10\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x23, x23, #0x10\n"
"fmla z16.h, z6.h, z2.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
- "cmp x11, #0x8\n"
+ "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "cmp x26, #0x8\n"
"fmla z13.h, z7.h, z1.h[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
"fmla z17.h, z7.h, z2.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
"fmla z10.h, z6.h, z0.h[0]\n"
"fmla z14.h, z6.h, z1.h[0]\n"
"fmla z18.h, z6.h, z2.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
"fmla z11.h, z7.h, z0.h[0]\n"
"fmla z15.h, z7.h, z1.h[0]\n"
"fmla z19.h, z7.h, z2.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
"fmla z8.h, z6.h, z0.h[1]\n"
"fmla z12.h, z6.h, z1.h[1]\n"
"fmla z16.h, z6.h, z2.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
"fmla z9.h, z7.h, z0.h[1]\n"
"fmla z13.h, z7.h, z1.h[1]\n"
"fmla z17.h, z7.h, z2.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n"
- "addvl x15, x15, #16\n"
+ "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #16\n"
"fmla z10.h, z6.h, z0.h[1]\n"
"fmla z14.h, z6.h, z1.h[1]\n"
"fmla z18.h, z6.h, z2.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n"
"fmla z11.h, z7.h, z0.h[1]\n"
"fmla z15.h, z7.h, z1.h[1]\n"
"fmla z19.h, z7.h, z2.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n"
"fmla z8.h, z6.h, z0.h[2]\n"
"fmla z12.h, z6.h, z1.h[2]\n"
"fmla z16.h, z6.h, z2.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n"
"fmla z9.h, z7.h, z0.h[2]\n"
"fmla z13.h, z7.h, z1.h[2]\n"
"fmla z17.h, z7.h, z2.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n"
"fmla z10.h, z6.h, z0.h[2]\n"
"fmla z14.h, z6.h, z1.h[2]\n"
"fmla z18.h, z6.h, z2.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n"
"fmla z11.h, z7.h, z0.h[2]\n"
"fmla z15.h, z7.h, z1.h[2]\n"
"fmla z19.h, z7.h, z2.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n"
"fmla z8.h, z6.h, z0.h[3]\n"
"fmla z12.h, z6.h, z1.h[3]\n"
"fmla z16.h, z6.h, z2.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n"
"fmla z9.h, z7.h, z0.h[3]\n"
"fmla z13.h, z7.h, z1.h[3]\n"
"fmla z17.h, z7.h, z2.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n"
"fmla z10.h, z6.h, z0.h[3]\n"
"fmla z14.h, z6.h, z1.h[3]\n"
"fmla z18.h, z6.h, z2.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x15]\n"
+ "ld1h { z6.h }, p5/Z, [x10]\n"
"fmla z11.h, z7.h, z0.h[3]\n"
"fmla z15.h, z7.h, z1.h[3]\n"
"fmla z19.h, z7.h, z2.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
"fmla z8.h, z6.h, z0.h[4]\n"
"fmla z12.h, z6.h, z1.h[4]\n"
"fmla z16.h, z6.h, z2.h[4]\n"
- "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.h, z7.h, z0.h[4]\n"
"fmla z13.h, z7.h, z1.h[4]\n"
"fmla z17.h, z7.h, z2.h[4]\n"
- "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
"fmla z10.h, z6.h, z0.h[4]\n"
"fmla z14.h, z6.h, z1.h[4]\n"
"fmla z18.h, z6.h, z2.h[4]\n"
- "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
"fmla z11.h, z7.h, z0.h[4]\n"
"fmla z15.h, z7.h, z1.h[4]\n"
"fmla z19.h, z7.h, z2.h[4]\n"
- "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
"fmla z8.h, z6.h, z0.h[5]\n"
"fmla z12.h, z6.h, z1.h[5]\n"
"fmla z16.h, z6.h, z2.h[5]\n"
- "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
"fmla z9.h, z7.h, z0.h[5]\n"
"fmla z13.h, z7.h, z1.h[5]\n"
"fmla z17.h, z7.h, z2.h[5]\n"
- "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n"
- "addvl x15, x15, #16\n"
+ "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #16\n"
"fmla z10.h, z6.h, z0.h[5]\n"
"fmla z14.h, z6.h, z1.h[5]\n"
"fmla z18.h, z6.h, z2.h[5]\n"
- "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n"
"fmla z11.h, z7.h, z0.h[5]\n"
"fmla z15.h, z7.h, z1.h[5]\n"
"fmla z19.h, z7.h, z2.h[5]\n"
- "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n"
"fmla z8.h, z6.h, z0.h[6]\n"
"fmla z12.h, z6.h, z1.h[6]\n"
"fmla z16.h, z6.h, z2.h[6]\n"
- "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n"
"fmla z9.h, z7.h, z0.h[6]\n"
"fmla z13.h, z7.h, z1.h[6]\n"
"fmla z17.h, z7.h, z2.h[6]\n"
- "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n"
"fmla z10.h, z6.h, z0.h[6]\n"
"fmla z14.h, z6.h, z1.h[6]\n"
"fmla z18.h, z6.h, z2.h[6]\n"
- "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n"
"fmla z11.h, z7.h, z0.h[6]\n"
"fmla z15.h, z7.h, z1.h[6]\n"
"fmla z19.h, z7.h, z2.h[6]\n"
- "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n"
"fmla z8.h, z6.h, z0.h[7]\n"
"fmla z12.h, z6.h, z1.h[7]\n"
"fmla z16.h, z6.h, z2.h[7]\n"
- "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n"
"fmla z9.h, z7.h, z0.h[7]\n"
"fmla z13.h, z7.h, z1.h[7]\n"
"fmla z17.h, z7.h, z2.h[7]\n"
- "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n"
"fmla z10.h, z6.h, z0.h[7]\n"
"fmla z14.h, z6.h, z1.h[7]\n"
"fmla z18.h, z6.h, z2.h[7]\n"
"fmla z11.h, z7.h, z0.h[7]\n"
"fmla z15.h, z7.h, z1.h[7]\n"
"fmla z19.h, z7.h, z2.h[7]\n"
- "bgt 38b\n"
- "39:" // Height 3: Multiply loop: Single iteration only
- "ld1h { z6.h }, p5/Z, [x15]\n"
- "whilelt p0.h, XZR, x11\n"
- "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
- "subs x11, x11, #0x1\n"
- "ld1rqh { z0.h }, p0/Z, [x10]\n"
+ "bgt 35b\n"
+ "36:" // Height 3: Multiply loop: Single iteration only
+ "ld1h { z6.h }, p5/Z, [x10]\n"
+ "whilelt p0.h, XZR, x26\n"
+ "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "subs x26, x26, #0x1\n"
+ "ld1rqh { z0.h }, p0/Z, [x25]\n"
"fmla z8.h, z6.h, z0.h[0]\n"
- "ld1rqh { z1.h }, p0/Z, [x28]\n"
- "add x10, x10, #0x10\n"
+ "ld1rqh { z1.h }, p0/Z, [x24]\n"
+ "add x25, x25, #0x10\n"
"fmla z9.h, z7.h, z0.h[0]\n"
- "ld1rqh { z2.h }, p0/Z, [x26]\n"
- "add x28, x28, #0x10\n"
+ "ld1rqh { z2.h }, p0/Z, [x23]\n"
+ "add x24, x24, #0x10\n"
"fmla z12.h, z6.h, z1.h[0]\n"
- "add x26, x26, #0x10\n"
+ "add x23, x23, #0x10\n"
"fmla z13.h, z7.h, z1.h[0]\n"
"fmla z16.h, z6.h, z2.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z17.h, z7.h, z2.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
+ "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
"fmla z10.h, z6.h, z0.h[0]\n"
"fmla z14.h, z6.h, z1.h[0]\n"
"fmla z18.h, z6.h, z2.h[0]\n"
"fmla z11.h, z7.h, z0.h[0]\n"
"fmla z15.h, z7.h, z1.h[0]\n"
"fmla z19.h, z7.h, z2.h[0]\n"
- "ble 40f\n"
- "ld1h { z6.h }, p5/Z, [x15]\n"
+ "ble 37f\n"
+ "ld1h { z6.h }, p5/Z, [x10]\n"
"fmla z8.h, z6.h, z0.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
- "subs x11, x11, #0x1\n"
+ "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "subs x26, x26, #0x1\n"
"fmla z12.h, z6.h, z1.h[1]\n"
"fmla z16.h, z6.h, z2.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.h, z7.h, z0.h[1]\n"
"fmla z13.h, z7.h, z1.h[1]\n"
"fmla z17.h, z7.h, z2.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
+ "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
"fmla z10.h, z6.h, z0.h[1]\n"
"fmla z14.h, z6.h, z1.h[1]\n"
"fmla z18.h, z6.h, z2.h[1]\n"
"fmla z11.h, z7.h, z0.h[1]\n"
"fmla z15.h, z7.h, z1.h[1]\n"
"fmla z19.h, z7.h, z2.h[1]\n"
- "ble 40f\n"
- "ld1h { z6.h }, p5/Z, [x15]\n"
+ "ble 37f\n"
+ "ld1h { z6.h }, p5/Z, [x10]\n"
"fmla z8.h, z6.h, z0.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
- "subs x11, x11, #0x1\n"
+ "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "subs x26, x26, #0x1\n"
"fmla z12.h, z6.h, z1.h[2]\n"
"fmla z16.h, z6.h, z2.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.h, z7.h, z0.h[2]\n"
"fmla z13.h, z7.h, z1.h[2]\n"
"fmla z17.h, z7.h, z2.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
+ "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
"fmla z10.h, z6.h, z0.h[2]\n"
"fmla z14.h, z6.h, z1.h[2]\n"
"fmla z18.h, z6.h, z2.h[2]\n"
"fmla z11.h, z7.h, z0.h[2]\n"
"fmla z15.h, z7.h, z1.h[2]\n"
"fmla z19.h, z7.h, z2.h[2]\n"
- "ble 40f\n"
- "ld1h { z6.h }, p5/Z, [x15]\n"
+ "ble 37f\n"
+ "ld1h { z6.h }, p5/Z, [x10]\n"
"fmla z8.h, z6.h, z0.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
- "subs x11, x11, #0x1\n"
+ "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "subs x26, x26, #0x1\n"
"fmla z12.h, z6.h, z1.h[3]\n"
"fmla z16.h, z6.h, z2.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.h, z7.h, z0.h[3]\n"
"fmla z13.h, z7.h, z1.h[3]\n"
"fmla z17.h, z7.h, z2.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
+ "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
"fmla z10.h, z6.h, z0.h[3]\n"
"fmla z14.h, z6.h, z1.h[3]\n"
"fmla z18.h, z6.h, z2.h[3]\n"
"fmla z11.h, z7.h, z0.h[3]\n"
"fmla z15.h, z7.h, z1.h[3]\n"
"fmla z19.h, z7.h, z2.h[3]\n"
- "ble 40f\n"
- "ld1h { z6.h }, p5/Z, [x15]\n"
+ "ble 37f\n"
+ "ld1h { z6.h }, p5/Z, [x10]\n"
"fmla z8.h, z6.h, z0.h[4]\n"
- "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
- "subs x11, x11, #0x1\n"
+ "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "subs x26, x26, #0x1\n"
"fmla z12.h, z6.h, z1.h[4]\n"
"fmla z16.h, z6.h, z2.h[4]\n"
- "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.h, z7.h, z0.h[4]\n"
"fmla z13.h, z7.h, z1.h[4]\n"
"fmla z17.h, z7.h, z2.h[4]\n"
- "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
+ "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
"fmla z10.h, z6.h, z0.h[4]\n"
"fmla z14.h, z6.h, z1.h[4]\n"
"fmla z18.h, z6.h, z2.h[4]\n"
"fmla z11.h, z7.h, z0.h[4]\n"
"fmla z15.h, z7.h, z1.h[4]\n"
"fmla z19.h, z7.h, z2.h[4]\n"
- "ble 40f\n"
- "ld1h { z6.h }, p5/Z, [x15]\n"
+ "ble 37f\n"
+ "ld1h { z6.h }, p5/Z, [x10]\n"
"fmla z8.h, z6.h, z0.h[5]\n"
- "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
- "subs x11, x11, #0x1\n"
+ "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "subs x26, x26, #0x1\n"
"fmla z12.h, z6.h, z1.h[5]\n"
"fmla z16.h, z6.h, z2.h[5]\n"
- "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.h, z7.h, z0.h[5]\n"
"fmla z13.h, z7.h, z1.h[5]\n"
"fmla z17.h, z7.h, z2.h[5]\n"
- "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
+ "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
"fmla z10.h, z6.h, z0.h[5]\n"
"fmla z14.h, z6.h, z1.h[5]\n"
"fmla z18.h, z6.h, z2.h[5]\n"
"fmla z11.h, z7.h, z0.h[5]\n"
"fmla z15.h, z7.h, z1.h[5]\n"
"fmla z19.h, z7.h, z2.h[5]\n"
- "ble 40f\n"
- "ld1h { z6.h }, p5/Z, [x15]\n"
+ "ble 37f\n"
+ "ld1h { z6.h }, p5/Z, [x10]\n"
"fmla z8.h, z6.h, z0.h[6]\n"
- "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
- "subs x11, x11, #0x1\n"
+ "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "subs x26, x26, #0x1\n"
"fmla z12.h, z6.h, z1.h[6]\n"
"fmla z16.h, z6.h, z2.h[6]\n"
- "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.h, z7.h, z0.h[6]\n"
"fmla z13.h, z7.h, z1.h[6]\n"
"fmla z17.h, z7.h, z2.h[6]\n"
- "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
+ "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
"fmla z10.h, z6.h, z0.h[6]\n"
"fmla z14.h, z6.h, z1.h[6]\n"
"fmla z18.h, z6.h, z2.h[6]\n"
"fmla z11.h, z7.h, z0.h[6]\n"
"fmla z15.h, z7.h, z1.h[6]\n"
"fmla z19.h, z7.h, z2.h[6]\n"
- "ble 40f\n"
- "ld1h { z6.h }, p5/Z, [x15]\n"
+ "ble 37f\n"
+ "ld1h { z6.h }, p5/Z, [x10]\n"
"fmla z8.h, z6.h, z0.h[7]\n"
- "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
"fmla z12.h, z6.h, z1.h[7]\n"
"fmla z16.h, z6.h, z2.h[7]\n"
- "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.h, z7.h, z0.h[7]\n"
"fmla z13.h, z7.h, z1.h[7]\n"
"fmla z17.h, z7.h, z2.h[7]\n"
- "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
+ "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
"fmla z10.h, z6.h, z0.h[7]\n"
"fmla z14.h, z6.h, z1.h[7]\n"
"fmla z18.h, z6.h, z2.h[7]\n"
"fmla z11.h, z7.h, z0.h[7]\n"
"fmla z15.h, z7.h, z1.h[7]\n"
"fmla z19.h, z7.h, z2.h[7]\n"
- "40:" // Height 3: Multiply loop: multiply skip
- "prfm pldl1keep, [x10, #0x80]\n"
- "add x12, x12, #0x1\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "37:" // Height 3: Multiply loop: multiply skip
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x27, x27, #0x1\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x12, x19\n"
- "bne 35b\n"
- "tbz %x[flags], #1, 41f\n"
+ "cmp x27, x19\n"
+ "bne 32b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x28, x19, LSL #1\n"
+ "add x23, x24, x19, LSL #1\n"
+ "tbz %x[flags], #1, 38f\n"
"add x19, %x[args_ptr], %[offset_min]\n"
"ld1rh { z1.h }, p5/Z, [x19]\n"
"add x19, %x[args_ptr], %[offset_max]\n"
@@ -1138,65 +1121,48 @@ void sve_hybrid_fp16_mla_6x4VL (
"fmax z17.h, p5/M, z17.h, z1.h\n"
"fmax z18.h, p5/M, z18.h, z1.h\n"
"fmax z19.h, p5/M, z19.h, z1.h\n"
- "41:" // Height 3: No activation
- "st1h { z8.h }, p4, [x13]\n"
- "st1h { z9.h }, p3, [x13, #1, MUL VL]\n"
- "st1h { z10.h }, p2, [x13, #2, MUL VL]\n"
- "st1h { z11.h }, p1, [x13, #3, MUL VL]\n"
- "addvl x13, x13, #4\n"
- "st1h { z12.h }, p4, [x9]\n"
- "st1h { z13.h }, p3, [x9, #1, MUL VL]\n"
- "st1h { z14.h }, p2, [x9, #2, MUL VL]\n"
- "st1h { z15.h }, p1, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
- "st1h { z16.h }, p4, [x27]\n"
- "st1h { z17.h }, p3, [x27, #1, MUL VL]\n"
- "st1h { z18.h }, p2, [x27, #2, MUL VL]\n"
- "st1h { z19.h }, p1, [x27, #3, MUL VL]\n"
- "addvl x27, x27, #4\n"
- "42:" // Height 3: Writeback done
- "dech x16, ALL, MUL #4\n"
- "cmp x16, XZR\n"
- "bgt 31b\n"
- "b 86f\n"
- "43:" // Height 4
- "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x14, %x[bias]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "tbz %x[flags], #2, 44f\n"
- "ldr x13, [%x[output_ptr], #0x0]\n"
- "add x13, x13, x19, LSL #1\n"
- "ldr x9, [%x[output_ptr], #0x8]\n"
- "ldr x27, [%x[output_ptr], #0x10]\n"
- "add x9, x9, x19, LSL #1\n"
- "ldr x25, [%x[output_ptr], #0x18]\n"
- "add x27, x27, x19, LSL #1\n"
- "add x25, x25, x19, LSL #1\n"
- "b 45f\n"
- "44:" // Height 4: setup direct output
- "mov x13, %x[output_ptr]\n"
- "add x9, x13, x19, LSL #1\n"
- "add x27, x9, x19, LSL #1\n"
- "add x25, x27, x19, LSL #1\n"
- "45:" // Height 4: Column loop
+ "38:" // Height 3: No activation
+ "st1h { z8.h }, p4, [x28]\n"
+ "st1h { z9.h }, p3, [x28, #1, MUL VL]\n"
+ "st1h { z10.h }, p2, [x28, #2, MUL VL]\n"
+ "st1h { z11.h }, p1, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "st1h { z12.h }, p4, [x24]\n"
+ "st1h { z13.h }, p3, [x24, #1, MUL VL]\n"
+ "st1h { z14.h }, p2, [x24, #2, MUL VL]\n"
+ "st1h { z15.h }, p1, [x24, #3, MUL VL]\n"
+ "st1h { z16.h }, p4, [x23]\n"
+ "st1h { z17.h }, p3, [x23, #1, MUL VL]\n"
+ "st1h { z18.h }, p2, [x23, #2, MUL VL]\n"
+ "st1h { z19.h }, p1, [x23, #3, MUL VL]\n"
+ "39:" // Height 3: Writeback done
+ "dech x11, ALL, MUL #4\n"
+ "cmp x11, XZR\n"
+ "bgt 28b\n"
+ "b 80f\n"
+ "40:" // Height 4
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x9, %x[bias]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "41:" // Height 4: Column loop
"mov x19, #0x0\n"
- "whilelt p4.h, x19, x16\n"
+ "whilelt p4.h, x19, x11\n"
"inch x19\n"
- "whilelt p3.h, x19, x16\n"
+ "whilelt p3.h, x19, x11\n"
"inch x19\n"
- "whilelt p2.h, x19, x16\n"
+ "whilelt p2.h, x19, x11\n"
"inch x19\n"
- "whilelt p1.h, x19, x16\n"
- "cbz x14, 46f\n"
- "ld1h { z8.h }, p5/Z, [x14]\n"
+ "whilelt p1.h, x19, x11\n"
+ "cbz x9, 42f\n"
+ "ld1h { z8.h }, p5/Z, [x9]\n"
"mov z12.d, z8.d\n"
- "ld1h { z9.h }, p5/Z, [x14, #1, MUL VL]\n"
+ "ld1h { z9.h }, p5/Z, [x9, #1, MUL VL]\n"
"mov z16.d, z8.d\n"
- "ld1h { z10.h }, p5/Z, [x14, #2, MUL VL]\n"
+ "ld1h { z10.h }, p5/Z, [x9, #2, MUL VL]\n"
"mov z20.d, z8.d\n"
- "ld1h { z11.h }, p5/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
+ "ld1h { z11.h }, p5/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
"mov z13.d, z9.d\n"
"mov z17.d, z9.d\n"
"mov z14.d, z10.d\n"
@@ -1206,27 +1172,31 @@ void sve_hybrid_fp16_mla_6x4VL (
"mov z21.d, z9.d\n"
"mov z22.d, z10.d\n"
"mov z23.d, z11.d\n"
- "b 48f\n"
- "46:" // Height 4: no bias
- "tbz %x[flags], #0, 47f\n"
- "ld1h { z8.h }, p4/Z, [x13]\n"
- "ld1h { z9.h }, p3/Z, [x13, #1, MUL VL]\n"
- "ld1h { z10.h }, p2/Z, [x13, #2, MUL VL]\n"
- "ld1h { z11.h }, p1/Z, [x13, #3, MUL VL]\n"
- "ld1h { z12.h }, p4/Z, [x9]\n"
- "ld1h { z13.h }, p3/Z, [x9, #1, MUL VL]\n"
- "ld1h { z14.h }, p2/Z, [x9, #2, MUL VL]\n"
- "ld1h { z15.h }, p1/Z, [x9, #3, MUL VL]\n"
- "ld1h { z16.h }, p4/Z, [x27]\n"
- "ld1h { z17.h }, p3/Z, [x27, #1, MUL VL]\n"
- "ld1h { z18.h }, p2/Z, [x27, #2, MUL VL]\n"
- "ld1h { z19.h }, p1/Z, [x27, #3, MUL VL]\n"
- "ld1h { z20.h }, p4/Z, [x25]\n"
- "ld1h { z21.h }, p3/Z, [x25, #1, MUL VL]\n"
- "ld1h { z22.h }, p2/Z, [x25, #2, MUL VL]\n"
- "ld1h { z23.h }, p1/Z, [x25, #3, MUL VL]\n"
- "b 48f\n"
- "47:" // Height 4: no accumulate
+ "b 44f\n"
+ "42:" // Height 4: no bias
+ "tbz %x[flags], #0, 43f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ld1h { z8.h }, p4/Z, [x28]\n"
+ "add x24, x28, x19, LSL #1\n"
+ "ld1h { z9.h }, p3/Z, [x28, #1, MUL VL]\n"
+ "ld1h { z10.h }, p2/Z, [x28, #2, MUL VL]\n"
+ "add x23, x24, x19, LSL #1\n"
+ "ld1h { z11.h }, p1/Z, [x28, #3, MUL VL]\n"
+ "add x22, x23, x19, LSL #1\n"
+ "ld1h { z12.h }, p4/Z, [x24]\n"
+ "ld1h { z13.h }, p3/Z, [x24, #1, MUL VL]\n"
+ "ld1h { z14.h }, p2/Z, [x24, #2, MUL VL]\n"
+ "ld1h { z15.h }, p1/Z, [x24, #3, MUL VL]\n"
+ "ld1h { z16.h }, p4/Z, [x23]\n"
+ "ld1h { z17.h }, p3/Z, [x23, #1, MUL VL]\n"
+ "ld1h { z18.h }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1h { z19.h }, p1/Z, [x23, #3, MUL VL]\n"
+ "ld1h { z20.h }, p4/Z, [x22]\n"
+ "ld1h { z21.h }, p3/Z, [x22, #1, MUL VL]\n"
+ "ld1h { z22.h }, p2/Z, [x22, #2, MUL VL]\n"
+ "ld1h { z23.h }, p1/Z, [x22, #3, MUL VL]\n"
+ "b 44f\n"
+ "43:" // Height 4: no accumulate
"mov z8.b, #0x0\n"
"mov z9.b, #0x0\n"
"mov z10.b, #0x0\n"
@@ -1243,204 +1213,204 @@ void sve_hybrid_fp16_mla_6x4VL (
"mov z21.b, #0x0\n"
"mov z22.b, #0x0\n"
"mov z23.b, #0x0\n"
- "48:" // Height 4: setup done
- "mov x12, #0x0\n"
- "49:" // Height 4: String loop
+ "44:" // Height 4: setup done
+ "mov x27, #0x0\n"
+ "45:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w11, [x20, x12, LSL #0x2]\n"
- "tbz %x[flags], #3, 50f\n"
- "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 46f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x10, [x20, #0x0]\n"
- "ldr x28, [x20, #0x8]\n"
- "ldr x26, [x20, #0x10]\n"
- "ldr x24, [x20, #0x18]\n"
- "cbnz x12, 51f\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "ldr x22, [x20, #0x18]\n"
+ "cbnz x27, 47f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x10, x10, x19, LSL #1\n"
- "add x28, x28, x19, LSL #1\n"
- "add x26, x26, x19, LSL #1\n"
+ "add x25, x25, x19, LSL #1\n"
"add x24, x24, x19, LSL #1\n"
- "b 51f\n"
- "50:" // Height 4: setup direct input
- "mov x10, %x[input_ptr]\n"
- "add x28, x10, x19, LSL #1\n"
- "add x26, x28, x19, LSL #1\n"
- "add x24, x26, x19, LSL #1\n"
- "51:" // Height 4: input setup done
- "cmp x11, #0x8\n"
- "ble 53f\n"
- "52:" // Height 4: Multiply loop: Main loop head
- "ld1h { z6.h }, p5/Z, [x15]\n"
- "whilelt p0.h, XZR, x11\n"
- "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
- "sub x11, x11, #0x8\n"
- "ld1rqh { z0.h }, p0/Z, [x10]\n"
+ "add x23, x23, x19, LSL #1\n"
+ "add x22, x22, x19, LSL #1\n"
+ "b 47f\n"
+ "46:" // Height 4: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19, LSL #1\n"
+ "add x23, x24, x19, LSL #1\n"
+ "add x22, x23, x19, LSL #1\n"
+ "47:" // Height 4: input setup done
+ "cmp x26, #0x8\n"
+ "ble 49f\n"
+ "48:" // Height 4: Multiply loop: Main loop head
+ "ld1h { z6.h }, p5/Z, [x10]\n"
+ "whilelt p0.h, XZR, x26\n"
+ "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "sub x26, x26, #0x8\n"
+ "ld1rqh { z0.h }, p0/Z, [x25]\n"
"fmla z8.h, z6.h, z0.h[0]\n"
- "ld1rqh { z1.h }, p0/Z, [x28]\n"
- "add x10, x10, #0x10\n"
+ "ld1rqh { z1.h }, p0/Z, [x24]\n"
+ "add x25, x25, #0x10\n"
"fmla z9.h, z7.h, z0.h[0]\n"
- "ld1rqh { z2.h }, p0/Z, [x26]\n"
- "add x28, x28, #0x10\n"
+ "ld1rqh { z2.h }, p0/Z, [x23]\n"
+ "add x24, x24, #0x10\n"
"fmla z12.h, z6.h, z1.h[0]\n"
- "ld1rqh { z3.h }, p0/Z, [x24]\n"
- "add x26, x26, #0x10\n"
+ "ld1rqh { z3.h }, p0/Z, [x22]\n"
+ "add x23, x23, #0x10\n"
"fmla z16.h, z6.h, z2.h[0]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
- "add x24, x24, #0x10\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x22, x22, #0x10\n"
"fmla z13.h, z7.h, z1.h[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "cmp x11, #0x8\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "cmp x26, #0x8\n"
"fmla z20.h, z6.h, z3.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z17.h, z7.h, z2.h[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
"fmla z21.h, z7.h, z3.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
"fmla z10.h, z6.h, z0.h[0]\n"
"fmla z14.h, z6.h, z1.h[0]\n"
"fmla z18.h, z6.h, z2.h[0]\n"
"fmla z22.h, z6.h, z3.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
"fmla z11.h, z7.h, z0.h[0]\n"
"fmla z15.h, z7.h, z1.h[0]\n"
"fmla z19.h, z7.h, z2.h[0]\n"
"fmla z23.h, z7.h, z3.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
"fmla z8.h, z6.h, z0.h[1]\n"
"fmla z12.h, z6.h, z1.h[1]\n"
"fmla z16.h, z6.h, z2.h[1]\n"
"fmla z20.h, z6.h, z3.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
"fmla z9.h, z7.h, z0.h[1]\n"
"fmla z13.h, z7.h, z1.h[1]\n"
"fmla z17.h, z7.h, z2.h[1]\n"
"fmla z21.h, z7.h, z3.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n"
- "addvl x15, x15, #16\n"
+ "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #16\n"
"fmla z10.h, z6.h, z0.h[1]\n"
"fmla z14.h, z6.h, z1.h[1]\n"
"fmla z18.h, z6.h, z2.h[1]\n"
"fmla z22.h, z6.h, z3.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n"
"fmla z11.h, z7.h, z0.h[1]\n"
"fmla z15.h, z7.h, z1.h[1]\n"
"fmla z19.h, z7.h, z2.h[1]\n"
"fmla z23.h, z7.h, z3.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n"
"fmla z8.h, z6.h, z0.h[2]\n"
"fmla z12.h, z6.h, z1.h[2]\n"
"fmla z16.h, z6.h, z2.h[2]\n"
"fmla z20.h, z6.h, z3.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n"
"fmla z9.h, z7.h, z0.h[2]\n"
"fmla z13.h, z7.h, z1.h[2]\n"
"fmla z17.h, z7.h, z2.h[2]\n"
"fmla z21.h, z7.h, z3.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n"
"fmla z10.h, z6.h, z0.h[2]\n"
"fmla z14.h, z6.h, z1.h[2]\n"
"fmla z18.h, z6.h, z2.h[2]\n"
"fmla z22.h, z6.h, z3.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n"
"fmla z11.h, z7.h, z0.h[2]\n"
"fmla z15.h, z7.h, z1.h[2]\n"
"fmla z19.h, z7.h, z2.h[2]\n"
"fmla z23.h, z7.h, z3.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n"
"fmla z8.h, z6.h, z0.h[3]\n"
"fmla z12.h, z6.h, z1.h[3]\n"
"fmla z16.h, z6.h, z2.h[3]\n"
"fmla z20.h, z6.h, z3.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n"
"fmla z9.h, z7.h, z0.h[3]\n"
"fmla z13.h, z7.h, z1.h[3]\n"
"fmla z17.h, z7.h, z2.h[3]\n"
"fmla z21.h, z7.h, z3.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n"
"fmla z10.h, z6.h, z0.h[3]\n"
"fmla z14.h, z6.h, z1.h[3]\n"
"fmla z18.h, z6.h, z2.h[3]\n"
"fmla z22.h, z6.h, z3.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x15]\n"
+ "ld1h { z6.h }, p5/Z, [x10]\n"
"fmla z11.h, z7.h, z0.h[3]\n"
"fmla z15.h, z7.h, z1.h[3]\n"
"fmla z19.h, z7.h, z2.h[3]\n"
"fmla z23.h, z7.h, z3.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
"fmla z8.h, z6.h, z0.h[4]\n"
"fmla z12.h, z6.h, z1.h[4]\n"
"fmla z16.h, z6.h, z2.h[4]\n"
"fmla z20.h, z6.h, z3.h[4]\n"
- "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.h, z7.h, z0.h[4]\n"
"fmla z13.h, z7.h, z1.h[4]\n"
"fmla z17.h, z7.h, z2.h[4]\n"
"fmla z21.h, z7.h, z3.h[4]\n"
- "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
"fmla z10.h, z6.h, z0.h[4]\n"
"fmla z14.h, z6.h, z1.h[4]\n"
"fmla z18.h, z6.h, z2.h[4]\n"
"fmla z22.h, z6.h, z3.h[4]\n"
- "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
"fmla z11.h, z7.h, z0.h[4]\n"
"fmla z15.h, z7.h, z1.h[4]\n"
"fmla z19.h, z7.h, z2.h[4]\n"
"fmla z23.h, z7.h, z3.h[4]\n"
- "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
"fmla z8.h, z6.h, z0.h[5]\n"
"fmla z12.h, z6.h, z1.h[5]\n"
"fmla z16.h, z6.h, z2.h[5]\n"
"fmla z20.h, z6.h, z3.h[5]\n"
- "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
"fmla z9.h, z7.h, z0.h[5]\n"
"fmla z13.h, z7.h, z1.h[5]\n"
"fmla z17.h, z7.h, z2.h[5]\n"
"fmla z21.h, z7.h, z3.h[5]\n"
- "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n"
- "addvl x15, x15, #16\n"
+ "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #16\n"
"fmla z10.h, z6.h, z0.h[5]\n"
"fmla z14.h, z6.h, z1.h[5]\n"
"fmla z18.h, z6.h, z2.h[5]\n"
"fmla z22.h, z6.h, z3.h[5]\n"
- "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n"
"fmla z11.h, z7.h, z0.h[5]\n"
"fmla z15.h, z7.h, z1.h[5]\n"
"fmla z19.h, z7.h, z2.h[5]\n"
"fmla z23.h, z7.h, z3.h[5]\n"
- "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n"
"fmla z8.h, z6.h, z0.h[6]\n"
"fmla z12.h, z6.h, z1.h[6]\n"
"fmla z16.h, z6.h, z2.h[6]\n"
"fmla z20.h, z6.h, z3.h[6]\n"
- "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n"
"fmla z9.h, z7.h, z0.h[6]\n"
"fmla z13.h, z7.h, z1.h[6]\n"
"fmla z17.h, z7.h, z2.h[6]\n"
"fmla z21.h, z7.h, z3.h[6]\n"
- "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n"
"fmla z10.h, z6.h, z0.h[6]\n"
"fmla z14.h, z6.h, z1.h[6]\n"
"fmla z18.h, z6.h, z2.h[6]\n"
"fmla z22.h, z6.h, z3.h[6]\n"
- "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n"
"fmla z11.h, z7.h, z0.h[6]\n"
"fmla z15.h, z7.h, z1.h[6]\n"
"fmla z19.h, z7.h, z2.h[6]\n"
"fmla z23.h, z7.h, z3.h[6]\n"
- "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n"
"fmla z8.h, z6.h, z0.h[7]\n"
"fmla z12.h, z6.h, z1.h[7]\n"
"fmla z16.h, z6.h, z2.h[7]\n"
"fmla z20.h, z6.h, z3.h[7]\n"
- "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n"
"fmla z9.h, z7.h, z0.h[7]\n"
"fmla z13.h, z7.h, z1.h[7]\n"
"fmla z17.h, z7.h, z2.h[7]\n"
"fmla z21.h, z7.h, z3.h[7]\n"
- "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n"
"fmla z10.h, z6.h, z0.h[7]\n"
"fmla z14.h, z6.h, z1.h[7]\n"
"fmla z18.h, z6.h, z2.h[7]\n"
@@ -1449,31 +1419,31 @@ void sve_hybrid_fp16_mla_6x4VL (
"fmla z15.h, z7.h, z1.h[7]\n"
"fmla z19.h, z7.h, z2.h[7]\n"
"fmla z23.h, z7.h, z3.h[7]\n"
- "bgt 52b\n"
- "53:" // Height 4: Multiply loop: Single iteration only
- "ld1h { z6.h }, p5/Z, [x15]\n"
- "whilelt p0.h, XZR, x11\n"
- "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
- "subs x11, x11, #0x1\n"
- "ld1rqh { z0.h }, p0/Z, [x10]\n"
+ "bgt 48b\n"
+ "49:" // Height 4: Multiply loop: Single iteration only
+ "ld1h { z6.h }, p5/Z, [x10]\n"
+ "whilelt p0.h, XZR, x26\n"
+ "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "subs x26, x26, #0x1\n"
+ "ld1rqh { z0.h }, p0/Z, [x25]\n"
"fmla z8.h, z6.h, z0.h[0]\n"
- "ld1rqh { z1.h }, p0/Z, [x28]\n"
- "add x10, x10, #0x10\n"
+ "ld1rqh { z1.h }, p0/Z, [x24]\n"
+ "add x25, x25, #0x10\n"
"fmla z9.h, z7.h, z0.h[0]\n"
- "ld1rqh { z2.h }, p0/Z, [x26]\n"
- "add x28, x28, #0x10\n"
+ "ld1rqh { z2.h }, p0/Z, [x23]\n"
+ "add x24, x24, #0x10\n"
"fmla z12.h, z6.h, z1.h[0]\n"
- "ld1rqh { z3.h }, p0/Z, [x24]\n"
- "add x26, x26, #0x10\n"
+ "ld1rqh { z3.h }, p0/Z, [x22]\n"
+ "add x23, x23, #0x10\n"
"fmla z16.h, z6.h, z2.h[0]\n"
- "add x24, x24, #0x10\n"
+ "add x22, x22, #0x10\n"
"fmla z13.h, z7.h, z1.h[0]\n"
"fmla z17.h, z7.h, z2.h[0]\n"
"fmla z20.h, z6.h, z3.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z21.h, z7.h, z3.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
+ "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
"fmla z10.h, z6.h, z0.h[0]\n"
"fmla z14.h, z6.h, z1.h[0]\n"
"fmla z18.h, z6.h, z2.h[0]\n"
@@ -1482,21 +1452,21 @@ void sve_hybrid_fp16_mla_6x4VL (
"fmla z15.h, z7.h, z1.h[0]\n"
"fmla z19.h, z7.h, z2.h[0]\n"
"fmla z23.h, z7.h, z3.h[0]\n"
- "ble 54f\n"
- "ld1h { z6.h }, p5/Z, [x15]\n"
+ "ble 50f\n"
+ "ld1h { z6.h }, p5/Z, [x10]\n"
"fmla z8.h, z6.h, z0.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
- "subs x11, x11, #0x1\n"
+ "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "subs x26, x26, #0x1\n"
"fmla z12.h, z6.h, z1.h[1]\n"
"fmla z16.h, z6.h, z2.h[1]\n"
"fmla z20.h, z6.h, z3.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.h, z7.h, z0.h[1]\n"
"fmla z13.h, z7.h, z1.h[1]\n"
"fmla z17.h, z7.h, z2.h[1]\n"
"fmla z21.h, z7.h, z3.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
+ "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
"fmla z10.h, z6.h, z0.h[1]\n"
"fmla z14.h, z6.h, z1.h[1]\n"
"fmla z18.h, z6.h, z2.h[1]\n"
@@ -1505,21 +1475,21 @@ void sve_hybrid_fp16_mla_6x4VL (
"fmla z15.h, z7.h, z1.h[1]\n"
"fmla z19.h, z7.h, z2.h[1]\n"
"fmla z23.h, z7.h, z3.h[1]\n"
- "ble 54f\n"
- "ld1h { z6.h }, p5/Z, [x15]\n"
+ "ble 50f\n"
+ "ld1h { z6.h }, p5/Z, [x10]\n"
"fmla z8.h, z6.h, z0.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
- "subs x11, x11, #0x1\n"
+ "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "subs x26, x26, #0x1\n"
"fmla z12.h, z6.h, z1.h[2]\n"
"fmla z16.h, z6.h, z2.h[2]\n"
"fmla z20.h, z6.h, z3.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.h, z7.h, z0.h[2]\n"
"fmla z13.h, z7.h, z1.h[2]\n"
"fmla z17.h, z7.h, z2.h[2]\n"
"fmla z21.h, z7.h, z3.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
+ "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
"fmla z10.h, z6.h, z0.h[2]\n"
"fmla z14.h, z6.h, z1.h[2]\n"
"fmla z18.h, z6.h, z2.h[2]\n"
@@ -1528,21 +1498,21 @@ void sve_hybrid_fp16_mla_6x4VL (
"fmla z15.h, z7.h, z1.h[2]\n"
"fmla z19.h, z7.h, z2.h[2]\n"
"fmla z23.h, z7.h, z3.h[2]\n"
- "ble 54f\n"
- "ld1h { z6.h }, p5/Z, [x15]\n"
+ "ble 50f\n"
+ "ld1h { z6.h }, p5/Z, [x10]\n"
"fmla z8.h, z6.h, z0.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
- "subs x11, x11, #0x1\n"
+ "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "subs x26, x26, #0x1\n"
"fmla z12.h, z6.h, z1.h[3]\n"
"fmla z16.h, z6.h, z2.h[3]\n"
"fmla z20.h, z6.h, z3.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.h, z7.h, z0.h[3]\n"
"fmla z13.h, z7.h, z1.h[3]\n"
"fmla z17.h, z7.h, z2.h[3]\n"
"fmla z21.h, z7.h, z3.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
+ "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
"fmla z10.h, z6.h, z0.h[3]\n"
"fmla z14.h, z6.h, z1.h[3]\n"
"fmla z18.h, z6.h, z2.h[3]\n"
@@ -1551,21 +1521,21 @@ void sve_hybrid_fp16_mla_6x4VL (
"fmla z15.h, z7.h, z1.h[3]\n"
"fmla z19.h, z7.h, z2.h[3]\n"
"fmla z23.h, z7.h, z3.h[3]\n"
- "ble 54f\n"
- "ld1h { z6.h }, p5/Z, [x15]\n"
+ "ble 50f\n"
+ "ld1h { z6.h }, p5/Z, [x10]\n"
"fmla z8.h, z6.h, z0.h[4]\n"
- "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
- "subs x11, x11, #0x1\n"
+ "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "subs x26, x26, #0x1\n"
"fmla z12.h, z6.h, z1.h[4]\n"
"fmla z16.h, z6.h, z2.h[4]\n"
"fmla z20.h, z6.h, z3.h[4]\n"
- "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.h, z7.h, z0.h[4]\n"
"fmla z13.h, z7.h, z1.h[4]\n"
"fmla z17.h, z7.h, z2.h[4]\n"
"fmla z21.h, z7.h, z3.h[4]\n"
- "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
+ "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
"fmla z10.h, z6.h, z0.h[4]\n"
"fmla z14.h, z6.h, z1.h[4]\n"
"fmla z18.h, z6.h, z2.h[4]\n"
@@ -1574,21 +1544,21 @@ void sve_hybrid_fp16_mla_6x4VL (
"fmla z15.h, z7.h, z1.h[4]\n"
"fmla z19.h, z7.h, z2.h[4]\n"
"fmla z23.h, z7.h, z3.h[4]\n"
- "ble 54f\n"
- "ld1h { z6.h }, p5/Z, [x15]\n"
+ "ble 50f\n"
+ "ld1h { z6.h }, p5/Z, [x10]\n"
"fmla z8.h, z6.h, z0.h[5]\n"
- "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
- "subs x11, x11, #0x1\n"
+ "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "subs x26, x26, #0x1\n"
"fmla z12.h, z6.h, z1.h[5]\n"
"fmla z16.h, z6.h, z2.h[5]\n"
"fmla z20.h, z6.h, z3.h[5]\n"
- "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.h, z7.h, z0.h[5]\n"
"fmla z13.h, z7.h, z1.h[5]\n"
"fmla z17.h, z7.h, z2.h[5]\n"
"fmla z21.h, z7.h, z3.h[5]\n"
- "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
+ "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
"fmla z10.h, z6.h, z0.h[5]\n"
"fmla z14.h, z6.h, z1.h[5]\n"
"fmla z18.h, z6.h, z2.h[5]\n"
@@ -1597,21 +1567,21 @@ void sve_hybrid_fp16_mla_6x4VL (
"fmla z15.h, z7.h, z1.h[5]\n"
"fmla z19.h, z7.h, z2.h[5]\n"
"fmla z23.h, z7.h, z3.h[5]\n"
- "ble 54f\n"
- "ld1h { z6.h }, p5/Z, [x15]\n"
+ "ble 50f\n"
+ "ld1h { z6.h }, p5/Z, [x10]\n"
"fmla z8.h, z6.h, z0.h[6]\n"
- "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
- "subs x11, x11, #0x1\n"
+ "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "subs x26, x26, #0x1\n"
"fmla z12.h, z6.h, z1.h[6]\n"
"fmla z16.h, z6.h, z2.h[6]\n"
"fmla z20.h, z6.h, z3.h[6]\n"
- "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.h, z7.h, z0.h[6]\n"
"fmla z13.h, z7.h, z1.h[6]\n"
"fmla z17.h, z7.h, z2.h[6]\n"
"fmla z21.h, z7.h, z3.h[6]\n"
- "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
+ "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
"fmla z10.h, z6.h, z0.h[6]\n"
"fmla z14.h, z6.h, z1.h[6]\n"
"fmla z18.h, z6.h, z2.h[6]\n"
@@ -1620,20 +1590,20 @@ void sve_hybrid_fp16_mla_6x4VL (
"fmla z15.h, z7.h, z1.h[6]\n"
"fmla z19.h, z7.h, z2.h[6]\n"
"fmla z23.h, z7.h, z3.h[6]\n"
- "ble 54f\n"
- "ld1h { z6.h }, p5/Z, [x15]\n"
+ "ble 50f\n"
+ "ld1h { z6.h }, p5/Z, [x10]\n"
"fmla z8.h, z6.h, z0.h[7]\n"
- "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
"fmla z12.h, z6.h, z1.h[7]\n"
"fmla z16.h, z6.h, z2.h[7]\n"
"fmla z20.h, z6.h, z3.h[7]\n"
- "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.h, z7.h, z0.h[7]\n"
"fmla z13.h, z7.h, z1.h[7]\n"
"fmla z17.h, z7.h, z2.h[7]\n"
"fmla z21.h, z7.h, z3.h[7]\n"
- "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
+ "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
"fmla z10.h, z6.h, z0.h[7]\n"
"fmla z14.h, z6.h, z1.h[7]\n"
"fmla z18.h, z6.h, z2.h[7]\n"
@@ -1642,16 +1612,20 @@ void sve_hybrid_fp16_mla_6x4VL (
"fmla z15.h, z7.h, z1.h[7]\n"
"fmla z19.h, z7.h, z2.h[7]\n"
"fmla z23.h, z7.h, z3.h[7]\n"
- "54:" // Height 4: Multiply loop: multiply skip
- "prfm pldl1keep, [x10, #0x80]\n"
- "add x12, x12, #0x1\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "50:" // Height 4: Multiply loop: multiply skip
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x27, x27, #0x1\n"
"prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x12, x19\n"
- "bne 49b\n"
- "tbz %x[flags], #1, 55f\n"
+ "cmp x27, x19\n"
+ "bne 45b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x28, x19, LSL #1\n"
+ "add x23, x24, x19, LSL #1\n"
+ "add x22, x23, x19, LSL #1\n"
+ "tbz %x[flags], #1, 51f\n"
"add x19, %x[args_ptr], %[offset_min]\n"
"ld1rh { z1.h }, p5/Z, [x19]\n"
"add x19, %x[args_ptr], %[offset_max]\n"
@@ -1688,73 +1662,52 @@ void sve_hybrid_fp16_mla_6x4VL (
"fmax z21.h, p5/M, z21.h, z1.h\n"
"fmax z22.h, p5/M, z22.h, z1.h\n"
"fmax z23.h, p5/M, z23.h, z1.h\n"
- "55:" // Height 4: No activation
- "st1h { z8.h }, p4, [x13]\n"
- "st1h { z9.h }, p3, [x13, #1, MUL VL]\n"
- "st1h { z10.h }, p2, [x13, #2, MUL VL]\n"
- "st1h { z11.h }, p1, [x13, #3, MUL VL]\n"
- "addvl x13, x13, #4\n"
- "st1h { z12.h }, p4, [x9]\n"
- "st1h { z13.h }, p3, [x9, #1, MUL VL]\n"
- "st1h { z14.h }, p2, [x9, #2, MUL VL]\n"
- "st1h { z15.h }, p1, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
- "st1h { z16.h }, p4, [x27]\n"
- "st1h { z17.h }, p3, [x27, #1, MUL VL]\n"
- "st1h { z18.h }, p2, [x27, #2, MUL VL]\n"
- "st1h { z19.h }, p1, [x27, #3, MUL VL]\n"
- "addvl x27, x27, #4\n"
- "st1h { z20.h }, p4, [x25]\n"
- "st1h { z21.h }, p3, [x25, #1, MUL VL]\n"
- "st1h { z22.h }, p2, [x25, #2, MUL VL]\n"
- "st1h { z23.h }, p1, [x25, #3, MUL VL]\n"
- "addvl x25, x25, #4\n"
- "56:" // Height 4: Writeback done
- "dech x16, ALL, MUL #4\n"
- "cmp x16, XZR\n"
- "bgt 45b\n"
- "b 86f\n"
- "57:" // Height 5
- "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x14, %x[bias]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "tbz %x[flags], #2, 58f\n"
- "ldr x13, [%x[output_ptr], #0x0]\n"
- "add x13, x13, x19, LSL #1\n"
- "ldr x9, [%x[output_ptr], #0x8]\n"
- "ldr x27, [%x[output_ptr], #0x10]\n"
- "add x9, x9, x19, LSL #1\n"
- "ldr x25, [%x[output_ptr], #0x18]\n"
- "ldr x23, [%x[output_ptr], #0x20]\n"
- "add x27, x27, x19, LSL #1\n"
- "add x25, x25, x19, LSL #1\n"
- "add x23, x23, x19, LSL #1\n"
- "b 59f\n"
- "58:" // Height 5: setup direct output
- "mov x13, %x[output_ptr]\n"
- "add x9, x13, x19, LSL #1\n"
- "add x27, x9, x19, LSL #1\n"
- "add x25, x27, x19, LSL #1\n"
- "add x23, x25, x19, LSL #1\n"
- "59:" // Height 5: Column loop
+ "51:" // Height 4: No activation
+ "st1h { z8.h }, p4, [x28]\n"
+ "st1h { z9.h }, p3, [x28, #1, MUL VL]\n"
+ "st1h { z10.h }, p2, [x28, #2, MUL VL]\n"
+ "st1h { z11.h }, p1, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "st1h { z12.h }, p4, [x24]\n"
+ "st1h { z13.h }, p3, [x24, #1, MUL VL]\n"
+ "st1h { z14.h }, p2, [x24, #2, MUL VL]\n"
+ "st1h { z15.h }, p1, [x24, #3, MUL VL]\n"
+ "st1h { z16.h }, p4, [x23]\n"
+ "st1h { z17.h }, p3, [x23, #1, MUL VL]\n"
+ "st1h { z18.h }, p2, [x23, #2, MUL VL]\n"
+ "st1h { z19.h }, p1, [x23, #3, MUL VL]\n"
+ "st1h { z20.h }, p4, [x22]\n"
+ "st1h { z21.h }, p3, [x22, #1, MUL VL]\n"
+ "st1h { z22.h }, p2, [x22, #2, MUL VL]\n"
+ "st1h { z23.h }, p1, [x22, #3, MUL VL]\n"
+ "52:" // Height 4: Writeback done
+ "dech x11, ALL, MUL #4\n"
+ "cmp x11, XZR\n"
+ "bgt 41b\n"
+ "b 80f\n"
+ "53:" // Height 5
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x9, %x[bias]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "54:" // Height 5: Column loop
"mov x19, #0x0\n"
- "whilelt p4.h, x19, x16\n"
+ "whilelt p4.h, x19, x11\n"
"inch x19\n"
- "whilelt p3.h, x19, x16\n"
+ "whilelt p3.h, x19, x11\n"
"inch x19\n"
- "whilelt p2.h, x19, x16\n"
+ "whilelt p2.h, x19, x11\n"
"inch x19\n"
- "whilelt p1.h, x19, x16\n"
- "cbz x14, 60f\n"
- "ld1h { z8.h }, p5/Z, [x14]\n"
+ "whilelt p1.h, x19, x11\n"
+ "cbz x9, 55f\n"
+ "ld1h { z8.h }, p5/Z, [x9]\n"
"mov z12.d, z8.d\n"
- "ld1h { z9.h }, p5/Z, [x14, #1, MUL VL]\n"
+ "ld1h { z9.h }, p5/Z, [x9, #1, MUL VL]\n"
"mov z16.d, z8.d\n"
- "ld1h { z10.h }, p5/Z, [x14, #2, MUL VL]\n"
+ "ld1h { z10.h }, p5/Z, [x9, #2, MUL VL]\n"
"mov z20.d, z8.d\n"
- "ld1h { z11.h }, p5/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
+ "ld1h { z11.h }, p5/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
"mov z13.d, z9.d\n"
"mov z17.d, z9.d\n"
"mov z14.d, z10.d\n"
@@ -1768,31 +1721,36 @@ void sve_hybrid_fp16_mla_6x4VL (
"mov z25.d, z9.d\n"
"mov z26.d, z10.d\n"
"mov z27.d, z11.d\n"
- "b 62f\n"
- "60:" // Height 5: no bias
- "tbz %x[flags], #0, 61f\n"
- "ld1h { z8.h }, p4/Z, [x13]\n"
- "ld1h { z9.h }, p3/Z, [x13, #1, MUL VL]\n"
- "ld1h { z10.h }, p2/Z, [x13, #2, MUL VL]\n"
- "ld1h { z11.h }, p1/Z, [x13, #3, MUL VL]\n"
- "ld1h { z12.h }, p4/Z, [x9]\n"
- "ld1h { z13.h }, p3/Z, [x9, #1, MUL VL]\n"
- "ld1h { z14.h }, p2/Z, [x9, #2, MUL VL]\n"
- "ld1h { z15.h }, p1/Z, [x9, #3, MUL VL]\n"
- "ld1h { z16.h }, p4/Z, [x27]\n"
- "ld1h { z17.h }, p3/Z, [x27, #1, MUL VL]\n"
- "ld1h { z18.h }, p2/Z, [x27, #2, MUL VL]\n"
- "ld1h { z19.h }, p1/Z, [x27, #3, MUL VL]\n"
- "ld1h { z20.h }, p4/Z, [x25]\n"
- "ld1h { z21.h }, p3/Z, [x25, #1, MUL VL]\n"
- "ld1h { z22.h }, p2/Z, [x25, #2, MUL VL]\n"
- "ld1h { z23.h }, p1/Z, [x25, #3, MUL VL]\n"
- "ld1h { z24.h }, p4/Z, [x23]\n"
- "ld1h { z25.h }, p3/Z, [x23, #1, MUL VL]\n"
- "ld1h { z26.h }, p2/Z, [x23, #2, MUL VL]\n"
- "ld1h { z27.h }, p1/Z, [x23, #3, MUL VL]\n"
- "b 62f\n"
- "61:" // Height 5: no accumulate
+ "b 57f\n"
+ "55:" // Height 5: no bias
+ "tbz %x[flags], #0, 56f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ld1h { z8.h }, p4/Z, [x28]\n"
+ "add x24, x28, x19, LSL #1\n"
+ "ld1h { z9.h }, p3/Z, [x28, #1, MUL VL]\n"
+ "ld1h { z10.h }, p2/Z, [x28, #2, MUL VL]\n"
+ "add x23, x24, x19, LSL #1\n"
+ "ld1h { z11.h }, p1/Z, [x28, #3, MUL VL]\n"
+ "add x22, x23, x19, LSL #1\n"
+ "ld1h { z12.h }, p4/Z, [x24]\n"
+ "add x21, x22, x19, LSL #1\n"
+ "ld1h { z13.h }, p3/Z, [x24, #1, MUL VL]\n"
+ "ld1h { z14.h }, p2/Z, [x24, #2, MUL VL]\n"
+ "ld1h { z15.h }, p1/Z, [x24, #3, MUL VL]\n"
+ "ld1h { z16.h }, p4/Z, [x23]\n"
+ "ld1h { z17.h }, p3/Z, [x23, #1, MUL VL]\n"
+ "ld1h { z18.h }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1h { z19.h }, p1/Z, [x23, #3, MUL VL]\n"
+ "ld1h { z20.h }, p4/Z, [x22]\n"
+ "ld1h { z21.h }, p3/Z, [x22, #1, MUL VL]\n"
+ "ld1h { z22.h }, p2/Z, [x22, #2, MUL VL]\n"
+ "ld1h { z23.h }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1h { z24.h }, p4/Z, [x21]\n"
+ "ld1h { z25.h }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1h { z26.h }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1h { z27.h }, p1/Z, [x21, #3, MUL VL]\n"
+ "b 57f\n"
+ "56:" // Height 5: no accumulate
"mov z8.b, #0x0\n"
"mov z9.b, #0x0\n"
"mov z10.b, #0x0\n"
@@ -1813,240 +1771,240 @@ void sve_hybrid_fp16_mla_6x4VL (
"mov z25.b, #0x0\n"
"mov z26.b, #0x0\n"
"mov z27.b, #0x0\n"
- "62:" // Height 5: setup done
- "mov x12, #0x0\n"
- "63:" // Height 5: String loop
+ "57:" // Height 5: setup done
+ "mov x27, #0x0\n"
+ "58:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w11, [x20, x12, LSL #0x2]\n"
- "tbz %x[flags], #3, 64f\n"
- "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 59f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x10, [x20, #0x0]\n"
- "ldr x28, [x20, #0x8]\n"
- "ldr x26, [x20, #0x10]\n"
- "ldr x24, [x20, #0x18]\n"
- "ldr x22, [x20, #0x20]\n"
- "cbnz x12, 65f\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "ldr x22, [x20, #0x18]\n"
+ "ldr x21, [x20, #0x20]\n"
+ "cbnz x27, 60f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x10, x10, x19, LSL #1\n"
- "add x28, x28, x19, LSL #1\n"
- "add x26, x26, x19, LSL #1\n"
+ "add x25, x25, x19, LSL #1\n"
"add x24, x24, x19, LSL #1\n"
+ "add x23, x23, x19, LSL #1\n"
"add x22, x22, x19, LSL #1\n"
- "b 65f\n"
- "64:" // Height 5: setup direct input
- "mov x10, %x[input_ptr]\n"
- "add x28, x10, x19, LSL #1\n"
- "add x26, x28, x19, LSL #1\n"
- "add x24, x26, x19, LSL #1\n"
- "add x22, x24, x19, LSL #1\n"
- "65:" // Height 5: input setup done
- "cmp x11, #0x8\n"
- "ble 67f\n"
- "66:" // Height 5: Multiply loop: Main loop head
- "ld1h { z6.h }, p5/Z, [x15]\n"
- "whilelt p0.h, XZR, x11\n"
- "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
- "sub x11, x11, #0x8\n"
- "ld1rqh { z0.h }, p0/Z, [x10]\n"
+ "add x21, x21, x19, LSL #1\n"
+ "b 60f\n"
+ "59:" // Height 5: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19, LSL #1\n"
+ "add x23, x24, x19, LSL #1\n"
+ "add x22, x23, x19, LSL #1\n"
+ "add x21, x22, x19, LSL #1\n"
+ "60:" // Height 5: input setup done
+ "cmp x26, #0x8\n"
+ "ble 62f\n"
+ "61:" // Height 5: Multiply loop: Main loop head
+ "ld1h { z6.h }, p5/Z, [x10]\n"
+ "whilelt p0.h, XZR, x26\n"
+ "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "sub x26, x26, #0x8\n"
+ "ld1rqh { z0.h }, p0/Z, [x25]\n"
"fmla z8.h, z6.h, z0.h[0]\n"
- "ld1rqh { z1.h }, p0/Z, [x28]\n"
- "add x10, x10, #0x10\n"
+ "ld1rqh { z1.h }, p0/Z, [x24]\n"
+ "add x25, x25, #0x10\n"
"fmla z9.h, z7.h, z0.h[0]\n"
- "ld1rqh { z2.h }, p0/Z, [x26]\n"
- "add x28, x28, #0x10\n"
+ "ld1rqh { z2.h }, p0/Z, [x23]\n"
+ "add x24, x24, #0x10\n"
"fmla z12.h, z6.h, z1.h[0]\n"
- "ld1rqh { z3.h }, p0/Z, [x24]\n"
- "add x26, x26, #0x10\n"
+ "ld1rqh { z3.h }, p0/Z, [x22]\n"
+ "add x23, x23, #0x10\n"
"fmla z16.h, z6.h, z2.h[0]\n"
- "ld1rqh { z4.h }, p0/Z, [x22]\n"
- "add x24, x24, #0x10\n"
- "fmla z13.h, z7.h, z1.h[0]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "ld1rqh { z4.h }, p0/Z, [x21]\n"
"add x22, x22, #0x10\n"
+ "fmla z13.h, z7.h, z1.h[0]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x21, x21, #0x10\n"
"fmla z20.h, z6.h, z3.h[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "cmp x11, #0x8\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "cmp x26, #0x8\n"
"fmla z24.h, z6.h, z4.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z17.h, z7.h, z2.h[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
"fmla z21.h, z7.h, z3.h[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
"fmla z25.h, z7.h, z4.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
"fmla z10.h, z6.h, z0.h[0]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
"fmla z14.h, z6.h, z1.h[0]\n"
"fmla z18.h, z6.h, z2.h[0]\n"
"fmla z22.h, z6.h, z3.h[0]\n"
"fmla z26.h, z6.h, z4.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
"fmla z11.h, z7.h, z0.h[0]\n"
"fmla z15.h, z7.h, z1.h[0]\n"
"fmla z19.h, z7.h, z2.h[0]\n"
"fmla z23.h, z7.h, z3.h[0]\n"
"fmla z27.h, z7.h, z4.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
"fmla z8.h, z6.h, z0.h[1]\n"
"fmla z12.h, z6.h, z1.h[1]\n"
"fmla z16.h, z6.h, z2.h[1]\n"
"fmla z20.h, z6.h, z3.h[1]\n"
"fmla z24.h, z6.h, z4.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
"fmla z9.h, z7.h, z0.h[1]\n"
"fmla z13.h, z7.h, z1.h[1]\n"
"fmla z17.h, z7.h, z2.h[1]\n"
"fmla z21.h, z7.h, z3.h[1]\n"
"fmla z25.h, z7.h, z4.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n"
- "addvl x15, x15, #16\n"
+ "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #16\n"
"fmla z10.h, z6.h, z0.h[1]\n"
"fmla z14.h, z6.h, z1.h[1]\n"
"fmla z18.h, z6.h, z2.h[1]\n"
"fmla z22.h, z6.h, z3.h[1]\n"
"fmla z26.h, z6.h, z4.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n"
"fmla z11.h, z7.h, z0.h[1]\n"
"fmla z15.h, z7.h, z1.h[1]\n"
"fmla z19.h, z7.h, z2.h[1]\n"
"fmla z23.h, z7.h, z3.h[1]\n"
"fmla z27.h, z7.h, z4.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n"
"fmla z8.h, z6.h, z0.h[2]\n"
"fmla z12.h, z6.h, z1.h[2]\n"
"fmla z16.h, z6.h, z2.h[2]\n"
"fmla z20.h, z6.h, z3.h[2]\n"
"fmla z24.h, z6.h, z4.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n"
"fmla z9.h, z7.h, z0.h[2]\n"
"fmla z13.h, z7.h, z1.h[2]\n"
"fmla z17.h, z7.h, z2.h[2]\n"
"fmla z21.h, z7.h, z3.h[2]\n"
"fmla z25.h, z7.h, z4.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n"
"fmla z10.h, z6.h, z0.h[2]\n"
"fmla z14.h, z6.h, z1.h[2]\n"
"fmla z18.h, z6.h, z2.h[2]\n"
"fmla z22.h, z6.h, z3.h[2]\n"
"fmla z26.h, z6.h, z4.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n"
"fmla z11.h, z7.h, z0.h[2]\n"
"fmla z15.h, z7.h, z1.h[2]\n"
"fmla z19.h, z7.h, z2.h[2]\n"
"fmla z23.h, z7.h, z3.h[2]\n"
"fmla z27.h, z7.h, z4.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n"
"fmla z8.h, z6.h, z0.h[3]\n"
"fmla z12.h, z6.h, z1.h[3]\n"
"fmla z16.h, z6.h, z2.h[3]\n"
"fmla z20.h, z6.h, z3.h[3]\n"
"fmla z24.h, z6.h, z4.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n"
"fmla z9.h, z7.h, z0.h[3]\n"
"fmla z13.h, z7.h, z1.h[3]\n"
"fmla z17.h, z7.h, z2.h[3]\n"
"fmla z21.h, z7.h, z3.h[3]\n"
"fmla z25.h, z7.h, z4.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n"
"fmla z10.h, z6.h, z0.h[3]\n"
"fmla z14.h, z6.h, z1.h[3]\n"
"fmla z18.h, z6.h, z2.h[3]\n"
"fmla z22.h, z6.h, z3.h[3]\n"
"fmla z26.h, z6.h, z4.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x15]\n"
+ "ld1h { z6.h }, p5/Z, [x10]\n"
"fmla z11.h, z7.h, z0.h[3]\n"
"fmla z15.h, z7.h, z1.h[3]\n"
"fmla z19.h, z7.h, z2.h[3]\n"
"fmla z23.h, z7.h, z3.h[3]\n"
"fmla z27.h, z7.h, z4.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
"fmla z8.h, z6.h, z0.h[4]\n"
"fmla z12.h, z6.h, z1.h[4]\n"
"fmla z16.h, z6.h, z2.h[4]\n"
"fmla z20.h, z6.h, z3.h[4]\n"
"fmla z24.h, z6.h, z4.h[4]\n"
- "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.h, z7.h, z0.h[4]\n"
"fmla z13.h, z7.h, z1.h[4]\n"
"fmla z17.h, z7.h, z2.h[4]\n"
"fmla z21.h, z7.h, z3.h[4]\n"
"fmla z25.h, z7.h, z4.h[4]\n"
- "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
"fmla z10.h, z6.h, z0.h[4]\n"
"fmla z14.h, z6.h, z1.h[4]\n"
"fmla z18.h, z6.h, z2.h[4]\n"
"fmla z22.h, z6.h, z3.h[4]\n"
"fmla z26.h, z6.h, z4.h[4]\n"
- "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
"fmla z11.h, z7.h, z0.h[4]\n"
"fmla z15.h, z7.h, z1.h[4]\n"
"fmla z19.h, z7.h, z2.h[4]\n"
"fmla z23.h, z7.h, z3.h[4]\n"
"fmla z27.h, z7.h, z4.h[4]\n"
- "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
"fmla z8.h, z6.h, z0.h[5]\n"
"fmla z12.h, z6.h, z1.h[5]\n"
"fmla z16.h, z6.h, z2.h[5]\n"
"fmla z20.h, z6.h, z3.h[5]\n"
"fmla z24.h, z6.h, z4.h[5]\n"
- "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
"fmla z9.h, z7.h, z0.h[5]\n"
"fmla z13.h, z7.h, z1.h[5]\n"
"fmla z17.h, z7.h, z2.h[5]\n"
"fmla z21.h, z7.h, z3.h[5]\n"
"fmla z25.h, z7.h, z4.h[5]\n"
- "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n"
- "addvl x15, x15, #16\n"
+ "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #16\n"
"fmla z10.h, z6.h, z0.h[5]\n"
"fmla z14.h, z6.h, z1.h[5]\n"
"fmla z18.h, z6.h, z2.h[5]\n"
"fmla z22.h, z6.h, z3.h[5]\n"
"fmla z26.h, z6.h, z4.h[5]\n"
- "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n"
"fmla z11.h, z7.h, z0.h[5]\n"
"fmla z15.h, z7.h, z1.h[5]\n"
"fmla z19.h, z7.h, z2.h[5]\n"
"fmla z23.h, z7.h, z3.h[5]\n"
"fmla z27.h, z7.h, z4.h[5]\n"
- "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n"
"fmla z8.h, z6.h, z0.h[6]\n"
"fmla z12.h, z6.h, z1.h[6]\n"
"fmla z16.h, z6.h, z2.h[6]\n"
"fmla z20.h, z6.h, z3.h[6]\n"
"fmla z24.h, z6.h, z4.h[6]\n"
- "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n"
"fmla z9.h, z7.h, z0.h[6]\n"
"fmla z13.h, z7.h, z1.h[6]\n"
"fmla z17.h, z7.h, z2.h[6]\n"
"fmla z21.h, z7.h, z3.h[6]\n"
"fmla z25.h, z7.h, z4.h[6]\n"
- "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n"
"fmla z10.h, z6.h, z0.h[6]\n"
"fmla z14.h, z6.h, z1.h[6]\n"
"fmla z18.h, z6.h, z2.h[6]\n"
"fmla z22.h, z6.h, z3.h[6]\n"
"fmla z26.h, z6.h, z4.h[6]\n"
- "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n"
"fmla z11.h, z7.h, z0.h[6]\n"
"fmla z15.h, z7.h, z1.h[6]\n"
"fmla z19.h, z7.h, z2.h[6]\n"
"fmla z23.h, z7.h, z3.h[6]\n"
"fmla z27.h, z7.h, z4.h[6]\n"
- "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n"
"fmla z8.h, z6.h, z0.h[7]\n"
"fmla z12.h, z6.h, z1.h[7]\n"
"fmla z16.h, z6.h, z2.h[7]\n"
"fmla z20.h, z6.h, z3.h[7]\n"
"fmla z24.h, z6.h, z4.h[7]\n"
- "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n"
"fmla z9.h, z7.h, z0.h[7]\n"
"fmla z13.h, z7.h, z1.h[7]\n"
"fmla z17.h, z7.h, z2.h[7]\n"
"fmla z21.h, z7.h, z3.h[7]\n"
"fmla z25.h, z7.h, z4.h[7]\n"
- "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n"
"fmla z10.h, z6.h, z0.h[7]\n"
"fmla z14.h, z6.h, z1.h[7]\n"
"fmla z18.h, z6.h, z2.h[7]\n"
@@ -2057,35 +2015,35 @@ void sve_hybrid_fp16_mla_6x4VL (
"fmla z19.h, z7.h, z2.h[7]\n"
"fmla z23.h, z7.h, z3.h[7]\n"
"fmla z27.h, z7.h, z4.h[7]\n"
- "bgt 66b\n"
- "67:" // Height 5: Multiply loop: Single iteration only
- "ld1h { z6.h }, p5/Z, [x15]\n"
- "whilelt p0.h, XZR, x11\n"
- "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
- "subs x11, x11, #0x1\n"
- "ld1rqh { z0.h }, p0/Z, [x10]\n"
+ "bgt 61b\n"
+ "62:" // Height 5: Multiply loop: Single iteration only
+ "ld1h { z6.h }, p5/Z, [x10]\n"
+ "whilelt p0.h, XZR, x26\n"
+ "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "subs x26, x26, #0x1\n"
+ "ld1rqh { z0.h }, p0/Z, [x25]\n"
"fmla z8.h, z6.h, z0.h[0]\n"
- "ld1rqh { z1.h }, p0/Z, [x28]\n"
- "add x10, x10, #0x10\n"
+ "ld1rqh { z1.h }, p0/Z, [x24]\n"
+ "add x25, x25, #0x10\n"
"fmla z9.h, z7.h, z0.h[0]\n"
- "ld1rqh { z2.h }, p0/Z, [x26]\n"
- "add x28, x28, #0x10\n"
+ "ld1rqh { z2.h }, p0/Z, [x23]\n"
+ "add x24, x24, #0x10\n"
"fmla z12.h, z6.h, z1.h[0]\n"
- "ld1rqh { z3.h }, p0/Z, [x24]\n"
- "add x26, x26, #0x10\n"
+ "ld1rqh { z3.h }, p0/Z, [x22]\n"
+ "add x23, x23, #0x10\n"
"fmla z16.h, z6.h, z2.h[0]\n"
- "ld1rqh { z4.h }, p0/Z, [x22]\n"
- "add x24, x24, #0x10\n"
- "fmla z13.h, z7.h, z1.h[0]\n"
+ "ld1rqh { z4.h }, p0/Z, [x21]\n"
"add x22, x22, #0x10\n"
+ "fmla z13.h, z7.h, z1.h[0]\n"
+ "add x21, x21, #0x10\n"
"fmla z17.h, z7.h, z2.h[0]\n"
"fmla z20.h, z6.h, z3.h[0]\n"
"fmla z24.h, z6.h, z4.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z21.h, z7.h, z3.h[0]\n"
"fmla z25.h, z7.h, z4.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
+ "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
"fmla z10.h, z6.h, z0.h[0]\n"
"fmla z14.h, z6.h, z1.h[0]\n"
"fmla z18.h, z6.h, z2.h[0]\n"
@@ -2096,23 +2054,23 @@ void sve_hybrid_fp16_mla_6x4VL (
"fmla z19.h, z7.h, z2.h[0]\n"
"fmla z23.h, z7.h, z3.h[0]\n"
"fmla z27.h, z7.h, z4.h[0]\n"
- "ble 68f\n"
- "ld1h { z6.h }, p5/Z, [x15]\n"
+ "ble 63f\n"
+ "ld1h { z6.h }, p5/Z, [x10]\n"
"fmla z8.h, z6.h, z0.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
- "subs x11, x11, #0x1\n"
+ "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "subs x26, x26, #0x1\n"
"fmla z12.h, z6.h, z1.h[1]\n"
"fmla z16.h, z6.h, z2.h[1]\n"
"fmla z20.h, z6.h, z3.h[1]\n"
"fmla z24.h, z6.h, z4.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.h, z7.h, z0.h[1]\n"
"fmla z13.h, z7.h, z1.h[1]\n"
"fmla z17.h, z7.h, z2.h[1]\n"
"fmla z21.h, z7.h, z3.h[1]\n"
"fmla z25.h, z7.h, z4.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
+ "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
"fmla z10.h, z6.h, z0.h[1]\n"
"fmla z14.h, z6.h, z1.h[1]\n"
"fmla z18.h, z6.h, z2.h[1]\n"
@@ -2123,23 +2081,23 @@ void sve_hybrid_fp16_mla_6x4VL (
"fmla z19.h, z7.h, z2.h[1]\n"
"fmla z23.h, z7.h, z3.h[1]\n"
"fmla z27.h, z7.h, z4.h[1]\n"
- "ble 68f\n"
- "ld1h { z6.h }, p5/Z, [x15]\n"
+ "ble 63f\n"
+ "ld1h { z6.h }, p5/Z, [x10]\n"
"fmla z8.h, z6.h, z0.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
- "subs x11, x11, #0x1\n"
+ "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "subs x26, x26, #0x1\n"
"fmla z12.h, z6.h, z1.h[2]\n"
"fmla z16.h, z6.h, z2.h[2]\n"
"fmla z20.h, z6.h, z3.h[2]\n"
"fmla z24.h, z6.h, z4.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.h, z7.h, z0.h[2]\n"
"fmla z13.h, z7.h, z1.h[2]\n"
"fmla z17.h, z7.h, z2.h[2]\n"
"fmla z21.h, z7.h, z3.h[2]\n"
"fmla z25.h, z7.h, z4.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
+ "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
"fmla z10.h, z6.h, z0.h[2]\n"
"fmla z14.h, z6.h, z1.h[2]\n"
"fmla z18.h, z6.h, z2.h[2]\n"
@@ -2150,23 +2108,23 @@ void sve_hybrid_fp16_mla_6x4VL (
"fmla z19.h, z7.h, z2.h[2]\n"
"fmla z23.h, z7.h, z3.h[2]\n"
"fmla z27.h, z7.h, z4.h[2]\n"
- "ble 68f\n"
- "ld1h { z6.h }, p5/Z, [x15]\n"
+ "ble 63f\n"
+ "ld1h { z6.h }, p5/Z, [x10]\n"
"fmla z8.h, z6.h, z0.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
- "subs x11, x11, #0x1\n"
+ "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "subs x26, x26, #0x1\n"
"fmla z12.h, z6.h, z1.h[3]\n"
"fmla z16.h, z6.h, z2.h[3]\n"
"fmla z20.h, z6.h, z3.h[3]\n"
"fmla z24.h, z6.h, z4.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.h, z7.h, z0.h[3]\n"
"fmla z13.h, z7.h, z1.h[3]\n"
"fmla z17.h, z7.h, z2.h[3]\n"
"fmla z21.h, z7.h, z3.h[3]\n"
"fmla z25.h, z7.h, z4.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
+ "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
"fmla z10.h, z6.h, z0.h[3]\n"
"fmla z14.h, z6.h, z1.h[3]\n"
"fmla z18.h, z6.h, z2.h[3]\n"
@@ -2177,23 +2135,23 @@ void sve_hybrid_fp16_mla_6x4VL (
"fmla z19.h, z7.h, z2.h[3]\n"
"fmla z23.h, z7.h, z3.h[3]\n"
"fmla z27.h, z7.h, z4.h[3]\n"
- "ble 68f\n"
- "ld1h { z6.h }, p5/Z, [x15]\n"
+ "ble 63f\n"
+ "ld1h { z6.h }, p5/Z, [x10]\n"
"fmla z8.h, z6.h, z0.h[4]\n"
- "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
- "subs x11, x11, #0x1\n"
+ "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "subs x26, x26, #0x1\n"
"fmla z12.h, z6.h, z1.h[4]\n"
"fmla z16.h, z6.h, z2.h[4]\n"
"fmla z20.h, z6.h, z3.h[4]\n"
"fmla z24.h, z6.h, z4.h[4]\n"
- "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.h, z7.h, z0.h[4]\n"
"fmla z13.h, z7.h, z1.h[4]\n"
"fmla z17.h, z7.h, z2.h[4]\n"
"fmla z21.h, z7.h, z3.h[4]\n"
"fmla z25.h, z7.h, z4.h[4]\n"
- "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
+ "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
"fmla z10.h, z6.h, z0.h[4]\n"
"fmla z14.h, z6.h, z1.h[4]\n"
"fmla z18.h, z6.h, z2.h[4]\n"
@@ -2204,23 +2162,23 @@ void sve_hybrid_fp16_mla_6x4VL (
"fmla z19.h, z7.h, z2.h[4]\n"
"fmla z23.h, z7.h, z3.h[4]\n"
"fmla z27.h, z7.h, z4.h[4]\n"
- "ble 68f\n"
- "ld1h { z6.h }, p5/Z, [x15]\n"
+ "ble 63f\n"
+ "ld1h { z6.h }, p5/Z, [x10]\n"
"fmla z8.h, z6.h, z0.h[5]\n"
- "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
- "subs x11, x11, #0x1\n"
+ "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "subs x26, x26, #0x1\n"
"fmla z12.h, z6.h, z1.h[5]\n"
"fmla z16.h, z6.h, z2.h[5]\n"
"fmla z20.h, z6.h, z3.h[5]\n"
"fmla z24.h, z6.h, z4.h[5]\n"
- "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.h, z7.h, z0.h[5]\n"
"fmla z13.h, z7.h, z1.h[5]\n"
"fmla z17.h, z7.h, z2.h[5]\n"
"fmla z21.h, z7.h, z3.h[5]\n"
"fmla z25.h, z7.h, z4.h[5]\n"
- "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
+ "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
"fmla z10.h, z6.h, z0.h[5]\n"
"fmla z14.h, z6.h, z1.h[5]\n"
"fmla z18.h, z6.h, z2.h[5]\n"
@@ -2231,23 +2189,23 @@ void sve_hybrid_fp16_mla_6x4VL (
"fmla z19.h, z7.h, z2.h[5]\n"
"fmla z23.h, z7.h, z3.h[5]\n"
"fmla z27.h, z7.h, z4.h[5]\n"
- "ble 68f\n"
- "ld1h { z6.h }, p5/Z, [x15]\n"
+ "ble 63f\n"
+ "ld1h { z6.h }, p5/Z, [x10]\n"
"fmla z8.h, z6.h, z0.h[6]\n"
- "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
- "subs x11, x11, #0x1\n"
+ "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "subs x26, x26, #0x1\n"
"fmla z12.h, z6.h, z1.h[6]\n"
"fmla z16.h, z6.h, z2.h[6]\n"
"fmla z20.h, z6.h, z3.h[6]\n"
"fmla z24.h, z6.h, z4.h[6]\n"
- "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.h, z7.h, z0.h[6]\n"
"fmla z13.h, z7.h, z1.h[6]\n"
"fmla z17.h, z7.h, z2.h[6]\n"
"fmla z21.h, z7.h, z3.h[6]\n"
"fmla z25.h, z7.h, z4.h[6]\n"
- "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
+ "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
"fmla z10.h, z6.h, z0.h[6]\n"
"fmla z14.h, z6.h, z1.h[6]\n"
"fmla z18.h, z6.h, z2.h[6]\n"
@@ -2258,22 +2216,22 @@ void sve_hybrid_fp16_mla_6x4VL (
"fmla z19.h, z7.h, z2.h[6]\n"
"fmla z23.h, z7.h, z3.h[6]\n"
"fmla z27.h, z7.h, z4.h[6]\n"
- "ble 68f\n"
- "ld1h { z6.h }, p5/Z, [x15]\n"
+ "ble 63f\n"
+ "ld1h { z6.h }, p5/Z, [x10]\n"
"fmla z8.h, z6.h, z0.h[7]\n"
- "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
"fmla z12.h, z6.h, z1.h[7]\n"
"fmla z16.h, z6.h, z2.h[7]\n"
"fmla z20.h, z6.h, z3.h[7]\n"
"fmla z24.h, z6.h, z4.h[7]\n"
- "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.h, z7.h, z0.h[7]\n"
"fmla z13.h, z7.h, z1.h[7]\n"
"fmla z17.h, z7.h, z2.h[7]\n"
"fmla z21.h, z7.h, z3.h[7]\n"
"fmla z25.h, z7.h, z4.h[7]\n"
- "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
+ "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
"fmla z10.h, z6.h, z0.h[7]\n"
"fmla z14.h, z6.h, z1.h[7]\n"
"fmla z18.h, z6.h, z2.h[7]\n"
@@ -2284,17 +2242,22 @@ void sve_hybrid_fp16_mla_6x4VL (
"fmla z19.h, z7.h, z2.h[7]\n"
"fmla z23.h, z7.h, z3.h[7]\n"
"fmla z27.h, z7.h, z4.h[7]\n"
- "68:" // Height 5: Multiply loop: multiply skip
- "prfm pldl1keep, [x10, #0x80]\n"
- "add x12, x12, #0x1\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "63:" // Height 5: Multiply loop: multiply skip
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x27, x27, #0x1\n"
"prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
"prfm pldl1keep, [x22, #0x80]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x12, x19\n"
- "bne 63b\n"
- "tbz %x[flags], #1, 69f\n"
+ "cmp x27, x19\n"
+ "bne 58b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x28, x19, LSL #1\n"
+ "add x23, x24, x19, LSL #1\n"
+ "add x22, x23, x19, LSL #1\n"
+ "add x21, x22, x19, LSL #1\n"
+ "tbz %x[flags], #1, 64f\n"
"add x19, %x[args_ptr], %[offset_min]\n"
"ld1rh { z1.h }, p5/Z, [x19]\n"
"add x19, %x[args_ptr], %[offset_max]\n"
@@ -2339,83 +2302,59 @@ void sve_hybrid_fp16_mla_6x4VL (
"fmax z25.h, p5/M, z25.h, z1.h\n"
"fmax z26.h, p5/M, z26.h, z1.h\n"
"fmax z27.h, p5/M, z27.h, z1.h\n"
- "69:" // Height 5: No activation
- "st1h { z8.h }, p4, [x13]\n"
- "st1h { z9.h }, p3, [x13, #1, MUL VL]\n"
- "st1h { z10.h }, p2, [x13, #2, MUL VL]\n"
- "st1h { z11.h }, p1, [x13, #3, MUL VL]\n"
- "addvl x13, x13, #4\n"
- "st1h { z12.h }, p4, [x9]\n"
- "st1h { z13.h }, p3, [x9, #1, MUL VL]\n"
- "st1h { z14.h }, p2, [x9, #2, MUL VL]\n"
- "st1h { z15.h }, p1, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
- "st1h { z16.h }, p4, [x27]\n"
- "st1h { z17.h }, p3, [x27, #1, MUL VL]\n"
- "st1h { z18.h }, p2, [x27, #2, MUL VL]\n"
- "st1h { z19.h }, p1, [x27, #3, MUL VL]\n"
- "addvl x27, x27, #4\n"
- "st1h { z20.h }, p4, [x25]\n"
- "st1h { z21.h }, p3, [x25, #1, MUL VL]\n"
- "st1h { z22.h }, p2, [x25, #2, MUL VL]\n"
- "st1h { z23.h }, p1, [x25, #3, MUL VL]\n"
- "addvl x25, x25, #4\n"
- "st1h { z24.h }, p4, [x23]\n"
- "st1h { z25.h }, p3, [x23, #1, MUL VL]\n"
- "st1h { z26.h }, p2, [x23, #2, MUL VL]\n"
- "st1h { z27.h }, p1, [x23, #3, MUL VL]\n"
- "addvl x23, x23, #4\n"
- "70:" // Height 5: Writeback done
- "dech x16, ALL, MUL #4\n"
- "cmp x16, XZR\n"
- "bgt 59b\n"
- "b 86f\n"
- "71:" // Height 6
- "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x14, %x[bias]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "tbz %x[flags], #2, 72f\n"
- "ldr x13, [%x[output_ptr], #0x0]\n"
- "add x13, x13, x19, LSL #1\n"
- "ldr x9, [%x[output_ptr], #0x8]\n"
- "ldr x27, [%x[output_ptr], #0x10]\n"
- "add x9, x9, x19, LSL #1\n"
- "ldr x25, [%x[output_ptr], #0x18]\n"
- "ldr x23, [%x[output_ptr], #0x20]\n"
- "add x27, x27, x19, LSL #1\n"
- "ldr x21, [%x[output_ptr], #0x28]\n"
- "add %x[output_ptr], %x[output_ptr], #0x30\n"
- "add x25, x25, x19, LSL #1\n"
- "add x23, x23, x19, LSL #1\n"
- "add x21, x21, x19, LSL #1\n"
- "b 73f\n"
- "72:" // Height 6: setup direct output
- "mov x13, %x[output_ptr]\n"
- "add x9, x13, x19, LSL #1\n"
- "add x27, x9, x19, LSL #1\n"
- "add x25, x27, x19, LSL #1\n"
- "add x23, x25, x19, LSL #1\n"
- "add x21, x23, x19, LSL #1\n"
- "add %x[output_ptr], x21, x19, LSL #1\n"
- "73:" // Height 6: Column loop
+ "64:" // Height 5: No activation
+ "st1h { z8.h }, p4, [x28]\n"
+ "st1h { z9.h }, p3, [x28, #1, MUL VL]\n"
+ "st1h { z10.h }, p2, [x28, #2, MUL VL]\n"
+ "st1h { z11.h }, p1, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "st1h { z12.h }, p4, [x24]\n"
+ "st1h { z13.h }, p3, [x24, #1, MUL VL]\n"
+ "st1h { z14.h }, p2, [x24, #2, MUL VL]\n"
+ "st1h { z15.h }, p1, [x24, #3, MUL VL]\n"
+ "st1h { z16.h }, p4, [x23]\n"
+ "st1h { z17.h }, p3, [x23, #1, MUL VL]\n"
+ "st1h { z18.h }, p2, [x23, #2, MUL VL]\n"
+ "st1h { z19.h }, p1, [x23, #3, MUL VL]\n"
+ "st1h { z20.h }, p4, [x22]\n"
+ "st1h { z21.h }, p3, [x22, #1, MUL VL]\n"
+ "st1h { z22.h }, p2, [x22, #2, MUL VL]\n"
+ "st1h { z23.h }, p1, [x22, #3, MUL VL]\n"
+ "st1h { z24.h }, p4, [x21]\n"
+ "st1h { z25.h }, p3, [x21, #1, MUL VL]\n"
+ "st1h { z26.h }, p2, [x21, #2, MUL VL]\n"
+ "st1h { z27.h }, p1, [x21, #3, MUL VL]\n"
+ "65:" // Height 5: Writeback done
+ "dech x11, ALL, MUL #4\n"
+ "cmp x11, XZR\n"
+ "bgt 54b\n"
+ "b 80f\n"
+ "66:" // Height 6
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x9, %x[bias]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov x19, #0xc\n"
+ "madd %x[output_ptr], x20, x19, %x[output_ptr]\n"
+ "67:" // Height 6: Column loop
"mov x19, #0x0\n"
- "whilelt p4.h, x19, x16\n"
+ "whilelt p4.h, x19, x11\n"
"inch x19\n"
- "whilelt p3.h, x19, x16\n"
+ "whilelt p3.h, x19, x11\n"
"inch x19\n"
- "whilelt p2.h, x19, x16\n"
+ "whilelt p2.h, x19, x11\n"
"inch x19\n"
- "whilelt p1.h, x19, x16\n"
- "cbz x14, 74f\n"
- "ld1h { z8.h }, p5/Z, [x14]\n"
+ "whilelt p1.h, x19, x11\n"
+ "cbz x9, 68f\n"
+ "ld1h { z8.h }, p5/Z, [x9]\n"
"mov z12.d, z8.d\n"
- "ld1h { z9.h }, p5/Z, [x14, #1, MUL VL]\n"
+ "ld1h { z9.h }, p5/Z, [x9, #1, MUL VL]\n"
"mov z16.d, z8.d\n"
- "ld1h { z10.h }, p5/Z, [x14, #2, MUL VL]\n"
+ "ld1h { z10.h }, p5/Z, [x9, #2, MUL VL]\n"
"mov z20.d, z8.d\n"
- "ld1h { z11.h }, p5/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
+ "ld1h { z11.h }, p5/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
"mov z13.d, z9.d\n"
"mov z17.d, z9.d\n"
"mov z14.d, z10.d\n"
@@ -2433,35 +2372,41 @@ void sve_hybrid_fp16_mla_6x4VL (
"mov z29.d, z9.d\n"
"mov z30.d, z10.d\n"
"mov z31.d, z11.d\n"
- "b 76f\n"
- "74:" // Height 6: no bias
- "tbz %x[flags], #0, 75f\n"
- "ld1h { z8.h }, p4/Z, [x13]\n"
- "ld1h { z9.h }, p3/Z, [x13, #1, MUL VL]\n"
- "ld1h { z10.h }, p2/Z, [x13, #2, MUL VL]\n"
- "ld1h { z11.h }, p1/Z, [x13, #3, MUL VL]\n"
- "ld1h { z12.h }, p4/Z, [x9]\n"
- "ld1h { z13.h }, p3/Z, [x9, #1, MUL VL]\n"
- "ld1h { z14.h }, p2/Z, [x9, #2, MUL VL]\n"
- "ld1h { z15.h }, p1/Z, [x9, #3, MUL VL]\n"
- "ld1h { z16.h }, p4/Z, [x27]\n"
- "ld1h { z17.h }, p3/Z, [x27, #1, MUL VL]\n"
- "ld1h { z18.h }, p2/Z, [x27, #2, MUL VL]\n"
- "ld1h { z19.h }, p1/Z, [x27, #3, MUL VL]\n"
- "ld1h { z20.h }, p4/Z, [x25]\n"
- "ld1h { z21.h }, p3/Z, [x25, #1, MUL VL]\n"
- "ld1h { z22.h }, p2/Z, [x25, #2, MUL VL]\n"
- "ld1h { z23.h }, p1/Z, [x25, #3, MUL VL]\n"
- "ld1h { z24.h }, p4/Z, [x23]\n"
- "ld1h { z25.h }, p3/Z, [x23, #1, MUL VL]\n"
- "ld1h { z26.h }, p2/Z, [x23, #2, MUL VL]\n"
- "ld1h { z27.h }, p1/Z, [x23, #3, MUL VL]\n"
- "ld1h { z28.h }, p4/Z, [x21]\n"
- "ld1h { z29.h }, p3/Z, [x21, #1, MUL VL]\n"
- "ld1h { z30.h }, p2/Z, [x21, #2, MUL VL]\n"
- "ld1h { z31.h }, p1/Z, [x21, #3, MUL VL]\n"
- "b 76f\n"
- "75:" // Height 6: no accumulate
+ "b 70f\n"
+ "68:" // Height 6: no bias
+ "tbz %x[flags], #0, 69f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ld1h { z8.h }, p4/Z, [x28]\n"
+ "add x24, x28, x19, LSL #1\n"
+ "ld1h { z9.h }, p3/Z, [x28, #1, MUL VL]\n"
+ "ld1h { z10.h }, p2/Z, [x28, #2, MUL VL]\n"
+ "add x23, x24, x19, LSL #1\n"
+ "ld1h { z11.h }, p1/Z, [x28, #3, MUL VL]\n"
+ "add x22, x23, x19, LSL #1\n"
+ "ld1h { z12.h }, p4/Z, [x24]\n"
+ "add x21, x22, x19, LSL #1\n"
+ "ld1h { z13.h }, p3/Z, [x24, #1, MUL VL]\n"
+ "add x20, x21, x19, LSL #1\n"
+ "ld1h { z14.h }, p2/Z, [x24, #2, MUL VL]\n"
+ "ld1h { z15.h }, p1/Z, [x24, #3, MUL VL]\n"
+ "ld1h { z16.h }, p4/Z, [x23]\n"
+ "ld1h { z17.h }, p3/Z, [x23, #1, MUL VL]\n"
+ "ld1h { z18.h }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1h { z19.h }, p1/Z, [x23, #3, MUL VL]\n"
+ "ld1h { z20.h }, p4/Z, [x22]\n"
+ "ld1h { z21.h }, p3/Z, [x22, #1, MUL VL]\n"
+ "ld1h { z22.h }, p2/Z, [x22, #2, MUL VL]\n"
+ "ld1h { z23.h }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1h { z24.h }, p4/Z, [x21]\n"
+ "ld1h { z25.h }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1h { z26.h }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1h { z27.h }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1h { z28.h }, p4/Z, [x20]\n"
+ "ld1h { z29.h }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1h { z30.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1h { z31.h }, p1/Z, [x20, #3, MUL VL]\n"
+ "b 70f\n"
+ "69:" // Height 6: no accumulate
"mov z8.b, #0x0\n"
"mov z9.b, #0x0\n"
"mov z10.b, #0x0\n"
@@ -2486,77 +2431,77 @@ void sve_hybrid_fp16_mla_6x4VL (
"mov z29.b, #0x0\n"
"mov z30.b, #0x0\n"
"mov z31.b, #0x0\n"
- "76:" // Height 6: setup done
- "mov x12, #0x0\n"
- "77:" // Height 6: String loop
+ "70:" // Height 6: setup done
+ "mov x27, #0x0\n"
+ "71:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w11, [x20, x12, LSL #0x2]\n"
- "tbz %x[flags], #3, 78f\n"
- "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 72f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x10, [x20, #0x0]\n"
- "ldr x28, [x20, #0x8]\n"
- "ldr x26, [x20, #0x10]\n"
- "ldr x24, [x20, #0x18]\n"
- "ldr x22, [x20, #0x20]\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "ldr x22, [x20, #0x18]\n"
+ "ldr x21, [x20, #0x20]\n"
"ldr x20, [x20, #0x28]\n"
- "cbnz x12, 79f\n"
+ "cbnz x27, 73f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x10, x10, x19, LSL #1\n"
- "add x28, x28, x19, LSL #1\n"
- "add x26, x26, x19, LSL #1\n"
+ "add x25, x25, x19, LSL #1\n"
"add x24, x24, x19, LSL #1\n"
+ "add x23, x23, x19, LSL #1\n"
"add x22, x22, x19, LSL #1\n"
+ "add x21, x21, x19, LSL #1\n"
"add x20, x20, x19, LSL #1\n"
- "b 79f\n"
- "78:" // Height 6: setup direct input
- "mov x10, %x[input_ptr]\n"
- "add x28, x10, x19, LSL #1\n"
- "add x26, x28, x19, LSL #1\n"
- "add x24, x26, x19, LSL #1\n"
- "add x22, x24, x19, LSL #1\n"
- "add x20, x22, x19, LSL #1\n"
- "79:" // Height 6: input setup done
- "cmp x11, #0x8\n"
- "ble 81f\n"
- "80:" // Height 6: Multiply loop: Main loop head
- "ld1h { z6.h }, p5/Z, [x15]\n"
- "whilelt p0.h, XZR, x11\n"
- "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
- "sub x11, x11, #0x8\n"
- "ld1rqh { z0.h }, p0/Z, [x10]\n"
+ "b 73f\n"
+ "72:" // Height 6: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19, LSL #1\n"
+ "add x23, x24, x19, LSL #1\n"
+ "add x22, x23, x19, LSL #1\n"
+ "add x21, x22, x19, LSL #1\n"
+ "add x20, x21, x19, LSL #1\n"
+ "73:" // Height 6: input setup done
+ "cmp x26, #0x8\n"
+ "ble 75f\n"
+ "74:" // Height 6: Multiply loop: Main loop head
+ "ld1h { z6.h }, p5/Z, [x10]\n"
+ "whilelt p0.h, XZR, x26\n"
+ "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "sub x26, x26, #0x8\n"
+ "ld1rqh { z0.h }, p0/Z, [x25]\n"
"fmla z8.h, z6.h, z0.h[0]\n"
- "ld1rqh { z1.h }, p0/Z, [x28]\n"
- "add x10, x10, #0x10\n"
+ "ld1rqh { z1.h }, p0/Z, [x24]\n"
+ "add x25, x25, #0x10\n"
"fmla z9.h, z7.h, z0.h[0]\n"
- "ld1rqh { z2.h }, p0/Z, [x26]\n"
- "add x28, x28, #0x10\n"
+ "ld1rqh { z2.h }, p0/Z, [x23]\n"
+ "add x24, x24, #0x10\n"
"fmla z12.h, z6.h, z1.h[0]\n"
- "ld1rqh { z3.h }, p0/Z, [x24]\n"
- "add x26, x26, #0x10\n"
+ "ld1rqh { z3.h }, p0/Z, [x22]\n"
+ "add x23, x23, #0x10\n"
"fmla z16.h, z6.h, z2.h[0]\n"
- "ld1rqh { z4.h }, p0/Z, [x22]\n"
- "add x24, x24, #0x10\n"
+ "ld1rqh { z4.h }, p0/Z, [x21]\n"
+ "add x22, x22, #0x10\n"
"fmla z13.h, z7.h, z1.h[0]\n"
"ld1rqh { z5.h }, p0/Z, [x20]\n"
- "add x22, x22, #0x10\n"
+ "add x21, x21, #0x10\n"
"fmla z20.h, z6.h, z3.h[0]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
"add x20, x20, #0x10\n"
"fmla z24.h, z6.h, z4.h[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "cmp x11, #0x8\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "cmp x26, #0x8\n"
"fmla z28.h, z6.h, z5.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z17.h, z7.h, z2.h[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
"fmla z21.h, z7.h, z3.h[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "fmla z25.h, z7.h, z4.h[0]\n"
"prfm pldl1keep, [x22, #0x80]\n"
+ "fmla z25.h, z7.h, z4.h[0]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
"fmla z29.h, z7.h, z5.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
"fmla z10.h, z6.h, z0.h[0]\n"
"prfm pldl1keep, [x20, #0x80]\n"
"fmla z14.h, z6.h, z1.h[0]\n"
@@ -2564,198 +2509,198 @@ void sve_hybrid_fp16_mla_6x4VL (
"fmla z22.h, z6.h, z3.h[0]\n"
"fmla z26.h, z6.h, z4.h[0]\n"
"fmla z30.h, z6.h, z5.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
"fmla z11.h, z7.h, z0.h[0]\n"
"fmla z15.h, z7.h, z1.h[0]\n"
"fmla z19.h, z7.h, z2.h[0]\n"
"fmla z23.h, z7.h, z3.h[0]\n"
"fmla z27.h, z7.h, z4.h[0]\n"
"fmla z31.h, z7.h, z5.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
"fmla z8.h, z6.h, z0.h[1]\n"
"fmla z12.h, z6.h, z1.h[1]\n"
"fmla z16.h, z6.h, z2.h[1]\n"
"fmla z20.h, z6.h, z3.h[1]\n"
"fmla z24.h, z6.h, z4.h[1]\n"
"fmla z28.h, z6.h, z5.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
"fmla z9.h, z7.h, z0.h[1]\n"
"fmla z13.h, z7.h, z1.h[1]\n"
"fmla z17.h, z7.h, z2.h[1]\n"
"fmla z21.h, z7.h, z3.h[1]\n"
"fmla z25.h, z7.h, z4.h[1]\n"
"fmla z29.h, z7.h, z5.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n"
- "addvl x15, x15, #16\n"
+ "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #16\n"
"fmla z10.h, z6.h, z0.h[1]\n"
"fmla z14.h, z6.h, z1.h[1]\n"
"fmla z18.h, z6.h, z2.h[1]\n"
"fmla z22.h, z6.h, z3.h[1]\n"
"fmla z26.h, z6.h, z4.h[1]\n"
"fmla z30.h, z6.h, z5.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n"
"fmla z11.h, z7.h, z0.h[1]\n"
"fmla z15.h, z7.h, z1.h[1]\n"
"fmla z19.h, z7.h, z2.h[1]\n"
"fmla z23.h, z7.h, z3.h[1]\n"
"fmla z27.h, z7.h, z4.h[1]\n"
"fmla z31.h, z7.h, z5.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n"
"fmla z8.h, z6.h, z0.h[2]\n"
"fmla z12.h, z6.h, z1.h[2]\n"
"fmla z16.h, z6.h, z2.h[2]\n"
"fmla z20.h, z6.h, z3.h[2]\n"
"fmla z24.h, z6.h, z4.h[2]\n"
"fmla z28.h, z6.h, z5.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n"
"fmla z9.h, z7.h, z0.h[2]\n"
"fmla z13.h, z7.h, z1.h[2]\n"
"fmla z17.h, z7.h, z2.h[2]\n"
"fmla z21.h, z7.h, z3.h[2]\n"
"fmla z25.h, z7.h, z4.h[2]\n"
"fmla z29.h, z7.h, z5.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n"
"fmla z10.h, z6.h, z0.h[2]\n"
"fmla z14.h, z6.h, z1.h[2]\n"
"fmla z18.h, z6.h, z2.h[2]\n"
"fmla z22.h, z6.h, z3.h[2]\n"
"fmla z26.h, z6.h, z4.h[2]\n"
"fmla z30.h, z6.h, z5.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n"
"fmla z11.h, z7.h, z0.h[2]\n"
"fmla z15.h, z7.h, z1.h[2]\n"
"fmla z19.h, z7.h, z2.h[2]\n"
"fmla z23.h, z7.h, z3.h[2]\n"
"fmla z27.h, z7.h, z4.h[2]\n"
"fmla z31.h, z7.h, z5.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n"
"fmla z8.h, z6.h, z0.h[3]\n"
"fmla z12.h, z6.h, z1.h[3]\n"
"fmla z16.h, z6.h, z2.h[3]\n"
"fmla z20.h, z6.h, z3.h[3]\n"
"fmla z24.h, z6.h, z4.h[3]\n"
"fmla z28.h, z6.h, z5.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n"
"fmla z9.h, z7.h, z0.h[3]\n"
"fmla z13.h, z7.h, z1.h[3]\n"
"fmla z17.h, z7.h, z2.h[3]\n"
"fmla z21.h, z7.h, z3.h[3]\n"
"fmla z25.h, z7.h, z4.h[3]\n"
"fmla z29.h, z7.h, z5.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n"
"fmla z10.h, z6.h, z0.h[3]\n"
"fmla z14.h, z6.h, z1.h[3]\n"
"fmla z18.h, z6.h, z2.h[3]\n"
"fmla z22.h, z6.h, z3.h[3]\n"
"fmla z26.h, z6.h, z4.h[3]\n"
"fmla z30.h, z6.h, z5.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x15]\n"
+ "ld1h { z6.h }, p5/Z, [x10]\n"
"fmla z11.h, z7.h, z0.h[3]\n"
"fmla z15.h, z7.h, z1.h[3]\n"
"fmla z19.h, z7.h, z2.h[3]\n"
"fmla z23.h, z7.h, z3.h[3]\n"
"fmla z27.h, z7.h, z4.h[3]\n"
"fmla z31.h, z7.h, z5.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
"fmla z8.h, z6.h, z0.h[4]\n"
"fmla z12.h, z6.h, z1.h[4]\n"
"fmla z16.h, z6.h, z2.h[4]\n"
"fmla z20.h, z6.h, z3.h[4]\n"
"fmla z24.h, z6.h, z4.h[4]\n"
"fmla z28.h, z6.h, z5.h[4]\n"
- "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.h, z7.h, z0.h[4]\n"
"fmla z13.h, z7.h, z1.h[4]\n"
"fmla z17.h, z7.h, z2.h[4]\n"
"fmla z21.h, z7.h, z3.h[4]\n"
"fmla z25.h, z7.h, z4.h[4]\n"
"fmla z29.h, z7.h, z5.h[4]\n"
- "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
"fmla z10.h, z6.h, z0.h[4]\n"
"fmla z14.h, z6.h, z1.h[4]\n"
"fmla z18.h, z6.h, z2.h[4]\n"
"fmla z22.h, z6.h, z3.h[4]\n"
"fmla z26.h, z6.h, z4.h[4]\n"
"fmla z30.h, z6.h, z5.h[4]\n"
- "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
"fmla z11.h, z7.h, z0.h[4]\n"
"fmla z15.h, z7.h, z1.h[4]\n"
"fmla z19.h, z7.h, z2.h[4]\n"
"fmla z23.h, z7.h, z3.h[4]\n"
"fmla z27.h, z7.h, z4.h[4]\n"
"fmla z31.h, z7.h, z5.h[4]\n"
- "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
"fmla z8.h, z6.h, z0.h[5]\n"
"fmla z12.h, z6.h, z1.h[5]\n"
"fmla z16.h, z6.h, z2.h[5]\n"
"fmla z20.h, z6.h, z3.h[5]\n"
"fmla z24.h, z6.h, z4.h[5]\n"
"fmla z28.h, z6.h, z5.h[5]\n"
- "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #6, MUL VL]\n"
"fmla z9.h, z7.h, z0.h[5]\n"
"fmla z13.h, z7.h, z1.h[5]\n"
"fmla z17.h, z7.h, z2.h[5]\n"
"fmla z21.h, z7.h, z3.h[5]\n"
"fmla z25.h, z7.h, z4.h[5]\n"
"fmla z29.h, z7.h, z5.h[5]\n"
- "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n"
- "addvl x15, x15, #16\n"
+ "ld1h { z7.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #16\n"
"fmla z10.h, z6.h, z0.h[5]\n"
"fmla z14.h, z6.h, z1.h[5]\n"
"fmla z18.h, z6.h, z2.h[5]\n"
"fmla z22.h, z6.h, z3.h[5]\n"
"fmla z26.h, z6.h, z4.h[5]\n"
"fmla z30.h, z6.h, z5.h[5]\n"
- "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-8, MUL VL]\n"
"fmla z11.h, z7.h, z0.h[5]\n"
"fmla z15.h, z7.h, z1.h[5]\n"
"fmla z19.h, z7.h, z2.h[5]\n"
"fmla z23.h, z7.h, z3.h[5]\n"
"fmla z27.h, z7.h, z4.h[5]\n"
"fmla z31.h, z7.h, z5.h[5]\n"
- "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-7, MUL VL]\n"
"fmla z8.h, z6.h, z0.h[6]\n"
"fmla z12.h, z6.h, z1.h[6]\n"
"fmla z16.h, z6.h, z2.h[6]\n"
"fmla z20.h, z6.h, z3.h[6]\n"
"fmla z24.h, z6.h, z4.h[6]\n"
"fmla z28.h, z6.h, z5.h[6]\n"
- "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-6, MUL VL]\n"
"fmla z9.h, z7.h, z0.h[6]\n"
"fmla z13.h, z7.h, z1.h[6]\n"
"fmla z17.h, z7.h, z2.h[6]\n"
"fmla z21.h, z7.h, z3.h[6]\n"
"fmla z25.h, z7.h, z4.h[6]\n"
"fmla z29.h, z7.h, z5.h[6]\n"
- "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-5, MUL VL]\n"
"fmla z10.h, z6.h, z0.h[6]\n"
"fmla z14.h, z6.h, z1.h[6]\n"
"fmla z18.h, z6.h, z2.h[6]\n"
"fmla z22.h, z6.h, z3.h[6]\n"
"fmla z26.h, z6.h, z4.h[6]\n"
"fmla z30.h, z6.h, z5.h[6]\n"
- "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-4, MUL VL]\n"
"fmla z11.h, z7.h, z0.h[6]\n"
"fmla z15.h, z7.h, z1.h[6]\n"
"fmla z19.h, z7.h, z2.h[6]\n"
"fmla z23.h, z7.h, z3.h[6]\n"
"fmla z27.h, z7.h, z4.h[6]\n"
"fmla z31.h, z7.h, z5.h[6]\n"
- "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-3, MUL VL]\n"
"fmla z8.h, z6.h, z0.h[7]\n"
"fmla z12.h, z6.h, z1.h[7]\n"
"fmla z16.h, z6.h, z2.h[7]\n"
"fmla z20.h, z6.h, z3.h[7]\n"
"fmla z24.h, z6.h, z4.h[7]\n"
"fmla z28.h, z6.h, z5.h[7]\n"
- "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-2, MUL VL]\n"
"fmla z9.h, z7.h, z0.h[7]\n"
"fmla z13.h, z7.h, z1.h[7]\n"
"fmla z17.h, z7.h, z2.h[7]\n"
"fmla z21.h, z7.h, z3.h[7]\n"
"fmla z25.h, z7.h, z4.h[7]\n"
"fmla z29.h, z7.h, z5.h[7]\n"
- "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-1, MUL VL]\n"
"fmla z10.h, z6.h, z0.h[7]\n"
"fmla z14.h, z6.h, z1.h[7]\n"
"fmla z18.h, z6.h, z2.h[7]\n"
@@ -2768,39 +2713,39 @@ void sve_hybrid_fp16_mla_6x4VL (
"fmla z23.h, z7.h, z3.h[7]\n"
"fmla z27.h, z7.h, z4.h[7]\n"
"fmla z31.h, z7.h, z5.h[7]\n"
- "bgt 80b\n"
- "81:" // Height 6: Multiply loop: Single iteration only
- "ld1h { z6.h }, p5/Z, [x15]\n"
- "whilelt p0.h, XZR, x11\n"
- "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
- "subs x11, x11, #0x1\n"
- "ld1rqh { z0.h }, p0/Z, [x10]\n"
+ "bgt 74b\n"
+ "75:" // Height 6: Multiply loop: Single iteration only
+ "ld1h { z6.h }, p5/Z, [x10]\n"
+ "whilelt p0.h, XZR, x26\n"
+ "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "subs x26, x26, #0x1\n"
+ "ld1rqh { z0.h }, p0/Z, [x25]\n"
"fmla z8.h, z6.h, z0.h[0]\n"
- "ld1rqh { z1.h }, p0/Z, [x28]\n"
- "add x10, x10, #0x10\n"
+ "ld1rqh { z1.h }, p0/Z, [x24]\n"
+ "add x25, x25, #0x10\n"
"fmla z9.h, z7.h, z0.h[0]\n"
- "ld1rqh { z2.h }, p0/Z, [x26]\n"
- "add x28, x28, #0x10\n"
+ "ld1rqh { z2.h }, p0/Z, [x23]\n"
+ "add x24, x24, #0x10\n"
"fmla z12.h, z6.h, z1.h[0]\n"
- "ld1rqh { z3.h }, p0/Z, [x24]\n"
- "add x26, x26, #0x10\n"
+ "ld1rqh { z3.h }, p0/Z, [x22]\n"
+ "add x23, x23, #0x10\n"
"fmla z16.h, z6.h, z2.h[0]\n"
- "ld1rqh { z4.h }, p0/Z, [x22]\n"
- "add x24, x24, #0x10\n"
+ "ld1rqh { z4.h }, p0/Z, [x21]\n"
+ "add x22, x22, #0x10\n"
"fmla z13.h, z7.h, z1.h[0]\n"
"ld1rqh { z5.h }, p0/Z, [x20]\n"
- "add x22, x22, #0x10\n"
+ "add x21, x21, #0x10\n"
"fmla z20.h, z6.h, z3.h[0]\n"
"add x20, x20, #0x10\n"
"fmla z17.h, z7.h, z2.h[0]\n"
"fmla z24.h, z6.h, z4.h[0]\n"
"fmla z28.h, z6.h, z5.h[0]\n"
- "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z21.h, z7.h, z3.h[0]\n"
"fmla z25.h, z7.h, z4.h[0]\n"
"fmla z29.h, z7.h, z5.h[0]\n"
- "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
+ "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
"fmla z10.h, z6.h, z0.h[0]\n"
"fmla z14.h, z6.h, z1.h[0]\n"
"fmla z18.h, z6.h, z2.h[0]\n"
@@ -2813,25 +2758,25 @@ void sve_hybrid_fp16_mla_6x4VL (
"fmla z23.h, z7.h, z3.h[0]\n"
"fmla z27.h, z7.h, z4.h[0]\n"
"fmla z31.h, z7.h, z5.h[0]\n"
- "ble 82f\n"
- "ld1h { z6.h }, p5/Z, [x15]\n"
+ "ble 76f\n"
+ "ld1h { z6.h }, p5/Z, [x10]\n"
"fmla z8.h, z6.h, z0.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
- "subs x11, x11, #0x1\n"
+ "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "subs x26, x26, #0x1\n"
"fmla z12.h, z6.h, z1.h[1]\n"
"fmla z16.h, z6.h, z2.h[1]\n"
"fmla z20.h, z6.h, z3.h[1]\n"
"fmla z24.h, z6.h, z4.h[1]\n"
"fmla z28.h, z6.h, z5.h[1]\n"
- "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.h, z7.h, z0.h[1]\n"
"fmla z13.h, z7.h, z1.h[1]\n"
"fmla z17.h, z7.h, z2.h[1]\n"
"fmla z21.h, z7.h, z3.h[1]\n"
"fmla z25.h, z7.h, z4.h[1]\n"
"fmla z29.h, z7.h, z5.h[1]\n"
- "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
+ "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
"fmla z10.h, z6.h, z0.h[1]\n"
"fmla z14.h, z6.h, z1.h[1]\n"
"fmla z18.h, z6.h, z2.h[1]\n"
@@ -2844,25 +2789,25 @@ void sve_hybrid_fp16_mla_6x4VL (
"fmla z23.h, z7.h, z3.h[1]\n"
"fmla z27.h, z7.h, z4.h[1]\n"
"fmla z31.h, z7.h, z5.h[1]\n"
- "ble 82f\n"
- "ld1h { z6.h }, p5/Z, [x15]\n"
+ "ble 76f\n"
+ "ld1h { z6.h }, p5/Z, [x10]\n"
"fmla z8.h, z6.h, z0.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
- "subs x11, x11, #0x1\n"
+ "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "subs x26, x26, #0x1\n"
"fmla z12.h, z6.h, z1.h[2]\n"
"fmla z16.h, z6.h, z2.h[2]\n"
"fmla z20.h, z6.h, z3.h[2]\n"
"fmla z24.h, z6.h, z4.h[2]\n"
"fmla z28.h, z6.h, z5.h[2]\n"
- "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.h, z7.h, z0.h[2]\n"
"fmla z13.h, z7.h, z1.h[2]\n"
"fmla z17.h, z7.h, z2.h[2]\n"
"fmla z21.h, z7.h, z3.h[2]\n"
"fmla z25.h, z7.h, z4.h[2]\n"
"fmla z29.h, z7.h, z5.h[2]\n"
- "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
+ "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
"fmla z10.h, z6.h, z0.h[2]\n"
"fmla z14.h, z6.h, z1.h[2]\n"
"fmla z18.h, z6.h, z2.h[2]\n"
@@ -2875,25 +2820,25 @@ void sve_hybrid_fp16_mla_6x4VL (
"fmla z23.h, z7.h, z3.h[2]\n"
"fmla z27.h, z7.h, z4.h[2]\n"
"fmla z31.h, z7.h, z5.h[2]\n"
- "ble 82f\n"
- "ld1h { z6.h }, p5/Z, [x15]\n"
+ "ble 76f\n"
+ "ld1h { z6.h }, p5/Z, [x10]\n"
"fmla z8.h, z6.h, z0.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
- "subs x11, x11, #0x1\n"
+ "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "subs x26, x26, #0x1\n"
"fmla z12.h, z6.h, z1.h[3]\n"
"fmla z16.h, z6.h, z2.h[3]\n"
"fmla z20.h, z6.h, z3.h[3]\n"
"fmla z24.h, z6.h, z4.h[3]\n"
"fmla z28.h, z6.h, z5.h[3]\n"
- "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.h, z7.h, z0.h[3]\n"
"fmla z13.h, z7.h, z1.h[3]\n"
"fmla z17.h, z7.h, z2.h[3]\n"
"fmla z21.h, z7.h, z3.h[3]\n"
"fmla z25.h, z7.h, z4.h[3]\n"
"fmla z29.h, z7.h, z5.h[3]\n"
- "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
+ "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
"fmla z10.h, z6.h, z0.h[3]\n"
"fmla z14.h, z6.h, z1.h[3]\n"
"fmla z18.h, z6.h, z2.h[3]\n"
@@ -2906,25 +2851,25 @@ void sve_hybrid_fp16_mla_6x4VL (
"fmla z23.h, z7.h, z3.h[3]\n"
"fmla z27.h, z7.h, z4.h[3]\n"
"fmla z31.h, z7.h, z5.h[3]\n"
- "ble 82f\n"
- "ld1h { z6.h }, p5/Z, [x15]\n"
+ "ble 76f\n"
+ "ld1h { z6.h }, p5/Z, [x10]\n"
"fmla z8.h, z6.h, z0.h[4]\n"
- "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
- "subs x11, x11, #0x1\n"
+ "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "subs x26, x26, #0x1\n"
"fmla z12.h, z6.h, z1.h[4]\n"
"fmla z16.h, z6.h, z2.h[4]\n"
"fmla z20.h, z6.h, z3.h[4]\n"
"fmla z24.h, z6.h, z4.h[4]\n"
"fmla z28.h, z6.h, z5.h[4]\n"
- "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.h, z7.h, z0.h[4]\n"
"fmla z13.h, z7.h, z1.h[4]\n"
"fmla z17.h, z7.h, z2.h[4]\n"
"fmla z21.h, z7.h, z3.h[4]\n"
"fmla z25.h, z7.h, z4.h[4]\n"
"fmla z29.h, z7.h, z5.h[4]\n"
- "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
+ "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
"fmla z10.h, z6.h, z0.h[4]\n"
"fmla z14.h, z6.h, z1.h[4]\n"
"fmla z18.h, z6.h, z2.h[4]\n"
@@ -2937,25 +2882,25 @@ void sve_hybrid_fp16_mla_6x4VL (
"fmla z23.h, z7.h, z3.h[4]\n"
"fmla z27.h, z7.h, z4.h[4]\n"
"fmla z31.h, z7.h, z5.h[4]\n"
- "ble 82f\n"
- "ld1h { z6.h }, p5/Z, [x15]\n"
+ "ble 76f\n"
+ "ld1h { z6.h }, p5/Z, [x10]\n"
"fmla z8.h, z6.h, z0.h[5]\n"
- "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
- "subs x11, x11, #0x1\n"
+ "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "subs x26, x26, #0x1\n"
"fmla z12.h, z6.h, z1.h[5]\n"
"fmla z16.h, z6.h, z2.h[5]\n"
"fmla z20.h, z6.h, z3.h[5]\n"
"fmla z24.h, z6.h, z4.h[5]\n"
"fmla z28.h, z6.h, z5.h[5]\n"
- "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.h, z7.h, z0.h[5]\n"
"fmla z13.h, z7.h, z1.h[5]\n"
"fmla z17.h, z7.h, z2.h[5]\n"
"fmla z21.h, z7.h, z3.h[5]\n"
"fmla z25.h, z7.h, z4.h[5]\n"
"fmla z29.h, z7.h, z5.h[5]\n"
- "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
+ "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
"fmla z10.h, z6.h, z0.h[5]\n"
"fmla z14.h, z6.h, z1.h[5]\n"
"fmla z18.h, z6.h, z2.h[5]\n"
@@ -2968,25 +2913,25 @@ void sve_hybrid_fp16_mla_6x4VL (
"fmla z23.h, z7.h, z3.h[5]\n"
"fmla z27.h, z7.h, z4.h[5]\n"
"fmla z31.h, z7.h, z5.h[5]\n"
- "ble 82f\n"
- "ld1h { z6.h }, p5/Z, [x15]\n"
+ "ble 76f\n"
+ "ld1h { z6.h }, p5/Z, [x10]\n"
"fmla z8.h, z6.h, z0.h[6]\n"
- "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
- "subs x11, x11, #0x1\n"
+ "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "subs x26, x26, #0x1\n"
"fmla z12.h, z6.h, z1.h[6]\n"
"fmla z16.h, z6.h, z2.h[6]\n"
"fmla z20.h, z6.h, z3.h[6]\n"
"fmla z24.h, z6.h, z4.h[6]\n"
"fmla z28.h, z6.h, z5.h[6]\n"
- "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.h, z7.h, z0.h[6]\n"
"fmla z13.h, z7.h, z1.h[6]\n"
"fmla z17.h, z7.h, z2.h[6]\n"
"fmla z21.h, z7.h, z3.h[6]\n"
"fmla z25.h, z7.h, z4.h[6]\n"
"fmla z29.h, z7.h, z5.h[6]\n"
- "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
+ "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
"fmla z10.h, z6.h, z0.h[6]\n"
"fmla z14.h, z6.h, z1.h[6]\n"
"fmla z18.h, z6.h, z2.h[6]\n"
@@ -2999,24 +2944,24 @@ void sve_hybrid_fp16_mla_6x4VL (
"fmla z23.h, z7.h, z3.h[6]\n"
"fmla z27.h, z7.h, z4.h[6]\n"
"fmla z31.h, z7.h, z5.h[6]\n"
- "ble 82f\n"
- "ld1h { z6.h }, p5/Z, [x15]\n"
+ "ble 76f\n"
+ "ld1h { z6.h }, p5/Z, [x10]\n"
"fmla z8.h, z6.h, z0.h[7]\n"
- "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #1, MUL VL]\n"
"fmla z12.h, z6.h, z1.h[7]\n"
"fmla z16.h, z6.h, z2.h[7]\n"
"fmla z20.h, z6.h, z3.h[7]\n"
"fmla z24.h, z6.h, z4.h[7]\n"
"fmla z28.h, z6.h, z5.h[7]\n"
- "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.h, z7.h, z0.h[7]\n"
"fmla z13.h, z7.h, z1.h[7]\n"
"fmla z17.h, z7.h, z2.h[7]\n"
"fmla z21.h, z7.h, z3.h[7]\n"
"fmla z25.h, z7.h, z4.h[7]\n"
"fmla z29.h, z7.h, z5.h[7]\n"
- "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
+ "ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
"fmla z10.h, z6.h, z0.h[7]\n"
"fmla z14.h, z6.h, z1.h[7]\n"
"fmla z18.h, z6.h, z2.h[7]\n"
@@ -3029,18 +2974,24 @@ void sve_hybrid_fp16_mla_6x4VL (
"fmla z23.h, z7.h, z3.h[7]\n"
"fmla z27.h, z7.h, z4.h[7]\n"
"fmla z31.h, z7.h, z5.h[7]\n"
- "82:" // Height 6: Multiply loop: multiply skip
- "prfm pldl1keep, [x10, #0x80]\n"
- "add x12, x12, #0x1\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "76:" // Height 6: Multiply loop: multiply skip
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x27, x27, #0x1\n"
"prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
"prfm pldl1keep, [x22, #0x80]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
"prfm pldl1keep, [x20, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x12, x19\n"
- "bne 77b\n"
- "tbz %x[flags], #1, 83f\n"
+ "cmp x27, x19\n"
+ "bne 71b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x28, x19, LSL #1\n"
+ "add x23, x24, x19, LSL #1\n"
+ "add x22, x23, x19, LSL #1\n"
+ "add x21, x22, x19, LSL #1\n"
+ "add x20, x21, x19, LSL #1\n"
+ "tbz %x[flags], #1, 77f\n"
"add x19, %x[args_ptr], %[offset_min]\n"
"ld1rh { z1.h }, p5/Z, [x19]\n"
"add x19, %x[args_ptr], %[offset_max]\n"
@@ -3093,57 +3044,52 @@ void sve_hybrid_fp16_mla_6x4VL (
"fmax z29.h, p5/M, z29.h, z1.h\n"
"fmax z30.h, p5/M, z30.h, z1.h\n"
"fmax z31.h, p5/M, z31.h, z1.h\n"
- "83:" // Height 6: No activation
- "st1h { z8.h }, p4, [x13]\n"
- "st1h { z9.h }, p3, [x13, #1, MUL VL]\n"
- "st1h { z10.h }, p2, [x13, #2, MUL VL]\n"
- "st1h { z11.h }, p1, [x13, #3, MUL VL]\n"
- "addvl x13, x13, #4\n"
- "st1h { z12.h }, p4, [x9]\n"
- "st1h { z13.h }, p3, [x9, #1, MUL VL]\n"
- "st1h { z14.h }, p2, [x9, #2, MUL VL]\n"
- "st1h { z15.h }, p1, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
- "st1h { z16.h }, p4, [x27]\n"
- "st1h { z17.h }, p3, [x27, #1, MUL VL]\n"
- "st1h { z18.h }, p2, [x27, #2, MUL VL]\n"
- "st1h { z19.h }, p1, [x27, #3, MUL VL]\n"
- "addvl x27, x27, #4\n"
- "st1h { z20.h }, p4, [x25]\n"
- "st1h { z21.h }, p3, [x25, #1, MUL VL]\n"
- "st1h { z22.h }, p2, [x25, #2, MUL VL]\n"
- "st1h { z23.h }, p1, [x25, #3, MUL VL]\n"
- "addvl x25, x25, #4\n"
- "st1h { z24.h }, p4, [x23]\n"
- "st1h { z25.h }, p3, [x23, #1, MUL VL]\n"
- "st1h { z26.h }, p2, [x23, #2, MUL VL]\n"
- "st1h { z27.h }, p1, [x23, #3, MUL VL]\n"
- "addvl x23, x23, #4\n"
- "st1h { z28.h }, p4, [x21]\n"
- "st1h { z29.h }, p3, [x21, #1, MUL VL]\n"
- "st1h { z30.h }, p2, [x21, #2, MUL VL]\n"
- "st1h { z31.h }, p1, [x21, #3, MUL VL]\n"
- "addvl x21, x21, #4\n"
- "84:" // Height 6: Writeback done
- "dech x16, ALL, MUL #4\n"
- "cmp x16, XZR\n"
- "bgt 73b\n"
+ "77:" // Height 6: No activation
+ "st1h { z8.h }, p4, [x28]\n"
+ "st1h { z9.h }, p3, [x28, #1, MUL VL]\n"
+ "st1h { z10.h }, p2, [x28, #2, MUL VL]\n"
+ "st1h { z11.h }, p1, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "st1h { z12.h }, p4, [x24]\n"
+ "st1h { z13.h }, p3, [x24, #1, MUL VL]\n"
+ "st1h { z14.h }, p2, [x24, #2, MUL VL]\n"
+ "st1h { z15.h }, p1, [x24, #3, MUL VL]\n"
+ "st1h { z16.h }, p4, [x23]\n"
+ "st1h { z17.h }, p3, [x23, #1, MUL VL]\n"
+ "st1h { z18.h }, p2, [x23, #2, MUL VL]\n"
+ "st1h { z19.h }, p1, [x23, #3, MUL VL]\n"
+ "st1h { z20.h }, p4, [x22]\n"
+ "st1h { z21.h }, p3, [x22, #1, MUL VL]\n"
+ "st1h { z22.h }, p2, [x22, #2, MUL VL]\n"
+ "st1h { z23.h }, p1, [x22, #3, MUL VL]\n"
+ "st1h { z24.h }, p4, [x21]\n"
+ "st1h { z25.h }, p3, [x21, #1, MUL VL]\n"
+ "st1h { z26.h }, p2, [x21, #2, MUL VL]\n"
+ "st1h { z27.h }, p1, [x21, #3, MUL VL]\n"
+ "st1h { z28.h }, p4, [x20]\n"
+ "st1h { z29.h }, p3, [x20, #1, MUL VL]\n"
+ "st1h { z30.h }, p2, [x20, #2, MUL VL]\n"
+ "st1h { z31.h }, p1, [x20, #3, MUL VL]\n"
+ "78:" // Height 6: Writeback done
+ "dech x11, ALL, MUL #4\n"
+ "cmp x11, XZR\n"
+ "bgt 67b\n"
"subs %x[M], %x[M], #0x6\n"
- "beq 86f\n"
+ "beq 80f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "tbz %x[flags], #3, 85f\n"
+ "tbz %x[flags], #3, 79f\n"
"add x20, x20, #0x6\n"
"str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
"b 1b\n"
- "85:" // Update direct input
+ "79:" // Update direct input
"mov x19, #0xc\n"
"madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
"b 1b\n"
- "86:" // Exit
+ "80:" // Exit
: [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL.hpp
index 3c369eb35a..b696e73637 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL.hpp
@@ -36,7 +36,6 @@
namespace arm_gemm
{
-
// Actual kernel implementations
void sve_hybrid_fp32_mla_6x4VL( ARGLIST );
@@ -73,7 +72,6 @@ public:
// Default to the generic kernel
kern_type kernel=sve_hybrid_fp32_mla_6x4VL;
-
cls_sve_hybrid_fp32_mla_6x4VL(const CPUInfo *)
{
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/generic.cpp
index db29ebc23c..dee9a107ff 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/generic.cpp
@@ -27,6 +27,7 @@
#include "../../utils.hpp"
#include <cassert>
+#include <limits>
namespace arm_gemm {
@@ -94,164 +95,158 @@ void sve_hybrid_fp32_mla_6x4VL (
"ptrue p5.b\n"
"1:" // Row loop
"cmp %x[M], #0x6\n"
- "bge 71f\n"
+ "bge 66f\n"
"cmp %x[M], #0x4\n"
- "bgt 57f\n"
- "beq 43f\n"
+ "bgt 53f\n"
+ "beq 40f\n"
"cmp %x[M], #0x2\n"
- "bgt 29f\n"
- "beq 15f\n"
- "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x14, %x[bias]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "tbz %x[flags], #2, 2f\n"
- "ldr x13, [%x[output_ptr], #0x0]\n"
- "add x13, x13, x19, LSL #2\n"
- "b 3f\n"
- "2:" // Height 1: setup direct output
- "mov x13, %x[output_ptr]\n"
- "3:" // Height 1: Column loop
+ "bgt 27f\n"
+ "beq 14f\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[bias]\n"
+ "mov x28, %x[output_ptr]\n"
+ "2:" // Height 1: Column loop
"mov x19, #0x0\n"
- "whilelt p4.s, x19, x16\n"
+ "whilelt p4.s, x19, x11\n"
"incw x19\n"
- "whilelt p3.s, x19, x16\n"
+ "whilelt p3.s, x19, x11\n"
"incw x19\n"
- "whilelt p2.s, x19, x16\n"
+ "whilelt p2.s, x19, x11\n"
"incw x19\n"
- "whilelt p1.s, x19, x16\n"
- "cbz x14, 4f\n"
- "ld1w { z8.s }, p5/Z, [x14]\n"
- "ld1w { z9.s }, p5/Z, [x14, #1, MUL VL]\n"
- "ld1w { z10.s }, p5/Z, [x14, #2, MUL VL]\n"
- "ld1w { z11.s }, p5/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
- "b 6f\n"
- "4:" // Height 1: no bias
- "tbz %x[flags], #0, 5f\n"
- "ld1w { z8.s }, p4/Z, [x13]\n"
- "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
- "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
- "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
- "b 6f\n"
- "5:" // Height 1: no accumulate
+ "whilelt p1.s, x19, x11\n"
+ "cbz x9, 3f\n"
+ "ld1w { z8.s }, p5/Z, [x9]\n"
+ "ld1w { z9.s }, p5/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z10.s }, p5/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z11.s }, p5/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "b 5f\n"
+ "3:" // Height 1: no bias
+ "tbz %x[flags], #0, 4f\n"
+ "ld1w { z8.s }, p4/Z, [x28]\n"
+ "ld1w { z9.s }, p3/Z, [x28, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1w { z11.s }, p1/Z, [x28, #3, MUL VL]\n"
+ "b 5f\n"
+ "4:" // Height 1: no accumulate
"mov z8.b, #0x0\n"
"mov z9.b, #0x0\n"
"mov z10.b, #0x0\n"
"mov z11.b, #0x0\n"
- "6:" // Height 1: setup done
- "mov x12, #0x0\n"
- "7:" // Height 1: String loop
+ "5:" // Height 1: setup done
+ "mov x27, #0x0\n"
+ "6:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w11, [x20, x12, LSL #0x2]\n"
- "tbz %x[flags], #3, 8f\n"
- "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 7f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x10, [x20, #0x0]\n"
- "cbnz x12, 9f\n"
+ "ldr x25, [x20, #0x0]\n"
+ "cbnz x27, 8f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x10, x10, x19, LSL #2\n"
- "b 9f\n"
- "8:" // Height 1: setup direct input
- "mov x10, %x[input_ptr]\n"
- "9:" // Height 1: input setup done
- "cmp x11, #0x4\n"
- "ble 11f\n"
- "10:" // Height 1: Multiply loop: Main loop head
- "ld1w { z6.s }, p5/Z, [x15]\n"
- "whilelt p0.s, XZR, x11\n"
- "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
- "sub x11, x11, #0x4\n"
- "ld1rqw { z0.s }, p0/Z, [x10]\n"
+ "add x25, x25, x19, LSL #2\n"
+ "b 8f\n"
+ "7:" // Height 1: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "8:" // Height 1: input setup done
+ "cmp x26, #0x4\n"
+ "ble 10f\n"
+ "9:" // Height 1: Multiply loop: Main loop head
+ "ld1w { z6.s }, p5/Z, [x10]\n"
+ "whilelt p0.s, XZR, x26\n"
+ "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "sub x26, x26, #0x4\n"
+ "ld1rqw { z0.s }, p0/Z, [x25]\n"
"fmla z8.s, z6.s, z0.s[0]\n"
- "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
- "add x10, x10, #0x10\n"
+ "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "add x25, x25, #0x10\n"
"fmla z9.s, z7.s, z0.s[0]\n"
- "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
- "cmp x11, #0x4\n"
+ "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "cmp x26, #0x4\n"
"fmla z10.s, z6.s, z0.s[0]\n"
- "ld1w { z6.s }, p5/Z, [x15, #4, MUL VL]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "ld1w { z6.s }, p5/Z, [x10, #4, MUL VL]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
"fmla z11.s, z7.s, z0.s[0]\n"
- "ld1w { z7.s }, p5/Z, [x15, #5, MUL VL]\n"
+ "ld1w { z7.s }, p5/Z, [x10, #5, MUL VL]\n"
"fmla z8.s, z6.s, z0.s[1]\n"
- "ld1w { z6.s }, p5/Z, [x15, #6, MUL VL]\n"
+ "ld1w { z6.s }, p5/Z, [x10, #6, MUL VL]\n"
"fmla z9.s, z7.s, z0.s[1]\n"
- "ld1w { z7.s }, p5/Z, [x15, #7, MUL VL]\n"
- "addvl x15, x15, #16\n"
+ "ld1w { z7.s }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #16\n"
"fmla z10.s, z6.s, z0.s[1]\n"
- "ld1w { z6.s }, p5/Z, [x15, #-8, MUL VL]\n"
+ "ld1w { z6.s }, p5/Z, [x10, #-8, MUL VL]\n"
"fmla z11.s, z7.s, z0.s[1]\n"
- "ld1w { z7.s }, p5/Z, [x15, #-7, MUL VL]\n"
+ "ld1w { z7.s }, p5/Z, [x10, #-7, MUL VL]\n"
"fmla z8.s, z6.s, z0.s[2]\n"
- "ld1w { z6.s }, p5/Z, [x15, #-6, MUL VL]\n"
+ "ld1w { z6.s }, p5/Z, [x10, #-6, MUL VL]\n"
"fmla z9.s, z7.s, z0.s[2]\n"
- "ld1w { z7.s }, p5/Z, [x15, #-5, MUL VL]\n"
+ "ld1w { z7.s }, p5/Z, [x10, #-5, MUL VL]\n"
"fmla z10.s, z6.s, z0.s[2]\n"
- "ld1w { z6.s }, p5/Z, [x15, #-4, MUL VL]\n"
+ "ld1w { z6.s }, p5/Z, [x10, #-4, MUL VL]\n"
"fmla z11.s, z7.s, z0.s[2]\n"
- "ld1w { z7.s }, p5/Z, [x15, #-3, MUL VL]\n"
+ "ld1w { z7.s }, p5/Z, [x10, #-3, MUL VL]\n"
"fmla z8.s, z6.s, z0.s[3]\n"
- "ld1w { z6.s }, p5/Z, [x15, #-2, MUL VL]\n"
+ "ld1w { z6.s }, p5/Z, [x10, #-2, MUL VL]\n"
"fmla z9.s, z7.s, z0.s[3]\n"
- "ld1w { z7.s }, p5/Z, [x15, #-1, MUL VL]\n"
+ "ld1w { z7.s }, p5/Z, [x10, #-1, MUL VL]\n"
"fmla z10.s, z6.s, z0.s[3]\n"
"fmla z11.s, z7.s, z0.s[3]\n"
- "bgt 10b\n"
- "11:" // Height 1: Multiply loop: Single iteration only
- "ld1w { z6.s }, p5/Z, [x15]\n"
- "whilelt p0.s, XZR, x11\n"
- "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
- "subs x11, x11, #0x1\n"
- "ld1rqw { z0.s }, p0/Z, [x10]\n"
+ "bgt 9b\n"
+ "10:" // Height 1: Multiply loop: Single iteration only
+ "ld1w { z6.s }, p5/Z, [x10]\n"
+ "whilelt p0.s, XZR, x26\n"
+ "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "subs x26, x26, #0x1\n"
+ "ld1rqw { z0.s }, p0/Z, [x25]\n"
"fmla z8.s, z6.s, z0.s[0]\n"
- "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
- "add x10, x10, #0x10\n"
+ "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "add x25, x25, #0x10\n"
"fmla z9.s, z7.s, z0.s[0]\n"
- "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
+ "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
"fmla z10.s, z6.s, z0.s[0]\n"
"fmla z11.s, z7.s, z0.s[0]\n"
- "ble 12f\n"
- "ld1w { z6.s }, p5/Z, [x15]\n"
+ "ble 11f\n"
+ "ld1w { z6.s }, p5/Z, [x10]\n"
"fmla z8.s, z6.s, z0.s[1]\n"
- "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
- "subs x11, x11, #0x1\n"
+ "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "subs x26, x26, #0x1\n"
"fmla z9.s, z7.s, z0.s[1]\n"
- "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
- "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+ "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
"fmla z10.s, z6.s, z0.s[1]\n"
- "addvl x15, x15, #4\n"
+ "addvl x10, x10, #4\n"
"fmla z11.s, z7.s, z0.s[1]\n"
- "ble 12f\n"
- "ld1w { z6.s }, p5/Z, [x15]\n"
+ "ble 11f\n"
+ "ld1w { z6.s }, p5/Z, [x10]\n"
"fmla z8.s, z6.s, z0.s[2]\n"
- "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
- "subs x11, x11, #0x1\n"
+ "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "subs x26, x26, #0x1\n"
"fmla z9.s, z7.s, z0.s[2]\n"
- "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
- "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+ "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
"fmla z10.s, z6.s, z0.s[2]\n"
- "addvl x15, x15, #4\n"
+ "addvl x10, x10, #4\n"
"fmla z11.s, z7.s, z0.s[2]\n"
- "ble 12f\n"
- "ld1w { z6.s }, p5/Z, [x15]\n"
+ "ble 11f\n"
+ "ld1w { z6.s }, p5/Z, [x10]\n"
"fmla z8.s, z6.s, z0.s[3]\n"
- "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
- "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.s, z7.s, z0.s[3]\n"
- "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
+ "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
"fmla z10.s, z6.s, z0.s[3]\n"
"fmla z11.s, z7.s, z0.s[3]\n"
- "12:" // Height 1: Multiply loop: multiply skip
- "prfm pldl1keep, [x10, #0x80]\n"
- "add x12, x12, #0x1\n"
+ "11:" // Height 1: Multiply loop: multiply skip
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x27, x27, #0x1\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x12, x19\n"
- "bne 7b\n"
- "tbz %x[flags], #1, 13f\n"
+ "cmp x27, x19\n"
+ "bne 6b\n"
+ "tbz %x[flags], #1, 12f\n"
"add x19, %x[args_ptr], %[offset_min]\n"
"ld1rw { z1.s }, p5/Z, [x19]\n"
"add x19, %x[args_ptr], %[offset_max]\n"
@@ -264,63 +259,56 @@ void sve_hybrid_fp32_mla_6x4VL (
"fmax z9.s, p5/M, z9.s, z1.s\n"
"fmax z10.s, p5/M, z10.s, z1.s\n"
"fmax z11.s, p5/M, z11.s, z1.s\n"
- "13:" // Height 1: No activation
- "st1w { z8.s }, p4, [x13]\n"
- "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
- "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
- "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
- "addvl x13, x13, #4\n"
- "14:" // Height 1: Writeback done
- "decw x16, ALL, MUL #4\n"
- "cmp x16, XZR\n"
- "bgt 3b\n"
- "b 86f\n"
- "15:" // Height 2
- "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x14, %x[bias]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "tbz %x[flags], #2, 16f\n"
- "ldr x13, [%x[output_ptr], #0x0]\n"
- "add x13, x13, x19, LSL #2\n"
- "ldr x9, [%x[output_ptr], #0x8]\n"
- "add x9, x9, x19, LSL #2\n"
- "b 17f\n"
- "16:" // Height 2: setup direct output
- "mov x13, %x[output_ptr]\n"
- "add x9, x13, x19, LSL #2\n"
- "17:" // Height 2: Column loop
+ "12:" // Height 1: No activation
+ "st1w { z8.s }, p4, [x28]\n"
+ "st1w { z9.s }, p3, [x28, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x28, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "13:" // Height 1: Writeback done
+ "decw x11, ALL, MUL #4\n"
+ "cmp x11, XZR\n"
+ "bgt 2b\n"
+ "b 80f\n"
+ "14:" // Height 2
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x9, %x[bias]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "15:" // Height 2: Column loop
"mov x19, #0x0\n"
- "whilelt p4.s, x19, x16\n"
+ "whilelt p4.s, x19, x11\n"
"incw x19\n"
- "whilelt p3.s, x19, x16\n"
+ "whilelt p3.s, x19, x11\n"
"incw x19\n"
- "whilelt p2.s, x19, x16\n"
+ "whilelt p2.s, x19, x11\n"
"incw x19\n"
- "whilelt p1.s, x19, x16\n"
- "cbz x14, 18f\n"
- "ld1w { z8.s }, p5/Z, [x14]\n"
+ "whilelt p1.s, x19, x11\n"
+ "cbz x9, 16f\n"
+ "ld1w { z8.s }, p5/Z, [x9]\n"
"mov z12.d, z8.d\n"
- "ld1w { z9.s }, p5/Z, [x14, #1, MUL VL]\n"
- "ld1w { z10.s }, p5/Z, [x14, #2, MUL VL]\n"
+ "ld1w { z9.s }, p5/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z10.s }, p5/Z, [x9, #2, MUL VL]\n"
"mov z13.d, z9.d\n"
- "ld1w { z11.s }, p5/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
+ "ld1w { z11.s }, p5/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
"mov z14.d, z10.d\n"
"mov z15.d, z11.d\n"
- "b 20f\n"
- "18:" // Height 2: no bias
- "tbz %x[flags], #0, 19f\n"
- "ld1w { z8.s }, p4/Z, [x13]\n"
- "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
- "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
- "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
- "ld1w { z12.s }, p4/Z, [x9]\n"
- "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n"
- "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n"
- "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n"
- "b 20f\n"
- "19:" // Height 2: no accumulate
+ "b 18f\n"
+ "16:" // Height 2: no bias
+ "tbz %x[flags], #0, 17f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ld1w { z8.s }, p4/Z, [x28]\n"
+ "add x24, x28, x19, LSL #2\n"
+ "ld1w { z9.s }, p3/Z, [x28, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1w { z11.s }, p1/Z, [x28, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x24]\n"
+ "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
+ "b 18f\n"
+ "17:" // Height 2: no accumulate
"mov z8.b, #0x0\n"
"mov z9.b, #0x0\n"
"mov z10.b, #0x0\n"
@@ -329,160 +317,162 @@ void sve_hybrid_fp32_mla_6x4VL (
"mov z13.b, #0x0\n"
"mov z14.b, #0x0\n"
"mov z15.b, #0x0\n"
- "20:" // Height 2: setup done
- "mov x12, #0x0\n"
- "21:" // Height 2: String loop
+ "18:" // Height 2: setup done
+ "mov x27, #0x0\n"
+ "19:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w11, [x20, x12, LSL #0x2]\n"
- "tbz %x[flags], #3, 22f\n"
- "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 20f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x10, [x20, #0x0]\n"
- "ldr x28, [x20, #0x8]\n"
- "cbnz x12, 23f\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "cbnz x27, 21f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x10, x10, x19, LSL #2\n"
- "add x28, x28, x19, LSL #2\n"
- "b 23f\n"
- "22:" // Height 2: setup direct input
- "mov x10, %x[input_ptr]\n"
- "add x28, x10, x19, LSL #2\n"
- "23:" // Height 2: input setup done
- "cmp x11, #0x4\n"
- "ble 25f\n"
- "24:" // Height 2: Multiply loop: Main loop head
- "ld1w { z6.s }, p5/Z, [x15]\n"
- "whilelt p0.s, XZR, x11\n"
- "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
- "sub x11, x11, #0x4\n"
- "ld1rqw { z0.s }, p0/Z, [x10]\n"
+ "add x25, x25, x19, LSL #2\n"
+ "add x24, x24, x19, LSL #2\n"
+ "b 21f\n"
+ "20:" // Height 2: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19, LSL #2\n"
+ "21:" // Height 2: input setup done
+ "cmp x26, #0x4\n"
+ "ble 23f\n"
+ "22:" // Height 2: Multiply loop: Main loop head
+ "ld1w { z6.s }, p5/Z, [x10]\n"
+ "whilelt p0.s, XZR, x26\n"
+ "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "sub x26, x26, #0x4\n"
+ "ld1rqw { z0.s }, p0/Z, [x25]\n"
"fmla z8.s, z6.s, z0.s[0]\n"
- "ld1rqw { z1.s }, p0/Z, [x28]\n"
- "add x10, x10, #0x10\n"
+ "ld1rqw { z1.s }, p0/Z, [x24]\n"
+ "add x25, x25, #0x10\n"
"fmla z9.s, z7.s, z0.s[0]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
- "add x28, x28, #0x10\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x24, x24, #0x10\n"
"fmla z12.s, z6.s, z1.s[0]\n"
- "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
- "cmp x11, #0x4\n"
+ "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "cmp x26, #0x4\n"
"fmla z13.s, z7.s, z1.s[0]\n"
- "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
+ "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
"fmla z10.s, z6.s, z0.s[0]\n"
"fmla z14.s, z6.s, z1.s[0]\n"
- "ld1w { z6.s }, p5/Z, [x15, #4, MUL VL]\n"
+ "ld1w { z6.s }, p5/Z, [x10, #4, MUL VL]\n"
"fmla z11.s, z7.s, z0.s[0]\n"
"fmla z15.s, z7.s, z1.s[0]\n"
- "ld1w { z7.s }, p5/Z, [x15, #5, MUL VL]\n"
+ "ld1w { z7.s }, p5/Z, [x10, #5, MUL VL]\n"
"fmla z8.s, z6.s, z0.s[1]\n"
"fmla z12.s, z6.s, z1.s[1]\n"
- "ld1w { z6.s }, p5/Z, [x15, #6, MUL VL]\n"
+ "ld1w { z6.s }, p5/Z, [x10, #6, MUL VL]\n"
"fmla z9.s, z7.s, z0.s[1]\n"
"fmla z13.s, z7.s, z1.s[1]\n"
- "ld1w { z7.s }, p5/Z, [x15, #7, MUL VL]\n"
- "addvl x15, x15, #16\n"
+ "ld1w { z7.s }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #16\n"
"fmla z10.s, z6.s, z0.s[1]\n"
"fmla z14.s, z6.s, z1.s[1]\n"
- "ld1w { z6.s }, p5/Z, [x15, #-8, MUL VL]\n"
+ "ld1w { z6.s }, p5/Z, [x10, #-8, MUL VL]\n"
"fmla z11.s, z7.s, z0.s[1]\n"
"fmla z15.s, z7.s, z1.s[1]\n"
- "ld1w { z7.s }, p5/Z, [x15, #-7, MUL VL]\n"
+ "ld1w { z7.s }, p5/Z, [x10, #-7, MUL VL]\n"
"fmla z8.s, z6.s, z0.s[2]\n"
"fmla z12.s, z6.s, z1.s[2]\n"
- "ld1w { z6.s }, p5/Z, [x15, #-6, MUL VL]\n"
+ "ld1w { z6.s }, p5/Z, [x10, #-6, MUL VL]\n"
"fmla z9.s, z7.s, z0.s[2]\n"
"fmla z13.s, z7.s, z1.s[2]\n"
- "ld1w { z7.s }, p5/Z, [x15, #-5, MUL VL]\n"
+ "ld1w { z7.s }, p5/Z, [x10, #-5, MUL VL]\n"
"fmla z10.s, z6.s, z0.s[2]\n"
"fmla z14.s, z6.s, z1.s[2]\n"
- "ld1w { z6.s }, p5/Z, [x15, #-4, MUL VL]\n"
+ "ld1w { z6.s }, p5/Z, [x10, #-4, MUL VL]\n"
"fmla z11.s, z7.s, z0.s[2]\n"
"fmla z15.s, z7.s, z1.s[2]\n"
- "ld1w { z7.s }, p5/Z, [x15, #-3, MUL VL]\n"
+ "ld1w { z7.s }, p5/Z, [x10, #-3, MUL VL]\n"
"fmla z8.s, z6.s, z0.s[3]\n"
"fmla z12.s, z6.s, z1.s[3]\n"
- "ld1w { z6.s }, p5/Z, [x15, #-2, MUL VL]\n"
+ "ld1w { z6.s }, p5/Z, [x10, #-2, MUL VL]\n"
"fmla z9.s, z7.s, z0.s[3]\n"
"fmla z13.s, z7.s, z1.s[3]\n"
- "ld1w { z7.s }, p5/Z, [x15, #-1, MUL VL]\n"
+ "ld1w { z7.s }, p5/Z, [x10, #-1, MUL VL]\n"
"fmla z10.s, z6.s, z0.s[3]\n"
"fmla z14.s, z6.s, z1.s[3]\n"
"fmla z11.s, z7.s, z0.s[3]\n"
"fmla z15.s, z7.s, z1.s[3]\n"
- "bgt 24b\n"
- "25:" // Height 2: Multiply loop: Single iteration only
- "ld1w { z6.s }, p5/Z, [x15]\n"
- "whilelt p0.s, XZR, x11\n"
- "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
- "subs x11, x11, #0x1\n"
- "ld1rqw { z0.s }, p0/Z, [x10]\n"
+ "bgt 22b\n"
+ "23:" // Height 2: Multiply loop: Single iteration only
+ "ld1w { z6.s }, p5/Z, [x10]\n"
+ "whilelt p0.s, XZR, x26\n"
+ "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "subs x26, x26, #0x1\n"
+ "ld1rqw { z0.s }, p0/Z, [x25]\n"
"fmla z8.s, z6.s, z0.s[0]\n"
- "ld1rqw { z1.s }, p0/Z, [x28]\n"
- "add x10, x10, #0x10\n"
+ "ld1rqw { z1.s }, p0/Z, [x24]\n"
+ "add x25, x25, #0x10\n"
"fmla z9.s, z7.s, z0.s[0]\n"
- "add x28, x28, #0x10\n"
+ "add x24, x24, #0x10\n"
"fmla z12.s, z6.s, z1.s[0]\n"
- "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z13.s, z7.s, z1.s[0]\n"
- "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
+ "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
"fmla z10.s, z6.s, z0.s[0]\n"
"fmla z14.s, z6.s, z1.s[0]\n"
"fmla z11.s, z7.s, z0.s[0]\n"
"fmla z15.s, z7.s, z1.s[0]\n"
- "ble 26f\n"
- "ld1w { z6.s }, p5/Z, [x15]\n"
+ "ble 24f\n"
+ "ld1w { z6.s }, p5/Z, [x10]\n"
"fmla z8.s, z6.s, z0.s[1]\n"
- "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
- "subs x11, x11, #0x1\n"
+ "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "subs x26, x26, #0x1\n"
"fmla z12.s, z6.s, z1.s[1]\n"
- "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.s, z7.s, z0.s[1]\n"
"fmla z13.s, z7.s, z1.s[1]\n"
- "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
+ "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
"fmla z10.s, z6.s, z0.s[1]\n"
"fmla z14.s, z6.s, z1.s[1]\n"
"fmla z11.s, z7.s, z0.s[1]\n"
"fmla z15.s, z7.s, z1.s[1]\n"
- "ble 26f\n"
- "ld1w { z6.s }, p5/Z, [x15]\n"
+ "ble 24f\n"
+ "ld1w { z6.s }, p5/Z, [x10]\n"
"fmla z8.s, z6.s, z0.s[2]\n"
- "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
- "subs x11, x11, #0x1\n"
+ "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "subs x26, x26, #0x1\n"
"fmla z12.s, z6.s, z1.s[2]\n"
- "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.s, z7.s, z0.s[2]\n"
"fmla z13.s, z7.s, z1.s[2]\n"
- "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
+ "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
"fmla z10.s, z6.s, z0.s[2]\n"
"fmla z14.s, z6.s, z1.s[2]\n"
"fmla z11.s, z7.s, z0.s[2]\n"
"fmla z15.s, z7.s, z1.s[2]\n"
- "ble 26f\n"
- "ld1w { z6.s }, p5/Z, [x15]\n"
+ "ble 24f\n"
+ "ld1w { z6.s }, p5/Z, [x10]\n"
"fmla z8.s, z6.s, z0.s[3]\n"
- "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+ "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
"fmla z12.s, z6.s, z1.s[3]\n"
- "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.s, z7.s, z0.s[3]\n"
"fmla z13.s, z7.s, z1.s[3]\n"
- "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
+ "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
"fmla z10.s, z6.s, z0.s[3]\n"
"fmla z14.s, z6.s, z1.s[3]\n"
"fmla z11.s, z7.s, z0.s[3]\n"
"fmla z15.s, z7.s, z1.s[3]\n"
- "26:" // Height 2: Multiply loop: multiply skip
- "prfm pldl1keep, [x10, #0x80]\n"
- "add x12, x12, #0x1\n"
- "prfm pldl1keep, [x28, #0x80]\n"
+ "24:" // Height 2: Multiply loop: multiply skip
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x27, x27, #0x1\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x12, x19\n"
- "bne 21b\n"
- "tbz %x[flags], #1, 27f\n"
+ "cmp x27, x19\n"
+ "bne 19b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x28, x19, LSL #2\n"
+ "tbz %x[flags], #1, 25f\n"
"add x19, %x[args_ptr], %[offset_min]\n"
"ld1rw { z1.s }, p5/Z, [x19]\n"
"add x19, %x[args_ptr], %[offset_max]\n"
@@ -503,79 +493,69 @@ void sve_hybrid_fp32_mla_6x4VL (
"fmax z13.s, p5/M, z13.s, z1.s\n"
"fmax z14.s, p5/M, z14.s, z1.s\n"
"fmax z15.s, p5/M, z15.s, z1.s\n"
- "27:" // Height 2: No activation
- "st1w { z8.s }, p4, [x13]\n"
- "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
- "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
- "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
- "addvl x13, x13, #4\n"
- "st1w { z12.s }, p4, [x9]\n"
- "st1w { z13.s }, p3, [x9, #1, MUL VL]\n"
- "st1w { z14.s }, p2, [x9, #2, MUL VL]\n"
- "st1w { z15.s }, p1, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
- "28:" // Height 2: Writeback done
- "decw x16, ALL, MUL #4\n"
- "cmp x16, XZR\n"
- "bgt 17b\n"
- "b 86f\n"
- "29:" // Height 3
- "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x14, %x[bias]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "tbz %x[flags], #2, 30f\n"
- "ldr x13, [%x[output_ptr], #0x0]\n"
- "add x13, x13, x19, LSL #2\n"
- "ldr x9, [%x[output_ptr], #0x8]\n"
- "ldr x27, [%x[output_ptr], #0x10]\n"
- "add x9, x9, x19, LSL #2\n"
- "add x27, x27, x19, LSL #2\n"
- "b 31f\n"
- "30:" // Height 3: setup direct output
- "mov x13, %x[output_ptr]\n"
- "add x9, x13, x19, LSL #2\n"
- "add x27, x9, x19, LSL #2\n"
- "31:" // Height 3: Column loop
+ "25:" // Height 2: No activation
+ "st1w { z8.s }, p4, [x28]\n"
+ "st1w { z9.s }, p3, [x28, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x28, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "st1w { z12.s }, p4, [x24]\n"
+ "st1w { z13.s }, p3, [x24, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x24, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x24, #3, MUL VL]\n"
+ "26:" // Height 2: Writeback done
+ "decw x11, ALL, MUL #4\n"
+ "cmp x11, XZR\n"
+ "bgt 15b\n"
+ "b 80f\n"
+ "27:" // Height 3
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x9, %x[bias]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "28:" // Height 3: Column loop
"mov x19, #0x0\n"
- "whilelt p4.s, x19, x16\n"
+ "whilelt p4.s, x19, x11\n"
"incw x19\n"
- "whilelt p3.s, x19, x16\n"
+ "whilelt p3.s, x19, x11\n"
"incw x19\n"
- "whilelt p2.s, x19, x16\n"
+ "whilelt p2.s, x19, x11\n"
"incw x19\n"
- "whilelt p1.s, x19, x16\n"
- "cbz x14, 32f\n"
- "ld1w { z8.s }, p5/Z, [x14]\n"
+ "whilelt p1.s, x19, x11\n"
+ "cbz x9, 29f\n"
+ "ld1w { z8.s }, p5/Z, [x9]\n"
"mov z12.d, z8.d\n"
- "ld1w { z9.s }, p5/Z, [x14, #1, MUL VL]\n"
+ "ld1w { z9.s }, p5/Z, [x9, #1, MUL VL]\n"
"mov z16.d, z8.d\n"
- "ld1w { z10.s }, p5/Z, [x14, #2, MUL VL]\n"
- "ld1w { z11.s }, p5/Z, [x14, #3, MUL VL]\n"
+ "ld1w { z10.s }, p5/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z11.s }, p5/Z, [x9, #3, MUL VL]\n"
"mov z13.d, z9.d\n"
- "addvl x14, x14, #4\n"
+ "addvl x9, x9, #4\n"
"mov z17.d, z9.d\n"
"mov z14.d, z10.d\n"
"mov z15.d, z11.d\n"
"mov z18.d, z10.d\n"
"mov z19.d, z11.d\n"
- "b 34f\n"
- "32:" // Height 3: no bias
- "tbz %x[flags], #0, 33f\n"
- "ld1w { z8.s }, p4/Z, [x13]\n"
- "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
- "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
- "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
- "ld1w { z12.s }, p4/Z, [x9]\n"
- "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n"
- "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n"
- "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n"
- "ld1w { z16.s }, p4/Z, [x27]\n"
- "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n"
- "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n"
- "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n"
- "b 34f\n"
- "33:" // Height 3: no accumulate
+ "b 31f\n"
+ "29:" // Height 3: no bias
+ "tbz %x[flags], #0, 30f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ld1w { z8.s }, p4/Z, [x28]\n"
+ "add x24, x28, x19, LSL #2\n"
+ "ld1w { z9.s }, p3/Z, [x28, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x28, #2, MUL VL]\n"
+ "add x23, x24, x19, LSL #2\n"
+ "ld1w { z11.s }, p1/Z, [x28, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x24]\n"
+ "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x23]\n"
+ "ld1w { z17.s }, p3/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "b 31f\n"
+ "30:" // Height 3: no accumulate
"mov z8.b, #0x0\n"
"mov z9.b, #0x0\n"
"mov z10.b, #0x0\n"
@@ -588,201 +568,204 @@ void sve_hybrid_fp32_mla_6x4VL (
"mov z17.b, #0x0\n"
"mov z18.b, #0x0\n"
"mov z19.b, #0x0\n"
- "34:" // Height 3: setup done
- "mov x12, #0x0\n"
- "35:" // Height 3: String loop
+ "31:" // Height 3: setup done
+ "mov x27, #0x0\n"
+ "32:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w11, [x20, x12, LSL #0x2]\n"
- "tbz %x[flags], #3, 36f\n"
- "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 33f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x10, [x20, #0x0]\n"
- "ldr x28, [x20, #0x8]\n"
- "ldr x26, [x20, #0x10]\n"
- "cbnz x12, 37f\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "cbnz x27, 34f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x10, x10, x19, LSL #2\n"
- "add x28, x28, x19, LSL #2\n"
- "add x26, x26, x19, LSL #2\n"
- "b 37f\n"
- "36:" // Height 3: setup direct input
- "mov x10, %x[input_ptr]\n"
- "add x28, x10, x19, LSL #2\n"
- "add x26, x28, x19, LSL #2\n"
- "37:" // Height 3: input setup done
- "cmp x11, #0x4\n"
- "ble 39f\n"
- "38:" // Height 3: Multiply loop: Main loop head
- "ld1w { z6.s }, p5/Z, [x15]\n"
- "whilelt p0.s, XZR, x11\n"
- "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
- "sub x11, x11, #0x4\n"
- "ld1rqw { z0.s }, p0/Z, [x10]\n"
+ "add x25, x25, x19, LSL #2\n"
+ "add x24, x24, x19, LSL #2\n"
+ "add x23, x23, x19, LSL #2\n"
+ "b 34f\n"
+ "33:" // Height 3: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "34:" // Height 3: input setup done
+ "cmp x26, #0x4\n"
+ "ble 36f\n"
+ "35:" // Height 3: Multiply loop: Main loop head
+ "ld1w { z6.s }, p5/Z, [x10]\n"
+ "whilelt p0.s, XZR, x26\n"
+ "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "sub x26, x26, #0x4\n"
+ "ld1rqw { z0.s }, p0/Z, [x25]\n"
"fmla z8.s, z6.s, z0.s[0]\n"
- "ld1rqw { z1.s }, p0/Z, [x28]\n"
- "add x10, x10, #0x10\n"
+ "ld1rqw { z1.s }, p0/Z, [x24]\n"
+ "add x25, x25, #0x10\n"
"fmla z9.s, z7.s, z0.s[0]\n"
- "ld1rqw { z2.s }, p0/Z, [x26]\n"
- "add x28, x28, #0x10\n"
+ "ld1rqw { z2.s }, p0/Z, [x23]\n"
+ "add x24, x24, #0x10\n"
"fmla z12.s, z6.s, z1.s[0]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
- "add x26, x26, #0x10\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x23, x23, #0x10\n"
"fmla z16.s, z6.s, z2.s[0]\n"
- "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
- "cmp x11, #0x4\n"
+ "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "cmp x26, #0x4\n"
"fmla z13.s, z7.s, z1.s[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
"fmla z17.s, z7.s, z2.s[0]\n"
- "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
"fmla z10.s, z6.s, z0.s[0]\n"
"fmla z14.s, z6.s, z1.s[0]\n"
"fmla z18.s, z6.s, z2.s[0]\n"
- "ld1w { z6.s }, p5/Z, [x15, #4, MUL VL]\n"
+ "ld1w { z6.s }, p5/Z, [x10, #4, MUL VL]\n"
"fmla z11.s, z7.s, z0.s[0]\n"
"fmla z15.s, z7.s, z1.s[0]\n"
"fmla z19.s, z7.s, z2.s[0]\n"
- "ld1w { z7.s }, p5/Z, [x15, #5, MUL VL]\n"
+ "ld1w { z7.s }, p5/Z, [x10, #5, MUL VL]\n"
"fmla z8.s, z6.s, z0.s[1]\n"
"fmla z12.s, z6.s, z1.s[1]\n"
"fmla z16.s, z6.s, z2.s[1]\n"
- "ld1w { z6.s }, p5/Z, [x15, #6, MUL VL]\n"
+ "ld1w { z6.s }, p5/Z, [x10, #6, MUL VL]\n"
"fmla z9.s, z7.s, z0.s[1]\n"
"fmla z13.s, z7.s, z1.s[1]\n"
"fmla z17.s, z7.s, z2.s[1]\n"
- "ld1w { z7.s }, p5/Z, [x15, #7, MUL VL]\n"
- "addvl x15, x15, #16\n"
+ "ld1w { z7.s }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #16\n"
"fmla z10.s, z6.s, z0.s[1]\n"
"fmla z14.s, z6.s, z1.s[1]\n"
"fmla z18.s, z6.s, z2.s[1]\n"
- "ld1w { z6.s }, p5/Z, [x15, #-8, MUL VL]\n"
+ "ld1w { z6.s }, p5/Z, [x10, #-8, MUL VL]\n"
"fmla z11.s, z7.s, z0.s[1]\n"
"fmla z15.s, z7.s, z1.s[1]\n"
"fmla z19.s, z7.s, z2.s[1]\n"
- "ld1w { z7.s }, p5/Z, [x15, #-7, MUL VL]\n"
+ "ld1w { z7.s }, p5/Z, [x10, #-7, MUL VL]\n"
"fmla z8.s, z6.s, z0.s[2]\n"
"fmla z12.s, z6.s, z1.s[2]\n"
"fmla z16.s, z6.s, z2.s[2]\n"
- "ld1w { z6.s }, p5/Z, [x15, #-6, MUL VL]\n"
+ "ld1w { z6.s }, p5/Z, [x10, #-6, MUL VL]\n"
"fmla z9.s, z7.s, z0.s[2]\n"
"fmla z13.s, z7.s, z1.s[2]\n"
"fmla z17.s, z7.s, z2.s[2]\n"
- "ld1w { z7.s }, p5/Z, [x15, #-5, MUL VL]\n"
+ "ld1w { z7.s }, p5/Z, [x10, #-5, MUL VL]\n"
"fmla z10.s, z6.s, z0.s[2]\n"
"fmla z14.s, z6.s, z1.s[2]\n"
"fmla z18.s, z6.s, z2.s[2]\n"
- "ld1w { z6.s }, p5/Z, [x15, #-4, MUL VL]\n"
+ "ld1w { z6.s }, p5/Z, [x10, #-4, MUL VL]\n"
"fmla z11.s, z7.s, z0.s[2]\n"
"fmla z15.s, z7.s, z1.s[2]\n"
"fmla z19.s, z7.s, z2.s[2]\n"
- "ld1w { z7.s }, p5/Z, [x15, #-3, MUL VL]\n"
+ "ld1w { z7.s }, p5/Z, [x10, #-3, MUL VL]\n"
"fmla z8.s, z6.s, z0.s[3]\n"
"fmla z12.s, z6.s, z1.s[3]\n"
"fmla z16.s, z6.s, z2.s[3]\n"
- "ld1w { z6.s }, p5/Z, [x15, #-2, MUL VL]\n"
+ "ld1w { z6.s }, p5/Z, [x10, #-2, MUL VL]\n"
"fmla z9.s, z7.s, z0.s[3]\n"
"fmla z13.s, z7.s, z1.s[3]\n"
"fmla z17.s, z7.s, z2.s[3]\n"
- "ld1w { z7.s }, p5/Z, [x15, #-1, MUL VL]\n"
+ "ld1w { z7.s }, p5/Z, [x10, #-1, MUL VL]\n"
"fmla z10.s, z6.s, z0.s[3]\n"
"fmla z14.s, z6.s, z1.s[3]\n"
"fmla z18.s, z6.s, z2.s[3]\n"
"fmla z11.s, z7.s, z0.s[3]\n"
"fmla z15.s, z7.s, z1.s[3]\n"
"fmla z19.s, z7.s, z2.s[3]\n"
- "bgt 38b\n"
- "39:" // Height 3: Multiply loop: Single iteration only
- "ld1w { z6.s }, p5/Z, [x15]\n"
- "whilelt p0.s, XZR, x11\n"
- "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
- "subs x11, x11, #0x1\n"
- "ld1rqw { z0.s }, p0/Z, [x10]\n"
+ "bgt 35b\n"
+ "36:" // Height 3: Multiply loop: Single iteration only
+ "ld1w { z6.s }, p5/Z, [x10]\n"
+ "whilelt p0.s, XZR, x26\n"
+ "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "subs x26, x26, #0x1\n"
+ "ld1rqw { z0.s }, p0/Z, [x25]\n"
"fmla z8.s, z6.s, z0.s[0]\n"
- "ld1rqw { z1.s }, p0/Z, [x28]\n"
- "add x10, x10, #0x10\n"
+ "ld1rqw { z1.s }, p0/Z, [x24]\n"
+ "add x25, x25, #0x10\n"
"fmla z9.s, z7.s, z0.s[0]\n"
- "ld1rqw { z2.s }, p0/Z, [x26]\n"
- "add x28, x28, #0x10\n"
+ "ld1rqw { z2.s }, p0/Z, [x23]\n"
+ "add x24, x24, #0x10\n"
"fmla z12.s, z6.s, z1.s[0]\n"
- "add x26, x26, #0x10\n"
+ "add x23, x23, #0x10\n"
"fmla z13.s, z7.s, z1.s[0]\n"
"fmla z16.s, z6.s, z2.s[0]\n"
- "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z17.s, z7.s, z2.s[0]\n"
- "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
+ "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
"fmla z10.s, z6.s, z0.s[0]\n"
"fmla z14.s, z6.s, z1.s[0]\n"
"fmla z18.s, z6.s, z2.s[0]\n"
"fmla z11.s, z7.s, z0.s[0]\n"
"fmla z15.s, z7.s, z1.s[0]\n"
"fmla z19.s, z7.s, z2.s[0]\n"
- "ble 40f\n"
- "ld1w { z6.s }, p5/Z, [x15]\n"
+ "ble 37f\n"
+ "ld1w { z6.s }, p5/Z, [x10]\n"
"fmla z8.s, z6.s, z0.s[1]\n"
- "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
- "subs x11, x11, #0x1\n"
+ "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "subs x26, x26, #0x1\n"
"fmla z12.s, z6.s, z1.s[1]\n"
"fmla z16.s, z6.s, z2.s[1]\n"
- "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.s, z7.s, z0.s[1]\n"
"fmla z13.s, z7.s, z1.s[1]\n"
"fmla z17.s, z7.s, z2.s[1]\n"
- "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
+ "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
"fmla z10.s, z6.s, z0.s[1]\n"
"fmla z14.s, z6.s, z1.s[1]\n"
"fmla z18.s, z6.s, z2.s[1]\n"
"fmla z11.s, z7.s, z0.s[1]\n"
"fmla z15.s, z7.s, z1.s[1]\n"
"fmla z19.s, z7.s, z2.s[1]\n"
- "ble 40f\n"
- "ld1w { z6.s }, p5/Z, [x15]\n"
+ "ble 37f\n"
+ "ld1w { z6.s }, p5/Z, [x10]\n"
"fmla z8.s, z6.s, z0.s[2]\n"
- "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
- "subs x11, x11, #0x1\n"
+ "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "subs x26, x26, #0x1\n"
"fmla z12.s, z6.s, z1.s[2]\n"
"fmla z16.s, z6.s, z2.s[2]\n"
- "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.s, z7.s, z0.s[2]\n"
"fmla z13.s, z7.s, z1.s[2]\n"
"fmla z17.s, z7.s, z2.s[2]\n"
- "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
+ "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
"fmla z10.s, z6.s, z0.s[2]\n"
"fmla z14.s, z6.s, z1.s[2]\n"
"fmla z18.s, z6.s, z2.s[2]\n"
"fmla z11.s, z7.s, z0.s[2]\n"
"fmla z15.s, z7.s, z1.s[2]\n"
"fmla z19.s, z7.s, z2.s[2]\n"
- "ble 40f\n"
- "ld1w { z6.s }, p5/Z, [x15]\n"
+ "ble 37f\n"
+ "ld1w { z6.s }, p5/Z, [x10]\n"
"fmla z8.s, z6.s, z0.s[3]\n"
- "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+ "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
"fmla z12.s, z6.s, z1.s[3]\n"
"fmla z16.s, z6.s, z2.s[3]\n"
- "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.s, z7.s, z0.s[3]\n"
"fmla z13.s, z7.s, z1.s[3]\n"
"fmla z17.s, z7.s, z2.s[3]\n"
- "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
+ "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
"fmla z10.s, z6.s, z0.s[3]\n"
"fmla z14.s, z6.s, z1.s[3]\n"
"fmla z18.s, z6.s, z2.s[3]\n"
"fmla z11.s, z7.s, z0.s[3]\n"
"fmla z15.s, z7.s, z1.s[3]\n"
"fmla z19.s, z7.s, z2.s[3]\n"
- "40:" // Height 3: Multiply loop: multiply skip
- "prfm pldl1keep, [x10, #0x80]\n"
- "add x12, x12, #0x1\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "37:" // Height 3: Multiply loop: multiply skip
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x27, x27, #0x1\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x12, x19\n"
- "bne 35b\n"
- "tbz %x[flags], #1, 41f\n"
+ "cmp x27, x19\n"
+ "bne 32b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x28, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "tbz %x[flags], #1, 38f\n"
"add x19, %x[args_ptr], %[offset_min]\n"
"ld1rw { z1.s }, p5/Z, [x19]\n"
"add x19, %x[args_ptr], %[offset_max]\n"
@@ -811,65 +794,48 @@ void sve_hybrid_fp32_mla_6x4VL (
"fmax z17.s, p5/M, z17.s, z1.s\n"
"fmax z18.s, p5/M, z18.s, z1.s\n"
"fmax z19.s, p5/M, z19.s, z1.s\n"
- "41:" // Height 3: No activation
- "st1w { z8.s }, p4, [x13]\n"
- "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
- "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
- "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
- "addvl x13, x13, #4\n"
- "st1w { z12.s }, p4, [x9]\n"
- "st1w { z13.s }, p3, [x9, #1, MUL VL]\n"
- "st1w { z14.s }, p2, [x9, #2, MUL VL]\n"
- "st1w { z15.s }, p1, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
- "st1w { z16.s }, p4, [x27]\n"
- "st1w { z17.s }, p3, [x27, #1, MUL VL]\n"
- "st1w { z18.s }, p2, [x27, #2, MUL VL]\n"
- "st1w { z19.s }, p1, [x27, #3, MUL VL]\n"
- "addvl x27, x27, #4\n"
- "42:" // Height 3: Writeback done
- "decw x16, ALL, MUL #4\n"
- "cmp x16, XZR\n"
- "bgt 31b\n"
- "b 86f\n"
- "43:" // Height 4
- "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x14, %x[bias]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "tbz %x[flags], #2, 44f\n"
- "ldr x13, [%x[output_ptr], #0x0]\n"
- "add x13, x13, x19, LSL #2\n"
- "ldr x9, [%x[output_ptr], #0x8]\n"
- "ldr x27, [%x[output_ptr], #0x10]\n"
- "add x9, x9, x19, LSL #2\n"
- "ldr x25, [%x[output_ptr], #0x18]\n"
- "add x27, x27, x19, LSL #2\n"
- "add x25, x25, x19, LSL #2\n"
- "b 45f\n"
- "44:" // Height 4: setup direct output
- "mov x13, %x[output_ptr]\n"
- "add x9, x13, x19, LSL #2\n"
- "add x27, x9, x19, LSL #2\n"
- "add x25, x27, x19, LSL #2\n"
- "45:" // Height 4: Column loop
+ "38:" // Height 3: No activation
+ "st1w { z8.s }, p4, [x28]\n"
+ "st1w { z9.s }, p3, [x28, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x28, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "st1w { z12.s }, p4, [x24]\n"
+ "st1w { z13.s }, p3, [x24, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x24, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x24, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x23]\n"
+ "st1w { z17.s }, p3, [x23, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x23, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x23, #3, MUL VL]\n"
+ "39:" // Height 3: Writeback done
+ "decw x11, ALL, MUL #4\n"
+ "cmp x11, XZR\n"
+ "bgt 28b\n"
+ "b 80f\n"
+ "40:" // Height 4
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x9, %x[bias]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "41:" // Height 4: Column loop
"mov x19, #0x0\n"
- "whilelt p4.s, x19, x16\n"
+ "whilelt p4.s, x19, x11\n"
"incw x19\n"
- "whilelt p3.s, x19, x16\n"
+ "whilelt p3.s, x19, x11\n"
"incw x19\n"
- "whilelt p2.s, x19, x16\n"
+ "whilelt p2.s, x19, x11\n"
"incw x19\n"
- "whilelt p1.s, x19, x16\n"
- "cbz x14, 46f\n"
- "ld1w { z8.s }, p5/Z, [x14]\n"
+ "whilelt p1.s, x19, x11\n"
+ "cbz x9, 42f\n"
+ "ld1w { z8.s }, p5/Z, [x9]\n"
"mov z12.d, z8.d\n"
- "ld1w { z9.s }, p5/Z, [x14, #1, MUL VL]\n"
+ "ld1w { z9.s }, p5/Z, [x9, #1, MUL VL]\n"
"mov z16.d, z8.d\n"
- "ld1w { z10.s }, p5/Z, [x14, #2, MUL VL]\n"
+ "ld1w { z10.s }, p5/Z, [x9, #2, MUL VL]\n"
"mov z20.d, z8.d\n"
- "ld1w { z11.s }, p5/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
+ "ld1w { z11.s }, p5/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
"mov z13.d, z9.d\n"
"mov z17.d, z9.d\n"
"mov z14.d, z10.d\n"
@@ -879,27 +845,31 @@ void sve_hybrid_fp32_mla_6x4VL (
"mov z21.d, z9.d\n"
"mov z22.d, z10.d\n"
"mov z23.d, z11.d\n"
- "b 48f\n"
- "46:" // Height 4: no bias
- "tbz %x[flags], #0, 47f\n"
- "ld1w { z8.s }, p4/Z, [x13]\n"
- "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
- "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
- "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
- "ld1w { z12.s }, p4/Z, [x9]\n"
- "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n"
- "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n"
- "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n"
- "ld1w { z16.s }, p4/Z, [x27]\n"
- "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n"
- "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n"
- "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n"
- "ld1w { z20.s }, p4/Z, [x25]\n"
- "ld1w { z21.s }, p3/Z, [x25, #1, MUL VL]\n"
- "ld1w { z22.s }, p2/Z, [x25, #2, MUL VL]\n"
- "ld1w { z23.s }, p1/Z, [x25, #3, MUL VL]\n"
- "b 48f\n"
- "47:" // Height 4: no accumulate
+ "b 44f\n"
+ "42:" // Height 4: no bias
+ "tbz %x[flags], #0, 43f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ld1w { z8.s }, p4/Z, [x28]\n"
+ "add x24, x28, x19, LSL #2\n"
+ "ld1w { z9.s }, p3/Z, [x28, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x28, #2, MUL VL]\n"
+ "add x23, x24, x19, LSL #2\n"
+ "ld1w { z11.s }, p1/Z, [x28, #3, MUL VL]\n"
+ "add x22, x23, x19, LSL #2\n"
+ "ld1w { z12.s }, p4/Z, [x24]\n"
+ "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x23]\n"
+ "ld1w { z17.s }, p3/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x22]\n"
+ "ld1w { z21.s }, p3/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z23.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "b 44f\n"
+ "43:" // Height 4: no accumulate
"mov z8.b, #0x0\n"
"mov z9.b, #0x0\n"
"mov z10.b, #0x0\n"
@@ -916,123 +886,123 @@ void sve_hybrid_fp32_mla_6x4VL (
"mov z21.b, #0x0\n"
"mov z22.b, #0x0\n"
"mov z23.b, #0x0\n"
- "48:" // Height 4: setup done
- "mov x12, #0x0\n"
- "49:" // Height 4: String loop
+ "44:" // Height 4: setup done
+ "mov x27, #0x0\n"
+ "45:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w11, [x20, x12, LSL #0x2]\n"
- "tbz %x[flags], #3, 50f\n"
- "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 46f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x10, [x20, #0x0]\n"
- "ldr x28, [x20, #0x8]\n"
- "ldr x26, [x20, #0x10]\n"
- "ldr x24, [x20, #0x18]\n"
- "cbnz x12, 51f\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "ldr x22, [x20, #0x18]\n"
+ "cbnz x27, 47f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x10, x10, x19, LSL #2\n"
- "add x28, x28, x19, LSL #2\n"
- "add x26, x26, x19, LSL #2\n"
+ "add x25, x25, x19, LSL #2\n"
"add x24, x24, x19, LSL #2\n"
- "b 51f\n"
- "50:" // Height 4: setup direct input
- "mov x10, %x[input_ptr]\n"
- "add x28, x10, x19, LSL #2\n"
- "add x26, x28, x19, LSL #2\n"
- "add x24, x26, x19, LSL #2\n"
- "51:" // Height 4: input setup done
- "cmp x11, #0x4\n"
- "ble 53f\n"
- "52:" // Height 4: Multiply loop: Main loop head
- "ld1w { z6.s }, p5/Z, [x15]\n"
- "whilelt p0.s, XZR, x11\n"
- "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
- "sub x11, x11, #0x4\n"
- "ld1rqw { z0.s }, p0/Z, [x10]\n"
+ "add x23, x23, x19, LSL #2\n"
+ "add x22, x22, x19, LSL #2\n"
+ "b 47f\n"
+ "46:" // Height 4: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "47:" // Height 4: input setup done
+ "cmp x26, #0x4\n"
+ "ble 49f\n"
+ "48:" // Height 4: Multiply loop: Main loop head
+ "ld1w { z6.s }, p5/Z, [x10]\n"
+ "whilelt p0.s, XZR, x26\n"
+ "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "sub x26, x26, #0x4\n"
+ "ld1rqw { z0.s }, p0/Z, [x25]\n"
"fmla z8.s, z6.s, z0.s[0]\n"
- "ld1rqw { z1.s }, p0/Z, [x28]\n"
- "add x10, x10, #0x10\n"
+ "ld1rqw { z1.s }, p0/Z, [x24]\n"
+ "add x25, x25, #0x10\n"
"fmla z9.s, z7.s, z0.s[0]\n"
- "ld1rqw { z2.s }, p0/Z, [x26]\n"
- "add x28, x28, #0x10\n"
+ "ld1rqw { z2.s }, p0/Z, [x23]\n"
+ "add x24, x24, #0x10\n"
"fmla z12.s, z6.s, z1.s[0]\n"
- "ld1rqw { z3.s }, p0/Z, [x24]\n"
- "add x26, x26, #0x10\n"
+ "ld1rqw { z3.s }, p0/Z, [x22]\n"
+ "add x23, x23, #0x10\n"
"fmla z16.s, z6.s, z2.s[0]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
- "add x24, x24, #0x10\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x22, x22, #0x10\n"
"fmla z13.s, z7.s, z1.s[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "cmp x11, #0x4\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "cmp x26, #0x4\n"
"fmla z20.s, z6.s, z3.s[0]\n"
- "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z17.s, z7.s, z2.s[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
"fmla z21.s, z7.s, z3.s[0]\n"
- "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
+ "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
"fmla z10.s, z6.s, z0.s[0]\n"
"fmla z14.s, z6.s, z1.s[0]\n"
"fmla z18.s, z6.s, z2.s[0]\n"
"fmla z22.s, z6.s, z3.s[0]\n"
- "ld1w { z6.s }, p5/Z, [x15, #4, MUL VL]\n"
+ "ld1w { z6.s }, p5/Z, [x10, #4, MUL VL]\n"
"fmla z11.s, z7.s, z0.s[0]\n"
"fmla z15.s, z7.s, z1.s[0]\n"
"fmla z19.s, z7.s, z2.s[0]\n"
"fmla z23.s, z7.s, z3.s[0]\n"
- "ld1w { z7.s }, p5/Z, [x15, #5, MUL VL]\n"
+ "ld1w { z7.s }, p5/Z, [x10, #5, MUL VL]\n"
"fmla z8.s, z6.s, z0.s[1]\n"
"fmla z12.s, z6.s, z1.s[1]\n"
"fmla z16.s, z6.s, z2.s[1]\n"
"fmla z20.s, z6.s, z3.s[1]\n"
- "ld1w { z6.s }, p5/Z, [x15, #6, MUL VL]\n"
+ "ld1w { z6.s }, p5/Z, [x10, #6, MUL VL]\n"
"fmla z9.s, z7.s, z0.s[1]\n"
"fmla z13.s, z7.s, z1.s[1]\n"
"fmla z17.s, z7.s, z2.s[1]\n"
"fmla z21.s, z7.s, z3.s[1]\n"
- "ld1w { z7.s }, p5/Z, [x15, #7, MUL VL]\n"
- "addvl x15, x15, #16\n"
+ "ld1w { z7.s }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #16\n"
"fmla z10.s, z6.s, z0.s[1]\n"
"fmla z14.s, z6.s, z1.s[1]\n"
"fmla z18.s, z6.s, z2.s[1]\n"
"fmla z22.s, z6.s, z3.s[1]\n"
- "ld1w { z6.s }, p5/Z, [x15, #-8, MUL VL]\n"
+ "ld1w { z6.s }, p5/Z, [x10, #-8, MUL VL]\n"
"fmla z11.s, z7.s, z0.s[1]\n"
"fmla z15.s, z7.s, z1.s[1]\n"
"fmla z19.s, z7.s, z2.s[1]\n"
"fmla z23.s, z7.s, z3.s[1]\n"
- "ld1w { z7.s }, p5/Z, [x15, #-7, MUL VL]\n"
+ "ld1w { z7.s }, p5/Z, [x10, #-7, MUL VL]\n"
"fmla z8.s, z6.s, z0.s[2]\n"
"fmla z12.s, z6.s, z1.s[2]\n"
"fmla z16.s, z6.s, z2.s[2]\n"
"fmla z20.s, z6.s, z3.s[2]\n"
- "ld1w { z6.s }, p5/Z, [x15, #-6, MUL VL]\n"
+ "ld1w { z6.s }, p5/Z, [x10, #-6, MUL VL]\n"
"fmla z9.s, z7.s, z0.s[2]\n"
"fmla z13.s, z7.s, z1.s[2]\n"
"fmla z17.s, z7.s, z2.s[2]\n"
"fmla z21.s, z7.s, z3.s[2]\n"
- "ld1w { z7.s }, p5/Z, [x15, #-5, MUL VL]\n"
+ "ld1w { z7.s }, p5/Z, [x10, #-5, MUL VL]\n"
"fmla z10.s, z6.s, z0.s[2]\n"
"fmla z14.s, z6.s, z1.s[2]\n"
"fmla z18.s, z6.s, z2.s[2]\n"
"fmla z22.s, z6.s, z3.s[2]\n"
- "ld1w { z6.s }, p5/Z, [x15, #-4, MUL VL]\n"
+ "ld1w { z6.s }, p5/Z, [x10, #-4, MUL VL]\n"
"fmla z11.s, z7.s, z0.s[2]\n"
"fmla z15.s, z7.s, z1.s[2]\n"
"fmla z19.s, z7.s, z2.s[2]\n"
"fmla z23.s, z7.s, z3.s[2]\n"
- "ld1w { z7.s }, p5/Z, [x15, #-3, MUL VL]\n"
+ "ld1w { z7.s }, p5/Z, [x10, #-3, MUL VL]\n"
"fmla z8.s, z6.s, z0.s[3]\n"
"fmla z12.s, z6.s, z1.s[3]\n"
"fmla z16.s, z6.s, z2.s[3]\n"
"fmla z20.s, z6.s, z3.s[3]\n"
- "ld1w { z6.s }, p5/Z, [x15, #-2, MUL VL]\n"
+ "ld1w { z6.s }, p5/Z, [x10, #-2, MUL VL]\n"
"fmla z9.s, z7.s, z0.s[3]\n"
"fmla z13.s, z7.s, z1.s[3]\n"
"fmla z17.s, z7.s, z2.s[3]\n"
"fmla z21.s, z7.s, z3.s[3]\n"
- "ld1w { z7.s }, p5/Z, [x15, #-1, MUL VL]\n"
+ "ld1w { z7.s }, p5/Z, [x10, #-1, MUL VL]\n"
"fmla z10.s, z6.s, z0.s[3]\n"
"fmla z14.s, z6.s, z1.s[3]\n"
"fmla z18.s, z6.s, z2.s[3]\n"
@@ -1041,31 +1011,31 @@ void sve_hybrid_fp32_mla_6x4VL (
"fmla z15.s, z7.s, z1.s[3]\n"
"fmla z19.s, z7.s, z2.s[3]\n"
"fmla z23.s, z7.s, z3.s[3]\n"
- "bgt 52b\n"
- "53:" // Height 4: Multiply loop: Single iteration only
- "ld1w { z6.s }, p5/Z, [x15]\n"
- "whilelt p0.s, XZR, x11\n"
- "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
- "subs x11, x11, #0x1\n"
- "ld1rqw { z0.s }, p0/Z, [x10]\n"
+ "bgt 48b\n"
+ "49:" // Height 4: Multiply loop: Single iteration only
+ "ld1w { z6.s }, p5/Z, [x10]\n"
+ "whilelt p0.s, XZR, x26\n"
+ "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "subs x26, x26, #0x1\n"
+ "ld1rqw { z0.s }, p0/Z, [x25]\n"
"fmla z8.s, z6.s, z0.s[0]\n"
- "ld1rqw { z1.s }, p0/Z, [x28]\n"
- "add x10, x10, #0x10\n"
+ "ld1rqw { z1.s }, p0/Z, [x24]\n"
+ "add x25, x25, #0x10\n"
"fmla z9.s, z7.s, z0.s[0]\n"
- "ld1rqw { z2.s }, p0/Z, [x26]\n"
- "add x28, x28, #0x10\n"
+ "ld1rqw { z2.s }, p0/Z, [x23]\n"
+ "add x24, x24, #0x10\n"
"fmla z12.s, z6.s, z1.s[0]\n"
- "ld1rqw { z3.s }, p0/Z, [x24]\n"
- "add x26, x26, #0x10\n"
+ "ld1rqw { z3.s }, p0/Z, [x22]\n"
+ "add x23, x23, #0x10\n"
"fmla z16.s, z6.s, z2.s[0]\n"
- "add x24, x24, #0x10\n"
+ "add x22, x22, #0x10\n"
"fmla z13.s, z7.s, z1.s[0]\n"
"fmla z17.s, z7.s, z2.s[0]\n"
"fmla z20.s, z6.s, z3.s[0]\n"
- "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z21.s, z7.s, z3.s[0]\n"
- "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
+ "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
"fmla z10.s, z6.s, z0.s[0]\n"
"fmla z14.s, z6.s, z1.s[0]\n"
"fmla z18.s, z6.s, z2.s[0]\n"
@@ -1074,21 +1044,21 @@ void sve_hybrid_fp32_mla_6x4VL (
"fmla z15.s, z7.s, z1.s[0]\n"
"fmla z19.s, z7.s, z2.s[0]\n"
"fmla z23.s, z7.s, z3.s[0]\n"
- "ble 54f\n"
- "ld1w { z6.s }, p5/Z, [x15]\n"
+ "ble 50f\n"
+ "ld1w { z6.s }, p5/Z, [x10]\n"
"fmla z8.s, z6.s, z0.s[1]\n"
- "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
- "subs x11, x11, #0x1\n"
+ "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "subs x26, x26, #0x1\n"
"fmla z12.s, z6.s, z1.s[1]\n"
"fmla z16.s, z6.s, z2.s[1]\n"
"fmla z20.s, z6.s, z3.s[1]\n"
- "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.s, z7.s, z0.s[1]\n"
"fmla z13.s, z7.s, z1.s[1]\n"
"fmla z17.s, z7.s, z2.s[1]\n"
"fmla z21.s, z7.s, z3.s[1]\n"
- "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
+ "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
"fmla z10.s, z6.s, z0.s[1]\n"
"fmla z14.s, z6.s, z1.s[1]\n"
"fmla z18.s, z6.s, z2.s[1]\n"
@@ -1097,21 +1067,21 @@ void sve_hybrid_fp32_mla_6x4VL (
"fmla z15.s, z7.s, z1.s[1]\n"
"fmla z19.s, z7.s, z2.s[1]\n"
"fmla z23.s, z7.s, z3.s[1]\n"
- "ble 54f\n"
- "ld1w { z6.s }, p5/Z, [x15]\n"
+ "ble 50f\n"
+ "ld1w { z6.s }, p5/Z, [x10]\n"
"fmla z8.s, z6.s, z0.s[2]\n"
- "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
- "subs x11, x11, #0x1\n"
+ "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "subs x26, x26, #0x1\n"
"fmla z12.s, z6.s, z1.s[2]\n"
"fmla z16.s, z6.s, z2.s[2]\n"
"fmla z20.s, z6.s, z3.s[2]\n"
- "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.s, z7.s, z0.s[2]\n"
"fmla z13.s, z7.s, z1.s[2]\n"
"fmla z17.s, z7.s, z2.s[2]\n"
"fmla z21.s, z7.s, z3.s[2]\n"
- "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
+ "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
"fmla z10.s, z6.s, z0.s[2]\n"
"fmla z14.s, z6.s, z1.s[2]\n"
"fmla z18.s, z6.s, z2.s[2]\n"
@@ -1120,20 +1090,20 @@ void sve_hybrid_fp32_mla_6x4VL (
"fmla z15.s, z7.s, z1.s[2]\n"
"fmla z19.s, z7.s, z2.s[2]\n"
"fmla z23.s, z7.s, z3.s[2]\n"
- "ble 54f\n"
- "ld1w { z6.s }, p5/Z, [x15]\n"
+ "ble 50f\n"
+ "ld1w { z6.s }, p5/Z, [x10]\n"
"fmla z8.s, z6.s, z0.s[3]\n"
- "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+ "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
"fmla z12.s, z6.s, z1.s[3]\n"
"fmla z16.s, z6.s, z2.s[3]\n"
"fmla z20.s, z6.s, z3.s[3]\n"
- "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.s, z7.s, z0.s[3]\n"
"fmla z13.s, z7.s, z1.s[3]\n"
"fmla z17.s, z7.s, z2.s[3]\n"
"fmla z21.s, z7.s, z3.s[3]\n"
- "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
+ "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
"fmla z10.s, z6.s, z0.s[3]\n"
"fmla z14.s, z6.s, z1.s[3]\n"
"fmla z18.s, z6.s, z2.s[3]\n"
@@ -1142,16 +1112,20 @@ void sve_hybrid_fp32_mla_6x4VL (
"fmla z15.s, z7.s, z1.s[3]\n"
"fmla z19.s, z7.s, z2.s[3]\n"
"fmla z23.s, z7.s, z3.s[3]\n"
- "54:" // Height 4: Multiply loop: multiply skip
- "prfm pldl1keep, [x10, #0x80]\n"
- "add x12, x12, #0x1\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "50:" // Height 4: Multiply loop: multiply skip
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x27, x27, #0x1\n"
"prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x12, x19\n"
- "bne 49b\n"
- "tbz %x[flags], #1, 55f\n"
+ "cmp x27, x19\n"
+ "bne 45b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x28, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "tbz %x[flags], #1, 51f\n"
"add x19, %x[args_ptr], %[offset_min]\n"
"ld1rw { z1.s }, p5/Z, [x19]\n"
"add x19, %x[args_ptr], %[offset_max]\n"
@@ -1188,73 +1162,52 @@ void sve_hybrid_fp32_mla_6x4VL (
"fmax z21.s, p5/M, z21.s, z1.s\n"
"fmax z22.s, p5/M, z22.s, z1.s\n"
"fmax z23.s, p5/M, z23.s, z1.s\n"
- "55:" // Height 4: No activation
- "st1w { z8.s }, p4, [x13]\n"
- "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
- "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
- "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
- "addvl x13, x13, #4\n"
- "st1w { z12.s }, p4, [x9]\n"
- "st1w { z13.s }, p3, [x9, #1, MUL VL]\n"
- "st1w { z14.s }, p2, [x9, #2, MUL VL]\n"
- "st1w { z15.s }, p1, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
- "st1w { z16.s }, p4, [x27]\n"
- "st1w { z17.s }, p3, [x27, #1, MUL VL]\n"
- "st1w { z18.s }, p2, [x27, #2, MUL VL]\n"
- "st1w { z19.s }, p1, [x27, #3, MUL VL]\n"
- "addvl x27, x27, #4\n"
- "st1w { z20.s }, p4, [x25]\n"
- "st1w { z21.s }, p3, [x25, #1, MUL VL]\n"
- "st1w { z22.s }, p2, [x25, #2, MUL VL]\n"
- "st1w { z23.s }, p1, [x25, #3, MUL VL]\n"
- "addvl x25, x25, #4\n"
- "56:" // Height 4: Writeback done
- "decw x16, ALL, MUL #4\n"
- "cmp x16, XZR\n"
- "bgt 45b\n"
- "b 86f\n"
- "57:" // Height 5
- "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x14, %x[bias]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "tbz %x[flags], #2, 58f\n"
- "ldr x13, [%x[output_ptr], #0x0]\n"
- "add x13, x13, x19, LSL #2\n"
- "ldr x9, [%x[output_ptr], #0x8]\n"
- "ldr x27, [%x[output_ptr], #0x10]\n"
- "add x9, x9, x19, LSL #2\n"
- "ldr x25, [%x[output_ptr], #0x18]\n"
- "ldr x23, [%x[output_ptr], #0x20]\n"
- "add x27, x27, x19, LSL #2\n"
- "add x25, x25, x19, LSL #2\n"
- "add x23, x23, x19, LSL #2\n"
- "b 59f\n"
- "58:" // Height 5: setup direct output
- "mov x13, %x[output_ptr]\n"
- "add x9, x13, x19, LSL #2\n"
- "add x27, x9, x19, LSL #2\n"
- "add x25, x27, x19, LSL #2\n"
- "add x23, x25, x19, LSL #2\n"
- "59:" // Height 5: Column loop
+ "51:" // Height 4: No activation
+ "st1w { z8.s }, p4, [x28]\n"
+ "st1w { z9.s }, p3, [x28, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x28, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "st1w { z12.s }, p4, [x24]\n"
+ "st1w { z13.s }, p3, [x24, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x24, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x24, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x23]\n"
+ "st1w { z17.s }, p3, [x23, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x23, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x23, #3, MUL VL]\n"
+ "st1w { z20.s }, p4, [x22]\n"
+ "st1w { z21.s }, p3, [x22, #1, MUL VL]\n"
+ "st1w { z22.s }, p2, [x22, #2, MUL VL]\n"
+ "st1w { z23.s }, p1, [x22, #3, MUL VL]\n"
+ "52:" // Height 4: Writeback done
+ "decw x11, ALL, MUL #4\n"
+ "cmp x11, XZR\n"
+ "bgt 41b\n"
+ "b 80f\n"
+ "53:" // Height 5
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x9, %x[bias]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "54:" // Height 5: Column loop
"mov x19, #0x0\n"
- "whilelt p4.s, x19, x16\n"
+ "whilelt p4.s, x19, x11\n"
"incw x19\n"
- "whilelt p3.s, x19, x16\n"
+ "whilelt p3.s, x19, x11\n"
"incw x19\n"
- "whilelt p2.s, x19, x16\n"
+ "whilelt p2.s, x19, x11\n"
"incw x19\n"
- "whilelt p1.s, x19, x16\n"
- "cbz x14, 60f\n"
- "ld1w { z8.s }, p5/Z, [x14]\n"
+ "whilelt p1.s, x19, x11\n"
+ "cbz x9, 55f\n"
+ "ld1w { z8.s }, p5/Z, [x9]\n"
"mov z12.d, z8.d\n"
- "ld1w { z9.s }, p5/Z, [x14, #1, MUL VL]\n"
+ "ld1w { z9.s }, p5/Z, [x9, #1, MUL VL]\n"
"mov z16.d, z8.d\n"
- "ld1w { z10.s }, p5/Z, [x14, #2, MUL VL]\n"
+ "ld1w { z10.s }, p5/Z, [x9, #2, MUL VL]\n"
"mov z20.d, z8.d\n"
- "ld1w { z11.s }, p5/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
+ "ld1w { z11.s }, p5/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
"mov z13.d, z9.d\n"
"mov z17.d, z9.d\n"
"mov z14.d, z10.d\n"
@@ -1268,31 +1221,36 @@ void sve_hybrid_fp32_mla_6x4VL (
"mov z25.d, z9.d\n"
"mov z26.d, z10.d\n"
"mov z27.d, z11.d\n"
- "b 62f\n"
- "60:" // Height 5: no bias
- "tbz %x[flags], #0, 61f\n"
- "ld1w { z8.s }, p4/Z, [x13]\n"
- "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
- "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
- "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
- "ld1w { z12.s }, p4/Z, [x9]\n"
- "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n"
- "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n"
- "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n"
- "ld1w { z16.s }, p4/Z, [x27]\n"
- "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n"
- "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n"
- "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n"
- "ld1w { z20.s }, p4/Z, [x25]\n"
- "ld1w { z21.s }, p3/Z, [x25, #1, MUL VL]\n"
- "ld1w { z22.s }, p2/Z, [x25, #2, MUL VL]\n"
- "ld1w { z23.s }, p1/Z, [x25, #3, MUL VL]\n"
- "ld1w { z24.s }, p4/Z, [x23]\n"
- "ld1w { z25.s }, p3/Z, [x23, #1, MUL VL]\n"
- "ld1w { z26.s }, p2/Z, [x23, #2, MUL VL]\n"
- "ld1w { z27.s }, p1/Z, [x23, #3, MUL VL]\n"
- "b 62f\n"
- "61:" // Height 5: no accumulate
+ "b 57f\n"
+ "55:" // Height 5: no bias
+ "tbz %x[flags], #0, 56f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ld1w { z8.s }, p4/Z, [x28]\n"
+ "add x24, x28, x19, LSL #2\n"
+ "ld1w { z9.s }, p3/Z, [x28, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x28, #2, MUL VL]\n"
+ "add x23, x24, x19, LSL #2\n"
+ "ld1w { z11.s }, p1/Z, [x28, #3, MUL VL]\n"
+ "add x22, x23, x19, LSL #2\n"
+ "ld1w { z12.s }, p4/Z, [x24]\n"
+ "add x21, x22, x19, LSL #2\n"
+ "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x23]\n"
+ "ld1w { z17.s }, p3/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x22]\n"
+ "ld1w { z21.s }, p3/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z23.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z24.s }, p4/Z, [x21]\n"
+ "ld1w { z25.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z26.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z27.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "b 57f\n"
+ "56:" // Height 5: no accumulate
"mov z8.b, #0x0\n"
"mov z9.b, #0x0\n"
"mov z10.b, #0x0\n"
@@ -1313,143 +1271,143 @@ void sve_hybrid_fp32_mla_6x4VL (
"mov z25.b, #0x0\n"
"mov z26.b, #0x0\n"
"mov z27.b, #0x0\n"
- "62:" // Height 5: setup done
- "mov x12, #0x0\n"
- "63:" // Height 5: String loop
+ "57:" // Height 5: setup done
+ "mov x27, #0x0\n"
+ "58:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w11, [x20, x12, LSL #0x2]\n"
- "tbz %x[flags], #3, 64f\n"
- "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 59f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x10, [x20, #0x0]\n"
- "ldr x28, [x20, #0x8]\n"
- "ldr x26, [x20, #0x10]\n"
- "ldr x24, [x20, #0x18]\n"
- "ldr x22, [x20, #0x20]\n"
- "cbnz x12, 65f\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "ldr x22, [x20, #0x18]\n"
+ "ldr x21, [x20, #0x20]\n"
+ "cbnz x27, 60f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x10, x10, x19, LSL #2\n"
- "add x28, x28, x19, LSL #2\n"
- "add x26, x26, x19, LSL #2\n"
+ "add x25, x25, x19, LSL #2\n"
"add x24, x24, x19, LSL #2\n"
+ "add x23, x23, x19, LSL #2\n"
"add x22, x22, x19, LSL #2\n"
- "b 65f\n"
- "64:" // Height 5: setup direct input
- "mov x10, %x[input_ptr]\n"
- "add x28, x10, x19, LSL #2\n"
- "add x26, x28, x19, LSL #2\n"
- "add x24, x26, x19, LSL #2\n"
- "add x22, x24, x19, LSL #2\n"
- "65:" // Height 5: input setup done
- "cmp x11, #0x4\n"
- "ble 67f\n"
- "66:" // Height 5: Multiply loop: Main loop head
- "ld1w { z6.s }, p5/Z, [x15]\n"
- "whilelt p0.s, XZR, x11\n"
- "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
- "sub x11, x11, #0x4\n"
- "ld1rqw { z0.s }, p0/Z, [x10]\n"
+ "add x21, x21, x19, LSL #2\n"
+ "b 60f\n"
+ "59:" // Height 5: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "add x21, x22, x19, LSL #2\n"
+ "60:" // Height 5: input setup done
+ "cmp x26, #0x4\n"
+ "ble 62f\n"
+ "61:" // Height 5: Multiply loop: Main loop head
+ "ld1w { z6.s }, p5/Z, [x10]\n"
+ "whilelt p0.s, XZR, x26\n"
+ "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "sub x26, x26, #0x4\n"
+ "ld1rqw { z0.s }, p0/Z, [x25]\n"
"fmla z8.s, z6.s, z0.s[0]\n"
- "ld1rqw { z1.s }, p0/Z, [x28]\n"
- "add x10, x10, #0x10\n"
+ "ld1rqw { z1.s }, p0/Z, [x24]\n"
+ "add x25, x25, #0x10\n"
"fmla z9.s, z7.s, z0.s[0]\n"
- "ld1rqw { z2.s }, p0/Z, [x26]\n"
- "add x28, x28, #0x10\n"
+ "ld1rqw { z2.s }, p0/Z, [x23]\n"
+ "add x24, x24, #0x10\n"
"fmla z12.s, z6.s, z1.s[0]\n"
- "ld1rqw { z3.s }, p0/Z, [x24]\n"
- "add x26, x26, #0x10\n"
+ "ld1rqw { z3.s }, p0/Z, [x22]\n"
+ "add x23, x23, #0x10\n"
"fmla z16.s, z6.s, z2.s[0]\n"
- "ld1rqw { z4.s }, p0/Z, [x22]\n"
- "add x24, x24, #0x10\n"
- "fmla z13.s, z7.s, z1.s[0]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "ld1rqw { z4.s }, p0/Z, [x21]\n"
"add x22, x22, #0x10\n"
+ "fmla z13.s, z7.s, z1.s[0]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x21, x21, #0x10\n"
"fmla z20.s, z6.s, z3.s[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "cmp x11, #0x4\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "cmp x26, #0x4\n"
"fmla z24.s, z6.s, z4.s[0]\n"
- "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z17.s, z7.s, z2.s[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
"fmla z21.s, z7.s, z3.s[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
"fmla z25.s, z7.s, z4.s[0]\n"
- "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+ "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
"fmla z10.s, z6.s, z0.s[0]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
"fmla z14.s, z6.s, z1.s[0]\n"
"fmla z18.s, z6.s, z2.s[0]\n"
"fmla z22.s, z6.s, z3.s[0]\n"
"fmla z26.s, z6.s, z4.s[0]\n"
- "ld1w { z6.s }, p5/Z, [x15, #4, MUL VL]\n"
+ "ld1w { z6.s }, p5/Z, [x10, #4, MUL VL]\n"
"fmla z11.s, z7.s, z0.s[0]\n"
"fmla z15.s, z7.s, z1.s[0]\n"
"fmla z19.s, z7.s, z2.s[0]\n"
"fmla z23.s, z7.s, z3.s[0]\n"
"fmla z27.s, z7.s, z4.s[0]\n"
- "ld1w { z7.s }, p5/Z, [x15, #5, MUL VL]\n"
+ "ld1w { z7.s }, p5/Z, [x10, #5, MUL VL]\n"
"fmla z8.s, z6.s, z0.s[1]\n"
"fmla z12.s, z6.s, z1.s[1]\n"
"fmla z16.s, z6.s, z2.s[1]\n"
"fmla z20.s, z6.s, z3.s[1]\n"
"fmla z24.s, z6.s, z4.s[1]\n"
- "ld1w { z6.s }, p5/Z, [x15, #6, MUL VL]\n"
+ "ld1w { z6.s }, p5/Z, [x10, #6, MUL VL]\n"
"fmla z9.s, z7.s, z0.s[1]\n"
"fmla z13.s, z7.s, z1.s[1]\n"
"fmla z17.s, z7.s, z2.s[1]\n"
"fmla z21.s, z7.s, z3.s[1]\n"
"fmla z25.s, z7.s, z4.s[1]\n"
- "ld1w { z7.s }, p5/Z, [x15, #7, MUL VL]\n"
- "addvl x15, x15, #16\n"
+ "ld1w { z7.s }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #16\n"
"fmla z10.s, z6.s, z0.s[1]\n"
"fmla z14.s, z6.s, z1.s[1]\n"
"fmla z18.s, z6.s, z2.s[1]\n"
"fmla z22.s, z6.s, z3.s[1]\n"
"fmla z26.s, z6.s, z4.s[1]\n"
- "ld1w { z6.s }, p5/Z, [x15, #-8, MUL VL]\n"
+ "ld1w { z6.s }, p5/Z, [x10, #-8, MUL VL]\n"
"fmla z11.s, z7.s, z0.s[1]\n"
"fmla z15.s, z7.s, z1.s[1]\n"
"fmla z19.s, z7.s, z2.s[1]\n"
"fmla z23.s, z7.s, z3.s[1]\n"
"fmla z27.s, z7.s, z4.s[1]\n"
- "ld1w { z7.s }, p5/Z, [x15, #-7, MUL VL]\n"
+ "ld1w { z7.s }, p5/Z, [x10, #-7, MUL VL]\n"
"fmla z8.s, z6.s, z0.s[2]\n"
"fmla z12.s, z6.s, z1.s[2]\n"
"fmla z16.s, z6.s, z2.s[2]\n"
"fmla z20.s, z6.s, z3.s[2]\n"
"fmla z24.s, z6.s, z4.s[2]\n"
- "ld1w { z6.s }, p5/Z, [x15, #-6, MUL VL]\n"
+ "ld1w { z6.s }, p5/Z, [x10, #-6, MUL VL]\n"
"fmla z9.s, z7.s, z0.s[2]\n"
"fmla z13.s, z7.s, z1.s[2]\n"
"fmla z17.s, z7.s, z2.s[2]\n"
"fmla z21.s, z7.s, z3.s[2]\n"
"fmla z25.s, z7.s, z4.s[2]\n"
- "ld1w { z7.s }, p5/Z, [x15, #-5, MUL VL]\n"
+ "ld1w { z7.s }, p5/Z, [x10, #-5, MUL VL]\n"
"fmla z10.s, z6.s, z0.s[2]\n"
"fmla z14.s, z6.s, z1.s[2]\n"
"fmla z18.s, z6.s, z2.s[2]\n"
"fmla z22.s, z6.s, z3.s[2]\n"
"fmla z26.s, z6.s, z4.s[2]\n"
- "ld1w { z6.s }, p5/Z, [x15, #-4, MUL VL]\n"
+ "ld1w { z6.s }, p5/Z, [x10, #-4, MUL VL]\n"
"fmla z11.s, z7.s, z0.s[2]\n"
"fmla z15.s, z7.s, z1.s[2]\n"
"fmla z19.s, z7.s, z2.s[2]\n"
"fmla z23.s, z7.s, z3.s[2]\n"
"fmla z27.s, z7.s, z4.s[2]\n"
- "ld1w { z7.s }, p5/Z, [x15, #-3, MUL VL]\n"
+ "ld1w { z7.s }, p5/Z, [x10, #-3, MUL VL]\n"
"fmla z8.s, z6.s, z0.s[3]\n"
"fmla z12.s, z6.s, z1.s[3]\n"
"fmla z16.s, z6.s, z2.s[3]\n"
"fmla z20.s, z6.s, z3.s[3]\n"
"fmla z24.s, z6.s, z4.s[3]\n"
- "ld1w { z6.s }, p5/Z, [x15, #-2, MUL VL]\n"
+ "ld1w { z6.s }, p5/Z, [x10, #-2, MUL VL]\n"
"fmla z9.s, z7.s, z0.s[3]\n"
"fmla z13.s, z7.s, z1.s[3]\n"
"fmla z17.s, z7.s, z2.s[3]\n"
"fmla z21.s, z7.s, z3.s[3]\n"
"fmla z25.s, z7.s, z4.s[3]\n"
- "ld1w { z7.s }, p5/Z, [x15, #-1, MUL VL]\n"
+ "ld1w { z7.s }, p5/Z, [x10, #-1, MUL VL]\n"
"fmla z10.s, z6.s, z0.s[3]\n"
"fmla z14.s, z6.s, z1.s[3]\n"
"fmla z18.s, z6.s, z2.s[3]\n"
@@ -1460,35 +1418,35 @@ void sve_hybrid_fp32_mla_6x4VL (
"fmla z19.s, z7.s, z2.s[3]\n"
"fmla z23.s, z7.s, z3.s[3]\n"
"fmla z27.s, z7.s, z4.s[3]\n"
- "bgt 66b\n"
- "67:" // Height 5: Multiply loop: Single iteration only
- "ld1w { z6.s }, p5/Z, [x15]\n"
- "whilelt p0.s, XZR, x11\n"
- "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
- "subs x11, x11, #0x1\n"
- "ld1rqw { z0.s }, p0/Z, [x10]\n"
+ "bgt 61b\n"
+ "62:" // Height 5: Multiply loop: Single iteration only
+ "ld1w { z6.s }, p5/Z, [x10]\n"
+ "whilelt p0.s, XZR, x26\n"
+ "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "subs x26, x26, #0x1\n"
+ "ld1rqw { z0.s }, p0/Z, [x25]\n"
"fmla z8.s, z6.s, z0.s[0]\n"
- "ld1rqw { z1.s }, p0/Z, [x28]\n"
- "add x10, x10, #0x10\n"
+ "ld1rqw { z1.s }, p0/Z, [x24]\n"
+ "add x25, x25, #0x10\n"
"fmla z9.s, z7.s, z0.s[0]\n"
- "ld1rqw { z2.s }, p0/Z, [x26]\n"
- "add x28, x28, #0x10\n"
+ "ld1rqw { z2.s }, p0/Z, [x23]\n"
+ "add x24, x24, #0x10\n"
"fmla z12.s, z6.s, z1.s[0]\n"
- "ld1rqw { z3.s }, p0/Z, [x24]\n"
- "add x26, x26, #0x10\n"
+ "ld1rqw { z3.s }, p0/Z, [x22]\n"
+ "add x23, x23, #0x10\n"
"fmla z16.s, z6.s, z2.s[0]\n"
- "ld1rqw { z4.s }, p0/Z, [x22]\n"
- "add x24, x24, #0x10\n"
- "fmla z13.s, z7.s, z1.s[0]\n"
+ "ld1rqw { z4.s }, p0/Z, [x21]\n"
"add x22, x22, #0x10\n"
+ "fmla z13.s, z7.s, z1.s[0]\n"
+ "add x21, x21, #0x10\n"
"fmla z17.s, z7.s, z2.s[0]\n"
"fmla z20.s, z6.s, z3.s[0]\n"
"fmla z24.s, z6.s, z4.s[0]\n"
- "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z21.s, z7.s, z3.s[0]\n"
"fmla z25.s, z7.s, z4.s[0]\n"
- "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
+ "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
"fmla z10.s, z6.s, z0.s[0]\n"
"fmla z14.s, z6.s, z1.s[0]\n"
"fmla z18.s, z6.s, z2.s[0]\n"
@@ -1499,23 +1457,23 @@ void sve_hybrid_fp32_mla_6x4VL (
"fmla z19.s, z7.s, z2.s[0]\n"
"fmla z23.s, z7.s, z3.s[0]\n"
"fmla z27.s, z7.s, z4.s[0]\n"
- "ble 68f\n"
- "ld1w { z6.s }, p5/Z, [x15]\n"
+ "ble 63f\n"
+ "ld1w { z6.s }, p5/Z, [x10]\n"
"fmla z8.s, z6.s, z0.s[1]\n"
- "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
- "subs x11, x11, #0x1\n"
+ "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "subs x26, x26, #0x1\n"
"fmla z12.s, z6.s, z1.s[1]\n"
"fmla z16.s, z6.s, z2.s[1]\n"
"fmla z20.s, z6.s, z3.s[1]\n"
"fmla z24.s, z6.s, z4.s[1]\n"
- "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.s, z7.s, z0.s[1]\n"
"fmla z13.s, z7.s, z1.s[1]\n"
"fmla z17.s, z7.s, z2.s[1]\n"
"fmla z21.s, z7.s, z3.s[1]\n"
"fmla z25.s, z7.s, z4.s[1]\n"
- "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
+ "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
"fmla z10.s, z6.s, z0.s[1]\n"
"fmla z14.s, z6.s, z1.s[1]\n"
"fmla z18.s, z6.s, z2.s[1]\n"
@@ -1526,23 +1484,23 @@ void sve_hybrid_fp32_mla_6x4VL (
"fmla z19.s, z7.s, z2.s[1]\n"
"fmla z23.s, z7.s, z3.s[1]\n"
"fmla z27.s, z7.s, z4.s[1]\n"
- "ble 68f\n"
- "ld1w { z6.s }, p5/Z, [x15]\n"
+ "ble 63f\n"
+ "ld1w { z6.s }, p5/Z, [x10]\n"
"fmla z8.s, z6.s, z0.s[2]\n"
- "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
- "subs x11, x11, #0x1\n"
+ "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "subs x26, x26, #0x1\n"
"fmla z12.s, z6.s, z1.s[2]\n"
"fmla z16.s, z6.s, z2.s[2]\n"
"fmla z20.s, z6.s, z3.s[2]\n"
"fmla z24.s, z6.s, z4.s[2]\n"
- "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.s, z7.s, z0.s[2]\n"
"fmla z13.s, z7.s, z1.s[2]\n"
"fmla z17.s, z7.s, z2.s[2]\n"
"fmla z21.s, z7.s, z3.s[2]\n"
"fmla z25.s, z7.s, z4.s[2]\n"
- "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
+ "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
"fmla z10.s, z6.s, z0.s[2]\n"
"fmla z14.s, z6.s, z1.s[2]\n"
"fmla z18.s, z6.s, z2.s[2]\n"
@@ -1553,22 +1511,22 @@ void sve_hybrid_fp32_mla_6x4VL (
"fmla z19.s, z7.s, z2.s[2]\n"
"fmla z23.s, z7.s, z3.s[2]\n"
"fmla z27.s, z7.s, z4.s[2]\n"
- "ble 68f\n"
- "ld1w { z6.s }, p5/Z, [x15]\n"
+ "ble 63f\n"
+ "ld1w { z6.s }, p5/Z, [x10]\n"
"fmla z8.s, z6.s, z0.s[3]\n"
- "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+ "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
"fmla z12.s, z6.s, z1.s[3]\n"
"fmla z16.s, z6.s, z2.s[3]\n"
"fmla z20.s, z6.s, z3.s[3]\n"
"fmla z24.s, z6.s, z4.s[3]\n"
- "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.s, z7.s, z0.s[3]\n"
"fmla z13.s, z7.s, z1.s[3]\n"
"fmla z17.s, z7.s, z2.s[3]\n"
"fmla z21.s, z7.s, z3.s[3]\n"
"fmla z25.s, z7.s, z4.s[3]\n"
- "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
+ "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
"fmla z10.s, z6.s, z0.s[3]\n"
"fmla z14.s, z6.s, z1.s[3]\n"
"fmla z18.s, z6.s, z2.s[3]\n"
@@ -1579,17 +1537,22 @@ void sve_hybrid_fp32_mla_6x4VL (
"fmla z19.s, z7.s, z2.s[3]\n"
"fmla z23.s, z7.s, z3.s[3]\n"
"fmla z27.s, z7.s, z4.s[3]\n"
- "68:" // Height 5: Multiply loop: multiply skip
- "prfm pldl1keep, [x10, #0x80]\n"
- "add x12, x12, #0x1\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "63:" // Height 5: Multiply loop: multiply skip
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x27, x27, #0x1\n"
"prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
"prfm pldl1keep, [x22, #0x80]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x12, x19\n"
- "bne 63b\n"
- "tbz %x[flags], #1, 69f\n"
+ "cmp x27, x19\n"
+ "bne 58b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x28, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "add x21, x22, x19, LSL #2\n"
+ "tbz %x[flags], #1, 64f\n"
"add x19, %x[args_ptr], %[offset_min]\n"
"ld1rw { z1.s }, p5/Z, [x19]\n"
"add x19, %x[args_ptr], %[offset_max]\n"
@@ -1634,83 +1597,59 @@ void sve_hybrid_fp32_mla_6x4VL (
"fmax z25.s, p5/M, z25.s, z1.s\n"
"fmax z26.s, p5/M, z26.s, z1.s\n"
"fmax z27.s, p5/M, z27.s, z1.s\n"
- "69:" // Height 5: No activation
- "st1w { z8.s }, p4, [x13]\n"
- "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
- "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
- "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
- "addvl x13, x13, #4\n"
- "st1w { z12.s }, p4, [x9]\n"
- "st1w { z13.s }, p3, [x9, #1, MUL VL]\n"
- "st1w { z14.s }, p2, [x9, #2, MUL VL]\n"
- "st1w { z15.s }, p1, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
- "st1w { z16.s }, p4, [x27]\n"
- "st1w { z17.s }, p3, [x27, #1, MUL VL]\n"
- "st1w { z18.s }, p2, [x27, #2, MUL VL]\n"
- "st1w { z19.s }, p1, [x27, #3, MUL VL]\n"
- "addvl x27, x27, #4\n"
- "st1w { z20.s }, p4, [x25]\n"
- "st1w { z21.s }, p3, [x25, #1, MUL VL]\n"
- "st1w { z22.s }, p2, [x25, #2, MUL VL]\n"
- "st1w { z23.s }, p1, [x25, #3, MUL VL]\n"
- "addvl x25, x25, #4\n"
- "st1w { z24.s }, p4, [x23]\n"
- "st1w { z25.s }, p3, [x23, #1, MUL VL]\n"
- "st1w { z26.s }, p2, [x23, #2, MUL VL]\n"
- "st1w { z27.s }, p1, [x23, #3, MUL VL]\n"
- "addvl x23, x23, #4\n"
- "70:" // Height 5: Writeback done
- "decw x16, ALL, MUL #4\n"
- "cmp x16, XZR\n"
- "bgt 59b\n"
- "b 86f\n"
- "71:" // Height 6
- "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x14, %x[bias]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "tbz %x[flags], #2, 72f\n"
- "ldr x13, [%x[output_ptr], #0x0]\n"
- "add x13, x13, x19, LSL #2\n"
- "ldr x9, [%x[output_ptr], #0x8]\n"
- "ldr x27, [%x[output_ptr], #0x10]\n"
- "add x9, x9, x19, LSL #2\n"
- "ldr x25, [%x[output_ptr], #0x18]\n"
- "ldr x23, [%x[output_ptr], #0x20]\n"
- "add x27, x27, x19, LSL #2\n"
- "ldr x21, [%x[output_ptr], #0x28]\n"
- "add %x[output_ptr], %x[output_ptr], #0x30\n"
- "add x25, x25, x19, LSL #2\n"
- "add x23, x23, x19, LSL #2\n"
- "add x21, x21, x19, LSL #2\n"
- "b 73f\n"
- "72:" // Height 6: setup direct output
- "mov x13, %x[output_ptr]\n"
- "add x9, x13, x19, LSL #2\n"
- "add x27, x9, x19, LSL #2\n"
- "add x25, x27, x19, LSL #2\n"
- "add x23, x25, x19, LSL #2\n"
- "add x21, x23, x19, LSL #2\n"
- "add %x[output_ptr], x21, x19, LSL #2\n"
- "73:" // Height 6: Column loop
+ "64:" // Height 5: No activation
+ "st1w { z8.s }, p4, [x28]\n"
+ "st1w { z9.s }, p3, [x28, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x28, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "st1w { z12.s }, p4, [x24]\n"
+ "st1w { z13.s }, p3, [x24, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x24, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x24, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x23]\n"
+ "st1w { z17.s }, p3, [x23, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x23, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x23, #3, MUL VL]\n"
+ "st1w { z20.s }, p4, [x22]\n"
+ "st1w { z21.s }, p3, [x22, #1, MUL VL]\n"
+ "st1w { z22.s }, p2, [x22, #2, MUL VL]\n"
+ "st1w { z23.s }, p1, [x22, #3, MUL VL]\n"
+ "st1w { z24.s }, p4, [x21]\n"
+ "st1w { z25.s }, p3, [x21, #1, MUL VL]\n"
+ "st1w { z26.s }, p2, [x21, #2, MUL VL]\n"
+ "st1w { z27.s }, p1, [x21, #3, MUL VL]\n"
+ "65:" // Height 5: Writeback done
+ "decw x11, ALL, MUL #4\n"
+ "cmp x11, XZR\n"
+ "bgt 54b\n"
+ "b 80f\n"
+ "66:" // Height 6
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x9, %x[bias]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov x19, #0x18\n"
+ "madd %x[output_ptr], x20, x19, %x[output_ptr]\n"
+ "67:" // Height 6: Column loop
"mov x19, #0x0\n"
- "whilelt p4.s, x19, x16\n"
+ "whilelt p4.s, x19, x11\n"
"incw x19\n"
- "whilelt p3.s, x19, x16\n"
+ "whilelt p3.s, x19, x11\n"
"incw x19\n"
- "whilelt p2.s, x19, x16\n"
+ "whilelt p2.s, x19, x11\n"
"incw x19\n"
- "whilelt p1.s, x19, x16\n"
- "cbz x14, 74f\n"
- "ld1w { z8.s }, p5/Z, [x14]\n"
+ "whilelt p1.s, x19, x11\n"
+ "cbz x9, 68f\n"
+ "ld1w { z8.s }, p5/Z, [x9]\n"
"mov z12.d, z8.d\n"
- "ld1w { z9.s }, p5/Z, [x14, #1, MUL VL]\n"
+ "ld1w { z9.s }, p5/Z, [x9, #1, MUL VL]\n"
"mov z16.d, z8.d\n"
- "ld1w { z10.s }, p5/Z, [x14, #2, MUL VL]\n"
+ "ld1w { z10.s }, p5/Z, [x9, #2, MUL VL]\n"
"mov z20.d, z8.d\n"
- "ld1w { z11.s }, p5/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
+ "ld1w { z11.s }, p5/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
"mov z13.d, z9.d\n"
"mov z17.d, z9.d\n"
"mov z14.d, z10.d\n"
@@ -1728,35 +1667,41 @@ void sve_hybrid_fp32_mla_6x4VL (
"mov z29.d, z9.d\n"
"mov z30.d, z10.d\n"
"mov z31.d, z11.d\n"
- "b 76f\n"
- "74:" // Height 6: no bias
- "tbz %x[flags], #0, 75f\n"
- "ld1w { z8.s }, p4/Z, [x13]\n"
- "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
- "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
- "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
- "ld1w { z12.s }, p4/Z, [x9]\n"
- "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n"
- "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n"
- "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n"
- "ld1w { z16.s }, p4/Z, [x27]\n"
- "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n"
- "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n"
- "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n"
- "ld1w { z20.s }, p4/Z, [x25]\n"
- "ld1w { z21.s }, p3/Z, [x25, #1, MUL VL]\n"
- "ld1w { z22.s }, p2/Z, [x25, #2, MUL VL]\n"
- "ld1w { z23.s }, p1/Z, [x25, #3, MUL VL]\n"
- "ld1w { z24.s }, p4/Z, [x23]\n"
- "ld1w { z25.s }, p3/Z, [x23, #1, MUL VL]\n"
- "ld1w { z26.s }, p2/Z, [x23, #2, MUL VL]\n"
- "ld1w { z27.s }, p1/Z, [x23, #3, MUL VL]\n"
- "ld1w { z28.s }, p4/Z, [x21]\n"
- "ld1w { z29.s }, p3/Z, [x21, #1, MUL VL]\n"
- "ld1w { z30.s }, p2/Z, [x21, #2, MUL VL]\n"
- "ld1w { z31.s }, p1/Z, [x21, #3, MUL VL]\n"
- "b 76f\n"
- "75:" // Height 6: no accumulate
+ "b 70f\n"
+ "68:" // Height 6: no bias
+ "tbz %x[flags], #0, 69f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ld1w { z8.s }, p4/Z, [x28]\n"
+ "add x24, x28, x19, LSL #2\n"
+ "ld1w { z9.s }, p3/Z, [x28, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x28, #2, MUL VL]\n"
+ "add x23, x24, x19, LSL #2\n"
+ "ld1w { z11.s }, p1/Z, [x28, #3, MUL VL]\n"
+ "add x22, x23, x19, LSL #2\n"
+ "ld1w { z12.s }, p4/Z, [x24]\n"
+ "add x21, x22, x19, LSL #2\n"
+ "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
+ "add x20, x21, x19, LSL #2\n"
+ "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x23]\n"
+ "ld1w { z17.s }, p3/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x22]\n"
+ "ld1w { z21.s }, p3/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z23.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z24.s }, p4/Z, [x21]\n"
+ "ld1w { z25.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z26.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z27.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z28.s }, p4/Z, [x20]\n"
+ "ld1w { z29.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z30.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z31.s }, p1/Z, [x20, #3, MUL VL]\n"
+ "b 70f\n"
+ "69:" // Height 6: no accumulate
"mov z8.b, #0x0\n"
"mov z9.b, #0x0\n"
"mov z10.b, #0x0\n"
@@ -1781,77 +1726,77 @@ void sve_hybrid_fp32_mla_6x4VL (
"mov z29.b, #0x0\n"
"mov z30.b, #0x0\n"
"mov z31.b, #0x0\n"
- "76:" // Height 6: setup done
- "mov x12, #0x0\n"
- "77:" // Height 6: String loop
+ "70:" // Height 6: setup done
+ "mov x27, #0x0\n"
+ "71:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w11, [x20, x12, LSL #0x2]\n"
- "tbz %x[flags], #3, 78f\n"
- "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 72f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x10, [x20, #0x0]\n"
- "ldr x28, [x20, #0x8]\n"
- "ldr x26, [x20, #0x10]\n"
- "ldr x24, [x20, #0x18]\n"
- "ldr x22, [x20, #0x20]\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "ldr x22, [x20, #0x18]\n"
+ "ldr x21, [x20, #0x20]\n"
"ldr x20, [x20, #0x28]\n"
- "cbnz x12, 79f\n"
+ "cbnz x27, 73f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x10, x10, x19, LSL #2\n"
- "add x28, x28, x19, LSL #2\n"
- "add x26, x26, x19, LSL #2\n"
+ "add x25, x25, x19, LSL #2\n"
"add x24, x24, x19, LSL #2\n"
+ "add x23, x23, x19, LSL #2\n"
"add x22, x22, x19, LSL #2\n"
+ "add x21, x21, x19, LSL #2\n"
"add x20, x20, x19, LSL #2\n"
- "b 79f\n"
- "78:" // Height 6: setup direct input
- "mov x10, %x[input_ptr]\n"
- "add x28, x10, x19, LSL #2\n"
- "add x26, x28, x19, LSL #2\n"
- "add x24, x26, x19, LSL #2\n"
- "add x22, x24, x19, LSL #2\n"
- "add x20, x22, x19, LSL #2\n"
- "79:" // Height 6: input setup done
- "cmp x11, #0x4\n"
- "ble 81f\n"
- "80:" // Height 6: Multiply loop: Main loop head
- "ld1w { z6.s }, p5/Z, [x15]\n"
- "whilelt p0.s, XZR, x11\n"
- "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
- "sub x11, x11, #0x4\n"
- "ld1rqw { z0.s }, p0/Z, [x10]\n"
+ "b 73f\n"
+ "72:" // Height 6: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "add x21, x22, x19, LSL #2\n"
+ "add x20, x21, x19, LSL #2\n"
+ "73:" // Height 6: input setup done
+ "cmp x26, #0x4\n"
+ "ble 75f\n"
+ "74:" // Height 6: Multiply loop: Main loop head
+ "ld1w { z6.s }, p5/Z, [x10]\n"
+ "whilelt p0.s, XZR, x26\n"
+ "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "sub x26, x26, #0x4\n"
+ "ld1rqw { z0.s }, p0/Z, [x25]\n"
"fmla z8.s, z6.s, z0.s[0]\n"
- "ld1rqw { z1.s }, p0/Z, [x28]\n"
- "add x10, x10, #0x10\n"
+ "ld1rqw { z1.s }, p0/Z, [x24]\n"
+ "add x25, x25, #0x10\n"
"fmla z9.s, z7.s, z0.s[0]\n"
- "ld1rqw { z2.s }, p0/Z, [x26]\n"
- "add x28, x28, #0x10\n"
+ "ld1rqw { z2.s }, p0/Z, [x23]\n"
+ "add x24, x24, #0x10\n"
"fmla z12.s, z6.s, z1.s[0]\n"
- "ld1rqw { z3.s }, p0/Z, [x24]\n"
- "add x26, x26, #0x10\n"
+ "ld1rqw { z3.s }, p0/Z, [x22]\n"
+ "add x23, x23, #0x10\n"
"fmla z16.s, z6.s, z2.s[0]\n"
- "ld1rqw { z4.s }, p0/Z, [x22]\n"
- "add x24, x24, #0x10\n"
+ "ld1rqw { z4.s }, p0/Z, [x21]\n"
+ "add x22, x22, #0x10\n"
"fmla z13.s, z7.s, z1.s[0]\n"
"ld1rqw { z5.s }, p0/Z, [x20]\n"
- "add x22, x22, #0x10\n"
+ "add x21, x21, #0x10\n"
"fmla z20.s, z6.s, z3.s[0]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
"add x20, x20, #0x10\n"
"fmla z24.s, z6.s, z4.s[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "cmp x11, #0x4\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "cmp x26, #0x4\n"
"fmla z28.s, z6.s, z5.s[0]\n"
- "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z17.s, z7.s, z2.s[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
"fmla z21.s, z7.s, z3.s[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "fmla z25.s, z7.s, z4.s[0]\n"
"prfm pldl1keep, [x22, #0x80]\n"
+ "fmla z25.s, z7.s, z4.s[0]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
"fmla z29.s, z7.s, z5.s[0]\n"
- "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+ "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
"fmla z10.s, z6.s, z0.s[0]\n"
"prfm pldl1keep, [x20, #0x80]\n"
"fmla z14.s, z6.s, z1.s[0]\n"
@@ -1859,85 +1804,85 @@ void sve_hybrid_fp32_mla_6x4VL (
"fmla z22.s, z6.s, z3.s[0]\n"
"fmla z26.s, z6.s, z4.s[0]\n"
"fmla z30.s, z6.s, z5.s[0]\n"
- "ld1w { z6.s }, p5/Z, [x15, #4, MUL VL]\n"
+ "ld1w { z6.s }, p5/Z, [x10, #4, MUL VL]\n"
"fmla z11.s, z7.s, z0.s[0]\n"
"fmla z15.s, z7.s, z1.s[0]\n"
"fmla z19.s, z7.s, z2.s[0]\n"
"fmla z23.s, z7.s, z3.s[0]\n"
"fmla z27.s, z7.s, z4.s[0]\n"
"fmla z31.s, z7.s, z5.s[0]\n"
- "ld1w { z7.s }, p5/Z, [x15, #5, MUL VL]\n"
+ "ld1w { z7.s }, p5/Z, [x10, #5, MUL VL]\n"
"fmla z8.s, z6.s, z0.s[1]\n"
"fmla z12.s, z6.s, z1.s[1]\n"
"fmla z16.s, z6.s, z2.s[1]\n"
"fmla z20.s, z6.s, z3.s[1]\n"
"fmla z24.s, z6.s, z4.s[1]\n"
"fmla z28.s, z6.s, z5.s[1]\n"
- "ld1w { z6.s }, p5/Z, [x15, #6, MUL VL]\n"
+ "ld1w { z6.s }, p5/Z, [x10, #6, MUL VL]\n"
"fmla z9.s, z7.s, z0.s[1]\n"
"fmla z13.s, z7.s, z1.s[1]\n"
"fmla z17.s, z7.s, z2.s[1]\n"
"fmla z21.s, z7.s, z3.s[1]\n"
"fmla z25.s, z7.s, z4.s[1]\n"
"fmla z29.s, z7.s, z5.s[1]\n"
- "ld1w { z7.s }, p5/Z, [x15, #7, MUL VL]\n"
- "addvl x15, x15, #16\n"
+ "ld1w { z7.s }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #16\n"
"fmla z10.s, z6.s, z0.s[1]\n"
"fmla z14.s, z6.s, z1.s[1]\n"
"fmla z18.s, z6.s, z2.s[1]\n"
"fmla z22.s, z6.s, z3.s[1]\n"
"fmla z26.s, z6.s, z4.s[1]\n"
"fmla z30.s, z6.s, z5.s[1]\n"
- "ld1w { z6.s }, p5/Z, [x15, #-8, MUL VL]\n"
+ "ld1w { z6.s }, p5/Z, [x10, #-8, MUL VL]\n"
"fmla z11.s, z7.s, z0.s[1]\n"
"fmla z15.s, z7.s, z1.s[1]\n"
"fmla z19.s, z7.s, z2.s[1]\n"
"fmla z23.s, z7.s, z3.s[1]\n"
"fmla z27.s, z7.s, z4.s[1]\n"
"fmla z31.s, z7.s, z5.s[1]\n"
- "ld1w { z7.s }, p5/Z, [x15, #-7, MUL VL]\n"
+ "ld1w { z7.s }, p5/Z, [x10, #-7, MUL VL]\n"
"fmla z8.s, z6.s, z0.s[2]\n"
"fmla z12.s, z6.s, z1.s[2]\n"
"fmla z16.s, z6.s, z2.s[2]\n"
"fmla z20.s, z6.s, z3.s[2]\n"
"fmla z24.s, z6.s, z4.s[2]\n"
"fmla z28.s, z6.s, z5.s[2]\n"
- "ld1w { z6.s }, p5/Z, [x15, #-6, MUL VL]\n"
+ "ld1w { z6.s }, p5/Z, [x10, #-6, MUL VL]\n"
"fmla z9.s, z7.s, z0.s[2]\n"
"fmla z13.s, z7.s, z1.s[2]\n"
"fmla z17.s, z7.s, z2.s[2]\n"
"fmla z21.s, z7.s, z3.s[2]\n"
"fmla z25.s, z7.s, z4.s[2]\n"
"fmla z29.s, z7.s, z5.s[2]\n"
- "ld1w { z7.s }, p5/Z, [x15, #-5, MUL VL]\n"
+ "ld1w { z7.s }, p5/Z, [x10, #-5, MUL VL]\n"
"fmla z10.s, z6.s, z0.s[2]\n"
"fmla z14.s, z6.s, z1.s[2]\n"
"fmla z18.s, z6.s, z2.s[2]\n"
"fmla z22.s, z6.s, z3.s[2]\n"
"fmla z26.s, z6.s, z4.s[2]\n"
"fmla z30.s, z6.s, z5.s[2]\n"
- "ld1w { z6.s }, p5/Z, [x15, #-4, MUL VL]\n"
+ "ld1w { z6.s }, p5/Z, [x10, #-4, MUL VL]\n"
"fmla z11.s, z7.s, z0.s[2]\n"
"fmla z15.s, z7.s, z1.s[2]\n"
"fmla z19.s, z7.s, z2.s[2]\n"
"fmla z23.s, z7.s, z3.s[2]\n"
"fmla z27.s, z7.s, z4.s[2]\n"
"fmla z31.s, z7.s, z5.s[2]\n"
- "ld1w { z7.s }, p5/Z, [x15, #-3, MUL VL]\n"
+ "ld1w { z7.s }, p5/Z, [x10, #-3, MUL VL]\n"
"fmla z8.s, z6.s, z0.s[3]\n"
"fmla z12.s, z6.s, z1.s[3]\n"
"fmla z16.s, z6.s, z2.s[3]\n"
"fmla z20.s, z6.s, z3.s[3]\n"
"fmla z24.s, z6.s, z4.s[3]\n"
"fmla z28.s, z6.s, z5.s[3]\n"
- "ld1w { z6.s }, p5/Z, [x15, #-2, MUL VL]\n"
+ "ld1w { z6.s }, p5/Z, [x10, #-2, MUL VL]\n"
"fmla z9.s, z7.s, z0.s[3]\n"
"fmla z13.s, z7.s, z1.s[3]\n"
"fmla z17.s, z7.s, z2.s[3]\n"
"fmla z21.s, z7.s, z3.s[3]\n"
"fmla z25.s, z7.s, z4.s[3]\n"
"fmla z29.s, z7.s, z5.s[3]\n"
- "ld1w { z7.s }, p5/Z, [x15, #-1, MUL VL]\n"
+ "ld1w { z7.s }, p5/Z, [x10, #-1, MUL VL]\n"
"fmla z10.s, z6.s, z0.s[3]\n"
"fmla z14.s, z6.s, z1.s[3]\n"
"fmla z18.s, z6.s, z2.s[3]\n"
@@ -1950,39 +1895,39 @@ void sve_hybrid_fp32_mla_6x4VL (
"fmla z23.s, z7.s, z3.s[3]\n"
"fmla z27.s, z7.s, z4.s[3]\n"
"fmla z31.s, z7.s, z5.s[3]\n"
- "bgt 80b\n"
- "81:" // Height 6: Multiply loop: Single iteration only
- "ld1w { z6.s }, p5/Z, [x15]\n"
- "whilelt p0.s, XZR, x11\n"
- "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
- "subs x11, x11, #0x1\n"
- "ld1rqw { z0.s }, p0/Z, [x10]\n"
+ "bgt 74b\n"
+ "75:" // Height 6: Multiply loop: Single iteration only
+ "ld1w { z6.s }, p5/Z, [x10]\n"
+ "whilelt p0.s, XZR, x26\n"
+ "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "subs x26, x26, #0x1\n"
+ "ld1rqw { z0.s }, p0/Z, [x25]\n"
"fmla z8.s, z6.s, z0.s[0]\n"
- "ld1rqw { z1.s }, p0/Z, [x28]\n"
- "add x10, x10, #0x10\n"
+ "ld1rqw { z1.s }, p0/Z, [x24]\n"
+ "add x25, x25, #0x10\n"
"fmla z9.s, z7.s, z0.s[0]\n"
- "ld1rqw { z2.s }, p0/Z, [x26]\n"
- "add x28, x28, #0x10\n"
+ "ld1rqw { z2.s }, p0/Z, [x23]\n"
+ "add x24, x24, #0x10\n"
"fmla z12.s, z6.s, z1.s[0]\n"
- "ld1rqw { z3.s }, p0/Z, [x24]\n"
- "add x26, x26, #0x10\n"
+ "ld1rqw { z3.s }, p0/Z, [x22]\n"
+ "add x23, x23, #0x10\n"
"fmla z16.s, z6.s, z2.s[0]\n"
- "ld1rqw { z4.s }, p0/Z, [x22]\n"
- "add x24, x24, #0x10\n"
+ "ld1rqw { z4.s }, p0/Z, [x21]\n"
+ "add x22, x22, #0x10\n"
"fmla z13.s, z7.s, z1.s[0]\n"
"ld1rqw { z5.s }, p0/Z, [x20]\n"
- "add x22, x22, #0x10\n"
+ "add x21, x21, #0x10\n"
"fmla z20.s, z6.s, z3.s[0]\n"
"add x20, x20, #0x10\n"
"fmla z17.s, z7.s, z2.s[0]\n"
"fmla z24.s, z6.s, z4.s[0]\n"
"fmla z28.s, z6.s, z5.s[0]\n"
- "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z21.s, z7.s, z3.s[0]\n"
"fmla z25.s, z7.s, z4.s[0]\n"
"fmla z29.s, z7.s, z5.s[0]\n"
- "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
+ "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
"fmla z10.s, z6.s, z0.s[0]\n"
"fmla z14.s, z6.s, z1.s[0]\n"
"fmla z18.s, z6.s, z2.s[0]\n"
@@ -1995,25 +1940,25 @@ void sve_hybrid_fp32_mla_6x4VL (
"fmla z23.s, z7.s, z3.s[0]\n"
"fmla z27.s, z7.s, z4.s[0]\n"
"fmla z31.s, z7.s, z5.s[0]\n"
- "ble 82f\n"
- "ld1w { z6.s }, p5/Z, [x15]\n"
+ "ble 76f\n"
+ "ld1w { z6.s }, p5/Z, [x10]\n"
"fmla z8.s, z6.s, z0.s[1]\n"
- "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
- "subs x11, x11, #0x1\n"
+ "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "subs x26, x26, #0x1\n"
"fmla z12.s, z6.s, z1.s[1]\n"
"fmla z16.s, z6.s, z2.s[1]\n"
"fmla z20.s, z6.s, z3.s[1]\n"
"fmla z24.s, z6.s, z4.s[1]\n"
"fmla z28.s, z6.s, z5.s[1]\n"
- "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.s, z7.s, z0.s[1]\n"
"fmla z13.s, z7.s, z1.s[1]\n"
"fmla z17.s, z7.s, z2.s[1]\n"
"fmla z21.s, z7.s, z3.s[1]\n"
"fmla z25.s, z7.s, z4.s[1]\n"
"fmla z29.s, z7.s, z5.s[1]\n"
- "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
+ "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
"fmla z10.s, z6.s, z0.s[1]\n"
"fmla z14.s, z6.s, z1.s[1]\n"
"fmla z18.s, z6.s, z2.s[1]\n"
@@ -2026,25 +1971,25 @@ void sve_hybrid_fp32_mla_6x4VL (
"fmla z23.s, z7.s, z3.s[1]\n"
"fmla z27.s, z7.s, z4.s[1]\n"
"fmla z31.s, z7.s, z5.s[1]\n"
- "ble 82f\n"
- "ld1w { z6.s }, p5/Z, [x15]\n"
+ "ble 76f\n"
+ "ld1w { z6.s }, p5/Z, [x10]\n"
"fmla z8.s, z6.s, z0.s[2]\n"
- "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
- "subs x11, x11, #0x1\n"
+ "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
+ "subs x26, x26, #0x1\n"
"fmla z12.s, z6.s, z1.s[2]\n"
"fmla z16.s, z6.s, z2.s[2]\n"
"fmla z20.s, z6.s, z3.s[2]\n"
"fmla z24.s, z6.s, z4.s[2]\n"
"fmla z28.s, z6.s, z5.s[2]\n"
- "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.s, z7.s, z0.s[2]\n"
"fmla z13.s, z7.s, z1.s[2]\n"
"fmla z17.s, z7.s, z2.s[2]\n"
"fmla z21.s, z7.s, z3.s[2]\n"
"fmla z25.s, z7.s, z4.s[2]\n"
"fmla z29.s, z7.s, z5.s[2]\n"
- "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
+ "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
"fmla z10.s, z6.s, z0.s[2]\n"
"fmla z14.s, z6.s, z1.s[2]\n"
"fmla z18.s, z6.s, z2.s[2]\n"
@@ -2057,24 +2002,24 @@ void sve_hybrid_fp32_mla_6x4VL (
"fmla z23.s, z7.s, z3.s[2]\n"
"fmla z27.s, z7.s, z4.s[2]\n"
"fmla z31.s, z7.s, z5.s[2]\n"
- "ble 82f\n"
- "ld1w { z6.s }, p5/Z, [x15]\n"
+ "ble 76f\n"
+ "ld1w { z6.s }, p5/Z, [x10]\n"
"fmla z8.s, z6.s, z0.s[3]\n"
- "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+ "ld1w { z7.s }, p5/Z, [x10, #1, MUL VL]\n"
"fmla z12.s, z6.s, z1.s[3]\n"
"fmla z16.s, z6.s, z2.s[3]\n"
"fmla z20.s, z6.s, z3.s[3]\n"
"fmla z24.s, z6.s, z4.s[3]\n"
"fmla z28.s, z6.s, z5.s[3]\n"
- "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z9.s, z7.s, z0.s[3]\n"
"fmla z13.s, z7.s, z1.s[3]\n"
"fmla z17.s, z7.s, z2.s[3]\n"
"fmla z21.s, z7.s, z3.s[3]\n"
"fmla z25.s, z7.s, z4.s[3]\n"
"fmla z29.s, z7.s, z5.s[3]\n"
- "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
- "addvl x15, x15, #4\n"
+ "ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
"fmla z10.s, z6.s, z0.s[3]\n"
"fmla z14.s, z6.s, z1.s[3]\n"
"fmla z18.s, z6.s, z2.s[3]\n"
@@ -2087,18 +2032,24 @@ void sve_hybrid_fp32_mla_6x4VL (
"fmla z23.s, z7.s, z3.s[3]\n"
"fmla z27.s, z7.s, z4.s[3]\n"
"fmla z31.s, z7.s, z5.s[3]\n"
- "82:" // Height 6: Multiply loop: multiply skip
- "prfm pldl1keep, [x10, #0x80]\n"
- "add x12, x12, #0x1\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "76:" // Height 6: Multiply loop: multiply skip
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x27, x27, #0x1\n"
"prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
"prfm pldl1keep, [x22, #0x80]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
"prfm pldl1keep, [x20, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x12, x19\n"
- "bne 77b\n"
- "tbz %x[flags], #1, 83f\n"
+ "cmp x27, x19\n"
+ "bne 71b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x28, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "add x21, x22, x19, LSL #2\n"
+ "add x20, x21, x19, LSL #2\n"
+ "tbz %x[flags], #1, 77f\n"
"add x19, %x[args_ptr], %[offset_min]\n"
"ld1rw { z1.s }, p5/Z, [x19]\n"
"add x19, %x[args_ptr], %[offset_max]\n"
@@ -2151,57 +2102,52 @@ void sve_hybrid_fp32_mla_6x4VL (
"fmax z29.s, p5/M, z29.s, z1.s\n"
"fmax z30.s, p5/M, z30.s, z1.s\n"
"fmax z31.s, p5/M, z31.s, z1.s\n"
- "83:" // Height 6: No activation
- "st1w { z8.s }, p4, [x13]\n"
- "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
- "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
- "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
- "addvl x13, x13, #4\n"
- "st1w { z12.s }, p4, [x9]\n"
- "st1w { z13.s }, p3, [x9, #1, MUL VL]\n"
- "st1w { z14.s }, p2, [x9, #2, MUL VL]\n"
- "st1w { z15.s }, p1, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
- "st1w { z16.s }, p4, [x27]\n"
- "st1w { z17.s }, p3, [x27, #1, MUL VL]\n"
- "st1w { z18.s }, p2, [x27, #2, MUL VL]\n"
- "st1w { z19.s }, p1, [x27, #3, MUL VL]\n"
- "addvl x27, x27, #4\n"
- "st1w { z20.s }, p4, [x25]\n"
- "st1w { z21.s }, p3, [x25, #1, MUL VL]\n"
- "st1w { z22.s }, p2, [x25, #2, MUL VL]\n"
- "st1w { z23.s }, p1, [x25, #3, MUL VL]\n"
- "addvl x25, x25, #4\n"
- "st1w { z24.s }, p4, [x23]\n"
- "st1w { z25.s }, p3, [x23, #1, MUL VL]\n"
- "st1w { z26.s }, p2, [x23, #2, MUL VL]\n"
- "st1w { z27.s }, p1, [x23, #3, MUL VL]\n"
- "addvl x23, x23, #4\n"
- "st1w { z28.s }, p4, [x21]\n"
- "st1w { z29.s }, p3, [x21, #1, MUL VL]\n"
- "st1w { z30.s }, p2, [x21, #2, MUL VL]\n"
- "st1w { z31.s }, p1, [x21, #3, MUL VL]\n"
- "addvl x21, x21, #4\n"
- "84:" // Height 6: Writeback done
- "decw x16, ALL, MUL #4\n"
- "cmp x16, XZR\n"
- "bgt 73b\n"
+ "77:" // Height 6: No activation
+ "st1w { z8.s }, p4, [x28]\n"
+ "st1w { z9.s }, p3, [x28, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x28, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "st1w { z12.s }, p4, [x24]\n"
+ "st1w { z13.s }, p3, [x24, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x24, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x24, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x23]\n"
+ "st1w { z17.s }, p3, [x23, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x23, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x23, #3, MUL VL]\n"
+ "st1w { z20.s }, p4, [x22]\n"
+ "st1w { z21.s }, p3, [x22, #1, MUL VL]\n"
+ "st1w { z22.s }, p2, [x22, #2, MUL VL]\n"
+ "st1w { z23.s }, p1, [x22, #3, MUL VL]\n"
+ "st1w { z24.s }, p4, [x21]\n"
+ "st1w { z25.s }, p3, [x21, #1, MUL VL]\n"
+ "st1w { z26.s }, p2, [x21, #2, MUL VL]\n"
+ "st1w { z27.s }, p1, [x21, #3, MUL VL]\n"
+ "st1w { z28.s }, p4, [x20]\n"
+ "st1w { z29.s }, p3, [x20, #1, MUL VL]\n"
+ "st1w { z30.s }, p2, [x20, #2, MUL VL]\n"
+ "st1w { z31.s }, p1, [x20, #3, MUL VL]\n"
+ "78:" // Height 6: Writeback done
+ "decw x11, ALL, MUL #4\n"
+ "cmp x11, XZR\n"
+ "bgt 67b\n"
"subs %x[M], %x[M], #0x6\n"
- "beq 86f\n"
+ "beq 80f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "tbz %x[flags], #3, 85f\n"
+ "tbz %x[flags], #3, 79f\n"
"add x20, x20, #0x6\n"
"str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
"b 1b\n"
- "85:" // Update direct input
+ "79:" // Update direct input
"mov x19, #0x18\n"
"madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
"b 1b\n"
- "86:" // Exit
+ "80:" // Exit
: [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL.hpp
index 5238a9ba12..2273d97d5f 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL.hpp
@@ -36,7 +36,6 @@
namespace arm_gemm
{
-
// Actual kernel implementations
void sve_hybrid_fp32_mla_8x1VL( ARGLIST );
@@ -73,7 +72,6 @@ public:
// Default to the generic kernel
kern_type kernel=sve_hybrid_fp32_mla_8x1VL;
-
cls_sve_hybrid_fp32_mla_8x1VL(const CPUInfo *)
{
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/generic.cpp
index 0e45b06765..863325f7f5 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/generic.cpp
@@ -27,6 +27,7 @@
#include "../../utils.hpp"
#include <cassert>
+#include <limits>
namespace arm_gemm {
@@ -94,230 +95,219 @@ void sve_hybrid_fp32_mla_8x1VL (
"ptrue p2.b\n"
"1:" // Row loop
"cmp %x[M], #0x8\n"
- "bge 99f\n"
+ "bge 92f\n"
"cmp %x[M], #0x6\n"
- "bgt 85f\n"
- "beq 71f\n"
+ "bgt 79f\n"
+ "beq 66f\n"
"cmp %x[M], #0x4\n"
- "bgt 57f\n"
- "beq 43f\n"
+ "bgt 53f\n"
+ "beq 40f\n"
"cmp %x[M], #0x2\n"
- "bgt 29f\n"
- "beq 15f\n"
- "ldr x6, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x8, %x[bias]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "tbz %x[flags], #2, 2f\n"
- "ldr x17, [%x[output_ptr], #0x0]\n"
- "add x17, x17, x19, LSL #2\n"
- "b 3f\n"
- "2:" // Height 1: setup direct output
- "mov x17, %x[output_ptr]\n"
- "3:" // Height 1: Column loop
+ "bgt 27f\n"
+ "beq 14f\n"
+ "ldr x13, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x11, %x[bias]\n"
+ "mov x10, %x[output_ptr]\n"
+ "2:" // Height 1: Column loop
"mov x19, #0x0\n"
- "whilelt p1.s, x19, x6\n"
- "cbz x8, 4f\n"
- "ld1w { z24.s }, p2/Z, [x8]\n"
- "addvl x8, x8, #1\n"
- "b 6f\n"
- "4:" // Height 1: no bias
- "tbz %x[flags], #0, 5f\n"
- "ld1w { z24.s }, p1/Z, [x17]\n"
- "b 6f\n"
- "5:" // Height 1: no accumulate
+ "whilelt p1.s, x19, x13\n"
+ "cbz x11, 3f\n"
+ "ld1w { z24.s }, p2/Z, [x11]\n"
+ "addvl x11, x11, #1\n"
+ "b 5f\n"
+ "3:" // Height 1: no bias
+ "tbz %x[flags], #0, 4f\n"
+ "ld1w { z24.s }, p1/Z, [x10]\n"
+ "b 5f\n"
+ "4:" // Height 1: no accumulate
"mov z24.b, #0x0\n"
- "6:" // Height 1: setup done
- "mov x16, #0x0\n"
- "7:" // Height 1: String loop
+ "5:" // Height 1: setup done
+ "mov x9, #0x0\n"
+ "6:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w15, [x20, x16, LSL #0x2]\n"
- "tbz %x[flags], #3, 8f\n"
- "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n"
+ "ldr w28, [x20, x9, LSL #0x2]\n"
+ "tbz %x[flags], #3, 7f\n"
+ "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x14, [x20, #0x0]\n"
- "cbnz x16, 9f\n"
+ "ldr x27, [x20, #0x0]\n"
+ "cbnz x9, 8f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x14, x14, x19, LSL #2\n"
- "b 9f\n"
- "8:" // Height 1: setup direct input
- "mov x14, %x[input_ptr]\n"
- "9:" // Height 1: input setup done
- "cmp x15, #0x4\n"
- "ble 11f\n"
- "10:" // Height 1: Multiply loop: Main loop head
- "ld1w { z8.s }, p2/Z, [x7]\n"
- "whilelt p0.s, XZR, x15\n"
- "ld1w { z9.s }, p2/Z, [x7, #1, MUL VL]\n"
- "sub x15, x15, #0x4\n"
- "ld1rqw { z0.s }, p0/Z, [x14]\n"
+ "add x27, x27, x19, LSL #2\n"
+ "b 8f\n"
+ "7:" // Height 1: setup direct input
+ "mov x27, %x[input_ptr]\n"
+ "8:" // Height 1: input setup done
+ "cmp x28, #0x4\n"
+ "ble 10f\n"
+ "9:" // Height 1: Multiply loop: Main loop head
+ "ld1w { z8.s }, p2/Z, [x12]\n"
+ "whilelt p0.s, XZR, x28\n"
+ "ld1w { z9.s }, p2/Z, [x12, #1, MUL VL]\n"
+ "sub x28, x28, #0x4\n"
+ "ld1rqw { z0.s }, p0/Z, [x27]\n"
"fmla z24.s, z8.s, z0.s[0]\n"
- "ld1w { z10.s }, p2/Z, [x7, #2, MUL VL]\n"
- "add x14, x14, #0x10\n"
+ "ld1w { z10.s }, p2/Z, [x12, #2, MUL VL]\n"
+ "add x27, x27, #0x10\n"
"fmla z24.s, z9.s, z0.s[1]\n"
- "ld1w { z11.s }, p2/Z, [x7, #3, MUL VL]\n"
- "cmp x15, #0x4\n"
+ "ld1w { z11.s }, p2/Z, [x12, #3, MUL VL]\n"
+ "cmp x28, #0x4\n"
"fmla z24.s, z10.s, z0.s[2]\n"
- "prfm pldl1keep, [x14, #0x80]\n"
- "addvl x7, x7, #4\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
+ "addvl x12, x12, #4\n"
"fmla z24.s, z11.s, z0.s[3]\n"
- "bgt 10b\n"
- "11:" // Height 1: Multiply loop: Single iteration only
- "ld1w { z12.s }, p2/Z, [x7]\n"
- "whilelt p0.s, XZR, x15\n"
- "subs x15, x15, #0x1\n"
- "ld1rqw { z0.s }, p0/Z, [x14]\n"
- "fmla z24.s, z12.s, z0.s[0]\n"
- "add x14, x14, #0x10\n"
- "addvl x7, x7, #1\n"
- "ble 12f\n"
- "ld1w { z13.s }, p2/Z, [x7]\n"
- "fmla z24.s, z13.s, z0.s[1]\n"
- "subs x15, x15, #0x1\n"
- "addvl x7, x7, #1\n"
- "ble 12f\n"
- "ld1w { z14.s }, p2/Z, [x7]\n"
- "fmla z24.s, z14.s, z0.s[2]\n"
- "subs x15, x15, #0x1\n"
- "addvl x7, x7, #1\n"
- "ble 12f\n"
- "ld1w { z15.s }, p2/Z, [x7]\n"
- "fmla z24.s, z15.s, z0.s[3]\n"
- "addvl x7, x7, #1\n"
- "12:" // Height 1: Multiply loop: multiply skip
- "prfm pldl1keep, [x14, #0x80]\n"
- "add x16, x16, #0x1\n"
+ "bgt 9b\n"
+ "10:" // Height 1: Multiply loop: Single iteration only
+ "ld1w { z8.s }, p2/Z, [x12]\n"
+ "whilelt p0.s, XZR, x28\n"
+ "subs x28, x28, #0x1\n"
+ "ld1rqw { z0.s }, p0/Z, [x27]\n"
+ "fmla z24.s, z8.s, z0.s[0]\n"
+ "add x27, x27, #0x10\n"
+ "addvl x12, x12, #1\n"
+ "ble 11f\n"
+ "ld1w { z9.s }, p2/Z, [x12]\n"
+ "fmla z24.s, z9.s, z0.s[1]\n"
+ "subs x28, x28, #0x1\n"
+ "addvl x12, x12, #1\n"
+ "ble 11f\n"
+ "ld1w { z10.s }, p2/Z, [x12]\n"
+ "fmla z24.s, z10.s, z0.s[2]\n"
+ "subs x28, x28, #0x1\n"
+ "addvl x12, x12, #1\n"
+ "ble 11f\n"
+ "ld1w { z11.s }, p2/Z, [x12]\n"
+ "fmla z24.s, z11.s, z0.s[3]\n"
+ "addvl x12, x12, #1\n"
+ "11:" // Height 1: Multiply loop: multiply skip
+ "prfm pldl1keep, [x27, #0x80]\n"
+ "add x9, x9, #0x1\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x16, x19\n"
- "bne 7b\n"
- "tbz %x[flags], #1, 13f\n"
+ "cmp x9, x19\n"
+ "bne 6b\n"
+ "tbz %x[flags], #1, 12f\n"
"add x19, %x[args_ptr], %[offset_min]\n"
"ld1rw { z17.s }, p2/Z, [x19]\n"
"add x19, %x[args_ptr], %[offset_max]\n"
"ld1rw { z16.s }, p2/Z, [x19]\n"
"fmin z24.s, p2/M, z24.s, z16.s\n"
"fmax z24.s, p2/M, z24.s, z17.s\n"
- "13:" // Height 1: No activation
- "st1w { z24.s }, p1, [x17]\n"
- "addvl x17, x17, #1\n"
- "14:" // Height 1: Writeback done
- "decw x6\n"
- "cmp x6, XZR\n"
- "bgt 3b\n"
- "b 114f\n"
- "15:" // Height 2
- "ldr x6, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x8, %x[bias]\n"
- "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "tbz %x[flags], #2, 16f\n"
- "ldr x17, [%x[output_ptr], #0x0]\n"
- "add x17, x17, x19, LSL #2\n"
- "ldr x13, [%x[output_ptr], #0x8]\n"
- "add x13, x13, x19, LSL #2\n"
- "b 17f\n"
- "16:" // Height 2: setup direct output
- "mov x17, %x[output_ptr]\n"
- "add x13, x17, x19, LSL #2\n"
- "17:" // Height 2: Column loop
+ "12:" // Height 1: No activation
+ "st1w { z24.s }, p1, [x10]\n"
+ "addvl x10, x10, #1\n"
+ "13:" // Height 1: Writeback done
+ "decw x13\n"
+ "cmp x13, XZR\n"
+ "bgt 2b\n"
+ "b 106f\n"
+ "14:" // Height 2
+ "ldr x13, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x11, %x[bias]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x10, %x[output_ptr]\n"
+ "15:" // Height 2: Column loop
"mov x19, #0x0\n"
- "whilelt p1.s, x19, x6\n"
- "cbz x8, 18f\n"
- "ld1w { z24.s }, p2/Z, [x8]\n"
+ "whilelt p1.s, x19, x13\n"
+ "cbz x11, 16f\n"
+ "ld1w { z24.s }, p2/Z, [x11]\n"
"mov z25.d, z24.d\n"
- "addvl x8, x8, #1\n"
- "b 20f\n"
- "18:" // Height 2: no bias
- "tbz %x[flags], #0, 19f\n"
- "ld1w { z24.s }, p1/Z, [x17]\n"
- "ld1w { z25.s }, p1/Z, [x13]\n"
- "b 20f\n"
- "19:" // Height 2: no accumulate
+ "addvl x11, x11, #1\n"
+ "b 18f\n"
+ "16:" // Height 2: no bias
+ "tbz %x[flags], #0, 17f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ld1w { z24.s }, p1/Z, [x10]\n"
+ "add x26, x10, x19, LSL #2\n"
+ "ld1w { z25.s }, p1/Z, [x26]\n"
+ "b 18f\n"
+ "17:" // Height 2: no accumulate
"mov z24.b, #0x0\n"
"mov z25.b, #0x0\n"
- "20:" // Height 2: setup done
- "mov x16, #0x0\n"
- "21:" // Height 2: String loop
+ "18:" // Height 2: setup done
+ "mov x9, #0x0\n"
+ "19:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w15, [x20, x16, LSL #0x2]\n"
- "tbz %x[flags], #3, 22f\n"
- "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n"
+ "ldr w28, [x20, x9, LSL #0x2]\n"
+ "tbz %x[flags], #3, 20f\n"
+ "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x14, [x20, #0x0]\n"
- "ldr x12, [x20, #0x8]\n"
- "cbnz x16, 23f\n"
+ "ldr x27, [x20, #0x0]\n"
+ "ldr x26, [x20, #0x8]\n"
+ "cbnz x9, 21f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x14, x14, x19, LSL #2\n"
- "add x12, x12, x19, LSL #2\n"
- "b 23f\n"
- "22:" // Height 2: setup direct input
- "mov x14, %x[input_ptr]\n"
- "add x12, x14, x19, LSL #2\n"
- "23:" // Height 2: input setup done
- "cmp x15, #0x4\n"
- "ble 25f\n"
- "24:" // Height 2: Multiply loop: Main loop head
- "ld1w { z8.s }, p2/Z, [x7]\n"
- "whilelt p0.s, XZR, x15\n"
- "ld1w { z9.s }, p2/Z, [x7, #1, MUL VL]\n"
- "sub x15, x15, #0x4\n"
- "ld1rqw { z0.s }, p0/Z, [x14]\n"
+ "add x27, x27, x19, LSL #2\n"
+ "add x26, x26, x19, LSL #2\n"
+ "b 21f\n"
+ "20:" // Height 2: setup direct input
+ "mov x27, %x[input_ptr]\n"
+ "add x26, x27, x19, LSL #2\n"
+ "21:" // Height 2: input setup done
+ "cmp x28, #0x4\n"
+ "ble 23f\n"
+ "22:" // Height 2: Multiply loop: Main loop head
+ "ld1w { z8.s }, p2/Z, [x12]\n"
+ "whilelt p0.s, XZR, x28\n"
+ "ld1w { z9.s }, p2/Z, [x12, #1, MUL VL]\n"
+ "sub x28, x28, #0x4\n"
+ "ld1rqw { z0.s }, p0/Z, [x27]\n"
"fmla z24.s, z8.s, z0.s[0]\n"
- "ld1rqw { z1.s }, p0/Z, [x12]\n"
- "add x14, x14, #0x10\n"
+ "ld1rqw { z1.s }, p0/Z, [x26]\n"
+ "add x27, x27, #0x10\n"
"fmla z25.s, z8.s, z1.s[0]\n"
- "ld1w { z10.s }, p2/Z, [x7, #2, MUL VL]\n"
- "add x12, x12, #0x10\n"
+ "ld1w { z10.s }, p2/Z, [x12, #2, MUL VL]\n"
+ "add x26, x26, #0x10\n"
"fmla z24.s, z9.s, z0.s[1]\n"
- "ld1w { z11.s }, p2/Z, [x7, #3, MUL VL]\n"
- "cmp x15, #0x4\n"
+ "ld1w { z11.s }, p2/Z, [x12, #3, MUL VL]\n"
+ "cmp x28, #0x4\n"
"fmla z25.s, z9.s, z1.s[1]\n"
- "prfm pldl1keep, [x14, #0x80]\n"
- "addvl x7, x7, #4\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
+ "addvl x12, x12, #4\n"
"fmla z24.s, z10.s, z0.s[2]\n"
- "prfm pldl1keep, [x12, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
"fmla z25.s, z10.s, z1.s[2]\n"
"fmla z24.s, z11.s, z0.s[3]\n"
"fmla z25.s, z11.s, z1.s[3]\n"
- "bgt 24b\n"
- "25:" // Height 2: Multiply loop: Single iteration only
- "ld1w { z12.s }, p2/Z, [x7]\n"
- "whilelt p0.s, XZR, x15\n"
- "subs x15, x15, #0x1\n"
- "ld1rqw { z0.s }, p0/Z, [x14]\n"
- "fmla z24.s, z12.s, z0.s[0]\n"
- "ld1rqw { z1.s }, p0/Z, [x12]\n"
- "add x14, x14, #0x10\n"
- "fmla z25.s, z12.s, z1.s[0]\n"
- "add x12, x12, #0x10\n"
- "addvl x7, x7, #1\n"
- "ble 26f\n"
- "ld1w { z13.s }, p2/Z, [x7]\n"
- "fmla z24.s, z13.s, z0.s[1]\n"
- "subs x15, x15, #0x1\n"
- "fmla z25.s, z13.s, z1.s[1]\n"
- "addvl x7, x7, #1\n"
- "ble 26f\n"
- "ld1w { z14.s }, p2/Z, [x7]\n"
- "fmla z24.s, z14.s, z0.s[2]\n"
- "subs x15, x15, #0x1\n"
- "fmla z25.s, z14.s, z1.s[2]\n"
- "addvl x7, x7, #1\n"
- "ble 26f\n"
- "ld1w { z15.s }, p2/Z, [x7]\n"
- "fmla z24.s, z15.s, z0.s[3]\n"
- "addvl x7, x7, #1\n"
- "fmla z25.s, z15.s, z1.s[3]\n"
- "26:" // Height 2: Multiply loop: multiply skip
- "prfm pldl1keep, [x14, #0x80]\n"
- "add x16, x16, #0x1\n"
- "prfm pldl1keep, [x12, #0x80]\n"
+ "bgt 22b\n"
+ "23:" // Height 2: Multiply loop: Single iteration only
+ "ld1w { z8.s }, p2/Z, [x12]\n"
+ "whilelt p0.s, XZR, x28\n"
+ "subs x28, x28, #0x1\n"
+ "ld1rqw { z0.s }, p0/Z, [x27]\n"
+ "fmla z24.s, z8.s, z0.s[0]\n"
+ "ld1rqw { z1.s }, p0/Z, [x26]\n"
+ "add x27, x27, #0x10\n"
+ "fmla z25.s, z8.s, z1.s[0]\n"
+ "add x26, x26, #0x10\n"
+ "addvl x12, x12, #1\n"
+ "ble 24f\n"
+ "ld1w { z9.s }, p2/Z, [x12]\n"
+ "fmla z24.s, z9.s, z0.s[1]\n"
+ "subs x28, x28, #0x1\n"
+ "fmla z25.s, z9.s, z1.s[1]\n"
+ "addvl x12, x12, #1\n"
+ "ble 24f\n"
+ "ld1w { z10.s }, p2/Z, [x12]\n"
+ "fmla z24.s, z10.s, z0.s[2]\n"
+ "subs x28, x28, #0x1\n"
+ "fmla z25.s, z10.s, z1.s[2]\n"
+ "addvl x12, x12, #1\n"
+ "ble 24f\n"
+ "ld1w { z11.s }, p2/Z, [x12]\n"
+ "fmla z24.s, z11.s, z0.s[3]\n"
+ "addvl x12, x12, #1\n"
+ "fmla z25.s, z11.s, z1.s[3]\n"
+ "24:" // Height 2: Multiply loop: multiply skip
+ "prfm pldl1keep, [x27, #0x80]\n"
+ "add x9, x9, #0x1\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x16, x19\n"
- "bne 21b\n"
- "tbz %x[flags], #1, 27f\n"
+ "cmp x9, x19\n"
+ "bne 19b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x26, x10, x19, LSL #2\n"
+ "tbz %x[flags], #1, 25f\n"
"add x19, %x[args_ptr], %[offset_min]\n"
"ld1rw { z17.s }, p2/Z, [x19]\n"
"add x19, %x[args_ptr], %[offset_max]\n"
@@ -326,151 +316,144 @@ void sve_hybrid_fp32_mla_8x1VL (
"fmin z25.s, p2/M, z25.s, z16.s\n"
"fmax z24.s, p2/M, z24.s, z17.s\n"
"fmax z25.s, p2/M, z25.s, z17.s\n"
- "27:" // Height 2: No activation
- "st1w { z24.s }, p1, [x17]\n"
- "addvl x17, x17, #1\n"
- "st1w { z25.s }, p1, [x13]\n"
- "addvl x13, x13, #1\n"
- "28:" // Height 2: Writeback done
- "decw x6\n"
- "cmp x6, XZR\n"
- "bgt 17b\n"
- "b 114f\n"
- "29:" // Height 3
- "ldr x6, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x8, %x[bias]\n"
- "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "tbz %x[flags], #2, 30f\n"
- "ldr x17, [%x[output_ptr], #0x0]\n"
- "add x17, x17, x19, LSL #2\n"
- "ldr x13, [%x[output_ptr], #0x8]\n"
- "ldr x11, [%x[output_ptr], #0x10]\n"
- "add x13, x13, x19, LSL #2\n"
- "add x11, x11, x19, LSL #2\n"
- "b 31f\n"
- "30:" // Height 3: setup direct output
- "mov x17, %x[output_ptr]\n"
- "add x13, x17, x19, LSL #2\n"
- "add x11, x13, x19, LSL #2\n"
- "31:" // Height 3: Column loop
+ "25:" // Height 2: No activation
+ "st1w { z24.s }, p1, [x10]\n"
+ "addvl x10, x10, #1\n"
+ "st1w { z25.s }, p1, [x26]\n"
+ "26:" // Height 2: Writeback done
+ "decw x13\n"
+ "cmp x13, XZR\n"
+ "bgt 15b\n"
+ "b 106f\n"
+ "27:" // Height 3
+ "ldr x13, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x11, %x[bias]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x10, %x[output_ptr]\n"
+ "28:" // Height 3: Column loop
"mov x19, #0x0\n"
- "whilelt p1.s, x19, x6\n"
- "cbz x8, 32f\n"
- "ld1w { z24.s }, p2/Z, [x8]\n"
+ "whilelt p1.s, x19, x13\n"
+ "cbz x11, 29f\n"
+ "ld1w { z24.s }, p2/Z, [x11]\n"
"mov z25.d, z24.d\n"
- "addvl x8, x8, #1\n"
+ "addvl x11, x11, #1\n"
"mov z26.d, z24.d\n"
- "b 34f\n"
- "32:" // Height 3: no bias
- "tbz %x[flags], #0, 33f\n"
- "ld1w { z24.s }, p1/Z, [x17]\n"
- "ld1w { z25.s }, p1/Z, [x13]\n"
- "ld1w { z26.s }, p1/Z, [x11]\n"
- "b 34f\n"
- "33:" // Height 3: no accumulate
+ "b 31f\n"
+ "29:" // Height 3: no bias
+ "tbz %x[flags], #0, 30f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ld1w { z24.s }, p1/Z, [x10]\n"
+ "add x26, x10, x19, LSL #2\n"
+ "ld1w { z25.s }, p1/Z, [x26]\n"
+ "add x25, x26, x19, LSL #2\n"
+ "ld1w { z26.s }, p1/Z, [x25]\n"
+ "b 31f\n"
+ "30:" // Height 3: no accumulate
"mov z24.b, #0x0\n"
"mov z25.b, #0x0\n"
"mov z26.b, #0x0\n"
- "34:" // Height 3: setup done
- "mov x16, #0x0\n"
- "35:" // Height 3: String loop
+ "31:" // Height 3: setup done
+ "mov x9, #0x0\n"
+ "32:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w15, [x20, x16, LSL #0x2]\n"
- "tbz %x[flags], #3, 36f\n"
- "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n"
+ "ldr w28, [x20, x9, LSL #0x2]\n"
+ "tbz %x[flags], #3, 33f\n"
+ "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x14, [x20, #0x0]\n"
- "ldr x12, [x20, #0x8]\n"
- "ldr x10, [x20, #0x10]\n"
- "cbnz x16, 37f\n"
+ "ldr x27, [x20, #0x0]\n"
+ "ldr x26, [x20, #0x8]\n"
+ "ldr x25, [x20, #0x10]\n"
+ "cbnz x9, 34f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x14, x14, x19, LSL #2\n"
- "add x12, x12, x19, LSL #2\n"
- "add x10, x10, x19, LSL #2\n"
- "b 37f\n"
- "36:" // Height 3: setup direct input
- "mov x14, %x[input_ptr]\n"
- "add x12, x14, x19, LSL #2\n"
- "add x10, x12, x19, LSL #2\n"
- "37:" // Height 3: input setup done
- "cmp x15, #0x4\n"
- "ble 39f\n"
- "38:" // Height 3: Multiply loop: Main loop head
- "ld1w { z8.s }, p2/Z, [x7]\n"
- "whilelt p0.s, XZR, x15\n"
- "ld1w { z9.s }, p2/Z, [x7, #1, MUL VL]\n"
- "sub x15, x15, #0x4\n"
- "ld1rqw { z0.s }, p0/Z, [x14]\n"
+ "add x27, x27, x19, LSL #2\n"
+ "add x26, x26, x19, LSL #2\n"
+ "add x25, x25, x19, LSL #2\n"
+ "b 34f\n"
+ "33:" // Height 3: setup direct input
+ "mov x27, %x[input_ptr]\n"
+ "add x26, x27, x19, LSL #2\n"
+ "add x25, x26, x19, LSL #2\n"
+ "34:" // Height 3: input setup done
+ "cmp x28, #0x4\n"
+ "ble 36f\n"
+ "35:" // Height 3: Multiply loop: Main loop head
+ "ld1w { z8.s }, p2/Z, [x12]\n"
+ "whilelt p0.s, XZR, x28\n"
+ "ld1w { z9.s }, p2/Z, [x12, #1, MUL VL]\n"
+ "sub x28, x28, #0x4\n"
+ "ld1rqw { z0.s }, p0/Z, [x27]\n"
"fmla z24.s, z8.s, z0.s[0]\n"
- "ld1rqw { z1.s }, p0/Z, [x12]\n"
- "add x14, x14, #0x10\n"
+ "ld1rqw { z1.s }, p0/Z, [x26]\n"
+ "add x27, x27, #0x10\n"
"fmla z25.s, z8.s, z1.s[0]\n"
- "ld1rqw { z2.s }, p0/Z, [x10]\n"
- "add x12, x12, #0x10\n"
+ "ld1rqw { z2.s }, p0/Z, [x25]\n"
+ "add x26, x26, #0x10\n"
"fmla z24.s, z9.s, z0.s[1]\n"
- "ld1w { z10.s }, p2/Z, [x7, #2, MUL VL]\n"
- "add x10, x10, #0x10\n"
+ "ld1w { z10.s }, p2/Z, [x12, #2, MUL VL]\n"
+ "add x25, x25, #0x10\n"
"fmla z26.s, z8.s, z2.s[0]\n"
- "ld1w { z11.s }, p2/Z, [x7, #3, MUL VL]\n"
- "cmp x15, #0x4\n"
+ "ld1w { z11.s }, p2/Z, [x12, #3, MUL VL]\n"
+ "cmp x28, #0x4\n"
"fmla z25.s, z9.s, z1.s[1]\n"
- "prfm pldl1keep, [x14, #0x80]\n"
- "addvl x7, x7, #4\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
+ "addvl x12, x12, #4\n"
"fmla z24.s, z10.s, z0.s[2]\n"
- "prfm pldl1keep, [x12, #0x80]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
"fmla z26.s, z9.s, z2.s[1]\n"
"fmla z25.s, z10.s, z1.s[2]\n"
"fmla z24.s, z11.s, z0.s[3]\n"
"fmla z26.s, z10.s, z2.s[2]\n"
"fmla z25.s, z11.s, z1.s[3]\n"
"fmla z26.s, z11.s, z2.s[3]\n"
- "bgt 38b\n"
- "39:" // Height 3: Multiply loop: Single iteration only
- "ld1w { z12.s }, p2/Z, [x7]\n"
- "whilelt p0.s, XZR, x15\n"
- "subs x15, x15, #0x1\n"
- "ld1rqw { z0.s }, p0/Z, [x14]\n"
- "fmla z24.s, z12.s, z0.s[0]\n"
- "ld1rqw { z1.s }, p0/Z, [x12]\n"
- "add x14, x14, #0x10\n"
- "fmla z25.s, z12.s, z1.s[0]\n"
- "ld1rqw { z2.s }, p0/Z, [x10]\n"
- "add x12, x12, #0x10\n"
- "fmla z26.s, z12.s, z2.s[0]\n"
- "add x10, x10, #0x10\n"
- "addvl x7, x7, #1\n"
- "ble 40f\n"
- "ld1w { z13.s }, p2/Z, [x7]\n"
- "fmla z24.s, z13.s, z0.s[1]\n"
- "subs x15, x15, #0x1\n"
- "fmla z25.s, z13.s, z1.s[1]\n"
- "addvl x7, x7, #1\n"
- "fmla z26.s, z13.s, z2.s[1]\n"
- "ble 40f\n"
- "ld1w { z14.s }, p2/Z, [x7]\n"
- "fmla z24.s, z14.s, z0.s[2]\n"
- "subs x15, x15, #0x1\n"
- "fmla z25.s, z14.s, z1.s[2]\n"
- "addvl x7, x7, #1\n"
- "fmla z26.s, z14.s, z2.s[2]\n"
- "ble 40f\n"
- "ld1w { z15.s }, p2/Z, [x7]\n"
- "fmla z24.s, z15.s, z0.s[3]\n"
- "addvl x7, x7, #1\n"
- "fmla z25.s, z15.s, z1.s[3]\n"
- "fmla z26.s, z15.s, z2.s[3]\n"
- "40:" // Height 3: Multiply loop: multiply skip
- "prfm pldl1keep, [x14, #0x80]\n"
- "add x16, x16, #0x1\n"
- "prfm pldl1keep, [x12, #0x80]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "bgt 35b\n"
+ "36:" // Height 3: Multiply loop: Single iteration only
+ "ld1w { z8.s }, p2/Z, [x12]\n"
+ "whilelt p0.s, XZR, x28\n"
+ "subs x28, x28, #0x1\n"
+ "ld1rqw { z0.s }, p0/Z, [x27]\n"
+ "fmla z24.s, z8.s, z0.s[0]\n"
+ "ld1rqw { z1.s }, p0/Z, [x26]\n"
+ "add x27, x27, #0x10\n"
+ "fmla z25.s, z8.s, z1.s[0]\n"
+ "ld1rqw { z2.s }, p0/Z, [x25]\n"
+ "add x26, x26, #0x10\n"
+ "fmla z26.s, z8.s, z2.s[0]\n"
+ "add x25, x25, #0x10\n"
+ "addvl x12, x12, #1\n"
+ "ble 37f\n"
+ "ld1w { z9.s }, p2/Z, [x12]\n"
+ "fmla z24.s, z9.s, z0.s[1]\n"
+ "subs x28, x28, #0x1\n"
+ "fmla z25.s, z9.s, z1.s[1]\n"
+ "addvl x12, x12, #1\n"
+ "fmla z26.s, z9.s, z2.s[1]\n"
+ "ble 37f\n"
+ "ld1w { z10.s }, p2/Z, [x12]\n"
+ "fmla z24.s, z10.s, z0.s[2]\n"
+ "subs x28, x28, #0x1\n"
+ "fmla z25.s, z10.s, z1.s[2]\n"
+ "addvl x12, x12, #1\n"
+ "fmla z26.s, z10.s, z2.s[2]\n"
+ "ble 37f\n"
+ "ld1w { z11.s }, p2/Z, [x12]\n"
+ "fmla z24.s, z11.s, z0.s[3]\n"
+ "addvl x12, x12, #1\n"
+ "fmla z25.s, z11.s, z1.s[3]\n"
+ "fmla z26.s, z11.s, z2.s[3]\n"
+ "37:" // Height 3: Multiply loop: multiply skip
+ "prfm pldl1keep, [x27, #0x80]\n"
+ "add x9, x9, #0x1\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x16, x19\n"
- "bne 35b\n"
- "tbz %x[flags], #1, 41f\n"
+ "cmp x9, x19\n"
+ "bne 32b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x26, x10, x19, LSL #2\n"
+ "add x25, x26, x19, LSL #2\n"
+ "tbz %x[flags], #1, 38f\n"
"add x19, %x[args_ptr], %[offset_min]\n"
"ld1rw { z17.s }, p2/Z, [x19]\n"
"add x19, %x[args_ptr], %[offset_max]\n"
@@ -481,117 +464,104 @@ void sve_hybrid_fp32_mla_8x1VL (
"fmax z24.s, p2/M, z24.s, z17.s\n"
"fmax z25.s, p2/M, z25.s, z17.s\n"
"fmax z26.s, p2/M, z26.s, z17.s\n"
- "41:" // Height 3: No activation
- "st1w { z24.s }, p1, [x17]\n"
- "addvl x17, x17, #1\n"
- "st1w { z25.s }, p1, [x13]\n"
- "addvl x13, x13, #1\n"
- "st1w { z26.s }, p1, [x11]\n"
- "addvl x11, x11, #1\n"
- "42:" // Height 3: Writeback done
- "decw x6\n"
- "cmp x6, XZR\n"
- "bgt 31b\n"
- "b 114f\n"
- "43:" // Height 4
- "ldr x6, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x8, %x[bias]\n"
- "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "tbz %x[flags], #2, 44f\n"
- "ldr x17, [%x[output_ptr], #0x0]\n"
- "add x17, x17, x19, LSL #2\n"
- "ldr x13, [%x[output_ptr], #0x8]\n"
- "ldr x11, [%x[output_ptr], #0x10]\n"
- "add x13, x13, x19, LSL #2\n"
- "ldr x9, [%x[output_ptr], #0x18]\n"
- "add x11, x11, x19, LSL #2\n"
- "add x9, x9, x19, LSL #2\n"
- "b 45f\n"
- "44:" // Height 4: setup direct output
- "mov x17, %x[output_ptr]\n"
- "add x13, x17, x19, LSL #2\n"
- "add x11, x13, x19, LSL #2\n"
- "add x9, x11, x19, LSL #2\n"
- "45:" // Height 4: Column loop
+ "38:" // Height 3: No activation
+ "st1w { z24.s }, p1, [x10]\n"
+ "addvl x10, x10, #1\n"
+ "st1w { z25.s }, p1, [x26]\n"
+ "st1w { z26.s }, p1, [x25]\n"
+ "39:" // Height 3: Writeback done
+ "decw x13\n"
+ "cmp x13, XZR\n"
+ "bgt 28b\n"
+ "b 106f\n"
+ "40:" // Height 4
+ "ldr x13, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x11, %x[bias]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x10, %x[output_ptr]\n"
+ "41:" // Height 4: Column loop
"mov x19, #0x0\n"
- "whilelt p1.s, x19, x6\n"
- "cbz x8, 46f\n"
- "ld1w { z24.s }, p2/Z, [x8]\n"
+ "whilelt p1.s, x19, x13\n"
+ "cbz x11, 42f\n"
+ "ld1w { z24.s }, p2/Z, [x11]\n"
"mov z25.d, z24.d\n"
- "addvl x8, x8, #1\n"
+ "addvl x11, x11, #1\n"
"mov z26.d, z24.d\n"
"mov z27.d, z24.d\n"
- "b 48f\n"
- "46:" // Height 4: no bias
- "tbz %x[flags], #0, 47f\n"
- "ld1w { z24.s }, p1/Z, [x17]\n"
- "ld1w { z25.s }, p1/Z, [x13]\n"
- "ld1w { z26.s }, p1/Z, [x11]\n"
- "ld1w { z27.s }, p1/Z, [x9]\n"
- "b 48f\n"
- "47:" // Height 4: no accumulate
+ "b 44f\n"
+ "42:" // Height 4: no bias
+ "tbz %x[flags], #0, 43f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ld1w { z24.s }, p1/Z, [x10]\n"
+ "add x26, x10, x19, LSL #2\n"
+ "ld1w { z25.s }, p1/Z, [x26]\n"
+ "add x25, x26, x19, LSL #2\n"
+ "ld1w { z26.s }, p1/Z, [x25]\n"
+ "add x24, x25, x19, LSL #2\n"
+ "ld1w { z27.s }, p1/Z, [x24]\n"
+ "b 44f\n"
+ "43:" // Height 4: no accumulate
"mov z24.b, #0x0\n"
"mov z25.b, #0x0\n"
"mov z26.b, #0x0\n"
"mov z27.b, #0x0\n"
- "48:" // Height 4: setup done
- "mov x16, #0x0\n"
- "49:" // Height 4: String loop
+ "44:" // Height 4: setup done
+ "mov x9, #0x0\n"
+ "45:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w15, [x20, x16, LSL #0x2]\n"
- "tbz %x[flags], #3, 50f\n"
- "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n"
+ "ldr w28, [x20, x9, LSL #0x2]\n"
+ "tbz %x[flags], #3, 46f\n"
+ "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x14, [x20, #0x0]\n"
- "ldr x12, [x20, #0x8]\n"
- "ldr x10, [x20, #0x10]\n"
- "ldr x28, [x20, #0x18]\n"
- "cbnz x16, 51f\n"
+ "ldr x27, [x20, #0x0]\n"
+ "ldr x26, [x20, #0x8]\n"
+ "ldr x25, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "cbnz x9, 47f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x14, x14, x19, LSL #2\n"
- "add x12, x12, x19, LSL #2\n"
- "add x10, x10, x19, LSL #2\n"
- "add x28, x28, x19, LSL #2\n"
- "b 51f\n"
- "50:" // Height 4: setup direct input
- "mov x14, %x[input_ptr]\n"
- "add x12, x14, x19, LSL #2\n"
- "add x10, x12, x19, LSL #2\n"
- "add x28, x10, x19, LSL #2\n"
- "51:" // Height 4: input setup done
- "cmp x15, #0x4\n"
- "ble 53f\n"
- "52:" // Height 4: Multiply loop: Main loop head
- "ld1w { z8.s }, p2/Z, [x7]\n"
- "whilelt p0.s, XZR, x15\n"
- "ld1w { z9.s }, p2/Z, [x7, #1, MUL VL]\n"
- "sub x15, x15, #0x4\n"
- "ld1rqw { z0.s }, p0/Z, [x14]\n"
+ "add x27, x27, x19, LSL #2\n"
+ "add x26, x26, x19, LSL #2\n"
+ "add x25, x25, x19, LSL #2\n"
+ "add x24, x24, x19, LSL #2\n"
+ "b 47f\n"
+ "46:" // Height 4: setup direct input
+ "mov x27, %x[input_ptr]\n"
+ "add x26, x27, x19, LSL #2\n"
+ "add x25, x26, x19, LSL #2\n"
+ "add x24, x25, x19, LSL #2\n"
+ "47:" // Height 4: input setup done
+ "cmp x28, #0x4\n"
+ "ble 49f\n"
+ "48:" // Height 4: Multiply loop: Main loop head
+ "ld1w { z8.s }, p2/Z, [x12]\n"
+ "whilelt p0.s, XZR, x28\n"
+ "ld1w { z9.s }, p2/Z, [x12, #1, MUL VL]\n"
+ "sub x28, x28, #0x4\n"
+ "ld1rqw { z0.s }, p0/Z, [x27]\n"
"fmla z24.s, z8.s, z0.s[0]\n"
- "ld1rqw { z1.s }, p0/Z, [x12]\n"
- "add x14, x14, #0x10\n"
+ "ld1rqw { z1.s }, p0/Z, [x26]\n"
+ "add x27, x27, #0x10\n"
"fmla z25.s, z8.s, z1.s[0]\n"
- "ld1rqw { z2.s }, p0/Z, [x10]\n"
- "add x12, x12, #0x10\n"
+ "ld1rqw { z2.s }, p0/Z, [x25]\n"
+ "add x26, x26, #0x10\n"
"fmla z24.s, z9.s, z0.s[1]\n"
- "ld1rqw { z3.s }, p0/Z, [x28]\n"
- "add x10, x10, #0x10\n"
+ "ld1rqw { z3.s }, p0/Z, [x24]\n"
+ "add x25, x25, #0x10\n"
"fmla z26.s, z8.s, z2.s[0]\n"
- "ld1w { z10.s }, p2/Z, [x7, #2, MUL VL]\n"
- "add x28, x28, #0x10\n"
+ "ld1w { z10.s }, p2/Z, [x12, #2, MUL VL]\n"
+ "add x24, x24, #0x10\n"
"fmla z27.s, z8.s, z3.s[0]\n"
- "ld1w { z11.s }, p2/Z, [x7, #3, MUL VL]\n"
- "cmp x15, #0x4\n"
+ "ld1w { z11.s }, p2/Z, [x12, #3, MUL VL]\n"
+ "cmp x28, #0x4\n"
"fmla z25.s, z9.s, z1.s[1]\n"
- "prfm pldl1keep, [x14, #0x80]\n"
- "addvl x7, x7, #4\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
+ "addvl x12, x12, #4\n"
"fmla z24.s, z10.s, z0.s[2]\n"
- "prfm pldl1keep, [x12, #0x80]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
"fmla z26.s, z9.s, z2.s[1]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
"fmla z27.s, z9.s, z3.s[1]\n"
"fmla z25.s, z10.s, z1.s[2]\n"
"fmla z24.s, z11.s, z0.s[3]\n"
@@ -600,57 +570,61 @@ void sve_hybrid_fp32_mla_8x1VL (
"fmla z25.s, z11.s, z1.s[3]\n"
"fmla z26.s, z11.s, z2.s[3]\n"
"fmla z27.s, z11.s, z3.s[3]\n"
- "bgt 52b\n"
- "53:" // Height 4: Multiply loop: Single iteration only
- "ld1w { z12.s }, p2/Z, [x7]\n"
- "whilelt p0.s, XZR, x15\n"
- "subs x15, x15, #0x1\n"
- "ld1rqw { z0.s }, p0/Z, [x14]\n"
- "fmla z24.s, z12.s, z0.s[0]\n"
- "ld1rqw { z1.s }, p0/Z, [x12]\n"
- "add x14, x14, #0x10\n"
- "fmla z25.s, z12.s, z1.s[0]\n"
- "ld1rqw { z2.s }, p0/Z, [x10]\n"
- "add x12, x12, #0x10\n"
- "fmla z26.s, z12.s, z2.s[0]\n"
- "ld1rqw { z3.s }, p0/Z, [x28]\n"
- "add x10, x10, #0x10\n"
- "fmla z27.s, z12.s, z3.s[0]\n"
- "add x28, x28, #0x10\n"
- "addvl x7, x7, #1\n"
- "ble 54f\n"
- "ld1w { z13.s }, p2/Z, [x7]\n"
- "fmla z24.s, z13.s, z0.s[1]\n"
- "subs x15, x15, #0x1\n"
- "fmla z25.s, z13.s, z1.s[1]\n"
- "addvl x7, x7, #1\n"
- "fmla z26.s, z13.s, z2.s[1]\n"
- "fmla z27.s, z13.s, z3.s[1]\n"
- "ble 54f\n"
- "ld1w { z14.s }, p2/Z, [x7]\n"
- "fmla z24.s, z14.s, z0.s[2]\n"
- "subs x15, x15, #0x1\n"
- "fmla z25.s, z14.s, z1.s[2]\n"
- "addvl x7, x7, #1\n"
- "fmla z26.s, z14.s, z2.s[2]\n"
- "fmla z27.s, z14.s, z3.s[2]\n"
- "ble 54f\n"
- "ld1w { z15.s }, p2/Z, [x7]\n"
- "fmla z24.s, z15.s, z0.s[3]\n"
- "addvl x7, x7, #1\n"
- "fmla z25.s, z15.s, z1.s[3]\n"
- "fmla z26.s, z15.s, z2.s[3]\n"
- "fmla z27.s, z15.s, z3.s[3]\n"
- "54:" // Height 4: Multiply loop: multiply skip
- "prfm pldl1keep, [x14, #0x80]\n"
- "add x16, x16, #0x1\n"
- "prfm pldl1keep, [x12, #0x80]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
+ "bgt 48b\n"
+ "49:" // Height 4: Multiply loop: Single iteration only
+ "ld1w { z8.s }, p2/Z, [x12]\n"
+ "whilelt p0.s, XZR, x28\n"
+ "subs x28, x28, #0x1\n"
+ "ld1rqw { z0.s }, p0/Z, [x27]\n"
+ "fmla z24.s, z8.s, z0.s[0]\n"
+ "ld1rqw { z1.s }, p0/Z, [x26]\n"
+ "add x27, x27, #0x10\n"
+ "fmla z25.s, z8.s, z1.s[0]\n"
+ "ld1rqw { z2.s }, p0/Z, [x25]\n"
+ "add x26, x26, #0x10\n"
+ "fmla z26.s, z8.s, z2.s[0]\n"
+ "ld1rqw { z3.s }, p0/Z, [x24]\n"
+ "add x25, x25, #0x10\n"
+ "fmla z27.s, z8.s, z3.s[0]\n"
+ "add x24, x24, #0x10\n"
+ "addvl x12, x12, #1\n"
+ "ble 50f\n"
+ "ld1w { z9.s }, p2/Z, [x12]\n"
+ "fmla z24.s, z9.s, z0.s[1]\n"
+ "subs x28, x28, #0x1\n"
+ "fmla z25.s, z9.s, z1.s[1]\n"
+ "addvl x12, x12, #1\n"
+ "fmla z26.s, z9.s, z2.s[1]\n"
+ "fmla z27.s, z9.s, z3.s[1]\n"
+ "ble 50f\n"
+ "ld1w { z10.s }, p2/Z, [x12]\n"
+ "fmla z24.s, z10.s, z0.s[2]\n"
+ "subs x28, x28, #0x1\n"
+ "fmla z25.s, z10.s, z1.s[2]\n"
+ "addvl x12, x12, #1\n"
+ "fmla z26.s, z10.s, z2.s[2]\n"
+ "fmla z27.s, z10.s, z3.s[2]\n"
+ "ble 50f\n"
+ "ld1w { z11.s }, p2/Z, [x12]\n"
+ "fmla z24.s, z11.s, z0.s[3]\n"
+ "addvl x12, x12, #1\n"
+ "fmla z25.s, z11.s, z1.s[3]\n"
+ "fmla z26.s, z11.s, z2.s[3]\n"
+ "fmla z27.s, z11.s, z3.s[3]\n"
+ "50:" // Height 4: Multiply loop: multiply skip
+ "prfm pldl1keep, [x27, #0x80]\n"
+ "add x9, x9, #0x1\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x16, x19\n"
- "bne 49b\n"
- "tbz %x[flags], #1, 55f\n"
+ "cmp x9, x19\n"
+ "bne 45b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x26, x10, x19, LSL #2\n"
+ "add x25, x26, x19, LSL #2\n"
+ "add x24, x25, x19, LSL #2\n"
+ "tbz %x[flags], #1, 51f\n"
"add x19, %x[args_ptr], %[offset_min]\n"
"ld1rw { z17.s }, p2/Z, [x19]\n"
"add x19, %x[args_ptr], %[offset_max]\n"
@@ -663,134 +637,118 @@ void sve_hybrid_fp32_mla_8x1VL (
"fmax z25.s, p2/M, z25.s, z17.s\n"
"fmax z26.s, p2/M, z26.s, z17.s\n"
"fmax z27.s, p2/M, z27.s, z17.s\n"
- "55:" // Height 4: No activation
- "st1w { z24.s }, p1, [x17]\n"
- "addvl x17, x17, #1\n"
- "st1w { z25.s }, p1, [x13]\n"
- "addvl x13, x13, #1\n"
- "st1w { z26.s }, p1, [x11]\n"
- "addvl x11, x11, #1\n"
- "st1w { z27.s }, p1, [x9]\n"
- "addvl x9, x9, #1\n"
- "56:" // Height 4: Writeback done
- "decw x6\n"
- "cmp x6, XZR\n"
- "bgt 45b\n"
- "b 114f\n"
- "57:" // Height 5
- "ldr x6, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x8, %x[bias]\n"
- "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "tbz %x[flags], #2, 58f\n"
- "ldr x17, [%x[output_ptr], #0x0]\n"
- "add x17, x17, x19, LSL #2\n"
- "ldr x13, [%x[output_ptr], #0x8]\n"
- "ldr x11, [%x[output_ptr], #0x10]\n"
- "add x13, x13, x19, LSL #2\n"
- "ldr x9, [%x[output_ptr], #0x18]\n"
- "ldr x27, [%x[output_ptr], #0x20]\n"
- "add x11, x11, x19, LSL #2\n"
- "add x9, x9, x19, LSL #2\n"
- "add x27, x27, x19, LSL #2\n"
- "b 59f\n"
- "58:" // Height 5: setup direct output
- "mov x17, %x[output_ptr]\n"
- "add x13, x17, x19, LSL #2\n"
- "add x11, x13, x19, LSL #2\n"
- "add x9, x11, x19, LSL #2\n"
- "add x27, x9, x19, LSL #2\n"
- "59:" // Height 5: Column loop
+ "51:" // Height 4: No activation
+ "st1w { z24.s }, p1, [x10]\n"
+ "addvl x10, x10, #1\n"
+ "st1w { z25.s }, p1, [x26]\n"
+ "st1w { z26.s }, p1, [x25]\n"
+ "st1w { z27.s }, p1, [x24]\n"
+ "52:" // Height 4: Writeback done
+ "decw x13\n"
+ "cmp x13, XZR\n"
+ "bgt 41b\n"
+ "b 106f\n"
+ "53:" // Height 5
+ "ldr x13, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x11, %x[bias]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x10, %x[output_ptr]\n"
+ "54:" // Height 5: Column loop
"mov x19, #0x0\n"
- "whilelt p1.s, x19, x6\n"
- "cbz x8, 60f\n"
- "ld1w { z24.s }, p2/Z, [x8]\n"
+ "whilelt p1.s, x19, x13\n"
+ "cbz x11, 55f\n"
+ "ld1w { z24.s }, p2/Z, [x11]\n"
"mov z25.d, z24.d\n"
- "addvl x8, x8, #1\n"
+ "addvl x11, x11, #1\n"
"mov z26.d, z24.d\n"
"mov z27.d, z24.d\n"
"mov z28.d, z24.d\n"
- "b 62f\n"
- "60:" // Height 5: no bias
- "tbz %x[flags], #0, 61f\n"
- "ld1w { z24.s }, p1/Z, [x17]\n"
- "ld1w { z25.s }, p1/Z, [x13]\n"
- "ld1w { z26.s }, p1/Z, [x11]\n"
- "ld1w { z27.s }, p1/Z, [x9]\n"
- "ld1w { z28.s }, p1/Z, [x27]\n"
- "b 62f\n"
- "61:" // Height 5: no accumulate
+ "b 57f\n"
+ "55:" // Height 5: no bias
+ "tbz %x[flags], #0, 56f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ld1w { z24.s }, p1/Z, [x10]\n"
+ "add x26, x10, x19, LSL #2\n"
+ "ld1w { z25.s }, p1/Z, [x26]\n"
+ "add x25, x26, x19, LSL #2\n"
+ "ld1w { z26.s }, p1/Z, [x25]\n"
+ "add x24, x25, x19, LSL #2\n"
+ "ld1w { z27.s }, p1/Z, [x24]\n"
+ "add x23, x24, x19, LSL #2\n"
+ "ld1w { z28.s }, p1/Z, [x23]\n"
+ "b 57f\n"
+ "56:" // Height 5: no accumulate
"mov z24.b, #0x0\n"
"mov z25.b, #0x0\n"
"mov z26.b, #0x0\n"
"mov z27.b, #0x0\n"
"mov z28.b, #0x0\n"
- "62:" // Height 5: setup done
- "mov x16, #0x0\n"
- "63:" // Height 5: String loop
+ "57:" // Height 5: setup done
+ "mov x9, #0x0\n"
+ "58:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w15, [x20, x16, LSL #0x2]\n"
- "tbz %x[flags], #3, 64f\n"
- "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n"
+ "ldr w28, [x20, x9, LSL #0x2]\n"
+ "tbz %x[flags], #3, 59f\n"
+ "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x14, [x20, #0x0]\n"
- "ldr x12, [x20, #0x8]\n"
- "ldr x10, [x20, #0x10]\n"
- "ldr x28, [x20, #0x18]\n"
- "ldr x26, [x20, #0x20]\n"
- "cbnz x16, 65f\n"
+ "ldr x27, [x20, #0x0]\n"
+ "ldr x26, [x20, #0x8]\n"
+ "ldr x25, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "ldr x23, [x20, #0x20]\n"
+ "cbnz x9, 60f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x14, x14, x19, LSL #2\n"
- "add x12, x12, x19, LSL #2\n"
- "add x10, x10, x19, LSL #2\n"
- "add x28, x28, x19, LSL #2\n"
+ "add x27, x27, x19, LSL #2\n"
"add x26, x26, x19, LSL #2\n"
- "b 65f\n"
- "64:" // Height 5: setup direct input
- "mov x14, %x[input_ptr]\n"
- "add x12, x14, x19, LSL #2\n"
- "add x10, x12, x19, LSL #2\n"
- "add x28, x10, x19, LSL #2\n"
- "add x26, x28, x19, LSL #2\n"
- "65:" // Height 5: input setup done
- "cmp x15, #0x4\n"
- "ble 67f\n"
- "66:" // Height 5: Multiply loop: Main loop head
- "ld1w { z8.s }, p2/Z, [x7]\n"
- "whilelt p0.s, XZR, x15\n"
- "ld1w { z9.s }, p2/Z, [x7, #1, MUL VL]\n"
- "sub x15, x15, #0x4\n"
- "ld1rqw { z0.s }, p0/Z, [x14]\n"
+ "add x25, x25, x19, LSL #2\n"
+ "add x24, x24, x19, LSL #2\n"
+ "add x23, x23, x19, LSL #2\n"
+ "b 60f\n"
+ "59:" // Height 5: setup direct input
+ "mov x27, %x[input_ptr]\n"
+ "add x26, x27, x19, LSL #2\n"
+ "add x25, x26, x19, LSL #2\n"
+ "add x24, x25, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "60:" // Height 5: input setup done
+ "cmp x28, #0x4\n"
+ "ble 62f\n"
+ "61:" // Height 5: Multiply loop: Main loop head
+ "ld1w { z8.s }, p2/Z, [x12]\n"
+ "whilelt p0.s, XZR, x28\n"
+ "ld1w { z9.s }, p2/Z, [x12, #1, MUL VL]\n"
+ "sub x28, x28, #0x4\n"
+ "ld1rqw { z0.s }, p0/Z, [x27]\n"
"fmla z24.s, z8.s, z0.s[0]\n"
- "ld1rqw { z1.s }, p0/Z, [x12]\n"
- "add x14, x14, #0x10\n"
+ "ld1rqw { z1.s }, p0/Z, [x26]\n"
+ "add x27, x27, #0x10\n"
"fmla z25.s, z8.s, z1.s[0]\n"
- "ld1rqw { z2.s }, p0/Z, [x10]\n"
- "add x12, x12, #0x10\n"
+ "ld1rqw { z2.s }, p0/Z, [x25]\n"
+ "add x26, x26, #0x10\n"
"fmla z24.s, z9.s, z0.s[1]\n"
- "ld1rqw { z3.s }, p0/Z, [x28]\n"
- "add x10, x10, #0x10\n"
+ "ld1rqw { z3.s }, p0/Z, [x24]\n"
+ "add x25, x25, #0x10\n"
"fmla z26.s, z8.s, z2.s[0]\n"
- "ld1rqw { z4.s }, p0/Z, [x26]\n"
- "add x28, x28, #0x10\n"
+ "ld1rqw { z4.s }, p0/Z, [x23]\n"
+ "add x24, x24, #0x10\n"
"fmla z27.s, z8.s, z3.s[0]\n"
- "ld1w { z10.s }, p2/Z, [x7, #2, MUL VL]\n"
- "add x26, x26, #0x10\n"
+ "ld1w { z10.s }, p2/Z, [x12, #2, MUL VL]\n"
+ "add x23, x23, #0x10\n"
"fmla z25.s, z9.s, z1.s[1]\n"
- "ld1w { z11.s }, p2/Z, [x7, #3, MUL VL]\n"
- "cmp x15, #0x4\n"
+ "ld1w { z11.s }, p2/Z, [x12, #3, MUL VL]\n"
+ "cmp x28, #0x4\n"
"fmla z28.s, z8.s, z4.s[0]\n"
- "prfm pldl1keep, [x14, #0x80]\n"
- "addvl x7, x7, #4\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
+ "addvl x12, x12, #4\n"
"fmla z26.s, z9.s, z2.s[1]\n"
- "prfm pldl1keep, [x12, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
"fmla z24.s, z10.s, z0.s[2]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
"fmla z27.s, z9.s, z3.s[1]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
"fmla z25.s, z10.s, z1.s[2]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
"fmla z28.s, z9.s, z4.s[1]\n"
"fmla z26.s, z10.s, z2.s[2]\n"
"fmla z27.s, z10.s, z3.s[2]\n"
@@ -800,64 +758,69 @@ void sve_hybrid_fp32_mla_8x1VL (
"fmla z26.s, z11.s, z2.s[3]\n"
"fmla z27.s, z11.s, z3.s[3]\n"
"fmla z28.s, z11.s, z4.s[3]\n"
- "bgt 66b\n"
- "67:" // Height 5: Multiply loop: Single iteration only
- "ld1w { z12.s }, p2/Z, [x7]\n"
- "whilelt p0.s, XZR, x15\n"
- "subs x15, x15, #0x1\n"
- "ld1rqw { z0.s }, p0/Z, [x14]\n"
- "fmla z24.s, z12.s, z0.s[0]\n"
- "ld1rqw { z1.s }, p0/Z, [x12]\n"
- "add x14, x14, #0x10\n"
- "fmla z25.s, z12.s, z1.s[0]\n"
- "ld1rqw { z2.s }, p0/Z, [x10]\n"
- "add x12, x12, #0x10\n"
- "fmla z26.s, z12.s, z2.s[0]\n"
- "ld1rqw { z3.s }, p0/Z, [x28]\n"
- "add x10, x10, #0x10\n"
- "fmla z27.s, z12.s, z3.s[0]\n"
- "ld1rqw { z4.s }, p0/Z, [x26]\n"
- "add x28, x28, #0x10\n"
- "fmla z28.s, z12.s, z4.s[0]\n"
+ "bgt 61b\n"
+ "62:" // Height 5: Multiply loop: Single iteration only
+ "ld1w { z8.s }, p2/Z, [x12]\n"
+ "whilelt p0.s, XZR, x28\n"
+ "subs x28, x28, #0x1\n"
+ "ld1rqw { z0.s }, p0/Z, [x27]\n"
+ "fmla z24.s, z8.s, z0.s[0]\n"
+ "ld1rqw { z1.s }, p0/Z, [x26]\n"
+ "add x27, x27, #0x10\n"
+ "fmla z25.s, z8.s, z1.s[0]\n"
+ "ld1rqw { z2.s }, p0/Z, [x25]\n"
"add x26, x26, #0x10\n"
- "addvl x7, x7, #1\n"
- "ble 68f\n"
- "ld1w { z13.s }, p2/Z, [x7]\n"
- "fmla z24.s, z13.s, z0.s[1]\n"
- "subs x15, x15, #0x1\n"
- "fmla z25.s, z13.s, z1.s[1]\n"
- "addvl x7, x7, #1\n"
- "fmla z26.s, z13.s, z2.s[1]\n"
- "fmla z27.s, z13.s, z3.s[1]\n"
- "fmla z28.s, z13.s, z4.s[1]\n"
- "ble 68f\n"
- "ld1w { z14.s }, p2/Z, [x7]\n"
- "fmla z24.s, z14.s, z0.s[2]\n"
- "subs x15, x15, #0x1\n"
- "fmla z25.s, z14.s, z1.s[2]\n"
- "addvl x7, x7, #1\n"
- "fmla z26.s, z14.s, z2.s[2]\n"
- "fmla z27.s, z14.s, z3.s[2]\n"
- "fmla z28.s, z14.s, z4.s[2]\n"
- "ble 68f\n"
- "ld1w { z15.s }, p2/Z, [x7]\n"
- "fmla z24.s, z15.s, z0.s[3]\n"
- "addvl x7, x7, #1\n"
- "fmla z25.s, z15.s, z1.s[3]\n"
- "fmla z26.s, z15.s, z2.s[3]\n"
- "fmla z27.s, z15.s, z3.s[3]\n"
- "fmla z28.s, z15.s, z4.s[3]\n"
- "68:" // Height 5: Multiply loop: multiply skip
- "prfm pldl1keep, [x14, #0x80]\n"
- "add x16, x16, #0x1\n"
- "prfm pldl1keep, [x12, #0x80]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
+ "fmla z26.s, z8.s, z2.s[0]\n"
+ "ld1rqw { z3.s }, p0/Z, [x24]\n"
+ "add x25, x25, #0x10\n"
+ "fmla z27.s, z8.s, z3.s[0]\n"
+ "ld1rqw { z4.s }, p0/Z, [x23]\n"
+ "add x24, x24, #0x10\n"
+ "fmla z28.s, z8.s, z4.s[0]\n"
+ "add x23, x23, #0x10\n"
+ "addvl x12, x12, #1\n"
+ "ble 63f\n"
+ "ld1w { z9.s }, p2/Z, [x12]\n"
+ "fmla z24.s, z9.s, z0.s[1]\n"
+ "subs x28, x28, #0x1\n"
+ "fmla z25.s, z9.s, z1.s[1]\n"
+ "addvl x12, x12, #1\n"
+ "fmla z26.s, z9.s, z2.s[1]\n"
+ "fmla z27.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z9.s, z4.s[1]\n"
+ "ble 63f\n"
+ "ld1w { z10.s }, p2/Z, [x12]\n"
+ "fmla z24.s, z10.s, z0.s[2]\n"
+ "subs x28, x28, #0x1\n"
+ "fmla z25.s, z10.s, z1.s[2]\n"
+ "addvl x12, x12, #1\n"
+ "fmla z26.s, z10.s, z2.s[2]\n"
+ "fmla z27.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z10.s, z4.s[2]\n"
+ "ble 63f\n"
+ "ld1w { z11.s }, p2/Z, [x12]\n"
+ "fmla z24.s, z11.s, z0.s[3]\n"
+ "addvl x12, x12, #1\n"
+ "fmla z25.s, z11.s, z1.s[3]\n"
+ "fmla z26.s, z11.s, z2.s[3]\n"
+ "fmla z27.s, z11.s, z3.s[3]\n"
+ "fmla z28.s, z11.s, z4.s[3]\n"
+ "63:" // Height 5: Multiply loop: multiply skip
+ "prfm pldl1keep, [x27, #0x80]\n"
+ "add x9, x9, #0x1\n"
"prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x16, x19\n"
- "bne 63b\n"
- "tbz %x[flags], #1, 69f\n"
+ "cmp x9, x19\n"
+ "bne 58b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x26, x10, x19, LSL #2\n"
+ "add x25, x26, x19, LSL #2\n"
+ "add x24, x25, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "tbz %x[flags], #1, 64f\n"
"add x19, %x[args_ptr], %[offset_min]\n"
"ld1rw { z17.s }, p2/Z, [x19]\n"
"add x19, %x[args_ptr], %[offset_max]\n"
@@ -872,150 +835,131 @@ void sve_hybrid_fp32_mla_8x1VL (
"fmax z26.s, p2/M, z26.s, z17.s\n"
"fmax z27.s, p2/M, z27.s, z17.s\n"
"fmax z28.s, p2/M, z28.s, z17.s\n"
- "69:" // Height 5: No activation
- "st1w { z24.s }, p1, [x17]\n"
- "addvl x17, x17, #1\n"
- "st1w { z25.s }, p1, [x13]\n"
- "addvl x13, x13, #1\n"
- "st1w { z26.s }, p1, [x11]\n"
- "addvl x11, x11, #1\n"
- "st1w { z27.s }, p1, [x9]\n"
- "addvl x9, x9, #1\n"
- "st1w { z28.s }, p1, [x27]\n"
- "addvl x27, x27, #1\n"
- "70:" // Height 5: Writeback done
- "decw x6\n"
- "cmp x6, XZR\n"
- "bgt 59b\n"
- "b 114f\n"
- "71:" // Height 6
- "ldr x6, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x8, %x[bias]\n"
- "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "tbz %x[flags], #2, 72f\n"
- "ldr x17, [%x[output_ptr], #0x0]\n"
- "add x17, x17, x19, LSL #2\n"
- "ldr x13, [%x[output_ptr], #0x8]\n"
- "ldr x11, [%x[output_ptr], #0x10]\n"
- "add x13, x13, x19, LSL #2\n"
- "ldr x9, [%x[output_ptr], #0x18]\n"
- "ldr x27, [%x[output_ptr], #0x20]\n"
- "add x11, x11, x19, LSL #2\n"
- "ldr x25, [%x[output_ptr], #0x28]\n"
- "add x9, x9, x19, LSL #2\n"
- "add x27, x27, x19, LSL #2\n"
- "add x25, x25, x19, LSL #2\n"
- "b 73f\n"
- "72:" // Height 6: setup direct output
- "mov x17, %x[output_ptr]\n"
- "add x13, x17, x19, LSL #2\n"
- "add x11, x13, x19, LSL #2\n"
- "add x9, x11, x19, LSL #2\n"
- "add x27, x9, x19, LSL #2\n"
- "add x25, x27, x19, LSL #2\n"
- "73:" // Height 6: Column loop
+ "64:" // Height 5: No activation
+ "st1w { z24.s }, p1, [x10]\n"
+ "addvl x10, x10, #1\n"
+ "st1w { z25.s }, p1, [x26]\n"
+ "st1w { z26.s }, p1, [x25]\n"
+ "st1w { z27.s }, p1, [x24]\n"
+ "st1w { z28.s }, p1, [x23]\n"
+ "65:" // Height 5: Writeback done
+ "decw x13\n"
+ "cmp x13, XZR\n"
+ "bgt 54b\n"
+ "b 106f\n"
+ "66:" // Height 6
+ "ldr x13, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x11, %x[bias]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x10, %x[output_ptr]\n"
+ "67:" // Height 6: Column loop
"mov x19, #0x0\n"
- "whilelt p1.s, x19, x6\n"
- "cbz x8, 74f\n"
- "ld1w { z24.s }, p2/Z, [x8]\n"
+ "whilelt p1.s, x19, x13\n"
+ "cbz x11, 68f\n"
+ "ld1w { z24.s }, p2/Z, [x11]\n"
"mov z25.d, z24.d\n"
- "addvl x8, x8, #1\n"
+ "addvl x11, x11, #1\n"
"mov z26.d, z24.d\n"
"mov z27.d, z24.d\n"
"mov z28.d, z24.d\n"
"mov z29.d, z24.d\n"
- "b 76f\n"
- "74:" // Height 6: no bias
- "tbz %x[flags], #0, 75f\n"
- "ld1w { z24.s }, p1/Z, [x17]\n"
- "ld1w { z25.s }, p1/Z, [x13]\n"
- "ld1w { z26.s }, p1/Z, [x11]\n"
- "ld1w { z27.s }, p1/Z, [x9]\n"
- "ld1w { z28.s }, p1/Z, [x27]\n"
- "ld1w { z29.s }, p1/Z, [x25]\n"
- "b 76f\n"
- "75:" // Height 6: no accumulate
+ "b 70f\n"
+ "68:" // Height 6: no bias
+ "tbz %x[flags], #0, 69f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ld1w { z24.s }, p1/Z, [x10]\n"
+ "add x26, x10, x19, LSL #2\n"
+ "ld1w { z25.s }, p1/Z, [x26]\n"
+ "add x25, x26, x19, LSL #2\n"
+ "ld1w { z26.s }, p1/Z, [x25]\n"
+ "add x24, x25, x19, LSL #2\n"
+ "ld1w { z27.s }, p1/Z, [x24]\n"
+ "add x23, x24, x19, LSL #2\n"
+ "ld1w { z28.s }, p1/Z, [x23]\n"
+ "add x22, x23, x19, LSL #2\n"
+ "ld1w { z29.s }, p1/Z, [x22]\n"
+ "b 70f\n"
+ "69:" // Height 6: no accumulate
"mov z24.b, #0x0\n"
"mov z25.b, #0x0\n"
"mov z26.b, #0x0\n"
"mov z27.b, #0x0\n"
"mov z28.b, #0x0\n"
"mov z29.b, #0x0\n"
- "76:" // Height 6: setup done
- "mov x16, #0x0\n"
- "77:" // Height 6: String loop
+ "70:" // Height 6: setup done
+ "mov x9, #0x0\n"
+ "71:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w15, [x20, x16, LSL #0x2]\n"
- "tbz %x[flags], #3, 78f\n"
- "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n"
+ "ldr w28, [x20, x9, LSL #0x2]\n"
+ "tbz %x[flags], #3, 72f\n"
+ "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x14, [x20, #0x0]\n"
- "ldr x12, [x20, #0x8]\n"
- "ldr x10, [x20, #0x10]\n"
- "ldr x28, [x20, #0x18]\n"
- "ldr x26, [x20, #0x20]\n"
- "ldr x24, [x20, #0x28]\n"
- "cbnz x16, 79f\n"
+ "ldr x27, [x20, #0x0]\n"
+ "ldr x26, [x20, #0x8]\n"
+ "ldr x25, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "ldr x23, [x20, #0x20]\n"
+ "ldr x22, [x20, #0x28]\n"
+ "cbnz x9, 73f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x14, x14, x19, LSL #2\n"
- "add x12, x12, x19, LSL #2\n"
- "add x10, x10, x19, LSL #2\n"
- "add x28, x28, x19, LSL #2\n"
+ "add x27, x27, x19, LSL #2\n"
"add x26, x26, x19, LSL #2\n"
+ "add x25, x25, x19, LSL #2\n"
"add x24, x24, x19, LSL #2\n"
- "b 79f\n"
- "78:" // Height 6: setup direct input
- "mov x14, %x[input_ptr]\n"
- "add x12, x14, x19, LSL #2\n"
- "add x10, x12, x19, LSL #2\n"
- "add x28, x10, x19, LSL #2\n"
- "add x26, x28, x19, LSL #2\n"
- "add x24, x26, x19, LSL #2\n"
- "79:" // Height 6: input setup done
- "cmp x15, #0x4\n"
- "ble 81f\n"
- "80:" // Height 6: Multiply loop: Main loop head
- "ld1w { z8.s }, p2/Z, [x7]\n"
- "whilelt p0.s, XZR, x15\n"
- "ld1w { z9.s }, p2/Z, [x7, #1, MUL VL]\n"
- "sub x15, x15, #0x4\n"
- "ld1rqw { z0.s }, p0/Z, [x14]\n"
+ "add x23, x23, x19, LSL #2\n"
+ "add x22, x22, x19, LSL #2\n"
+ "b 73f\n"
+ "72:" // Height 6: setup direct input
+ "mov x27, %x[input_ptr]\n"
+ "add x26, x27, x19, LSL #2\n"
+ "add x25, x26, x19, LSL #2\n"
+ "add x24, x25, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "73:" // Height 6: input setup done
+ "cmp x28, #0x4\n"
+ "ble 75f\n"
+ "74:" // Height 6: Multiply loop: Main loop head
+ "ld1w { z8.s }, p2/Z, [x12]\n"
+ "whilelt p0.s, XZR, x28\n"
+ "ld1w { z9.s }, p2/Z, [x12, #1, MUL VL]\n"
+ "sub x28, x28, #0x4\n"
+ "ld1rqw { z0.s }, p0/Z, [x27]\n"
"fmla z24.s, z8.s, z0.s[0]\n"
- "ld1rqw { z1.s }, p0/Z, [x12]\n"
- "add x14, x14, #0x10\n"
+ "ld1rqw { z1.s }, p0/Z, [x26]\n"
+ "add x27, x27, #0x10\n"
"fmla z25.s, z8.s, z1.s[0]\n"
- "ld1rqw { z2.s }, p0/Z, [x10]\n"
- "add x12, x12, #0x10\n"
+ "ld1rqw { z2.s }, p0/Z, [x25]\n"
+ "add x26, x26, #0x10\n"
"fmla z24.s, z9.s, z0.s[1]\n"
- "ld1rqw { z3.s }, p0/Z, [x28]\n"
- "add x10, x10, #0x10\n"
+ "ld1rqw { z3.s }, p0/Z, [x24]\n"
+ "add x25, x25, #0x10\n"
"fmla z26.s, z8.s, z2.s[0]\n"
- "ld1rqw { z4.s }, p0/Z, [x26]\n"
- "add x28, x28, #0x10\n"
+ "ld1rqw { z4.s }, p0/Z, [x23]\n"
+ "add x24, x24, #0x10\n"
"fmla z27.s, z8.s, z3.s[0]\n"
- "ld1rqw { z5.s }, p0/Z, [x24]\n"
- "add x26, x26, #0x10\n"
+ "ld1rqw { z5.s }, p0/Z, [x22]\n"
+ "add x23, x23, #0x10\n"
"fmla z25.s, z9.s, z1.s[1]\n"
- "ld1w { z10.s }, p2/Z, [x7, #2, MUL VL]\n"
- "add x24, x24, #0x10\n"
+ "ld1w { z10.s }, p2/Z, [x12, #2, MUL VL]\n"
+ "add x22, x22, #0x10\n"
"fmla z28.s, z8.s, z4.s[0]\n"
- "ld1w { z11.s }, p2/Z, [x7, #3, MUL VL]\n"
- "cmp x15, #0x4\n"
+ "ld1w { z11.s }, p2/Z, [x12, #3, MUL VL]\n"
+ "cmp x28, #0x4\n"
"fmla z29.s, z8.s, z5.s[0]\n"
- "prfm pldl1keep, [x14, #0x80]\n"
- "addvl x7, x7, #4\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
+ "addvl x12, x12, #4\n"
"fmla z26.s, z9.s, z2.s[1]\n"
- "prfm pldl1keep, [x12, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
"fmla z27.s, z9.s, z3.s[1]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
"fmla z24.s, z10.s, z0.s[2]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
"fmla z28.s, z9.s, z4.s[1]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
"fmla z29.s, z9.s, z5.s[1]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
"fmla z25.s, z10.s, z1.s[2]\n"
"fmla z26.s, z10.s, z2.s[2]\n"
"fmla z27.s, z10.s, z3.s[2]\n"
@@ -1027,71 +971,77 @@ void sve_hybrid_fp32_mla_8x1VL (
"fmla z27.s, z11.s, z3.s[3]\n"
"fmla z28.s, z11.s, z4.s[3]\n"
"fmla z29.s, z11.s, z5.s[3]\n"
- "bgt 80b\n"
- "81:" // Height 6: Multiply loop: Single iteration only
- "ld1w { z12.s }, p2/Z, [x7]\n"
- "whilelt p0.s, XZR, x15\n"
- "subs x15, x15, #0x1\n"
- "ld1rqw { z0.s }, p0/Z, [x14]\n"
- "fmla z24.s, z12.s, z0.s[0]\n"
- "ld1rqw { z1.s }, p0/Z, [x12]\n"
- "add x14, x14, #0x10\n"
- "fmla z25.s, z12.s, z1.s[0]\n"
- "ld1rqw { z2.s }, p0/Z, [x10]\n"
- "add x12, x12, #0x10\n"
- "fmla z26.s, z12.s, z2.s[0]\n"
- "ld1rqw { z3.s }, p0/Z, [x28]\n"
- "add x10, x10, #0x10\n"
- "fmla z27.s, z12.s, z3.s[0]\n"
- "ld1rqw { z4.s }, p0/Z, [x26]\n"
- "add x28, x28, #0x10\n"
- "fmla z28.s, z12.s, z4.s[0]\n"
- "ld1rqw { z5.s }, p0/Z, [x24]\n"
+ "bgt 74b\n"
+ "75:" // Height 6: Multiply loop: Single iteration only
+ "ld1w { z8.s }, p2/Z, [x12]\n"
+ "whilelt p0.s, XZR, x28\n"
+ "subs x28, x28, #0x1\n"
+ "ld1rqw { z0.s }, p0/Z, [x27]\n"
+ "fmla z24.s, z8.s, z0.s[0]\n"
+ "ld1rqw { z1.s }, p0/Z, [x26]\n"
+ "add x27, x27, #0x10\n"
+ "fmla z25.s, z8.s, z1.s[0]\n"
+ "ld1rqw { z2.s }, p0/Z, [x25]\n"
"add x26, x26, #0x10\n"
- "fmla z29.s, z12.s, z5.s[0]\n"
+ "fmla z26.s, z8.s, z2.s[0]\n"
+ "ld1rqw { z3.s }, p0/Z, [x24]\n"
+ "add x25, x25, #0x10\n"
+ "fmla z27.s, z8.s, z3.s[0]\n"
+ "ld1rqw { z4.s }, p0/Z, [x23]\n"
"add x24, x24, #0x10\n"
- "addvl x7, x7, #1\n"
- "ble 82f\n"
- "ld1w { z13.s }, p2/Z, [x7]\n"
- "fmla z24.s, z13.s, z0.s[1]\n"
- "subs x15, x15, #0x1\n"
- "fmla z25.s, z13.s, z1.s[1]\n"
- "addvl x7, x7, #1\n"
- "fmla z26.s, z13.s, z2.s[1]\n"
- "fmla z27.s, z13.s, z3.s[1]\n"
- "fmla z28.s, z13.s, z4.s[1]\n"
- "fmla z29.s, z13.s, z5.s[1]\n"
- "ble 82f\n"
- "ld1w { z14.s }, p2/Z, [x7]\n"
- "fmla z24.s, z14.s, z0.s[2]\n"
- "subs x15, x15, #0x1\n"
- "fmla z25.s, z14.s, z1.s[2]\n"
- "addvl x7, x7, #1\n"
- "fmla z26.s, z14.s, z2.s[2]\n"
- "fmla z27.s, z14.s, z3.s[2]\n"
- "fmla z28.s, z14.s, z4.s[2]\n"
- "fmla z29.s, z14.s, z5.s[2]\n"
- "ble 82f\n"
- "ld1w { z15.s }, p2/Z, [x7]\n"
- "fmla z24.s, z15.s, z0.s[3]\n"
- "addvl x7, x7, #1\n"
- "fmla z25.s, z15.s, z1.s[3]\n"
- "fmla z26.s, z15.s, z2.s[3]\n"
- "fmla z27.s, z15.s, z3.s[3]\n"
- "fmla z28.s, z15.s, z4.s[3]\n"
- "fmla z29.s, z15.s, z5.s[3]\n"
- "82:" // Height 6: Multiply loop: multiply skip
- "prfm pldl1keep, [x14, #0x80]\n"
- "add x16, x16, #0x1\n"
- "prfm pldl1keep, [x12, #0x80]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
+ "fmla z28.s, z8.s, z4.s[0]\n"
+ "ld1rqw { z5.s }, p0/Z, [x22]\n"
+ "add x23, x23, #0x10\n"
+ "fmla z29.s, z8.s, z5.s[0]\n"
+ "add x22, x22, #0x10\n"
+ "addvl x12, x12, #1\n"
+ "ble 76f\n"
+ "ld1w { z9.s }, p2/Z, [x12]\n"
+ "fmla z24.s, z9.s, z0.s[1]\n"
+ "subs x28, x28, #0x1\n"
+ "fmla z25.s, z9.s, z1.s[1]\n"
+ "addvl x12, x12, #1\n"
+ "fmla z26.s, z9.s, z2.s[1]\n"
+ "fmla z27.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z9.s, z4.s[1]\n"
+ "fmla z29.s, z9.s, z5.s[1]\n"
+ "ble 76f\n"
+ "ld1w { z10.s }, p2/Z, [x12]\n"
+ "fmla z24.s, z10.s, z0.s[2]\n"
+ "subs x28, x28, #0x1\n"
+ "fmla z25.s, z10.s, z1.s[2]\n"
+ "addvl x12, x12, #1\n"
+ "fmla z26.s, z10.s, z2.s[2]\n"
+ "fmla z27.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z10.s, z4.s[2]\n"
+ "fmla z29.s, z10.s, z5.s[2]\n"
+ "ble 76f\n"
+ "ld1w { z11.s }, p2/Z, [x12]\n"
+ "fmla z24.s, z11.s, z0.s[3]\n"
+ "addvl x12, x12, #1\n"
+ "fmla z25.s, z11.s, z1.s[3]\n"
+ "fmla z26.s, z11.s, z2.s[3]\n"
+ "fmla z27.s, z11.s, z3.s[3]\n"
+ "fmla z28.s, z11.s, z4.s[3]\n"
+ "fmla z29.s, z11.s, z5.s[3]\n"
+ "76:" // Height 6: Multiply loop: multiply skip
+ "prfm pldl1keep, [x27, #0x80]\n"
+ "add x9, x9, #0x1\n"
"prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
"prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x16, x19\n"
- "bne 77b\n"
- "tbz %x[flags], #1, 83f\n"
+ "cmp x9, x19\n"
+ "bne 71b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x26, x10, x19, LSL #2\n"
+ "add x25, x26, x19, LSL #2\n"
+ "add x24, x25, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "tbz %x[flags], #1, 77f\n"
"add x19, %x[args_ptr], %[offset_min]\n"
"ld1rw { z17.s }, p2/Z, [x19]\n"
"add x19, %x[args_ptr], %[offset_max]\n"
@@ -1108,77 +1058,55 @@ void sve_hybrid_fp32_mla_8x1VL (
"fmax z28.s, p2/M, z28.s, z17.s\n"
"fmin z29.s, p2/M, z29.s, z16.s\n"
"fmax z29.s, p2/M, z29.s, z17.s\n"
- "83:" // Height 6: No activation
- "st1w { z24.s }, p1, [x17]\n"
- "addvl x17, x17, #1\n"
- "st1w { z25.s }, p1, [x13]\n"
- "addvl x13, x13, #1\n"
- "st1w { z26.s }, p1, [x11]\n"
- "addvl x11, x11, #1\n"
- "st1w { z27.s }, p1, [x9]\n"
- "addvl x9, x9, #1\n"
- "st1w { z28.s }, p1, [x27]\n"
- "addvl x27, x27, #1\n"
- "st1w { z29.s }, p1, [x25]\n"
- "addvl x25, x25, #1\n"
- "84:" // Height 6: Writeback done
- "decw x6\n"
- "cmp x6, XZR\n"
- "bgt 73b\n"
- "b 114f\n"
- "85:" // Height 7
- "ldr x6, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x8, %x[bias]\n"
- "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "tbz %x[flags], #2, 86f\n"
- "ldr x17, [%x[output_ptr], #0x0]\n"
- "add x17, x17, x19, LSL #2\n"
- "ldr x13, [%x[output_ptr], #0x8]\n"
- "ldr x11, [%x[output_ptr], #0x10]\n"
- "add x13, x13, x19, LSL #2\n"
- "ldr x9, [%x[output_ptr], #0x18]\n"
- "ldr x27, [%x[output_ptr], #0x20]\n"
- "add x11, x11, x19, LSL #2\n"
- "ldr x25, [%x[output_ptr], #0x28]\n"
- "add x9, x9, x19, LSL #2\n"
- "ldr x23, [%x[output_ptr], #0x30]\n"
- "add x27, x27, x19, LSL #2\n"
- "add x25, x25, x19, LSL #2\n"
- "add x23, x23, x19, LSL #2\n"
- "b 87f\n"
- "86:" // Height 7: setup direct output
- "mov x17, %x[output_ptr]\n"
- "add x13, x17, x19, LSL #2\n"
- "add x11, x13, x19, LSL #2\n"
- "add x9, x11, x19, LSL #2\n"
- "add x27, x9, x19, LSL #2\n"
- "add x25, x27, x19, LSL #2\n"
- "add x23, x25, x19, LSL #2\n"
- "87:" // Height 7: Column loop
+ "77:" // Height 6: No activation
+ "st1w { z24.s }, p1, [x10]\n"
+ "addvl x10, x10, #1\n"
+ "st1w { z25.s }, p1, [x26]\n"
+ "st1w { z26.s }, p1, [x25]\n"
+ "st1w { z27.s }, p1, [x24]\n"
+ "st1w { z28.s }, p1, [x23]\n"
+ "st1w { z29.s }, p1, [x22]\n"
+ "78:" // Height 6: Writeback done
+ "decw x13\n"
+ "cmp x13, XZR\n"
+ "bgt 67b\n"
+ "b 106f\n"
+ "79:" // Height 7
+ "ldr x13, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x11, %x[bias]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x10, %x[output_ptr]\n"
+ "80:" // Height 7: Column loop
"mov x19, #0x0\n"
- "whilelt p1.s, x19, x6\n"
- "cbz x8, 88f\n"
- "ld1w { z24.s }, p2/Z, [x8]\n"
+ "whilelt p1.s, x19, x13\n"
+ "cbz x11, 81f\n"
+ "ld1w { z24.s }, p2/Z, [x11]\n"
"mov z25.d, z24.d\n"
- "addvl x8, x8, #1\n"
+ "addvl x11, x11, #1\n"
"mov z26.d, z24.d\n"
"mov z27.d, z24.d\n"
"mov z28.d, z24.d\n"
"mov z29.d, z24.d\n"
"mov z30.d, z24.d\n"
- "b 90f\n"
- "88:" // Height 7: no bias
- "tbz %x[flags], #0, 89f\n"
- "ld1w { z24.s }, p1/Z, [x17]\n"
- "ld1w { z25.s }, p1/Z, [x13]\n"
- "ld1w { z26.s }, p1/Z, [x11]\n"
- "ld1w { z27.s }, p1/Z, [x9]\n"
- "ld1w { z28.s }, p1/Z, [x27]\n"
- "ld1w { z29.s }, p1/Z, [x25]\n"
- "ld1w { z30.s }, p1/Z, [x23]\n"
- "b 90f\n"
- "89:" // Height 7: no accumulate
+ "b 83f\n"
+ "81:" // Height 7: no bias
+ "tbz %x[flags], #0, 82f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ld1w { z24.s }, p1/Z, [x10]\n"
+ "add x26, x10, x19, LSL #2\n"
+ "ld1w { z25.s }, p1/Z, [x26]\n"
+ "add x25, x26, x19, LSL #2\n"
+ "ld1w { z26.s }, p1/Z, [x25]\n"
+ "add x24, x25, x19, LSL #2\n"
+ "ld1w { z27.s }, p1/Z, [x24]\n"
+ "add x23, x24, x19, LSL #2\n"
+ "ld1w { z28.s }, p1/Z, [x23]\n"
+ "add x22, x23, x19, LSL #2\n"
+ "ld1w { z29.s }, p1/Z, [x22]\n"
+ "add x21, x22, x19, LSL #2\n"
+ "ld1w { z30.s }, p1/Z, [x21]\n"
+ "b 83f\n"
+ "82:" // Height 7: no accumulate
"mov z24.b, #0x0\n"
"mov z25.b, #0x0\n"
"mov z26.b, #0x0\n"
@@ -1186,88 +1114,88 @@ void sve_hybrid_fp32_mla_8x1VL (
"mov z28.b, #0x0\n"
"mov z29.b, #0x0\n"
"mov z30.b, #0x0\n"
- "90:" // Height 7: setup done
- "mov x16, #0x0\n"
- "91:" // Height 7: String loop
+ "83:" // Height 7: setup done
+ "mov x9, #0x0\n"
+ "84:" // Height 7: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w15, [x20, x16, LSL #0x2]\n"
- "tbz %x[flags], #3, 92f\n"
- "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n"
+ "ldr w28, [x20, x9, LSL #0x2]\n"
+ "tbz %x[flags], #3, 85f\n"
+ "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x14, [x20, #0x0]\n"
- "ldr x12, [x20, #0x8]\n"
- "ldr x10, [x20, #0x10]\n"
- "ldr x28, [x20, #0x18]\n"
- "ldr x26, [x20, #0x20]\n"
- "ldr x24, [x20, #0x28]\n"
- "ldr x22, [x20, #0x30]\n"
- "cbnz x16, 93f\n"
+ "ldr x27, [x20, #0x0]\n"
+ "ldr x26, [x20, #0x8]\n"
+ "ldr x25, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "ldr x23, [x20, #0x20]\n"
+ "ldr x22, [x20, #0x28]\n"
+ "ldr x21, [x20, #0x30]\n"
+ "cbnz x9, 86f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x14, x14, x19, LSL #2\n"
- "add x12, x12, x19, LSL #2\n"
- "add x10, x10, x19, LSL #2\n"
- "add x28, x28, x19, LSL #2\n"
+ "add x27, x27, x19, LSL #2\n"
"add x26, x26, x19, LSL #2\n"
+ "add x25, x25, x19, LSL #2\n"
"add x24, x24, x19, LSL #2\n"
+ "add x23, x23, x19, LSL #2\n"
"add x22, x22, x19, LSL #2\n"
- "b 93f\n"
- "92:" // Height 7: setup direct input
- "mov x14, %x[input_ptr]\n"
- "add x12, x14, x19, LSL #2\n"
- "add x10, x12, x19, LSL #2\n"
- "add x28, x10, x19, LSL #2\n"
- "add x26, x28, x19, LSL #2\n"
- "add x24, x26, x19, LSL #2\n"
- "add x22, x24, x19, LSL #2\n"
- "93:" // Height 7: input setup done
- "cmp x15, #0x4\n"
- "ble 95f\n"
- "94:" // Height 7: Multiply loop: Main loop head
- "ld1w { z8.s }, p2/Z, [x7]\n"
- "whilelt p0.s, XZR, x15\n"
- "ld1w { z9.s }, p2/Z, [x7, #1, MUL VL]\n"
- "sub x15, x15, #0x4\n"
- "ld1rqw { z0.s }, p0/Z, [x14]\n"
+ "add x21, x21, x19, LSL #2\n"
+ "b 86f\n"
+ "85:" // Height 7: setup direct input
+ "mov x27, %x[input_ptr]\n"
+ "add x26, x27, x19, LSL #2\n"
+ "add x25, x26, x19, LSL #2\n"
+ "add x24, x25, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "add x21, x22, x19, LSL #2\n"
+ "86:" // Height 7: input setup done
+ "cmp x28, #0x4\n"
+ "ble 88f\n"
+ "87:" // Height 7: Multiply loop: Main loop head
+ "ld1w { z8.s }, p2/Z, [x12]\n"
+ "whilelt p0.s, XZR, x28\n"
+ "ld1w { z9.s }, p2/Z, [x12, #1, MUL VL]\n"
+ "sub x28, x28, #0x4\n"
+ "ld1rqw { z0.s }, p0/Z, [x27]\n"
"fmla z24.s, z8.s, z0.s[0]\n"
- "ld1rqw { z1.s }, p0/Z, [x12]\n"
- "add x14, x14, #0x10\n"
+ "ld1rqw { z1.s }, p0/Z, [x26]\n"
+ "add x27, x27, #0x10\n"
"fmla z25.s, z8.s, z1.s[0]\n"
- "ld1rqw { z2.s }, p0/Z, [x10]\n"
- "add x12, x12, #0x10\n"
+ "ld1rqw { z2.s }, p0/Z, [x25]\n"
+ "add x26, x26, #0x10\n"
"fmla z24.s, z9.s, z0.s[1]\n"
- "ld1rqw { z3.s }, p0/Z, [x28]\n"
- "add x10, x10, #0x10\n"
+ "ld1rqw { z3.s }, p0/Z, [x24]\n"
+ "add x25, x25, #0x10\n"
"fmla z26.s, z8.s, z2.s[0]\n"
- "ld1rqw { z4.s }, p0/Z, [x26]\n"
- "add x28, x28, #0x10\n"
+ "ld1rqw { z4.s }, p0/Z, [x23]\n"
+ "add x24, x24, #0x10\n"
"fmla z27.s, z8.s, z3.s[0]\n"
- "ld1rqw { z5.s }, p0/Z, [x24]\n"
- "add x26, x26, #0x10\n"
+ "ld1rqw { z5.s }, p0/Z, [x22]\n"
+ "add x23, x23, #0x10\n"
"fmla z25.s, z9.s, z1.s[1]\n"
- "ld1rqw { z6.s }, p0/Z, [x22]\n"
- "add x24, x24, #0x10\n"
- "fmla z28.s, z8.s, z4.s[0]\n"
- "ld1w { z10.s }, p2/Z, [x7, #2, MUL VL]\n"
+ "ld1rqw { z6.s }, p0/Z, [x21]\n"
"add x22, x22, #0x10\n"
+ "fmla z28.s, z8.s, z4.s[0]\n"
+ "ld1w { z10.s }, p2/Z, [x12, #2, MUL VL]\n"
+ "add x21, x21, #0x10\n"
"fmla z29.s, z8.s, z5.s[0]\n"
- "ld1w { z11.s }, p2/Z, [x7, #3, MUL VL]\n"
- "cmp x15, #0x4\n"
+ "ld1w { z11.s }, p2/Z, [x12, #3, MUL VL]\n"
+ "cmp x28, #0x4\n"
"fmla z30.s, z8.s, z6.s[0]\n"
- "prfm pldl1keep, [x14, #0x80]\n"
- "addvl x7, x7, #4\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
+ "addvl x12, x12, #4\n"
"fmla z26.s, z9.s, z2.s[1]\n"
- "prfm pldl1keep, [x12, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
"fmla z27.s, z9.s, z3.s[1]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
"fmla z28.s, z9.s, z4.s[1]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
"fmla z29.s, z9.s, z5.s[1]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
"fmla z30.s, z9.s, z6.s[1]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "fmla z24.s, z10.s, z0.s[2]\n"
"prfm pldl1keep, [x22, #0x80]\n"
+ "fmla z24.s, z10.s, z0.s[2]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
"fmla z25.s, z10.s, z1.s[2]\n"
"fmla z26.s, z10.s, z2.s[2]\n"
"fmla z27.s, z10.s, z3.s[2]\n"
@@ -1281,78 +1209,85 @@ void sve_hybrid_fp32_mla_8x1VL (
"fmla z28.s, z11.s, z4.s[3]\n"
"fmla z29.s, z11.s, z5.s[3]\n"
"fmla z30.s, z11.s, z6.s[3]\n"
- "bgt 94b\n"
- "95:" // Height 7: Multiply loop: Single iteration only
- "ld1w { z12.s }, p2/Z, [x7]\n"
- "whilelt p0.s, XZR, x15\n"
- "subs x15, x15, #0x1\n"
- "ld1rqw { z0.s }, p0/Z, [x14]\n"
- "fmla z24.s, z12.s, z0.s[0]\n"
- "ld1rqw { z1.s }, p0/Z, [x12]\n"
- "add x14, x14, #0x10\n"
- "fmla z25.s, z12.s, z1.s[0]\n"
- "ld1rqw { z2.s }, p0/Z, [x10]\n"
- "add x12, x12, #0x10\n"
- "fmla z26.s, z12.s, z2.s[0]\n"
- "ld1rqw { z3.s }, p0/Z, [x28]\n"
- "add x10, x10, #0x10\n"
- "fmla z27.s, z12.s, z3.s[0]\n"
- "ld1rqw { z4.s }, p0/Z, [x26]\n"
- "add x28, x28, #0x10\n"
- "fmla z28.s, z12.s, z4.s[0]\n"
- "ld1rqw { z5.s }, p0/Z, [x24]\n"
+ "bgt 87b\n"
+ "88:" // Height 7: Multiply loop: Single iteration only
+ "ld1w { z8.s }, p2/Z, [x12]\n"
+ "whilelt p0.s, XZR, x28\n"
+ "subs x28, x28, #0x1\n"
+ "ld1rqw { z0.s }, p0/Z, [x27]\n"
+ "fmla z24.s, z8.s, z0.s[0]\n"
+ "ld1rqw { z1.s }, p0/Z, [x26]\n"
+ "add x27, x27, #0x10\n"
+ "fmla z25.s, z8.s, z1.s[0]\n"
+ "ld1rqw { z2.s }, p0/Z, [x25]\n"
"add x26, x26, #0x10\n"
- "fmla z29.s, z12.s, z5.s[0]\n"
- "ld1rqw { z6.s }, p0/Z, [x22]\n"
+ "fmla z26.s, z8.s, z2.s[0]\n"
+ "ld1rqw { z3.s }, p0/Z, [x24]\n"
+ "add x25, x25, #0x10\n"
+ "fmla z27.s, z8.s, z3.s[0]\n"
+ "ld1rqw { z4.s }, p0/Z, [x23]\n"
"add x24, x24, #0x10\n"
- "fmla z30.s, z12.s, z6.s[0]\n"
+ "fmla z28.s, z8.s, z4.s[0]\n"
+ "ld1rqw { z5.s }, p0/Z, [x22]\n"
+ "add x23, x23, #0x10\n"
+ "fmla z29.s, z8.s, z5.s[0]\n"
+ "ld1rqw { z6.s }, p0/Z, [x21]\n"
"add x22, x22, #0x10\n"
- "addvl x7, x7, #1\n"
- "ble 96f\n"
- "ld1w { z13.s }, p2/Z, [x7]\n"
- "fmla z24.s, z13.s, z0.s[1]\n"
- "subs x15, x15, #0x1\n"
- "fmla z25.s, z13.s, z1.s[1]\n"
- "addvl x7, x7, #1\n"
- "fmla z26.s, z13.s, z2.s[1]\n"
- "fmla z27.s, z13.s, z3.s[1]\n"
- "fmla z28.s, z13.s, z4.s[1]\n"
- "fmla z29.s, z13.s, z5.s[1]\n"
- "fmla z30.s, z13.s, z6.s[1]\n"
- "ble 96f\n"
- "ld1w { z14.s }, p2/Z, [x7]\n"
- "fmla z24.s, z14.s, z0.s[2]\n"
- "subs x15, x15, #0x1\n"
- "fmla z25.s, z14.s, z1.s[2]\n"
- "addvl x7, x7, #1\n"
- "fmla z26.s, z14.s, z2.s[2]\n"
- "fmla z27.s, z14.s, z3.s[2]\n"
- "fmla z28.s, z14.s, z4.s[2]\n"
- "fmla z29.s, z14.s, z5.s[2]\n"
- "fmla z30.s, z14.s, z6.s[2]\n"
- "ble 96f\n"
- "ld1w { z15.s }, p2/Z, [x7]\n"
- "fmla z24.s, z15.s, z0.s[3]\n"
- "addvl x7, x7, #1\n"
- "fmla z25.s, z15.s, z1.s[3]\n"
- "fmla z26.s, z15.s, z2.s[3]\n"
- "fmla z27.s, z15.s, z3.s[3]\n"
- "fmla z28.s, z15.s, z4.s[3]\n"
- "fmla z29.s, z15.s, z5.s[3]\n"
- "fmla z30.s, z15.s, z6.s[3]\n"
- "96:" // Height 7: Multiply loop: multiply skip
- "prfm pldl1keep, [x14, #0x80]\n"
- "add x16, x16, #0x1\n"
- "prfm pldl1keep, [x12, #0x80]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
+ "fmla z30.s, z8.s, z6.s[0]\n"
+ "add x21, x21, #0x10\n"
+ "addvl x12, x12, #1\n"
+ "ble 89f\n"
+ "ld1w { z9.s }, p2/Z, [x12]\n"
+ "fmla z24.s, z9.s, z0.s[1]\n"
+ "subs x28, x28, #0x1\n"
+ "fmla z25.s, z9.s, z1.s[1]\n"
+ "addvl x12, x12, #1\n"
+ "fmla z26.s, z9.s, z2.s[1]\n"
+ "fmla z27.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z9.s, z4.s[1]\n"
+ "fmla z29.s, z9.s, z5.s[1]\n"
+ "fmla z30.s, z9.s, z6.s[1]\n"
+ "ble 89f\n"
+ "ld1w { z10.s }, p2/Z, [x12]\n"
+ "fmla z24.s, z10.s, z0.s[2]\n"
+ "subs x28, x28, #0x1\n"
+ "fmla z25.s, z10.s, z1.s[2]\n"
+ "addvl x12, x12, #1\n"
+ "fmla z26.s, z10.s, z2.s[2]\n"
+ "fmla z27.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z10.s, z4.s[2]\n"
+ "fmla z29.s, z10.s, z5.s[2]\n"
+ "fmla z30.s, z10.s, z6.s[2]\n"
+ "ble 89f\n"
+ "ld1w { z11.s }, p2/Z, [x12]\n"
+ "fmla z24.s, z11.s, z0.s[3]\n"
+ "addvl x12, x12, #1\n"
+ "fmla z25.s, z11.s, z1.s[3]\n"
+ "fmla z26.s, z11.s, z2.s[3]\n"
+ "fmla z27.s, z11.s, z3.s[3]\n"
+ "fmla z28.s, z11.s, z4.s[3]\n"
+ "fmla z29.s, z11.s, z5.s[3]\n"
+ "fmla z30.s, z11.s, z6.s[3]\n"
+ "89:" // Height 7: Multiply loop: multiply skip
+ "prfm pldl1keep, [x27, #0x80]\n"
+ "add x9, x9, #0x1\n"
"prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
"prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
"prfm pldl1keep, [x22, #0x80]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x16, x19\n"
- "bne 91b\n"
- "tbz %x[flags], #1, 97f\n"
+ "cmp x9, x19\n"
+ "bne 84b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x26, x10, x19, LSL #2\n"
+ "add x25, x26, x19, LSL #2\n"
+ "add x24, x25, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "add x21, x22, x19, LSL #2\n"
+ "tbz %x[flags], #1, 90f\n"
"add x19, %x[args_ptr], %[offset_min]\n"
"ld1rw { z17.s }, p2/Z, [x19]\n"
"add x19, %x[args_ptr], %[offset_max]\n"
@@ -1371,86 +1306,62 @@ void sve_hybrid_fp32_mla_8x1VL (
"fmin z30.s, p2/M, z30.s, z16.s\n"
"fmax z29.s, p2/M, z29.s, z17.s\n"
"fmax z30.s, p2/M, z30.s, z17.s\n"
- "97:" // Height 7: No activation
- "st1w { z24.s }, p1, [x17]\n"
- "addvl x17, x17, #1\n"
- "st1w { z25.s }, p1, [x13]\n"
- "addvl x13, x13, #1\n"
- "st1w { z26.s }, p1, [x11]\n"
- "addvl x11, x11, #1\n"
- "st1w { z27.s }, p1, [x9]\n"
- "addvl x9, x9, #1\n"
- "st1w { z28.s }, p1, [x27]\n"
- "addvl x27, x27, #1\n"
- "st1w { z29.s }, p1, [x25]\n"
- "addvl x25, x25, #1\n"
- "st1w { z30.s }, p1, [x23]\n"
- "addvl x23, x23, #1\n"
- "98:" // Height 7: Writeback done
- "decw x6\n"
- "cmp x6, XZR\n"
- "bgt 87b\n"
- "b 114f\n"
- "99:" // Height 8
- "ldr x6, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x8, %x[bias]\n"
- "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "tbz %x[flags], #2, 100f\n"
- "ldr x17, [%x[output_ptr], #0x0]\n"
- "add x17, x17, x19, LSL #2\n"
- "ldr x13, [%x[output_ptr], #0x8]\n"
- "ldr x11, [%x[output_ptr], #0x10]\n"
- "add x13, x13, x19, LSL #2\n"
- "ldr x9, [%x[output_ptr], #0x18]\n"
- "ldr x27, [%x[output_ptr], #0x20]\n"
- "add x11, x11, x19, LSL #2\n"
- "ldr x25, [%x[output_ptr], #0x28]\n"
- "add x9, x9, x19, LSL #2\n"
- "ldr x23, [%x[output_ptr], #0x30]\n"
- "ldr x21, [%x[output_ptr], #0x38]\n"
- "add x27, x27, x19, LSL #2\n"
- "add x25, x25, x19, LSL #2\n"
- "add %x[output_ptr], %x[output_ptr], #0x40\n"
- "add x23, x23, x19, LSL #2\n"
- "add x21, x21, x19, LSL #2\n"
- "b 101f\n"
- "100:" // Height 8: setup direct output
- "mov x17, %x[output_ptr]\n"
- "add x13, x17, x19, LSL #2\n"
- "add x11, x13, x19, LSL #2\n"
- "add x9, x11, x19, LSL #2\n"
- "add x27, x9, x19, LSL #2\n"
- "add x25, x27, x19, LSL #2\n"
- "add x23, x25, x19, LSL #2\n"
- "add x21, x23, x19, LSL #2\n"
- "add %x[output_ptr], x21, x19, LSL #2\n"
- "101:" // Height 8: Column loop
+ "90:" // Height 7: No activation
+ "st1w { z24.s }, p1, [x10]\n"
+ "addvl x10, x10, #1\n"
+ "st1w { z25.s }, p1, [x26]\n"
+ "st1w { z26.s }, p1, [x25]\n"
+ "st1w { z27.s }, p1, [x24]\n"
+ "st1w { z28.s }, p1, [x23]\n"
+ "st1w { z29.s }, p1, [x22]\n"
+ "st1w { z30.s }, p1, [x21]\n"
+ "91:" // Height 7: Writeback done
+ "decw x13\n"
+ "cmp x13, XZR\n"
+ "bgt 80b\n"
+ "b 106f\n"
+ "92:" // Height 8
+ "ldr x13, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x11, %x[bias]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x10, %x[output_ptr]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov x19, #0x20\n"
+ "madd %x[output_ptr], x20, x19, %x[output_ptr]\n"
+ "93:" // Height 8: Column loop
"mov x19, #0x0\n"
- "whilelt p1.s, x19, x6\n"
- "cbz x8, 102f\n"
- "ld1w { z24.s }, p2/Z, [x8]\n"
+ "whilelt p1.s, x19, x13\n"
+ "cbz x11, 94f\n"
+ "ld1w { z24.s }, p2/Z, [x11]\n"
"mov z25.d, z24.d\n"
- "addvl x8, x8, #1\n"
+ "addvl x11, x11, #1\n"
"mov z26.d, z24.d\n"
"mov z27.d, z24.d\n"
"mov z28.d, z24.d\n"
"mov z29.d, z24.d\n"
"mov z30.d, z24.d\n"
"mov z31.d, z24.d\n"
- "b 104f\n"
- "102:" // Height 8: no bias
- "tbz %x[flags], #0, 103f\n"
- "ld1w { z24.s }, p1/Z, [x17]\n"
- "ld1w { z25.s }, p1/Z, [x13]\n"
- "ld1w { z26.s }, p1/Z, [x11]\n"
- "ld1w { z27.s }, p1/Z, [x9]\n"
- "ld1w { z28.s }, p1/Z, [x27]\n"
- "ld1w { z29.s }, p1/Z, [x25]\n"
- "ld1w { z30.s }, p1/Z, [x23]\n"
- "ld1w { z31.s }, p1/Z, [x21]\n"
- "b 104f\n"
- "103:" // Height 8: no accumulate
+ "b 96f\n"
+ "94:" // Height 8: no bias
+ "tbz %x[flags], #0, 95f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ld1w { z24.s }, p1/Z, [x10]\n"
+ "add x26, x10, x19, LSL #2\n"
+ "ld1w { z25.s }, p1/Z, [x26]\n"
+ "add x25, x26, x19, LSL #2\n"
+ "ld1w { z26.s }, p1/Z, [x25]\n"
+ "add x24, x25, x19, LSL #2\n"
+ "ld1w { z27.s }, p1/Z, [x24]\n"
+ "add x23, x24, x19, LSL #2\n"
+ "ld1w { z28.s }, p1/Z, [x23]\n"
+ "add x22, x23, x19, LSL #2\n"
+ "ld1w { z29.s }, p1/Z, [x22]\n"
+ "add x21, x22, x19, LSL #2\n"
+ "ld1w { z30.s }, p1/Z, [x21]\n"
+ "add x20, x21, x19, LSL #2\n"
+ "ld1w { z31.s }, p1/Z, [x20]\n"
+ "b 96f\n"
+ "95:" // Height 8: no accumulate
"mov z24.b, #0x0\n"
"mov z25.b, #0x0\n"
"mov z26.b, #0x0\n"
@@ -1459,94 +1370,94 @@ void sve_hybrid_fp32_mla_8x1VL (
"mov z29.b, #0x0\n"
"mov z30.b, #0x0\n"
"mov z31.b, #0x0\n"
- "104:" // Height 8: setup done
- "mov x16, #0x0\n"
- "105:" // Height 8: String loop
+ "96:" // Height 8: setup done
+ "mov x9, #0x0\n"
+ "97:" // Height 8: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w15, [x20, x16, LSL #0x2]\n"
- "tbz %x[flags], #3, 106f\n"
- "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n"
+ "ldr w28, [x20, x9, LSL #0x2]\n"
+ "tbz %x[flags], #3, 98f\n"
+ "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x14, [x20, #0x0]\n"
- "ldr x12, [x20, #0x8]\n"
- "ldr x10, [x20, #0x10]\n"
- "ldr x28, [x20, #0x18]\n"
- "ldr x26, [x20, #0x20]\n"
- "ldr x24, [x20, #0x28]\n"
- "ldr x22, [x20, #0x30]\n"
+ "ldr x27, [x20, #0x0]\n"
+ "ldr x26, [x20, #0x8]\n"
+ "ldr x25, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "ldr x23, [x20, #0x20]\n"
+ "ldr x22, [x20, #0x28]\n"
+ "ldr x21, [x20, #0x30]\n"
"ldr x20, [x20, #0x38]\n"
- "cbnz x16, 107f\n"
+ "cbnz x9, 99f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x14, x14, x19, LSL #2\n"
- "add x12, x12, x19, LSL #2\n"
- "add x10, x10, x19, LSL #2\n"
- "add x28, x28, x19, LSL #2\n"
+ "add x27, x27, x19, LSL #2\n"
"add x26, x26, x19, LSL #2\n"
+ "add x25, x25, x19, LSL #2\n"
"add x24, x24, x19, LSL #2\n"
+ "add x23, x23, x19, LSL #2\n"
"add x22, x22, x19, LSL #2\n"
+ "add x21, x21, x19, LSL #2\n"
"add x20, x20, x19, LSL #2\n"
- "b 107f\n"
- "106:" // Height 8: setup direct input
- "mov x14, %x[input_ptr]\n"
- "add x12, x14, x19, LSL #2\n"
- "add x10, x12, x19, LSL #2\n"
- "add x28, x10, x19, LSL #2\n"
- "add x26, x28, x19, LSL #2\n"
- "add x24, x26, x19, LSL #2\n"
- "add x22, x24, x19, LSL #2\n"
- "add x20, x22, x19, LSL #2\n"
- "107:" // Height 8: input setup done
- "cmp x15, #0x4\n"
- "ble 109f\n"
- "108:" // Height 8: Multiply loop: Main loop head
- "ld1w { z8.s }, p2/Z, [x7]\n"
- "whilelt p0.s, XZR, x15\n"
- "ld1w { z9.s }, p2/Z, [x7, #1, MUL VL]\n"
- "sub x15, x15, #0x4\n"
- "ld1rqw { z0.s }, p0/Z, [x14]\n"
+ "b 99f\n"
+ "98:" // Height 8: setup direct input
+ "mov x27, %x[input_ptr]\n"
+ "add x26, x27, x19, LSL #2\n"
+ "add x25, x26, x19, LSL #2\n"
+ "add x24, x25, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "add x21, x22, x19, LSL #2\n"
+ "add x20, x21, x19, LSL #2\n"
+ "99:" // Height 8: input setup done
+ "cmp x28, #0x4\n"
+ "ble 101f\n"
+ "100:" // Height 8: Multiply loop: Main loop head
+ "ld1w { z8.s }, p2/Z, [x12]\n"
+ "whilelt p0.s, XZR, x28\n"
+ "ld1w { z9.s }, p2/Z, [x12, #1, MUL VL]\n"
+ "sub x28, x28, #0x4\n"
+ "ld1rqw { z0.s }, p0/Z, [x27]\n"
"fmla z24.s, z8.s, z0.s[0]\n"
- "ld1rqw { z1.s }, p0/Z, [x12]\n"
- "add x14, x14, #0x10\n"
+ "ld1rqw { z1.s }, p0/Z, [x26]\n"
+ "add x27, x27, #0x10\n"
"fmla z25.s, z8.s, z1.s[0]\n"
- "ld1rqw { z2.s }, p0/Z, [x10]\n"
- "add x12, x12, #0x10\n"
+ "ld1rqw { z2.s }, p0/Z, [x25]\n"
+ "add x26, x26, #0x10\n"
"fmla z24.s, z9.s, z0.s[1]\n"
- "ld1rqw { z3.s }, p0/Z, [x28]\n"
- "add x10, x10, #0x10\n"
+ "ld1rqw { z3.s }, p0/Z, [x24]\n"
+ "add x25, x25, #0x10\n"
"fmla z26.s, z8.s, z2.s[0]\n"
- "ld1rqw { z4.s }, p0/Z, [x26]\n"
- "add x28, x28, #0x10\n"
+ "ld1rqw { z4.s }, p0/Z, [x23]\n"
+ "add x24, x24, #0x10\n"
"fmla z27.s, z8.s, z3.s[0]\n"
- "ld1rqw { z5.s }, p0/Z, [x24]\n"
- "add x26, x26, #0x10\n"
+ "ld1rqw { z5.s }, p0/Z, [x22]\n"
+ "add x23, x23, #0x10\n"
"fmla z25.s, z9.s, z1.s[1]\n"
- "ld1rqw { z6.s }, p0/Z, [x22]\n"
- "add x24, x24, #0x10\n"
+ "ld1rqw { z6.s }, p0/Z, [x21]\n"
+ "add x22, x22, #0x10\n"
"fmla z28.s, z8.s, z4.s[0]\n"
"ld1rqw { z7.s }, p0/Z, [x20]\n"
- "add x22, x22, #0x10\n"
+ "add x21, x21, #0x10\n"
"fmla z29.s, z8.s, z5.s[0]\n"
- "ld1w { z10.s }, p2/Z, [x7, #2, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x12, #2, MUL VL]\n"
"add x20, x20, #0x10\n"
"fmla z30.s, z8.s, z6.s[0]\n"
- "ld1w { z11.s }, p2/Z, [x7, #3, MUL VL]\n"
- "cmp x15, #0x4\n"
+ "ld1w { z11.s }, p2/Z, [x12, #3, MUL VL]\n"
+ "cmp x28, #0x4\n"
"fmla z31.s, z8.s, z7.s[0]\n"
- "prfm pldl1keep, [x14, #0x80]\n"
- "addvl x7, x7, #4\n"
+ "prfm pldl1keep, [x27, #0x80]\n"
+ "addvl x12, x12, #4\n"
"fmla z26.s, z9.s, z2.s[1]\n"
- "prfm pldl1keep, [x12, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
"fmla z27.s, z9.s, z3.s[1]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
"fmla z28.s, z9.s, z4.s[1]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
"fmla z29.s, z9.s, z5.s[1]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
"fmla z30.s, z9.s, z6.s[1]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "fmla z31.s, z9.s, z7.s[1]\n"
"prfm pldl1keep, [x22, #0x80]\n"
+ "fmla z31.s, z9.s, z7.s[1]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
"fmla z24.s, z10.s, z0.s[2]\n"
"prfm pldl1keep, [x20, #0x80]\n"
"fmla z25.s, z10.s, z1.s[2]\n"
@@ -1564,85 +1475,93 @@ void sve_hybrid_fp32_mla_8x1VL (
"fmla z29.s, z11.s, z5.s[3]\n"
"fmla z30.s, z11.s, z6.s[3]\n"
"fmla z31.s, z11.s, z7.s[3]\n"
- "bgt 108b\n"
- "109:" // Height 8: Multiply loop: Single iteration only
- "ld1w { z12.s }, p2/Z, [x7]\n"
- "whilelt p0.s, XZR, x15\n"
- "subs x15, x15, #0x1\n"
- "ld1rqw { z0.s }, p0/Z, [x14]\n"
- "fmla z24.s, z12.s, z0.s[0]\n"
- "ld1rqw { z1.s }, p0/Z, [x12]\n"
- "add x14, x14, #0x10\n"
- "fmla z25.s, z12.s, z1.s[0]\n"
- "ld1rqw { z2.s }, p0/Z, [x10]\n"
- "add x12, x12, #0x10\n"
- "fmla z26.s, z12.s, z2.s[0]\n"
- "ld1rqw { z3.s }, p0/Z, [x28]\n"
- "add x10, x10, #0x10\n"
- "fmla z27.s, z12.s, z3.s[0]\n"
- "ld1rqw { z4.s }, p0/Z, [x26]\n"
- "add x28, x28, #0x10\n"
- "fmla z28.s, z12.s, z4.s[0]\n"
- "ld1rqw { z5.s }, p0/Z, [x24]\n"
+ "bgt 100b\n"
+ "101:" // Height 8: Multiply loop: Single iteration only
+ "ld1w { z8.s }, p2/Z, [x12]\n"
+ "whilelt p0.s, XZR, x28\n"
+ "subs x28, x28, #0x1\n"
+ "ld1rqw { z0.s }, p0/Z, [x27]\n"
+ "fmla z24.s, z8.s, z0.s[0]\n"
+ "ld1rqw { z1.s }, p0/Z, [x26]\n"
+ "add x27, x27, #0x10\n"
+ "fmla z25.s, z8.s, z1.s[0]\n"
+ "ld1rqw { z2.s }, p0/Z, [x25]\n"
"add x26, x26, #0x10\n"
- "fmla z29.s, z12.s, z5.s[0]\n"
- "ld1rqw { z6.s }, p0/Z, [x22]\n"
+ "fmla z26.s, z8.s, z2.s[0]\n"
+ "ld1rqw { z3.s }, p0/Z, [x24]\n"
+ "add x25, x25, #0x10\n"
+ "fmla z27.s, z8.s, z3.s[0]\n"
+ "ld1rqw { z4.s }, p0/Z, [x23]\n"
"add x24, x24, #0x10\n"
- "fmla z30.s, z12.s, z6.s[0]\n"
- "ld1rqw { z7.s }, p0/Z, [x20]\n"
+ "fmla z28.s, z8.s, z4.s[0]\n"
+ "ld1rqw { z5.s }, p0/Z, [x22]\n"
+ "add x23, x23, #0x10\n"
+ "fmla z29.s, z8.s, z5.s[0]\n"
+ "ld1rqw { z6.s }, p0/Z, [x21]\n"
"add x22, x22, #0x10\n"
- "fmla z31.s, z12.s, z7.s[0]\n"
+ "fmla z30.s, z8.s, z6.s[0]\n"
+ "ld1rqw { z7.s }, p0/Z, [x20]\n"
+ "add x21, x21, #0x10\n"
+ "fmla z31.s, z8.s, z7.s[0]\n"
"add x20, x20, #0x10\n"
- "addvl x7, x7, #1\n"
- "ble 110f\n"
- "ld1w { z13.s }, p2/Z, [x7]\n"
- "fmla z24.s, z13.s, z0.s[1]\n"
- "subs x15, x15, #0x1\n"
- "fmla z25.s, z13.s, z1.s[1]\n"
- "addvl x7, x7, #1\n"
- "fmla z26.s, z13.s, z2.s[1]\n"
- "fmla z27.s, z13.s, z3.s[1]\n"
- "fmla z28.s, z13.s, z4.s[1]\n"
- "fmla z29.s, z13.s, z5.s[1]\n"
- "fmla z30.s, z13.s, z6.s[1]\n"
- "fmla z31.s, z13.s, z7.s[1]\n"
- "ble 110f\n"
- "ld1w { z14.s }, p2/Z, [x7]\n"
- "fmla z24.s, z14.s, z0.s[2]\n"
- "subs x15, x15, #0x1\n"
- "fmla z25.s, z14.s, z1.s[2]\n"
- "addvl x7, x7, #1\n"
- "fmla z26.s, z14.s, z2.s[2]\n"
- "fmla z27.s, z14.s, z3.s[2]\n"
- "fmla z28.s, z14.s, z4.s[2]\n"
- "fmla z29.s, z14.s, z5.s[2]\n"
- "fmla z30.s, z14.s, z6.s[2]\n"
- "fmla z31.s, z14.s, z7.s[2]\n"
- "ble 110f\n"
- "ld1w { z15.s }, p2/Z, [x7]\n"
- "fmla z24.s, z15.s, z0.s[3]\n"
- "addvl x7, x7, #1\n"
- "fmla z25.s, z15.s, z1.s[3]\n"
- "fmla z26.s, z15.s, z2.s[3]\n"
- "fmla z27.s, z15.s, z3.s[3]\n"
- "fmla z28.s, z15.s, z4.s[3]\n"
- "fmla z29.s, z15.s, z5.s[3]\n"
- "fmla z30.s, z15.s, z6.s[3]\n"
- "fmla z31.s, z15.s, z7.s[3]\n"
- "110:" // Height 8: Multiply loop: multiply skip
- "prfm pldl1keep, [x14, #0x80]\n"
- "add x16, x16, #0x1\n"
- "prfm pldl1keep, [x12, #0x80]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
+ "addvl x12, x12, #1\n"
+ "ble 102f\n"
+ "ld1w { z9.s }, p2/Z, [x12]\n"
+ "fmla z24.s, z9.s, z0.s[1]\n"
+ "subs x28, x28, #0x1\n"
+ "fmla z25.s, z9.s, z1.s[1]\n"
+ "addvl x12, x12, #1\n"
+ "fmla z26.s, z9.s, z2.s[1]\n"
+ "fmla z27.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z9.s, z4.s[1]\n"
+ "fmla z29.s, z9.s, z5.s[1]\n"
+ "fmla z30.s, z9.s, z6.s[1]\n"
+ "fmla z31.s, z9.s, z7.s[1]\n"
+ "ble 102f\n"
+ "ld1w { z10.s }, p2/Z, [x12]\n"
+ "fmla z24.s, z10.s, z0.s[2]\n"
+ "subs x28, x28, #0x1\n"
+ "fmla z25.s, z10.s, z1.s[2]\n"
+ "addvl x12, x12, #1\n"
+ "fmla z26.s, z10.s, z2.s[2]\n"
+ "fmla z27.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z10.s, z4.s[2]\n"
+ "fmla z29.s, z10.s, z5.s[2]\n"
+ "fmla z30.s, z10.s, z6.s[2]\n"
+ "fmla z31.s, z10.s, z7.s[2]\n"
+ "ble 102f\n"
+ "ld1w { z11.s }, p2/Z, [x12]\n"
+ "fmla z24.s, z11.s, z0.s[3]\n"
+ "addvl x12, x12, #1\n"
+ "fmla z25.s, z11.s, z1.s[3]\n"
+ "fmla z26.s, z11.s, z2.s[3]\n"
+ "fmla z27.s, z11.s, z3.s[3]\n"
+ "fmla z28.s, z11.s, z4.s[3]\n"
+ "fmla z29.s, z11.s, z5.s[3]\n"
+ "fmla z30.s, z11.s, z6.s[3]\n"
+ "fmla z31.s, z11.s, z7.s[3]\n"
+ "102:" // Height 8: Multiply loop: multiply skip
+ "prfm pldl1keep, [x27, #0x80]\n"
+ "add x9, x9, #0x1\n"
"prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
"prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
"prfm pldl1keep, [x22, #0x80]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
"prfm pldl1keep, [x20, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x16, x19\n"
- "bne 105b\n"
- "tbz %x[flags], #1, 111f\n"
+ "cmp x9, x19\n"
+ "bne 97b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x26, x10, x19, LSL #2\n"
+ "add x25, x26, x19, LSL #2\n"
+ "add x24, x25, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "add x21, x22, x19, LSL #2\n"
+ "add x20, x21, x19, LSL #2\n"
+ "tbz %x[flags], #1, 103f\n"
"add x19, %x[args_ptr], %[offset_min]\n"
"ld1rw { z17.s }, p2/Z, [x19]\n"
"add x19, %x[args_ptr], %[offset_max]\n"
@@ -1663,43 +1582,36 @@ void sve_hybrid_fp32_mla_8x1VL (
"fmax z29.s, p2/M, z29.s, z17.s\n"
"fmax z30.s, p2/M, z30.s, z17.s\n"
"fmax z31.s, p2/M, z31.s, z17.s\n"
- "111:" // Height 8: No activation
- "st1w { z24.s }, p1, [x17]\n"
- "addvl x17, x17, #1\n"
- "st1w { z25.s }, p1, [x13]\n"
- "addvl x13, x13, #1\n"
- "st1w { z26.s }, p1, [x11]\n"
- "addvl x11, x11, #1\n"
- "st1w { z27.s }, p1, [x9]\n"
- "addvl x9, x9, #1\n"
- "st1w { z28.s }, p1, [x27]\n"
- "addvl x27, x27, #1\n"
- "st1w { z29.s }, p1, [x25]\n"
- "addvl x25, x25, #1\n"
- "st1w { z30.s }, p1, [x23]\n"
- "addvl x23, x23, #1\n"
- "st1w { z31.s }, p1, [x21]\n"
- "addvl x21, x21, #1\n"
- "112:" // Height 8: Writeback done
- "decw x6\n"
- "cmp x6, XZR\n"
- "bgt 101b\n"
+ "103:" // Height 8: No activation
+ "st1w { z24.s }, p1, [x10]\n"
+ "addvl x10, x10, #1\n"
+ "st1w { z25.s }, p1, [x26]\n"
+ "st1w { z26.s }, p1, [x25]\n"
+ "st1w { z27.s }, p1, [x24]\n"
+ "st1w { z28.s }, p1, [x23]\n"
+ "st1w { z29.s }, p1, [x22]\n"
+ "st1w { z30.s }, p1, [x21]\n"
+ "st1w { z31.s }, p1, [x20]\n"
+ "104:" // Height 8: Writeback done
+ "decw x13\n"
+ "cmp x13, XZR\n"
+ "bgt 93b\n"
"subs %x[M], %x[M], #0x8\n"
- "beq 114f\n"
+ "beq 106f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "tbz %x[flags], #3, 113f\n"
+ "tbz %x[flags], #3, 105f\n"
"add x20, x20, #0x8\n"
"str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
"b 1b\n"
- "113:" // Update direct input
+ "105:" // Update direct input
"mov x19, #0x20\n"
"madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
"b 1b\n"
- "114:" // Exit
+ "106:" // Exit
: [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
- : "cc", "memory", "p0", "p1", "p2", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z16", "z17", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL.hpp
index b4d3f0283d..bc93ced25b 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL.hpp
@@ -36,7 +36,6 @@
namespace arm_gemm
{
-
// Actual kernel implementations
void sve_hybrid_s8qa_dot_4x4VL( ARGLIST );
@@ -73,7 +72,6 @@ public:
// Default to the generic kernel
kern_type kernel=sve_hybrid_s8qa_dot_4x4VL;
-
cls_sve_hybrid_s8qa_dot_4x4VL(const CPUInfo *)
{
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL/generic.cpp
index 3c778bfe94..50b9ba524d 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL/generic.cpp
@@ -81,152 +81,143 @@ void sve_hybrid_s8qa_dot_4x4VL (
"ptrue p2.b\n"
"1:" // Row loop
"cmp %x[M], #0x4\n"
- "bge 46f\n"
+ "bge 43f\n"
"cmp %x[M], #0x2\n"
- "bgt 31f\n"
- "beq 16f\n"
+ "bgt 29f\n"
+ "beq 15f\n"
"mov z11.s, #0x0\n"
- "ldr x12, [%x[args_ptr], %[offsetof_N]]\n"
- "mov z12.s, #0x0\n"
- "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x10, %x[col_bias]\n"
- "mov z13.s, #0x0\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "bic %x[flags], %x[flags], #0x80000000\n"
- "mov z14.s, #0x0\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
"mov z15.b, #0x1\n"
- "tbz %x[flags], #2, 2f\n"
- "ldr x9, [%x[output_ptr], #0x0]\n"
- "add x9, x9, x19\n"
- "b 3f\n"
- "2:" // Height 1: setup direct output
- "mov x9, %x[output_ptr]\n"
- "3:" // Height 1: Column loop
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x27, %x[col_bias]\n"
+ "bic %x[flags], %x[flags], #0x80000000\n"
+ "mov x26, %x[output_ptr]\n"
+ "2:" // Height 1: Column loop
"mov z16.s, #0x0\n"
"mov x19, #0x0\n"
"mov z17.s, #0x0\n"
- "whilelt p1.b, x19, x12\n"
+ "whilelt p1.b, x19, x9\n"
"mov z18.s, #0x0\n"
"mov z19.s, #0x0\n"
- "4:" // Height 1: setup done
- "mov x28, #0x0\n"
- "5:" // Height 1: String loop
+ "3:" // Height 1: setup done
+ "mov x25, #0x0\n"
+ "4:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w27, [x20, x28, LSL #0x2]\n"
- "tbz %x[flags], #3, 6f\n"
- "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "ldr w24, [x20, x25, LSL #0x2]\n"
+ "tbz %x[flags], #3, 5f\n"
+ "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x26, [x20, #0x0]\n"
- "cbnz x28, 7f\n"
+ "ldr x23, [x20, #0x0]\n"
+ "cbnz x25, 6f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x26, x26, x19\n"
- "b 7f\n"
- "6:" // Height 1: setup direct input
- "mov x26, %x[input_ptr]\n"
- "7:" // Height 1: input setup done
- "cmp x27, #0x10\n"
- "ble 10f\n"
- "8:" // Height 1: Multiply loop: Main loop head
- "ld1b { z4.b }, p2/Z, [x11]\n"
- "whilelt p0.b, XZR, x27\n"
- "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n"
- "ld1rqb { z0.b }, p0/Z, [x26]\n"
+ "add x23, x23, x19\n"
+ "b 6f\n"
+ "5:" // Height 1: setup direct input
+ "mov x23, %x[input_ptr]\n"
+ "6:" // Height 1: input setup done
+ "cmp x24, #0x10\n"
+ "ble 9f\n"
+ "7:" // Height 1: Multiply loop: Main loop head
+ "ld1b { z4.b }, p2/Z, [x28]\n"
+ "whilelt p0.b, XZR, x24\n"
+ "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1rqb { z0.b }, p0/Z, [x23]\n"
"sdot z16.s, z4.b, z0.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n"
- "add x26, x26, #0x10\n"
+ "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "add x23, x23, #0x10\n"
"sdot z17.s, z5.b, z0.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n"
- "ld1b { z8.b }, p2/Z, [x11, #4, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "ld1b { z8.b }, p2/Z, [x28, #4, MUL VL]\n"
"sdot z18.s, z6.b, z0.b[0]\n"
- "ld1b { z9.b }, p2/Z, [x11, #5, MUL VL]\n"
- "ld1b { z10.b }, p2/Z, [x11, #6, MUL VL]\n"
+ "ld1b { z9.b }, p2/Z, [x28, #5, MUL VL]\n"
+ "ld1b { z10.b }, p2/Z, [x28, #6, MUL VL]\n"
"sdot z19.s, z7.b, z0.b[0]\n"
"sdot z16.s, z8.b, z0.b[1]\n"
- "ld1b { z4.b }, p2/Z, [x11, #7, MUL VL]\n"
- "addvl x11, x11, #16\n"
+ "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n"
+ "addvl x28, x28, #16\n"
"sdot z17.s, z9.b, z0.b[1]\n"
- "ld1b { z5.b }, p2/Z, [x11, #-8, MUL VL]\n"
- "ld1b { z6.b }, p2/Z, [x11, #-7, MUL VL]\n"
+ "ld1b { z5.b }, p2/Z, [x28, #-8, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #-7, MUL VL]\n"
"sdot z18.s, z10.b, z0.b[1]\n"
- "ld1b { z7.b }, p2/Z, [x11, #-6, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #-6, MUL VL]\n"
"sdot z19.s, z4.b, z0.b[1]\n"
- "ld1b { z8.b }, p2/Z, [x11, #-5, MUL VL]\n"
+ "ld1b { z8.b }, p2/Z, [x28, #-5, MUL VL]\n"
"sdot z16.s, z5.b, z0.b[2]\n"
- "ld1b { z9.b }, p2/Z, [x11, #-4, MUL VL]\n"
+ "ld1b { z9.b }, p2/Z, [x28, #-4, MUL VL]\n"
"sdot z17.s, z6.b, z0.b[2]\n"
- "ld1b { z10.b }, p2/Z, [x11, #-3, MUL VL]\n"
+ "ld1b { z10.b }, p2/Z, [x28, #-3, MUL VL]\n"
"sdot z18.s, z7.b, z0.b[2]\n"
- "ld1b { z4.b }, p2/Z, [x11, #-2, MUL VL]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #-2, MUL VL]\n"
"sdot z19.s, z8.b, z0.b[2]\n"
- "ld1b { z5.b }, p2/Z, [x11, #-1, MUL VL]\n"
+ "ld1b { z5.b }, p2/Z, [x28, #-1, MUL VL]\n"
"sdot z16.s, z9.b, z0.b[3]\n"
"sdot z17.s, z10.b, z0.b[3]\n"
"sdot z18.s, z4.b, z0.b[3]\n"
"sdot z19.s, z5.b, z0.b[3]\n"
- "tbnz %x[flags], #31, 9f\n"
+ "tbnz %x[flags], #31, 8f\n"
"sdot z11.s, z0.b, z15.b\n"
- "9:" // Height 1: Multiply loop: unique 1: skip row sum
- "prfm pldl1keep, [x26, #0x80]\n"
- "sub x27, x27, #0x10\n"
- "cmp x27, #0x10\n"
- "bgt 8b\n"
- "10:" // Height 1: Multiply loop: Single iteration only
- "ld1b { z6.b }, p2/Z, [x11]\n"
- "whilelt p0.b, XZR, x27\n"
- "ld1b { z7.b }, p2/Z, [x11, #1, MUL VL]\n"
- "subs x27, x27, #0x4\n"
- "ld1rqb { z0.b }, p0/Z, [x26]\n"
- "sdot z16.s, z6.b, z0.b[0]\n"
- "ld1b { z8.b }, p2/Z, [x11, #2, MUL VL]\n"
- "add x26, x26, #0x10\n"
- "sdot z17.s, z7.b, z0.b[0]\n"
- "ld1b { z9.b }, p2/Z, [x11, #3, MUL VL]\n"
- "addvl x11, x11, #4\n"
- "sdot z18.s, z8.b, z0.b[0]\n"
- "sdot z19.s, z9.b, z0.b[0]\n"
- "ble 11f\n"
- "ld1b { z10.b }, p2/Z, [x11]\n"
- "sdot z16.s, z10.b, z0.b[1]\n"
- "ld1b { z4.b }, p2/Z, [x11, #1, MUL VL]\n"
- "subs x27, x27, #0x4\n"
- "sdot z17.s, z4.b, z0.b[1]\n"
- "ld1b { z5.b }, p2/Z, [x11, #2, MUL VL]\n"
- "ld1b { z6.b }, p2/Z, [x11, #3, MUL VL]\n"
- "sdot z18.s, z5.b, z0.b[1]\n"
- "addvl x11, x11, #4\n"
- "sdot z19.s, z6.b, z0.b[1]\n"
- "ble 11f\n"
- "ld1b { z7.b }, p2/Z, [x11]\n"
- "sdot z16.s, z7.b, z0.b[2]\n"
- "ld1b { z8.b }, p2/Z, [x11, #1, MUL VL]\n"
- "subs x27, x27, #0x4\n"
- "sdot z17.s, z8.b, z0.b[2]\n"
- "ld1b { z9.b }, p2/Z, [x11, #2, MUL VL]\n"
- "ld1b { z10.b }, p2/Z, [x11, #3, MUL VL]\n"
- "sdot z18.s, z9.b, z0.b[2]\n"
- "addvl x11, x11, #4\n"
- "sdot z19.s, z10.b, z0.b[2]\n"
- "ble 11f\n"
- "ld1b { z4.b }, p2/Z, [x11]\n"
- "sdot z16.s, z4.b, z0.b[3]\n"
- "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n"
- "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n"
- "sdot z17.s, z5.b, z0.b[3]\n"
- "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n"
- "addvl x11, x11, #4\n"
- "sdot z18.s, z6.b, z0.b[3]\n"
- "sdot z19.s, z7.b, z0.b[3]\n"
- "11:" // Height 1: Multiply loop: multiply skip
- "tbnz %x[flags], #31, 12f\n"
+ "8:" // Height 1: Multiply loop: unique 1: skip row sum
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "sub x24, x24, #0x10\n"
+ "cmp x24, #0x10\n"
+ "bgt 7b\n"
+ "9:" // Height 1: Multiply loop: Single iteration only
+ "ld1b { z4.b }, p2/Z, [x28]\n"
+ "whilelt p0.b, XZR, x24\n"
+ "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "subs x24, x24, #0x4\n"
+ "ld1rqb { z0.b }, p0/Z, [x23]\n"
+ "sdot z16.s, z4.b, z0.b[0]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "add x23, x23, #0x10\n"
+ "sdot z17.s, z5.b, z0.b[0]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "sdot z18.s, z6.b, z0.b[0]\n"
+ "sdot z19.s, z7.b, z0.b[0]\n"
+ "ble 10f\n"
+ "ld1b { z8.b }, p2/Z, [x28]\n"
+ "sdot z16.s, z8.b, z0.b[1]\n"
+ "ld1b { z9.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "subs x24, x24, #0x4\n"
+ "sdot z17.s, z9.b, z0.b[1]\n"
+ "ld1b { z10.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "sdot z18.s, z10.b, z0.b[1]\n"
+ "addvl x28, x28, #4\n"
+ "sdot z19.s, z4.b, z0.b[1]\n"
+ "ble 10f\n"
+ "ld1b { z5.b }, p2/Z, [x28]\n"
+ "sdot z16.s, z5.b, z0.b[2]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "subs x24, x24, #0x4\n"
+ "sdot z17.s, z6.b, z0.b[2]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "sdot z18.s, z7.b, z0.b[2]\n"
+ "addvl x28, x28, #4\n"
+ "sdot z19.s, z8.b, z0.b[2]\n"
+ "ble 10f\n"
+ "ld1b { z9.b }, p2/Z, [x28]\n"
+ "sdot z16.s, z9.b, z0.b[3]\n"
+ "ld1b { z10.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "sdot z17.s, z10.b, z0.b[3]\n"
+ "ld1b { z5.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "sdot z18.s, z4.b, z0.b[3]\n"
+ "sdot z19.s, z5.b, z0.b[3]\n"
+ "10:" // Height 1: Multiply loop: multiply skip
+ "tbnz %x[flags], #31, 11f\n"
"sdot z11.s, z0.b, z15.b\n"
- "12:" // Height 1: Multiply loop: unique 2: skip row sum
- "prfm pldl1keep, [x26, #0x80]\n"
- "add x28, x28, #0x1\n"
+ "11:" // Height 1: Multiply loop: unique 2: skip row sum
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "add x25, x25, #0x1\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x28, x19\n"
- "bne 5b\n"
- "tbnz %x[flags], #31, 13f\n"
+ "cmp x25, x19\n"
+ "bne 4b\n"
+ "tbnz %x[flags], #31, 12f\n"
"add x19, %x[qp], %[b_offset]\n"
"ld1rw { z1.s }, p2/Z, [x19]\n"
"neg z1.s, p2/M, z1.s\n"
@@ -235,21 +226,21 @@ void sve_hybrid_s8qa_dot_4x4VL (
"saddv d11, p0, z11.s\n"
"mov z11.s, z11.s[0]\n"
"mul z11.s, p2/M, z11.s, z1.s\n"
- "13:" // Height 1: skip row sum fixup
+ "12:" // Height 1: skip row sum fixup
"add z16.s, z16.s, z11.s\n"
- "ld1w { z0.s }, p2/Z, [x10]\n"
+ "ld1w { z0.s }, p2/Z, [x27]\n"
"orr %x[flags], %x[flags], #0x80000000\n"
"add z17.s, z17.s, z11.s\n"
- "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n"
- "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1w { z1.s }, p2/Z, [x27, #1, MUL VL]\n"
+ "add x23, %x[qp], %[per_layer_right_shift]\n"
"add z18.s, z18.s, z11.s\n"
- "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z2.s }, p2/Z, [x27, #2, MUL VL]\n"
"add x19, %x[qp], %[per_layer_mul]\n"
"add z19.s, z19.s, z11.s\n"
- "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n"
- "addvl x10, x10, #4\n"
+ "ld1w { z3.s }, p2/Z, [x27, #3, MUL VL]\n"
+ "addvl x27, x27, #4\n"
"add z16.s, z16.s, z0.s\n"
- "ld1rw { z0.s }, p2/Z, [x20]\n"
+ "ld1rw { z0.s }, p2/Z, [x23]\n"
"add z17.s, z17.s, z1.s\n"
"ld1rw { z4.s }, p2/Z, [x19]\n"
"add z18.s, z18.s, z2.s\n"
@@ -258,7 +249,7 @@ void sve_hybrid_s8qa_dot_4x4VL (
".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n"
".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n"
".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n"
- "tbz %x[flags], #5, 14f\n"
+ "tbz %x[flags], #5, 13f\n"
"and z4.d, z16.d, z0.d\n"
"asr z4.s, z4.s, #0x1f\n"
"and z5.d, z17.d, z0.d\n"
@@ -271,7 +262,7 @@ void sve_hybrid_s8qa_dot_4x4VL (
"sqadd z17.s, z17.s, z5.s\n"
"sqadd z18.s, z18.s, z6.s\n"
"sqadd z19.s, z19.s, z7.s\n"
- "14:" // Height 1: no shift correction
+ "13:" // Height 1: no shift correction
".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
"add x19, %x[qp], %[c_offset]\n"
"ld1rw { z4.s }, p2/Z, [x19]\n"
@@ -297,107 +288,96 @@ void sve_hybrid_s8qa_dot_4x4VL (
"uzp1 z16.h, z16.h, z17.h\n"
"uzp1 z17.h, z18.h, z19.h\n"
"uzp1 z16.b, z16.b, z17.b\n"
- "st1b { z16.b }, p1, [x9]\n"
- "addvl x9, x9, #1\n"
- "15:" // Height 1: Writeback done
- "decw x12, ALL, MUL #4\n"
- "cmp x12, XZR\n"
- "bgt 3b\n"
- "b 62f\n"
- "16:" // Height 2
+ "st1b { z16.b }, p1, [x26]\n"
+ "addvl x26, x26, #1\n"
+ "14:" // Height 1: Writeback done
+ "decw x9, ALL, MUL #4\n"
+ "cmp x9, XZR\n"
+ "bgt 2b\n"
+ "b 58f\n"
+ "15:" // Height 2
"mov z11.s, #0x0\n"
- "ldr x12, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x10, %x[col_bias]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x27, %x[col_bias]\n"
"mov z12.s, #0x0\n"
- "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"bic %x[flags], %x[flags], #0x80000000\n"
- "mov z13.s, #0x0\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "mov z14.s, #0x0\n"
"mov z15.b, #0x1\n"
- "tbz %x[flags], #2, 17f\n"
- "ldr x9, [%x[output_ptr], #0x0]\n"
- "ldr x25, [%x[output_ptr], #0x8]\n"
- "add x9, x9, x19\n"
- "add x25, x25, x19\n"
- "b 18f\n"
- "17:" // Height 2: setup direct output
- "mov x9, %x[output_ptr]\n"
- "add x25, x9, x19\n"
- "18:" // Height 2: Column loop
+ "mov x26, %x[output_ptr]\n"
+ "16:" // Height 2: Column loop
"mov z16.s, #0x0\n"
"mov x19, #0x0\n"
"mov z17.s, #0x0\n"
- "whilelt p1.b, x19, x12\n"
+ "whilelt p1.b, x19, x9\n"
"mov z18.s, #0x0\n"
"mov z19.s, #0x0\n"
"mov z20.s, #0x0\n"
"mov z21.s, #0x0\n"
"mov z22.s, #0x0\n"
"mov z23.s, #0x0\n"
- "19:" // Height 2: setup done
- "mov x28, #0x0\n"
- "20:" // Height 2: String loop
+ "17:" // Height 2: setup done
+ "mov x25, #0x0\n"
+ "18:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w27, [x20, x28, LSL #0x2]\n"
- "tbz %x[flags], #3, 21f\n"
- "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "ldr w24, [x20, x25, LSL #0x2]\n"
+ "tbz %x[flags], #3, 19f\n"
+ "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x26, [x20, #0x0]\n"
- "ldr x24, [x20, #0x8]\n"
- "cbnz x28, 22f\n"
+ "ldr x23, [x20, #0x0]\n"
+ "ldr x22, [x20, #0x8]\n"
+ "cbnz x25, 20f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x26, x26, x19\n"
- "add x24, x24, x19\n"
- "b 22f\n"
- "21:" // Height 2: setup direct input
- "mov x26, %x[input_ptr]\n"
- "add x24, x26, x19\n"
- "22:" // Height 2: input setup done
- "cmp x27, #0x10\n"
- "ble 25f\n"
- "23:" // Height 2: Multiply loop: Main loop head
- "ld1b { z4.b }, p2/Z, [x11]\n"
- "whilelt p0.b, XZR, x27\n"
- "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n"
- "ld1rqb { z0.b }, p0/Z, [x26]\n"
+ "add x23, x23, x19\n"
+ "add x22, x22, x19\n"
+ "b 20f\n"
+ "19:" // Height 2: setup direct input
+ "mov x23, %x[input_ptr]\n"
+ "add x22, x23, x19\n"
+ "20:" // Height 2: input setup done
+ "cmp x24, #0x10\n"
+ "ble 23f\n"
+ "21:" // Height 2: Multiply loop: Main loop head
+ "ld1b { z4.b }, p2/Z, [x28]\n"
+ "whilelt p0.b, XZR, x24\n"
+ "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1rqb { z0.b }, p0/Z, [x23]\n"
"sdot z16.s, z4.b, z0.b[0]\n"
- "ld1rqb { z1.b }, p0/Z, [x24]\n"
- "add x26, x26, #0x10\n"
+ "ld1rqb { z1.b }, p0/Z, [x22]\n"
+ "add x23, x23, #0x10\n"
"sdot z17.s, z5.b, z0.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n"
- "add x24, x24, #0x10\n"
+ "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "add x22, x22, #0x10\n"
"sdot z20.s, z4.b, z1.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
"sdot z21.s, z5.b, z1.b[0]\n"
- "ld1b { z8.b }, p2/Z, [x11, #4, MUL VL]\n"
+ "ld1b { z8.b }, p2/Z, [x28, #4, MUL VL]\n"
"sdot z18.s, z6.b, z0.b[0]\n"
- "ld1b { z9.b }, p2/Z, [x11, #5, MUL VL]\n"
+ "ld1b { z9.b }, p2/Z, [x28, #5, MUL VL]\n"
"sdot z22.s, z6.b, z1.b[0]\n"
- "ld1b { z10.b }, p2/Z, [x11, #6, MUL VL]\n"
+ "ld1b { z10.b }, p2/Z, [x28, #6, MUL VL]\n"
"sdot z19.s, z7.b, z0.b[0]\n"
- "ld1b { z4.b }, p2/Z, [x11, #7, MUL VL]\n"
- "addvl x11, x11, #16\n"
+ "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n"
+ "addvl x28, x28, #16\n"
"sdot z23.s, z7.b, z1.b[0]\n"
- "ld1b { z5.b }, p2/Z, [x11, #-8, MUL VL]\n"
+ "ld1b { z5.b }, p2/Z, [x28, #-8, MUL VL]\n"
"sdot z16.s, z8.b, z0.b[1]\n"
- "ld1b { z6.b }, p2/Z, [x11, #-7, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #-7, MUL VL]\n"
"sdot z20.s, z8.b, z1.b[1]\n"
- "ld1b { z7.b }, p2/Z, [x11, #-6, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #-6, MUL VL]\n"
"sdot z17.s, z9.b, z0.b[1]\n"
- "ld1b { z8.b }, p2/Z, [x11, #-5, MUL VL]\n"
+ "ld1b { z8.b }, p2/Z, [x28, #-5, MUL VL]\n"
"sdot z21.s, z9.b, z1.b[1]\n"
- "ld1b { z9.b }, p2/Z, [x11, #-4, MUL VL]\n"
+ "ld1b { z9.b }, p2/Z, [x28, #-4, MUL VL]\n"
"sdot z18.s, z10.b, z0.b[1]\n"
"sdot z22.s, z10.b, z1.b[1]\n"
- "ld1b { z10.b }, p2/Z, [x11, #-3, MUL VL]\n"
+ "ld1b { z10.b }, p2/Z, [x28, #-3, MUL VL]\n"
"sdot z19.s, z4.b, z0.b[1]\n"
"sdot z23.s, z4.b, z1.b[1]\n"
- "ld1b { z4.b }, p2/Z, [x11, #-2, MUL VL]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #-2, MUL VL]\n"
"sdot z16.s, z5.b, z0.b[2]\n"
"sdot z20.s, z5.b, z1.b[2]\n"
- "ld1b { z5.b }, p2/Z, [x11, #-1, MUL VL]\n"
+ "ld1b { z5.b }, p2/Z, [x28, #-1, MUL VL]\n"
"sdot z17.s, z6.b, z0.b[2]\n"
"sdot z21.s, z6.b, z1.b[2]\n"
"sdot z18.s, z7.b, z0.b[2]\n"
@@ -412,91 +392,93 @@ void sve_hybrid_s8qa_dot_4x4VL (
"sdot z22.s, z4.b, z1.b[3]\n"
"sdot z19.s, z5.b, z0.b[3]\n"
"sdot z23.s, z5.b, z1.b[3]\n"
- "tbnz %x[flags], #31, 24f\n"
+ "tbnz %x[flags], #31, 22f\n"
"sdot z11.s, z0.b, z15.b\n"
"sdot z12.s, z1.b, z15.b\n"
- "24:" // Height 2: Multiply loop: unique 3: skip row sum
- "prfm pldl1keep, [x26, #0x80]\n"
- "sub x27, x27, #0x10\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "cmp x27, #0x10\n"
- "bgt 23b\n"
- "25:" // Height 2: Multiply loop: Single iteration only
- "ld1b { z6.b }, p2/Z, [x11]\n"
- "whilelt p0.b, XZR, x27\n"
- "ld1b { z7.b }, p2/Z, [x11, #1, MUL VL]\n"
- "subs x27, x27, #0x4\n"
- "ld1rqb { z0.b }, p0/Z, [x26]\n"
- "sdot z16.s, z6.b, z0.b[0]\n"
- "ld1rqb { z1.b }, p0/Z, [x24]\n"
- "add x26, x26, #0x10\n"
- "sdot z17.s, z7.b, z0.b[0]\n"
- "ld1b { z8.b }, p2/Z, [x11, #2, MUL VL]\n"
- "add x24, x24, #0x10\n"
- "sdot z20.s, z6.b, z1.b[0]\n"
- "ld1b { z9.b }, p2/Z, [x11, #3, MUL VL]\n"
- "addvl x11, x11, #4\n"
- "sdot z21.s, z7.b, z1.b[0]\n"
- "sdot z18.s, z8.b, z0.b[0]\n"
- "sdot z22.s, z8.b, z1.b[0]\n"
- "sdot z19.s, z9.b, z0.b[0]\n"
- "sdot z23.s, z9.b, z1.b[0]\n"
- "ble 26f\n"
- "ld1b { z10.b }, p2/Z, [x11]\n"
- "sdot z16.s, z10.b, z0.b[1]\n"
- "ld1b { z4.b }, p2/Z, [x11, #1, MUL VL]\n"
- "subs x27, x27, #0x4\n"
- "sdot z20.s, z10.b, z1.b[1]\n"
- "ld1b { z5.b }, p2/Z, [x11, #2, MUL VL]\n"
- "ld1b { z6.b }, p2/Z, [x11, #3, MUL VL]\n"
- "sdot z17.s, z4.b, z0.b[1]\n"
- "addvl x11, x11, #4\n"
- "sdot z21.s, z4.b, z1.b[1]\n"
- "sdot z18.s, z5.b, z0.b[1]\n"
- "sdot z22.s, z5.b, z1.b[1]\n"
- "sdot z19.s, z6.b, z0.b[1]\n"
- "sdot z23.s, z6.b, z1.b[1]\n"
- "ble 26f\n"
- "ld1b { z7.b }, p2/Z, [x11]\n"
- "sdot z16.s, z7.b, z0.b[2]\n"
- "ld1b { z8.b }, p2/Z, [x11, #1, MUL VL]\n"
- "subs x27, x27, #0x4\n"
- "sdot z20.s, z7.b, z1.b[2]\n"
- "ld1b { z9.b }, p2/Z, [x11, #2, MUL VL]\n"
- "ld1b { z10.b }, p2/Z, [x11, #3, MUL VL]\n"
- "sdot z17.s, z8.b, z0.b[2]\n"
- "addvl x11, x11, #4\n"
- "sdot z21.s, z8.b, z1.b[2]\n"
- "sdot z18.s, z9.b, z0.b[2]\n"
- "sdot z22.s, z9.b, z1.b[2]\n"
- "sdot z19.s, z10.b, z0.b[2]\n"
- "sdot z23.s, z10.b, z1.b[2]\n"
- "ble 26f\n"
- "ld1b { z4.b }, p2/Z, [x11]\n"
- "sdot z16.s, z4.b, z0.b[3]\n"
- "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n"
- "sdot z20.s, z4.b, z1.b[3]\n"
- "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n"
- "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n"
- "sdot z17.s, z5.b, z0.b[3]\n"
- "addvl x11, x11, #4\n"
- "sdot z21.s, z5.b, z1.b[3]\n"
- "sdot z18.s, z6.b, z0.b[3]\n"
- "sdot z22.s, z6.b, z1.b[3]\n"
- "sdot z19.s, z7.b, z0.b[3]\n"
- "sdot z23.s, z7.b, z1.b[3]\n"
- "26:" // Height 2: Multiply loop: multiply skip
- "tbnz %x[flags], #31, 27f\n"
+ "22:" // Height 2: Multiply loop: unique 3: skip row sum
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "sub x24, x24, #0x10\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "cmp x24, #0x10\n"
+ "bgt 21b\n"
+ "23:" // Height 2: Multiply loop: Single iteration only
+ "ld1b { z4.b }, p2/Z, [x28]\n"
+ "whilelt p0.b, XZR, x24\n"
+ "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "subs x24, x24, #0x4\n"
+ "ld1rqb { z0.b }, p0/Z, [x23]\n"
+ "sdot z16.s, z4.b, z0.b[0]\n"
+ "ld1rqb { z1.b }, p0/Z, [x22]\n"
+ "add x23, x23, #0x10\n"
+ "sdot z17.s, z5.b, z0.b[0]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "add x22, x22, #0x10\n"
+ "sdot z20.s, z4.b, z1.b[0]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "sdot z21.s, z5.b, z1.b[0]\n"
+ "sdot z18.s, z6.b, z0.b[0]\n"
+ "sdot z22.s, z6.b, z1.b[0]\n"
+ "sdot z19.s, z7.b, z0.b[0]\n"
+ "sdot z23.s, z7.b, z1.b[0]\n"
+ "ble 24f\n"
+ "ld1b { z8.b }, p2/Z, [x28]\n"
+ "sdot z16.s, z8.b, z0.b[1]\n"
+ "ld1b { z9.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "subs x24, x24, #0x4\n"
+ "sdot z20.s, z8.b, z1.b[1]\n"
+ "ld1b { z10.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "sdot z17.s, z9.b, z0.b[1]\n"
+ "addvl x28, x28, #4\n"
+ "sdot z21.s, z9.b, z1.b[1]\n"
+ "sdot z18.s, z10.b, z0.b[1]\n"
+ "sdot z22.s, z10.b, z1.b[1]\n"
+ "sdot z19.s, z4.b, z0.b[1]\n"
+ "sdot z23.s, z4.b, z1.b[1]\n"
+ "ble 24f\n"
+ "ld1b { z5.b }, p2/Z, [x28]\n"
+ "sdot z16.s, z5.b, z0.b[2]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "subs x24, x24, #0x4\n"
+ "sdot z20.s, z5.b, z1.b[2]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "sdot z17.s, z6.b, z0.b[2]\n"
+ "addvl x28, x28, #4\n"
+ "sdot z21.s, z6.b, z1.b[2]\n"
+ "sdot z18.s, z7.b, z0.b[2]\n"
+ "sdot z22.s, z7.b, z1.b[2]\n"
+ "sdot z19.s, z8.b, z0.b[2]\n"
+ "sdot z23.s, z8.b, z1.b[2]\n"
+ "ble 24f\n"
+ "ld1b { z9.b }, p2/Z, [x28]\n"
+ "sdot z16.s, z9.b, z0.b[3]\n"
+ "ld1b { z10.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "sdot z20.s, z9.b, z1.b[3]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z5.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "sdot z17.s, z10.b, z0.b[3]\n"
+ "addvl x28, x28, #4\n"
+ "sdot z21.s, z10.b, z1.b[3]\n"
+ "sdot z18.s, z4.b, z0.b[3]\n"
+ "sdot z22.s, z4.b, z1.b[3]\n"
+ "sdot z19.s, z5.b, z0.b[3]\n"
+ "sdot z23.s, z5.b, z1.b[3]\n"
+ "24:" // Height 2: Multiply loop: multiply skip
+ "tbnz %x[flags], #31, 25f\n"
"sdot z11.s, z0.b, z15.b\n"
"sdot z12.s, z1.b, z15.b\n"
- "27:" // Height 2: Multiply loop: unique 4: skip row sum
- "prfm pldl1keep, [x26, #0x80]\n"
- "add x28, x28, #0x1\n"
- "prfm pldl1keep, [x24, #0x80]\n"
+ "25:" // Height 2: Multiply loop: unique 4: skip row sum
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "add x25, x25, #0x1\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x28, x19\n"
- "bne 20b\n"
- "tbnz %x[flags], #31, 28f\n"
+ "cmp x25, x19\n"
+ "bne 18b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x22, x26, x19\n"
+ "tbnz %x[flags], #31, 26f\n"
"add x19, %x[qp], %[b_offset]\n"
"ld1rw { z2.s }, p2/Z, [x19]\n"
"neg z2.s, p2/M, z2.s\n"
@@ -508,19 +490,19 @@ void sve_hybrid_s8qa_dot_4x4VL (
"mov z12.s, z12.s[0]\n"
"mul z11.s, p2/M, z11.s, z2.s\n"
"mul z12.s, p2/M, z12.s, z2.s\n"
- "28:" // Height 2: skip row sum fixup
+ "26:" // Height 2: skip row sum fixup
"add z16.s, z16.s, z11.s\n"
- "ld1w { z0.s }, p2/Z, [x10]\n"
+ "ld1w { z0.s }, p2/Z, [x27]\n"
"orr %x[flags], %x[flags], #0x80000000\n"
"add z17.s, z17.s, z11.s\n"
- "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n"
- "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1w { z1.s }, p2/Z, [x27, #1, MUL VL]\n"
+ "add x23, %x[qp], %[per_layer_right_shift]\n"
"add z18.s, z18.s, z11.s\n"
- "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z2.s }, p2/Z, [x27, #2, MUL VL]\n"
"add x19, %x[qp], %[per_layer_mul]\n"
"add z19.s, z19.s, z11.s\n"
- "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n"
- "addvl x10, x10, #4\n"
+ "ld1w { z3.s }, p2/Z, [x27, #3, MUL VL]\n"
+ "addvl x27, x27, #4\n"
"add z20.s, z20.s, z12.s\n"
"ld1rw { z4.s }, p2/Z, [x19]\n"
"add z21.s, z21.s, z12.s\n"
@@ -531,7 +513,7 @@ void sve_hybrid_s8qa_dot_4x4VL (
"add z18.s, z18.s, z2.s\n"
"add z19.s, z19.s, z3.s\n"
"add z20.s, z20.s, z0.s\n"
- "ld1rw { z0.s }, p2/Z, [x20]\n"
+ "ld1rw { z0.s }, p2/Z, [x23]\n"
"add z21.s, z21.s, z1.s\n"
"add z22.s, z22.s, z2.s\n"
"add z23.s, z23.s, z3.s\n"
@@ -543,7 +525,7 @@ void sve_hybrid_s8qa_dot_4x4VL (
".inst 0x04a476b5 // sqrdmulh z21.s, z21.s, z4.s\n"
".inst 0x04a476d6 // sqrdmulh z22.s, z22.s, z4.s\n"
".inst 0x04a476f7 // sqrdmulh z23.s, z23.s, z4.s\n"
- "tbz %x[flags], #5, 29f\n"
+ "tbz %x[flags], #5, 27f\n"
"and z4.d, z16.d, z0.d\n"
"asr z4.s, z4.s, #0x1f\n"
"and z5.d, z17.d, z0.d\n"
@@ -568,7 +550,7 @@ void sve_hybrid_s8qa_dot_4x4VL (
"sqadd z21.s, z21.s, z9.s\n"
"sqadd z22.s, z22.s, z10.s\n"
"sqadd z23.s, z23.s, z4.s\n"
- "29:" // Height 2: no shift correction
+ "27:" // Height 2: no shift correction
".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
"add x19, %x[qp], %[c_offset]\n"
"ld1rw { z4.s }, p2/Z, [x19]\n"
@@ -599,9 +581,9 @@ void sve_hybrid_s8qa_dot_4x4VL (
"uzp1 z17.h, z18.h, z19.h\n"
"smax z20.s, p2/M, z20.s, z5.s\n"
"uzp1 z16.b, z16.b, z17.b\n"
- "st1b { z16.b }, p1, [x9]\n"
+ "st1b { z16.b }, p1, [x26]\n"
"add z21.s, z21.s, z4.s\n"
- "addvl x9, x9, #1\n"
+ "addvl x26, x26, #1\n"
".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n"
".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n"
"smin z21.s, p2/M, z21.s, z6.s\n"
@@ -615,41 +597,27 @@ void sve_hybrid_s8qa_dot_4x4VL (
"smax z23.s, p2/M, z23.s, z5.s\n"
"uzp1 z21.h, z22.h, z23.h\n"
"uzp1 z20.b, z20.b, z21.b\n"
- "st1b { z20.b }, p1, [x25]\n"
- "addvl x25, x25, #1\n"
- "30:" // Height 2: Writeback done
- "decw x12, ALL, MUL #4\n"
- "cmp x12, XZR\n"
- "bgt 18b\n"
- "b 62f\n"
- "31:" // Height 3
+ "st1b { z20.b }, p1, [x22]\n"
+ "28:" // Height 2: Writeback done
+ "decw x9, ALL, MUL #4\n"
+ "cmp x9, XZR\n"
+ "bgt 16b\n"
+ "b 58f\n"
+ "29:" // Height 3
"mov z11.s, #0x0\n"
- "ldr x12, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x10, %x[col_bias]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x27, %x[col_bias]\n"
"mov z12.s, #0x0\n"
- "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"bic %x[flags], %x[flags], #0x80000000\n"
"mov z13.s, #0x0\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "mov z14.s, #0x0\n"
+ "mov x26, %x[output_ptr]\n"
"mov z15.b, #0x1\n"
- "tbz %x[flags], #2, 32f\n"
- "ldr x9, [%x[output_ptr], #0x0]\n"
- "ldr x25, [%x[output_ptr], #0x8]\n"
- "add x9, x9, x19\n"
- "ldr x23, [%x[output_ptr], #0x10]\n"
- "add x25, x25, x19\n"
- "add x23, x23, x19\n"
- "b 33f\n"
- "32:" // Height 3: setup direct output
- "mov x9, %x[output_ptr]\n"
- "add x25, x9, x19\n"
- "add x23, x25, x19\n"
- "33:" // Height 3: Column loop
+ "30:" // Height 3: Column loop
"mov z16.s, #0x0\n"
"mov x19, #0x0\n"
"mov z17.s, #0x0\n"
- "whilelt p1.b, x19, x12\n"
+ "whilelt p1.b, x19, x9\n"
"mov z18.s, #0x0\n"
"mov z19.s, #0x0\n"
"mov z20.s, #0x0\n"
@@ -660,83 +628,83 @@ void sve_hybrid_s8qa_dot_4x4VL (
"mov z25.s, #0x0\n"
"mov z26.s, #0x0\n"
"mov z27.s, #0x0\n"
- "34:" // Height 3: setup done
- "mov x28, #0x0\n"
- "35:" // Height 3: String loop
+ "31:" // Height 3: setup done
+ "mov x25, #0x0\n"
+ "32:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w27, [x20, x28, LSL #0x2]\n"
- "tbz %x[flags], #3, 36f\n"
- "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "ldr w24, [x20, x25, LSL #0x2]\n"
+ "tbz %x[flags], #3, 33f\n"
+ "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x26, [x20, #0x0]\n"
- "ldr x24, [x20, #0x8]\n"
- "ldr x22, [x20, #0x10]\n"
- "cbnz x28, 37f\n"
+ "ldr x23, [x20, #0x0]\n"
+ "ldr x22, [x20, #0x8]\n"
+ "ldr x21, [x20, #0x10]\n"
+ "cbnz x25, 34f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x26, x26, x19\n"
- "add x24, x24, x19\n"
+ "add x23, x23, x19\n"
"add x22, x22, x19\n"
- "b 37f\n"
- "36:" // Height 3: setup direct input
- "mov x26, %x[input_ptr]\n"
- "add x24, x26, x19\n"
- "add x22, x24, x19\n"
- "37:" // Height 3: input setup done
- "cmp x27, #0x10\n"
- "ble 40f\n"
- "38:" // Height 3: Multiply loop: Main loop head
- "ld1b { z4.b }, p2/Z, [x11]\n"
- "whilelt p0.b, XZR, x27\n"
- "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n"
- "ld1rqb { z0.b }, p0/Z, [x26]\n"
+ "add x21, x21, x19\n"
+ "b 34f\n"
+ "33:" // Height 3: setup direct input
+ "mov x23, %x[input_ptr]\n"
+ "add x22, x23, x19\n"
+ "add x21, x22, x19\n"
+ "34:" // Height 3: input setup done
+ "cmp x24, #0x10\n"
+ "ble 37f\n"
+ "35:" // Height 3: Multiply loop: Main loop head
+ "ld1b { z4.b }, p2/Z, [x28]\n"
+ "whilelt p0.b, XZR, x24\n"
+ "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1rqb { z0.b }, p0/Z, [x23]\n"
"sdot z16.s, z4.b, z0.b[0]\n"
- "ld1rqb { z1.b }, p0/Z, [x24]\n"
- "add x26, x26, #0x10\n"
+ "ld1rqb { z1.b }, p0/Z, [x22]\n"
+ "add x23, x23, #0x10\n"
"sdot z17.s, z5.b, z0.b[0]\n"
- "ld1rqb { z2.b }, p0/Z, [x22]\n"
- "add x24, x24, #0x10\n"
- "sdot z20.s, z4.b, z1.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n"
+ "ld1rqb { z2.b }, p0/Z, [x21]\n"
"add x22, x22, #0x10\n"
+ "sdot z20.s, z4.b, z1.b[0]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "add x21, x21, #0x10\n"
"sdot z24.s, z4.b, z2.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
"sdot z21.s, z5.b, z1.b[0]\n"
- "ld1b { z8.b }, p2/Z, [x11, #4, MUL VL]\n"
+ "ld1b { z8.b }, p2/Z, [x28, #4, MUL VL]\n"
"sdot z25.s, z5.b, z2.b[0]\n"
- "ld1b { z9.b }, p2/Z, [x11, #5, MUL VL]\n"
+ "ld1b { z9.b }, p2/Z, [x28, #5, MUL VL]\n"
"sdot z18.s, z6.b, z0.b[0]\n"
- "ld1b { z10.b }, p2/Z, [x11, #6, MUL VL]\n"
+ "ld1b { z10.b }, p2/Z, [x28, #6, MUL VL]\n"
"sdot z22.s, z6.b, z1.b[0]\n"
- "ld1b { z4.b }, p2/Z, [x11, #7, MUL VL]\n"
- "addvl x11, x11, #16\n"
+ "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n"
+ "addvl x28, x28, #16\n"
"sdot z26.s, z6.b, z2.b[0]\n"
- "ld1b { z5.b }, p2/Z, [x11, #-8, MUL VL]\n"
+ "ld1b { z5.b }, p2/Z, [x28, #-8, MUL VL]\n"
"sdot z19.s, z7.b, z0.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x11, #-7, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #-7, MUL VL]\n"
"sdot z23.s, z7.b, z1.b[0]\n"
"sdot z27.s, z7.b, z2.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x11, #-6, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #-6, MUL VL]\n"
"sdot z16.s, z8.b, z0.b[1]\n"
"sdot z20.s, z8.b, z1.b[1]\n"
"sdot z24.s, z8.b, z2.b[1]\n"
- "ld1b { z8.b }, p2/Z, [x11, #-5, MUL VL]\n"
+ "ld1b { z8.b }, p2/Z, [x28, #-5, MUL VL]\n"
"sdot z17.s, z9.b, z0.b[1]\n"
"sdot z21.s, z9.b, z1.b[1]\n"
"sdot z25.s, z9.b, z2.b[1]\n"
- "ld1b { z9.b }, p2/Z, [x11, #-4, MUL VL]\n"
+ "ld1b { z9.b }, p2/Z, [x28, #-4, MUL VL]\n"
"sdot z18.s, z10.b, z0.b[1]\n"
"sdot z22.s, z10.b, z1.b[1]\n"
"sdot z26.s, z10.b, z2.b[1]\n"
- "ld1b { z10.b }, p2/Z, [x11, #-3, MUL VL]\n"
+ "ld1b { z10.b }, p2/Z, [x28, #-3, MUL VL]\n"
"sdot z19.s, z4.b, z0.b[1]\n"
"sdot z23.s, z4.b, z1.b[1]\n"
"sdot z27.s, z4.b, z2.b[1]\n"
- "ld1b { z4.b }, p2/Z, [x11, #-2, MUL VL]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #-2, MUL VL]\n"
"sdot z16.s, z5.b, z0.b[2]\n"
"sdot z20.s, z5.b, z1.b[2]\n"
"sdot z24.s, z5.b, z2.b[2]\n"
- "ld1b { z5.b }, p2/Z, [x11, #-1, MUL VL]\n"
+ "ld1b { z5.b }, p2/Z, [x28, #-1, MUL VL]\n"
"sdot z17.s, z6.b, z0.b[2]\n"
"sdot z21.s, z6.b, z1.b[2]\n"
"sdot z25.s, z6.b, z2.b[2]\n"
@@ -758,113 +726,116 @@ void sve_hybrid_s8qa_dot_4x4VL (
"sdot z19.s, z5.b, z0.b[3]\n"
"sdot z23.s, z5.b, z1.b[3]\n"
"sdot z27.s, z5.b, z2.b[3]\n"
- "tbnz %x[flags], #31, 39f\n"
+ "tbnz %x[flags], #31, 36f\n"
"sdot z11.s, z0.b, z15.b\n"
"sdot z12.s, z1.b, z15.b\n"
"sdot z13.s, z2.b, z15.b\n"
- "39:" // Height 3: Multiply loop: unique 5: skip row sum
- "prfm pldl1keep, [x26, #0x80]\n"
- "sub x27, x27, #0x10\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "cmp x27, #0x10\n"
+ "36:" // Height 3: Multiply loop: unique 5: skip row sum
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "sub x24, x24, #0x10\n"
"prfm pldl1keep, [x22, #0x80]\n"
- "bgt 38b\n"
- "40:" // Height 3: Multiply loop: Single iteration only
- "ld1b { z6.b }, p2/Z, [x11]\n"
- "whilelt p0.b, XZR, x27\n"
- "ld1b { z7.b }, p2/Z, [x11, #1, MUL VL]\n"
- "subs x27, x27, #0x4\n"
- "ld1rqb { z0.b }, p0/Z, [x26]\n"
- "sdot z16.s, z6.b, z0.b[0]\n"
- "ld1rqb { z1.b }, p0/Z, [x24]\n"
- "add x26, x26, #0x10\n"
- "sdot z17.s, z7.b, z0.b[0]\n"
- "ld1rqb { z2.b }, p0/Z, [x22]\n"
- "add x24, x24, #0x10\n"
- "sdot z20.s, z6.b, z1.b[0]\n"
- "ld1b { z8.b }, p2/Z, [x11, #2, MUL VL]\n"
+ "cmp x24, #0x10\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
+ "bgt 35b\n"
+ "37:" // Height 3: Multiply loop: Single iteration only
+ "ld1b { z4.b }, p2/Z, [x28]\n"
+ "whilelt p0.b, XZR, x24\n"
+ "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "subs x24, x24, #0x4\n"
+ "ld1rqb { z0.b }, p0/Z, [x23]\n"
+ "sdot z16.s, z4.b, z0.b[0]\n"
+ "ld1rqb { z1.b }, p0/Z, [x22]\n"
+ "add x23, x23, #0x10\n"
+ "sdot z17.s, z5.b, z0.b[0]\n"
+ "ld1rqb { z2.b }, p0/Z, [x21]\n"
"add x22, x22, #0x10\n"
- "sdot z24.s, z6.b, z2.b[0]\n"
- "ld1b { z9.b }, p2/Z, [x11, #3, MUL VL]\n"
- "addvl x11, x11, #4\n"
- "sdot z21.s, z7.b, z1.b[0]\n"
- "sdot z25.s, z7.b, z2.b[0]\n"
- "sdot z18.s, z8.b, z0.b[0]\n"
- "sdot z22.s, z8.b, z1.b[0]\n"
- "sdot z26.s, z8.b, z2.b[0]\n"
- "sdot z19.s, z9.b, z0.b[0]\n"
- "sdot z23.s, z9.b, z1.b[0]\n"
- "sdot z27.s, z9.b, z2.b[0]\n"
- "ble 41f\n"
- "ld1b { z10.b }, p2/Z, [x11]\n"
- "sdot z16.s, z10.b, z0.b[1]\n"
- "ld1b { z4.b }, p2/Z, [x11, #1, MUL VL]\n"
- "subs x27, x27, #0x4\n"
- "sdot z20.s, z10.b, z1.b[1]\n"
- "ld1b { z5.b }, p2/Z, [x11, #2, MUL VL]\n"
- "sdot z24.s, z10.b, z2.b[1]\n"
- "ld1b { z6.b }, p2/Z, [x11, #3, MUL VL]\n"
- "addvl x11, x11, #4\n"
- "sdot z17.s, z4.b, z0.b[1]\n"
- "sdot z21.s, z4.b, z1.b[1]\n"
- "sdot z25.s, z4.b, z2.b[1]\n"
- "sdot z18.s, z5.b, z0.b[1]\n"
- "sdot z22.s, z5.b, z1.b[1]\n"
- "sdot z26.s, z5.b, z2.b[1]\n"
- "sdot z19.s, z6.b, z0.b[1]\n"
- "sdot z23.s, z6.b, z1.b[1]\n"
- "sdot z27.s, z6.b, z2.b[1]\n"
- "ble 41f\n"
- "ld1b { z7.b }, p2/Z, [x11]\n"
- "sdot z16.s, z7.b, z0.b[2]\n"
- "ld1b { z8.b }, p2/Z, [x11, #1, MUL VL]\n"
- "subs x27, x27, #0x4\n"
- "sdot z20.s, z7.b, z1.b[2]\n"
- "ld1b { z9.b }, p2/Z, [x11, #2, MUL VL]\n"
- "sdot z24.s, z7.b, z2.b[2]\n"
- "ld1b { z10.b }, p2/Z, [x11, #3, MUL VL]\n"
- "addvl x11, x11, #4\n"
- "sdot z17.s, z8.b, z0.b[2]\n"
- "sdot z21.s, z8.b, z1.b[2]\n"
- "sdot z25.s, z8.b, z2.b[2]\n"
- "sdot z18.s, z9.b, z0.b[2]\n"
- "sdot z22.s, z9.b, z1.b[2]\n"
- "sdot z26.s, z9.b, z2.b[2]\n"
- "sdot z19.s, z10.b, z0.b[2]\n"
- "sdot z23.s, z10.b, z1.b[2]\n"
- "sdot z27.s, z10.b, z2.b[2]\n"
- "ble 41f\n"
- "ld1b { z4.b }, p2/Z, [x11]\n"
- "sdot z16.s, z4.b, z0.b[3]\n"
- "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n"
- "sdot z20.s, z4.b, z1.b[3]\n"
- "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n"
- "sdot z24.s, z4.b, z2.b[3]\n"
- "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n"
- "addvl x11, x11, #4\n"
- "sdot z17.s, z5.b, z0.b[3]\n"
- "sdot z21.s, z5.b, z1.b[3]\n"
- "sdot z25.s, z5.b, z2.b[3]\n"
- "sdot z18.s, z6.b, z0.b[3]\n"
- "sdot z22.s, z6.b, z1.b[3]\n"
- "sdot z26.s, z6.b, z2.b[3]\n"
- "sdot z19.s, z7.b, z0.b[3]\n"
- "sdot z23.s, z7.b, z1.b[3]\n"
- "sdot z27.s, z7.b, z2.b[3]\n"
- "41:" // Height 3: Multiply loop: multiply skip
- "tbnz %x[flags], #31, 42f\n"
+ "sdot z20.s, z4.b, z1.b[0]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "add x21, x21, #0x10\n"
+ "sdot z24.s, z4.b, z2.b[0]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "sdot z21.s, z5.b, z1.b[0]\n"
+ "sdot z25.s, z5.b, z2.b[0]\n"
+ "sdot z18.s, z6.b, z0.b[0]\n"
+ "sdot z22.s, z6.b, z1.b[0]\n"
+ "sdot z26.s, z6.b, z2.b[0]\n"
+ "sdot z19.s, z7.b, z0.b[0]\n"
+ "sdot z23.s, z7.b, z1.b[0]\n"
+ "sdot z27.s, z7.b, z2.b[0]\n"
+ "ble 38f\n"
+ "ld1b { z8.b }, p2/Z, [x28]\n"
+ "sdot z16.s, z8.b, z0.b[1]\n"
+ "ld1b { z9.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "subs x24, x24, #0x4\n"
+ "sdot z20.s, z8.b, z1.b[1]\n"
+ "ld1b { z10.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "sdot z24.s, z8.b, z2.b[1]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "sdot z17.s, z9.b, z0.b[1]\n"
+ "sdot z21.s, z9.b, z1.b[1]\n"
+ "sdot z25.s, z9.b, z2.b[1]\n"
+ "sdot z18.s, z10.b, z0.b[1]\n"
+ "sdot z22.s, z10.b, z1.b[1]\n"
+ "sdot z26.s, z10.b, z2.b[1]\n"
+ "sdot z19.s, z4.b, z0.b[1]\n"
+ "sdot z23.s, z4.b, z1.b[1]\n"
+ "sdot z27.s, z4.b, z2.b[1]\n"
+ "ble 38f\n"
+ "ld1b { z5.b }, p2/Z, [x28]\n"
+ "sdot z16.s, z5.b, z0.b[2]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "subs x24, x24, #0x4\n"
+ "sdot z20.s, z5.b, z1.b[2]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "sdot z24.s, z5.b, z2.b[2]\n"
+ "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "sdot z17.s, z6.b, z0.b[2]\n"
+ "sdot z21.s, z6.b, z1.b[2]\n"
+ "sdot z25.s, z6.b, z2.b[2]\n"
+ "sdot z18.s, z7.b, z0.b[2]\n"
+ "sdot z22.s, z7.b, z1.b[2]\n"
+ "sdot z26.s, z7.b, z2.b[2]\n"
+ "sdot z19.s, z8.b, z0.b[2]\n"
+ "sdot z23.s, z8.b, z1.b[2]\n"
+ "sdot z27.s, z8.b, z2.b[2]\n"
+ "ble 38f\n"
+ "ld1b { z9.b }, p2/Z, [x28]\n"
+ "sdot z16.s, z9.b, z0.b[3]\n"
+ "ld1b { z10.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "sdot z20.s, z9.b, z1.b[3]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "sdot z24.s, z9.b, z2.b[3]\n"
+ "ld1b { z5.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "sdot z17.s, z10.b, z0.b[3]\n"
+ "sdot z21.s, z10.b, z1.b[3]\n"
+ "sdot z25.s, z10.b, z2.b[3]\n"
+ "sdot z18.s, z4.b, z0.b[3]\n"
+ "sdot z22.s, z4.b, z1.b[3]\n"
+ "sdot z26.s, z4.b, z2.b[3]\n"
+ "sdot z19.s, z5.b, z0.b[3]\n"
+ "sdot z23.s, z5.b, z1.b[3]\n"
+ "sdot z27.s, z5.b, z2.b[3]\n"
+ "38:" // Height 3: Multiply loop: multiply skip
+ "tbnz %x[flags], #31, 39f\n"
"sdot z11.s, z0.b, z15.b\n"
"sdot z12.s, z1.b, z15.b\n"
"sdot z13.s, z2.b, z15.b\n"
- "42:" // Height 3: Multiply loop: unique 6: skip row sum
- "prfm pldl1keep, [x26, #0x80]\n"
- "add x28, x28, #0x1\n"
- "prfm pldl1keep, [x24, #0x80]\n"
+ "39:" // Height 3: Multiply loop: unique 6: skip row sum
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "add x25, x25, #0x1\n"
"prfm pldl1keep, [x22, #0x80]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x28, x19\n"
- "bne 35b\n"
- "tbnz %x[flags], #31, 43f\n"
+ "cmp x25, x19\n"
+ "bne 32b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x22, x26, x19\n"
+ "add x21, x22, x19\n"
+ "tbnz %x[flags], #31, 40f\n"
"add x19, %x[qp], %[b_offset]\n"
"ld1rw { z3.s }, p2/Z, [x19]\n"
"neg z3.s, p2/M, z3.s\n"
@@ -879,19 +850,19 @@ void sve_hybrid_s8qa_dot_4x4VL (
"mul z11.s, p2/M, z11.s, z3.s\n"
"mul z12.s, p2/M, z12.s, z3.s\n"
"mul z13.s, p2/M, z13.s, z3.s\n"
- "43:" // Height 3: skip row sum fixup
+ "40:" // Height 3: skip row sum fixup
"add z16.s, z16.s, z11.s\n"
- "ld1w { z0.s }, p2/Z, [x10]\n"
+ "ld1w { z0.s }, p2/Z, [x27]\n"
"orr %x[flags], %x[flags], #0x80000000\n"
"add z17.s, z17.s, z11.s\n"
- "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n"
- "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1w { z1.s }, p2/Z, [x27, #1, MUL VL]\n"
+ "add x23, %x[qp], %[per_layer_right_shift]\n"
"add z18.s, z18.s, z11.s\n"
- "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z2.s }, p2/Z, [x27, #2, MUL VL]\n"
"add x19, %x[qp], %[per_layer_mul]\n"
"add z19.s, z19.s, z11.s\n"
- "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n"
- "addvl x10, x10, #4\n"
+ "ld1w { z3.s }, p2/Z, [x27, #3, MUL VL]\n"
+ "addvl x27, x27, #4\n"
"add z20.s, z20.s, z12.s\n"
"ld1rw { z4.s }, p2/Z, [x19]\n"
"add z21.s, z21.s, z12.s\n"
@@ -910,7 +881,7 @@ void sve_hybrid_s8qa_dot_4x4VL (
"add z22.s, z22.s, z2.s\n"
"add z23.s, z23.s, z3.s\n"
"add z24.s, z24.s, z0.s\n"
- "ld1rw { z0.s }, p2/Z, [x20]\n"
+ "ld1rw { z0.s }, p2/Z, [x23]\n"
"add z25.s, z25.s, z1.s\n"
"add z26.s, z26.s, z2.s\n"
"add z27.s, z27.s, z3.s\n"
@@ -926,7 +897,7 @@ void sve_hybrid_s8qa_dot_4x4VL (
".inst 0x04a47739 // sqrdmulh z25.s, z25.s, z4.s\n"
".inst 0x04a4775a // sqrdmulh z26.s, z26.s, z4.s\n"
".inst 0x04a4777b // sqrdmulh z27.s, z27.s, z4.s\n"
- "tbz %x[flags], #5, 44f\n"
+ "tbz %x[flags], #5, 41f\n"
"and z4.d, z16.d, z0.d\n"
"asr z4.s, z4.s, #0x1f\n"
"and z5.d, z17.d, z0.d\n"
@@ -963,7 +934,7 @@ void sve_hybrid_s8qa_dot_4x4VL (
"asr z8.s, z8.s, #0x1f\n"
"sqadd z26.s, z26.s, z7.s\n"
"sqadd z27.s, z27.s, z8.s\n"
- "44:" // Height 3: no shift correction
+ "41:" // Height 3: no shift correction
".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
"add x19, %x[qp], %[c_offset]\n"
"ld1rw { z4.s }, p2/Z, [x19]\n"
@@ -994,9 +965,9 @@ void sve_hybrid_s8qa_dot_4x4VL (
"uzp1 z17.h, z18.h, z19.h\n"
"smax z20.s, p2/M, z20.s, z5.s\n"
"uzp1 z16.b, z16.b, z17.b\n"
- "st1b { z16.b }, p1, [x9]\n"
+ "st1b { z16.b }, p1, [x26]\n"
"add z21.s, z21.s, z4.s\n"
- "addvl x9, x9, #1\n"
+ "addvl x26, x26, #1\n"
".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n"
".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n"
".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n"
@@ -1019,58 +990,42 @@ void sve_hybrid_s8qa_dot_4x4VL (
"uzp1 z21.h, z22.h, z23.h\n"
".inst 0x4482881b // srshl z27.s, p2/M, z27.s, z0.s\n"
"uzp1 z20.b, z20.b, z21.b\n"
- "st1b { z20.b }, p1, [x25]\n"
+ "st1b { z20.b }, p1, [x22]\n"
"add z26.s, z26.s, z4.s\n"
- "addvl x25, x25, #1\n"
- "add z27.s, z27.s, z4.s\n"
"smax z25.s, p2/M, z25.s, z5.s\n"
+ "add z27.s, z27.s, z4.s\n"
"smin z26.s, p2/M, z26.s, z6.s\n"
- "smin z27.s, p2/M, z27.s, z6.s\n"
"uzp1 z24.h, z24.h, z25.h\n"
+ "smin z27.s, p2/M, z27.s, z6.s\n"
"smax z26.s, p2/M, z26.s, z5.s\n"
"smax z27.s, p2/M, z27.s, z5.s\n"
"uzp1 z25.h, z26.h, z27.h\n"
"uzp1 z24.b, z24.b, z25.b\n"
- "st1b { z24.b }, p1, [x23]\n"
- "addvl x23, x23, #1\n"
- "45:" // Height 3: Writeback done
- "decw x12, ALL, MUL #4\n"
- "cmp x12, XZR\n"
- "bgt 33b\n"
- "b 62f\n"
- "46:" // Height 4
+ "st1b { z24.b }, p1, [x21]\n"
+ "42:" // Height 3: Writeback done
+ "decw x9, ALL, MUL #4\n"
+ "cmp x9, XZR\n"
+ "bgt 30b\n"
+ "b 58f\n"
+ "43:" // Height 4
"mov z11.s, #0x0\n"
- "ldr x12, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x10, %x[col_bias]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x27, %x[col_bias]\n"
"mov z12.s, #0x0\n"
- "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"bic %x[flags], %x[flags], #0x80000000\n"
"mov z13.s, #0x0\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov x26, %x[output_ptr]\n"
"mov z14.s, #0x0\n"
+ "mov x19, #0x4\n"
"mov z15.b, #0x1\n"
- "tbz %x[flags], #2, 47f\n"
- "ldr x9, [%x[output_ptr], #0x0]\n"
- "ldr x25, [%x[output_ptr], #0x8]\n"
- "add x9, x9, x19\n"
- "ldr x23, [%x[output_ptr], #0x10]\n"
- "ldr x21, [%x[output_ptr], #0x18]\n"
- "add x25, x25, x19\n"
- "add %x[output_ptr], %x[output_ptr], #0x20\n"
- "add x23, x23, x19\n"
- "add x21, x21, x19\n"
- "b 48f\n"
- "47:" // Height 4: setup direct output
- "mov x9, %x[output_ptr]\n"
- "add x25, x9, x19\n"
- "add x23, x25, x19\n"
- "add x21, x23, x19\n"
- "add %x[output_ptr], x21, x19\n"
- "48:" // Height 4: Column loop
+ "madd %x[output_ptr], x20, x19, %x[output_ptr]\n"
+ "44:" // Height 4: Column loop
"mov z16.s, #0x0\n"
"mov x19, #0x0\n"
"mov z17.s, #0x0\n"
- "whilelt p1.b, x19, x12\n"
+ "whilelt p1.b, x19, x9\n"
"mov z18.s, #0x0\n"
"mov z19.s, #0x0\n"
"mov z20.s, #0x0\n"
@@ -1085,97 +1040,97 @@ void sve_hybrid_s8qa_dot_4x4VL (
"mov z29.s, #0x0\n"
"mov z30.s, #0x0\n"
"mov z31.s, #0x0\n"
- "49:" // Height 4: setup done
- "mov x28, #0x0\n"
- "50:" // Height 4: String loop
+ "45:" // Height 4: setup done
+ "mov x25, #0x0\n"
+ "46:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w27, [x20, x28, LSL #0x2]\n"
- "tbz %x[flags], #3, 51f\n"
- "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "ldr w24, [x20, x25, LSL #0x2]\n"
+ "tbz %x[flags], #3, 47f\n"
+ "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x26, [x20, #0x0]\n"
- "ldr x24, [x20, #0x8]\n"
- "ldr x22, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x0]\n"
+ "ldr x22, [x20, #0x8]\n"
+ "ldr x21, [x20, #0x10]\n"
"ldr x20, [x20, #0x18]\n"
- "cbnz x28, 52f\n"
+ "cbnz x25, 48f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x26, x26, x19\n"
- "add x24, x24, x19\n"
+ "add x23, x23, x19\n"
"add x22, x22, x19\n"
+ "add x21, x21, x19\n"
"add x20, x20, x19\n"
- "b 52f\n"
- "51:" // Height 4: setup direct input
- "mov x26, %x[input_ptr]\n"
- "add x24, x26, x19\n"
- "add x22, x24, x19\n"
- "add x20, x22, x19\n"
- "52:" // Height 4: input setup done
- "cmp x27, #0x10\n"
- "ble 55f\n"
- "53:" // Height 4: Multiply loop: Main loop head
- "ld1b { z4.b }, p2/Z, [x11]\n"
- "whilelt p0.b, XZR, x27\n"
- "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n"
- "ld1rqb { z0.b }, p0/Z, [x26]\n"
+ "b 48f\n"
+ "47:" // Height 4: setup direct input
+ "mov x23, %x[input_ptr]\n"
+ "add x22, x23, x19\n"
+ "add x21, x22, x19\n"
+ "add x20, x21, x19\n"
+ "48:" // Height 4: input setup done
+ "cmp x24, #0x10\n"
+ "ble 51f\n"
+ "49:" // Height 4: Multiply loop: Main loop head
+ "ld1b { z4.b }, p2/Z, [x28]\n"
+ "whilelt p0.b, XZR, x24\n"
+ "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1rqb { z0.b }, p0/Z, [x23]\n"
"sdot z16.s, z4.b, z0.b[0]\n"
- "ld1rqb { z1.b }, p0/Z, [x24]\n"
- "add x26, x26, #0x10\n"
+ "ld1rqb { z1.b }, p0/Z, [x22]\n"
+ "add x23, x23, #0x10\n"
"sdot z17.s, z5.b, z0.b[0]\n"
- "ld1rqb { z2.b }, p0/Z, [x22]\n"
- "add x24, x24, #0x10\n"
+ "ld1rqb { z2.b }, p0/Z, [x21]\n"
+ "add x22, x22, #0x10\n"
"sdot z20.s, z4.b, z1.b[0]\n"
"ld1rqb { z3.b }, p0/Z, [x20]\n"
- "add x22, x22, #0x10\n"
+ "add x21, x21, #0x10\n"
"sdot z24.s, z4.b, z2.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
"add x20, x20, #0x10\n"
"sdot z21.s, z5.b, z1.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
"sdot z25.s, z5.b, z2.b[0]\n"
- "ld1b { z8.b }, p2/Z, [x11, #4, MUL VL]\n"
+ "ld1b { z8.b }, p2/Z, [x28, #4, MUL VL]\n"
"sdot z28.s, z4.b, z3.b[0]\n"
- "ld1b { z9.b }, p2/Z, [x11, #5, MUL VL]\n"
+ "ld1b { z9.b }, p2/Z, [x28, #5, MUL VL]\n"
"sdot z29.s, z5.b, z3.b[0]\n"
- "ld1b { z10.b }, p2/Z, [x11, #6, MUL VL]\n"
+ "ld1b { z10.b }, p2/Z, [x28, #6, MUL VL]\n"
"sdot z18.s, z6.b, z0.b[0]\n"
- "ld1b { z4.b }, p2/Z, [x11, #7, MUL VL]\n"
- "addvl x11, x11, #16\n"
+ "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n"
+ "addvl x28, x28, #16\n"
"sdot z22.s, z6.b, z1.b[0]\n"
- "ld1b { z5.b }, p2/Z, [x11, #-8, MUL VL]\n"
+ "ld1b { z5.b }, p2/Z, [x28, #-8, MUL VL]\n"
"sdot z26.s, z6.b, z2.b[0]\n"
"sdot z30.s, z6.b, z3.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x11, #-7, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #-7, MUL VL]\n"
"sdot z19.s, z7.b, z0.b[0]\n"
"sdot z23.s, z7.b, z1.b[0]\n"
"sdot z27.s, z7.b, z2.b[0]\n"
"sdot z31.s, z7.b, z3.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x11, #-6, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #-6, MUL VL]\n"
"sdot z16.s, z8.b, z0.b[1]\n"
"sdot z20.s, z8.b, z1.b[1]\n"
"sdot z24.s, z8.b, z2.b[1]\n"
"sdot z28.s, z8.b, z3.b[1]\n"
- "ld1b { z8.b }, p2/Z, [x11, #-5, MUL VL]\n"
+ "ld1b { z8.b }, p2/Z, [x28, #-5, MUL VL]\n"
"sdot z17.s, z9.b, z0.b[1]\n"
"sdot z21.s, z9.b, z1.b[1]\n"
"sdot z25.s, z9.b, z2.b[1]\n"
"sdot z29.s, z9.b, z3.b[1]\n"
- "ld1b { z9.b }, p2/Z, [x11, #-4, MUL VL]\n"
+ "ld1b { z9.b }, p2/Z, [x28, #-4, MUL VL]\n"
"sdot z18.s, z10.b, z0.b[1]\n"
"sdot z22.s, z10.b, z1.b[1]\n"
"sdot z26.s, z10.b, z2.b[1]\n"
"sdot z30.s, z10.b, z3.b[1]\n"
- "ld1b { z10.b }, p2/Z, [x11, #-3, MUL VL]\n"
+ "ld1b { z10.b }, p2/Z, [x28, #-3, MUL VL]\n"
"sdot z19.s, z4.b, z0.b[1]\n"
"sdot z23.s, z4.b, z1.b[1]\n"
"sdot z27.s, z4.b, z2.b[1]\n"
"sdot z31.s, z4.b, z3.b[1]\n"
- "ld1b { z4.b }, p2/Z, [x11, #-2, MUL VL]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #-2, MUL VL]\n"
"sdot z16.s, z5.b, z0.b[2]\n"
"sdot z20.s, z5.b, z1.b[2]\n"
"sdot z24.s, z5.b, z2.b[2]\n"
"sdot z28.s, z5.b, z3.b[2]\n"
- "ld1b { z5.b }, p2/Z, [x11, #-1, MUL VL]\n"
+ "ld1b { z5.b }, p2/Z, [x28, #-1, MUL VL]\n"
"sdot z17.s, z6.b, z0.b[2]\n"
"sdot z21.s, z6.b, z1.b[2]\n"
"sdot z25.s, z6.b, z2.b[2]\n"
@@ -1204,135 +1159,139 @@ void sve_hybrid_s8qa_dot_4x4VL (
"sdot z23.s, z5.b, z1.b[3]\n"
"sdot z27.s, z5.b, z2.b[3]\n"
"sdot z31.s, z5.b, z3.b[3]\n"
- "tbnz %x[flags], #31, 54f\n"
+ "tbnz %x[flags], #31, 50f\n"
"sdot z11.s, z0.b, z15.b\n"
"sdot z12.s, z1.b, z15.b\n"
"sdot z13.s, z2.b, z15.b\n"
"sdot z14.s, z3.b, z15.b\n"
- "54:" // Height 4: Multiply loop: unique 7: skip row sum
- "prfm pldl1keep, [x26, #0x80]\n"
- "sub x27, x27, #0x10\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "cmp x27, #0x10\n"
+ "50:" // Height 4: Multiply loop: unique 7: skip row sum
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "sub x24, x24, #0x10\n"
"prfm pldl1keep, [x22, #0x80]\n"
+ "cmp x24, #0x10\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
"prfm pldl1keep, [x20, #0x80]\n"
- "bgt 53b\n"
- "55:" // Height 4: Multiply loop: Single iteration only
- "ld1b { z6.b }, p2/Z, [x11]\n"
- "whilelt p0.b, XZR, x27\n"
- "ld1b { z7.b }, p2/Z, [x11, #1, MUL VL]\n"
- "subs x27, x27, #0x4\n"
- "ld1rqb { z0.b }, p0/Z, [x26]\n"
- "sdot z16.s, z6.b, z0.b[0]\n"
- "ld1rqb { z1.b }, p0/Z, [x24]\n"
- "add x26, x26, #0x10\n"
- "sdot z17.s, z7.b, z0.b[0]\n"
- "ld1rqb { z2.b }, p0/Z, [x22]\n"
- "add x24, x24, #0x10\n"
- "sdot z20.s, z6.b, z1.b[0]\n"
- "ld1rqb { z3.b }, p0/Z, [x20]\n"
+ "bgt 49b\n"
+ "51:" // Height 4: Multiply loop: Single iteration only
+ "ld1b { z4.b }, p2/Z, [x28]\n"
+ "whilelt p0.b, XZR, x24\n"
+ "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "subs x24, x24, #0x4\n"
+ "ld1rqb { z0.b }, p0/Z, [x23]\n"
+ "sdot z16.s, z4.b, z0.b[0]\n"
+ "ld1rqb { z1.b }, p0/Z, [x22]\n"
+ "add x23, x23, #0x10\n"
+ "sdot z17.s, z5.b, z0.b[0]\n"
+ "ld1rqb { z2.b }, p0/Z, [x21]\n"
"add x22, x22, #0x10\n"
- "sdot z24.s, z6.b, z2.b[0]\n"
- "ld1b { z8.b }, p2/Z, [x11, #2, MUL VL]\n"
+ "sdot z20.s, z4.b, z1.b[0]\n"
+ "ld1rqb { z3.b }, p0/Z, [x20]\n"
+ "add x21, x21, #0x10\n"
+ "sdot z24.s, z4.b, z2.b[0]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
"add x20, x20, #0x10\n"
- "sdot z21.s, z7.b, z1.b[0]\n"
- "ld1b { z9.b }, p2/Z, [x11, #3, MUL VL]\n"
- "addvl x11, x11, #4\n"
- "sdot z28.s, z6.b, z3.b[0]\n"
- "sdot z25.s, z7.b, z2.b[0]\n"
- "sdot z29.s, z7.b, z3.b[0]\n"
- "sdot z18.s, z8.b, z0.b[0]\n"
- "sdot z22.s, z8.b, z1.b[0]\n"
- "sdot z26.s, z8.b, z2.b[0]\n"
- "sdot z30.s, z8.b, z3.b[0]\n"
- "sdot z19.s, z9.b, z0.b[0]\n"
- "sdot z23.s, z9.b, z1.b[0]\n"
- "sdot z27.s, z9.b, z2.b[0]\n"
- "sdot z31.s, z9.b, z3.b[0]\n"
- "ble 56f\n"
- "ld1b { z10.b }, p2/Z, [x11]\n"
- "sdot z16.s, z10.b, z0.b[1]\n"
- "ld1b { z4.b }, p2/Z, [x11, #1, MUL VL]\n"
- "subs x27, x27, #0x4\n"
- "sdot z20.s, z10.b, z1.b[1]\n"
- "ld1b { z5.b }, p2/Z, [x11, #2, MUL VL]\n"
- "sdot z24.s, z10.b, z2.b[1]\n"
- "ld1b { z6.b }, p2/Z, [x11, #3, MUL VL]\n"
- "addvl x11, x11, #4\n"
- "sdot z28.s, z10.b, z3.b[1]\n"
- "sdot z17.s, z4.b, z0.b[1]\n"
- "sdot z21.s, z4.b, z1.b[1]\n"
- "sdot z25.s, z4.b, z2.b[1]\n"
- "sdot z29.s, z4.b, z3.b[1]\n"
- "sdot z18.s, z5.b, z0.b[1]\n"
- "sdot z22.s, z5.b, z1.b[1]\n"
- "sdot z26.s, z5.b, z2.b[1]\n"
- "sdot z30.s, z5.b, z3.b[1]\n"
- "sdot z19.s, z6.b, z0.b[1]\n"
- "sdot z23.s, z6.b, z1.b[1]\n"
- "sdot z27.s, z6.b, z2.b[1]\n"
- "sdot z31.s, z6.b, z3.b[1]\n"
- "ble 56f\n"
- "ld1b { z7.b }, p2/Z, [x11]\n"
- "sdot z16.s, z7.b, z0.b[2]\n"
- "ld1b { z8.b }, p2/Z, [x11, #1, MUL VL]\n"
- "subs x27, x27, #0x4\n"
- "sdot z20.s, z7.b, z1.b[2]\n"
- "ld1b { z9.b }, p2/Z, [x11, #2, MUL VL]\n"
- "sdot z24.s, z7.b, z2.b[2]\n"
- "ld1b { z10.b }, p2/Z, [x11, #3, MUL VL]\n"
- "addvl x11, x11, #4\n"
- "sdot z28.s, z7.b, z3.b[2]\n"
- "sdot z17.s, z8.b, z0.b[2]\n"
- "sdot z21.s, z8.b, z1.b[2]\n"
- "sdot z25.s, z8.b, z2.b[2]\n"
- "sdot z29.s, z8.b, z3.b[2]\n"
- "sdot z18.s, z9.b, z0.b[2]\n"
- "sdot z22.s, z9.b, z1.b[2]\n"
- "sdot z26.s, z9.b, z2.b[2]\n"
- "sdot z30.s, z9.b, z3.b[2]\n"
- "sdot z19.s, z10.b, z0.b[2]\n"
- "sdot z23.s, z10.b, z1.b[2]\n"
- "sdot z27.s, z10.b, z2.b[2]\n"
- "sdot z31.s, z10.b, z3.b[2]\n"
- "ble 56f\n"
- "ld1b { z4.b }, p2/Z, [x11]\n"
- "sdot z16.s, z4.b, z0.b[3]\n"
- "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n"
- "sdot z20.s, z4.b, z1.b[3]\n"
- "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n"
- "sdot z24.s, z4.b, z2.b[3]\n"
- "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n"
- "addvl x11, x11, #4\n"
- "sdot z28.s, z4.b, z3.b[3]\n"
- "sdot z17.s, z5.b, z0.b[3]\n"
- "sdot z21.s, z5.b, z1.b[3]\n"
- "sdot z25.s, z5.b, z2.b[3]\n"
- "sdot z29.s, z5.b, z3.b[3]\n"
- "sdot z18.s, z6.b, z0.b[3]\n"
- "sdot z22.s, z6.b, z1.b[3]\n"
- "sdot z26.s, z6.b, z2.b[3]\n"
- "sdot z30.s, z6.b, z3.b[3]\n"
- "sdot z19.s, z7.b, z0.b[3]\n"
- "sdot z23.s, z7.b, z1.b[3]\n"
- "sdot z27.s, z7.b, z2.b[3]\n"
- "sdot z31.s, z7.b, z3.b[3]\n"
- "56:" // Height 4: Multiply loop: multiply skip
- "tbnz %x[flags], #31, 57f\n"
+ "sdot z21.s, z5.b, z1.b[0]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "sdot z28.s, z4.b, z3.b[0]\n"
+ "sdot z25.s, z5.b, z2.b[0]\n"
+ "sdot z29.s, z5.b, z3.b[0]\n"
+ "sdot z18.s, z6.b, z0.b[0]\n"
+ "sdot z22.s, z6.b, z1.b[0]\n"
+ "sdot z26.s, z6.b, z2.b[0]\n"
+ "sdot z30.s, z6.b, z3.b[0]\n"
+ "sdot z19.s, z7.b, z0.b[0]\n"
+ "sdot z23.s, z7.b, z1.b[0]\n"
+ "sdot z27.s, z7.b, z2.b[0]\n"
+ "sdot z31.s, z7.b, z3.b[0]\n"
+ "ble 52f\n"
+ "ld1b { z8.b }, p2/Z, [x28]\n"
+ "sdot z16.s, z8.b, z0.b[1]\n"
+ "ld1b { z9.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "subs x24, x24, #0x4\n"
+ "sdot z20.s, z8.b, z1.b[1]\n"
+ "ld1b { z10.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "sdot z24.s, z8.b, z2.b[1]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "sdot z28.s, z8.b, z3.b[1]\n"
+ "sdot z17.s, z9.b, z0.b[1]\n"
+ "sdot z21.s, z9.b, z1.b[1]\n"
+ "sdot z25.s, z9.b, z2.b[1]\n"
+ "sdot z29.s, z9.b, z3.b[1]\n"
+ "sdot z18.s, z10.b, z0.b[1]\n"
+ "sdot z22.s, z10.b, z1.b[1]\n"
+ "sdot z26.s, z10.b, z2.b[1]\n"
+ "sdot z30.s, z10.b, z3.b[1]\n"
+ "sdot z19.s, z4.b, z0.b[1]\n"
+ "sdot z23.s, z4.b, z1.b[1]\n"
+ "sdot z27.s, z4.b, z2.b[1]\n"
+ "sdot z31.s, z4.b, z3.b[1]\n"
+ "ble 52f\n"
+ "ld1b { z5.b }, p2/Z, [x28]\n"
+ "sdot z16.s, z5.b, z0.b[2]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "subs x24, x24, #0x4\n"
+ "sdot z20.s, z5.b, z1.b[2]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "sdot z24.s, z5.b, z2.b[2]\n"
+ "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "sdot z28.s, z5.b, z3.b[2]\n"
+ "sdot z17.s, z6.b, z0.b[2]\n"
+ "sdot z21.s, z6.b, z1.b[2]\n"
+ "sdot z25.s, z6.b, z2.b[2]\n"
+ "sdot z29.s, z6.b, z3.b[2]\n"
+ "sdot z18.s, z7.b, z0.b[2]\n"
+ "sdot z22.s, z7.b, z1.b[2]\n"
+ "sdot z26.s, z7.b, z2.b[2]\n"
+ "sdot z30.s, z7.b, z3.b[2]\n"
+ "sdot z19.s, z8.b, z0.b[2]\n"
+ "sdot z23.s, z8.b, z1.b[2]\n"
+ "sdot z27.s, z8.b, z2.b[2]\n"
+ "sdot z31.s, z8.b, z3.b[2]\n"
+ "ble 52f\n"
+ "ld1b { z9.b }, p2/Z, [x28]\n"
+ "sdot z16.s, z9.b, z0.b[3]\n"
+ "ld1b { z10.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "sdot z20.s, z9.b, z1.b[3]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "sdot z24.s, z9.b, z2.b[3]\n"
+ "ld1b { z5.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "sdot z28.s, z9.b, z3.b[3]\n"
+ "sdot z17.s, z10.b, z0.b[3]\n"
+ "sdot z21.s, z10.b, z1.b[3]\n"
+ "sdot z25.s, z10.b, z2.b[3]\n"
+ "sdot z29.s, z10.b, z3.b[3]\n"
+ "sdot z18.s, z4.b, z0.b[3]\n"
+ "sdot z22.s, z4.b, z1.b[3]\n"
+ "sdot z26.s, z4.b, z2.b[3]\n"
+ "sdot z30.s, z4.b, z3.b[3]\n"
+ "sdot z19.s, z5.b, z0.b[3]\n"
+ "sdot z23.s, z5.b, z1.b[3]\n"
+ "sdot z27.s, z5.b, z2.b[3]\n"
+ "sdot z31.s, z5.b, z3.b[3]\n"
+ "52:" // Height 4: Multiply loop: multiply skip
+ "tbnz %x[flags], #31, 53f\n"
"sdot z11.s, z0.b, z15.b\n"
"sdot z12.s, z1.b, z15.b\n"
"sdot z13.s, z2.b, z15.b\n"
"sdot z14.s, z3.b, z15.b\n"
- "57:" // Height 4: Multiply loop: unique 8: skip row sum
- "prfm pldl1keep, [x26, #0x80]\n"
- "add x28, x28, #0x1\n"
- "prfm pldl1keep, [x24, #0x80]\n"
+ "53:" // Height 4: Multiply loop: unique 8: skip row sum
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "add x25, x25, #0x1\n"
"prfm pldl1keep, [x22, #0x80]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
"prfm pldl1keep, [x20, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x28, x19\n"
- "bne 50b\n"
- "tbnz %x[flags], #31, 58f\n"
+ "cmp x25, x19\n"
+ "bne 46b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x22, x26, x19\n"
+ "add x21, x22, x19\n"
+ "add x20, x21, x19\n"
+ "tbnz %x[flags], #31, 54f\n"
"add x19, %x[qp], %[b_offset]\n"
"ld1rw { z4.s }, p2/Z, [x19]\n"
"neg z4.s, p2/M, z4.s\n"
@@ -1350,19 +1309,19 @@ void sve_hybrid_s8qa_dot_4x4VL (
"mul z12.s, p2/M, z12.s, z4.s\n"
"mul z13.s, p2/M, z13.s, z4.s\n"
"mul z14.s, p2/M, z14.s, z4.s\n"
- "58:" // Height 4: skip row sum fixup
+ "54:" // Height 4: skip row sum fixup
"add z16.s, z16.s, z11.s\n"
- "ld1w { z0.s }, p2/Z, [x10]\n"
+ "ld1w { z0.s }, p2/Z, [x27]\n"
"orr %x[flags], %x[flags], #0x80000000\n"
"add z17.s, z17.s, z11.s\n"
- "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n"
- "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1w { z1.s }, p2/Z, [x27, #1, MUL VL]\n"
+ "add x23, %x[qp], %[per_layer_right_shift]\n"
"add z18.s, z18.s, z11.s\n"
- "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z2.s }, p2/Z, [x27, #2, MUL VL]\n"
"add x19, %x[qp], %[per_layer_mul]\n"
"add z19.s, z19.s, z11.s\n"
- "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n"
- "addvl x10, x10, #4\n"
+ "ld1w { z3.s }, p2/Z, [x27, #3, MUL VL]\n"
+ "addvl x27, x27, #4\n"
"add z20.s, z20.s, z12.s\n"
"ld1rw { z4.s }, p2/Z, [x19]\n"
"add z21.s, z21.s, z12.s\n"
@@ -1389,7 +1348,7 @@ void sve_hybrid_s8qa_dot_4x4VL (
"add z26.s, z26.s, z2.s\n"
"add z27.s, z27.s, z3.s\n"
"add z28.s, z28.s, z0.s\n"
- "ld1rw { z0.s }, p2/Z, [x20]\n"
+ "ld1rw { z0.s }, p2/Z, [x23]\n"
"add z29.s, z29.s, z1.s\n"
"add z30.s, z30.s, z2.s\n"
"add z31.s, z31.s, z3.s\n"
@@ -1409,7 +1368,7 @@ void sve_hybrid_s8qa_dot_4x4VL (
".inst 0x04a477bd // sqrdmulh z29.s, z29.s, z4.s\n"
".inst 0x04a477de // sqrdmulh z30.s, z30.s, z4.s\n"
".inst 0x04a477ff // sqrdmulh z31.s, z31.s, z4.s\n"
- "tbz %x[flags], #5, 59f\n"
+ "tbz %x[flags], #5, 55f\n"
"and z4.d, z16.d, z0.d\n"
"asr z4.s, z4.s, #0x1f\n"
"and z5.d, z17.d, z0.d\n"
@@ -1458,7 +1417,7 @@ void sve_hybrid_s8qa_dot_4x4VL (
"sqadd z29.s, z29.s, z10.s\n"
"sqadd z30.s, z30.s, z4.s\n"
"sqadd z31.s, z31.s, z5.s\n"
- "59:" // Height 4: no shift correction
+ "55:" // Height 4: no shift correction
".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
"add x19, %x[qp], %[c_offset]\n"
"ld1rw { z4.s }, p2/Z, [x19]\n"
@@ -1489,9 +1448,9 @@ void sve_hybrid_s8qa_dot_4x4VL (
"uzp1 z17.h, z18.h, z19.h\n"
"smax z20.s, p2/M, z20.s, z5.s\n"
"uzp1 z16.b, z16.b, z17.b\n"
- "st1b { z16.b }, p1, [x9]\n"
+ "st1b { z16.b }, p1, [x26]\n"
"add z21.s, z21.s, z4.s\n"
- "addvl x9, x9, #1\n"
+ "addvl x26, x26, #1\n"
".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n"
".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n"
".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n"
@@ -1514,61 +1473,58 @@ void sve_hybrid_s8qa_dot_4x4VL (
"uzp1 z21.h, z22.h, z23.h\n"
".inst 0x4482881b // srshl z27.s, p2/M, z27.s, z0.s\n"
"uzp1 z20.b, z20.b, z21.b\n"
- "st1b { z20.b }, p1, [x25]\n"
+ "st1b { z20.b }, p1, [x22]\n"
"add z26.s, z26.s, z4.s\n"
- "addvl x25, x25, #1\n"
- "add z27.s, z27.s, z4.s\n"
"smax z25.s, p2/M, z25.s, z5.s\n"
+ "add z27.s, z27.s, z4.s\n"
".inst 0x4482881c // srshl z28.s, p2/M, z28.s, z0.s\n"
"smin z26.s, p2/M, z26.s, z6.s\n"
- "smin z27.s, p2/M, z27.s, z6.s\n"
"uzp1 z24.h, z24.h, z25.h\n"
+ "smin z27.s, p2/M, z27.s, z6.s\n"
"add z28.s, z28.s, z4.s\n"
"smax z26.s, p2/M, z26.s, z5.s\n"
+ ".inst 0x4482881d // srshl z29.s, p2/M, z29.s, z0.s\n"
"smax z27.s, p2/M, z27.s, z5.s\n"
"smin z28.s, p2/M, z28.s, z6.s\n"
- ".inst 0x4482881d // srshl z29.s, p2/M, z29.s, z0.s\n"
".inst 0x4482881e // srshl z30.s, p2/M, z30.s, z0.s\n"
+ "add z29.s, z29.s, z4.s\n"
"uzp1 z25.h, z26.h, z27.h\n"
"smax z28.s, p2/M, z28.s, z5.s\n"
- "add z29.s, z29.s, z4.s\n"
"add z30.s, z30.s, z4.s\n"
"uzp1 z24.b, z24.b, z25.b\n"
- "st1b { z24.b }, p1, [x23]\n"
+ "st1b { z24.b }, p1, [x21]\n"
"smin z29.s, p2/M, z29.s, z6.s\n"
- "addvl x23, x23, #1\n"
"smin z30.s, p2/M, z30.s, z6.s\n"
".inst 0x4482881f // srshl z31.s, p2/M, z31.s, z0.s\n"
"smax z29.s, p2/M, z29.s, z5.s\n"
- "add z31.s, z31.s, z4.s\n"
"smax z30.s, p2/M, z30.s, z5.s\n"
+ "add z31.s, z31.s, z4.s\n"
"uzp1 z28.h, z28.h, z29.h\n"
"smin z31.s, p2/M, z31.s, z6.s\n"
"smax z31.s, p2/M, z31.s, z5.s\n"
"uzp1 z29.h, z30.h, z31.h\n"
"uzp1 z28.b, z28.b, z29.b\n"
- "st1b { z28.b }, p1, [x21]\n"
- "addvl x21, x21, #1\n"
- "60:" // Height 4: Writeback done
- "decw x12, ALL, MUL #4\n"
- "cmp x12, XZR\n"
- "bgt 48b\n"
+ "st1b { z28.b }, p1, [x20]\n"
+ "56:" // Height 4: Writeback done
+ "decw x9, ALL, MUL #4\n"
+ "cmp x9, XZR\n"
+ "bgt 44b\n"
"subs %x[M], %x[M], #0x4\n"
- "beq 62f\n"
+ "beq 58f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "tbz %x[flags], #3, 61f\n"
+ "tbz %x[flags], #3, 57f\n"
"add x20, x20, #0x4\n"
"str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
"b 1b\n"
- "61:" // Update direct input
+ "57:" // Update direct input
"mov x19, #0x4\n"
"madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
"b 1b\n"
- "62:" // Exit
+ "58:" // Exit
: [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
- : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL.hpp
index b69b561cce..61927236ad 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL.hpp
@@ -36,7 +36,6 @@
namespace arm_gemm
{
-
// Actual kernel implementations
void sve_hybrid_s8qs_dot_6x4VL( ARGLIST );
@@ -73,7 +72,6 @@ public:
// Default to the generic kernel
kern_type kernel=sve_hybrid_s8qs_dot_6x4VL;
-
cls_sve_hybrid_s8qs_dot_6x4VL(const CPUInfo *)
{
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL/generic.cpp
index 495637bcdd..f901a814f9 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL/generic.cpp
@@ -88,182 +88,176 @@ void sve_hybrid_s8qs_dot_6x4VL (
"ptrue p2.b\n"
"1:" // Row loop
"cmp %x[M], #0x6\n"
- "bge 71f\n"
+ "bge 66f\n"
"cmp %x[M], #0x4\n"
- "bgt 57f\n"
- "beq 43f\n"
+ "bgt 53f\n"
+ "beq 40f\n"
"cmp %x[M], #0x2\n"
- "bgt 29f\n"
- "beq 15f\n"
- "ldr x8, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
- "ldr x17, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
- "mov x16, %x[col_bias]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "tbz %x[flags], #2, 2f\n"
- "ldr x13, [%x[output_ptr], #0x0]\n"
- "add x13, x13, x19\n"
- "b 3f\n"
- "2:" // Height 1: setup direct output
- "mov x13, %x[output_ptr]\n"
- "3:" // Height 1: Column loop
+ "bgt 27f\n"
+ "beq 14f\n"
+ "ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
+ "mov x11, %x[col_bias]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "2:" // Height 1: Column loop
"mov z8.s, #0x0\n"
"mov x19, #0x0\n"
"mov z9.s, #0x0\n"
- "whilelt p1.b, x19, x15\n"
+ "whilelt p1.b, x19, x10\n"
"mov z10.s, #0x0\n"
"mov z11.s, #0x0\n"
- "4:" // Height 1: setup done
- "mov x12, #0x0\n"
- "5:" // Height 1: String loop
+ "3:" // Height 1: setup done
+ "mov x27, #0x0\n"
+ "4:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w11, [x20, x12, LSL #0x2]\n"
- "tbz %x[flags], #3, 6f\n"
- "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 5f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x10, [x20, #0x0]\n"
- "cbnz x12, 7f\n"
+ "ldr x25, [x20, #0x0]\n"
+ "cbnz x27, 6f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x10, x10, x19\n"
- "b 7f\n"
- "6:" // Height 1: setup direct input
- "mov x10, %x[input_ptr]\n"
- "7:" // Height 1: input setup done
- "cmp x11, #0x10\n"
- "ble 9f\n"
- "8:" // Height 1: Multiply loop: Main loop head
- "ld1b { z6.b }, p2/Z, [x14]\n"
- "whilelt p0.b, XZR, x11\n"
- "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
- "sub x11, x11, #0x10\n"
- "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "add x25, x25, x19\n"
+ "b 6f\n"
+ "5:" // Height 1: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "6:" // Height 1: input setup done
+ "cmp x26, #0x10\n"
+ "ble 8f\n"
+ "7:" // Height 1: Multiply loop: Main loop head
+ "ld1b { z6.b }, p2/Z, [x28]\n"
+ "whilelt p0.b, XZR, x26\n"
+ "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "sub x26, x26, #0x10\n"
+ "ld1rqb { z0.b }, p0/Z, [x25]\n"
"sdot z8.s, z6.b, z0.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
- "add x10, x10, #0x10\n"
+ "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "add x25, x25, #0x10\n"
"sdot z9.s, z7.b, z0.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
- "cmp x11, #0x10\n"
+ "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "cmp x26, #0x10\n"
"sdot z10.s, z6.b, z0.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x14, #4, MUL VL]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #4, MUL VL]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
"sdot z11.s, z7.b, z0.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x14, #5, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #5, MUL VL]\n"
"sdot z8.s, z6.b, z0.b[1]\n"
- "ld1b { z6.b }, p2/Z, [x14, #6, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #6, MUL VL]\n"
"sdot z9.s, z7.b, z0.b[1]\n"
- "ld1b { z7.b }, p2/Z, [x14, #7, MUL VL]\n"
- "addvl x14, x14, #16\n"
+ "ld1b { z7.b }, p2/Z, [x28, #7, MUL VL]\n"
+ "addvl x28, x28, #16\n"
"sdot z10.s, z6.b, z0.b[1]\n"
- "ld1b { z6.b }, p2/Z, [x14, #-8, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #-8, MUL VL]\n"
"sdot z11.s, z7.b, z0.b[1]\n"
- "ld1b { z7.b }, p2/Z, [x14, #-7, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #-7, MUL VL]\n"
"sdot z8.s, z6.b, z0.b[2]\n"
- "ld1b { z6.b }, p2/Z, [x14, #-6, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #-6, MUL VL]\n"
"sdot z9.s, z7.b, z0.b[2]\n"
- "ld1b { z7.b }, p2/Z, [x14, #-5, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #-5, MUL VL]\n"
"sdot z10.s, z6.b, z0.b[2]\n"
- "ld1b { z6.b }, p2/Z, [x14, #-4, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #-4, MUL VL]\n"
"sdot z11.s, z7.b, z0.b[2]\n"
- "ld1b { z7.b }, p2/Z, [x14, #-3, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #-3, MUL VL]\n"
"sdot z8.s, z6.b, z0.b[3]\n"
- "ld1b { z6.b }, p2/Z, [x14, #-2, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #-2, MUL VL]\n"
"sdot z9.s, z7.b, z0.b[3]\n"
- "ld1b { z7.b }, p2/Z, [x14, #-1, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #-1, MUL VL]\n"
"sdot z10.s, z6.b, z0.b[3]\n"
"sdot z11.s, z7.b, z0.b[3]\n"
- "bgt 8b\n"
- "9:" // Height 1: Multiply loop: Single iteration only
- "ld1b { z6.b }, p2/Z, [x14]\n"
- "whilelt p0.b, XZR, x11\n"
- "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
- "subs x11, x11, #0x4\n"
- "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "bgt 7b\n"
+ "8:" // Height 1: Multiply loop: Single iteration only
+ "ld1b { z6.b }, p2/Z, [x28]\n"
+ "whilelt p0.b, XZR, x26\n"
+ "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "subs x26, x26, #0x4\n"
+ "ld1rqb { z0.b }, p0/Z, [x25]\n"
"sdot z8.s, z6.b, z0.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
- "add x10, x10, #0x10\n"
+ "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "add x25, x25, #0x10\n"
"sdot z9.s, z7.b, z0.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
+ "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
"sdot z10.s, z6.b, z0.b[0]\n"
"sdot z11.s, z7.b, z0.b[0]\n"
- "ble 10f\n"
- "ld1b { z6.b }, p2/Z, [x14]\n"
+ "ble 9f\n"
+ "ld1b { z6.b }, p2/Z, [x28]\n"
"sdot z8.s, z6.b, z0.b[1]\n"
- "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
- "subs x11, x11, #0x4\n"
+ "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "subs x26, x26, #0x4\n"
"sdot z9.s, z7.b, z0.b[1]\n"
- "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
- "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
"sdot z10.s, z6.b, z0.b[1]\n"
- "addvl x14, x14, #4\n"
+ "addvl x28, x28, #4\n"
"sdot z11.s, z7.b, z0.b[1]\n"
- "ble 10f\n"
- "ld1b { z6.b }, p2/Z, [x14]\n"
+ "ble 9f\n"
+ "ld1b { z6.b }, p2/Z, [x28]\n"
"sdot z8.s, z6.b, z0.b[2]\n"
- "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
- "subs x11, x11, #0x4\n"
+ "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "subs x26, x26, #0x4\n"
"sdot z9.s, z7.b, z0.b[2]\n"
- "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
- "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
"sdot z10.s, z6.b, z0.b[2]\n"
- "addvl x14, x14, #4\n"
+ "addvl x28, x28, #4\n"
"sdot z11.s, z7.b, z0.b[2]\n"
- "ble 10f\n"
- "ld1b { z6.b }, p2/Z, [x14]\n"
+ "ble 9f\n"
+ "ld1b { z6.b }, p2/Z, [x28]\n"
"sdot z8.s, z6.b, z0.b[3]\n"
- "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
- "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
"sdot z9.s, z7.b, z0.b[3]\n"
- "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
+ "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
"sdot z10.s, z6.b, z0.b[3]\n"
"sdot z11.s, z7.b, z0.b[3]\n"
- "10:" // Height 1: Multiply loop: multiply skip
- "prfm pldl1keep, [x10, #0x80]\n"
- "add x12, x12, #0x1\n"
+ "9:" // Height 1: Multiply loop: multiply skip
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x27, x27, #0x1\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x12, x19\n"
- "bne 5b\n"
- "ld1w { z0.s }, p2/Z, [x16]\n"
+ "cmp x27, x19\n"
+ "bne 4b\n"
+ "ld1w { z0.s }, p2/Z, [x11]\n"
"add z8.s, z8.s, z0.s\n"
- "ld1w { z1.s }, p2/Z, [x16, #1, MUL VL]\n"
- "ld1w { z2.s }, p2/Z, [x16, #2, MUL VL]\n"
+ "ld1w { z1.s }, p2/Z, [x11, #1, MUL VL]\n"
+ "ld1w { z2.s }, p2/Z, [x11, #2, MUL VL]\n"
"add z9.s, z9.s, z1.s\n"
- "ld1w { z3.s }, p2/Z, [x16, #3, MUL VL]\n"
- "addvl x16, x16, #4\n"
+ "ld1w { z3.s }, p2/Z, [x11, #3, MUL VL]\n"
+ "addvl x11, x11, #4\n"
"add z10.s, z10.s, z2.s\n"
"add z11.s, z11.s, z3.s\n"
- "tbz %x[flags], #4, 11f\n"
- "ld1w { z0.s }, p2/Z, [x17]\n"
- "ld1w { z4.s }, p2/Z, [x8]\n"
- "ld1w { z1.s }, p2/Z, [x17, #1, MUL VL]\n"
- "ld1w { z5.s }, p2/Z, [x8, #1, MUL VL]\n"
- "ld1w { z2.s }, p2/Z, [x17, #2, MUL VL]\n"
- "ld1w { z6.s }, p2/Z, [x8, #2, MUL VL]\n"
- "ld1w { z3.s }, p2/Z, [x17, #3, MUL VL]\n"
- "addvl x17, x17, #4\n"
- "ld1w { z7.s }, p2/Z, [x8, #3, MUL VL]\n"
- "addvl x8, x8, #4\n"
- "b 12f\n"
- "11:" // Height 1: per layer parameters
- "add x19, %x[qp], %[per_layer_right_shift]\n"
- "ld1rw { z0.s }, p2/Z, [x19]\n"
+ "tbz %x[flags], #4, 10f\n"
+ "ld1w { z0.s }, p2/Z, [x12]\n"
+ "ld1w { z4.s }, p2/Z, [x13]\n"
+ "ld1w { z1.s }, p2/Z, [x12, #1, MUL VL]\n"
+ "ld1w { z5.s }, p2/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z2.s }, p2/Z, [x12, #2, MUL VL]\n"
+ "ld1w { z6.s }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z3.s }, p2/Z, [x12, #3, MUL VL]\n"
+ "addvl x12, x12, #4\n"
+ "ld1w { z7.s }, p2/Z, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "b 11f\n"
+ "10:" // Height 1: per layer parameters
+ "add x24, %x[qp], %[per_layer_right_shift]\n"
+ "ld1rw { z0.s }, p2/Z, [x24]\n"
"mov z1.d, z0.d\n"
- "add x19, %x[qp], %[per_layer_mul]\n"
- "ld1rw { z4.s }, p2/Z, [x19]\n"
+ "add x24, %x[qp], %[per_layer_mul]\n"
+ "ld1rw { z4.s }, p2/Z, [x24]\n"
"mov z2.d, z0.d\n"
"mov z3.d, z0.d\n"
"mov z5.d, z4.d\n"
"mov z6.d, z4.d\n"
"mov z7.d, z4.d\n"
- "12:" // Height 1: parameters loaded
+ "11:" // Height 1: parameters loaded
".inst 0x04a47508 // sqrdmulh z8.s, z8.s, z4.s\n"
".inst 0x04a57529 // sqrdmulh z9.s, z9.s, z5.s\n"
".inst 0x04a6754a // sqrdmulh z10.s, z10.s, z6.s\n"
".inst 0x04a7756b // sqrdmulh z11.s, z11.s, z7.s\n"
- "tbz %x[flags], #5, 13f\n"
+ "tbz %x[flags], #5, 12f\n"
"and z4.d, z8.d, z0.d\n"
"asr z4.s, z4.s, #0x1f\n"
"and z5.d, z9.d, z1.d\n"
@@ -276,17 +270,17 @@ void sve_hybrid_s8qs_dot_6x4VL (
"sqadd z9.s, z9.s, z5.s\n"
"sqadd z10.s, z10.s, z6.s\n"
"sqadd z11.s, z11.s, z7.s\n"
- "13:" // Height 1: no shift correction
+ "12:" // Height 1: no shift correction
".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n"
- "add x19, %x[qp], %[c_offset]\n"
- "ld1rw { z4.s }, p2/Z, [x19]\n"
+ "add x24, %x[qp], %[c_offset]\n"
+ "ld1rw { z4.s }, p2/Z, [x24]\n"
".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n"
- "add x19, %x[qp], %[minval]\n"
+ "add x24, %x[qp], %[minval]\n"
".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n"
- "ld1rw { z5.s }, p2/Z, [x19]\n"
- "add x19, %x[qp], %[maxval]\n"
+ "ld1rw { z5.s }, p2/Z, [x24]\n"
+ "add x24, %x[qp], %[maxval]\n"
".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n"
- "ld1rw { z6.s }, p2/Z, [x19]\n"
+ "ld1rw { z6.s }, p2/Z, [x24]\n"
"add z8.s, z8.s, z4.s\n"
"add z9.s, z9.s, z4.s\n"
"add z10.s, z10.s, z4.s\n"
@@ -302,230 +296,223 @@ void sve_hybrid_s8qs_dot_6x4VL (
"uzp1 z8.h, z8.h, z9.h\n"
"uzp1 z9.h, z10.h, z11.h\n"
"uzp1 z8.b, z8.b, z9.b\n"
- "st1b { z8.b }, p1, [x13]\n"
- "addvl x13, x13, #1\n"
- "14:" // Height 1: Writeback done
- "decw x15, ALL, MUL #4\n"
- "cmp x15, XZR\n"
- "bgt 3b\n"
- "b 86f\n"
- "15:" // Height 2
- "ldr x8, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
- "mov x16, %x[col_bias]\n"
- "ldr x17, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "tbz %x[flags], #2, 16f\n"
- "ldr x13, [%x[output_ptr], #0x0]\n"
- "add x13, x13, x19\n"
- "ldr x9, [%x[output_ptr], #0x8]\n"
- "add x9, x9, x19\n"
- "b 17f\n"
- "16:" // Height 2: setup direct output
- "mov x13, %x[output_ptr]\n"
- "add x9, x13, x19\n"
- "17:" // Height 2: Column loop
+ "st1b { z8.b }, p1, [x9]\n"
+ "addvl x9, x9, #1\n"
+ "13:" // Height 1: Writeback done
+ "decw x10, ALL, MUL #4\n"
+ "cmp x10, XZR\n"
+ "bgt 2b\n"
+ "b 80f\n"
+ "14:" // Height 2
+ "ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
+ "mov x11, %x[col_bias]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "15:" // Height 2: Column loop
"mov z8.s, #0x0\n"
"mov x19, #0x0\n"
"mov z9.s, #0x0\n"
- "whilelt p1.b, x19, x15\n"
+ "whilelt p1.b, x19, x10\n"
"mov z10.s, #0x0\n"
"mov z11.s, #0x0\n"
"mov z12.s, #0x0\n"
"mov z13.s, #0x0\n"
"mov z14.s, #0x0\n"
"mov z15.s, #0x0\n"
- "18:" // Height 2: setup done
- "mov x12, #0x0\n"
- "19:" // Height 2: String loop
+ "16:" // Height 2: setup done
+ "mov x27, #0x0\n"
+ "17:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w11, [x20, x12, LSL #0x2]\n"
- "tbz %x[flags], #3, 20f\n"
- "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 18f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x10, [x20, #0x0]\n"
- "ldr x28, [x20, #0x8]\n"
- "cbnz x12, 21f\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "cbnz x27, 19f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x10, x10, x19\n"
- "add x28, x28, x19\n"
- "b 21f\n"
- "20:" // Height 2: setup direct input
- "mov x10, %x[input_ptr]\n"
- "add x28, x10, x19\n"
- "21:" // Height 2: input setup done
- "cmp x11, #0x10\n"
- "ble 23f\n"
- "22:" // Height 2: Multiply loop: Main loop head
- "ld1b { z6.b }, p2/Z, [x14]\n"
- "whilelt p0.b, XZR, x11\n"
- "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
- "sub x11, x11, #0x10\n"
- "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "add x25, x25, x19\n"
+ "add x24, x24, x19\n"
+ "b 19f\n"
+ "18:" // Height 2: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19\n"
+ "19:" // Height 2: input setup done
+ "cmp x26, #0x10\n"
+ "ble 21f\n"
+ "20:" // Height 2: Multiply loop: Main loop head
+ "ld1b { z6.b }, p2/Z, [x28]\n"
+ "whilelt p0.b, XZR, x26\n"
+ "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "sub x26, x26, #0x10\n"
+ "ld1rqb { z0.b }, p0/Z, [x25]\n"
"sdot z8.s, z6.b, z0.b[0]\n"
- "ld1rqb { z1.b }, p0/Z, [x28]\n"
- "add x10, x10, #0x10\n"
+ "ld1rqb { z1.b }, p0/Z, [x24]\n"
+ "add x25, x25, #0x10\n"
"sdot z9.s, z7.b, z0.b[0]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
- "add x28, x28, #0x10\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x24, x24, #0x10\n"
"sdot z12.s, z6.b, z1.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
- "cmp x11, #0x10\n"
+ "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "cmp x26, #0x10\n"
"sdot z13.s, z7.b, z1.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
"sdot z10.s, z6.b, z0.b[0]\n"
"sdot z14.s, z6.b, z1.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x14, #4, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #4, MUL VL]\n"
"sdot z11.s, z7.b, z0.b[0]\n"
"sdot z15.s, z7.b, z1.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x14, #5, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #5, MUL VL]\n"
"sdot z8.s, z6.b, z0.b[1]\n"
"sdot z12.s, z6.b, z1.b[1]\n"
- "ld1b { z6.b }, p2/Z, [x14, #6, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #6, MUL VL]\n"
"sdot z9.s, z7.b, z0.b[1]\n"
"sdot z13.s, z7.b, z1.b[1]\n"
- "ld1b { z7.b }, p2/Z, [x14, #7, MUL VL]\n"
- "addvl x14, x14, #16\n"
+ "ld1b { z7.b }, p2/Z, [x28, #7, MUL VL]\n"
+ "addvl x28, x28, #16\n"
"sdot z10.s, z6.b, z0.b[1]\n"
"sdot z14.s, z6.b, z1.b[1]\n"
- "ld1b { z6.b }, p2/Z, [x14, #-8, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #-8, MUL VL]\n"
"sdot z11.s, z7.b, z0.b[1]\n"
"sdot z15.s, z7.b, z1.b[1]\n"
- "ld1b { z7.b }, p2/Z, [x14, #-7, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #-7, MUL VL]\n"
"sdot z8.s, z6.b, z0.b[2]\n"
"sdot z12.s, z6.b, z1.b[2]\n"
- "ld1b { z6.b }, p2/Z, [x14, #-6, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #-6, MUL VL]\n"
"sdot z9.s, z7.b, z0.b[2]\n"
"sdot z13.s, z7.b, z1.b[2]\n"
- "ld1b { z7.b }, p2/Z, [x14, #-5, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #-5, MUL VL]\n"
"sdot z10.s, z6.b, z0.b[2]\n"
"sdot z14.s, z6.b, z1.b[2]\n"
- "ld1b { z6.b }, p2/Z, [x14, #-4, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #-4, MUL VL]\n"
"sdot z11.s, z7.b, z0.b[2]\n"
"sdot z15.s, z7.b, z1.b[2]\n"
- "ld1b { z7.b }, p2/Z, [x14, #-3, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #-3, MUL VL]\n"
"sdot z8.s, z6.b, z0.b[3]\n"
"sdot z12.s, z6.b, z1.b[3]\n"
- "ld1b { z6.b }, p2/Z, [x14, #-2, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #-2, MUL VL]\n"
"sdot z9.s, z7.b, z0.b[3]\n"
"sdot z13.s, z7.b, z1.b[3]\n"
- "ld1b { z7.b }, p2/Z, [x14, #-1, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #-1, MUL VL]\n"
"sdot z10.s, z6.b, z0.b[3]\n"
"sdot z14.s, z6.b, z1.b[3]\n"
"sdot z11.s, z7.b, z0.b[3]\n"
"sdot z15.s, z7.b, z1.b[3]\n"
- "bgt 22b\n"
- "23:" // Height 2: Multiply loop: Single iteration only
- "ld1b { z6.b }, p2/Z, [x14]\n"
- "whilelt p0.b, XZR, x11\n"
- "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
- "subs x11, x11, #0x4\n"
- "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "bgt 20b\n"
+ "21:" // Height 2: Multiply loop: Single iteration only
+ "ld1b { z6.b }, p2/Z, [x28]\n"
+ "whilelt p0.b, XZR, x26\n"
+ "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "subs x26, x26, #0x4\n"
+ "ld1rqb { z0.b }, p0/Z, [x25]\n"
"sdot z8.s, z6.b, z0.b[0]\n"
- "ld1rqb { z1.b }, p0/Z, [x28]\n"
- "add x10, x10, #0x10\n"
+ "ld1rqb { z1.b }, p0/Z, [x24]\n"
+ "add x25, x25, #0x10\n"
"sdot z9.s, z7.b, z0.b[0]\n"
- "add x28, x28, #0x10\n"
+ "add x24, x24, #0x10\n"
"sdot z12.s, z6.b, z1.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
"sdot z13.s, z7.b, z1.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
+ "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
"sdot z10.s, z6.b, z0.b[0]\n"
"sdot z14.s, z6.b, z1.b[0]\n"
"sdot z11.s, z7.b, z0.b[0]\n"
"sdot z15.s, z7.b, z1.b[0]\n"
- "ble 24f\n"
- "ld1b { z6.b }, p2/Z, [x14]\n"
+ "ble 22f\n"
+ "ld1b { z6.b }, p2/Z, [x28]\n"
"sdot z8.s, z6.b, z0.b[1]\n"
- "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
- "subs x11, x11, #0x4\n"
+ "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "subs x26, x26, #0x4\n"
"sdot z12.s, z6.b, z1.b[1]\n"
- "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
"sdot z9.s, z7.b, z0.b[1]\n"
"sdot z13.s, z7.b, z1.b[1]\n"
- "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
+ "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
"sdot z10.s, z6.b, z0.b[1]\n"
"sdot z14.s, z6.b, z1.b[1]\n"
"sdot z11.s, z7.b, z0.b[1]\n"
"sdot z15.s, z7.b, z1.b[1]\n"
- "ble 24f\n"
- "ld1b { z6.b }, p2/Z, [x14]\n"
+ "ble 22f\n"
+ "ld1b { z6.b }, p2/Z, [x28]\n"
"sdot z8.s, z6.b, z0.b[2]\n"
- "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
- "subs x11, x11, #0x4\n"
+ "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "subs x26, x26, #0x4\n"
"sdot z12.s, z6.b, z1.b[2]\n"
- "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
"sdot z9.s, z7.b, z0.b[2]\n"
"sdot z13.s, z7.b, z1.b[2]\n"
- "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
+ "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
"sdot z10.s, z6.b, z0.b[2]\n"
"sdot z14.s, z6.b, z1.b[2]\n"
"sdot z11.s, z7.b, z0.b[2]\n"
"sdot z15.s, z7.b, z1.b[2]\n"
- "ble 24f\n"
- "ld1b { z6.b }, p2/Z, [x14]\n"
+ "ble 22f\n"
+ "ld1b { z6.b }, p2/Z, [x28]\n"
"sdot z8.s, z6.b, z0.b[3]\n"
- "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n"
"sdot z12.s, z6.b, z1.b[3]\n"
- "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
"sdot z9.s, z7.b, z0.b[3]\n"
"sdot z13.s, z7.b, z1.b[3]\n"
- "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
+ "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
"sdot z10.s, z6.b, z0.b[3]\n"
"sdot z14.s, z6.b, z1.b[3]\n"
"sdot z11.s, z7.b, z0.b[3]\n"
"sdot z15.s, z7.b, z1.b[3]\n"
- "24:" // Height 2: Multiply loop: multiply skip
- "prfm pldl1keep, [x10, #0x80]\n"
- "add x12, x12, #0x1\n"
- "prfm pldl1keep, [x28, #0x80]\n"
+ "22:" // Height 2: Multiply loop: multiply skip
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x27, x27, #0x1\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x12, x19\n"
- "bne 19b\n"
- "ld1w { z0.s }, p2/Z, [x16]\n"
+ "cmp x27, x19\n"
+ "bne 17b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ld1w { z0.s }, p2/Z, [x11]\n"
"add z8.s, z8.s, z0.s\n"
- "ld1w { z1.s }, p2/Z, [x16, #1, MUL VL]\n"
+ "ld1w { z1.s }, p2/Z, [x11, #1, MUL VL]\n"
+ "add x23, x9, x19\n"
"add z12.s, z12.s, z0.s\n"
- "ld1w { z2.s }, p2/Z, [x16, #2, MUL VL]\n"
- "ld1w { z3.s }, p2/Z, [x16, #3, MUL VL]\n"
+ "ld1w { z2.s }, p2/Z, [x11, #2, MUL VL]\n"
+ "ld1w { z3.s }, p2/Z, [x11, #3, MUL VL]\n"
"add z9.s, z9.s, z1.s\n"
- "addvl x16, x16, #4\n"
+ "addvl x11, x11, #4\n"
"add z13.s, z13.s, z1.s\n"
"add z10.s, z10.s, z2.s\n"
"add z11.s, z11.s, z3.s\n"
"add z14.s, z14.s, z2.s\n"
"add z15.s, z15.s, z3.s\n"
- "tbz %x[flags], #4, 25f\n"
- "ld1w { z0.s }, p2/Z, [x17]\n"
- "ld1w { z4.s }, p2/Z, [x8]\n"
- "ld1w { z1.s }, p2/Z, [x17, #1, MUL VL]\n"
- "ld1w { z5.s }, p2/Z, [x8, #1, MUL VL]\n"
- "ld1w { z2.s }, p2/Z, [x17, #2, MUL VL]\n"
- "ld1w { z6.s }, p2/Z, [x8, #2, MUL VL]\n"
- "ld1w { z3.s }, p2/Z, [x17, #3, MUL VL]\n"
- "addvl x17, x17, #4\n"
- "ld1w { z7.s }, p2/Z, [x8, #3, MUL VL]\n"
- "addvl x8, x8, #4\n"
- "b 26f\n"
- "25:" // Height 2: per layer parameters
- "add x19, %x[qp], %[per_layer_right_shift]\n"
- "ld1rw { z0.s }, p2/Z, [x19]\n"
+ "tbz %x[flags], #4, 23f\n"
+ "ld1w { z0.s }, p2/Z, [x12]\n"
+ "ld1w { z4.s }, p2/Z, [x13]\n"
+ "ld1w { z1.s }, p2/Z, [x12, #1, MUL VL]\n"
+ "ld1w { z5.s }, p2/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z2.s }, p2/Z, [x12, #2, MUL VL]\n"
+ "ld1w { z6.s }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z3.s }, p2/Z, [x12, #3, MUL VL]\n"
+ "addvl x12, x12, #4\n"
+ "ld1w { z7.s }, p2/Z, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "b 24f\n"
+ "23:" // Height 2: per layer parameters
+ "add x24, %x[qp], %[per_layer_right_shift]\n"
+ "ld1rw { z0.s }, p2/Z, [x24]\n"
"mov z1.d, z0.d\n"
- "add x19, %x[qp], %[per_layer_mul]\n"
- "ld1rw { z4.s }, p2/Z, [x19]\n"
+ "add x24, %x[qp], %[per_layer_mul]\n"
+ "ld1rw { z4.s }, p2/Z, [x24]\n"
"mov z2.d, z0.d\n"
"mov z3.d, z0.d\n"
"mov z5.d, z4.d\n"
"mov z6.d, z4.d\n"
"mov z7.d, z4.d\n"
- "26:" // Height 2: parameters loaded
+ "24:" // Height 2: parameters loaded
".inst 0x04a47508 // sqrdmulh z8.s, z8.s, z4.s\n"
".inst 0x04a57529 // sqrdmulh z9.s, z9.s, z5.s\n"
".inst 0x04a6754a // sqrdmulh z10.s, z10.s, z6.s\n"
@@ -534,7 +521,7 @@ void sve_hybrid_s8qs_dot_6x4VL (
".inst 0x04a575ad // sqrdmulh z13.s, z13.s, z5.s\n"
".inst 0x04a675ce // sqrdmulh z14.s, z14.s, z6.s\n"
".inst 0x04a775ef // sqrdmulh z15.s, z15.s, z7.s\n"
- "tbz %x[flags], #5, 27f\n"
+ "tbz %x[flags], #5, 25f\n"
"and z4.d, z8.d, z0.d\n"
"asr z4.s, z4.s, #0x1f\n"
"and z5.d, z9.d, z1.d\n"
@@ -559,17 +546,17 @@ void sve_hybrid_s8qs_dot_6x4VL (
"sqadd z13.s, z13.s, z5.s\n"
"sqadd z14.s, z14.s, z6.s\n"
"sqadd z15.s, z15.s, z7.s\n"
- "27:" // Height 2: no shift correction
+ "25:" // Height 2: no shift correction
".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n"
- "add x19, %x[qp], %[c_offset]\n"
- "ld1rw { z4.s }, p2/Z, [x19]\n"
+ "add x24, %x[qp], %[c_offset]\n"
+ "ld1rw { z4.s }, p2/Z, [x24]\n"
".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n"
- "add x19, %x[qp], %[minval]\n"
+ "add x24, %x[qp], %[minval]\n"
".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n"
- "ld1rw { z5.s }, p2/Z, [x19]\n"
- "add x19, %x[qp], %[maxval]\n"
+ "ld1rw { z5.s }, p2/Z, [x24]\n"
+ "add x24, %x[qp], %[maxval]\n"
".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n"
- "ld1rw { z6.s }, p2/Z, [x19]\n"
+ "ld1rw { z6.s }, p2/Z, [x24]\n"
".inst 0x4482880c // srshl z12.s, p2/M, z12.s, z0.s\n"
"add z8.s, z8.s, z4.s\n"
"add z9.s, z9.s, z4.s\n"
@@ -590,9 +577,9 @@ void sve_hybrid_s8qs_dot_6x4VL (
"uzp1 z9.h, z10.h, z11.h\n"
"smax z12.s, p2/M, z12.s, z5.s\n"
"uzp1 z8.b, z8.b, z9.b\n"
- "st1b { z8.b }, p1, [x13]\n"
+ "st1b { z8.b }, p1, [x9]\n"
"add z13.s, z13.s, z4.s\n"
- "addvl x13, x13, #1\n"
+ "addvl x9, x9, #1\n"
".inst 0x4482884e // srshl z14.s, p2/M, z14.s, z2.s\n"
".inst 0x4482886f // srshl z15.s, p2/M, z15.s, z3.s\n"
"smin z13.s, p2/M, z13.s, z6.s\n"
@@ -606,37 +593,24 @@ void sve_hybrid_s8qs_dot_6x4VL (
"smax z15.s, p2/M, z15.s, z5.s\n"
"uzp1 z13.h, z14.h, z15.h\n"
"uzp1 z12.b, z12.b, z13.b\n"
- "st1b { z12.b }, p1, [x9]\n"
- "addvl x9, x9, #1\n"
- "28:" // Height 2: Writeback done
- "decw x15, ALL, MUL #4\n"
- "cmp x15, XZR\n"
- "bgt 17b\n"
- "b 86f\n"
- "29:" // Height 3
- "ldr x8, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
- "mov x16, %x[col_bias]\n"
- "ldr x17, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "tbz %x[flags], #2, 30f\n"
- "ldr x13, [%x[output_ptr], #0x0]\n"
- "add x13, x13, x19\n"
- "ldr x9, [%x[output_ptr], #0x8]\n"
- "ldr x27, [%x[output_ptr], #0x10]\n"
- "add x9, x9, x19\n"
- "add x27, x27, x19\n"
- "b 31f\n"
- "30:" // Height 3: setup direct output
- "mov x13, %x[output_ptr]\n"
- "add x9, x13, x19\n"
- "add x27, x9, x19\n"
- "31:" // Height 3: Column loop
+ "st1b { z12.b }, p1, [x23]\n"
+ "26:" // Height 2: Writeback done
+ "decw x10, ALL, MUL #4\n"
+ "cmp x10, XZR\n"
+ "bgt 15b\n"
+ "b 80f\n"
+ "27:" // Height 3
+ "ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
+ "mov x11, %x[col_bias]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "28:" // Height 3: Column loop
"mov z8.s, #0x0\n"
"mov x19, #0x0\n"
"mov z9.s, #0x0\n"
- "whilelt p1.b, x19, x15\n"
+ "whilelt p1.b, x19, x10\n"
"mov z10.s, #0x0\n"
"mov z11.s, #0x0\n"
"mov z12.s, #0x0\n"
@@ -647,208 +621,211 @@ void sve_hybrid_s8qs_dot_6x4VL (
"mov z17.s, #0x0\n"
"mov z18.s, #0x0\n"
"mov z19.s, #0x0\n"
- "32:" // Height 3: setup done
- "mov x12, #0x0\n"
- "33:" // Height 3: String loop
+ "29:" // Height 3: setup done
+ "mov x27, #0x0\n"
+ "30:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w11, [x20, x12, LSL #0x2]\n"
- "tbz %x[flags], #3, 34f\n"
- "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 31f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x10, [x20, #0x0]\n"
- "ldr x28, [x20, #0x8]\n"
- "ldr x26, [x20, #0x10]\n"
- "cbnz x12, 35f\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "cbnz x27, 32f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x10, x10, x19\n"
- "add x28, x28, x19\n"
- "add x26, x26, x19\n"
- "b 35f\n"
- "34:" // Height 3: setup direct input
- "mov x10, %x[input_ptr]\n"
- "add x28, x10, x19\n"
- "add x26, x28, x19\n"
- "35:" // Height 3: input setup done
- "cmp x11, #0x10\n"
- "ble 37f\n"
- "36:" // Height 3: Multiply loop: Main loop head
- "ld1b { z6.b }, p2/Z, [x14]\n"
- "whilelt p0.b, XZR, x11\n"
- "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
- "sub x11, x11, #0x10\n"
- "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "add x25, x25, x19\n"
+ "add x24, x24, x19\n"
+ "add x23, x23, x19\n"
+ "b 32f\n"
+ "31:" // Height 3: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19\n"
+ "add x23, x24, x19\n"
+ "32:" // Height 3: input setup done
+ "cmp x26, #0x10\n"
+ "ble 34f\n"
+ "33:" // Height 3: Multiply loop: Main loop head
+ "ld1b { z6.b }, p2/Z, [x28]\n"
+ "whilelt p0.b, XZR, x26\n"
+ "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "sub x26, x26, #0x10\n"
+ "ld1rqb { z0.b }, p0/Z, [x25]\n"
"sdot z8.s, z6.b, z0.b[0]\n"
- "ld1rqb { z1.b }, p0/Z, [x28]\n"
- "add x10, x10, #0x10\n"
+ "ld1rqb { z1.b }, p0/Z, [x24]\n"
+ "add x25, x25, #0x10\n"
"sdot z9.s, z7.b, z0.b[0]\n"
- "ld1rqb { z2.b }, p0/Z, [x26]\n"
- "add x28, x28, #0x10\n"
+ "ld1rqb { z2.b }, p0/Z, [x23]\n"
+ "add x24, x24, #0x10\n"
"sdot z12.s, z6.b, z1.b[0]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
- "add x26, x26, #0x10\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x23, x23, #0x10\n"
"sdot z16.s, z6.b, z2.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
- "cmp x11, #0x10\n"
+ "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "cmp x26, #0x10\n"
"sdot z13.s, z7.b, z1.b[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
"sdot z17.s, z7.b, z2.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
"sdot z10.s, z6.b, z0.b[0]\n"
"sdot z14.s, z6.b, z1.b[0]\n"
"sdot z18.s, z6.b, z2.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x14, #4, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #4, MUL VL]\n"
"sdot z11.s, z7.b, z0.b[0]\n"
"sdot z15.s, z7.b, z1.b[0]\n"
"sdot z19.s, z7.b, z2.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x14, #5, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #5, MUL VL]\n"
"sdot z8.s, z6.b, z0.b[1]\n"
"sdot z12.s, z6.b, z1.b[1]\n"
"sdot z16.s, z6.b, z2.b[1]\n"
- "ld1b { z6.b }, p2/Z, [x14, #6, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #6, MUL VL]\n"
"sdot z9.s, z7.b, z0.b[1]\n"
"sdot z13.s, z7.b, z1.b[1]\n"
"sdot z17.s, z7.b, z2.b[1]\n"
- "ld1b { z7.b }, p2/Z, [x14, #7, MUL VL]\n"
- "addvl x14, x14, #16\n"
+ "ld1b { z7.b }, p2/Z, [x28, #7, MUL VL]\n"
+ "addvl x28, x28, #16\n"
"sdot z10.s, z6.b, z0.b[1]\n"
"sdot z14.s, z6.b, z1.b[1]\n"
"sdot z18.s, z6.b, z2.b[1]\n"
- "ld1b { z6.b }, p2/Z, [x14, #-8, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #-8, MUL VL]\n"
"sdot z11.s, z7.b, z0.b[1]\n"
"sdot z15.s, z7.b, z1.b[1]\n"
"sdot z19.s, z7.b, z2.b[1]\n"
- "ld1b { z7.b }, p2/Z, [x14, #-7, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #-7, MUL VL]\n"
"sdot z8.s, z6.b, z0.b[2]\n"
"sdot z12.s, z6.b, z1.b[2]\n"
"sdot z16.s, z6.b, z2.b[2]\n"
- "ld1b { z6.b }, p2/Z, [x14, #-6, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #-6, MUL VL]\n"
"sdot z9.s, z7.b, z0.b[2]\n"
"sdot z13.s, z7.b, z1.b[2]\n"
"sdot z17.s, z7.b, z2.b[2]\n"
- "ld1b { z7.b }, p2/Z, [x14, #-5, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #-5, MUL VL]\n"
"sdot z10.s, z6.b, z0.b[2]\n"
"sdot z14.s, z6.b, z1.b[2]\n"
"sdot z18.s, z6.b, z2.b[2]\n"
- "ld1b { z6.b }, p2/Z, [x14, #-4, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #-4, MUL VL]\n"
"sdot z11.s, z7.b, z0.b[2]\n"
"sdot z15.s, z7.b, z1.b[2]\n"
"sdot z19.s, z7.b, z2.b[2]\n"
- "ld1b { z7.b }, p2/Z, [x14, #-3, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #-3, MUL VL]\n"
"sdot z8.s, z6.b, z0.b[3]\n"
"sdot z12.s, z6.b, z1.b[3]\n"
"sdot z16.s, z6.b, z2.b[3]\n"
- "ld1b { z6.b }, p2/Z, [x14, #-2, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #-2, MUL VL]\n"
"sdot z9.s, z7.b, z0.b[3]\n"
"sdot z13.s, z7.b, z1.b[3]\n"
"sdot z17.s, z7.b, z2.b[3]\n"
- "ld1b { z7.b }, p2/Z, [x14, #-1, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #-1, MUL VL]\n"
"sdot z10.s, z6.b, z0.b[3]\n"
"sdot z14.s, z6.b, z1.b[3]\n"
"sdot z18.s, z6.b, z2.b[3]\n"
"sdot z11.s, z7.b, z0.b[3]\n"
"sdot z15.s, z7.b, z1.b[3]\n"
"sdot z19.s, z7.b, z2.b[3]\n"
- "bgt 36b\n"
- "37:" // Height 3: Multiply loop: Single iteration only
- "ld1b { z6.b }, p2/Z, [x14]\n"
- "whilelt p0.b, XZR, x11\n"
- "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
- "subs x11, x11, #0x4\n"
- "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "bgt 33b\n"
+ "34:" // Height 3: Multiply loop: Single iteration only
+ "ld1b { z6.b }, p2/Z, [x28]\n"
+ "whilelt p0.b, XZR, x26\n"
+ "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "subs x26, x26, #0x4\n"
+ "ld1rqb { z0.b }, p0/Z, [x25]\n"
"sdot z8.s, z6.b, z0.b[0]\n"
- "ld1rqb { z1.b }, p0/Z, [x28]\n"
- "add x10, x10, #0x10\n"
+ "ld1rqb { z1.b }, p0/Z, [x24]\n"
+ "add x25, x25, #0x10\n"
"sdot z9.s, z7.b, z0.b[0]\n"
- "ld1rqb { z2.b }, p0/Z, [x26]\n"
- "add x28, x28, #0x10\n"
+ "ld1rqb { z2.b }, p0/Z, [x23]\n"
+ "add x24, x24, #0x10\n"
"sdot z12.s, z6.b, z1.b[0]\n"
- "add x26, x26, #0x10\n"
+ "add x23, x23, #0x10\n"
"sdot z13.s, z7.b, z1.b[0]\n"
"sdot z16.s, z6.b, z2.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
"sdot z17.s, z7.b, z2.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
+ "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
"sdot z10.s, z6.b, z0.b[0]\n"
"sdot z14.s, z6.b, z1.b[0]\n"
"sdot z18.s, z6.b, z2.b[0]\n"
"sdot z11.s, z7.b, z0.b[0]\n"
"sdot z15.s, z7.b, z1.b[0]\n"
"sdot z19.s, z7.b, z2.b[0]\n"
- "ble 38f\n"
- "ld1b { z6.b }, p2/Z, [x14]\n"
+ "ble 35f\n"
+ "ld1b { z6.b }, p2/Z, [x28]\n"
"sdot z8.s, z6.b, z0.b[1]\n"
- "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
- "subs x11, x11, #0x4\n"
+ "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "subs x26, x26, #0x4\n"
"sdot z12.s, z6.b, z1.b[1]\n"
"sdot z16.s, z6.b, z2.b[1]\n"
- "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
"sdot z9.s, z7.b, z0.b[1]\n"
"sdot z13.s, z7.b, z1.b[1]\n"
"sdot z17.s, z7.b, z2.b[1]\n"
- "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
+ "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
"sdot z10.s, z6.b, z0.b[1]\n"
"sdot z14.s, z6.b, z1.b[1]\n"
"sdot z18.s, z6.b, z2.b[1]\n"
"sdot z11.s, z7.b, z0.b[1]\n"
"sdot z15.s, z7.b, z1.b[1]\n"
"sdot z19.s, z7.b, z2.b[1]\n"
- "ble 38f\n"
- "ld1b { z6.b }, p2/Z, [x14]\n"
+ "ble 35f\n"
+ "ld1b { z6.b }, p2/Z, [x28]\n"
"sdot z8.s, z6.b, z0.b[2]\n"
- "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
- "subs x11, x11, #0x4\n"
+ "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "subs x26, x26, #0x4\n"
"sdot z12.s, z6.b, z1.b[2]\n"
"sdot z16.s, z6.b, z2.b[2]\n"
- "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
"sdot z9.s, z7.b, z0.b[2]\n"
"sdot z13.s, z7.b, z1.b[2]\n"
"sdot z17.s, z7.b, z2.b[2]\n"
- "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
+ "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
"sdot z10.s, z6.b, z0.b[2]\n"
"sdot z14.s, z6.b, z1.b[2]\n"
"sdot z18.s, z6.b, z2.b[2]\n"
"sdot z11.s, z7.b, z0.b[2]\n"
"sdot z15.s, z7.b, z1.b[2]\n"
"sdot z19.s, z7.b, z2.b[2]\n"
- "ble 38f\n"
- "ld1b { z6.b }, p2/Z, [x14]\n"
+ "ble 35f\n"
+ "ld1b { z6.b }, p2/Z, [x28]\n"
"sdot z8.s, z6.b, z0.b[3]\n"
- "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n"
"sdot z12.s, z6.b, z1.b[3]\n"
"sdot z16.s, z6.b, z2.b[3]\n"
- "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
"sdot z9.s, z7.b, z0.b[3]\n"
"sdot z13.s, z7.b, z1.b[3]\n"
"sdot z17.s, z7.b, z2.b[3]\n"
- "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
+ "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
"sdot z10.s, z6.b, z0.b[3]\n"
"sdot z14.s, z6.b, z1.b[3]\n"
"sdot z18.s, z6.b, z2.b[3]\n"
"sdot z11.s, z7.b, z0.b[3]\n"
"sdot z15.s, z7.b, z1.b[3]\n"
"sdot z19.s, z7.b, z2.b[3]\n"
- "38:" // Height 3: Multiply loop: multiply skip
- "prfm pldl1keep, [x10, #0x80]\n"
- "add x12, x12, #0x1\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "35:" // Height 3: Multiply loop: multiply skip
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x27, x27, #0x1\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x12, x19\n"
- "bne 33b\n"
- "ld1w { z0.s }, p2/Z, [x16]\n"
+ "cmp x27, x19\n"
+ "bne 30b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ld1w { z0.s }, p2/Z, [x11]\n"
"add z8.s, z8.s, z0.s\n"
- "ld1w { z1.s }, p2/Z, [x16, #1, MUL VL]\n"
+ "ld1w { z1.s }, p2/Z, [x11, #1, MUL VL]\n"
+ "add x23, x9, x19\n"
"add z12.s, z12.s, z0.s\n"
- "ld1w { z2.s }, p2/Z, [x16, #2, MUL VL]\n"
+ "ld1w { z2.s }, p2/Z, [x11, #2, MUL VL]\n"
+ "add x22, x23, x19\n"
"add z16.s, z16.s, z0.s\n"
- "ld1w { z3.s }, p2/Z, [x16, #3, MUL VL]\n"
- "addvl x16, x16, #4\n"
+ "ld1w { z3.s }, p2/Z, [x11, #3, MUL VL]\n"
+ "addvl x11, x11, #4\n"
"add z9.s, z9.s, z1.s\n"
"add z13.s, z13.s, z1.s\n"
"add z10.s, z10.s, z2.s\n"
@@ -858,30 +835,30 @@ void sve_hybrid_s8qs_dot_6x4VL (
"add z17.s, z17.s, z1.s\n"
"add z18.s, z18.s, z2.s\n"
"add z19.s, z19.s, z3.s\n"
- "tbz %x[flags], #4, 39f\n"
- "ld1w { z0.s }, p2/Z, [x17]\n"
- "ld1w { z4.s }, p2/Z, [x8]\n"
- "ld1w { z1.s }, p2/Z, [x17, #1, MUL VL]\n"
- "ld1w { z5.s }, p2/Z, [x8, #1, MUL VL]\n"
- "ld1w { z2.s }, p2/Z, [x17, #2, MUL VL]\n"
- "ld1w { z6.s }, p2/Z, [x8, #2, MUL VL]\n"
- "ld1w { z3.s }, p2/Z, [x17, #3, MUL VL]\n"
- "addvl x17, x17, #4\n"
- "ld1w { z7.s }, p2/Z, [x8, #3, MUL VL]\n"
- "addvl x8, x8, #4\n"
- "b 40f\n"
- "39:" // Height 3: per layer parameters
- "add x19, %x[qp], %[per_layer_right_shift]\n"
- "ld1rw { z0.s }, p2/Z, [x19]\n"
+ "tbz %x[flags], #4, 36f\n"
+ "ld1w { z0.s }, p2/Z, [x12]\n"
+ "ld1w { z4.s }, p2/Z, [x13]\n"
+ "ld1w { z1.s }, p2/Z, [x12, #1, MUL VL]\n"
+ "ld1w { z5.s }, p2/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z2.s }, p2/Z, [x12, #2, MUL VL]\n"
+ "ld1w { z6.s }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z3.s }, p2/Z, [x12, #3, MUL VL]\n"
+ "addvl x12, x12, #4\n"
+ "ld1w { z7.s }, p2/Z, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "b 37f\n"
+ "36:" // Height 3: per layer parameters
+ "add x24, %x[qp], %[per_layer_right_shift]\n"
+ "ld1rw { z0.s }, p2/Z, [x24]\n"
"mov z1.d, z0.d\n"
- "add x19, %x[qp], %[per_layer_mul]\n"
- "ld1rw { z4.s }, p2/Z, [x19]\n"
+ "add x24, %x[qp], %[per_layer_mul]\n"
+ "ld1rw { z4.s }, p2/Z, [x24]\n"
"mov z2.d, z0.d\n"
"mov z3.d, z0.d\n"
"mov z5.d, z4.d\n"
"mov z6.d, z4.d\n"
"mov z7.d, z4.d\n"
- "40:" // Height 3: parameters loaded
+ "37:" // Height 3: parameters loaded
".inst 0x04a47508 // sqrdmulh z8.s, z8.s, z4.s\n"
".inst 0x04a57529 // sqrdmulh z9.s, z9.s, z5.s\n"
".inst 0x04a6754a // sqrdmulh z10.s, z10.s, z6.s\n"
@@ -894,7 +871,7 @@ void sve_hybrid_s8qs_dot_6x4VL (
".inst 0x04a57631 // sqrdmulh z17.s, z17.s, z5.s\n"
".inst 0x04a67652 // sqrdmulh z18.s, z18.s, z6.s\n"
".inst 0x04a77673 // sqrdmulh z19.s, z19.s, z7.s\n"
- "tbz %x[flags], #5, 41f\n"
+ "tbz %x[flags], #5, 38f\n"
"and z4.d, z8.d, z0.d\n"
"asr z4.s, z4.s, #0x1f\n"
"and z5.d, z9.d, z1.d\n"
@@ -931,17 +908,17 @@ void sve_hybrid_s8qs_dot_6x4VL (
"sqadd z17.s, z17.s, z5.s\n"
"sqadd z18.s, z18.s, z6.s\n"
"sqadd z19.s, z19.s, z7.s\n"
- "41:" // Height 3: no shift correction
+ "38:" // Height 3: no shift correction
".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n"
- "add x19, %x[qp], %[c_offset]\n"
- "ld1rw { z4.s }, p2/Z, [x19]\n"
+ "add x24, %x[qp], %[c_offset]\n"
+ "ld1rw { z4.s }, p2/Z, [x24]\n"
".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n"
- "add x19, %x[qp], %[minval]\n"
+ "add x24, %x[qp], %[minval]\n"
".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n"
- "ld1rw { z5.s }, p2/Z, [x19]\n"
- "add x19, %x[qp], %[maxval]\n"
+ "ld1rw { z5.s }, p2/Z, [x24]\n"
+ "add x24, %x[qp], %[maxval]\n"
".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n"
- "ld1rw { z6.s }, p2/Z, [x19]\n"
+ "ld1rw { z6.s }, p2/Z, [x24]\n"
".inst 0x4482880c // srshl z12.s, p2/M, z12.s, z0.s\n"
"add z8.s, z8.s, z4.s\n"
"add z9.s, z9.s, z4.s\n"
@@ -962,9 +939,9 @@ void sve_hybrid_s8qs_dot_6x4VL (
"uzp1 z9.h, z10.h, z11.h\n"
"smax z12.s, p2/M, z12.s, z5.s\n"
"uzp1 z8.b, z8.b, z9.b\n"
- "st1b { z8.b }, p1, [x13]\n"
+ "st1b { z8.b }, p1, [x9]\n"
"add z13.s, z13.s, z4.s\n"
- "addvl x13, x13, #1\n"
+ "addvl x9, x9, #1\n"
".inst 0x4482884e // srshl z14.s, p2/M, z14.s, z2.s\n"
".inst 0x4482886f // srshl z15.s, p2/M, z15.s, z3.s\n"
".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
@@ -987,52 +964,35 @@ void sve_hybrid_s8qs_dot_6x4VL (
"uzp1 z13.h, z14.h, z15.h\n"
".inst 0x44828873 // srshl z19.s, p2/M, z19.s, z3.s\n"
"uzp1 z12.b, z12.b, z13.b\n"
- "st1b { z12.b }, p1, [x9]\n"
+ "st1b { z12.b }, p1, [x23]\n"
"add z18.s, z18.s, z4.s\n"
- "addvl x9, x9, #1\n"
- "add z19.s, z19.s, z4.s\n"
"smax z17.s, p2/M, z17.s, z5.s\n"
+ "add z19.s, z19.s, z4.s\n"
"smin z18.s, p2/M, z18.s, z6.s\n"
- "smin z19.s, p2/M, z19.s, z6.s\n"
"uzp1 z16.h, z16.h, z17.h\n"
+ "smin z19.s, p2/M, z19.s, z6.s\n"
"smax z18.s, p2/M, z18.s, z5.s\n"
"smax z19.s, p2/M, z19.s, z5.s\n"
"uzp1 z17.h, z18.h, z19.h\n"
"uzp1 z16.b, z16.b, z17.b\n"
- "st1b { z16.b }, p1, [x27]\n"
- "addvl x27, x27, #1\n"
- "42:" // Height 3: Writeback done
- "decw x15, ALL, MUL #4\n"
- "cmp x15, XZR\n"
- "bgt 31b\n"
- "b 86f\n"
- "43:" // Height 4
- "ldr x8, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
- "mov x16, %x[col_bias]\n"
- "ldr x17, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "tbz %x[flags], #2, 44f\n"
- "ldr x13, [%x[output_ptr], #0x0]\n"
- "add x13, x13, x19\n"
- "ldr x9, [%x[output_ptr], #0x8]\n"
- "ldr x27, [%x[output_ptr], #0x10]\n"
- "add x9, x9, x19\n"
- "ldr x25, [%x[output_ptr], #0x18]\n"
- "add x27, x27, x19\n"
- "add x25, x25, x19\n"
- "b 45f\n"
- "44:" // Height 4: setup direct output
- "mov x13, %x[output_ptr]\n"
- "add x9, x13, x19\n"
- "add x27, x9, x19\n"
- "add x25, x27, x19\n"
- "45:" // Height 4: Column loop
+ "st1b { z16.b }, p1, [x22]\n"
+ "39:" // Height 3: Writeback done
+ "decw x10, ALL, MUL #4\n"
+ "cmp x10, XZR\n"
+ "bgt 28b\n"
+ "b 80f\n"
+ "40:" // Height 4
+ "ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
+ "mov x11, %x[col_bias]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "41:" // Height 4: Column loop
"mov z8.s, #0x0\n"
"mov x19, #0x0\n"
"mov z9.s, #0x0\n"
- "whilelt p1.b, x19, x15\n"
+ "whilelt p1.b, x19, x10\n"
"mov z10.s, #0x0\n"
"mov z11.s, #0x0\n"
"mov z12.s, #0x0\n"
@@ -1047,123 +1007,123 @@ void sve_hybrid_s8qs_dot_6x4VL (
"mov z21.s, #0x0\n"
"mov z22.s, #0x0\n"
"mov z23.s, #0x0\n"
- "46:" // Height 4: setup done
- "mov x12, #0x0\n"
- "47:" // Height 4: String loop
+ "42:" // Height 4: setup done
+ "mov x27, #0x0\n"
+ "43:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w11, [x20, x12, LSL #0x2]\n"
- "tbz %x[flags], #3, 48f\n"
- "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 44f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x10, [x20, #0x0]\n"
- "ldr x28, [x20, #0x8]\n"
- "ldr x26, [x20, #0x10]\n"
- "ldr x24, [x20, #0x18]\n"
- "cbnz x12, 49f\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "ldr x22, [x20, #0x18]\n"
+ "cbnz x27, 45f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x10, x10, x19\n"
- "add x28, x28, x19\n"
- "add x26, x26, x19\n"
+ "add x25, x25, x19\n"
"add x24, x24, x19\n"
- "b 49f\n"
- "48:" // Height 4: setup direct input
- "mov x10, %x[input_ptr]\n"
- "add x28, x10, x19\n"
- "add x26, x28, x19\n"
- "add x24, x26, x19\n"
- "49:" // Height 4: input setup done
- "cmp x11, #0x10\n"
- "ble 51f\n"
- "50:" // Height 4: Multiply loop: Main loop head
- "ld1b { z6.b }, p2/Z, [x14]\n"
- "whilelt p0.b, XZR, x11\n"
- "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
- "sub x11, x11, #0x10\n"
- "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "add x23, x23, x19\n"
+ "add x22, x22, x19\n"
+ "b 45f\n"
+ "44:" // Height 4: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19\n"
+ "add x23, x24, x19\n"
+ "add x22, x23, x19\n"
+ "45:" // Height 4: input setup done
+ "cmp x26, #0x10\n"
+ "ble 47f\n"
+ "46:" // Height 4: Multiply loop: Main loop head
+ "ld1b { z6.b }, p2/Z, [x28]\n"
+ "whilelt p0.b, XZR, x26\n"
+ "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "sub x26, x26, #0x10\n"
+ "ld1rqb { z0.b }, p0/Z, [x25]\n"
"sdot z8.s, z6.b, z0.b[0]\n"
- "ld1rqb { z1.b }, p0/Z, [x28]\n"
- "add x10, x10, #0x10\n"
+ "ld1rqb { z1.b }, p0/Z, [x24]\n"
+ "add x25, x25, #0x10\n"
"sdot z9.s, z7.b, z0.b[0]\n"
- "ld1rqb { z2.b }, p0/Z, [x26]\n"
- "add x28, x28, #0x10\n"
+ "ld1rqb { z2.b }, p0/Z, [x23]\n"
+ "add x24, x24, #0x10\n"
"sdot z12.s, z6.b, z1.b[0]\n"
- "ld1rqb { z3.b }, p0/Z, [x24]\n"
- "add x26, x26, #0x10\n"
+ "ld1rqb { z3.b }, p0/Z, [x22]\n"
+ "add x23, x23, #0x10\n"
"sdot z16.s, z6.b, z2.b[0]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
- "add x24, x24, #0x10\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x22, x22, #0x10\n"
"sdot z13.s, z7.b, z1.b[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "cmp x11, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "cmp x26, #0x10\n"
"sdot z20.s, z6.b, z3.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
"sdot z17.s, z7.b, z2.b[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
"sdot z21.s, z7.b, z3.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
"sdot z10.s, z6.b, z0.b[0]\n"
"sdot z14.s, z6.b, z1.b[0]\n"
"sdot z18.s, z6.b, z2.b[0]\n"
"sdot z22.s, z6.b, z3.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x14, #4, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #4, MUL VL]\n"
"sdot z11.s, z7.b, z0.b[0]\n"
"sdot z15.s, z7.b, z1.b[0]\n"
"sdot z19.s, z7.b, z2.b[0]\n"
"sdot z23.s, z7.b, z3.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x14, #5, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #5, MUL VL]\n"
"sdot z8.s, z6.b, z0.b[1]\n"
"sdot z12.s, z6.b, z1.b[1]\n"
"sdot z16.s, z6.b, z2.b[1]\n"
"sdot z20.s, z6.b, z3.b[1]\n"
- "ld1b { z6.b }, p2/Z, [x14, #6, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #6, MUL VL]\n"
"sdot z9.s, z7.b, z0.b[1]\n"
"sdot z13.s, z7.b, z1.b[1]\n"
"sdot z17.s, z7.b, z2.b[1]\n"
"sdot z21.s, z7.b, z3.b[1]\n"
- "ld1b { z7.b }, p2/Z, [x14, #7, MUL VL]\n"
- "addvl x14, x14, #16\n"
+ "ld1b { z7.b }, p2/Z, [x28, #7, MUL VL]\n"
+ "addvl x28, x28, #16\n"
"sdot z10.s, z6.b, z0.b[1]\n"
"sdot z14.s, z6.b, z1.b[1]\n"
"sdot z18.s, z6.b, z2.b[1]\n"
"sdot z22.s, z6.b, z3.b[1]\n"
- "ld1b { z6.b }, p2/Z, [x14, #-8, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #-8, MUL VL]\n"
"sdot z11.s, z7.b, z0.b[1]\n"
"sdot z15.s, z7.b, z1.b[1]\n"
"sdot z19.s, z7.b, z2.b[1]\n"
"sdot z23.s, z7.b, z3.b[1]\n"
- "ld1b { z7.b }, p2/Z, [x14, #-7, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #-7, MUL VL]\n"
"sdot z8.s, z6.b, z0.b[2]\n"
"sdot z12.s, z6.b, z1.b[2]\n"
"sdot z16.s, z6.b, z2.b[2]\n"
"sdot z20.s, z6.b, z3.b[2]\n"
- "ld1b { z6.b }, p2/Z, [x14, #-6, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #-6, MUL VL]\n"
"sdot z9.s, z7.b, z0.b[2]\n"
"sdot z13.s, z7.b, z1.b[2]\n"
"sdot z17.s, z7.b, z2.b[2]\n"
"sdot z21.s, z7.b, z3.b[2]\n"
- "ld1b { z7.b }, p2/Z, [x14, #-5, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #-5, MUL VL]\n"
"sdot z10.s, z6.b, z0.b[2]\n"
"sdot z14.s, z6.b, z1.b[2]\n"
"sdot z18.s, z6.b, z2.b[2]\n"
"sdot z22.s, z6.b, z3.b[2]\n"
- "ld1b { z6.b }, p2/Z, [x14, #-4, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #-4, MUL VL]\n"
"sdot z11.s, z7.b, z0.b[2]\n"
"sdot z15.s, z7.b, z1.b[2]\n"
"sdot z19.s, z7.b, z2.b[2]\n"
"sdot z23.s, z7.b, z3.b[2]\n"
- "ld1b { z7.b }, p2/Z, [x14, #-3, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #-3, MUL VL]\n"
"sdot z8.s, z6.b, z0.b[3]\n"
"sdot z12.s, z6.b, z1.b[3]\n"
"sdot z16.s, z6.b, z2.b[3]\n"
"sdot z20.s, z6.b, z3.b[3]\n"
- "ld1b { z6.b }, p2/Z, [x14, #-2, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #-2, MUL VL]\n"
"sdot z9.s, z7.b, z0.b[3]\n"
"sdot z13.s, z7.b, z1.b[3]\n"
"sdot z17.s, z7.b, z2.b[3]\n"
"sdot z21.s, z7.b, z3.b[3]\n"
- "ld1b { z7.b }, p2/Z, [x14, #-1, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #-1, MUL VL]\n"
"sdot z10.s, z6.b, z0.b[3]\n"
"sdot z14.s, z6.b, z1.b[3]\n"
"sdot z18.s, z6.b, z2.b[3]\n"
@@ -1172,31 +1132,31 @@ void sve_hybrid_s8qs_dot_6x4VL (
"sdot z15.s, z7.b, z1.b[3]\n"
"sdot z19.s, z7.b, z2.b[3]\n"
"sdot z23.s, z7.b, z3.b[3]\n"
- "bgt 50b\n"
- "51:" // Height 4: Multiply loop: Single iteration only
- "ld1b { z6.b }, p2/Z, [x14]\n"
- "whilelt p0.b, XZR, x11\n"
- "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
- "subs x11, x11, #0x4\n"
- "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "bgt 46b\n"
+ "47:" // Height 4: Multiply loop: Single iteration only
+ "ld1b { z6.b }, p2/Z, [x28]\n"
+ "whilelt p0.b, XZR, x26\n"
+ "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "subs x26, x26, #0x4\n"
+ "ld1rqb { z0.b }, p0/Z, [x25]\n"
"sdot z8.s, z6.b, z0.b[0]\n"
- "ld1rqb { z1.b }, p0/Z, [x28]\n"
- "add x10, x10, #0x10\n"
+ "ld1rqb { z1.b }, p0/Z, [x24]\n"
+ "add x25, x25, #0x10\n"
"sdot z9.s, z7.b, z0.b[0]\n"
- "ld1rqb { z2.b }, p0/Z, [x26]\n"
- "add x28, x28, #0x10\n"
+ "ld1rqb { z2.b }, p0/Z, [x23]\n"
+ "add x24, x24, #0x10\n"
"sdot z12.s, z6.b, z1.b[0]\n"
- "ld1rqb { z3.b }, p0/Z, [x24]\n"
- "add x26, x26, #0x10\n"
+ "ld1rqb { z3.b }, p0/Z, [x22]\n"
+ "add x23, x23, #0x10\n"
"sdot z16.s, z6.b, z2.b[0]\n"
- "add x24, x24, #0x10\n"
+ "add x22, x22, #0x10\n"
"sdot z13.s, z7.b, z1.b[0]\n"
"sdot z17.s, z7.b, z2.b[0]\n"
"sdot z20.s, z6.b, z3.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
"sdot z21.s, z7.b, z3.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
+ "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
"sdot z10.s, z6.b, z0.b[0]\n"
"sdot z14.s, z6.b, z1.b[0]\n"
"sdot z18.s, z6.b, z2.b[0]\n"
@@ -1205,21 +1165,21 @@ void sve_hybrid_s8qs_dot_6x4VL (
"sdot z15.s, z7.b, z1.b[0]\n"
"sdot z19.s, z7.b, z2.b[0]\n"
"sdot z23.s, z7.b, z3.b[0]\n"
- "ble 52f\n"
- "ld1b { z6.b }, p2/Z, [x14]\n"
+ "ble 48f\n"
+ "ld1b { z6.b }, p2/Z, [x28]\n"
"sdot z8.s, z6.b, z0.b[1]\n"
- "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
- "subs x11, x11, #0x4\n"
+ "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "subs x26, x26, #0x4\n"
"sdot z12.s, z6.b, z1.b[1]\n"
"sdot z16.s, z6.b, z2.b[1]\n"
"sdot z20.s, z6.b, z3.b[1]\n"
- "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
"sdot z9.s, z7.b, z0.b[1]\n"
"sdot z13.s, z7.b, z1.b[1]\n"
"sdot z17.s, z7.b, z2.b[1]\n"
"sdot z21.s, z7.b, z3.b[1]\n"
- "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
+ "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
"sdot z10.s, z6.b, z0.b[1]\n"
"sdot z14.s, z6.b, z1.b[1]\n"
"sdot z18.s, z6.b, z2.b[1]\n"
@@ -1228,21 +1188,21 @@ void sve_hybrid_s8qs_dot_6x4VL (
"sdot z15.s, z7.b, z1.b[1]\n"
"sdot z19.s, z7.b, z2.b[1]\n"
"sdot z23.s, z7.b, z3.b[1]\n"
- "ble 52f\n"
- "ld1b { z6.b }, p2/Z, [x14]\n"
+ "ble 48f\n"
+ "ld1b { z6.b }, p2/Z, [x28]\n"
"sdot z8.s, z6.b, z0.b[2]\n"
- "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
- "subs x11, x11, #0x4\n"
+ "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "subs x26, x26, #0x4\n"
"sdot z12.s, z6.b, z1.b[2]\n"
"sdot z16.s, z6.b, z2.b[2]\n"
"sdot z20.s, z6.b, z3.b[2]\n"
- "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
"sdot z9.s, z7.b, z0.b[2]\n"
"sdot z13.s, z7.b, z1.b[2]\n"
"sdot z17.s, z7.b, z2.b[2]\n"
"sdot z21.s, z7.b, z3.b[2]\n"
- "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
+ "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
"sdot z10.s, z6.b, z0.b[2]\n"
"sdot z14.s, z6.b, z1.b[2]\n"
"sdot z18.s, z6.b, z2.b[2]\n"
@@ -1251,20 +1211,20 @@ void sve_hybrid_s8qs_dot_6x4VL (
"sdot z15.s, z7.b, z1.b[2]\n"
"sdot z19.s, z7.b, z2.b[2]\n"
"sdot z23.s, z7.b, z3.b[2]\n"
- "ble 52f\n"
- "ld1b { z6.b }, p2/Z, [x14]\n"
+ "ble 48f\n"
+ "ld1b { z6.b }, p2/Z, [x28]\n"
"sdot z8.s, z6.b, z0.b[3]\n"
- "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n"
"sdot z12.s, z6.b, z1.b[3]\n"
"sdot z16.s, z6.b, z2.b[3]\n"
"sdot z20.s, z6.b, z3.b[3]\n"
- "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
"sdot z9.s, z7.b, z0.b[3]\n"
"sdot z13.s, z7.b, z1.b[3]\n"
"sdot z17.s, z7.b, z2.b[3]\n"
"sdot z21.s, z7.b, z3.b[3]\n"
- "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
+ "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
"sdot z10.s, z6.b, z0.b[3]\n"
"sdot z14.s, z6.b, z1.b[3]\n"
"sdot z18.s, z6.b, z2.b[3]\n"
@@ -1273,24 +1233,28 @@ void sve_hybrid_s8qs_dot_6x4VL (
"sdot z15.s, z7.b, z1.b[3]\n"
"sdot z19.s, z7.b, z2.b[3]\n"
"sdot z23.s, z7.b, z3.b[3]\n"
- "52:" // Height 4: Multiply loop: multiply skip
- "prfm pldl1keep, [x10, #0x80]\n"
- "add x12, x12, #0x1\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "48:" // Height 4: Multiply loop: multiply skip
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x27, x27, #0x1\n"
"prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x12, x19\n"
- "bne 47b\n"
- "ld1w { z0.s }, p2/Z, [x16]\n"
+ "cmp x27, x19\n"
+ "bne 43b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ld1w { z0.s }, p2/Z, [x11]\n"
"add z8.s, z8.s, z0.s\n"
- "ld1w { z1.s }, p2/Z, [x16, #1, MUL VL]\n"
+ "ld1w { z1.s }, p2/Z, [x11, #1, MUL VL]\n"
+ "add x23, x9, x19\n"
"add z12.s, z12.s, z0.s\n"
- "ld1w { z2.s }, p2/Z, [x16, #2, MUL VL]\n"
+ "ld1w { z2.s }, p2/Z, [x11, #2, MUL VL]\n"
+ "add x22, x23, x19\n"
"add z16.s, z16.s, z0.s\n"
- "ld1w { z3.s }, p2/Z, [x16, #3, MUL VL]\n"
- "addvl x16, x16, #4\n"
+ "ld1w { z3.s }, p2/Z, [x11, #3, MUL VL]\n"
+ "add x21, x22, x19\n"
"add z9.s, z9.s, z1.s\n"
+ "addvl x11, x11, #4\n"
"add z13.s, z13.s, z1.s\n"
"add z10.s, z10.s, z2.s\n"
"add z11.s, z11.s, z3.s\n"
@@ -1303,30 +1267,30 @@ void sve_hybrid_s8qs_dot_6x4VL (
"add z21.s, z21.s, z1.s\n"
"add z22.s, z22.s, z2.s\n"
"add z23.s, z23.s, z3.s\n"
- "tbz %x[flags], #4, 53f\n"
- "ld1w { z0.s }, p2/Z, [x17]\n"
- "ld1w { z4.s }, p2/Z, [x8]\n"
- "ld1w { z1.s }, p2/Z, [x17, #1, MUL VL]\n"
- "ld1w { z5.s }, p2/Z, [x8, #1, MUL VL]\n"
- "ld1w { z2.s }, p2/Z, [x17, #2, MUL VL]\n"
- "ld1w { z6.s }, p2/Z, [x8, #2, MUL VL]\n"
- "ld1w { z3.s }, p2/Z, [x17, #3, MUL VL]\n"
- "addvl x17, x17, #4\n"
- "ld1w { z7.s }, p2/Z, [x8, #3, MUL VL]\n"
- "addvl x8, x8, #4\n"
- "b 54f\n"
- "53:" // Height 4: per layer parameters
- "add x19, %x[qp], %[per_layer_right_shift]\n"
- "ld1rw { z0.s }, p2/Z, [x19]\n"
+ "tbz %x[flags], #4, 49f\n"
+ "ld1w { z0.s }, p2/Z, [x12]\n"
+ "ld1w { z4.s }, p2/Z, [x13]\n"
+ "ld1w { z1.s }, p2/Z, [x12, #1, MUL VL]\n"
+ "ld1w { z5.s }, p2/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z2.s }, p2/Z, [x12, #2, MUL VL]\n"
+ "ld1w { z6.s }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z3.s }, p2/Z, [x12, #3, MUL VL]\n"
+ "addvl x12, x12, #4\n"
+ "ld1w { z7.s }, p2/Z, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "b 50f\n"
+ "49:" // Height 4: per layer parameters
+ "add x24, %x[qp], %[per_layer_right_shift]\n"
+ "ld1rw { z0.s }, p2/Z, [x24]\n"
"mov z1.d, z0.d\n"
- "add x19, %x[qp], %[per_layer_mul]\n"
- "ld1rw { z4.s }, p2/Z, [x19]\n"
+ "add x24, %x[qp], %[per_layer_mul]\n"
+ "ld1rw { z4.s }, p2/Z, [x24]\n"
"mov z2.d, z0.d\n"
"mov z3.d, z0.d\n"
"mov z5.d, z4.d\n"
"mov z6.d, z4.d\n"
"mov z7.d, z4.d\n"
- "54:" // Height 4: parameters loaded
+ "50:" // Height 4: parameters loaded
".inst 0x04a47508 // sqrdmulh z8.s, z8.s, z4.s\n"
".inst 0x04a57529 // sqrdmulh z9.s, z9.s, z5.s\n"
".inst 0x04a6754a // sqrdmulh z10.s, z10.s, z6.s\n"
@@ -1343,7 +1307,7 @@ void sve_hybrid_s8qs_dot_6x4VL (
".inst 0x04a576b5 // sqrdmulh z21.s, z21.s, z5.s\n"
".inst 0x04a676d6 // sqrdmulh z22.s, z22.s, z6.s\n"
".inst 0x04a776f7 // sqrdmulh z23.s, z23.s, z7.s\n"
- "tbz %x[flags], #5, 55f\n"
+ "tbz %x[flags], #5, 51f\n"
"and z4.d, z8.d, z0.d\n"
"asr z4.s, z4.s, #0x1f\n"
"and z5.d, z9.d, z1.d\n"
@@ -1392,17 +1356,17 @@ void sve_hybrid_s8qs_dot_6x4VL (
"sqadd z21.s, z21.s, z5.s\n"
"sqadd z22.s, z22.s, z6.s\n"
"sqadd z23.s, z23.s, z7.s\n"
- "55:" // Height 4: no shift correction
+ "51:" // Height 4: no shift correction
".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n"
- "add x19, %x[qp], %[c_offset]\n"
- "ld1rw { z4.s }, p2/Z, [x19]\n"
+ "add x24, %x[qp], %[c_offset]\n"
+ "ld1rw { z4.s }, p2/Z, [x24]\n"
".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n"
- "add x19, %x[qp], %[minval]\n"
+ "add x24, %x[qp], %[minval]\n"
".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n"
- "ld1rw { z5.s }, p2/Z, [x19]\n"
- "add x19, %x[qp], %[maxval]\n"
+ "ld1rw { z5.s }, p2/Z, [x24]\n"
+ "add x24, %x[qp], %[maxval]\n"
".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n"
- "ld1rw { z6.s }, p2/Z, [x19]\n"
+ "ld1rw { z6.s }, p2/Z, [x24]\n"
".inst 0x4482880c // srshl z12.s, p2/M, z12.s, z0.s\n"
"add z8.s, z8.s, z4.s\n"
"add z9.s, z9.s, z4.s\n"
@@ -1423,9 +1387,9 @@ void sve_hybrid_s8qs_dot_6x4VL (
"uzp1 z9.h, z10.h, z11.h\n"
"smax z12.s, p2/M, z12.s, z5.s\n"
"uzp1 z8.b, z8.b, z9.b\n"
- "st1b { z8.b }, p1, [x13]\n"
+ "st1b { z8.b }, p1, [x9]\n"
"add z13.s, z13.s, z4.s\n"
- "addvl x13, x13, #1\n"
+ "addvl x9, x9, #1\n"
".inst 0x4482884e // srshl z14.s, p2/M, z14.s, z2.s\n"
".inst 0x4482886f // srshl z15.s, p2/M, z15.s, z3.s\n"
".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
@@ -1448,76 +1412,55 @@ void sve_hybrid_s8qs_dot_6x4VL (
"uzp1 z13.h, z14.h, z15.h\n"
".inst 0x44828873 // srshl z19.s, p2/M, z19.s, z3.s\n"
"uzp1 z12.b, z12.b, z13.b\n"
- "st1b { z12.b }, p1, [x9]\n"
+ "st1b { z12.b }, p1, [x23]\n"
"add z18.s, z18.s, z4.s\n"
- "addvl x9, x9, #1\n"
- "add z19.s, z19.s, z4.s\n"
"smax z17.s, p2/M, z17.s, z5.s\n"
+ "add z19.s, z19.s, z4.s\n"
".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n"
"smin z18.s, p2/M, z18.s, z6.s\n"
- "smin z19.s, p2/M, z19.s, z6.s\n"
"uzp1 z16.h, z16.h, z17.h\n"
+ "smin z19.s, p2/M, z19.s, z6.s\n"
"add z20.s, z20.s, z4.s\n"
"smax z18.s, p2/M, z18.s, z5.s\n"
+ ".inst 0x44828835 // srshl z21.s, p2/M, z21.s, z1.s\n"
"smax z19.s, p2/M, z19.s, z5.s\n"
"smin z20.s, p2/M, z20.s, z6.s\n"
- ".inst 0x44828835 // srshl z21.s, p2/M, z21.s, z1.s\n"
".inst 0x44828856 // srshl z22.s, p2/M, z22.s, z2.s\n"
+ "add z21.s, z21.s, z4.s\n"
"uzp1 z17.h, z18.h, z19.h\n"
"smax z20.s, p2/M, z20.s, z5.s\n"
- "add z21.s, z21.s, z4.s\n"
"add z22.s, z22.s, z4.s\n"
"uzp1 z16.b, z16.b, z17.b\n"
- "st1b { z16.b }, p1, [x27]\n"
+ "st1b { z16.b }, p1, [x22]\n"
"smin z21.s, p2/M, z21.s, z6.s\n"
- "addvl x27, x27, #1\n"
"smin z22.s, p2/M, z22.s, z6.s\n"
".inst 0x44828877 // srshl z23.s, p2/M, z23.s, z3.s\n"
"smax z21.s, p2/M, z21.s, z5.s\n"
- "add z23.s, z23.s, z4.s\n"
"smax z22.s, p2/M, z22.s, z5.s\n"
+ "add z23.s, z23.s, z4.s\n"
"uzp1 z20.h, z20.h, z21.h\n"
"smin z23.s, p2/M, z23.s, z6.s\n"
"smax z23.s, p2/M, z23.s, z5.s\n"
"uzp1 z21.h, z22.h, z23.h\n"
"uzp1 z20.b, z20.b, z21.b\n"
- "st1b { z20.b }, p1, [x25]\n"
- "addvl x25, x25, #1\n"
- "56:" // Height 4: Writeback done
- "decw x15, ALL, MUL #4\n"
- "cmp x15, XZR\n"
- "bgt 45b\n"
- "b 86f\n"
- "57:" // Height 5
- "ldr x8, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
- "mov x16, %x[col_bias]\n"
- "ldr x17, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "tbz %x[flags], #2, 58f\n"
- "ldr x13, [%x[output_ptr], #0x0]\n"
- "add x13, x13, x19\n"
- "ldr x9, [%x[output_ptr], #0x8]\n"
- "ldr x27, [%x[output_ptr], #0x10]\n"
- "add x9, x9, x19\n"
- "ldr x25, [%x[output_ptr], #0x18]\n"
- "ldr x23, [%x[output_ptr], #0x20]\n"
- "add x27, x27, x19\n"
- "add x25, x25, x19\n"
- "add x23, x23, x19\n"
- "b 59f\n"
- "58:" // Height 5: setup direct output
- "mov x13, %x[output_ptr]\n"
- "add x9, x13, x19\n"
- "add x27, x9, x19\n"
- "add x25, x27, x19\n"
- "add x23, x25, x19\n"
- "59:" // Height 5: Column loop
+ "st1b { z20.b }, p1, [x21]\n"
+ "52:" // Height 4: Writeback done
+ "decw x10, ALL, MUL #4\n"
+ "cmp x10, XZR\n"
+ "bgt 41b\n"
+ "b 80f\n"
+ "53:" // Height 5
+ "ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
+ "mov x11, %x[col_bias]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "54:" // Height 5: Column loop
"mov z8.s, #0x0\n"
"mov x19, #0x0\n"
"mov z9.s, #0x0\n"
- "whilelt p1.b, x19, x15\n"
+ "whilelt p1.b, x19, x10\n"
"mov z10.s, #0x0\n"
"mov z11.s, #0x0\n"
"mov z12.s, #0x0\n"
@@ -1536,143 +1479,143 @@ void sve_hybrid_s8qs_dot_6x4VL (
"mov z25.s, #0x0\n"
"mov z26.s, #0x0\n"
"mov z27.s, #0x0\n"
- "60:" // Height 5: setup done
- "mov x12, #0x0\n"
- "61:" // Height 5: String loop
+ "55:" // Height 5: setup done
+ "mov x27, #0x0\n"
+ "56:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w11, [x20, x12, LSL #0x2]\n"
- "tbz %x[flags], #3, 62f\n"
- "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 57f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x10, [x20, #0x0]\n"
- "ldr x28, [x20, #0x8]\n"
- "ldr x26, [x20, #0x10]\n"
- "ldr x24, [x20, #0x18]\n"
- "ldr x22, [x20, #0x20]\n"
- "cbnz x12, 63f\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "ldr x22, [x20, #0x18]\n"
+ "ldr x21, [x20, #0x20]\n"
+ "cbnz x27, 58f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x10, x10, x19\n"
- "add x28, x28, x19\n"
- "add x26, x26, x19\n"
+ "add x25, x25, x19\n"
"add x24, x24, x19\n"
+ "add x23, x23, x19\n"
"add x22, x22, x19\n"
- "b 63f\n"
- "62:" // Height 5: setup direct input
- "mov x10, %x[input_ptr]\n"
- "add x28, x10, x19\n"
- "add x26, x28, x19\n"
- "add x24, x26, x19\n"
- "add x22, x24, x19\n"
- "63:" // Height 5: input setup done
- "cmp x11, #0x10\n"
- "ble 65f\n"
- "64:" // Height 5: Multiply loop: Main loop head
- "ld1b { z6.b }, p2/Z, [x14]\n"
- "whilelt p0.b, XZR, x11\n"
- "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
- "sub x11, x11, #0x10\n"
- "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "add x21, x21, x19\n"
+ "b 58f\n"
+ "57:" // Height 5: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19\n"
+ "add x23, x24, x19\n"
+ "add x22, x23, x19\n"
+ "add x21, x22, x19\n"
+ "58:" // Height 5: input setup done
+ "cmp x26, #0x10\n"
+ "ble 60f\n"
+ "59:" // Height 5: Multiply loop: Main loop head
+ "ld1b { z6.b }, p2/Z, [x28]\n"
+ "whilelt p0.b, XZR, x26\n"
+ "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "sub x26, x26, #0x10\n"
+ "ld1rqb { z0.b }, p0/Z, [x25]\n"
"sdot z8.s, z6.b, z0.b[0]\n"
- "ld1rqb { z1.b }, p0/Z, [x28]\n"
- "add x10, x10, #0x10\n"
+ "ld1rqb { z1.b }, p0/Z, [x24]\n"
+ "add x25, x25, #0x10\n"
"sdot z9.s, z7.b, z0.b[0]\n"
- "ld1rqb { z2.b }, p0/Z, [x26]\n"
- "add x28, x28, #0x10\n"
+ "ld1rqb { z2.b }, p0/Z, [x23]\n"
+ "add x24, x24, #0x10\n"
"sdot z12.s, z6.b, z1.b[0]\n"
- "ld1rqb { z3.b }, p0/Z, [x24]\n"
- "add x26, x26, #0x10\n"
+ "ld1rqb { z3.b }, p0/Z, [x22]\n"
+ "add x23, x23, #0x10\n"
"sdot z16.s, z6.b, z2.b[0]\n"
- "ld1rqb { z4.b }, p0/Z, [x22]\n"
- "add x24, x24, #0x10\n"
- "sdot z13.s, z7.b, z1.b[0]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "ld1rqb { z4.b }, p0/Z, [x21]\n"
"add x22, x22, #0x10\n"
+ "sdot z13.s, z7.b, z1.b[0]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x21, x21, #0x10\n"
"sdot z20.s, z6.b, z3.b[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "cmp x11, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "cmp x26, #0x10\n"
"sdot z24.s, z6.b, z4.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
"sdot z17.s, z7.b, z2.b[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
"sdot z21.s, z7.b, z3.b[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
"sdot z25.s, z7.b, z4.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
"sdot z10.s, z6.b, z0.b[0]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
"sdot z14.s, z6.b, z1.b[0]\n"
"sdot z18.s, z6.b, z2.b[0]\n"
"sdot z22.s, z6.b, z3.b[0]\n"
"sdot z26.s, z6.b, z4.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x14, #4, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #4, MUL VL]\n"
"sdot z11.s, z7.b, z0.b[0]\n"
"sdot z15.s, z7.b, z1.b[0]\n"
"sdot z19.s, z7.b, z2.b[0]\n"
"sdot z23.s, z7.b, z3.b[0]\n"
"sdot z27.s, z7.b, z4.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x14, #5, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #5, MUL VL]\n"
"sdot z8.s, z6.b, z0.b[1]\n"
"sdot z12.s, z6.b, z1.b[1]\n"
"sdot z16.s, z6.b, z2.b[1]\n"
"sdot z20.s, z6.b, z3.b[1]\n"
"sdot z24.s, z6.b, z4.b[1]\n"
- "ld1b { z6.b }, p2/Z, [x14, #6, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #6, MUL VL]\n"
"sdot z9.s, z7.b, z0.b[1]\n"
"sdot z13.s, z7.b, z1.b[1]\n"
"sdot z17.s, z7.b, z2.b[1]\n"
"sdot z21.s, z7.b, z3.b[1]\n"
"sdot z25.s, z7.b, z4.b[1]\n"
- "ld1b { z7.b }, p2/Z, [x14, #7, MUL VL]\n"
- "addvl x14, x14, #16\n"
+ "ld1b { z7.b }, p2/Z, [x28, #7, MUL VL]\n"
+ "addvl x28, x28, #16\n"
"sdot z10.s, z6.b, z0.b[1]\n"
"sdot z14.s, z6.b, z1.b[1]\n"
"sdot z18.s, z6.b, z2.b[1]\n"
"sdot z22.s, z6.b, z3.b[1]\n"
"sdot z26.s, z6.b, z4.b[1]\n"
- "ld1b { z6.b }, p2/Z, [x14, #-8, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #-8, MUL VL]\n"
"sdot z11.s, z7.b, z0.b[1]\n"
"sdot z15.s, z7.b, z1.b[1]\n"
"sdot z19.s, z7.b, z2.b[1]\n"
"sdot z23.s, z7.b, z3.b[1]\n"
"sdot z27.s, z7.b, z4.b[1]\n"
- "ld1b { z7.b }, p2/Z, [x14, #-7, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #-7, MUL VL]\n"
"sdot z8.s, z6.b, z0.b[2]\n"
"sdot z12.s, z6.b, z1.b[2]\n"
"sdot z16.s, z6.b, z2.b[2]\n"
"sdot z20.s, z6.b, z3.b[2]\n"
"sdot z24.s, z6.b, z4.b[2]\n"
- "ld1b { z6.b }, p2/Z, [x14, #-6, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #-6, MUL VL]\n"
"sdot z9.s, z7.b, z0.b[2]\n"
"sdot z13.s, z7.b, z1.b[2]\n"
"sdot z17.s, z7.b, z2.b[2]\n"
"sdot z21.s, z7.b, z3.b[2]\n"
"sdot z25.s, z7.b, z4.b[2]\n"
- "ld1b { z7.b }, p2/Z, [x14, #-5, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #-5, MUL VL]\n"
"sdot z10.s, z6.b, z0.b[2]\n"
"sdot z14.s, z6.b, z1.b[2]\n"
"sdot z18.s, z6.b, z2.b[2]\n"
"sdot z22.s, z6.b, z3.b[2]\n"
"sdot z26.s, z6.b, z4.b[2]\n"
- "ld1b { z6.b }, p2/Z, [x14, #-4, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #-4, MUL VL]\n"
"sdot z11.s, z7.b, z0.b[2]\n"
"sdot z15.s, z7.b, z1.b[2]\n"
"sdot z19.s, z7.b, z2.b[2]\n"
"sdot z23.s, z7.b, z3.b[2]\n"
"sdot z27.s, z7.b, z4.b[2]\n"
- "ld1b { z7.b }, p2/Z, [x14, #-3, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #-3, MUL VL]\n"
"sdot z8.s, z6.b, z0.b[3]\n"
"sdot z12.s, z6.b, z1.b[3]\n"
"sdot z16.s, z6.b, z2.b[3]\n"
"sdot z20.s, z6.b, z3.b[3]\n"
"sdot z24.s, z6.b, z4.b[3]\n"
- "ld1b { z6.b }, p2/Z, [x14, #-2, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #-2, MUL VL]\n"
"sdot z9.s, z7.b, z0.b[3]\n"
"sdot z13.s, z7.b, z1.b[3]\n"
"sdot z17.s, z7.b, z2.b[3]\n"
"sdot z21.s, z7.b, z3.b[3]\n"
"sdot z25.s, z7.b, z4.b[3]\n"
- "ld1b { z7.b }, p2/Z, [x14, #-1, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #-1, MUL VL]\n"
"sdot z10.s, z6.b, z0.b[3]\n"
"sdot z14.s, z6.b, z1.b[3]\n"
"sdot z18.s, z6.b, z2.b[3]\n"
@@ -1683,35 +1626,35 @@ void sve_hybrid_s8qs_dot_6x4VL (
"sdot z19.s, z7.b, z2.b[3]\n"
"sdot z23.s, z7.b, z3.b[3]\n"
"sdot z27.s, z7.b, z4.b[3]\n"
- "bgt 64b\n"
- "65:" // Height 5: Multiply loop: Single iteration only
- "ld1b { z6.b }, p2/Z, [x14]\n"
- "whilelt p0.b, XZR, x11\n"
- "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
- "subs x11, x11, #0x4\n"
- "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "bgt 59b\n"
+ "60:" // Height 5: Multiply loop: Single iteration only
+ "ld1b { z6.b }, p2/Z, [x28]\n"
+ "whilelt p0.b, XZR, x26\n"
+ "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "subs x26, x26, #0x4\n"
+ "ld1rqb { z0.b }, p0/Z, [x25]\n"
"sdot z8.s, z6.b, z0.b[0]\n"
- "ld1rqb { z1.b }, p0/Z, [x28]\n"
- "add x10, x10, #0x10\n"
+ "ld1rqb { z1.b }, p0/Z, [x24]\n"
+ "add x25, x25, #0x10\n"
"sdot z9.s, z7.b, z0.b[0]\n"
- "ld1rqb { z2.b }, p0/Z, [x26]\n"
- "add x28, x28, #0x10\n"
+ "ld1rqb { z2.b }, p0/Z, [x23]\n"
+ "add x24, x24, #0x10\n"
"sdot z12.s, z6.b, z1.b[0]\n"
- "ld1rqb { z3.b }, p0/Z, [x24]\n"
- "add x26, x26, #0x10\n"
+ "ld1rqb { z3.b }, p0/Z, [x22]\n"
+ "add x23, x23, #0x10\n"
"sdot z16.s, z6.b, z2.b[0]\n"
- "ld1rqb { z4.b }, p0/Z, [x22]\n"
- "add x24, x24, #0x10\n"
- "sdot z13.s, z7.b, z1.b[0]\n"
+ "ld1rqb { z4.b }, p0/Z, [x21]\n"
"add x22, x22, #0x10\n"
+ "sdot z13.s, z7.b, z1.b[0]\n"
+ "add x21, x21, #0x10\n"
"sdot z17.s, z7.b, z2.b[0]\n"
"sdot z20.s, z6.b, z3.b[0]\n"
"sdot z24.s, z6.b, z4.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
"sdot z21.s, z7.b, z3.b[0]\n"
"sdot z25.s, z7.b, z4.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
+ "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
"sdot z10.s, z6.b, z0.b[0]\n"
"sdot z14.s, z6.b, z1.b[0]\n"
"sdot z18.s, z6.b, z2.b[0]\n"
@@ -1722,23 +1665,23 @@ void sve_hybrid_s8qs_dot_6x4VL (
"sdot z19.s, z7.b, z2.b[0]\n"
"sdot z23.s, z7.b, z3.b[0]\n"
"sdot z27.s, z7.b, z4.b[0]\n"
- "ble 66f\n"
- "ld1b { z6.b }, p2/Z, [x14]\n"
+ "ble 61f\n"
+ "ld1b { z6.b }, p2/Z, [x28]\n"
"sdot z8.s, z6.b, z0.b[1]\n"
- "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
- "subs x11, x11, #0x4\n"
+ "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "subs x26, x26, #0x4\n"
"sdot z12.s, z6.b, z1.b[1]\n"
"sdot z16.s, z6.b, z2.b[1]\n"
"sdot z20.s, z6.b, z3.b[1]\n"
"sdot z24.s, z6.b, z4.b[1]\n"
- "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
"sdot z9.s, z7.b, z0.b[1]\n"
"sdot z13.s, z7.b, z1.b[1]\n"
"sdot z17.s, z7.b, z2.b[1]\n"
"sdot z21.s, z7.b, z3.b[1]\n"
"sdot z25.s, z7.b, z4.b[1]\n"
- "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
+ "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
"sdot z10.s, z6.b, z0.b[1]\n"
"sdot z14.s, z6.b, z1.b[1]\n"
"sdot z18.s, z6.b, z2.b[1]\n"
@@ -1749,23 +1692,23 @@ void sve_hybrid_s8qs_dot_6x4VL (
"sdot z19.s, z7.b, z2.b[1]\n"
"sdot z23.s, z7.b, z3.b[1]\n"
"sdot z27.s, z7.b, z4.b[1]\n"
- "ble 66f\n"
- "ld1b { z6.b }, p2/Z, [x14]\n"
+ "ble 61f\n"
+ "ld1b { z6.b }, p2/Z, [x28]\n"
"sdot z8.s, z6.b, z0.b[2]\n"
- "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
- "subs x11, x11, #0x4\n"
+ "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "subs x26, x26, #0x4\n"
"sdot z12.s, z6.b, z1.b[2]\n"
"sdot z16.s, z6.b, z2.b[2]\n"
"sdot z20.s, z6.b, z3.b[2]\n"
"sdot z24.s, z6.b, z4.b[2]\n"
- "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
"sdot z9.s, z7.b, z0.b[2]\n"
"sdot z13.s, z7.b, z1.b[2]\n"
"sdot z17.s, z7.b, z2.b[2]\n"
"sdot z21.s, z7.b, z3.b[2]\n"
"sdot z25.s, z7.b, z4.b[2]\n"
- "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
+ "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
"sdot z10.s, z6.b, z0.b[2]\n"
"sdot z14.s, z6.b, z1.b[2]\n"
"sdot z18.s, z6.b, z2.b[2]\n"
@@ -1776,22 +1719,22 @@ void sve_hybrid_s8qs_dot_6x4VL (
"sdot z19.s, z7.b, z2.b[2]\n"
"sdot z23.s, z7.b, z3.b[2]\n"
"sdot z27.s, z7.b, z4.b[2]\n"
- "ble 66f\n"
- "ld1b { z6.b }, p2/Z, [x14]\n"
+ "ble 61f\n"
+ "ld1b { z6.b }, p2/Z, [x28]\n"
"sdot z8.s, z6.b, z0.b[3]\n"
- "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n"
"sdot z12.s, z6.b, z1.b[3]\n"
"sdot z16.s, z6.b, z2.b[3]\n"
"sdot z20.s, z6.b, z3.b[3]\n"
"sdot z24.s, z6.b, z4.b[3]\n"
- "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
"sdot z9.s, z7.b, z0.b[3]\n"
"sdot z13.s, z7.b, z1.b[3]\n"
"sdot z17.s, z7.b, z2.b[3]\n"
"sdot z21.s, z7.b, z3.b[3]\n"
"sdot z25.s, z7.b, z4.b[3]\n"
- "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
+ "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
"sdot z10.s, z6.b, z0.b[3]\n"
"sdot z14.s, z6.b, z1.b[3]\n"
"sdot z18.s, z6.b, z2.b[3]\n"
@@ -1802,26 +1745,31 @@ void sve_hybrid_s8qs_dot_6x4VL (
"sdot z19.s, z7.b, z2.b[3]\n"
"sdot z23.s, z7.b, z3.b[3]\n"
"sdot z27.s, z7.b, z4.b[3]\n"
- "66:" // Height 5: Multiply loop: multiply skip
- "prfm pldl1keep, [x10, #0x80]\n"
- "add x12, x12, #0x1\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "61:" // Height 5: Multiply loop: multiply skip
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x27, x27, #0x1\n"
"prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
"prfm pldl1keep, [x22, #0x80]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x12, x19\n"
- "bne 61b\n"
- "ld1w { z0.s }, p2/Z, [x16]\n"
+ "cmp x27, x19\n"
+ "bne 56b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ld1w { z0.s }, p2/Z, [x11]\n"
"add z8.s, z8.s, z0.s\n"
- "ld1w { z1.s }, p2/Z, [x16, #1, MUL VL]\n"
+ "ld1w { z1.s }, p2/Z, [x11, #1, MUL VL]\n"
+ "add x23, x9, x19\n"
"add z12.s, z12.s, z0.s\n"
- "ld1w { z2.s }, p2/Z, [x16, #2, MUL VL]\n"
+ "ld1w { z2.s }, p2/Z, [x11, #2, MUL VL]\n"
+ "add x22, x23, x19\n"
"add z16.s, z16.s, z0.s\n"
- "ld1w { z3.s }, p2/Z, [x16, #3, MUL VL]\n"
- "addvl x16, x16, #4\n"
+ "ld1w { z3.s }, p2/Z, [x11, #3, MUL VL]\n"
+ "add x21, x22, x19\n"
"add z9.s, z9.s, z1.s\n"
+ "add x20, x21, x19\n"
"add z13.s, z13.s, z1.s\n"
+ "addvl x11, x11, #4\n"
"add z10.s, z10.s, z2.s\n"
"add z11.s, z11.s, z3.s\n"
"add z14.s, z14.s, z2.s\n"
@@ -1837,30 +1785,30 @@ void sve_hybrid_s8qs_dot_6x4VL (
"add z25.s, z25.s, z1.s\n"
"add z26.s, z26.s, z2.s\n"
"add z27.s, z27.s, z3.s\n"
- "tbz %x[flags], #4, 67f\n"
- "ld1w { z0.s }, p2/Z, [x17]\n"
- "ld1w { z4.s }, p2/Z, [x8]\n"
- "ld1w { z1.s }, p2/Z, [x17, #1, MUL VL]\n"
- "ld1w { z5.s }, p2/Z, [x8, #1, MUL VL]\n"
- "ld1w { z2.s }, p2/Z, [x17, #2, MUL VL]\n"
- "ld1w { z6.s }, p2/Z, [x8, #2, MUL VL]\n"
- "ld1w { z3.s }, p2/Z, [x17, #3, MUL VL]\n"
- "addvl x17, x17, #4\n"
- "ld1w { z7.s }, p2/Z, [x8, #3, MUL VL]\n"
- "addvl x8, x8, #4\n"
- "b 68f\n"
- "67:" // Height 5: per layer parameters
- "add x19, %x[qp], %[per_layer_right_shift]\n"
- "ld1rw { z0.s }, p2/Z, [x19]\n"
+ "tbz %x[flags], #4, 62f\n"
+ "ld1w { z0.s }, p2/Z, [x12]\n"
+ "ld1w { z4.s }, p2/Z, [x13]\n"
+ "ld1w { z1.s }, p2/Z, [x12, #1, MUL VL]\n"
+ "ld1w { z5.s }, p2/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z2.s }, p2/Z, [x12, #2, MUL VL]\n"
+ "ld1w { z6.s }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z3.s }, p2/Z, [x12, #3, MUL VL]\n"
+ "addvl x12, x12, #4\n"
+ "ld1w { z7.s }, p2/Z, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "b 63f\n"
+ "62:" // Height 5: per layer parameters
+ "add x24, %x[qp], %[per_layer_right_shift]\n"
+ "ld1rw { z0.s }, p2/Z, [x24]\n"
"mov z1.d, z0.d\n"
- "add x19, %x[qp], %[per_layer_mul]\n"
- "ld1rw { z4.s }, p2/Z, [x19]\n"
+ "add x24, %x[qp], %[per_layer_mul]\n"
+ "ld1rw { z4.s }, p2/Z, [x24]\n"
"mov z2.d, z0.d\n"
"mov z3.d, z0.d\n"
"mov z5.d, z4.d\n"
"mov z6.d, z4.d\n"
"mov z7.d, z4.d\n"
- "68:" // Height 5: parameters loaded
+ "63:" // Height 5: parameters loaded
".inst 0x04a47508 // sqrdmulh z8.s, z8.s, z4.s\n"
".inst 0x04a57529 // sqrdmulh z9.s, z9.s, z5.s\n"
".inst 0x04a6754a // sqrdmulh z10.s, z10.s, z6.s\n"
@@ -1881,7 +1829,7 @@ void sve_hybrid_s8qs_dot_6x4VL (
".inst 0x04a57739 // sqrdmulh z25.s, z25.s, z5.s\n"
".inst 0x04a6775a // sqrdmulh z26.s, z26.s, z6.s\n"
".inst 0x04a7777b // sqrdmulh z27.s, z27.s, z7.s\n"
- "tbz %x[flags], #5, 69f\n"
+ "tbz %x[flags], #5, 64f\n"
"and z4.d, z8.d, z0.d\n"
"asr z4.s, z4.s, #0x1f\n"
"and z5.d, z9.d, z1.d\n"
@@ -1942,17 +1890,17 @@ void sve_hybrid_s8qs_dot_6x4VL (
"sqadd z25.s, z25.s, z5.s\n"
"sqadd z26.s, z26.s, z6.s\n"
"sqadd z27.s, z27.s, z7.s\n"
- "69:" // Height 5: no shift correction
+ "64:" // Height 5: no shift correction
".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n"
- "add x19, %x[qp], %[c_offset]\n"
- "ld1rw { z4.s }, p2/Z, [x19]\n"
+ "add x24, %x[qp], %[c_offset]\n"
+ "ld1rw { z4.s }, p2/Z, [x24]\n"
".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n"
- "add x19, %x[qp], %[minval]\n"
+ "add x24, %x[qp], %[minval]\n"
".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n"
- "ld1rw { z5.s }, p2/Z, [x19]\n"
- "add x19, %x[qp], %[maxval]\n"
+ "ld1rw { z5.s }, p2/Z, [x24]\n"
+ "add x24, %x[qp], %[maxval]\n"
".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n"
- "ld1rw { z6.s }, p2/Z, [x19]\n"
+ "ld1rw { z6.s }, p2/Z, [x24]\n"
".inst 0x4482880c // srshl z12.s, p2/M, z12.s, z0.s\n"
"add z8.s, z8.s, z4.s\n"
"add z9.s, z9.s, z4.s\n"
@@ -1973,9 +1921,9 @@ void sve_hybrid_s8qs_dot_6x4VL (
"uzp1 z9.h, z10.h, z11.h\n"
"smax z12.s, p2/M, z12.s, z5.s\n"
"uzp1 z8.b, z8.b, z9.b\n"
- "st1b { z8.b }, p1, [x13]\n"
+ "st1b { z8.b }, p1, [x9]\n"
"add z13.s, z13.s, z4.s\n"
- "addvl x13, x13, #1\n"
+ "addvl x9, x9, #1\n"
".inst 0x4482884e // srshl z14.s, p2/M, z14.s, z2.s\n"
".inst 0x4482886f // srshl z15.s, p2/M, z15.s, z3.s\n"
".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
@@ -1998,29 +1946,27 @@ void sve_hybrid_s8qs_dot_6x4VL (
"uzp1 z13.h, z14.h, z15.h\n"
".inst 0x44828873 // srshl z19.s, p2/M, z19.s, z3.s\n"
"uzp1 z12.b, z12.b, z13.b\n"
- "st1b { z12.b }, p1, [x9]\n"
+ "st1b { z12.b }, p1, [x23]\n"
"add z18.s, z18.s, z4.s\n"
- "addvl x9, x9, #1\n"
- "add z19.s, z19.s, z4.s\n"
"smax z17.s, p2/M, z17.s, z5.s\n"
+ "add z19.s, z19.s, z4.s\n"
".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n"
"smin z18.s, p2/M, z18.s, z6.s\n"
- "smin z19.s, p2/M, z19.s, z6.s\n"
"uzp1 z16.h, z16.h, z17.h\n"
+ "smin z19.s, p2/M, z19.s, z6.s\n"
"add z20.s, z20.s, z4.s\n"
"smax z18.s, p2/M, z18.s, z5.s\n"
+ ".inst 0x44828835 // srshl z21.s, p2/M, z21.s, z1.s\n"
"smax z19.s, p2/M, z19.s, z5.s\n"
"smin z20.s, p2/M, z20.s, z6.s\n"
- ".inst 0x44828835 // srshl z21.s, p2/M, z21.s, z1.s\n"
".inst 0x44828856 // srshl z22.s, p2/M, z22.s, z2.s\n"
+ "add z21.s, z21.s, z4.s\n"
"uzp1 z17.h, z18.h, z19.h\n"
"smax z20.s, p2/M, z20.s, z5.s\n"
- "add z21.s, z21.s, z4.s\n"
"add z22.s, z22.s, z4.s\n"
"uzp1 z16.b, z16.b, z17.b\n"
- "st1b { z16.b }, p1, [x27]\n"
+ "st1b { z16.b }, p1, [x22]\n"
"smin z21.s, p2/M, z21.s, z6.s\n"
- "addvl x27, x27, #1\n"
"smin z22.s, p2/M, z22.s, z6.s\n"
".inst 0x44828877 // srshl z23.s, p2/M, z23.s, z3.s\n"
".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n"
@@ -2043,57 +1989,35 @@ void sve_hybrid_s8qs_dot_6x4VL (
".inst 0x4482887b // srshl z27.s, p2/M, z27.s, z3.s\n"
"uzp1 z24.h, z24.h, z25.h\n"
"uzp1 z20.b, z20.b, z21.b\n"
- "st1b { z20.b }, p1, [x25]\n"
+ "st1b { z20.b }, p1, [x21]\n"
"add z27.s, z27.s, z4.s\n"
- "addvl x25, x25, #1\n"
"smin z26.s, p2/M, z26.s, z6.s\n"
"smin z27.s, p2/M, z27.s, z6.s\n"
"smax z26.s, p2/M, z26.s, z5.s\n"
"smax z27.s, p2/M, z27.s, z5.s\n"
"uzp1 z25.h, z26.h, z27.h\n"
"uzp1 z24.b, z24.b, z25.b\n"
- "st1b { z24.b }, p1, [x23]\n"
- "addvl x23, x23, #1\n"
- "70:" // Height 5: Writeback done
- "decw x15, ALL, MUL #4\n"
- "cmp x15, XZR\n"
- "bgt 59b\n"
- "b 86f\n"
- "71:" // Height 6
- "ldr x8, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
- "mov x16, %x[col_bias]\n"
- "ldr x17, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
- "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "st1b { z24.b }, p1, [x20]\n"
+ "65:" // Height 5: Writeback done
+ "decw x10, ALL, MUL #4\n"
+ "cmp x10, XZR\n"
+ "bgt 54b\n"
+ "b 80f\n"
+ "66:" // Height 6
+ "ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
+ "mov x11, %x[col_bias]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x20, #0x6\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "tbz %x[flags], #2, 72f\n"
- "ldr x13, [%x[output_ptr], #0x0]\n"
- "add x13, x13, x19\n"
- "ldr x9, [%x[output_ptr], #0x8]\n"
- "ldr x27, [%x[output_ptr], #0x10]\n"
- "add x9, x9, x19\n"
- "ldr x25, [%x[output_ptr], #0x18]\n"
- "ldr x23, [%x[output_ptr], #0x20]\n"
- "add x27, x27, x19\n"
- "ldr x21, [%x[output_ptr], #0x28]\n"
- "add %x[output_ptr], %x[output_ptr], #0x30\n"
- "add x25, x25, x19\n"
- "add x23, x23, x19\n"
- "add x21, x21, x19\n"
- "b 73f\n"
- "72:" // Height 6: setup direct output
- "mov x13, %x[output_ptr]\n"
- "add x9, x13, x19\n"
- "add x27, x9, x19\n"
- "add x25, x27, x19\n"
- "add x23, x25, x19\n"
- "add x21, x23, x19\n"
- "add %x[output_ptr], x21, x19\n"
- "73:" // Height 6: Column loop
+ "madd %x[output_ptr], x19, x20, %x[output_ptr]\n"
+ "67:" // Height 6: Column loop
"mov z8.s, #0x0\n"
"mov x19, #0x0\n"
"mov z9.s, #0x0\n"
- "whilelt p1.b, x19, x15\n"
+ "whilelt p1.b, x19, x10\n"
"mov z10.s, #0x0\n"
"mov z11.s, #0x0\n"
"mov z12.s, #0x0\n"
@@ -2116,77 +2040,77 @@ void sve_hybrid_s8qs_dot_6x4VL (
"mov z29.s, #0x0\n"
"mov z30.s, #0x0\n"
"mov z31.s, #0x0\n"
- "74:" // Height 6: setup done
- "mov x12, #0x0\n"
- "75:" // Height 6: String loop
+ "68:" // Height 6: setup done
+ "mov x27, #0x0\n"
+ "69:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w11, [x20, x12, LSL #0x2]\n"
- "tbz %x[flags], #3, 76f\n"
- "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 70f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x10, [x20, #0x0]\n"
- "ldr x28, [x20, #0x8]\n"
- "ldr x26, [x20, #0x10]\n"
- "ldr x24, [x20, #0x18]\n"
- "ldr x22, [x20, #0x20]\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "ldr x22, [x20, #0x18]\n"
+ "ldr x21, [x20, #0x20]\n"
"ldr x20, [x20, #0x28]\n"
- "cbnz x12, 77f\n"
+ "cbnz x27, 71f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x10, x10, x19\n"
- "add x28, x28, x19\n"
- "add x26, x26, x19\n"
+ "add x25, x25, x19\n"
"add x24, x24, x19\n"
+ "add x23, x23, x19\n"
"add x22, x22, x19\n"
+ "add x21, x21, x19\n"
"add x20, x20, x19\n"
- "b 77f\n"
- "76:" // Height 6: setup direct input
- "mov x10, %x[input_ptr]\n"
- "add x28, x10, x19\n"
- "add x26, x28, x19\n"
- "add x24, x26, x19\n"
- "add x22, x24, x19\n"
- "add x20, x22, x19\n"
- "77:" // Height 6: input setup done
- "cmp x11, #0x10\n"
- "ble 79f\n"
- "78:" // Height 6: Multiply loop: Main loop head
- "ld1b { z6.b }, p2/Z, [x14]\n"
- "whilelt p0.b, XZR, x11\n"
- "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
- "sub x11, x11, #0x10\n"
- "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "b 71f\n"
+ "70:" // Height 6: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19\n"
+ "add x23, x24, x19\n"
+ "add x22, x23, x19\n"
+ "add x21, x22, x19\n"
+ "add x20, x21, x19\n"
+ "71:" // Height 6: input setup done
+ "cmp x26, #0x10\n"
+ "ble 73f\n"
+ "72:" // Height 6: Multiply loop: Main loop head
+ "ld1b { z6.b }, p2/Z, [x28]\n"
+ "whilelt p0.b, XZR, x26\n"
+ "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "sub x26, x26, #0x10\n"
+ "ld1rqb { z0.b }, p0/Z, [x25]\n"
"sdot z8.s, z6.b, z0.b[0]\n"
- "ld1rqb { z1.b }, p0/Z, [x28]\n"
- "add x10, x10, #0x10\n"
+ "ld1rqb { z1.b }, p0/Z, [x24]\n"
+ "add x25, x25, #0x10\n"
"sdot z9.s, z7.b, z0.b[0]\n"
- "ld1rqb { z2.b }, p0/Z, [x26]\n"
- "add x28, x28, #0x10\n"
+ "ld1rqb { z2.b }, p0/Z, [x23]\n"
+ "add x24, x24, #0x10\n"
"sdot z12.s, z6.b, z1.b[0]\n"
- "ld1rqb { z3.b }, p0/Z, [x24]\n"
- "add x26, x26, #0x10\n"
+ "ld1rqb { z3.b }, p0/Z, [x22]\n"
+ "add x23, x23, #0x10\n"
"sdot z16.s, z6.b, z2.b[0]\n"
- "ld1rqb { z4.b }, p0/Z, [x22]\n"
- "add x24, x24, #0x10\n"
+ "ld1rqb { z4.b }, p0/Z, [x21]\n"
+ "add x22, x22, #0x10\n"
"sdot z13.s, z7.b, z1.b[0]\n"
"ld1rqb { z5.b }, p0/Z, [x20]\n"
- "add x22, x22, #0x10\n"
+ "add x21, x21, #0x10\n"
"sdot z20.s, z6.b, z3.b[0]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
"add x20, x20, #0x10\n"
"sdot z24.s, z6.b, z4.b[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "cmp x11, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "cmp x26, #0x10\n"
"sdot z28.s, z6.b, z5.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
"sdot z17.s, z7.b, z2.b[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
"sdot z21.s, z7.b, z3.b[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "sdot z25.s, z7.b, z4.b[0]\n"
"prfm pldl1keep, [x22, #0x80]\n"
+ "sdot z25.s, z7.b, z4.b[0]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
"sdot z29.s, z7.b, z5.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
"sdot z10.s, z6.b, z0.b[0]\n"
"prfm pldl1keep, [x20, #0x80]\n"
"sdot z14.s, z6.b, z1.b[0]\n"
@@ -2194,85 +2118,85 @@ void sve_hybrid_s8qs_dot_6x4VL (
"sdot z22.s, z6.b, z3.b[0]\n"
"sdot z26.s, z6.b, z4.b[0]\n"
"sdot z30.s, z6.b, z5.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x14, #4, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #4, MUL VL]\n"
"sdot z11.s, z7.b, z0.b[0]\n"
"sdot z15.s, z7.b, z1.b[0]\n"
"sdot z19.s, z7.b, z2.b[0]\n"
"sdot z23.s, z7.b, z3.b[0]\n"
"sdot z27.s, z7.b, z4.b[0]\n"
"sdot z31.s, z7.b, z5.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x14, #5, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #5, MUL VL]\n"
"sdot z8.s, z6.b, z0.b[1]\n"
"sdot z12.s, z6.b, z1.b[1]\n"
"sdot z16.s, z6.b, z2.b[1]\n"
"sdot z20.s, z6.b, z3.b[1]\n"
"sdot z24.s, z6.b, z4.b[1]\n"
"sdot z28.s, z6.b, z5.b[1]\n"
- "ld1b { z6.b }, p2/Z, [x14, #6, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #6, MUL VL]\n"
"sdot z9.s, z7.b, z0.b[1]\n"
"sdot z13.s, z7.b, z1.b[1]\n"
"sdot z17.s, z7.b, z2.b[1]\n"
"sdot z21.s, z7.b, z3.b[1]\n"
"sdot z25.s, z7.b, z4.b[1]\n"
"sdot z29.s, z7.b, z5.b[1]\n"
- "ld1b { z7.b }, p2/Z, [x14, #7, MUL VL]\n"
- "addvl x14, x14, #16\n"
+ "ld1b { z7.b }, p2/Z, [x28, #7, MUL VL]\n"
+ "addvl x28, x28, #16\n"
"sdot z10.s, z6.b, z0.b[1]\n"
"sdot z14.s, z6.b, z1.b[1]\n"
"sdot z18.s, z6.b, z2.b[1]\n"
"sdot z22.s, z6.b, z3.b[1]\n"
"sdot z26.s, z6.b, z4.b[1]\n"
"sdot z30.s, z6.b, z5.b[1]\n"
- "ld1b { z6.b }, p2/Z, [x14, #-8, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #-8, MUL VL]\n"
"sdot z11.s, z7.b, z0.b[1]\n"
"sdot z15.s, z7.b, z1.b[1]\n"
"sdot z19.s, z7.b, z2.b[1]\n"
"sdot z23.s, z7.b, z3.b[1]\n"
"sdot z27.s, z7.b, z4.b[1]\n"
"sdot z31.s, z7.b, z5.b[1]\n"
- "ld1b { z7.b }, p2/Z, [x14, #-7, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #-7, MUL VL]\n"
"sdot z8.s, z6.b, z0.b[2]\n"
"sdot z12.s, z6.b, z1.b[2]\n"
"sdot z16.s, z6.b, z2.b[2]\n"
"sdot z20.s, z6.b, z3.b[2]\n"
"sdot z24.s, z6.b, z4.b[2]\n"
"sdot z28.s, z6.b, z5.b[2]\n"
- "ld1b { z6.b }, p2/Z, [x14, #-6, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #-6, MUL VL]\n"
"sdot z9.s, z7.b, z0.b[2]\n"
"sdot z13.s, z7.b, z1.b[2]\n"
"sdot z17.s, z7.b, z2.b[2]\n"
"sdot z21.s, z7.b, z3.b[2]\n"
"sdot z25.s, z7.b, z4.b[2]\n"
"sdot z29.s, z7.b, z5.b[2]\n"
- "ld1b { z7.b }, p2/Z, [x14, #-5, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #-5, MUL VL]\n"
"sdot z10.s, z6.b, z0.b[2]\n"
"sdot z14.s, z6.b, z1.b[2]\n"
"sdot z18.s, z6.b, z2.b[2]\n"
"sdot z22.s, z6.b, z3.b[2]\n"
"sdot z26.s, z6.b, z4.b[2]\n"
"sdot z30.s, z6.b, z5.b[2]\n"
- "ld1b { z6.b }, p2/Z, [x14, #-4, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #-4, MUL VL]\n"
"sdot z11.s, z7.b, z0.b[2]\n"
"sdot z15.s, z7.b, z1.b[2]\n"
"sdot z19.s, z7.b, z2.b[2]\n"
"sdot z23.s, z7.b, z3.b[2]\n"
"sdot z27.s, z7.b, z4.b[2]\n"
"sdot z31.s, z7.b, z5.b[2]\n"
- "ld1b { z7.b }, p2/Z, [x14, #-3, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #-3, MUL VL]\n"
"sdot z8.s, z6.b, z0.b[3]\n"
"sdot z12.s, z6.b, z1.b[3]\n"
"sdot z16.s, z6.b, z2.b[3]\n"
"sdot z20.s, z6.b, z3.b[3]\n"
"sdot z24.s, z6.b, z4.b[3]\n"
"sdot z28.s, z6.b, z5.b[3]\n"
- "ld1b { z6.b }, p2/Z, [x14, #-2, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #-2, MUL VL]\n"
"sdot z9.s, z7.b, z0.b[3]\n"
"sdot z13.s, z7.b, z1.b[3]\n"
"sdot z17.s, z7.b, z2.b[3]\n"
"sdot z21.s, z7.b, z3.b[3]\n"
"sdot z25.s, z7.b, z4.b[3]\n"
"sdot z29.s, z7.b, z5.b[3]\n"
- "ld1b { z7.b }, p2/Z, [x14, #-1, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #-1, MUL VL]\n"
"sdot z10.s, z6.b, z0.b[3]\n"
"sdot z14.s, z6.b, z1.b[3]\n"
"sdot z18.s, z6.b, z2.b[3]\n"
@@ -2285,39 +2209,39 @@ void sve_hybrid_s8qs_dot_6x4VL (
"sdot z23.s, z7.b, z3.b[3]\n"
"sdot z27.s, z7.b, z4.b[3]\n"
"sdot z31.s, z7.b, z5.b[3]\n"
- "bgt 78b\n"
- "79:" // Height 6: Multiply loop: Single iteration only
- "ld1b { z6.b }, p2/Z, [x14]\n"
- "whilelt p0.b, XZR, x11\n"
- "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
- "subs x11, x11, #0x4\n"
- "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "bgt 72b\n"
+ "73:" // Height 6: Multiply loop: Single iteration only
+ "ld1b { z6.b }, p2/Z, [x28]\n"
+ "whilelt p0.b, XZR, x26\n"
+ "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "subs x26, x26, #0x4\n"
+ "ld1rqb { z0.b }, p0/Z, [x25]\n"
"sdot z8.s, z6.b, z0.b[0]\n"
- "ld1rqb { z1.b }, p0/Z, [x28]\n"
- "add x10, x10, #0x10\n"
+ "ld1rqb { z1.b }, p0/Z, [x24]\n"
+ "add x25, x25, #0x10\n"
"sdot z9.s, z7.b, z0.b[0]\n"
- "ld1rqb { z2.b }, p0/Z, [x26]\n"
- "add x28, x28, #0x10\n"
+ "ld1rqb { z2.b }, p0/Z, [x23]\n"
+ "add x24, x24, #0x10\n"
"sdot z12.s, z6.b, z1.b[0]\n"
- "ld1rqb { z3.b }, p0/Z, [x24]\n"
- "add x26, x26, #0x10\n"
+ "ld1rqb { z3.b }, p0/Z, [x22]\n"
+ "add x23, x23, #0x10\n"
"sdot z16.s, z6.b, z2.b[0]\n"
- "ld1rqb { z4.b }, p0/Z, [x22]\n"
- "add x24, x24, #0x10\n"
+ "ld1rqb { z4.b }, p0/Z, [x21]\n"
+ "add x22, x22, #0x10\n"
"sdot z13.s, z7.b, z1.b[0]\n"
"ld1rqb { z5.b }, p0/Z, [x20]\n"
- "add x22, x22, #0x10\n"
+ "add x21, x21, #0x10\n"
"sdot z20.s, z6.b, z3.b[0]\n"
"add x20, x20, #0x10\n"
"sdot z17.s, z7.b, z2.b[0]\n"
"sdot z24.s, z6.b, z4.b[0]\n"
"sdot z28.s, z6.b, z5.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
"sdot z21.s, z7.b, z3.b[0]\n"
"sdot z25.s, z7.b, z4.b[0]\n"
"sdot z29.s, z7.b, z5.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
+ "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
"sdot z10.s, z6.b, z0.b[0]\n"
"sdot z14.s, z6.b, z1.b[0]\n"
"sdot z18.s, z6.b, z2.b[0]\n"
@@ -2330,25 +2254,25 @@ void sve_hybrid_s8qs_dot_6x4VL (
"sdot z23.s, z7.b, z3.b[0]\n"
"sdot z27.s, z7.b, z4.b[0]\n"
"sdot z31.s, z7.b, z5.b[0]\n"
- "ble 80f\n"
- "ld1b { z6.b }, p2/Z, [x14]\n"
+ "ble 74f\n"
+ "ld1b { z6.b }, p2/Z, [x28]\n"
"sdot z8.s, z6.b, z0.b[1]\n"
- "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
- "subs x11, x11, #0x4\n"
+ "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "subs x26, x26, #0x4\n"
"sdot z12.s, z6.b, z1.b[1]\n"
"sdot z16.s, z6.b, z2.b[1]\n"
"sdot z20.s, z6.b, z3.b[1]\n"
"sdot z24.s, z6.b, z4.b[1]\n"
"sdot z28.s, z6.b, z5.b[1]\n"
- "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
"sdot z9.s, z7.b, z0.b[1]\n"
"sdot z13.s, z7.b, z1.b[1]\n"
"sdot z17.s, z7.b, z2.b[1]\n"
"sdot z21.s, z7.b, z3.b[1]\n"
"sdot z25.s, z7.b, z4.b[1]\n"
"sdot z29.s, z7.b, z5.b[1]\n"
- "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
+ "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
"sdot z10.s, z6.b, z0.b[1]\n"
"sdot z14.s, z6.b, z1.b[1]\n"
"sdot z18.s, z6.b, z2.b[1]\n"
@@ -2361,25 +2285,25 @@ void sve_hybrid_s8qs_dot_6x4VL (
"sdot z23.s, z7.b, z3.b[1]\n"
"sdot z27.s, z7.b, z4.b[1]\n"
"sdot z31.s, z7.b, z5.b[1]\n"
- "ble 80f\n"
- "ld1b { z6.b }, p2/Z, [x14]\n"
+ "ble 74f\n"
+ "ld1b { z6.b }, p2/Z, [x28]\n"
"sdot z8.s, z6.b, z0.b[2]\n"
- "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
- "subs x11, x11, #0x4\n"
+ "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "subs x26, x26, #0x4\n"
"sdot z12.s, z6.b, z1.b[2]\n"
"sdot z16.s, z6.b, z2.b[2]\n"
"sdot z20.s, z6.b, z3.b[2]\n"
"sdot z24.s, z6.b, z4.b[2]\n"
"sdot z28.s, z6.b, z5.b[2]\n"
- "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
"sdot z9.s, z7.b, z0.b[2]\n"
"sdot z13.s, z7.b, z1.b[2]\n"
"sdot z17.s, z7.b, z2.b[2]\n"
"sdot z21.s, z7.b, z3.b[2]\n"
"sdot z25.s, z7.b, z4.b[2]\n"
"sdot z29.s, z7.b, z5.b[2]\n"
- "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
+ "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
"sdot z10.s, z6.b, z0.b[2]\n"
"sdot z14.s, z6.b, z1.b[2]\n"
"sdot z18.s, z6.b, z2.b[2]\n"
@@ -2392,24 +2316,24 @@ void sve_hybrid_s8qs_dot_6x4VL (
"sdot z23.s, z7.b, z3.b[2]\n"
"sdot z27.s, z7.b, z4.b[2]\n"
"sdot z31.s, z7.b, z5.b[2]\n"
- "ble 80f\n"
- "ld1b { z6.b }, p2/Z, [x14]\n"
+ "ble 74f\n"
+ "ld1b { z6.b }, p2/Z, [x28]\n"
"sdot z8.s, z6.b, z0.b[3]\n"
- "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n"
"sdot z12.s, z6.b, z1.b[3]\n"
"sdot z16.s, z6.b, z2.b[3]\n"
"sdot z20.s, z6.b, z3.b[3]\n"
"sdot z24.s, z6.b, z4.b[3]\n"
"sdot z28.s, z6.b, z5.b[3]\n"
- "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
"sdot z9.s, z7.b, z0.b[3]\n"
"sdot z13.s, z7.b, z1.b[3]\n"
"sdot z17.s, z7.b, z2.b[3]\n"
"sdot z21.s, z7.b, z3.b[3]\n"
"sdot z25.s, z7.b, z4.b[3]\n"
"sdot z29.s, z7.b, z5.b[3]\n"
- "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
+ "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
"sdot z10.s, z6.b, z0.b[3]\n"
"sdot z14.s, z6.b, z1.b[3]\n"
"sdot z18.s, z6.b, z2.b[3]\n"
@@ -2422,28 +2346,34 @@ void sve_hybrid_s8qs_dot_6x4VL (
"sdot z23.s, z7.b, z3.b[3]\n"
"sdot z27.s, z7.b, z4.b[3]\n"
"sdot z31.s, z7.b, z5.b[3]\n"
- "80:" // Height 6: Multiply loop: multiply skip
- "prfm pldl1keep, [x10, #0x80]\n"
- "add x12, x12, #0x1\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "74:" // Height 6: Multiply loop: multiply skip
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x27, x27, #0x1\n"
"prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
"prfm pldl1keep, [x22, #0x80]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
"prfm pldl1keep, [x20, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x12, x19\n"
- "bne 75b\n"
- "ld1w { z0.s }, p2/Z, [x16]\n"
+ "cmp x27, x19\n"
+ "bne 69b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ld1w { z0.s }, p2/Z, [x11]\n"
"add z8.s, z8.s, z0.s\n"
- "ld1w { z1.s }, p2/Z, [x16, #1, MUL VL]\n"
+ "ld1w { z1.s }, p2/Z, [x11, #1, MUL VL]\n"
+ "add x23, x9, x19\n"
"add z12.s, z12.s, z0.s\n"
- "ld1w { z2.s }, p2/Z, [x16, #2, MUL VL]\n"
+ "ld1w { z2.s }, p2/Z, [x11, #2, MUL VL]\n"
+ "add x22, x23, x19\n"
"add z16.s, z16.s, z0.s\n"
- "ld1w { z3.s }, p2/Z, [x16, #3, MUL VL]\n"
- "addvl x16, x16, #4\n"
+ "ld1w { z3.s }, p2/Z, [x11, #3, MUL VL]\n"
+ "add x21, x22, x19\n"
"add z9.s, z9.s, z1.s\n"
+ "add x20, x21, x19\n"
"add z13.s, z13.s, z1.s\n"
+ "add x19, x20, x19\n"
"add z10.s, z10.s, z2.s\n"
+ "addvl x11, x11, #4\n"
"add z11.s, z11.s, z3.s\n"
"add z14.s, z14.s, z2.s\n"
"add z15.s, z15.s, z3.s\n"
@@ -2462,30 +2392,30 @@ void sve_hybrid_s8qs_dot_6x4VL (
"add z29.s, z29.s, z1.s\n"
"add z30.s, z30.s, z2.s\n"
"add z31.s, z31.s, z3.s\n"
- "tbz %x[flags], #4, 81f\n"
- "ld1w { z0.s }, p2/Z, [x17]\n"
- "ld1w { z4.s }, p2/Z, [x8]\n"
- "ld1w { z1.s }, p2/Z, [x17, #1, MUL VL]\n"
- "ld1w { z5.s }, p2/Z, [x8, #1, MUL VL]\n"
- "ld1w { z2.s }, p2/Z, [x17, #2, MUL VL]\n"
- "ld1w { z6.s }, p2/Z, [x8, #2, MUL VL]\n"
- "ld1w { z3.s }, p2/Z, [x17, #3, MUL VL]\n"
- "addvl x17, x17, #4\n"
- "ld1w { z7.s }, p2/Z, [x8, #3, MUL VL]\n"
- "addvl x8, x8, #4\n"
- "b 82f\n"
- "81:" // Height 6: per layer parameters
- "add x19, %x[qp], %[per_layer_right_shift]\n"
- "ld1rw { z0.s }, p2/Z, [x19]\n"
+ "tbz %x[flags], #4, 75f\n"
+ "ld1w { z0.s }, p2/Z, [x12]\n"
+ "ld1w { z4.s }, p2/Z, [x13]\n"
+ "ld1w { z1.s }, p2/Z, [x12, #1, MUL VL]\n"
+ "ld1w { z5.s }, p2/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z2.s }, p2/Z, [x12, #2, MUL VL]\n"
+ "ld1w { z6.s }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z3.s }, p2/Z, [x12, #3, MUL VL]\n"
+ "addvl x12, x12, #4\n"
+ "ld1w { z7.s }, p2/Z, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "b 76f\n"
+ "75:" // Height 6: per layer parameters
+ "add x24, %x[qp], %[per_layer_right_shift]\n"
+ "ld1rw { z0.s }, p2/Z, [x24]\n"
"mov z1.d, z0.d\n"
- "add x19, %x[qp], %[per_layer_mul]\n"
- "ld1rw { z4.s }, p2/Z, [x19]\n"
+ "add x24, %x[qp], %[per_layer_mul]\n"
+ "ld1rw { z4.s }, p2/Z, [x24]\n"
"mov z2.d, z0.d\n"
"mov z3.d, z0.d\n"
"mov z5.d, z4.d\n"
"mov z6.d, z4.d\n"
"mov z7.d, z4.d\n"
- "82:" // Height 6: parameters loaded
+ "76:" // Height 6: parameters loaded
".inst 0x04a47508 // sqrdmulh z8.s, z8.s, z4.s\n"
".inst 0x04a57529 // sqrdmulh z9.s, z9.s, z5.s\n"
".inst 0x04a6754a // sqrdmulh z10.s, z10.s, z6.s\n"
@@ -2510,7 +2440,7 @@ void sve_hybrid_s8qs_dot_6x4VL (
".inst 0x04a577bd // sqrdmulh z29.s, z29.s, z5.s\n"
".inst 0x04a677de // sqrdmulh z30.s, z30.s, z6.s\n"
".inst 0x04a777ff // sqrdmulh z31.s, z31.s, z7.s\n"
- "tbz %x[flags], #5, 83f\n"
+ "tbz %x[flags], #5, 77f\n"
"and z4.d, z8.d, z0.d\n"
"asr z4.s, z4.s, #0x1f\n"
"and z5.d, z9.d, z1.d\n"
@@ -2583,17 +2513,17 @@ void sve_hybrid_s8qs_dot_6x4VL (
"sqadd z29.s, z29.s, z5.s\n"
"sqadd z30.s, z30.s, z6.s\n"
"sqadd z31.s, z31.s, z7.s\n"
- "83:" // Height 6: no shift correction
+ "77:" // Height 6: no shift correction
".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n"
- "add x19, %x[qp], %[c_offset]\n"
- "ld1rw { z4.s }, p2/Z, [x19]\n"
+ "add x24, %x[qp], %[c_offset]\n"
+ "ld1rw { z4.s }, p2/Z, [x24]\n"
".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n"
- "add x19, %x[qp], %[minval]\n"
+ "add x24, %x[qp], %[minval]\n"
".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n"
- "ld1rw { z5.s }, p2/Z, [x19]\n"
- "add x19, %x[qp], %[maxval]\n"
+ "ld1rw { z5.s }, p2/Z, [x24]\n"
+ "add x24, %x[qp], %[maxval]\n"
".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n"
- "ld1rw { z6.s }, p2/Z, [x19]\n"
+ "ld1rw { z6.s }, p2/Z, [x24]\n"
".inst 0x4482880c // srshl z12.s, p2/M, z12.s, z0.s\n"
"add z8.s, z8.s, z4.s\n"
"add z9.s, z9.s, z4.s\n"
@@ -2614,9 +2544,9 @@ void sve_hybrid_s8qs_dot_6x4VL (
"uzp1 z9.h, z10.h, z11.h\n"
"smax z12.s, p2/M, z12.s, z5.s\n"
"uzp1 z8.b, z8.b, z9.b\n"
- "st1b { z8.b }, p1, [x13]\n"
+ "st1b { z8.b }, p1, [x9]\n"
"add z13.s, z13.s, z4.s\n"
- "addvl x13, x13, #1\n"
+ "addvl x9, x9, #1\n"
".inst 0x4482884e // srshl z14.s, p2/M, z14.s, z2.s\n"
".inst 0x4482886f // srshl z15.s, p2/M, z15.s, z3.s\n"
".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
@@ -2639,29 +2569,27 @@ void sve_hybrid_s8qs_dot_6x4VL (
"uzp1 z13.h, z14.h, z15.h\n"
".inst 0x44828873 // srshl z19.s, p2/M, z19.s, z3.s\n"
"uzp1 z12.b, z12.b, z13.b\n"
- "st1b { z12.b }, p1, [x9]\n"
+ "st1b { z12.b }, p1, [x23]\n"
"add z18.s, z18.s, z4.s\n"
- "addvl x9, x9, #1\n"
- "add z19.s, z19.s, z4.s\n"
"smax z17.s, p2/M, z17.s, z5.s\n"
+ "add z19.s, z19.s, z4.s\n"
".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n"
"smin z18.s, p2/M, z18.s, z6.s\n"
- "smin z19.s, p2/M, z19.s, z6.s\n"
"uzp1 z16.h, z16.h, z17.h\n"
+ "smin z19.s, p2/M, z19.s, z6.s\n"
"add z20.s, z20.s, z4.s\n"
"smax z18.s, p2/M, z18.s, z5.s\n"
+ ".inst 0x44828835 // srshl z21.s, p2/M, z21.s, z1.s\n"
"smax z19.s, p2/M, z19.s, z5.s\n"
"smin z20.s, p2/M, z20.s, z6.s\n"
- ".inst 0x44828835 // srshl z21.s, p2/M, z21.s, z1.s\n"
".inst 0x44828856 // srshl z22.s, p2/M, z22.s, z2.s\n"
+ "add z21.s, z21.s, z4.s\n"
"uzp1 z17.h, z18.h, z19.h\n"
"smax z20.s, p2/M, z20.s, z5.s\n"
- "add z21.s, z21.s, z4.s\n"
"add z22.s, z22.s, z4.s\n"
"uzp1 z16.b, z16.b, z17.b\n"
- "st1b { z16.b }, p1, [x27]\n"
+ "st1b { z16.b }, p1, [x22]\n"
"smin z21.s, p2/M, z21.s, z6.s\n"
- "addvl x27, x27, #1\n"
"smin z22.s, p2/M, z22.s, z6.s\n"
".inst 0x44828877 // srshl z23.s, p2/M, z23.s, z3.s\n"
".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n"
@@ -2684,58 +2612,55 @@ void sve_hybrid_s8qs_dot_6x4VL (
".inst 0x4482887b // srshl z27.s, p2/M, z27.s, z3.s\n"
"uzp1 z24.h, z24.h, z25.h\n"
"uzp1 z20.b, z20.b, z21.b\n"
- "st1b { z20.b }, p1, [x25]\n"
+ "st1b { z20.b }, p1, [x21]\n"
"add z27.s, z27.s, z4.s\n"
- "addvl x25, x25, #1\n"
"smin z26.s, p2/M, z26.s, z6.s\n"
".inst 0x4482881c // srshl z28.s, p2/M, z28.s, z0.s\n"
".inst 0x4482883d // srshl z29.s, p2/M, z29.s, z1.s\n"
"smin z27.s, p2/M, z27.s, z6.s\n"
- ".inst 0x4482885e // srshl z30.s, p2/M, z30.s, z2.s\n"
+ "smax z26.s, p2/M, z26.s, z5.s\n"
"add z28.s, z28.s, z4.s\n"
"add z29.s, z29.s, z4.s\n"
- "smax z26.s, p2/M, z26.s, z5.s\n"
- "add z30.s, z30.s, z4.s\n"
"smax z27.s, p2/M, z27.s, z5.s\n"
"smin z28.s, p2/M, z28.s, z6.s\n"
"smin z29.s, p2/M, z29.s, z6.s\n"
- "smin z30.s, p2/M, z30.s, z6.s\n"
+ ".inst 0x4482885e // srshl z30.s, p2/M, z30.s, z2.s\n"
"uzp1 z25.h, z26.h, z27.h\n"
"smax z28.s, p2/M, z28.s, z5.s\n"
"uzp1 z24.b, z24.b, z25.b\n"
- "st1b { z24.b }, p1, [x23]\n"
+ "st1b { z24.b }, p1, [x20]\n"
+ "add z30.s, z30.s, z4.s\n"
"smax z29.s, p2/M, z29.s, z5.s\n"
- "addvl x23, x23, #1\n"
- "smax z30.s, p2/M, z30.s, z5.s\n"
".inst 0x4482887f // srshl z31.s, p2/M, z31.s, z3.s\n"
+ "smin z30.s, p2/M, z30.s, z6.s\n"
"uzp1 z28.h, z28.h, z29.h\n"
"add z31.s, z31.s, z4.s\n"
+ "smax z30.s, p2/M, z30.s, z5.s\n"
"smin z31.s, p2/M, z31.s, z6.s\n"
"smax z31.s, p2/M, z31.s, z5.s\n"
"uzp1 z29.h, z30.h, z31.h\n"
"uzp1 z28.b, z28.b, z29.b\n"
- "st1b { z28.b }, p1, [x21]\n"
- "addvl x21, x21, #1\n"
- "84:" // Height 6: Writeback done
- "decw x15, ALL, MUL #4\n"
- "cmp x15, XZR\n"
- "bgt 73b\n"
+ "st1b { z28.b }, p1, [x19]\n"
+ "78:" // Height 6: Writeback done
+ "decw x10, ALL, MUL #4\n"
+ "cmp x10, XZR\n"
+ "bgt 67b\n"
"subs %x[M], %x[M], #0x6\n"
- "beq 86f\n"
+ "beq 80f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "tbz %x[flags], #3, 85f\n"
+ "tbz %x[flags], #3, 79f\n"
"add x20, x20, #0x6\n"
"str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
"b 1b\n"
- "85:" // Update direct input
+ "79:" // Update direct input
"mov x19, #0x6\n"
"madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
"b 1b\n"
- "86:" // Exit
+ "80:" // Exit
: [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [flags] "r" (flags), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_multiplier_ptr] "I" (offsetof(KernelArgs, multiplier_ptr)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_shift_ptr] "I" (offsetof(KernelArgs, shift_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
- : "cc", "memory", "p0", "p1", "p2", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL.hpp
index a6652fd1b2..b2c376196f 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL.hpp
@@ -36,7 +36,6 @@
namespace arm_gemm
{
-
// Actual kernel implementations
void sve_hybrid_s8s32_dot_6x4VL( ARGLIST );
@@ -73,7 +72,6 @@ public:
// Default to the generic kernel
kern_type kernel=sve_hybrid_s8s32_dot_6x4VL;
-
cls_sve_hybrid_s8s32_dot_6x4VL(const CPUInfo *)
{
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/generic.cpp
index b2ebce80d2..8862b3665a 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/generic.cpp
@@ -80,197 +80,184 @@ void sve_hybrid_s8s32_dot_6x4VL (
"ptrue p5.b\n"
"1:" // Row loop
"cmp %x[M], #0x6\n"
- "bge 61f\n"
+ "bge 56f\n"
"cmp %x[M], #0x4\n"
- "bgt 49f\n"
- "beq 37f\n"
+ "bgt 45f\n"
+ "beq 34f\n"
"cmp %x[M], #0x2\n"
- "bgt 25f\n"
- "beq 13f\n"
- "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "tbz %x[flags], #2, 2f\n"
- "ldr x13, [%x[output_ptr], #0x0]\n"
- "add x13, x13, x19, LSL #2\n"
- "b 3f\n"
- "2:" // Height 1: setup direct output
- "mov x13, %x[output_ptr]\n"
- "3:" // Height 1: Column loop
+ "bgt 23f\n"
+ "beq 12f\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "2:" // Height 1: Column loop
"mov x19, #0x0\n"
- "whilelt p4.s, x19, x15\n"
+ "whilelt p4.s, x19, x10\n"
"incw x19\n"
- "whilelt p3.s, x19, x15\n"
+ "whilelt p3.s, x19, x10\n"
"incw x19\n"
- "whilelt p2.s, x19, x15\n"
+ "whilelt p2.s, x19, x10\n"
"incw x19\n"
- "whilelt p1.s, x19, x15\n"
- "tbz %x[flags], #0, 4f\n"
- "ld1w { z8.s }, p4/Z, [x13]\n"
- "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
- "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
- "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
- "b 5f\n"
- "4:" // Height 1: no accumulate
+ "whilelt p1.s, x19, x10\n"
+ "tbz %x[flags], #0, 3f\n"
+ "ld1w { z8.s }, p4/Z, [x28]\n"
+ "ld1w { z9.s }, p3/Z, [x28, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1w { z11.s }, p1/Z, [x28, #3, MUL VL]\n"
+ "b 4f\n"
+ "3:" // Height 1: no accumulate
"mov z8.s, #0x0\n"
"mov z9.s, #0x0\n"
"mov z10.s, #0x0\n"
"mov z11.s, #0x0\n"
- "5:" // Height 1: setup done
- "mov x12, #0x0\n"
- "6:" // Height 1: String loop
+ "4:" // Height 1: setup done
+ "mov x27, #0x0\n"
+ "5:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w11, [x20, x12, LSL #0x2]\n"
- "tbz %x[flags], #3, 7f\n"
- "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 6f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x10, [x20, #0x0]\n"
- "cbnz x12, 8f\n"
+ "ldr x25, [x20, #0x0]\n"
+ "cbnz x27, 7f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x10, x10, x19\n"
- "b 8f\n"
- "7:" // Height 1: setup direct input
- "mov x10, %x[input_ptr]\n"
- "8:" // Height 1: input setup done
- "cmp x11, #0x10\n"
- "ble 10f\n"
- "9:" // Height 1: Multiply loop: Main loop head
- "ld1b { z6.b }, p5/Z, [x14]\n"
- "whilelt p0.b, XZR, x11\n"
- "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
- "sub x11, x11, #0x10\n"
- "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "add x25, x25, x19\n"
+ "b 7f\n"
+ "6:" // Height 1: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "7:" // Height 1: input setup done
+ "cmp x26, #0x10\n"
+ "ble 9f\n"
+ "8:" // Height 1: Multiply loop: Main loop head
+ "ld1b { z6.b }, p5/Z, [x9]\n"
+ "whilelt p0.b, XZR, x26\n"
+ "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
+ "sub x26, x26, #0x10\n"
+ "ld1rqb { z0.b }, p0/Z, [x25]\n"
"sdot z8.s, z6.b, z0.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
- "add x10, x10, #0x10\n"
+ "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
+ "add x25, x25, #0x10\n"
"sdot z9.s, z7.b, z0.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
- "cmp x11, #0x10\n"
+ "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
+ "cmp x26, #0x10\n"
"sdot z10.s, z6.b, z0.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x14, #4, MUL VL]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #4, MUL VL]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
"sdot z11.s, z7.b, z0.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x14, #5, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x9, #5, MUL VL]\n"
"sdot z8.s, z6.b, z0.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x14, #6, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #6, MUL VL]\n"
"sdot z9.s, z7.b, z0.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x14, #7, MUL VL]\n"
- "addvl x14, x14, #16\n"
+ "ld1b { z7.b }, p5/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #16\n"
"sdot z10.s, z6.b, z0.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x14, #-8, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-8, MUL VL]\n"
"sdot z11.s, z7.b, z0.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x14, #-7, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-7, MUL VL]\n"
"sdot z8.s, z6.b, z0.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x14, #-6, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-6, MUL VL]\n"
"sdot z9.s, z7.b, z0.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x14, #-5, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-5, MUL VL]\n"
"sdot z10.s, z6.b, z0.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x14, #-4, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-4, MUL VL]\n"
"sdot z11.s, z7.b, z0.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x14, #-3, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-3, MUL VL]\n"
"sdot z8.s, z6.b, z0.b[3]\n"
- "ld1b { z6.b }, p5/Z, [x14, #-2, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-2, MUL VL]\n"
"sdot z9.s, z7.b, z0.b[3]\n"
- "ld1b { z7.b }, p5/Z, [x14, #-1, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-1, MUL VL]\n"
"sdot z10.s, z6.b, z0.b[3]\n"
"sdot z11.s, z7.b, z0.b[3]\n"
- "bgt 9b\n"
- "10:" // Height 1: Multiply loop: Single iteration only
- "ld1b { z6.b }, p5/Z, [x14]\n"
- "whilelt p0.b, XZR, x11\n"
- "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
- "subs x11, x11, #0x4\n"
- "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "bgt 8b\n"
+ "9:" // Height 1: Multiply loop: Single iteration only
+ "ld1b { z6.b }, p5/Z, [x9]\n"
+ "whilelt p0.b, XZR, x26\n"
+ "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
+ "subs x26, x26, #0x4\n"
+ "ld1rqb { z0.b }, p0/Z, [x25]\n"
"sdot z8.s, z6.b, z0.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
- "add x10, x10, #0x10\n"
+ "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
+ "add x25, x25, #0x10\n"
"sdot z9.s, z7.b, z0.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
+ "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
"sdot z10.s, z6.b, z0.b[0]\n"
"sdot z11.s, z7.b, z0.b[0]\n"
- "ble 11f\n"
- "ld1b { z6.b }, p5/Z, [x14]\n"
+ "ble 10f\n"
+ "ld1b { z6.b }, p5/Z, [x9]\n"
"sdot z8.s, z6.b, z0.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
- "subs x11, x11, #0x4\n"
+ "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
+ "subs x26, x26, #0x4\n"
"sdot z9.s, z7.b, z0.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
- "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
"sdot z10.s, z6.b, z0.b[1]\n"
- "addvl x14, x14, #4\n"
+ "addvl x9, x9, #4\n"
"sdot z11.s, z7.b, z0.b[1]\n"
- "ble 11f\n"
- "ld1b { z6.b }, p5/Z, [x14]\n"
+ "ble 10f\n"
+ "ld1b { z6.b }, p5/Z, [x9]\n"
"sdot z8.s, z6.b, z0.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
- "subs x11, x11, #0x4\n"
+ "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
+ "subs x26, x26, #0x4\n"
"sdot z9.s, z7.b, z0.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
- "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
"sdot z10.s, z6.b, z0.b[2]\n"
- "addvl x14, x14, #4\n"
+ "addvl x9, x9, #4\n"
"sdot z11.s, z7.b, z0.b[2]\n"
- "ble 11f\n"
- "ld1b { z6.b }, p5/Z, [x14]\n"
+ "ble 10f\n"
+ "ld1b { z6.b }, p5/Z, [x9]\n"
"sdot z8.s, z6.b, z0.b[3]\n"
- "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
- "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
"sdot z9.s, z7.b, z0.b[3]\n"
- "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
+ "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
"sdot z10.s, z6.b, z0.b[3]\n"
"sdot z11.s, z7.b, z0.b[3]\n"
- "11:" // Height 1: Multiply loop: multiply skip
- "prfm pldl1keep, [x10, #0x80]\n"
- "add x12, x12, #0x1\n"
+ "10:" // Height 1: Multiply loop: multiply skip
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x27, x27, #0x1\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x12, x19\n"
- "bne 6b\n"
- "st1w { z8.s }, p4, [x13]\n"
- "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
- "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
- "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
- "addvl x13, x13, #4\n"
- "12:" // Height 1: Writeback done
- "decw x15, ALL, MUL #4\n"
- "cmp x15, XZR\n"
- "bgt 3b\n"
- "b 74f\n"
- "13:" // Height 2
- "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "tbz %x[flags], #2, 14f\n"
- "ldr x13, [%x[output_ptr], #0x0]\n"
- "add x13, x13, x19, LSL #2\n"
- "ldr x9, [%x[output_ptr], #0x8]\n"
- "add x9, x9, x19, LSL #2\n"
- "b 15f\n"
- "14:" // Height 2: setup direct output
- "mov x13, %x[output_ptr]\n"
- "add x9, x13, x19, LSL #2\n"
- "15:" // Height 2: Column loop
+ "cmp x27, x19\n"
+ "bne 5b\n"
+ "st1w { z8.s }, p4, [x28]\n"
+ "st1w { z9.s }, p3, [x28, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x28, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "11:" // Height 1: Writeback done
+ "decw x10, ALL, MUL #4\n"
+ "cmp x10, XZR\n"
+ "bgt 2b\n"
+ "b 68f\n"
+ "12:" // Height 2
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "13:" // Height 2: Column loop
"mov x19, #0x0\n"
- "whilelt p4.s, x19, x15\n"
+ "whilelt p4.s, x19, x10\n"
"incw x19\n"
- "whilelt p3.s, x19, x15\n"
+ "whilelt p3.s, x19, x10\n"
"incw x19\n"
- "whilelt p2.s, x19, x15\n"
+ "whilelt p2.s, x19, x10\n"
"incw x19\n"
- "whilelt p1.s, x19, x15\n"
- "tbz %x[flags], #0, 16f\n"
- "ld1w { z8.s }, p4/Z, [x13]\n"
- "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
- "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
- "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
- "ld1w { z12.s }, p4/Z, [x9]\n"
- "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n"
- "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n"
- "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n"
- "b 17f\n"
- "16:" // Height 2: no accumulate
+ "whilelt p1.s, x19, x10\n"
+ "tbz %x[flags], #0, 14f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ld1w { z8.s }, p4/Z, [x28]\n"
+ "add x23, x28, x19, LSL #2\n"
+ "ld1w { z9.s }, p3/Z, [x28, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1w { z11.s }, p1/Z, [x28, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x23]\n"
+ "ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "b 15f\n"
+ "14:" // Height 2: no accumulate
"mov z8.s, #0x0\n"
"mov z9.s, #0x0\n"
"mov z10.s, #0x0\n"
@@ -279,214 +266,206 @@ void sve_hybrid_s8s32_dot_6x4VL (
"mov z13.s, #0x0\n"
"mov z14.s, #0x0\n"
"mov z15.s, #0x0\n"
- "17:" // Height 2: setup done
- "mov x12, #0x0\n"
- "18:" // Height 2: String loop
+ "15:" // Height 2: setup done
+ "mov x27, #0x0\n"
+ "16:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w11, [x20, x12, LSL #0x2]\n"
- "tbz %x[flags], #3, 19f\n"
- "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 17f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x10, [x20, #0x0]\n"
- "ldr x28, [x20, #0x8]\n"
- "cbnz x12, 20f\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "cbnz x27, 18f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x10, x10, x19\n"
- "add x28, x28, x19\n"
- "b 20f\n"
- "19:" // Height 2: setup direct input
- "mov x10, %x[input_ptr]\n"
- "add x28, x10, x19\n"
- "20:" // Height 2: input setup done
- "cmp x11, #0x10\n"
- "ble 22f\n"
- "21:" // Height 2: Multiply loop: Main loop head
- "ld1b { z6.b }, p5/Z, [x14]\n"
- "whilelt p0.b, XZR, x11\n"
- "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
- "sub x11, x11, #0x10\n"
- "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "add x25, x25, x19\n"
+ "add x24, x24, x19\n"
+ "b 18f\n"
+ "17:" // Height 2: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19\n"
+ "18:" // Height 2: input setup done
+ "cmp x26, #0x10\n"
+ "ble 20f\n"
+ "19:" // Height 2: Multiply loop: Main loop head
+ "ld1b { z6.b }, p5/Z, [x9]\n"
+ "whilelt p0.b, XZR, x26\n"
+ "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
+ "sub x26, x26, #0x10\n"
+ "ld1rqb { z0.b }, p0/Z, [x25]\n"
"sdot z8.s, z6.b, z0.b[0]\n"
- "ld1rqb { z1.b }, p0/Z, [x28]\n"
- "add x10, x10, #0x10\n"
+ "ld1rqb { z1.b }, p0/Z, [x24]\n"
+ "add x25, x25, #0x10\n"
"sdot z9.s, z7.b, z0.b[0]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
- "add x28, x28, #0x10\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x24, x24, #0x10\n"
"sdot z12.s, z6.b, z1.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
- "cmp x11, #0x10\n"
+ "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
+ "cmp x26, #0x10\n"
"sdot z13.s, z7.b, z1.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
+ "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
"sdot z10.s, z6.b, z0.b[0]\n"
"sdot z14.s, z6.b, z1.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x14, #4, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #4, MUL VL]\n"
"sdot z11.s, z7.b, z0.b[0]\n"
"sdot z15.s, z7.b, z1.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x14, #5, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x9, #5, MUL VL]\n"
"sdot z8.s, z6.b, z0.b[1]\n"
"sdot z12.s, z6.b, z1.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x14, #6, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #6, MUL VL]\n"
"sdot z9.s, z7.b, z0.b[1]\n"
"sdot z13.s, z7.b, z1.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x14, #7, MUL VL]\n"
- "addvl x14, x14, #16\n"
+ "ld1b { z7.b }, p5/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #16\n"
"sdot z10.s, z6.b, z0.b[1]\n"
"sdot z14.s, z6.b, z1.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x14, #-8, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-8, MUL VL]\n"
"sdot z11.s, z7.b, z0.b[1]\n"
"sdot z15.s, z7.b, z1.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x14, #-7, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-7, MUL VL]\n"
"sdot z8.s, z6.b, z0.b[2]\n"
"sdot z12.s, z6.b, z1.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x14, #-6, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-6, MUL VL]\n"
"sdot z9.s, z7.b, z0.b[2]\n"
"sdot z13.s, z7.b, z1.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x14, #-5, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-5, MUL VL]\n"
"sdot z10.s, z6.b, z0.b[2]\n"
"sdot z14.s, z6.b, z1.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x14, #-4, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-4, MUL VL]\n"
"sdot z11.s, z7.b, z0.b[2]\n"
"sdot z15.s, z7.b, z1.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x14, #-3, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-3, MUL VL]\n"
"sdot z8.s, z6.b, z0.b[3]\n"
"sdot z12.s, z6.b, z1.b[3]\n"
- "ld1b { z6.b }, p5/Z, [x14, #-2, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-2, MUL VL]\n"
"sdot z9.s, z7.b, z0.b[3]\n"
"sdot z13.s, z7.b, z1.b[3]\n"
- "ld1b { z7.b }, p5/Z, [x14, #-1, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-1, MUL VL]\n"
"sdot z10.s, z6.b, z0.b[3]\n"
"sdot z14.s, z6.b, z1.b[3]\n"
"sdot z11.s, z7.b, z0.b[3]\n"
"sdot z15.s, z7.b, z1.b[3]\n"
- "bgt 21b\n"
- "22:" // Height 2: Multiply loop: Single iteration only
- "ld1b { z6.b }, p5/Z, [x14]\n"
- "whilelt p0.b, XZR, x11\n"
- "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
- "subs x11, x11, #0x4\n"
- "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "bgt 19b\n"
+ "20:" // Height 2: Multiply loop: Single iteration only
+ "ld1b { z6.b }, p5/Z, [x9]\n"
+ "whilelt p0.b, XZR, x26\n"
+ "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
+ "subs x26, x26, #0x4\n"
+ "ld1rqb { z0.b }, p0/Z, [x25]\n"
"sdot z8.s, z6.b, z0.b[0]\n"
- "ld1rqb { z1.b }, p0/Z, [x28]\n"
- "add x10, x10, #0x10\n"
+ "ld1rqb { z1.b }, p0/Z, [x24]\n"
+ "add x25, x25, #0x10\n"
"sdot z9.s, z7.b, z0.b[0]\n"
- "add x28, x28, #0x10\n"
+ "add x24, x24, #0x10\n"
"sdot z12.s, z6.b, z1.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
"sdot z13.s, z7.b, z1.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
+ "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
"sdot z10.s, z6.b, z0.b[0]\n"
"sdot z14.s, z6.b, z1.b[0]\n"
"sdot z11.s, z7.b, z0.b[0]\n"
"sdot z15.s, z7.b, z1.b[0]\n"
- "ble 23f\n"
- "ld1b { z6.b }, p5/Z, [x14]\n"
+ "ble 21f\n"
+ "ld1b { z6.b }, p5/Z, [x9]\n"
"sdot z8.s, z6.b, z0.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
- "subs x11, x11, #0x4\n"
+ "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
+ "subs x26, x26, #0x4\n"
"sdot z12.s, z6.b, z1.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
"sdot z9.s, z7.b, z0.b[1]\n"
"sdot z13.s, z7.b, z1.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
+ "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
"sdot z10.s, z6.b, z0.b[1]\n"
"sdot z14.s, z6.b, z1.b[1]\n"
"sdot z11.s, z7.b, z0.b[1]\n"
"sdot z15.s, z7.b, z1.b[1]\n"
- "ble 23f\n"
- "ld1b { z6.b }, p5/Z, [x14]\n"
+ "ble 21f\n"
+ "ld1b { z6.b }, p5/Z, [x9]\n"
"sdot z8.s, z6.b, z0.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
- "subs x11, x11, #0x4\n"
+ "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
+ "subs x26, x26, #0x4\n"
"sdot z12.s, z6.b, z1.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
"sdot z9.s, z7.b, z0.b[2]\n"
"sdot z13.s, z7.b, z1.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
+ "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
"sdot z10.s, z6.b, z0.b[2]\n"
"sdot z14.s, z6.b, z1.b[2]\n"
"sdot z11.s, z7.b, z0.b[2]\n"
"sdot z15.s, z7.b, z1.b[2]\n"
- "ble 23f\n"
- "ld1b { z6.b }, p5/Z, [x14]\n"
+ "ble 21f\n"
+ "ld1b { z6.b }, p5/Z, [x9]\n"
"sdot z8.s, z6.b, z0.b[3]\n"
- "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
"sdot z12.s, z6.b, z1.b[3]\n"
- "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
"sdot z9.s, z7.b, z0.b[3]\n"
"sdot z13.s, z7.b, z1.b[3]\n"
- "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
+ "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
"sdot z10.s, z6.b, z0.b[3]\n"
"sdot z14.s, z6.b, z1.b[3]\n"
"sdot z11.s, z7.b, z0.b[3]\n"
"sdot z15.s, z7.b, z1.b[3]\n"
- "23:" // Height 2: Multiply loop: multiply skip
- "prfm pldl1keep, [x10, #0x80]\n"
- "add x12, x12, #0x1\n"
- "prfm pldl1keep, [x28, #0x80]\n"
+ "21:" // Height 2: Multiply loop: multiply skip
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x27, x27, #0x1\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x12, x19\n"
- "bne 18b\n"
- "st1w { z8.s }, p4, [x13]\n"
- "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
- "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
- "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
- "addvl x13, x13, #4\n"
- "st1w { z12.s }, p4, [x9]\n"
- "st1w { z13.s }, p3, [x9, #1, MUL VL]\n"
- "st1w { z14.s }, p2, [x9, #2, MUL VL]\n"
- "st1w { z15.s }, p1, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
- "24:" // Height 2: Writeback done
- "decw x15, ALL, MUL #4\n"
- "cmp x15, XZR\n"
- "bgt 15b\n"
- "b 74f\n"
- "25:" // Height 3
- "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "cmp x27, x19\n"
+ "bne 16b\n"
"ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "tbz %x[flags], #2, 26f\n"
- "ldr x13, [%x[output_ptr], #0x0]\n"
- "add x13, x13, x19, LSL #2\n"
- "ldr x9, [%x[output_ptr], #0x8]\n"
- "ldr x27, [%x[output_ptr], #0x10]\n"
- "add x9, x9, x19, LSL #2\n"
- "add x27, x27, x19, LSL #2\n"
- "b 27f\n"
- "26:" // Height 3: setup direct output
- "mov x13, %x[output_ptr]\n"
- "add x9, x13, x19, LSL #2\n"
- "add x27, x9, x19, LSL #2\n"
- "27:" // Height 3: Column loop
+ "st1w { z8.s }, p4, [x28]\n"
+ "add x23, x28, x19, LSL #2\n"
+ "st1w { z9.s }, p3, [x28, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x28, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "st1w { z12.s }, p4, [x23]\n"
+ "st1w { z13.s }, p3, [x23, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x23, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x23, #3, MUL VL]\n"
+ "22:" // Height 2: Writeback done
+ "decw x10, ALL, MUL #4\n"
+ "cmp x10, XZR\n"
+ "bgt 13b\n"
+ "b 68f\n"
+ "23:" // Height 3
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "24:" // Height 3: Column loop
"mov x19, #0x0\n"
- "whilelt p4.s, x19, x15\n"
+ "whilelt p4.s, x19, x10\n"
"incw x19\n"
- "whilelt p3.s, x19, x15\n"
+ "whilelt p3.s, x19, x10\n"
"incw x19\n"
- "whilelt p2.s, x19, x15\n"
+ "whilelt p2.s, x19, x10\n"
"incw x19\n"
- "whilelt p1.s, x19, x15\n"
- "tbz %x[flags], #0, 28f\n"
- "ld1w { z8.s }, p4/Z, [x13]\n"
- "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
- "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
- "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
- "ld1w { z12.s }, p4/Z, [x9]\n"
- "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n"
- "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n"
- "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n"
- "ld1w { z16.s }, p4/Z, [x27]\n"
- "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n"
- "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n"
- "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n"
- "b 29f\n"
- "28:" // Height 3: no accumulate
+ "whilelt p1.s, x19, x10\n"
+ "tbz %x[flags], #0, 25f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ld1w { z8.s }, p4/Z, [x28]\n"
+ "add x23, x28, x19, LSL #2\n"
+ "ld1w { z9.s }, p3/Z, [x28, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x28, #2, MUL VL]\n"
+ "add x22, x23, x19, LSL #2\n"
+ "ld1w { z11.s }, p1/Z, [x28, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x23]\n"
+ "ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x22]\n"
+ "ld1w { z17.s }, p3/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "b 26f\n"
+ "25:" // Height 3: no accumulate
"mov z8.s, #0x0\n"
"mov z9.s, #0x0\n"
"mov z10.s, #0x0\n"
@@ -499,267 +478,257 @@ void sve_hybrid_s8s32_dot_6x4VL (
"mov z17.s, #0x0\n"
"mov z18.s, #0x0\n"
"mov z19.s, #0x0\n"
- "29:" // Height 3: setup done
- "mov x12, #0x0\n"
- "30:" // Height 3: String loop
+ "26:" // Height 3: setup done
+ "mov x27, #0x0\n"
+ "27:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w11, [x20, x12, LSL #0x2]\n"
- "tbz %x[flags], #3, 31f\n"
- "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 28f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x10, [x20, #0x0]\n"
- "ldr x28, [x20, #0x8]\n"
- "ldr x26, [x20, #0x10]\n"
- "cbnz x12, 32f\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "cbnz x27, 29f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x10, x10, x19\n"
- "add x28, x28, x19\n"
- "add x26, x26, x19\n"
- "b 32f\n"
- "31:" // Height 3: setup direct input
- "mov x10, %x[input_ptr]\n"
- "add x28, x10, x19\n"
- "add x26, x28, x19\n"
- "32:" // Height 3: input setup done
- "cmp x11, #0x10\n"
- "ble 34f\n"
- "33:" // Height 3: Multiply loop: Main loop head
- "ld1b { z6.b }, p5/Z, [x14]\n"
- "whilelt p0.b, XZR, x11\n"
- "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
- "sub x11, x11, #0x10\n"
- "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "add x25, x25, x19\n"
+ "add x24, x24, x19\n"
+ "add x23, x23, x19\n"
+ "b 29f\n"
+ "28:" // Height 3: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19\n"
+ "add x23, x24, x19\n"
+ "29:" // Height 3: input setup done
+ "cmp x26, #0x10\n"
+ "ble 31f\n"
+ "30:" // Height 3: Multiply loop: Main loop head
+ "ld1b { z6.b }, p5/Z, [x9]\n"
+ "whilelt p0.b, XZR, x26\n"
+ "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
+ "sub x26, x26, #0x10\n"
+ "ld1rqb { z0.b }, p0/Z, [x25]\n"
"sdot z8.s, z6.b, z0.b[0]\n"
- "ld1rqb { z1.b }, p0/Z, [x28]\n"
- "add x10, x10, #0x10\n"
+ "ld1rqb { z1.b }, p0/Z, [x24]\n"
+ "add x25, x25, #0x10\n"
"sdot z9.s, z7.b, z0.b[0]\n"
- "ld1rqb { z2.b }, p0/Z, [x26]\n"
- "add x28, x28, #0x10\n"
+ "ld1rqb { z2.b }, p0/Z, [x23]\n"
+ "add x24, x24, #0x10\n"
"sdot z12.s, z6.b, z1.b[0]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
- "add x26, x26, #0x10\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x23, x23, #0x10\n"
"sdot z16.s, z6.b, z2.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
- "cmp x11, #0x10\n"
+ "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
+ "cmp x26, #0x10\n"
"sdot z13.s, z7.b, z1.b[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
"sdot z17.s, z7.b, z2.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
"sdot z10.s, z6.b, z0.b[0]\n"
"sdot z14.s, z6.b, z1.b[0]\n"
"sdot z18.s, z6.b, z2.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x14, #4, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #4, MUL VL]\n"
"sdot z11.s, z7.b, z0.b[0]\n"
"sdot z15.s, z7.b, z1.b[0]\n"
"sdot z19.s, z7.b, z2.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x14, #5, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x9, #5, MUL VL]\n"
"sdot z8.s, z6.b, z0.b[1]\n"
"sdot z12.s, z6.b, z1.b[1]\n"
"sdot z16.s, z6.b, z2.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x14, #6, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #6, MUL VL]\n"
"sdot z9.s, z7.b, z0.b[1]\n"
"sdot z13.s, z7.b, z1.b[1]\n"
"sdot z17.s, z7.b, z2.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x14, #7, MUL VL]\n"
- "addvl x14, x14, #16\n"
+ "ld1b { z7.b }, p5/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #16\n"
"sdot z10.s, z6.b, z0.b[1]\n"
"sdot z14.s, z6.b, z1.b[1]\n"
"sdot z18.s, z6.b, z2.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x14, #-8, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-8, MUL VL]\n"
"sdot z11.s, z7.b, z0.b[1]\n"
"sdot z15.s, z7.b, z1.b[1]\n"
"sdot z19.s, z7.b, z2.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x14, #-7, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-7, MUL VL]\n"
"sdot z8.s, z6.b, z0.b[2]\n"
"sdot z12.s, z6.b, z1.b[2]\n"
"sdot z16.s, z6.b, z2.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x14, #-6, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-6, MUL VL]\n"
"sdot z9.s, z7.b, z0.b[2]\n"
"sdot z13.s, z7.b, z1.b[2]\n"
"sdot z17.s, z7.b, z2.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x14, #-5, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-5, MUL VL]\n"
"sdot z10.s, z6.b, z0.b[2]\n"
"sdot z14.s, z6.b, z1.b[2]\n"
"sdot z18.s, z6.b, z2.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x14, #-4, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-4, MUL VL]\n"
"sdot z11.s, z7.b, z0.b[2]\n"
"sdot z15.s, z7.b, z1.b[2]\n"
"sdot z19.s, z7.b, z2.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x14, #-3, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-3, MUL VL]\n"
"sdot z8.s, z6.b, z0.b[3]\n"
"sdot z12.s, z6.b, z1.b[3]\n"
"sdot z16.s, z6.b, z2.b[3]\n"
- "ld1b { z6.b }, p5/Z, [x14, #-2, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-2, MUL VL]\n"
"sdot z9.s, z7.b, z0.b[3]\n"
"sdot z13.s, z7.b, z1.b[3]\n"
"sdot z17.s, z7.b, z2.b[3]\n"
- "ld1b { z7.b }, p5/Z, [x14, #-1, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-1, MUL VL]\n"
"sdot z10.s, z6.b, z0.b[3]\n"
"sdot z14.s, z6.b, z1.b[3]\n"
"sdot z18.s, z6.b, z2.b[3]\n"
"sdot z11.s, z7.b, z0.b[3]\n"
"sdot z15.s, z7.b, z1.b[3]\n"
"sdot z19.s, z7.b, z2.b[3]\n"
- "bgt 33b\n"
- "34:" // Height 3: Multiply loop: Single iteration only
- "ld1b { z6.b }, p5/Z, [x14]\n"
- "whilelt p0.b, XZR, x11\n"
- "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
- "subs x11, x11, #0x4\n"
- "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "bgt 30b\n"
+ "31:" // Height 3: Multiply loop: Single iteration only
+ "ld1b { z6.b }, p5/Z, [x9]\n"
+ "whilelt p0.b, XZR, x26\n"
+ "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
+ "subs x26, x26, #0x4\n"
+ "ld1rqb { z0.b }, p0/Z, [x25]\n"
"sdot z8.s, z6.b, z0.b[0]\n"
- "ld1rqb { z1.b }, p0/Z, [x28]\n"
- "add x10, x10, #0x10\n"
+ "ld1rqb { z1.b }, p0/Z, [x24]\n"
+ "add x25, x25, #0x10\n"
"sdot z9.s, z7.b, z0.b[0]\n"
- "ld1rqb { z2.b }, p0/Z, [x26]\n"
- "add x28, x28, #0x10\n"
+ "ld1rqb { z2.b }, p0/Z, [x23]\n"
+ "add x24, x24, #0x10\n"
"sdot z12.s, z6.b, z1.b[0]\n"
- "add x26, x26, #0x10\n"
+ "add x23, x23, #0x10\n"
"sdot z13.s, z7.b, z1.b[0]\n"
"sdot z16.s, z6.b, z2.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
"sdot z17.s, z7.b, z2.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
+ "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
"sdot z10.s, z6.b, z0.b[0]\n"
"sdot z14.s, z6.b, z1.b[0]\n"
"sdot z18.s, z6.b, z2.b[0]\n"
"sdot z11.s, z7.b, z0.b[0]\n"
"sdot z15.s, z7.b, z1.b[0]\n"
"sdot z19.s, z7.b, z2.b[0]\n"
- "ble 35f\n"
- "ld1b { z6.b }, p5/Z, [x14]\n"
+ "ble 32f\n"
+ "ld1b { z6.b }, p5/Z, [x9]\n"
"sdot z8.s, z6.b, z0.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
- "subs x11, x11, #0x4\n"
+ "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
+ "subs x26, x26, #0x4\n"
"sdot z12.s, z6.b, z1.b[1]\n"
"sdot z16.s, z6.b, z2.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
"sdot z9.s, z7.b, z0.b[1]\n"
"sdot z13.s, z7.b, z1.b[1]\n"
"sdot z17.s, z7.b, z2.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
+ "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
"sdot z10.s, z6.b, z0.b[1]\n"
"sdot z14.s, z6.b, z1.b[1]\n"
"sdot z18.s, z6.b, z2.b[1]\n"
"sdot z11.s, z7.b, z0.b[1]\n"
"sdot z15.s, z7.b, z1.b[1]\n"
"sdot z19.s, z7.b, z2.b[1]\n"
- "ble 35f\n"
- "ld1b { z6.b }, p5/Z, [x14]\n"
+ "ble 32f\n"
+ "ld1b { z6.b }, p5/Z, [x9]\n"
"sdot z8.s, z6.b, z0.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
- "subs x11, x11, #0x4\n"
+ "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
+ "subs x26, x26, #0x4\n"
"sdot z12.s, z6.b, z1.b[2]\n"
"sdot z16.s, z6.b, z2.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
"sdot z9.s, z7.b, z0.b[2]\n"
"sdot z13.s, z7.b, z1.b[2]\n"
"sdot z17.s, z7.b, z2.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
+ "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
"sdot z10.s, z6.b, z0.b[2]\n"
"sdot z14.s, z6.b, z1.b[2]\n"
"sdot z18.s, z6.b, z2.b[2]\n"
"sdot z11.s, z7.b, z0.b[2]\n"
"sdot z15.s, z7.b, z1.b[2]\n"
"sdot z19.s, z7.b, z2.b[2]\n"
- "ble 35f\n"
- "ld1b { z6.b }, p5/Z, [x14]\n"
+ "ble 32f\n"
+ "ld1b { z6.b }, p5/Z, [x9]\n"
"sdot z8.s, z6.b, z0.b[3]\n"
- "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
"sdot z12.s, z6.b, z1.b[3]\n"
"sdot z16.s, z6.b, z2.b[3]\n"
- "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
"sdot z9.s, z7.b, z0.b[3]\n"
"sdot z13.s, z7.b, z1.b[3]\n"
"sdot z17.s, z7.b, z2.b[3]\n"
- "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
+ "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
"sdot z10.s, z6.b, z0.b[3]\n"
"sdot z14.s, z6.b, z1.b[3]\n"
"sdot z18.s, z6.b, z2.b[3]\n"
"sdot z11.s, z7.b, z0.b[3]\n"
"sdot z15.s, z7.b, z1.b[3]\n"
"sdot z19.s, z7.b, z2.b[3]\n"
- "35:" // Height 3: Multiply loop: multiply skip
- "prfm pldl1keep, [x10, #0x80]\n"
- "add x12, x12, #0x1\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "32:" // Height 3: Multiply loop: multiply skip
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x27, x27, #0x1\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x12, x19\n"
- "bne 30b\n"
- "st1w { z8.s }, p4, [x13]\n"
- "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
- "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
- "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
- "addvl x13, x13, #4\n"
- "st1w { z12.s }, p4, [x9]\n"
- "st1w { z13.s }, p3, [x9, #1, MUL VL]\n"
- "st1w { z14.s }, p2, [x9, #2, MUL VL]\n"
- "st1w { z15.s }, p1, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
- "st1w { z16.s }, p4, [x27]\n"
- "st1w { z17.s }, p3, [x27, #1, MUL VL]\n"
- "st1w { z18.s }, p2, [x27, #2, MUL VL]\n"
- "st1w { z19.s }, p1, [x27, #3, MUL VL]\n"
- "addvl x27, x27, #4\n"
- "36:" // Height 3: Writeback done
- "decw x15, ALL, MUL #4\n"
- "cmp x15, XZR\n"
- "bgt 27b\n"
- "b 74f\n"
- "37:" // Height 4
- "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "cmp x27, x19\n"
+ "bne 27b\n"
"ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "tbz %x[flags], #2, 38f\n"
- "ldr x13, [%x[output_ptr], #0x0]\n"
- "add x13, x13, x19, LSL #2\n"
- "ldr x9, [%x[output_ptr], #0x8]\n"
- "ldr x27, [%x[output_ptr], #0x10]\n"
- "add x9, x9, x19, LSL #2\n"
- "ldr x25, [%x[output_ptr], #0x18]\n"
- "add x27, x27, x19, LSL #2\n"
- "add x25, x25, x19, LSL #2\n"
- "b 39f\n"
- "38:" // Height 4: setup direct output
- "mov x13, %x[output_ptr]\n"
- "add x9, x13, x19, LSL #2\n"
- "add x27, x9, x19, LSL #2\n"
- "add x25, x27, x19, LSL #2\n"
- "39:" // Height 4: Column loop
+ "st1w { z8.s }, p4, [x28]\n"
+ "add x23, x28, x19, LSL #2\n"
+ "st1w { z9.s }, p3, [x28, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x28, #2, MUL VL]\n"
+ "add x22, x23, x19, LSL #2\n"
+ "st1w { z11.s }, p1, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "st1w { z12.s }, p4, [x23]\n"
+ "st1w { z13.s }, p3, [x23, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x23, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x23, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x22]\n"
+ "st1w { z17.s }, p3, [x22, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x22, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x22, #3, MUL VL]\n"
+ "33:" // Height 3: Writeback done
+ "decw x10, ALL, MUL #4\n"
+ "cmp x10, XZR\n"
+ "bgt 24b\n"
+ "b 68f\n"
+ "34:" // Height 4
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "35:" // Height 4: Column loop
"mov x19, #0x0\n"
- "whilelt p4.s, x19, x15\n"
+ "whilelt p4.s, x19, x10\n"
"incw x19\n"
- "whilelt p3.s, x19, x15\n"
+ "whilelt p3.s, x19, x10\n"
"incw x19\n"
- "whilelt p2.s, x19, x15\n"
+ "whilelt p2.s, x19, x10\n"
"incw x19\n"
- "whilelt p1.s, x19, x15\n"
- "tbz %x[flags], #0, 40f\n"
- "ld1w { z8.s }, p4/Z, [x13]\n"
- "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
- "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
- "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
- "ld1w { z12.s }, p4/Z, [x9]\n"
- "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n"
- "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n"
- "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n"
- "ld1w { z16.s }, p4/Z, [x27]\n"
- "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n"
- "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n"
- "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n"
- "ld1w { z20.s }, p4/Z, [x25]\n"
- "ld1w { z21.s }, p3/Z, [x25, #1, MUL VL]\n"
- "ld1w { z22.s }, p2/Z, [x25, #2, MUL VL]\n"
- "ld1w { z23.s }, p1/Z, [x25, #3, MUL VL]\n"
- "b 41f\n"
- "40:" // Height 4: no accumulate
+ "whilelt p1.s, x19, x10\n"
+ "tbz %x[flags], #0, 36f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ld1w { z8.s }, p4/Z, [x28]\n"
+ "add x23, x28, x19, LSL #2\n"
+ "ld1w { z9.s }, p3/Z, [x28, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x28, #2, MUL VL]\n"
+ "add x22, x23, x19, LSL #2\n"
+ "ld1w { z11.s }, p1/Z, [x28, #3, MUL VL]\n"
+ "add x21, x22, x19, LSL #2\n"
+ "ld1w { z12.s }, p4/Z, [x23]\n"
+ "ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x22]\n"
+ "ld1w { z17.s }, p3/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x21]\n"
+ "ld1w { z21.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z23.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "b 37f\n"
+ "36:" // Height 4: no accumulate
"mov z8.s, #0x0\n"
"mov z9.s, #0x0\n"
"mov z10.s, #0x0\n"
@@ -776,123 +745,123 @@ void sve_hybrid_s8s32_dot_6x4VL (
"mov z21.s, #0x0\n"
"mov z22.s, #0x0\n"
"mov z23.s, #0x0\n"
- "41:" // Height 4: setup done
- "mov x12, #0x0\n"
- "42:" // Height 4: String loop
+ "37:" // Height 4: setup done
+ "mov x27, #0x0\n"
+ "38:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w11, [x20, x12, LSL #0x2]\n"
- "tbz %x[flags], #3, 43f\n"
- "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 39f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x10, [x20, #0x0]\n"
- "ldr x28, [x20, #0x8]\n"
- "ldr x26, [x20, #0x10]\n"
- "ldr x24, [x20, #0x18]\n"
- "cbnz x12, 44f\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "ldr x22, [x20, #0x18]\n"
+ "cbnz x27, 40f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x10, x10, x19\n"
- "add x28, x28, x19\n"
- "add x26, x26, x19\n"
+ "add x25, x25, x19\n"
"add x24, x24, x19\n"
- "b 44f\n"
- "43:" // Height 4: setup direct input
- "mov x10, %x[input_ptr]\n"
- "add x28, x10, x19\n"
- "add x26, x28, x19\n"
- "add x24, x26, x19\n"
- "44:" // Height 4: input setup done
- "cmp x11, #0x10\n"
- "ble 46f\n"
- "45:" // Height 4: Multiply loop: Main loop head
- "ld1b { z6.b }, p5/Z, [x14]\n"
- "whilelt p0.b, XZR, x11\n"
- "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
- "sub x11, x11, #0x10\n"
- "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "add x23, x23, x19\n"
+ "add x22, x22, x19\n"
+ "b 40f\n"
+ "39:" // Height 4: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19\n"
+ "add x23, x24, x19\n"
+ "add x22, x23, x19\n"
+ "40:" // Height 4: input setup done
+ "cmp x26, #0x10\n"
+ "ble 42f\n"
+ "41:" // Height 4: Multiply loop: Main loop head
+ "ld1b { z6.b }, p5/Z, [x9]\n"
+ "whilelt p0.b, XZR, x26\n"
+ "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
+ "sub x26, x26, #0x10\n"
+ "ld1rqb { z0.b }, p0/Z, [x25]\n"
"sdot z8.s, z6.b, z0.b[0]\n"
- "ld1rqb { z1.b }, p0/Z, [x28]\n"
- "add x10, x10, #0x10\n"
+ "ld1rqb { z1.b }, p0/Z, [x24]\n"
+ "add x25, x25, #0x10\n"
"sdot z9.s, z7.b, z0.b[0]\n"
- "ld1rqb { z2.b }, p0/Z, [x26]\n"
- "add x28, x28, #0x10\n"
+ "ld1rqb { z2.b }, p0/Z, [x23]\n"
+ "add x24, x24, #0x10\n"
"sdot z12.s, z6.b, z1.b[0]\n"
- "ld1rqb { z3.b }, p0/Z, [x24]\n"
- "add x26, x26, #0x10\n"
+ "ld1rqb { z3.b }, p0/Z, [x22]\n"
+ "add x23, x23, #0x10\n"
"sdot z16.s, z6.b, z2.b[0]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
- "add x24, x24, #0x10\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x22, x22, #0x10\n"
"sdot z13.s, z7.b, z1.b[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "cmp x11, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "cmp x26, #0x10\n"
"sdot z20.s, z6.b, z3.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
"sdot z17.s, z7.b, z2.b[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
"sdot z21.s, z7.b, z3.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
+ "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
"sdot z10.s, z6.b, z0.b[0]\n"
"sdot z14.s, z6.b, z1.b[0]\n"
"sdot z18.s, z6.b, z2.b[0]\n"
"sdot z22.s, z6.b, z3.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x14, #4, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #4, MUL VL]\n"
"sdot z11.s, z7.b, z0.b[0]\n"
"sdot z15.s, z7.b, z1.b[0]\n"
"sdot z19.s, z7.b, z2.b[0]\n"
"sdot z23.s, z7.b, z3.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x14, #5, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x9, #5, MUL VL]\n"
"sdot z8.s, z6.b, z0.b[1]\n"
"sdot z12.s, z6.b, z1.b[1]\n"
"sdot z16.s, z6.b, z2.b[1]\n"
"sdot z20.s, z6.b, z3.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x14, #6, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #6, MUL VL]\n"
"sdot z9.s, z7.b, z0.b[1]\n"
"sdot z13.s, z7.b, z1.b[1]\n"
"sdot z17.s, z7.b, z2.b[1]\n"
"sdot z21.s, z7.b, z3.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x14, #7, MUL VL]\n"
- "addvl x14, x14, #16\n"
+ "ld1b { z7.b }, p5/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #16\n"
"sdot z10.s, z6.b, z0.b[1]\n"
"sdot z14.s, z6.b, z1.b[1]\n"
"sdot z18.s, z6.b, z2.b[1]\n"
"sdot z22.s, z6.b, z3.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x14, #-8, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-8, MUL VL]\n"
"sdot z11.s, z7.b, z0.b[1]\n"
"sdot z15.s, z7.b, z1.b[1]\n"
"sdot z19.s, z7.b, z2.b[1]\n"
"sdot z23.s, z7.b, z3.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x14, #-7, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-7, MUL VL]\n"
"sdot z8.s, z6.b, z0.b[2]\n"
"sdot z12.s, z6.b, z1.b[2]\n"
"sdot z16.s, z6.b, z2.b[2]\n"
"sdot z20.s, z6.b, z3.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x14, #-6, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-6, MUL VL]\n"
"sdot z9.s, z7.b, z0.b[2]\n"
"sdot z13.s, z7.b, z1.b[2]\n"
"sdot z17.s, z7.b, z2.b[2]\n"
"sdot z21.s, z7.b, z3.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x14, #-5, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-5, MUL VL]\n"
"sdot z10.s, z6.b, z0.b[2]\n"
"sdot z14.s, z6.b, z1.b[2]\n"
"sdot z18.s, z6.b, z2.b[2]\n"
"sdot z22.s, z6.b, z3.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x14, #-4, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-4, MUL VL]\n"
"sdot z11.s, z7.b, z0.b[2]\n"
"sdot z15.s, z7.b, z1.b[2]\n"
"sdot z19.s, z7.b, z2.b[2]\n"
"sdot z23.s, z7.b, z3.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x14, #-3, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-3, MUL VL]\n"
"sdot z8.s, z6.b, z0.b[3]\n"
"sdot z12.s, z6.b, z1.b[3]\n"
"sdot z16.s, z6.b, z2.b[3]\n"
"sdot z20.s, z6.b, z3.b[3]\n"
- "ld1b { z6.b }, p5/Z, [x14, #-2, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-2, MUL VL]\n"
"sdot z9.s, z7.b, z0.b[3]\n"
"sdot z13.s, z7.b, z1.b[3]\n"
"sdot z17.s, z7.b, z2.b[3]\n"
"sdot z21.s, z7.b, z3.b[3]\n"
- "ld1b { z7.b }, p5/Z, [x14, #-1, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-1, MUL VL]\n"
"sdot z10.s, z6.b, z0.b[3]\n"
"sdot z14.s, z6.b, z1.b[3]\n"
"sdot z18.s, z6.b, z2.b[3]\n"
@@ -901,31 +870,31 @@ void sve_hybrid_s8s32_dot_6x4VL (
"sdot z15.s, z7.b, z1.b[3]\n"
"sdot z19.s, z7.b, z2.b[3]\n"
"sdot z23.s, z7.b, z3.b[3]\n"
- "bgt 45b\n"
- "46:" // Height 4: Multiply loop: Single iteration only
- "ld1b { z6.b }, p5/Z, [x14]\n"
- "whilelt p0.b, XZR, x11\n"
- "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
- "subs x11, x11, #0x4\n"
- "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "bgt 41b\n"
+ "42:" // Height 4: Multiply loop: Single iteration only
+ "ld1b { z6.b }, p5/Z, [x9]\n"
+ "whilelt p0.b, XZR, x26\n"
+ "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
+ "subs x26, x26, #0x4\n"
+ "ld1rqb { z0.b }, p0/Z, [x25]\n"
"sdot z8.s, z6.b, z0.b[0]\n"
- "ld1rqb { z1.b }, p0/Z, [x28]\n"
- "add x10, x10, #0x10\n"
+ "ld1rqb { z1.b }, p0/Z, [x24]\n"
+ "add x25, x25, #0x10\n"
"sdot z9.s, z7.b, z0.b[0]\n"
- "ld1rqb { z2.b }, p0/Z, [x26]\n"
- "add x28, x28, #0x10\n"
+ "ld1rqb { z2.b }, p0/Z, [x23]\n"
+ "add x24, x24, #0x10\n"
"sdot z12.s, z6.b, z1.b[0]\n"
- "ld1rqb { z3.b }, p0/Z, [x24]\n"
- "add x26, x26, #0x10\n"
+ "ld1rqb { z3.b }, p0/Z, [x22]\n"
+ "add x23, x23, #0x10\n"
"sdot z16.s, z6.b, z2.b[0]\n"
- "add x24, x24, #0x10\n"
+ "add x22, x22, #0x10\n"
"sdot z13.s, z7.b, z1.b[0]\n"
"sdot z17.s, z7.b, z2.b[0]\n"
"sdot z20.s, z6.b, z3.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
"sdot z21.s, z7.b, z3.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
+ "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
"sdot z10.s, z6.b, z0.b[0]\n"
"sdot z14.s, z6.b, z1.b[0]\n"
"sdot z18.s, z6.b, z2.b[0]\n"
@@ -934,21 +903,21 @@ void sve_hybrid_s8s32_dot_6x4VL (
"sdot z15.s, z7.b, z1.b[0]\n"
"sdot z19.s, z7.b, z2.b[0]\n"
"sdot z23.s, z7.b, z3.b[0]\n"
- "ble 47f\n"
- "ld1b { z6.b }, p5/Z, [x14]\n"
+ "ble 43f\n"
+ "ld1b { z6.b }, p5/Z, [x9]\n"
"sdot z8.s, z6.b, z0.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
- "subs x11, x11, #0x4\n"
+ "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
+ "subs x26, x26, #0x4\n"
"sdot z12.s, z6.b, z1.b[1]\n"
"sdot z16.s, z6.b, z2.b[1]\n"
"sdot z20.s, z6.b, z3.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
"sdot z9.s, z7.b, z0.b[1]\n"
"sdot z13.s, z7.b, z1.b[1]\n"
"sdot z17.s, z7.b, z2.b[1]\n"
"sdot z21.s, z7.b, z3.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
+ "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
"sdot z10.s, z6.b, z0.b[1]\n"
"sdot z14.s, z6.b, z1.b[1]\n"
"sdot z18.s, z6.b, z2.b[1]\n"
@@ -957,21 +926,21 @@ void sve_hybrid_s8s32_dot_6x4VL (
"sdot z15.s, z7.b, z1.b[1]\n"
"sdot z19.s, z7.b, z2.b[1]\n"
"sdot z23.s, z7.b, z3.b[1]\n"
- "ble 47f\n"
- "ld1b { z6.b }, p5/Z, [x14]\n"
+ "ble 43f\n"
+ "ld1b { z6.b }, p5/Z, [x9]\n"
"sdot z8.s, z6.b, z0.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
- "subs x11, x11, #0x4\n"
+ "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
+ "subs x26, x26, #0x4\n"
"sdot z12.s, z6.b, z1.b[2]\n"
"sdot z16.s, z6.b, z2.b[2]\n"
"sdot z20.s, z6.b, z3.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
"sdot z9.s, z7.b, z0.b[2]\n"
"sdot z13.s, z7.b, z1.b[2]\n"
"sdot z17.s, z7.b, z2.b[2]\n"
"sdot z21.s, z7.b, z3.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
+ "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
"sdot z10.s, z6.b, z0.b[2]\n"
"sdot z14.s, z6.b, z1.b[2]\n"
"sdot z18.s, z6.b, z2.b[2]\n"
@@ -980,20 +949,20 @@ void sve_hybrid_s8s32_dot_6x4VL (
"sdot z15.s, z7.b, z1.b[2]\n"
"sdot z19.s, z7.b, z2.b[2]\n"
"sdot z23.s, z7.b, z3.b[2]\n"
- "ble 47f\n"
- "ld1b { z6.b }, p5/Z, [x14]\n"
+ "ble 43f\n"
+ "ld1b { z6.b }, p5/Z, [x9]\n"
"sdot z8.s, z6.b, z0.b[3]\n"
- "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
"sdot z12.s, z6.b, z1.b[3]\n"
"sdot z16.s, z6.b, z2.b[3]\n"
"sdot z20.s, z6.b, z3.b[3]\n"
- "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
"sdot z9.s, z7.b, z0.b[3]\n"
"sdot z13.s, z7.b, z1.b[3]\n"
"sdot z17.s, z7.b, z2.b[3]\n"
"sdot z21.s, z7.b, z3.b[3]\n"
- "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
+ "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
"sdot z10.s, z6.b, z0.b[3]\n"
"sdot z14.s, z6.b, z1.b[3]\n"
"sdot z18.s, z6.b, z2.b[3]\n"
@@ -1002,94 +971,82 @@ void sve_hybrid_s8s32_dot_6x4VL (
"sdot z15.s, z7.b, z1.b[3]\n"
"sdot z19.s, z7.b, z2.b[3]\n"
"sdot z23.s, z7.b, z3.b[3]\n"
- "47:" // Height 4: Multiply loop: multiply skip
- "prfm pldl1keep, [x10, #0x80]\n"
- "add x12, x12, #0x1\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "43:" // Height 4: Multiply loop: multiply skip
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x27, x27, #0x1\n"
"prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x12, x19\n"
- "bne 42b\n"
- "st1w { z8.s }, p4, [x13]\n"
- "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
- "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
- "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
- "addvl x13, x13, #4\n"
- "st1w { z12.s }, p4, [x9]\n"
- "st1w { z13.s }, p3, [x9, #1, MUL VL]\n"
- "st1w { z14.s }, p2, [x9, #2, MUL VL]\n"
- "st1w { z15.s }, p1, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
- "st1w { z16.s }, p4, [x27]\n"
- "st1w { z17.s }, p3, [x27, #1, MUL VL]\n"
- "st1w { z18.s }, p2, [x27, #2, MUL VL]\n"
- "st1w { z19.s }, p1, [x27, #3, MUL VL]\n"
- "addvl x27, x27, #4\n"
- "st1w { z20.s }, p4, [x25]\n"
- "st1w { z21.s }, p3, [x25, #1, MUL VL]\n"
- "st1w { z22.s }, p2, [x25, #2, MUL VL]\n"
- "st1w { z23.s }, p1, [x25, #3, MUL VL]\n"
- "addvl x25, x25, #4\n"
- "48:" // Height 4: Writeback done
- "decw x15, ALL, MUL #4\n"
- "cmp x15, XZR\n"
- "bgt 39b\n"
- "b 74f\n"
- "49:" // Height 5
- "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "cmp x27, x19\n"
+ "bne 38b\n"
"ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "tbz %x[flags], #2, 50f\n"
- "ldr x13, [%x[output_ptr], #0x0]\n"
- "add x13, x13, x19, LSL #2\n"
- "ldr x9, [%x[output_ptr], #0x8]\n"
- "ldr x27, [%x[output_ptr], #0x10]\n"
- "add x9, x9, x19, LSL #2\n"
- "ldr x25, [%x[output_ptr], #0x18]\n"
- "ldr x23, [%x[output_ptr], #0x20]\n"
- "add x27, x27, x19, LSL #2\n"
- "add x25, x25, x19, LSL #2\n"
- "add x23, x23, x19, LSL #2\n"
- "b 51f\n"
- "50:" // Height 5: setup direct output
- "mov x13, %x[output_ptr]\n"
- "add x9, x13, x19, LSL #2\n"
- "add x27, x9, x19, LSL #2\n"
- "add x25, x27, x19, LSL #2\n"
- "add x23, x25, x19, LSL #2\n"
- "51:" // Height 5: Column loop
+ "st1w { z8.s }, p4, [x28]\n"
+ "add x23, x28, x19, LSL #2\n"
+ "st1w { z9.s }, p3, [x28, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x28, #2, MUL VL]\n"
+ "add x22, x23, x19, LSL #2\n"
+ "st1w { z11.s }, p1, [x28, #3, MUL VL]\n"
+ "add x21, x22, x19, LSL #2\n"
+ "st1w { z12.s }, p4, [x23]\n"
+ "addvl x28, x28, #4\n"
+ "st1w { z13.s }, p3, [x23, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x23, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x23, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x22]\n"
+ "st1w { z17.s }, p3, [x22, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x22, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x22, #3, MUL VL]\n"
+ "st1w { z20.s }, p4, [x21]\n"
+ "st1w { z21.s }, p3, [x21, #1, MUL VL]\n"
+ "st1w { z22.s }, p2, [x21, #2, MUL VL]\n"
+ "st1w { z23.s }, p1, [x21, #3, MUL VL]\n"
+ "44:" // Height 4: Writeback done
+ "decw x10, ALL, MUL #4\n"
+ "cmp x10, XZR\n"
+ "bgt 35b\n"
+ "b 68f\n"
+ "45:" // Height 5
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "46:" // Height 5: Column loop
"mov x19, #0x0\n"
- "whilelt p4.s, x19, x15\n"
+ "whilelt p4.s, x19, x10\n"
"incw x19\n"
- "whilelt p3.s, x19, x15\n"
+ "whilelt p3.s, x19, x10\n"
"incw x19\n"
- "whilelt p2.s, x19, x15\n"
+ "whilelt p2.s, x19, x10\n"
"incw x19\n"
- "whilelt p1.s, x19, x15\n"
- "tbz %x[flags], #0, 52f\n"
- "ld1w { z8.s }, p4/Z, [x13]\n"
- "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
- "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
- "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
- "ld1w { z12.s }, p4/Z, [x9]\n"
- "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n"
- "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n"
- "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n"
- "ld1w { z16.s }, p4/Z, [x27]\n"
- "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n"
- "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n"
- "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n"
- "ld1w { z20.s }, p4/Z, [x25]\n"
- "ld1w { z21.s }, p3/Z, [x25, #1, MUL VL]\n"
- "ld1w { z22.s }, p2/Z, [x25, #2, MUL VL]\n"
- "ld1w { z23.s }, p1/Z, [x25, #3, MUL VL]\n"
- "ld1w { z24.s }, p4/Z, [x23]\n"
- "ld1w { z25.s }, p3/Z, [x23, #1, MUL VL]\n"
- "ld1w { z26.s }, p2/Z, [x23, #2, MUL VL]\n"
- "ld1w { z27.s }, p1/Z, [x23, #3, MUL VL]\n"
- "b 53f\n"
- "52:" // Height 5: no accumulate
+ "whilelt p1.s, x19, x10\n"
+ "tbz %x[flags], #0, 47f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ld1w { z8.s }, p4/Z, [x28]\n"
+ "add x23, x28, x19, LSL #2\n"
+ "ld1w { z9.s }, p3/Z, [x28, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x28, #2, MUL VL]\n"
+ "add x22, x23, x19, LSL #2\n"
+ "ld1w { z11.s }, p1/Z, [x28, #3, MUL VL]\n"
+ "add x21, x22, x19, LSL #2\n"
+ "ld1w { z12.s }, p4/Z, [x23]\n"
+ "add x20, x21, x19, LSL #2\n"
+ "ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x22]\n"
+ "ld1w { z17.s }, p3/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x21]\n"
+ "ld1w { z21.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z23.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z24.s }, p4/Z, [x20]\n"
+ "ld1w { z25.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z26.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z27.s }, p1/Z, [x20, #3, MUL VL]\n"
+ "b 48f\n"
+ "47:" // Height 5: no accumulate
"mov z8.s, #0x0\n"
"mov z9.s, #0x0\n"
"mov z10.s, #0x0\n"
@@ -1110,143 +1067,143 @@ void sve_hybrid_s8s32_dot_6x4VL (
"mov z25.s, #0x0\n"
"mov z26.s, #0x0\n"
"mov z27.s, #0x0\n"
- "53:" // Height 5: setup done
- "mov x12, #0x0\n"
- "54:" // Height 5: String loop
+ "48:" // Height 5: setup done
+ "mov x27, #0x0\n"
+ "49:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w11, [x20, x12, LSL #0x2]\n"
- "tbz %x[flags], #3, 55f\n"
- "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 50f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x10, [x20, #0x0]\n"
- "ldr x28, [x20, #0x8]\n"
- "ldr x26, [x20, #0x10]\n"
- "ldr x24, [x20, #0x18]\n"
- "ldr x22, [x20, #0x20]\n"
- "cbnz x12, 56f\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "ldr x22, [x20, #0x18]\n"
+ "ldr x21, [x20, #0x20]\n"
+ "cbnz x27, 51f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x10, x10, x19\n"
- "add x28, x28, x19\n"
- "add x26, x26, x19\n"
+ "add x25, x25, x19\n"
"add x24, x24, x19\n"
+ "add x23, x23, x19\n"
"add x22, x22, x19\n"
- "b 56f\n"
- "55:" // Height 5: setup direct input
- "mov x10, %x[input_ptr]\n"
- "add x28, x10, x19\n"
- "add x26, x28, x19\n"
- "add x24, x26, x19\n"
- "add x22, x24, x19\n"
- "56:" // Height 5: input setup done
- "cmp x11, #0x10\n"
- "ble 58f\n"
- "57:" // Height 5: Multiply loop: Main loop head
- "ld1b { z6.b }, p5/Z, [x14]\n"
- "whilelt p0.b, XZR, x11\n"
- "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
- "sub x11, x11, #0x10\n"
- "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "add x21, x21, x19\n"
+ "b 51f\n"
+ "50:" // Height 5: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19\n"
+ "add x23, x24, x19\n"
+ "add x22, x23, x19\n"
+ "add x21, x22, x19\n"
+ "51:" // Height 5: input setup done
+ "cmp x26, #0x10\n"
+ "ble 53f\n"
+ "52:" // Height 5: Multiply loop: Main loop head
+ "ld1b { z6.b }, p5/Z, [x9]\n"
+ "whilelt p0.b, XZR, x26\n"
+ "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
+ "sub x26, x26, #0x10\n"
+ "ld1rqb { z0.b }, p0/Z, [x25]\n"
"sdot z8.s, z6.b, z0.b[0]\n"
- "ld1rqb { z1.b }, p0/Z, [x28]\n"
- "add x10, x10, #0x10\n"
+ "ld1rqb { z1.b }, p0/Z, [x24]\n"
+ "add x25, x25, #0x10\n"
"sdot z9.s, z7.b, z0.b[0]\n"
- "ld1rqb { z2.b }, p0/Z, [x26]\n"
- "add x28, x28, #0x10\n"
+ "ld1rqb { z2.b }, p0/Z, [x23]\n"
+ "add x24, x24, #0x10\n"
"sdot z12.s, z6.b, z1.b[0]\n"
- "ld1rqb { z3.b }, p0/Z, [x24]\n"
- "add x26, x26, #0x10\n"
+ "ld1rqb { z3.b }, p0/Z, [x22]\n"
+ "add x23, x23, #0x10\n"
"sdot z16.s, z6.b, z2.b[0]\n"
- "ld1rqb { z4.b }, p0/Z, [x22]\n"
- "add x24, x24, #0x10\n"
- "sdot z13.s, z7.b, z1.b[0]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "ld1rqb { z4.b }, p0/Z, [x21]\n"
"add x22, x22, #0x10\n"
+ "sdot z13.s, z7.b, z1.b[0]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x21, x21, #0x10\n"
"sdot z20.s, z6.b, z3.b[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "cmp x11, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "cmp x26, #0x10\n"
"sdot z24.s, z6.b, z4.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
"sdot z17.s, z7.b, z2.b[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
"sdot z21.s, z7.b, z3.b[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
"sdot z25.s, z7.b, z4.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
"sdot z10.s, z6.b, z0.b[0]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
"sdot z14.s, z6.b, z1.b[0]\n"
"sdot z18.s, z6.b, z2.b[0]\n"
"sdot z22.s, z6.b, z3.b[0]\n"
"sdot z26.s, z6.b, z4.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x14, #4, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #4, MUL VL]\n"
"sdot z11.s, z7.b, z0.b[0]\n"
"sdot z15.s, z7.b, z1.b[0]\n"
"sdot z19.s, z7.b, z2.b[0]\n"
"sdot z23.s, z7.b, z3.b[0]\n"
"sdot z27.s, z7.b, z4.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x14, #5, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x9, #5, MUL VL]\n"
"sdot z8.s, z6.b, z0.b[1]\n"
"sdot z12.s, z6.b, z1.b[1]\n"
"sdot z16.s, z6.b, z2.b[1]\n"
"sdot z20.s, z6.b, z3.b[1]\n"
"sdot z24.s, z6.b, z4.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x14, #6, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #6, MUL VL]\n"
"sdot z9.s, z7.b, z0.b[1]\n"
"sdot z13.s, z7.b, z1.b[1]\n"
"sdot z17.s, z7.b, z2.b[1]\n"
"sdot z21.s, z7.b, z3.b[1]\n"
"sdot z25.s, z7.b, z4.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x14, #7, MUL VL]\n"
- "addvl x14, x14, #16\n"
+ "ld1b { z7.b }, p5/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #16\n"
"sdot z10.s, z6.b, z0.b[1]\n"
"sdot z14.s, z6.b, z1.b[1]\n"
"sdot z18.s, z6.b, z2.b[1]\n"
"sdot z22.s, z6.b, z3.b[1]\n"
"sdot z26.s, z6.b, z4.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x14, #-8, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-8, MUL VL]\n"
"sdot z11.s, z7.b, z0.b[1]\n"
"sdot z15.s, z7.b, z1.b[1]\n"
"sdot z19.s, z7.b, z2.b[1]\n"
"sdot z23.s, z7.b, z3.b[1]\n"
"sdot z27.s, z7.b, z4.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x14, #-7, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-7, MUL VL]\n"
"sdot z8.s, z6.b, z0.b[2]\n"
"sdot z12.s, z6.b, z1.b[2]\n"
"sdot z16.s, z6.b, z2.b[2]\n"
"sdot z20.s, z6.b, z3.b[2]\n"
"sdot z24.s, z6.b, z4.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x14, #-6, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-6, MUL VL]\n"
"sdot z9.s, z7.b, z0.b[2]\n"
"sdot z13.s, z7.b, z1.b[2]\n"
"sdot z17.s, z7.b, z2.b[2]\n"
"sdot z21.s, z7.b, z3.b[2]\n"
"sdot z25.s, z7.b, z4.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x14, #-5, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-5, MUL VL]\n"
"sdot z10.s, z6.b, z0.b[2]\n"
"sdot z14.s, z6.b, z1.b[2]\n"
"sdot z18.s, z6.b, z2.b[2]\n"
"sdot z22.s, z6.b, z3.b[2]\n"
"sdot z26.s, z6.b, z4.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x14, #-4, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-4, MUL VL]\n"
"sdot z11.s, z7.b, z0.b[2]\n"
"sdot z15.s, z7.b, z1.b[2]\n"
"sdot z19.s, z7.b, z2.b[2]\n"
"sdot z23.s, z7.b, z3.b[2]\n"
"sdot z27.s, z7.b, z4.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x14, #-3, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-3, MUL VL]\n"
"sdot z8.s, z6.b, z0.b[3]\n"
"sdot z12.s, z6.b, z1.b[3]\n"
"sdot z16.s, z6.b, z2.b[3]\n"
"sdot z20.s, z6.b, z3.b[3]\n"
"sdot z24.s, z6.b, z4.b[3]\n"
- "ld1b { z6.b }, p5/Z, [x14, #-2, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-2, MUL VL]\n"
"sdot z9.s, z7.b, z0.b[3]\n"
"sdot z13.s, z7.b, z1.b[3]\n"
"sdot z17.s, z7.b, z2.b[3]\n"
"sdot z21.s, z7.b, z3.b[3]\n"
"sdot z25.s, z7.b, z4.b[3]\n"
- "ld1b { z7.b }, p5/Z, [x14, #-1, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-1, MUL VL]\n"
"sdot z10.s, z6.b, z0.b[3]\n"
"sdot z14.s, z6.b, z1.b[3]\n"
"sdot z18.s, z6.b, z2.b[3]\n"
@@ -1257,35 +1214,35 @@ void sve_hybrid_s8s32_dot_6x4VL (
"sdot z19.s, z7.b, z2.b[3]\n"
"sdot z23.s, z7.b, z3.b[3]\n"
"sdot z27.s, z7.b, z4.b[3]\n"
- "bgt 57b\n"
- "58:" // Height 5: Multiply loop: Single iteration only
- "ld1b { z6.b }, p5/Z, [x14]\n"
- "whilelt p0.b, XZR, x11\n"
- "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
- "subs x11, x11, #0x4\n"
- "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "bgt 52b\n"
+ "53:" // Height 5: Multiply loop: Single iteration only
+ "ld1b { z6.b }, p5/Z, [x9]\n"
+ "whilelt p0.b, XZR, x26\n"
+ "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
+ "subs x26, x26, #0x4\n"
+ "ld1rqb { z0.b }, p0/Z, [x25]\n"
"sdot z8.s, z6.b, z0.b[0]\n"
- "ld1rqb { z1.b }, p0/Z, [x28]\n"
- "add x10, x10, #0x10\n"
+ "ld1rqb { z1.b }, p0/Z, [x24]\n"
+ "add x25, x25, #0x10\n"
"sdot z9.s, z7.b, z0.b[0]\n"
- "ld1rqb { z2.b }, p0/Z, [x26]\n"
- "add x28, x28, #0x10\n"
+ "ld1rqb { z2.b }, p0/Z, [x23]\n"
+ "add x24, x24, #0x10\n"
"sdot z12.s, z6.b, z1.b[0]\n"
- "ld1rqb { z3.b }, p0/Z, [x24]\n"
- "add x26, x26, #0x10\n"
+ "ld1rqb { z3.b }, p0/Z, [x22]\n"
+ "add x23, x23, #0x10\n"
"sdot z16.s, z6.b, z2.b[0]\n"
- "ld1rqb { z4.b }, p0/Z, [x22]\n"
- "add x24, x24, #0x10\n"
- "sdot z13.s, z7.b, z1.b[0]\n"
+ "ld1rqb { z4.b }, p0/Z, [x21]\n"
"add x22, x22, #0x10\n"
+ "sdot z13.s, z7.b, z1.b[0]\n"
+ "add x21, x21, #0x10\n"
"sdot z17.s, z7.b, z2.b[0]\n"
"sdot z20.s, z6.b, z3.b[0]\n"
"sdot z24.s, z6.b, z4.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
"sdot z21.s, z7.b, z3.b[0]\n"
"sdot z25.s, z7.b, z4.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
+ "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
"sdot z10.s, z6.b, z0.b[0]\n"
"sdot z14.s, z6.b, z1.b[0]\n"
"sdot z18.s, z6.b, z2.b[0]\n"
@@ -1296,23 +1253,23 @@ void sve_hybrid_s8s32_dot_6x4VL (
"sdot z19.s, z7.b, z2.b[0]\n"
"sdot z23.s, z7.b, z3.b[0]\n"
"sdot z27.s, z7.b, z4.b[0]\n"
- "ble 59f\n"
- "ld1b { z6.b }, p5/Z, [x14]\n"
+ "ble 54f\n"
+ "ld1b { z6.b }, p5/Z, [x9]\n"
"sdot z8.s, z6.b, z0.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
- "subs x11, x11, #0x4\n"
+ "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
+ "subs x26, x26, #0x4\n"
"sdot z12.s, z6.b, z1.b[1]\n"
"sdot z16.s, z6.b, z2.b[1]\n"
"sdot z20.s, z6.b, z3.b[1]\n"
"sdot z24.s, z6.b, z4.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
"sdot z9.s, z7.b, z0.b[1]\n"
"sdot z13.s, z7.b, z1.b[1]\n"
"sdot z17.s, z7.b, z2.b[1]\n"
"sdot z21.s, z7.b, z3.b[1]\n"
"sdot z25.s, z7.b, z4.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
+ "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
"sdot z10.s, z6.b, z0.b[1]\n"
"sdot z14.s, z6.b, z1.b[1]\n"
"sdot z18.s, z6.b, z2.b[1]\n"
@@ -1323,23 +1280,23 @@ void sve_hybrid_s8s32_dot_6x4VL (
"sdot z19.s, z7.b, z2.b[1]\n"
"sdot z23.s, z7.b, z3.b[1]\n"
"sdot z27.s, z7.b, z4.b[1]\n"
- "ble 59f\n"
- "ld1b { z6.b }, p5/Z, [x14]\n"
+ "ble 54f\n"
+ "ld1b { z6.b }, p5/Z, [x9]\n"
"sdot z8.s, z6.b, z0.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
- "subs x11, x11, #0x4\n"
+ "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
+ "subs x26, x26, #0x4\n"
"sdot z12.s, z6.b, z1.b[2]\n"
"sdot z16.s, z6.b, z2.b[2]\n"
"sdot z20.s, z6.b, z3.b[2]\n"
"sdot z24.s, z6.b, z4.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
"sdot z9.s, z7.b, z0.b[2]\n"
"sdot z13.s, z7.b, z1.b[2]\n"
"sdot z17.s, z7.b, z2.b[2]\n"
"sdot z21.s, z7.b, z3.b[2]\n"
"sdot z25.s, z7.b, z4.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
+ "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
"sdot z10.s, z6.b, z0.b[2]\n"
"sdot z14.s, z6.b, z1.b[2]\n"
"sdot z18.s, z6.b, z2.b[2]\n"
@@ -1350,22 +1307,22 @@ void sve_hybrid_s8s32_dot_6x4VL (
"sdot z19.s, z7.b, z2.b[2]\n"
"sdot z23.s, z7.b, z3.b[2]\n"
"sdot z27.s, z7.b, z4.b[2]\n"
- "ble 59f\n"
- "ld1b { z6.b }, p5/Z, [x14]\n"
+ "ble 54f\n"
+ "ld1b { z6.b }, p5/Z, [x9]\n"
"sdot z8.s, z6.b, z0.b[3]\n"
- "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
"sdot z12.s, z6.b, z1.b[3]\n"
"sdot z16.s, z6.b, z2.b[3]\n"
"sdot z20.s, z6.b, z3.b[3]\n"
"sdot z24.s, z6.b, z4.b[3]\n"
- "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
"sdot z9.s, z7.b, z0.b[3]\n"
"sdot z13.s, z7.b, z1.b[3]\n"
"sdot z17.s, z7.b, z2.b[3]\n"
"sdot z21.s, z7.b, z3.b[3]\n"
"sdot z25.s, z7.b, z4.b[3]\n"
- "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
+ "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
"sdot z10.s, z6.b, z0.b[3]\n"
"sdot z14.s, z6.b, z1.b[3]\n"
"sdot z18.s, z6.b, z2.b[3]\n"
@@ -1376,109 +1333,96 @@ void sve_hybrid_s8s32_dot_6x4VL (
"sdot z19.s, z7.b, z2.b[3]\n"
"sdot z23.s, z7.b, z3.b[3]\n"
"sdot z27.s, z7.b, z4.b[3]\n"
- "59:" // Height 5: Multiply loop: multiply skip
- "prfm pldl1keep, [x10, #0x80]\n"
- "add x12, x12, #0x1\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "54:" // Height 5: Multiply loop: multiply skip
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x27, x27, #0x1\n"
"prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
"prfm pldl1keep, [x22, #0x80]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x12, x19\n"
- "bne 54b\n"
- "st1w { z8.s }, p4, [x13]\n"
- "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
- "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
- "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
- "addvl x13, x13, #4\n"
- "st1w { z12.s }, p4, [x9]\n"
- "st1w { z13.s }, p3, [x9, #1, MUL VL]\n"
- "st1w { z14.s }, p2, [x9, #2, MUL VL]\n"
- "st1w { z15.s }, p1, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
- "st1w { z16.s }, p4, [x27]\n"
- "st1w { z17.s }, p3, [x27, #1, MUL VL]\n"
- "st1w { z18.s }, p2, [x27, #2, MUL VL]\n"
- "st1w { z19.s }, p1, [x27, #3, MUL VL]\n"
- "addvl x27, x27, #4\n"
- "st1w { z20.s }, p4, [x25]\n"
- "st1w { z21.s }, p3, [x25, #1, MUL VL]\n"
- "st1w { z22.s }, p2, [x25, #2, MUL VL]\n"
- "st1w { z23.s }, p1, [x25, #3, MUL VL]\n"
- "addvl x25, x25, #4\n"
- "st1w { z24.s }, p4, [x23]\n"
- "st1w { z25.s }, p3, [x23, #1, MUL VL]\n"
- "st1w { z26.s }, p2, [x23, #2, MUL VL]\n"
- "st1w { z27.s }, p1, [x23, #3, MUL VL]\n"
- "addvl x23, x23, #4\n"
- "60:" // Height 5: Writeback done
- "decw x15, ALL, MUL #4\n"
- "cmp x15, XZR\n"
- "bgt 51b\n"
- "b 74f\n"
- "61:" // Height 6
- "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "cmp x27, x19\n"
+ "bne 49b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "st1w { z8.s }, p4, [x28]\n"
+ "add x23, x28, x19, LSL #2\n"
+ "st1w { z9.s }, p3, [x28, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x28, #2, MUL VL]\n"
+ "add x22, x23, x19, LSL #2\n"
+ "st1w { z11.s }, p1, [x28, #3, MUL VL]\n"
+ "add x21, x22, x19, LSL #2\n"
+ "st1w { z12.s }, p4, [x23]\n"
+ "add x20, x21, x19, LSL #2\n"
+ "st1w { z13.s }, p3, [x23, #1, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "st1w { z14.s }, p2, [x23, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x23, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x22]\n"
+ "st1w { z17.s }, p3, [x22, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x22, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x22, #3, MUL VL]\n"
+ "st1w { z20.s }, p4, [x21]\n"
+ "st1w { z21.s }, p3, [x21, #1, MUL VL]\n"
+ "st1w { z22.s }, p2, [x21, #2, MUL VL]\n"
+ "st1w { z23.s }, p1, [x21, #3, MUL VL]\n"
+ "st1w { z24.s }, p4, [x20]\n"
+ "st1w { z25.s }, p3, [x20, #1, MUL VL]\n"
+ "st1w { z26.s }, p2, [x20, #2, MUL VL]\n"
+ "st1w { z27.s }, p1, [x20, #3, MUL VL]\n"
+ "55:" // Height 5: Writeback done
+ "decw x10, ALL, MUL #4\n"
+ "cmp x10, XZR\n"
+ "bgt 46b\n"
+ "b 68f\n"
+ "56:" // Height 6
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x20, #0x18\n"
"ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "tbz %x[flags], #2, 62f\n"
- "ldr x13, [%x[output_ptr], #0x0]\n"
- "add x13, x13, x19, LSL #2\n"
- "ldr x9, [%x[output_ptr], #0x8]\n"
- "ldr x27, [%x[output_ptr], #0x10]\n"
- "add x9, x9, x19, LSL #2\n"
- "ldr x25, [%x[output_ptr], #0x18]\n"
- "ldr x23, [%x[output_ptr], #0x20]\n"
- "add x27, x27, x19, LSL #2\n"
- "ldr x21, [%x[output_ptr], #0x28]\n"
- "add %x[output_ptr], %x[output_ptr], #0x30\n"
- "add x25, x25, x19, LSL #2\n"
- "add x23, x23, x19, LSL #2\n"
- "add x21, x21, x19, LSL #2\n"
- "b 63f\n"
- "62:" // Height 6: setup direct output
- "mov x13, %x[output_ptr]\n"
- "add x9, x13, x19, LSL #2\n"
- "add x27, x9, x19, LSL #2\n"
- "add x25, x27, x19, LSL #2\n"
- "add x23, x25, x19, LSL #2\n"
- "add x21, x23, x19, LSL #2\n"
- "add %x[output_ptr], x21, x19, LSL #2\n"
- "63:" // Height 6: Column loop
+ "madd %x[output_ptr], x19, x20, %x[output_ptr]\n"
+ "57:" // Height 6: Column loop
"mov x19, #0x0\n"
- "whilelt p4.s, x19, x15\n"
+ "whilelt p4.s, x19, x10\n"
"incw x19\n"
- "whilelt p3.s, x19, x15\n"
+ "whilelt p3.s, x19, x10\n"
"incw x19\n"
- "whilelt p2.s, x19, x15\n"
+ "whilelt p2.s, x19, x10\n"
"incw x19\n"
- "whilelt p1.s, x19, x15\n"
- "tbz %x[flags], #0, 64f\n"
- "ld1w { z8.s }, p4/Z, [x13]\n"
- "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
- "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
- "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
- "ld1w { z12.s }, p4/Z, [x9]\n"
- "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n"
- "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n"
- "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n"
- "ld1w { z16.s }, p4/Z, [x27]\n"
- "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n"
- "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n"
- "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n"
- "ld1w { z20.s }, p4/Z, [x25]\n"
- "ld1w { z21.s }, p3/Z, [x25, #1, MUL VL]\n"
- "ld1w { z22.s }, p2/Z, [x25, #2, MUL VL]\n"
- "ld1w { z23.s }, p1/Z, [x25, #3, MUL VL]\n"
- "ld1w { z24.s }, p4/Z, [x23]\n"
- "ld1w { z25.s }, p3/Z, [x23, #1, MUL VL]\n"
- "ld1w { z26.s }, p2/Z, [x23, #2, MUL VL]\n"
- "ld1w { z27.s }, p1/Z, [x23, #3, MUL VL]\n"
- "ld1w { z28.s }, p4/Z, [x21]\n"
- "ld1w { z29.s }, p3/Z, [x21, #1, MUL VL]\n"
- "ld1w { z30.s }, p2/Z, [x21, #2, MUL VL]\n"
- "ld1w { z31.s }, p1/Z, [x21, #3, MUL VL]\n"
- "b 65f\n"
- "64:" // Height 6: no accumulate
+ "whilelt p1.s, x19, x10\n"
+ "tbz %x[flags], #0, 58f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ld1w { z8.s }, p4/Z, [x28]\n"
+ "add x23, x28, x19, LSL #2\n"
+ "ld1w { z9.s }, p3/Z, [x28, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x28, #2, MUL VL]\n"
+ "add x22, x23, x19, LSL #2\n"
+ "ld1w { z11.s }, p1/Z, [x28, #3, MUL VL]\n"
+ "add x21, x22, x19, LSL #2\n"
+ "ld1w { z12.s }, p4/Z, [x23]\n"
+ "add x20, x21, x19, LSL #2\n"
+ "ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n"
+ "add x19, x20, x19, LSL #2\n"
+ "ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x22]\n"
+ "ld1w { z17.s }, p3/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x21]\n"
+ "ld1w { z21.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z23.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z24.s }, p4/Z, [x20]\n"
+ "ld1w { z25.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z26.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z27.s }, p1/Z, [x20, #3, MUL VL]\n"
+ "ld1w { z28.s }, p4/Z, [x19]\n"
+ "ld1w { z29.s }, p3/Z, [x19, #1, MUL VL]\n"
+ "ld1w { z30.s }, p2/Z, [x19, #2, MUL VL]\n"
+ "ld1w { z31.s }, p1/Z, [x19, #3, MUL VL]\n"
+ "b 59f\n"
+ "58:" // Height 6: no accumulate
"mov z8.s, #0x0\n"
"mov z9.s, #0x0\n"
"mov z10.s, #0x0\n"
@@ -1503,77 +1447,77 @@ void sve_hybrid_s8s32_dot_6x4VL (
"mov z29.s, #0x0\n"
"mov z30.s, #0x0\n"
"mov z31.s, #0x0\n"
- "65:" // Height 6: setup done
- "mov x12, #0x0\n"
- "66:" // Height 6: String loop
+ "59:" // Height 6: setup done
+ "mov x27, #0x0\n"
+ "60:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w11, [x20, x12, LSL #0x2]\n"
- "tbz %x[flags], #3, 67f\n"
- "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 61f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x10, [x20, #0x0]\n"
- "ldr x28, [x20, #0x8]\n"
- "ldr x26, [x20, #0x10]\n"
- "ldr x24, [x20, #0x18]\n"
- "ldr x22, [x20, #0x20]\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "ldr x22, [x20, #0x18]\n"
+ "ldr x21, [x20, #0x20]\n"
"ldr x20, [x20, #0x28]\n"
- "cbnz x12, 68f\n"
+ "cbnz x27, 62f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x10, x10, x19\n"
- "add x28, x28, x19\n"
- "add x26, x26, x19\n"
+ "add x25, x25, x19\n"
"add x24, x24, x19\n"
+ "add x23, x23, x19\n"
"add x22, x22, x19\n"
+ "add x21, x21, x19\n"
"add x20, x20, x19\n"
- "b 68f\n"
- "67:" // Height 6: setup direct input
- "mov x10, %x[input_ptr]\n"
- "add x28, x10, x19\n"
- "add x26, x28, x19\n"
- "add x24, x26, x19\n"
- "add x22, x24, x19\n"
- "add x20, x22, x19\n"
- "68:" // Height 6: input setup done
- "cmp x11, #0x10\n"
- "ble 70f\n"
- "69:" // Height 6: Multiply loop: Main loop head
- "ld1b { z6.b }, p5/Z, [x14]\n"
- "whilelt p0.b, XZR, x11\n"
- "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
- "sub x11, x11, #0x10\n"
- "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "b 62f\n"
+ "61:" // Height 6: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19\n"
+ "add x23, x24, x19\n"
+ "add x22, x23, x19\n"
+ "add x21, x22, x19\n"
+ "add x20, x21, x19\n"
+ "62:" // Height 6: input setup done
+ "cmp x26, #0x10\n"
+ "ble 64f\n"
+ "63:" // Height 6: Multiply loop: Main loop head
+ "ld1b { z6.b }, p5/Z, [x9]\n"
+ "whilelt p0.b, XZR, x26\n"
+ "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
+ "sub x26, x26, #0x10\n"
+ "ld1rqb { z0.b }, p0/Z, [x25]\n"
"sdot z8.s, z6.b, z0.b[0]\n"
- "ld1rqb { z1.b }, p0/Z, [x28]\n"
- "add x10, x10, #0x10\n"
+ "ld1rqb { z1.b }, p0/Z, [x24]\n"
+ "add x25, x25, #0x10\n"
"sdot z9.s, z7.b, z0.b[0]\n"
- "ld1rqb { z2.b }, p0/Z, [x26]\n"
- "add x28, x28, #0x10\n"
+ "ld1rqb { z2.b }, p0/Z, [x23]\n"
+ "add x24, x24, #0x10\n"
"sdot z12.s, z6.b, z1.b[0]\n"
- "ld1rqb { z3.b }, p0/Z, [x24]\n"
- "add x26, x26, #0x10\n"
+ "ld1rqb { z3.b }, p0/Z, [x22]\n"
+ "add x23, x23, #0x10\n"
"sdot z16.s, z6.b, z2.b[0]\n"
- "ld1rqb { z4.b }, p0/Z, [x22]\n"
- "add x24, x24, #0x10\n"
+ "ld1rqb { z4.b }, p0/Z, [x21]\n"
+ "add x22, x22, #0x10\n"
"sdot z13.s, z7.b, z1.b[0]\n"
"ld1rqb { z5.b }, p0/Z, [x20]\n"
- "add x22, x22, #0x10\n"
+ "add x21, x21, #0x10\n"
"sdot z20.s, z6.b, z3.b[0]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
"add x20, x20, #0x10\n"
"sdot z24.s, z6.b, z4.b[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "cmp x11, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "cmp x26, #0x10\n"
"sdot z28.s, z6.b, z5.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
"sdot z17.s, z7.b, z2.b[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
"sdot z21.s, z7.b, z3.b[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "sdot z25.s, z7.b, z4.b[0]\n"
"prfm pldl1keep, [x22, #0x80]\n"
+ "sdot z25.s, z7.b, z4.b[0]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
"sdot z29.s, z7.b, z5.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
"sdot z10.s, z6.b, z0.b[0]\n"
"prfm pldl1keep, [x20, #0x80]\n"
"sdot z14.s, z6.b, z1.b[0]\n"
@@ -1581,85 +1525,85 @@ void sve_hybrid_s8s32_dot_6x4VL (
"sdot z22.s, z6.b, z3.b[0]\n"
"sdot z26.s, z6.b, z4.b[0]\n"
"sdot z30.s, z6.b, z5.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x14, #4, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #4, MUL VL]\n"
"sdot z11.s, z7.b, z0.b[0]\n"
"sdot z15.s, z7.b, z1.b[0]\n"
"sdot z19.s, z7.b, z2.b[0]\n"
"sdot z23.s, z7.b, z3.b[0]\n"
"sdot z27.s, z7.b, z4.b[0]\n"
"sdot z31.s, z7.b, z5.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x14, #5, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x9, #5, MUL VL]\n"
"sdot z8.s, z6.b, z0.b[1]\n"
"sdot z12.s, z6.b, z1.b[1]\n"
"sdot z16.s, z6.b, z2.b[1]\n"
"sdot z20.s, z6.b, z3.b[1]\n"
"sdot z24.s, z6.b, z4.b[1]\n"
"sdot z28.s, z6.b, z5.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x14, #6, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #6, MUL VL]\n"
"sdot z9.s, z7.b, z0.b[1]\n"
"sdot z13.s, z7.b, z1.b[1]\n"
"sdot z17.s, z7.b, z2.b[1]\n"
"sdot z21.s, z7.b, z3.b[1]\n"
"sdot z25.s, z7.b, z4.b[1]\n"
"sdot z29.s, z7.b, z5.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x14, #7, MUL VL]\n"
- "addvl x14, x14, #16\n"
+ "ld1b { z7.b }, p5/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #16\n"
"sdot z10.s, z6.b, z0.b[1]\n"
"sdot z14.s, z6.b, z1.b[1]\n"
"sdot z18.s, z6.b, z2.b[1]\n"
"sdot z22.s, z6.b, z3.b[1]\n"
"sdot z26.s, z6.b, z4.b[1]\n"
"sdot z30.s, z6.b, z5.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x14, #-8, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-8, MUL VL]\n"
"sdot z11.s, z7.b, z0.b[1]\n"
"sdot z15.s, z7.b, z1.b[1]\n"
"sdot z19.s, z7.b, z2.b[1]\n"
"sdot z23.s, z7.b, z3.b[1]\n"
"sdot z27.s, z7.b, z4.b[1]\n"
"sdot z31.s, z7.b, z5.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x14, #-7, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-7, MUL VL]\n"
"sdot z8.s, z6.b, z0.b[2]\n"
"sdot z12.s, z6.b, z1.b[2]\n"
"sdot z16.s, z6.b, z2.b[2]\n"
"sdot z20.s, z6.b, z3.b[2]\n"
"sdot z24.s, z6.b, z4.b[2]\n"
"sdot z28.s, z6.b, z5.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x14, #-6, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-6, MUL VL]\n"
"sdot z9.s, z7.b, z0.b[2]\n"
"sdot z13.s, z7.b, z1.b[2]\n"
"sdot z17.s, z7.b, z2.b[2]\n"
"sdot z21.s, z7.b, z3.b[2]\n"
"sdot z25.s, z7.b, z4.b[2]\n"
"sdot z29.s, z7.b, z5.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x14, #-5, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-5, MUL VL]\n"
"sdot z10.s, z6.b, z0.b[2]\n"
"sdot z14.s, z6.b, z1.b[2]\n"
"sdot z18.s, z6.b, z2.b[2]\n"
"sdot z22.s, z6.b, z3.b[2]\n"
"sdot z26.s, z6.b, z4.b[2]\n"
"sdot z30.s, z6.b, z5.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x14, #-4, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-4, MUL VL]\n"
"sdot z11.s, z7.b, z0.b[2]\n"
"sdot z15.s, z7.b, z1.b[2]\n"
"sdot z19.s, z7.b, z2.b[2]\n"
"sdot z23.s, z7.b, z3.b[2]\n"
"sdot z27.s, z7.b, z4.b[2]\n"
"sdot z31.s, z7.b, z5.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x14, #-3, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-3, MUL VL]\n"
"sdot z8.s, z6.b, z0.b[3]\n"
"sdot z12.s, z6.b, z1.b[3]\n"
"sdot z16.s, z6.b, z2.b[3]\n"
"sdot z20.s, z6.b, z3.b[3]\n"
"sdot z24.s, z6.b, z4.b[3]\n"
"sdot z28.s, z6.b, z5.b[3]\n"
- "ld1b { z6.b }, p5/Z, [x14, #-2, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-2, MUL VL]\n"
"sdot z9.s, z7.b, z0.b[3]\n"
"sdot z13.s, z7.b, z1.b[3]\n"
"sdot z17.s, z7.b, z2.b[3]\n"
"sdot z21.s, z7.b, z3.b[3]\n"
"sdot z25.s, z7.b, z4.b[3]\n"
"sdot z29.s, z7.b, z5.b[3]\n"
- "ld1b { z7.b }, p5/Z, [x14, #-1, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-1, MUL VL]\n"
"sdot z10.s, z6.b, z0.b[3]\n"
"sdot z14.s, z6.b, z1.b[3]\n"
"sdot z18.s, z6.b, z2.b[3]\n"
@@ -1672,39 +1616,39 @@ void sve_hybrid_s8s32_dot_6x4VL (
"sdot z23.s, z7.b, z3.b[3]\n"
"sdot z27.s, z7.b, z4.b[3]\n"
"sdot z31.s, z7.b, z5.b[3]\n"
- "bgt 69b\n"
- "70:" // Height 6: Multiply loop: Single iteration only
- "ld1b { z6.b }, p5/Z, [x14]\n"
- "whilelt p0.b, XZR, x11\n"
- "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
- "subs x11, x11, #0x4\n"
- "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "bgt 63b\n"
+ "64:" // Height 6: Multiply loop: Single iteration only
+ "ld1b { z6.b }, p5/Z, [x9]\n"
+ "whilelt p0.b, XZR, x26\n"
+ "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
+ "subs x26, x26, #0x4\n"
+ "ld1rqb { z0.b }, p0/Z, [x25]\n"
"sdot z8.s, z6.b, z0.b[0]\n"
- "ld1rqb { z1.b }, p0/Z, [x28]\n"
- "add x10, x10, #0x10\n"
+ "ld1rqb { z1.b }, p0/Z, [x24]\n"
+ "add x25, x25, #0x10\n"
"sdot z9.s, z7.b, z0.b[0]\n"
- "ld1rqb { z2.b }, p0/Z, [x26]\n"
- "add x28, x28, #0x10\n"
+ "ld1rqb { z2.b }, p0/Z, [x23]\n"
+ "add x24, x24, #0x10\n"
"sdot z12.s, z6.b, z1.b[0]\n"
- "ld1rqb { z3.b }, p0/Z, [x24]\n"
- "add x26, x26, #0x10\n"
+ "ld1rqb { z3.b }, p0/Z, [x22]\n"
+ "add x23, x23, #0x10\n"
"sdot z16.s, z6.b, z2.b[0]\n"
- "ld1rqb { z4.b }, p0/Z, [x22]\n"
- "add x24, x24, #0x10\n"
+ "ld1rqb { z4.b }, p0/Z, [x21]\n"
+ "add x22, x22, #0x10\n"
"sdot z13.s, z7.b, z1.b[0]\n"
"ld1rqb { z5.b }, p0/Z, [x20]\n"
- "add x22, x22, #0x10\n"
+ "add x21, x21, #0x10\n"
"sdot z20.s, z6.b, z3.b[0]\n"
"add x20, x20, #0x10\n"
"sdot z17.s, z7.b, z2.b[0]\n"
"sdot z24.s, z6.b, z4.b[0]\n"
"sdot z28.s, z6.b, z5.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
"sdot z21.s, z7.b, z3.b[0]\n"
"sdot z25.s, z7.b, z4.b[0]\n"
"sdot z29.s, z7.b, z5.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
+ "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
"sdot z10.s, z6.b, z0.b[0]\n"
"sdot z14.s, z6.b, z1.b[0]\n"
"sdot z18.s, z6.b, z2.b[0]\n"
@@ -1717,25 +1661,25 @@ void sve_hybrid_s8s32_dot_6x4VL (
"sdot z23.s, z7.b, z3.b[0]\n"
"sdot z27.s, z7.b, z4.b[0]\n"
"sdot z31.s, z7.b, z5.b[0]\n"
- "ble 71f\n"
- "ld1b { z6.b }, p5/Z, [x14]\n"
+ "ble 65f\n"
+ "ld1b { z6.b }, p5/Z, [x9]\n"
"sdot z8.s, z6.b, z0.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
- "subs x11, x11, #0x4\n"
+ "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
+ "subs x26, x26, #0x4\n"
"sdot z12.s, z6.b, z1.b[1]\n"
"sdot z16.s, z6.b, z2.b[1]\n"
"sdot z20.s, z6.b, z3.b[1]\n"
"sdot z24.s, z6.b, z4.b[1]\n"
"sdot z28.s, z6.b, z5.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
"sdot z9.s, z7.b, z0.b[1]\n"
"sdot z13.s, z7.b, z1.b[1]\n"
"sdot z17.s, z7.b, z2.b[1]\n"
"sdot z21.s, z7.b, z3.b[1]\n"
"sdot z25.s, z7.b, z4.b[1]\n"
"sdot z29.s, z7.b, z5.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
+ "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
"sdot z10.s, z6.b, z0.b[1]\n"
"sdot z14.s, z6.b, z1.b[1]\n"
"sdot z18.s, z6.b, z2.b[1]\n"
@@ -1748,25 +1692,25 @@ void sve_hybrid_s8s32_dot_6x4VL (
"sdot z23.s, z7.b, z3.b[1]\n"
"sdot z27.s, z7.b, z4.b[1]\n"
"sdot z31.s, z7.b, z5.b[1]\n"
- "ble 71f\n"
- "ld1b { z6.b }, p5/Z, [x14]\n"
+ "ble 65f\n"
+ "ld1b { z6.b }, p5/Z, [x9]\n"
"sdot z8.s, z6.b, z0.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
- "subs x11, x11, #0x4\n"
+ "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
+ "subs x26, x26, #0x4\n"
"sdot z12.s, z6.b, z1.b[2]\n"
"sdot z16.s, z6.b, z2.b[2]\n"
"sdot z20.s, z6.b, z3.b[2]\n"
"sdot z24.s, z6.b, z4.b[2]\n"
"sdot z28.s, z6.b, z5.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
"sdot z9.s, z7.b, z0.b[2]\n"
"sdot z13.s, z7.b, z1.b[2]\n"
"sdot z17.s, z7.b, z2.b[2]\n"
"sdot z21.s, z7.b, z3.b[2]\n"
"sdot z25.s, z7.b, z4.b[2]\n"
"sdot z29.s, z7.b, z5.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
+ "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
"sdot z10.s, z6.b, z0.b[2]\n"
"sdot z14.s, z6.b, z1.b[2]\n"
"sdot z18.s, z6.b, z2.b[2]\n"
@@ -1779,24 +1723,24 @@ void sve_hybrid_s8s32_dot_6x4VL (
"sdot z23.s, z7.b, z3.b[2]\n"
"sdot z27.s, z7.b, z4.b[2]\n"
"sdot z31.s, z7.b, z5.b[2]\n"
- "ble 71f\n"
- "ld1b { z6.b }, p5/Z, [x14]\n"
+ "ble 65f\n"
+ "ld1b { z6.b }, p5/Z, [x9]\n"
"sdot z8.s, z6.b, z0.b[3]\n"
- "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
"sdot z12.s, z6.b, z1.b[3]\n"
"sdot z16.s, z6.b, z2.b[3]\n"
"sdot z20.s, z6.b, z3.b[3]\n"
"sdot z24.s, z6.b, z4.b[3]\n"
"sdot z28.s, z6.b, z5.b[3]\n"
- "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
"sdot z9.s, z7.b, z0.b[3]\n"
"sdot z13.s, z7.b, z1.b[3]\n"
"sdot z17.s, z7.b, z2.b[3]\n"
"sdot z21.s, z7.b, z3.b[3]\n"
"sdot z25.s, z7.b, z4.b[3]\n"
"sdot z29.s, z7.b, z5.b[3]\n"
- "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
+ "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
"sdot z10.s, z6.b, z0.b[3]\n"
"sdot z14.s, z6.b, z1.b[3]\n"
"sdot z18.s, z6.b, z2.b[3]\n"
@@ -1809,67 +1753,68 @@ void sve_hybrid_s8s32_dot_6x4VL (
"sdot z23.s, z7.b, z3.b[3]\n"
"sdot z27.s, z7.b, z4.b[3]\n"
"sdot z31.s, z7.b, z5.b[3]\n"
- "71:" // Height 6: Multiply loop: multiply skip
- "prfm pldl1keep, [x10, #0x80]\n"
- "add x12, x12, #0x1\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "65:" // Height 6: Multiply loop: multiply skip
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x27, x27, #0x1\n"
"prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
"prfm pldl1keep, [x22, #0x80]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
"prfm pldl1keep, [x20, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x12, x19\n"
- "bne 66b\n"
- "st1w { z8.s }, p4, [x13]\n"
- "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
- "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
- "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
- "addvl x13, x13, #4\n"
- "st1w { z12.s }, p4, [x9]\n"
- "st1w { z13.s }, p3, [x9, #1, MUL VL]\n"
- "st1w { z14.s }, p2, [x9, #2, MUL VL]\n"
- "st1w { z15.s }, p1, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
- "st1w { z16.s }, p4, [x27]\n"
- "st1w { z17.s }, p3, [x27, #1, MUL VL]\n"
- "st1w { z18.s }, p2, [x27, #2, MUL VL]\n"
- "st1w { z19.s }, p1, [x27, #3, MUL VL]\n"
- "addvl x27, x27, #4\n"
- "st1w { z20.s }, p4, [x25]\n"
- "st1w { z21.s }, p3, [x25, #1, MUL VL]\n"
- "st1w { z22.s }, p2, [x25, #2, MUL VL]\n"
- "st1w { z23.s }, p1, [x25, #3, MUL VL]\n"
- "addvl x25, x25, #4\n"
- "st1w { z24.s }, p4, [x23]\n"
- "st1w { z25.s }, p3, [x23, #1, MUL VL]\n"
- "st1w { z26.s }, p2, [x23, #2, MUL VL]\n"
- "st1w { z27.s }, p1, [x23, #3, MUL VL]\n"
- "addvl x23, x23, #4\n"
- "st1w { z28.s }, p4, [x21]\n"
- "st1w { z29.s }, p3, [x21, #1, MUL VL]\n"
- "st1w { z30.s }, p2, [x21, #2, MUL VL]\n"
- "st1w { z31.s }, p1, [x21, #3, MUL VL]\n"
- "addvl x21, x21, #4\n"
- "72:" // Height 6: Writeback done
- "decw x15, ALL, MUL #4\n"
- "cmp x15, XZR\n"
- "bgt 63b\n"
+ "cmp x27, x19\n"
+ "bne 60b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "st1w { z8.s }, p4, [x28]\n"
+ "add x23, x28, x19, LSL #2\n"
+ "st1w { z9.s }, p3, [x28, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x28, #2, MUL VL]\n"
+ "add x22, x23, x19, LSL #2\n"
+ "st1w { z11.s }, p1, [x28, #3, MUL VL]\n"
+ "add x21, x22, x19, LSL #2\n"
+ "st1w { z12.s }, p4, [x23]\n"
+ "add x20, x21, x19, LSL #2\n"
+ "st1w { z13.s }, p3, [x23, #1, MUL VL]\n"
+ "add x19, x20, x19, LSL #2\n"
+ "st1w { z14.s }, p2, [x23, #2, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "st1w { z15.s }, p1, [x23, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x22]\n"
+ "st1w { z17.s }, p3, [x22, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x22, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x22, #3, MUL VL]\n"
+ "st1w { z20.s }, p4, [x21]\n"
+ "st1w { z21.s }, p3, [x21, #1, MUL VL]\n"
+ "st1w { z22.s }, p2, [x21, #2, MUL VL]\n"
+ "st1w { z23.s }, p1, [x21, #3, MUL VL]\n"
+ "st1w { z24.s }, p4, [x20]\n"
+ "st1w { z25.s }, p3, [x20, #1, MUL VL]\n"
+ "st1w { z26.s }, p2, [x20, #2, MUL VL]\n"
+ "st1w { z27.s }, p1, [x20, #3, MUL VL]\n"
+ "st1w { z28.s }, p4, [x19]\n"
+ "st1w { z29.s }, p3, [x19, #1, MUL VL]\n"
+ "st1w { z30.s }, p2, [x19, #2, MUL VL]\n"
+ "st1w { z31.s }, p1, [x19, #3, MUL VL]\n"
+ "66:" // Height 6: Writeback done
+ "decw x10, ALL, MUL #4\n"
+ "cmp x10, XZR\n"
+ "bgt 57b\n"
"subs %x[M], %x[M], #0x6\n"
- "beq 74f\n"
+ "beq 68f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "tbz %x[flags], #3, 73f\n"
+ "tbz %x[flags], #3, 67f\n"
"add x20, x20, #0x6\n"
"str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
"b 1b\n"
- "73:" // Update direct input
+ "67:" // Update direct input
"mov x19, #0x6\n"
"madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
"b 1b\n"
- "74:" // Exit
+ "68:" // Exit
: [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL.hpp
index bacf2351ac..cfb8adfc87 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL.hpp
@@ -36,7 +36,6 @@
namespace arm_gemm
{
-
// Actual kernel implementations
void sve_hybrid_u8qa_dot_4x4VL( ARGLIST );
@@ -73,7 +72,6 @@ public:
// Default to the generic kernel
kern_type kernel=sve_hybrid_u8qa_dot_4x4VL;
-
cls_sve_hybrid_u8qa_dot_4x4VL(const CPUInfo *)
{
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL/generic.cpp
index 52210dca27..373d82930b 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -81,152 +81,143 @@ void sve_hybrid_u8qa_dot_4x4VL (
"ptrue p2.b\n"
"1:" // Row loop
"cmp %x[M], #0x4\n"
- "bge 46f\n"
+ "bge 43f\n"
"cmp %x[M], #0x2\n"
- "bgt 31f\n"
- "beq 16f\n"
+ "bgt 29f\n"
+ "beq 15f\n"
"mov z11.s, #0x0\n"
- "ldr x12, [%x[args_ptr], %[offsetof_N]]\n"
- "mov z12.s, #0x0\n"
- "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "mov x10, %x[col_bias]\n"
- "mov z13.s, #0x0\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "bic %x[flags], %x[flags], #0x80000000\n"
- "mov z14.s, #0x0\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
"mov z15.b, #0x1\n"
- "tbz %x[flags], #2, 2f\n"
- "ldr x9, [%x[output_ptr], #0x0]\n"
- "add x9, x9, x19\n"
- "b 3f\n"
- "2:" // Height 1: setup direct output
- "mov x9, %x[output_ptr]\n"
- "3:" // Height 1: Column loop
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x27, %x[col_bias]\n"
+ "bic %x[flags], %x[flags], #0x80000000\n"
+ "mov x26, %x[output_ptr]\n"
+ "2:" // Height 1: Column loop
"mov z16.s, #0x0\n"
"mov x19, #0x0\n"
"mov z17.s, #0x0\n"
- "whilelt p1.b, x19, x12\n"
+ "whilelt p1.b, x19, x9\n"
"mov z18.s, #0x0\n"
"mov z19.s, #0x0\n"
- "4:" // Height 1: setup done
- "mov x28, #0x0\n"
- "5:" // Height 1: String loop
+ "3:" // Height 1: setup done
+ "mov x25, #0x0\n"
+ "4:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w27, [x20, x28, LSL #0x2]\n"
- "tbz %x[flags], #3, 6f\n"
- "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "ldr w24, [x20, x25, LSL #0x2]\n"
+ "tbz %x[flags], #3, 5f\n"
+ "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x26, [x20, #0x0]\n"
- "cbnz x28, 7f\n"
+ "ldr x23, [x20, #0x0]\n"
+ "cbnz x25, 6f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x26, x26, x19\n"
- "b 7f\n"
- "6:" // Height 1: setup direct input
- "mov x26, %x[input_ptr]\n"
- "7:" // Height 1: input setup done
- "cmp x27, #0x10\n"
- "ble 10f\n"
- "8:" // Height 1: Multiply loop: Main loop head
- "ld1b { z4.b }, p2/Z, [x11]\n"
- "whilelt p0.b, XZR, x27\n"
- "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n"
- "ld1rqb { z0.b }, p0/Z, [x26]\n"
+ "add x23, x23, x19\n"
+ "b 6f\n"
+ "5:" // Height 1: setup direct input
+ "mov x23, %x[input_ptr]\n"
+ "6:" // Height 1: input setup done
+ "cmp x24, #0x10\n"
+ "ble 9f\n"
+ "7:" // Height 1: Multiply loop: Main loop head
+ "ld1b { z4.b }, p2/Z, [x28]\n"
+ "whilelt p0.b, XZR, x24\n"
+ "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1rqb { z0.b }, p0/Z, [x23]\n"
"udot z16.s, z4.b, z0.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n"
- "add x26, x26, #0x10\n"
+ "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "add x23, x23, #0x10\n"
"udot z17.s, z5.b, z0.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n"
- "ld1b { z8.b }, p2/Z, [x11, #4, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "ld1b { z8.b }, p2/Z, [x28, #4, MUL VL]\n"
"udot z18.s, z6.b, z0.b[0]\n"
- "ld1b { z9.b }, p2/Z, [x11, #5, MUL VL]\n"
- "ld1b { z10.b }, p2/Z, [x11, #6, MUL VL]\n"
+ "ld1b { z9.b }, p2/Z, [x28, #5, MUL VL]\n"
+ "ld1b { z10.b }, p2/Z, [x28, #6, MUL VL]\n"
"udot z19.s, z7.b, z0.b[0]\n"
"udot z16.s, z8.b, z0.b[1]\n"
- "ld1b { z4.b }, p2/Z, [x11, #7, MUL VL]\n"
- "addvl x11, x11, #16\n"
+ "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n"
+ "addvl x28, x28, #16\n"
"udot z17.s, z9.b, z0.b[1]\n"
- "ld1b { z5.b }, p2/Z, [x11, #-8, MUL VL]\n"
- "ld1b { z6.b }, p2/Z, [x11, #-7, MUL VL]\n"
+ "ld1b { z5.b }, p2/Z, [x28, #-8, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #-7, MUL VL]\n"
"udot z18.s, z10.b, z0.b[1]\n"
- "ld1b { z7.b }, p2/Z, [x11, #-6, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #-6, MUL VL]\n"
"udot z19.s, z4.b, z0.b[1]\n"
- "ld1b { z8.b }, p2/Z, [x11, #-5, MUL VL]\n"
+ "ld1b { z8.b }, p2/Z, [x28, #-5, MUL VL]\n"
"udot z16.s, z5.b, z0.b[2]\n"
- "ld1b { z9.b }, p2/Z, [x11, #-4, MUL VL]\n"
+ "ld1b { z9.b }, p2/Z, [x28, #-4, MUL VL]\n"
"udot z17.s, z6.b, z0.b[2]\n"
- "ld1b { z10.b }, p2/Z, [x11, #-3, MUL VL]\n"
+ "ld1b { z10.b }, p2/Z, [x28, #-3, MUL VL]\n"
"udot z18.s, z7.b, z0.b[2]\n"
- "ld1b { z4.b }, p2/Z, [x11, #-2, MUL VL]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #-2, MUL VL]\n"
"udot z19.s, z8.b, z0.b[2]\n"
- "ld1b { z5.b }, p2/Z, [x11, #-1, MUL VL]\n"
+ "ld1b { z5.b }, p2/Z, [x28, #-1, MUL VL]\n"
"udot z16.s, z9.b, z0.b[3]\n"
"udot z17.s, z10.b, z0.b[3]\n"
"udot z18.s, z4.b, z0.b[3]\n"
"udot z19.s, z5.b, z0.b[3]\n"
- "tbnz %x[flags], #31, 9f\n"
+ "tbnz %x[flags], #31, 8f\n"
"udot z11.s, z0.b, z15.b\n"
- "9:" // Height 1: Multiply loop: unique 1: skip row sum
- "prfm pldl1keep, [x26, #0x80]\n"
- "sub x27, x27, #0x10\n"
- "cmp x27, #0x10\n"
- "bgt 8b\n"
- "10:" // Height 1: Multiply loop: Single iteration only
- "ld1b { z6.b }, p2/Z, [x11]\n"
- "whilelt p0.b, XZR, x27\n"
- "ld1b { z7.b }, p2/Z, [x11, #1, MUL VL]\n"
- "subs x27, x27, #0x4\n"
- "ld1rqb { z0.b }, p0/Z, [x26]\n"
- "udot z16.s, z6.b, z0.b[0]\n"
- "ld1b { z8.b }, p2/Z, [x11, #2, MUL VL]\n"
- "add x26, x26, #0x10\n"
- "udot z17.s, z7.b, z0.b[0]\n"
- "ld1b { z9.b }, p2/Z, [x11, #3, MUL VL]\n"
- "addvl x11, x11, #4\n"
- "udot z18.s, z8.b, z0.b[0]\n"
- "udot z19.s, z9.b, z0.b[0]\n"
- "ble 11f\n"
- "ld1b { z10.b }, p2/Z, [x11]\n"
- "udot z16.s, z10.b, z0.b[1]\n"
- "ld1b { z4.b }, p2/Z, [x11, #1, MUL VL]\n"
- "subs x27, x27, #0x4\n"
- "udot z17.s, z4.b, z0.b[1]\n"
- "ld1b { z5.b }, p2/Z, [x11, #2, MUL VL]\n"
- "ld1b { z6.b }, p2/Z, [x11, #3, MUL VL]\n"
- "udot z18.s, z5.b, z0.b[1]\n"
- "addvl x11, x11, #4\n"
- "udot z19.s, z6.b, z0.b[1]\n"
- "ble 11f\n"
- "ld1b { z7.b }, p2/Z, [x11]\n"
- "udot z16.s, z7.b, z0.b[2]\n"
- "ld1b { z8.b }, p2/Z, [x11, #1, MUL VL]\n"
- "subs x27, x27, #0x4\n"
- "udot z17.s, z8.b, z0.b[2]\n"
- "ld1b { z9.b }, p2/Z, [x11, #2, MUL VL]\n"
- "ld1b { z10.b }, p2/Z, [x11, #3, MUL VL]\n"
- "udot z18.s, z9.b, z0.b[2]\n"
- "addvl x11, x11, #4\n"
- "udot z19.s, z10.b, z0.b[2]\n"
- "ble 11f\n"
- "ld1b { z4.b }, p2/Z, [x11]\n"
- "udot z16.s, z4.b, z0.b[3]\n"
- "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n"
- "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n"
- "udot z17.s, z5.b, z0.b[3]\n"
- "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n"
- "addvl x11, x11, #4\n"
- "udot z18.s, z6.b, z0.b[3]\n"
- "udot z19.s, z7.b, z0.b[3]\n"
- "11:" // Height 1: Multiply loop: multiply skip
- "tbnz %x[flags], #31, 12f\n"
+ "8:" // Height 1: Multiply loop: unique 1: skip row sum
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "sub x24, x24, #0x10\n"
+ "cmp x24, #0x10\n"
+ "bgt 7b\n"
+ "9:" // Height 1: Multiply loop: Single iteration only
+ "ld1b { z4.b }, p2/Z, [x28]\n"
+ "whilelt p0.b, XZR, x24\n"
+ "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "subs x24, x24, #0x4\n"
+ "ld1rqb { z0.b }, p0/Z, [x23]\n"
+ "udot z16.s, z4.b, z0.b[0]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "add x23, x23, #0x10\n"
+ "udot z17.s, z5.b, z0.b[0]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "udot z18.s, z6.b, z0.b[0]\n"
+ "udot z19.s, z7.b, z0.b[0]\n"
+ "ble 10f\n"
+ "ld1b { z8.b }, p2/Z, [x28]\n"
+ "udot z16.s, z8.b, z0.b[1]\n"
+ "ld1b { z9.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "subs x24, x24, #0x4\n"
+ "udot z17.s, z9.b, z0.b[1]\n"
+ "ld1b { z10.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "udot z18.s, z10.b, z0.b[1]\n"
+ "addvl x28, x28, #4\n"
+ "udot z19.s, z4.b, z0.b[1]\n"
+ "ble 10f\n"
+ "ld1b { z5.b }, p2/Z, [x28]\n"
+ "udot z16.s, z5.b, z0.b[2]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "subs x24, x24, #0x4\n"
+ "udot z17.s, z6.b, z0.b[2]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "udot z18.s, z7.b, z0.b[2]\n"
+ "addvl x28, x28, #4\n"
+ "udot z19.s, z8.b, z0.b[2]\n"
+ "ble 10f\n"
+ "ld1b { z9.b }, p2/Z, [x28]\n"
+ "udot z16.s, z9.b, z0.b[3]\n"
+ "ld1b { z10.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "udot z17.s, z10.b, z0.b[3]\n"
+ "ld1b { z5.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "udot z18.s, z4.b, z0.b[3]\n"
+ "udot z19.s, z5.b, z0.b[3]\n"
+ "10:" // Height 1: Multiply loop: multiply skip
+ "tbnz %x[flags], #31, 11f\n"
"udot z11.s, z0.b, z15.b\n"
- "12:" // Height 1: Multiply loop: unique 2: skip row sum
- "prfm pldl1keep, [x26, #0x80]\n"
- "add x28, x28, #0x1\n"
+ "11:" // Height 1: Multiply loop: unique 2: skip row sum
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "add x25, x25, #0x1\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x28, x19\n"
- "bne 5b\n"
- "tbnz %x[flags], #31, 13f\n"
+ "cmp x25, x19\n"
+ "bne 4b\n"
+ "tbnz %x[flags], #31, 12f\n"
"add x19, %x[qp], %[b_offset]\n"
"ld1rw { z1.s }, p2/Z, [x19]\n"
"neg z1.s, p2/M, z1.s\n"
@@ -235,21 +226,21 @@ void sve_hybrid_u8qa_dot_4x4VL (
"uaddv d11, p0, z11.s\n"
"mov z11.s, z11.s[0]\n"
"mul z11.s, p2/M, z11.s, z1.s\n"
- "13:" // Height 1: skip row sum fixup
+ "12:" // Height 1: skip row sum fixup
"add z16.s, z16.s, z11.s\n"
- "ld1w { z0.s }, p2/Z, [x10]\n"
+ "ld1w { z0.s }, p2/Z, [x27]\n"
"orr %x[flags], %x[flags], #0x80000000\n"
"add z17.s, z17.s, z11.s\n"
- "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n"
- "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1w { z1.s }, p2/Z, [x27, #1, MUL VL]\n"
+ "add x23, %x[qp], %[per_layer_right_shift]\n"
"add z18.s, z18.s, z11.s\n"
- "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z2.s }, p2/Z, [x27, #2, MUL VL]\n"
"add x19, %x[qp], %[per_layer_mul]\n"
"add z19.s, z19.s, z11.s\n"
- "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n"
- "addvl x10, x10, #4\n"
+ "ld1w { z3.s }, p2/Z, [x27, #3, MUL VL]\n"
+ "addvl x27, x27, #4\n"
"add z16.s, z16.s, z0.s\n"
- "ld1rw { z0.s }, p2/Z, [x20]\n"
+ "ld1rw { z0.s }, p2/Z, [x23]\n"
"add z17.s, z17.s, z1.s\n"
"ld1rw { z4.s }, p2/Z, [x19]\n"
"add z18.s, z18.s, z2.s\n"
@@ -258,7 +249,7 @@ void sve_hybrid_u8qa_dot_4x4VL (
".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n"
".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n"
".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n"
- "tbz %x[flags], #5, 14f\n"
+ "tbz %x[flags], #5, 13f\n"
"and z4.d, z16.d, z0.d\n"
"asr z4.s, z4.s, #0x1f\n"
"and z5.d, z17.d, z0.d\n"
@@ -271,7 +262,7 @@ void sve_hybrid_u8qa_dot_4x4VL (
"sqadd z17.s, z17.s, z5.s\n"
"sqadd z18.s, z18.s, z6.s\n"
"sqadd z19.s, z19.s, z7.s\n"
- "14:" // Height 1: no shift correction
+ "13:" // Height 1: no shift correction
".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
"add x19, %x[qp], %[c_offset]\n"
"ld1rw { z4.s }, p2/Z, [x19]\n"
@@ -297,107 +288,96 @@ void sve_hybrid_u8qa_dot_4x4VL (
"uzp1 z16.h, z16.h, z17.h\n"
"uzp1 z17.h, z18.h, z19.h\n"
"uzp1 z16.b, z16.b, z17.b\n"
- "st1b { z16.b }, p1, [x9]\n"
- "addvl x9, x9, #1\n"
- "15:" // Height 1: Writeback done
- "decw x12, ALL, MUL #4\n"
- "cmp x12, XZR\n"
- "bgt 3b\n"
- "b 62f\n"
- "16:" // Height 2
+ "st1b { z16.b }, p1, [x26]\n"
+ "addvl x26, x26, #1\n"
+ "14:" // Height 1: Writeback done
+ "decw x9, ALL, MUL #4\n"
+ "cmp x9, XZR\n"
+ "bgt 2b\n"
+ "b 58f\n"
+ "15:" // Height 2
"mov z11.s, #0x0\n"
- "ldr x12, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x10, %x[col_bias]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x27, %x[col_bias]\n"
"mov z12.s, #0x0\n"
- "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"bic %x[flags], %x[flags], #0x80000000\n"
- "mov z13.s, #0x0\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "mov z14.s, #0x0\n"
"mov z15.b, #0x1\n"
- "tbz %x[flags], #2, 17f\n"
- "ldr x9, [%x[output_ptr], #0x0]\n"
- "ldr x25, [%x[output_ptr], #0x8]\n"
- "add x9, x9, x19\n"
- "add x25, x25, x19\n"
- "b 18f\n"
- "17:" // Height 2: setup direct output
- "mov x9, %x[output_ptr]\n"
- "add x25, x9, x19\n"
- "18:" // Height 2: Column loop
+ "mov x26, %x[output_ptr]\n"
+ "16:" // Height 2: Column loop
"mov z16.s, #0x0\n"
"mov x19, #0x0\n"
"mov z17.s, #0x0\n"
- "whilelt p1.b, x19, x12\n"
+ "whilelt p1.b, x19, x9\n"
"mov z18.s, #0x0\n"
"mov z19.s, #0x0\n"
"mov z20.s, #0x0\n"
"mov z21.s, #0x0\n"
"mov z22.s, #0x0\n"
"mov z23.s, #0x0\n"
- "19:" // Height 2: setup done
- "mov x28, #0x0\n"
- "20:" // Height 2: String loop
+ "17:" // Height 2: setup done
+ "mov x25, #0x0\n"
+ "18:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w27, [x20, x28, LSL #0x2]\n"
- "tbz %x[flags], #3, 21f\n"
- "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "ldr w24, [x20, x25, LSL #0x2]\n"
+ "tbz %x[flags], #3, 19f\n"
+ "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x26, [x20, #0x0]\n"
- "ldr x24, [x20, #0x8]\n"
- "cbnz x28, 22f\n"
+ "ldr x23, [x20, #0x0]\n"
+ "ldr x22, [x20, #0x8]\n"
+ "cbnz x25, 20f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x26, x26, x19\n"
- "add x24, x24, x19\n"
- "b 22f\n"
- "21:" // Height 2: setup direct input
- "mov x26, %x[input_ptr]\n"
- "add x24, x26, x19\n"
- "22:" // Height 2: input setup done
- "cmp x27, #0x10\n"
- "ble 25f\n"
- "23:" // Height 2: Multiply loop: Main loop head
- "ld1b { z4.b }, p2/Z, [x11]\n"
- "whilelt p0.b, XZR, x27\n"
- "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n"
- "ld1rqb { z0.b }, p0/Z, [x26]\n"
+ "add x23, x23, x19\n"
+ "add x22, x22, x19\n"
+ "b 20f\n"
+ "19:" // Height 2: setup direct input
+ "mov x23, %x[input_ptr]\n"
+ "add x22, x23, x19\n"
+ "20:" // Height 2: input setup done
+ "cmp x24, #0x10\n"
+ "ble 23f\n"
+ "21:" // Height 2: Multiply loop: Main loop head
+ "ld1b { z4.b }, p2/Z, [x28]\n"
+ "whilelt p0.b, XZR, x24\n"
+ "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1rqb { z0.b }, p0/Z, [x23]\n"
"udot z16.s, z4.b, z0.b[0]\n"
- "ld1rqb { z1.b }, p0/Z, [x24]\n"
- "add x26, x26, #0x10\n"
+ "ld1rqb { z1.b }, p0/Z, [x22]\n"
+ "add x23, x23, #0x10\n"
"udot z17.s, z5.b, z0.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n"
- "add x24, x24, #0x10\n"
+ "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "add x22, x22, #0x10\n"
"udot z20.s, z4.b, z1.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
"udot z21.s, z5.b, z1.b[0]\n"
- "ld1b { z8.b }, p2/Z, [x11, #4, MUL VL]\n"
+ "ld1b { z8.b }, p2/Z, [x28, #4, MUL VL]\n"
"udot z18.s, z6.b, z0.b[0]\n"
- "ld1b { z9.b }, p2/Z, [x11, #5, MUL VL]\n"
+ "ld1b { z9.b }, p2/Z, [x28, #5, MUL VL]\n"
"udot z22.s, z6.b, z1.b[0]\n"
- "ld1b { z10.b }, p2/Z, [x11, #6, MUL VL]\n"
+ "ld1b { z10.b }, p2/Z, [x28, #6, MUL VL]\n"
"udot z19.s, z7.b, z0.b[0]\n"
- "ld1b { z4.b }, p2/Z, [x11, #7, MUL VL]\n"
- "addvl x11, x11, #16\n"
+ "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n"
+ "addvl x28, x28, #16\n"
"udot z23.s, z7.b, z1.b[0]\n"
- "ld1b { z5.b }, p2/Z, [x11, #-8, MUL VL]\n"
+ "ld1b { z5.b }, p2/Z, [x28, #-8, MUL VL]\n"
"udot z16.s, z8.b, z0.b[1]\n"
- "ld1b { z6.b }, p2/Z, [x11, #-7, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #-7, MUL VL]\n"
"udot z20.s, z8.b, z1.b[1]\n"
- "ld1b { z7.b }, p2/Z, [x11, #-6, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #-6, MUL VL]\n"
"udot z17.s, z9.b, z0.b[1]\n"
- "ld1b { z8.b }, p2/Z, [x11, #-5, MUL VL]\n"
+ "ld1b { z8.b }, p2/Z, [x28, #-5, MUL VL]\n"
"udot z21.s, z9.b, z1.b[1]\n"
- "ld1b { z9.b }, p2/Z, [x11, #-4, MUL VL]\n"
+ "ld1b { z9.b }, p2/Z, [x28, #-4, MUL VL]\n"
"udot z18.s, z10.b, z0.b[1]\n"
"udot z22.s, z10.b, z1.b[1]\n"
- "ld1b { z10.b }, p2/Z, [x11, #-3, MUL VL]\n"
+ "ld1b { z10.b }, p2/Z, [x28, #-3, MUL VL]\n"
"udot z19.s, z4.b, z0.b[1]\n"
"udot z23.s, z4.b, z1.b[1]\n"
- "ld1b { z4.b }, p2/Z, [x11, #-2, MUL VL]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #-2, MUL VL]\n"
"udot z16.s, z5.b, z0.b[2]\n"
"udot z20.s, z5.b, z1.b[2]\n"
- "ld1b { z5.b }, p2/Z, [x11, #-1, MUL VL]\n"
+ "ld1b { z5.b }, p2/Z, [x28, #-1, MUL VL]\n"
"udot z17.s, z6.b, z0.b[2]\n"
"udot z21.s, z6.b, z1.b[2]\n"
"udot z18.s, z7.b, z0.b[2]\n"
@@ -412,91 +392,93 @@ void sve_hybrid_u8qa_dot_4x4VL (
"udot z22.s, z4.b, z1.b[3]\n"
"udot z19.s, z5.b, z0.b[3]\n"
"udot z23.s, z5.b, z1.b[3]\n"
- "tbnz %x[flags], #31, 24f\n"
+ "tbnz %x[flags], #31, 22f\n"
"udot z11.s, z0.b, z15.b\n"
"udot z12.s, z1.b, z15.b\n"
- "24:" // Height 2: Multiply loop: unique 3: skip row sum
- "prfm pldl1keep, [x26, #0x80]\n"
- "sub x27, x27, #0x10\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "cmp x27, #0x10\n"
- "bgt 23b\n"
- "25:" // Height 2: Multiply loop: Single iteration only
- "ld1b { z6.b }, p2/Z, [x11]\n"
- "whilelt p0.b, XZR, x27\n"
- "ld1b { z7.b }, p2/Z, [x11, #1, MUL VL]\n"
- "subs x27, x27, #0x4\n"
- "ld1rqb { z0.b }, p0/Z, [x26]\n"
- "udot z16.s, z6.b, z0.b[0]\n"
- "ld1rqb { z1.b }, p0/Z, [x24]\n"
- "add x26, x26, #0x10\n"
- "udot z17.s, z7.b, z0.b[0]\n"
- "ld1b { z8.b }, p2/Z, [x11, #2, MUL VL]\n"
- "add x24, x24, #0x10\n"
- "udot z20.s, z6.b, z1.b[0]\n"
- "ld1b { z9.b }, p2/Z, [x11, #3, MUL VL]\n"
- "addvl x11, x11, #4\n"
- "udot z21.s, z7.b, z1.b[0]\n"
- "udot z18.s, z8.b, z0.b[0]\n"
- "udot z22.s, z8.b, z1.b[0]\n"
- "udot z19.s, z9.b, z0.b[0]\n"
- "udot z23.s, z9.b, z1.b[0]\n"
- "ble 26f\n"
- "ld1b { z10.b }, p2/Z, [x11]\n"
- "udot z16.s, z10.b, z0.b[1]\n"
- "ld1b { z4.b }, p2/Z, [x11, #1, MUL VL]\n"
- "subs x27, x27, #0x4\n"
- "udot z20.s, z10.b, z1.b[1]\n"
- "ld1b { z5.b }, p2/Z, [x11, #2, MUL VL]\n"
- "ld1b { z6.b }, p2/Z, [x11, #3, MUL VL]\n"
- "udot z17.s, z4.b, z0.b[1]\n"
- "addvl x11, x11, #4\n"
- "udot z21.s, z4.b, z1.b[1]\n"
- "udot z18.s, z5.b, z0.b[1]\n"
- "udot z22.s, z5.b, z1.b[1]\n"
- "udot z19.s, z6.b, z0.b[1]\n"
- "udot z23.s, z6.b, z1.b[1]\n"
- "ble 26f\n"
- "ld1b { z7.b }, p2/Z, [x11]\n"
- "udot z16.s, z7.b, z0.b[2]\n"
- "ld1b { z8.b }, p2/Z, [x11, #1, MUL VL]\n"
- "subs x27, x27, #0x4\n"
- "udot z20.s, z7.b, z1.b[2]\n"
- "ld1b { z9.b }, p2/Z, [x11, #2, MUL VL]\n"
- "ld1b { z10.b }, p2/Z, [x11, #3, MUL VL]\n"
- "udot z17.s, z8.b, z0.b[2]\n"
- "addvl x11, x11, #4\n"
- "udot z21.s, z8.b, z1.b[2]\n"
- "udot z18.s, z9.b, z0.b[2]\n"
- "udot z22.s, z9.b, z1.b[2]\n"
- "udot z19.s, z10.b, z0.b[2]\n"
- "udot z23.s, z10.b, z1.b[2]\n"
- "ble 26f\n"
- "ld1b { z4.b }, p2/Z, [x11]\n"
- "udot z16.s, z4.b, z0.b[3]\n"
- "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n"
- "udot z20.s, z4.b, z1.b[3]\n"
- "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n"
- "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n"
- "udot z17.s, z5.b, z0.b[3]\n"
- "addvl x11, x11, #4\n"
- "udot z21.s, z5.b, z1.b[3]\n"
- "udot z18.s, z6.b, z0.b[3]\n"
- "udot z22.s, z6.b, z1.b[3]\n"
- "udot z19.s, z7.b, z0.b[3]\n"
- "udot z23.s, z7.b, z1.b[3]\n"
- "26:" // Height 2: Multiply loop: multiply skip
- "tbnz %x[flags], #31, 27f\n"
+ "22:" // Height 2: Multiply loop: unique 3: skip row sum
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "sub x24, x24, #0x10\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "cmp x24, #0x10\n"
+ "bgt 21b\n"
+ "23:" // Height 2: Multiply loop: Single iteration only
+ "ld1b { z4.b }, p2/Z, [x28]\n"
+ "whilelt p0.b, XZR, x24\n"
+ "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "subs x24, x24, #0x4\n"
+ "ld1rqb { z0.b }, p0/Z, [x23]\n"
+ "udot z16.s, z4.b, z0.b[0]\n"
+ "ld1rqb { z1.b }, p0/Z, [x22]\n"
+ "add x23, x23, #0x10\n"
+ "udot z17.s, z5.b, z0.b[0]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "add x22, x22, #0x10\n"
+ "udot z20.s, z4.b, z1.b[0]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "udot z21.s, z5.b, z1.b[0]\n"
+ "udot z18.s, z6.b, z0.b[0]\n"
+ "udot z22.s, z6.b, z1.b[0]\n"
+ "udot z19.s, z7.b, z0.b[0]\n"
+ "udot z23.s, z7.b, z1.b[0]\n"
+ "ble 24f\n"
+ "ld1b { z8.b }, p2/Z, [x28]\n"
+ "udot z16.s, z8.b, z0.b[1]\n"
+ "ld1b { z9.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "subs x24, x24, #0x4\n"
+ "udot z20.s, z8.b, z1.b[1]\n"
+ "ld1b { z10.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "udot z17.s, z9.b, z0.b[1]\n"
+ "addvl x28, x28, #4\n"
+ "udot z21.s, z9.b, z1.b[1]\n"
+ "udot z18.s, z10.b, z0.b[1]\n"
+ "udot z22.s, z10.b, z1.b[1]\n"
+ "udot z19.s, z4.b, z0.b[1]\n"
+ "udot z23.s, z4.b, z1.b[1]\n"
+ "ble 24f\n"
+ "ld1b { z5.b }, p2/Z, [x28]\n"
+ "udot z16.s, z5.b, z0.b[2]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "subs x24, x24, #0x4\n"
+ "udot z20.s, z5.b, z1.b[2]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "udot z17.s, z6.b, z0.b[2]\n"
+ "addvl x28, x28, #4\n"
+ "udot z21.s, z6.b, z1.b[2]\n"
+ "udot z18.s, z7.b, z0.b[2]\n"
+ "udot z22.s, z7.b, z1.b[2]\n"
+ "udot z19.s, z8.b, z0.b[2]\n"
+ "udot z23.s, z8.b, z1.b[2]\n"
+ "ble 24f\n"
+ "ld1b { z9.b }, p2/Z, [x28]\n"
+ "udot z16.s, z9.b, z0.b[3]\n"
+ "ld1b { z10.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "udot z20.s, z9.b, z1.b[3]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z5.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "udot z17.s, z10.b, z0.b[3]\n"
+ "addvl x28, x28, #4\n"
+ "udot z21.s, z10.b, z1.b[3]\n"
+ "udot z18.s, z4.b, z0.b[3]\n"
+ "udot z22.s, z4.b, z1.b[3]\n"
+ "udot z19.s, z5.b, z0.b[3]\n"
+ "udot z23.s, z5.b, z1.b[3]\n"
+ "24:" // Height 2: Multiply loop: multiply skip
+ "tbnz %x[flags], #31, 25f\n"
"udot z11.s, z0.b, z15.b\n"
"udot z12.s, z1.b, z15.b\n"
- "27:" // Height 2: Multiply loop: unique 4: skip row sum
- "prfm pldl1keep, [x26, #0x80]\n"
- "add x28, x28, #0x1\n"
- "prfm pldl1keep, [x24, #0x80]\n"
+ "25:" // Height 2: Multiply loop: unique 4: skip row sum
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "add x25, x25, #0x1\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x28, x19\n"
- "bne 20b\n"
- "tbnz %x[flags], #31, 28f\n"
+ "cmp x25, x19\n"
+ "bne 18b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x22, x26, x19\n"
+ "tbnz %x[flags], #31, 26f\n"
"add x19, %x[qp], %[b_offset]\n"
"ld1rw { z2.s }, p2/Z, [x19]\n"
"neg z2.s, p2/M, z2.s\n"
@@ -508,19 +490,19 @@ void sve_hybrid_u8qa_dot_4x4VL (
"mov z12.s, z12.s[0]\n"
"mul z11.s, p2/M, z11.s, z2.s\n"
"mul z12.s, p2/M, z12.s, z2.s\n"
- "28:" // Height 2: skip row sum fixup
+ "26:" // Height 2: skip row sum fixup
"add z16.s, z16.s, z11.s\n"
- "ld1w { z0.s }, p2/Z, [x10]\n"
+ "ld1w { z0.s }, p2/Z, [x27]\n"
"orr %x[flags], %x[flags], #0x80000000\n"
"add z17.s, z17.s, z11.s\n"
- "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n"
- "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1w { z1.s }, p2/Z, [x27, #1, MUL VL]\n"
+ "add x23, %x[qp], %[per_layer_right_shift]\n"
"add z18.s, z18.s, z11.s\n"
- "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z2.s }, p2/Z, [x27, #2, MUL VL]\n"
"add x19, %x[qp], %[per_layer_mul]\n"
"add z19.s, z19.s, z11.s\n"
- "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n"
- "addvl x10, x10, #4\n"
+ "ld1w { z3.s }, p2/Z, [x27, #3, MUL VL]\n"
+ "addvl x27, x27, #4\n"
"add z20.s, z20.s, z12.s\n"
"ld1rw { z4.s }, p2/Z, [x19]\n"
"add z21.s, z21.s, z12.s\n"
@@ -531,7 +513,7 @@ void sve_hybrid_u8qa_dot_4x4VL (
"add z18.s, z18.s, z2.s\n"
"add z19.s, z19.s, z3.s\n"
"add z20.s, z20.s, z0.s\n"
- "ld1rw { z0.s }, p2/Z, [x20]\n"
+ "ld1rw { z0.s }, p2/Z, [x23]\n"
"add z21.s, z21.s, z1.s\n"
"add z22.s, z22.s, z2.s\n"
"add z23.s, z23.s, z3.s\n"
@@ -543,7 +525,7 @@ void sve_hybrid_u8qa_dot_4x4VL (
".inst 0x04a476b5 // sqrdmulh z21.s, z21.s, z4.s\n"
".inst 0x04a476d6 // sqrdmulh z22.s, z22.s, z4.s\n"
".inst 0x04a476f7 // sqrdmulh z23.s, z23.s, z4.s\n"
- "tbz %x[flags], #5, 29f\n"
+ "tbz %x[flags], #5, 27f\n"
"and z4.d, z16.d, z0.d\n"
"asr z4.s, z4.s, #0x1f\n"
"and z5.d, z17.d, z0.d\n"
@@ -568,7 +550,7 @@ void sve_hybrid_u8qa_dot_4x4VL (
"sqadd z21.s, z21.s, z9.s\n"
"sqadd z22.s, z22.s, z10.s\n"
"sqadd z23.s, z23.s, z4.s\n"
- "29:" // Height 2: no shift correction
+ "27:" // Height 2: no shift correction
".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
"add x19, %x[qp], %[c_offset]\n"
"ld1rw { z4.s }, p2/Z, [x19]\n"
@@ -599,9 +581,9 @@ void sve_hybrid_u8qa_dot_4x4VL (
"uzp1 z17.h, z18.h, z19.h\n"
"smax z20.s, p2/M, z20.s, z5.s\n"
"uzp1 z16.b, z16.b, z17.b\n"
- "st1b { z16.b }, p1, [x9]\n"
+ "st1b { z16.b }, p1, [x26]\n"
"add z21.s, z21.s, z4.s\n"
- "addvl x9, x9, #1\n"
+ "addvl x26, x26, #1\n"
".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n"
".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n"
"smin z21.s, p2/M, z21.s, z6.s\n"
@@ -615,41 +597,27 @@ void sve_hybrid_u8qa_dot_4x4VL (
"smax z23.s, p2/M, z23.s, z5.s\n"
"uzp1 z21.h, z22.h, z23.h\n"
"uzp1 z20.b, z20.b, z21.b\n"
- "st1b { z20.b }, p1, [x25]\n"
- "addvl x25, x25, #1\n"
- "30:" // Height 2: Writeback done
- "decw x12, ALL, MUL #4\n"
- "cmp x12, XZR\n"
- "bgt 18b\n"
- "b 62f\n"
- "31:" // Height 3
+ "st1b { z20.b }, p1, [x22]\n"
+ "28:" // Height 2: Writeback done
+ "decw x9, ALL, MUL #4\n"
+ "cmp x9, XZR\n"
+ "bgt 16b\n"
+ "b 58f\n"
+ "29:" // Height 3
"mov z11.s, #0x0\n"
- "ldr x12, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x10, %x[col_bias]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x27, %x[col_bias]\n"
"mov z12.s, #0x0\n"
- "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"bic %x[flags], %x[flags], #0x80000000\n"
"mov z13.s, #0x0\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "mov z14.s, #0x0\n"
+ "mov x26, %x[output_ptr]\n"
"mov z15.b, #0x1\n"
- "tbz %x[flags], #2, 32f\n"
- "ldr x9, [%x[output_ptr], #0x0]\n"
- "ldr x25, [%x[output_ptr], #0x8]\n"
- "add x9, x9, x19\n"
- "ldr x23, [%x[output_ptr], #0x10]\n"
- "add x25, x25, x19\n"
- "add x23, x23, x19\n"
- "b 33f\n"
- "32:" // Height 3: setup direct output
- "mov x9, %x[output_ptr]\n"
- "add x25, x9, x19\n"
- "add x23, x25, x19\n"
- "33:" // Height 3: Column loop
+ "30:" // Height 3: Column loop
"mov z16.s, #0x0\n"
"mov x19, #0x0\n"
"mov z17.s, #0x0\n"
- "whilelt p1.b, x19, x12\n"
+ "whilelt p1.b, x19, x9\n"
"mov z18.s, #0x0\n"
"mov z19.s, #0x0\n"
"mov z20.s, #0x0\n"
@@ -660,83 +628,83 @@ void sve_hybrid_u8qa_dot_4x4VL (
"mov z25.s, #0x0\n"
"mov z26.s, #0x0\n"
"mov z27.s, #0x0\n"
- "34:" // Height 3: setup done
- "mov x28, #0x0\n"
- "35:" // Height 3: String loop
+ "31:" // Height 3: setup done
+ "mov x25, #0x0\n"
+ "32:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w27, [x20, x28, LSL #0x2]\n"
- "tbz %x[flags], #3, 36f\n"
- "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "ldr w24, [x20, x25, LSL #0x2]\n"
+ "tbz %x[flags], #3, 33f\n"
+ "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x26, [x20, #0x0]\n"
- "ldr x24, [x20, #0x8]\n"
- "ldr x22, [x20, #0x10]\n"
- "cbnz x28, 37f\n"
+ "ldr x23, [x20, #0x0]\n"
+ "ldr x22, [x20, #0x8]\n"
+ "ldr x21, [x20, #0x10]\n"
+ "cbnz x25, 34f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x26, x26, x19\n"
- "add x24, x24, x19\n"
+ "add x23, x23, x19\n"
"add x22, x22, x19\n"
- "b 37f\n"
- "36:" // Height 3: setup direct input
- "mov x26, %x[input_ptr]\n"
- "add x24, x26, x19\n"
- "add x22, x24, x19\n"
- "37:" // Height 3: input setup done
- "cmp x27, #0x10\n"
- "ble 40f\n"
- "38:" // Height 3: Multiply loop: Main loop head
- "ld1b { z4.b }, p2/Z, [x11]\n"
- "whilelt p0.b, XZR, x27\n"
- "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n"
- "ld1rqb { z0.b }, p0/Z, [x26]\n"
+ "add x21, x21, x19\n"
+ "b 34f\n"
+ "33:" // Height 3: setup direct input
+ "mov x23, %x[input_ptr]\n"
+ "add x22, x23, x19\n"
+ "add x21, x22, x19\n"
+ "34:" // Height 3: input setup done
+ "cmp x24, #0x10\n"
+ "ble 37f\n"
+ "35:" // Height 3: Multiply loop: Main loop head
+ "ld1b { z4.b }, p2/Z, [x28]\n"
+ "whilelt p0.b, XZR, x24\n"
+ "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1rqb { z0.b }, p0/Z, [x23]\n"
"udot z16.s, z4.b, z0.b[0]\n"
- "ld1rqb { z1.b }, p0/Z, [x24]\n"
- "add x26, x26, #0x10\n"
+ "ld1rqb { z1.b }, p0/Z, [x22]\n"
+ "add x23, x23, #0x10\n"
"udot z17.s, z5.b, z0.b[0]\n"
- "ld1rqb { z2.b }, p0/Z, [x22]\n"
- "add x24, x24, #0x10\n"
- "udot z20.s, z4.b, z1.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n"
+ "ld1rqb { z2.b }, p0/Z, [x21]\n"
"add x22, x22, #0x10\n"
+ "udot z20.s, z4.b, z1.b[0]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "add x21, x21, #0x10\n"
"udot z24.s, z4.b, z2.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
"udot z21.s, z5.b, z1.b[0]\n"
- "ld1b { z8.b }, p2/Z, [x11, #4, MUL VL]\n"
+ "ld1b { z8.b }, p2/Z, [x28, #4, MUL VL]\n"
"udot z25.s, z5.b, z2.b[0]\n"
- "ld1b { z9.b }, p2/Z, [x11, #5, MUL VL]\n"
+ "ld1b { z9.b }, p2/Z, [x28, #5, MUL VL]\n"
"udot z18.s, z6.b, z0.b[0]\n"
- "ld1b { z10.b }, p2/Z, [x11, #6, MUL VL]\n"
+ "ld1b { z10.b }, p2/Z, [x28, #6, MUL VL]\n"
"udot z22.s, z6.b, z1.b[0]\n"
- "ld1b { z4.b }, p2/Z, [x11, #7, MUL VL]\n"
- "addvl x11, x11, #16\n"
+ "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n"
+ "addvl x28, x28, #16\n"
"udot z26.s, z6.b, z2.b[0]\n"
- "ld1b { z5.b }, p2/Z, [x11, #-8, MUL VL]\n"
+ "ld1b { z5.b }, p2/Z, [x28, #-8, MUL VL]\n"
"udot z19.s, z7.b, z0.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x11, #-7, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #-7, MUL VL]\n"
"udot z23.s, z7.b, z1.b[0]\n"
"udot z27.s, z7.b, z2.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x11, #-6, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #-6, MUL VL]\n"
"udot z16.s, z8.b, z0.b[1]\n"
"udot z20.s, z8.b, z1.b[1]\n"
"udot z24.s, z8.b, z2.b[1]\n"
- "ld1b { z8.b }, p2/Z, [x11, #-5, MUL VL]\n"
+ "ld1b { z8.b }, p2/Z, [x28, #-5, MUL VL]\n"
"udot z17.s, z9.b, z0.b[1]\n"
"udot z21.s, z9.b, z1.b[1]\n"
"udot z25.s, z9.b, z2.b[1]\n"
- "ld1b { z9.b }, p2/Z, [x11, #-4, MUL VL]\n"
+ "ld1b { z9.b }, p2/Z, [x28, #-4, MUL VL]\n"
"udot z18.s, z10.b, z0.b[1]\n"
"udot z22.s, z10.b, z1.b[1]\n"
"udot z26.s, z10.b, z2.b[1]\n"
- "ld1b { z10.b }, p2/Z, [x11, #-3, MUL VL]\n"
+ "ld1b { z10.b }, p2/Z, [x28, #-3, MUL VL]\n"
"udot z19.s, z4.b, z0.b[1]\n"
"udot z23.s, z4.b, z1.b[1]\n"
"udot z27.s, z4.b, z2.b[1]\n"
- "ld1b { z4.b }, p2/Z, [x11, #-2, MUL VL]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #-2, MUL VL]\n"
"udot z16.s, z5.b, z0.b[2]\n"
"udot z20.s, z5.b, z1.b[2]\n"
"udot z24.s, z5.b, z2.b[2]\n"
- "ld1b { z5.b }, p2/Z, [x11, #-1, MUL VL]\n"
+ "ld1b { z5.b }, p2/Z, [x28, #-1, MUL VL]\n"
"udot z17.s, z6.b, z0.b[2]\n"
"udot z21.s, z6.b, z1.b[2]\n"
"udot z25.s, z6.b, z2.b[2]\n"
@@ -758,113 +726,116 @@ void sve_hybrid_u8qa_dot_4x4VL (
"udot z19.s, z5.b, z0.b[3]\n"
"udot z23.s, z5.b, z1.b[3]\n"
"udot z27.s, z5.b, z2.b[3]\n"
- "tbnz %x[flags], #31, 39f\n"
+ "tbnz %x[flags], #31, 36f\n"
"udot z11.s, z0.b, z15.b\n"
"udot z12.s, z1.b, z15.b\n"
"udot z13.s, z2.b, z15.b\n"
- "39:" // Height 3: Multiply loop: unique 5: skip row sum
- "prfm pldl1keep, [x26, #0x80]\n"
- "sub x27, x27, #0x10\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "cmp x27, #0x10\n"
+ "36:" // Height 3: Multiply loop: unique 5: skip row sum
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "sub x24, x24, #0x10\n"
"prfm pldl1keep, [x22, #0x80]\n"
- "bgt 38b\n"
- "40:" // Height 3: Multiply loop: Single iteration only
- "ld1b { z6.b }, p2/Z, [x11]\n"
- "whilelt p0.b, XZR, x27\n"
- "ld1b { z7.b }, p2/Z, [x11, #1, MUL VL]\n"
- "subs x27, x27, #0x4\n"
- "ld1rqb { z0.b }, p0/Z, [x26]\n"
- "udot z16.s, z6.b, z0.b[0]\n"
- "ld1rqb { z1.b }, p0/Z, [x24]\n"
- "add x26, x26, #0x10\n"
- "udot z17.s, z7.b, z0.b[0]\n"
- "ld1rqb { z2.b }, p0/Z, [x22]\n"
- "add x24, x24, #0x10\n"
- "udot z20.s, z6.b, z1.b[0]\n"
- "ld1b { z8.b }, p2/Z, [x11, #2, MUL VL]\n"
+ "cmp x24, #0x10\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
+ "bgt 35b\n"
+ "37:" // Height 3: Multiply loop: Single iteration only
+ "ld1b { z4.b }, p2/Z, [x28]\n"
+ "whilelt p0.b, XZR, x24\n"
+ "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "subs x24, x24, #0x4\n"
+ "ld1rqb { z0.b }, p0/Z, [x23]\n"
+ "udot z16.s, z4.b, z0.b[0]\n"
+ "ld1rqb { z1.b }, p0/Z, [x22]\n"
+ "add x23, x23, #0x10\n"
+ "udot z17.s, z5.b, z0.b[0]\n"
+ "ld1rqb { z2.b }, p0/Z, [x21]\n"
"add x22, x22, #0x10\n"
- "udot z24.s, z6.b, z2.b[0]\n"
- "ld1b { z9.b }, p2/Z, [x11, #3, MUL VL]\n"
- "addvl x11, x11, #4\n"
- "udot z21.s, z7.b, z1.b[0]\n"
- "udot z25.s, z7.b, z2.b[0]\n"
- "udot z18.s, z8.b, z0.b[0]\n"
- "udot z22.s, z8.b, z1.b[0]\n"
- "udot z26.s, z8.b, z2.b[0]\n"
- "udot z19.s, z9.b, z0.b[0]\n"
- "udot z23.s, z9.b, z1.b[0]\n"
- "udot z27.s, z9.b, z2.b[0]\n"
- "ble 41f\n"
- "ld1b { z10.b }, p2/Z, [x11]\n"
- "udot z16.s, z10.b, z0.b[1]\n"
- "ld1b { z4.b }, p2/Z, [x11, #1, MUL VL]\n"
- "subs x27, x27, #0x4\n"
- "udot z20.s, z10.b, z1.b[1]\n"
- "ld1b { z5.b }, p2/Z, [x11, #2, MUL VL]\n"
- "udot z24.s, z10.b, z2.b[1]\n"
- "ld1b { z6.b }, p2/Z, [x11, #3, MUL VL]\n"
- "addvl x11, x11, #4\n"
- "udot z17.s, z4.b, z0.b[1]\n"
- "udot z21.s, z4.b, z1.b[1]\n"
- "udot z25.s, z4.b, z2.b[1]\n"
- "udot z18.s, z5.b, z0.b[1]\n"
- "udot z22.s, z5.b, z1.b[1]\n"
- "udot z26.s, z5.b, z2.b[1]\n"
- "udot z19.s, z6.b, z0.b[1]\n"
- "udot z23.s, z6.b, z1.b[1]\n"
- "udot z27.s, z6.b, z2.b[1]\n"
- "ble 41f\n"
- "ld1b { z7.b }, p2/Z, [x11]\n"
- "udot z16.s, z7.b, z0.b[2]\n"
- "ld1b { z8.b }, p2/Z, [x11, #1, MUL VL]\n"
- "subs x27, x27, #0x4\n"
- "udot z20.s, z7.b, z1.b[2]\n"
- "ld1b { z9.b }, p2/Z, [x11, #2, MUL VL]\n"
- "udot z24.s, z7.b, z2.b[2]\n"
- "ld1b { z10.b }, p2/Z, [x11, #3, MUL VL]\n"
- "addvl x11, x11, #4\n"
- "udot z17.s, z8.b, z0.b[2]\n"
- "udot z21.s, z8.b, z1.b[2]\n"
- "udot z25.s, z8.b, z2.b[2]\n"
- "udot z18.s, z9.b, z0.b[2]\n"
- "udot z22.s, z9.b, z1.b[2]\n"
- "udot z26.s, z9.b, z2.b[2]\n"
- "udot z19.s, z10.b, z0.b[2]\n"
- "udot z23.s, z10.b, z1.b[2]\n"
- "udot z27.s, z10.b, z2.b[2]\n"
- "ble 41f\n"
- "ld1b { z4.b }, p2/Z, [x11]\n"
- "udot z16.s, z4.b, z0.b[3]\n"
- "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n"
- "udot z20.s, z4.b, z1.b[3]\n"
- "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n"
- "udot z24.s, z4.b, z2.b[3]\n"
- "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n"
- "addvl x11, x11, #4\n"
- "udot z17.s, z5.b, z0.b[3]\n"
- "udot z21.s, z5.b, z1.b[3]\n"
- "udot z25.s, z5.b, z2.b[3]\n"
- "udot z18.s, z6.b, z0.b[3]\n"
- "udot z22.s, z6.b, z1.b[3]\n"
- "udot z26.s, z6.b, z2.b[3]\n"
- "udot z19.s, z7.b, z0.b[3]\n"
- "udot z23.s, z7.b, z1.b[3]\n"
- "udot z27.s, z7.b, z2.b[3]\n"
- "41:" // Height 3: Multiply loop: multiply skip
- "tbnz %x[flags], #31, 42f\n"
+ "udot z20.s, z4.b, z1.b[0]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "add x21, x21, #0x10\n"
+ "udot z24.s, z4.b, z2.b[0]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "udot z21.s, z5.b, z1.b[0]\n"
+ "udot z25.s, z5.b, z2.b[0]\n"
+ "udot z18.s, z6.b, z0.b[0]\n"
+ "udot z22.s, z6.b, z1.b[0]\n"
+ "udot z26.s, z6.b, z2.b[0]\n"
+ "udot z19.s, z7.b, z0.b[0]\n"
+ "udot z23.s, z7.b, z1.b[0]\n"
+ "udot z27.s, z7.b, z2.b[0]\n"
+ "ble 38f\n"
+ "ld1b { z8.b }, p2/Z, [x28]\n"
+ "udot z16.s, z8.b, z0.b[1]\n"
+ "ld1b { z9.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "subs x24, x24, #0x4\n"
+ "udot z20.s, z8.b, z1.b[1]\n"
+ "ld1b { z10.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "udot z24.s, z8.b, z2.b[1]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "udot z17.s, z9.b, z0.b[1]\n"
+ "udot z21.s, z9.b, z1.b[1]\n"
+ "udot z25.s, z9.b, z2.b[1]\n"
+ "udot z18.s, z10.b, z0.b[1]\n"
+ "udot z22.s, z10.b, z1.b[1]\n"
+ "udot z26.s, z10.b, z2.b[1]\n"
+ "udot z19.s, z4.b, z0.b[1]\n"
+ "udot z23.s, z4.b, z1.b[1]\n"
+ "udot z27.s, z4.b, z2.b[1]\n"
+ "ble 38f\n"
+ "ld1b { z5.b }, p2/Z, [x28]\n"
+ "udot z16.s, z5.b, z0.b[2]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "subs x24, x24, #0x4\n"
+ "udot z20.s, z5.b, z1.b[2]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "udot z24.s, z5.b, z2.b[2]\n"
+ "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "udot z17.s, z6.b, z0.b[2]\n"
+ "udot z21.s, z6.b, z1.b[2]\n"
+ "udot z25.s, z6.b, z2.b[2]\n"
+ "udot z18.s, z7.b, z0.b[2]\n"
+ "udot z22.s, z7.b, z1.b[2]\n"
+ "udot z26.s, z7.b, z2.b[2]\n"
+ "udot z19.s, z8.b, z0.b[2]\n"
+ "udot z23.s, z8.b, z1.b[2]\n"
+ "udot z27.s, z8.b, z2.b[2]\n"
+ "ble 38f\n"
+ "ld1b { z9.b }, p2/Z, [x28]\n"
+ "udot z16.s, z9.b, z0.b[3]\n"
+ "ld1b { z10.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "udot z20.s, z9.b, z1.b[3]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "udot z24.s, z9.b, z2.b[3]\n"
+ "ld1b { z5.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "udot z17.s, z10.b, z0.b[3]\n"
+ "udot z21.s, z10.b, z1.b[3]\n"
+ "udot z25.s, z10.b, z2.b[3]\n"
+ "udot z18.s, z4.b, z0.b[3]\n"
+ "udot z22.s, z4.b, z1.b[3]\n"
+ "udot z26.s, z4.b, z2.b[3]\n"
+ "udot z19.s, z5.b, z0.b[3]\n"
+ "udot z23.s, z5.b, z1.b[3]\n"
+ "udot z27.s, z5.b, z2.b[3]\n"
+ "38:" // Height 3: Multiply loop: multiply skip
+ "tbnz %x[flags], #31, 39f\n"
"udot z11.s, z0.b, z15.b\n"
"udot z12.s, z1.b, z15.b\n"
"udot z13.s, z2.b, z15.b\n"
- "42:" // Height 3: Multiply loop: unique 6: skip row sum
- "prfm pldl1keep, [x26, #0x80]\n"
- "add x28, x28, #0x1\n"
- "prfm pldl1keep, [x24, #0x80]\n"
+ "39:" // Height 3: Multiply loop: unique 6: skip row sum
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "add x25, x25, #0x1\n"
"prfm pldl1keep, [x22, #0x80]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x28, x19\n"
- "bne 35b\n"
- "tbnz %x[flags], #31, 43f\n"
+ "cmp x25, x19\n"
+ "bne 32b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x22, x26, x19\n"
+ "add x21, x22, x19\n"
+ "tbnz %x[flags], #31, 40f\n"
"add x19, %x[qp], %[b_offset]\n"
"ld1rw { z3.s }, p2/Z, [x19]\n"
"neg z3.s, p2/M, z3.s\n"
@@ -879,19 +850,19 @@ void sve_hybrid_u8qa_dot_4x4VL (
"mul z11.s, p2/M, z11.s, z3.s\n"
"mul z12.s, p2/M, z12.s, z3.s\n"
"mul z13.s, p2/M, z13.s, z3.s\n"
- "43:" // Height 3: skip row sum fixup
+ "40:" // Height 3: skip row sum fixup
"add z16.s, z16.s, z11.s\n"
- "ld1w { z0.s }, p2/Z, [x10]\n"
+ "ld1w { z0.s }, p2/Z, [x27]\n"
"orr %x[flags], %x[flags], #0x80000000\n"
"add z17.s, z17.s, z11.s\n"
- "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n"
- "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1w { z1.s }, p2/Z, [x27, #1, MUL VL]\n"
+ "add x23, %x[qp], %[per_layer_right_shift]\n"
"add z18.s, z18.s, z11.s\n"
- "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z2.s }, p2/Z, [x27, #2, MUL VL]\n"
"add x19, %x[qp], %[per_layer_mul]\n"
"add z19.s, z19.s, z11.s\n"
- "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n"
- "addvl x10, x10, #4\n"
+ "ld1w { z3.s }, p2/Z, [x27, #3, MUL VL]\n"
+ "addvl x27, x27, #4\n"
"add z20.s, z20.s, z12.s\n"
"ld1rw { z4.s }, p2/Z, [x19]\n"
"add z21.s, z21.s, z12.s\n"
@@ -910,7 +881,7 @@ void sve_hybrid_u8qa_dot_4x4VL (
"add z22.s, z22.s, z2.s\n"
"add z23.s, z23.s, z3.s\n"
"add z24.s, z24.s, z0.s\n"
- "ld1rw { z0.s }, p2/Z, [x20]\n"
+ "ld1rw { z0.s }, p2/Z, [x23]\n"
"add z25.s, z25.s, z1.s\n"
"add z26.s, z26.s, z2.s\n"
"add z27.s, z27.s, z3.s\n"
@@ -926,7 +897,7 @@ void sve_hybrid_u8qa_dot_4x4VL (
".inst 0x04a47739 // sqrdmulh z25.s, z25.s, z4.s\n"
".inst 0x04a4775a // sqrdmulh z26.s, z26.s, z4.s\n"
".inst 0x04a4777b // sqrdmulh z27.s, z27.s, z4.s\n"
- "tbz %x[flags], #5, 44f\n"
+ "tbz %x[flags], #5, 41f\n"
"and z4.d, z16.d, z0.d\n"
"asr z4.s, z4.s, #0x1f\n"
"and z5.d, z17.d, z0.d\n"
@@ -963,7 +934,7 @@ void sve_hybrid_u8qa_dot_4x4VL (
"asr z8.s, z8.s, #0x1f\n"
"sqadd z26.s, z26.s, z7.s\n"
"sqadd z27.s, z27.s, z8.s\n"
- "44:" // Height 3: no shift correction
+ "41:" // Height 3: no shift correction
".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
"add x19, %x[qp], %[c_offset]\n"
"ld1rw { z4.s }, p2/Z, [x19]\n"
@@ -994,9 +965,9 @@ void sve_hybrid_u8qa_dot_4x4VL (
"uzp1 z17.h, z18.h, z19.h\n"
"smax z20.s, p2/M, z20.s, z5.s\n"
"uzp1 z16.b, z16.b, z17.b\n"
- "st1b { z16.b }, p1, [x9]\n"
+ "st1b { z16.b }, p1, [x26]\n"
"add z21.s, z21.s, z4.s\n"
- "addvl x9, x9, #1\n"
+ "addvl x26, x26, #1\n"
".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n"
".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n"
".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n"
@@ -1019,58 +990,42 @@ void sve_hybrid_u8qa_dot_4x4VL (
"uzp1 z21.h, z22.h, z23.h\n"
".inst 0x4482881b // srshl z27.s, p2/M, z27.s, z0.s\n"
"uzp1 z20.b, z20.b, z21.b\n"
- "st1b { z20.b }, p1, [x25]\n"
+ "st1b { z20.b }, p1, [x22]\n"
"add z26.s, z26.s, z4.s\n"
- "addvl x25, x25, #1\n"
- "add z27.s, z27.s, z4.s\n"
"smax z25.s, p2/M, z25.s, z5.s\n"
+ "add z27.s, z27.s, z4.s\n"
"smin z26.s, p2/M, z26.s, z6.s\n"
- "smin z27.s, p2/M, z27.s, z6.s\n"
"uzp1 z24.h, z24.h, z25.h\n"
+ "smin z27.s, p2/M, z27.s, z6.s\n"
"smax z26.s, p2/M, z26.s, z5.s\n"
"smax z27.s, p2/M, z27.s, z5.s\n"
"uzp1 z25.h, z26.h, z27.h\n"
"uzp1 z24.b, z24.b, z25.b\n"
- "st1b { z24.b }, p1, [x23]\n"
- "addvl x23, x23, #1\n"
- "45:" // Height 3: Writeback done
- "decw x12, ALL, MUL #4\n"
- "cmp x12, XZR\n"
- "bgt 33b\n"
- "b 62f\n"
- "46:" // Height 4
+ "st1b { z24.b }, p1, [x21]\n"
+ "42:" // Height 3: Writeback done
+ "decw x9, ALL, MUL #4\n"
+ "cmp x9, XZR\n"
+ "bgt 30b\n"
+ "b 58f\n"
+ "43:" // Height 4
"mov z11.s, #0x0\n"
- "ldr x12, [%x[args_ptr], %[offsetof_N]]\n"
- "mov x10, %x[col_bias]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x27, %x[col_bias]\n"
"mov z12.s, #0x0\n"
- "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"bic %x[flags], %x[flags], #0x80000000\n"
"mov z13.s, #0x0\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov x26, %x[output_ptr]\n"
"mov z14.s, #0x0\n"
+ "mov x19, #0x4\n"
"mov z15.b, #0x1\n"
- "tbz %x[flags], #2, 47f\n"
- "ldr x9, [%x[output_ptr], #0x0]\n"
- "ldr x25, [%x[output_ptr], #0x8]\n"
- "add x9, x9, x19\n"
- "ldr x23, [%x[output_ptr], #0x10]\n"
- "ldr x21, [%x[output_ptr], #0x18]\n"
- "add x25, x25, x19\n"
- "add %x[output_ptr], %x[output_ptr], #0x20\n"
- "add x23, x23, x19\n"
- "add x21, x21, x19\n"
- "b 48f\n"
- "47:" // Height 4: setup direct output
- "mov x9, %x[output_ptr]\n"
- "add x25, x9, x19\n"
- "add x23, x25, x19\n"
- "add x21, x23, x19\n"
- "add %x[output_ptr], x21, x19\n"
- "48:" // Height 4: Column loop
+ "madd %x[output_ptr], x20, x19, %x[output_ptr]\n"
+ "44:" // Height 4: Column loop
"mov z16.s, #0x0\n"
"mov x19, #0x0\n"
"mov z17.s, #0x0\n"
- "whilelt p1.b, x19, x12\n"
+ "whilelt p1.b, x19, x9\n"
"mov z18.s, #0x0\n"
"mov z19.s, #0x0\n"
"mov z20.s, #0x0\n"
@@ -1085,97 +1040,97 @@ void sve_hybrid_u8qa_dot_4x4VL (
"mov z29.s, #0x0\n"
"mov z30.s, #0x0\n"
"mov z31.s, #0x0\n"
- "49:" // Height 4: setup done
- "mov x28, #0x0\n"
- "50:" // Height 4: String loop
+ "45:" // Height 4: setup done
+ "mov x25, #0x0\n"
+ "46:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w27, [x20, x28, LSL #0x2]\n"
- "tbz %x[flags], #3, 51f\n"
- "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "ldr w24, [x20, x25, LSL #0x2]\n"
+ "tbz %x[flags], #3, 47f\n"
+ "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x26, [x20, #0x0]\n"
- "ldr x24, [x20, #0x8]\n"
- "ldr x22, [x20, #0x10]\n"
+ "ldr x23, [x20, #0x0]\n"
+ "ldr x22, [x20, #0x8]\n"
+ "ldr x21, [x20, #0x10]\n"
"ldr x20, [x20, #0x18]\n"
- "cbnz x28, 52f\n"
+ "cbnz x25, 48f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x26, x26, x19\n"
- "add x24, x24, x19\n"
+ "add x23, x23, x19\n"
"add x22, x22, x19\n"
+ "add x21, x21, x19\n"
"add x20, x20, x19\n"
- "b 52f\n"
- "51:" // Height 4: setup direct input
- "mov x26, %x[input_ptr]\n"
- "add x24, x26, x19\n"
- "add x22, x24, x19\n"
- "add x20, x22, x19\n"
- "52:" // Height 4: input setup done
- "cmp x27, #0x10\n"
- "ble 55f\n"
- "53:" // Height 4: Multiply loop: Main loop head
- "ld1b { z4.b }, p2/Z, [x11]\n"
- "whilelt p0.b, XZR, x27\n"
- "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n"
- "ld1rqb { z0.b }, p0/Z, [x26]\n"
+ "b 48f\n"
+ "47:" // Height 4: setup direct input
+ "mov x23, %x[input_ptr]\n"
+ "add x22, x23, x19\n"
+ "add x21, x22, x19\n"
+ "add x20, x21, x19\n"
+ "48:" // Height 4: input setup done
+ "cmp x24, #0x10\n"
+ "ble 51f\n"
+ "49:" // Height 4: Multiply loop: Main loop head
+ "ld1b { z4.b }, p2/Z, [x28]\n"
+ "whilelt p0.b, XZR, x24\n"
+ "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1rqb { z0.b }, p0/Z, [x23]\n"
"udot z16.s, z4.b, z0.b[0]\n"
- "ld1rqb { z1.b }, p0/Z, [x24]\n"
- "add x26, x26, #0x10\n"
+ "ld1rqb { z1.b }, p0/Z, [x22]\n"
+ "add x23, x23, #0x10\n"
"udot z17.s, z5.b, z0.b[0]\n"
- "ld1rqb { z2.b }, p0/Z, [x22]\n"
- "add x24, x24, #0x10\n"
+ "ld1rqb { z2.b }, p0/Z, [x21]\n"
+ "add x22, x22, #0x10\n"
"udot z20.s, z4.b, z1.b[0]\n"
"ld1rqb { z3.b }, p0/Z, [x20]\n"
- "add x22, x22, #0x10\n"
+ "add x21, x21, #0x10\n"
"udot z24.s, z4.b, z2.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
"add x20, x20, #0x10\n"
"udot z21.s, z5.b, z1.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
"udot z25.s, z5.b, z2.b[0]\n"
- "ld1b { z8.b }, p2/Z, [x11, #4, MUL VL]\n"
+ "ld1b { z8.b }, p2/Z, [x28, #4, MUL VL]\n"
"udot z28.s, z4.b, z3.b[0]\n"
- "ld1b { z9.b }, p2/Z, [x11, #5, MUL VL]\n"
+ "ld1b { z9.b }, p2/Z, [x28, #5, MUL VL]\n"
"udot z29.s, z5.b, z3.b[0]\n"
- "ld1b { z10.b }, p2/Z, [x11, #6, MUL VL]\n"
+ "ld1b { z10.b }, p2/Z, [x28, #6, MUL VL]\n"
"udot z18.s, z6.b, z0.b[0]\n"
- "ld1b { z4.b }, p2/Z, [x11, #7, MUL VL]\n"
- "addvl x11, x11, #16\n"
+ "ld1b { z4.b }, p2/Z, [x28, #7, MUL VL]\n"
+ "addvl x28, x28, #16\n"
"udot z22.s, z6.b, z1.b[0]\n"
- "ld1b { z5.b }, p2/Z, [x11, #-8, MUL VL]\n"
+ "ld1b { z5.b }, p2/Z, [x28, #-8, MUL VL]\n"
"udot z26.s, z6.b, z2.b[0]\n"
"udot z30.s, z6.b, z3.b[0]\n"
- "ld1b { z6.b }, p2/Z, [x11, #-7, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #-7, MUL VL]\n"
"udot z19.s, z7.b, z0.b[0]\n"
"udot z23.s, z7.b, z1.b[0]\n"
"udot z27.s, z7.b, z2.b[0]\n"
"udot z31.s, z7.b, z3.b[0]\n"
- "ld1b { z7.b }, p2/Z, [x11, #-6, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #-6, MUL VL]\n"
"udot z16.s, z8.b, z0.b[1]\n"
"udot z20.s, z8.b, z1.b[1]\n"
"udot z24.s, z8.b, z2.b[1]\n"
"udot z28.s, z8.b, z3.b[1]\n"
- "ld1b { z8.b }, p2/Z, [x11, #-5, MUL VL]\n"
+ "ld1b { z8.b }, p2/Z, [x28, #-5, MUL VL]\n"
"udot z17.s, z9.b, z0.b[1]\n"
"udot z21.s, z9.b, z1.b[1]\n"
"udot z25.s, z9.b, z2.b[1]\n"
"udot z29.s, z9.b, z3.b[1]\n"
- "ld1b { z9.b }, p2/Z, [x11, #-4, MUL VL]\n"
+ "ld1b { z9.b }, p2/Z, [x28, #-4, MUL VL]\n"
"udot z18.s, z10.b, z0.b[1]\n"
"udot z22.s, z10.b, z1.b[1]\n"
"udot z26.s, z10.b, z2.b[1]\n"
"udot z30.s, z10.b, z3.b[1]\n"
- "ld1b { z10.b }, p2/Z, [x11, #-3, MUL VL]\n"
+ "ld1b { z10.b }, p2/Z, [x28, #-3, MUL VL]\n"
"udot z19.s, z4.b, z0.b[1]\n"
"udot z23.s, z4.b, z1.b[1]\n"
"udot z27.s, z4.b, z2.b[1]\n"
"udot z31.s, z4.b, z3.b[1]\n"
- "ld1b { z4.b }, p2/Z, [x11, #-2, MUL VL]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #-2, MUL VL]\n"
"udot z16.s, z5.b, z0.b[2]\n"
"udot z20.s, z5.b, z1.b[2]\n"
"udot z24.s, z5.b, z2.b[2]\n"
"udot z28.s, z5.b, z3.b[2]\n"
- "ld1b { z5.b }, p2/Z, [x11, #-1, MUL VL]\n"
+ "ld1b { z5.b }, p2/Z, [x28, #-1, MUL VL]\n"
"udot z17.s, z6.b, z0.b[2]\n"
"udot z21.s, z6.b, z1.b[2]\n"
"udot z25.s, z6.b, z2.b[2]\n"
@@ -1204,135 +1159,139 @@ void sve_hybrid_u8qa_dot_4x4VL (
"udot z23.s, z5.b, z1.b[3]\n"
"udot z27.s, z5.b, z2.b[3]\n"
"udot z31.s, z5.b, z3.b[3]\n"
- "tbnz %x[flags], #31, 54f\n"
+ "tbnz %x[flags], #31, 50f\n"
"udot z11.s, z0.b, z15.b\n"
"udot z12.s, z1.b, z15.b\n"
"udot z13.s, z2.b, z15.b\n"
"udot z14.s, z3.b, z15.b\n"
- "54:" // Height 4: Multiply loop: unique 7: skip row sum
- "prfm pldl1keep, [x26, #0x80]\n"
- "sub x27, x27, #0x10\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "cmp x27, #0x10\n"
+ "50:" // Height 4: Multiply loop: unique 7: skip row sum
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "sub x24, x24, #0x10\n"
"prfm pldl1keep, [x22, #0x80]\n"
+ "cmp x24, #0x10\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
"prfm pldl1keep, [x20, #0x80]\n"
- "bgt 53b\n"
- "55:" // Height 4: Multiply loop: Single iteration only
- "ld1b { z6.b }, p2/Z, [x11]\n"
- "whilelt p0.b, XZR, x27\n"
- "ld1b { z7.b }, p2/Z, [x11, #1, MUL VL]\n"
- "subs x27, x27, #0x4\n"
- "ld1rqb { z0.b }, p0/Z, [x26]\n"
- "udot z16.s, z6.b, z0.b[0]\n"
- "ld1rqb { z1.b }, p0/Z, [x24]\n"
- "add x26, x26, #0x10\n"
- "udot z17.s, z7.b, z0.b[0]\n"
- "ld1rqb { z2.b }, p0/Z, [x22]\n"
- "add x24, x24, #0x10\n"
- "udot z20.s, z6.b, z1.b[0]\n"
- "ld1rqb { z3.b }, p0/Z, [x20]\n"
+ "bgt 49b\n"
+ "51:" // Height 4: Multiply loop: Single iteration only
+ "ld1b { z4.b }, p2/Z, [x28]\n"
+ "whilelt p0.b, XZR, x24\n"
+ "ld1b { z5.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "subs x24, x24, #0x4\n"
+ "ld1rqb { z0.b }, p0/Z, [x23]\n"
+ "udot z16.s, z4.b, z0.b[0]\n"
+ "ld1rqb { z1.b }, p0/Z, [x22]\n"
+ "add x23, x23, #0x10\n"
+ "udot z17.s, z5.b, z0.b[0]\n"
+ "ld1rqb { z2.b }, p0/Z, [x21]\n"
"add x22, x22, #0x10\n"
- "udot z24.s, z6.b, z2.b[0]\n"
- "ld1b { z8.b }, p2/Z, [x11, #2, MUL VL]\n"
+ "udot z20.s, z4.b, z1.b[0]\n"
+ "ld1rqb { z3.b }, p0/Z, [x20]\n"
+ "add x21, x21, #0x10\n"
+ "udot z24.s, z4.b, z2.b[0]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
"add x20, x20, #0x10\n"
- "udot z21.s, z7.b, z1.b[0]\n"
- "ld1b { z9.b }, p2/Z, [x11, #3, MUL VL]\n"
- "addvl x11, x11, #4\n"
- "udot z28.s, z6.b, z3.b[0]\n"
- "udot z25.s, z7.b, z2.b[0]\n"
- "udot z29.s, z7.b, z3.b[0]\n"
- "udot z18.s, z8.b, z0.b[0]\n"
- "udot z22.s, z8.b, z1.b[0]\n"
- "udot z26.s, z8.b, z2.b[0]\n"
- "udot z30.s, z8.b, z3.b[0]\n"
- "udot z19.s, z9.b, z0.b[0]\n"
- "udot z23.s, z9.b, z1.b[0]\n"
- "udot z27.s, z9.b, z2.b[0]\n"
- "udot z31.s, z9.b, z3.b[0]\n"
- "ble 56f\n"
- "ld1b { z10.b }, p2/Z, [x11]\n"
- "udot z16.s, z10.b, z0.b[1]\n"
- "ld1b { z4.b }, p2/Z, [x11, #1, MUL VL]\n"
- "subs x27, x27, #0x4\n"
- "udot z20.s, z10.b, z1.b[1]\n"
- "ld1b { z5.b }, p2/Z, [x11, #2, MUL VL]\n"
- "udot z24.s, z10.b, z2.b[1]\n"
- "ld1b { z6.b }, p2/Z, [x11, #3, MUL VL]\n"
- "addvl x11, x11, #4\n"
- "udot z28.s, z10.b, z3.b[1]\n"
- "udot z17.s, z4.b, z0.b[1]\n"
- "udot z21.s, z4.b, z1.b[1]\n"
- "udot z25.s, z4.b, z2.b[1]\n"
- "udot z29.s, z4.b, z3.b[1]\n"
- "udot z18.s, z5.b, z0.b[1]\n"
- "udot z22.s, z5.b, z1.b[1]\n"
- "udot z26.s, z5.b, z2.b[1]\n"
- "udot z30.s, z5.b, z3.b[1]\n"
- "udot z19.s, z6.b, z0.b[1]\n"
- "udot z23.s, z6.b, z1.b[1]\n"
- "udot z27.s, z6.b, z2.b[1]\n"
- "udot z31.s, z6.b, z3.b[1]\n"
- "ble 56f\n"
- "ld1b { z7.b }, p2/Z, [x11]\n"
- "udot z16.s, z7.b, z0.b[2]\n"
- "ld1b { z8.b }, p2/Z, [x11, #1, MUL VL]\n"
- "subs x27, x27, #0x4\n"
- "udot z20.s, z7.b, z1.b[2]\n"
- "ld1b { z9.b }, p2/Z, [x11, #2, MUL VL]\n"
- "udot z24.s, z7.b, z2.b[2]\n"
- "ld1b { z10.b }, p2/Z, [x11, #3, MUL VL]\n"
- "addvl x11, x11, #4\n"
- "udot z28.s, z7.b, z3.b[2]\n"
- "udot z17.s, z8.b, z0.b[2]\n"
- "udot z21.s, z8.b, z1.b[2]\n"
- "udot z25.s, z8.b, z2.b[2]\n"
- "udot z29.s, z8.b, z3.b[2]\n"
- "udot z18.s, z9.b, z0.b[2]\n"
- "udot z22.s, z9.b, z1.b[2]\n"
- "udot z26.s, z9.b, z2.b[2]\n"
- "udot z30.s, z9.b, z3.b[2]\n"
- "udot z19.s, z10.b, z0.b[2]\n"
- "udot z23.s, z10.b, z1.b[2]\n"
- "udot z27.s, z10.b, z2.b[2]\n"
- "udot z31.s, z10.b, z3.b[2]\n"
- "ble 56f\n"
- "ld1b { z4.b }, p2/Z, [x11]\n"
- "udot z16.s, z4.b, z0.b[3]\n"
- "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n"
- "udot z20.s, z4.b, z1.b[3]\n"
- "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n"
- "udot z24.s, z4.b, z2.b[3]\n"
- "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n"
- "addvl x11, x11, #4\n"
- "udot z28.s, z4.b, z3.b[3]\n"
- "udot z17.s, z5.b, z0.b[3]\n"
- "udot z21.s, z5.b, z1.b[3]\n"
- "udot z25.s, z5.b, z2.b[3]\n"
- "udot z29.s, z5.b, z3.b[3]\n"
- "udot z18.s, z6.b, z0.b[3]\n"
- "udot z22.s, z6.b, z1.b[3]\n"
- "udot z26.s, z6.b, z2.b[3]\n"
- "udot z30.s, z6.b, z3.b[3]\n"
- "udot z19.s, z7.b, z0.b[3]\n"
- "udot z23.s, z7.b, z1.b[3]\n"
- "udot z27.s, z7.b, z2.b[3]\n"
- "udot z31.s, z7.b, z3.b[3]\n"
- "56:" // Height 4: Multiply loop: multiply skip
- "tbnz %x[flags], #31, 57f\n"
+ "udot z21.s, z5.b, z1.b[0]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "udot z28.s, z4.b, z3.b[0]\n"
+ "udot z25.s, z5.b, z2.b[0]\n"
+ "udot z29.s, z5.b, z3.b[0]\n"
+ "udot z18.s, z6.b, z0.b[0]\n"
+ "udot z22.s, z6.b, z1.b[0]\n"
+ "udot z26.s, z6.b, z2.b[0]\n"
+ "udot z30.s, z6.b, z3.b[0]\n"
+ "udot z19.s, z7.b, z0.b[0]\n"
+ "udot z23.s, z7.b, z1.b[0]\n"
+ "udot z27.s, z7.b, z2.b[0]\n"
+ "udot z31.s, z7.b, z3.b[0]\n"
+ "ble 52f\n"
+ "ld1b { z8.b }, p2/Z, [x28]\n"
+ "udot z16.s, z8.b, z0.b[1]\n"
+ "ld1b { z9.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "subs x24, x24, #0x4\n"
+ "udot z20.s, z8.b, z1.b[1]\n"
+ "ld1b { z10.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "udot z24.s, z8.b, z2.b[1]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "udot z28.s, z8.b, z3.b[1]\n"
+ "udot z17.s, z9.b, z0.b[1]\n"
+ "udot z21.s, z9.b, z1.b[1]\n"
+ "udot z25.s, z9.b, z2.b[1]\n"
+ "udot z29.s, z9.b, z3.b[1]\n"
+ "udot z18.s, z10.b, z0.b[1]\n"
+ "udot z22.s, z10.b, z1.b[1]\n"
+ "udot z26.s, z10.b, z2.b[1]\n"
+ "udot z30.s, z10.b, z3.b[1]\n"
+ "udot z19.s, z4.b, z0.b[1]\n"
+ "udot z23.s, z4.b, z1.b[1]\n"
+ "udot z27.s, z4.b, z2.b[1]\n"
+ "udot z31.s, z4.b, z3.b[1]\n"
+ "ble 52f\n"
+ "ld1b { z5.b }, p2/Z, [x28]\n"
+ "udot z16.s, z5.b, z0.b[2]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "subs x24, x24, #0x4\n"
+ "udot z20.s, z5.b, z1.b[2]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "udot z24.s, z5.b, z2.b[2]\n"
+ "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "udot z28.s, z5.b, z3.b[2]\n"
+ "udot z17.s, z6.b, z0.b[2]\n"
+ "udot z21.s, z6.b, z1.b[2]\n"
+ "udot z25.s, z6.b, z2.b[2]\n"
+ "udot z29.s, z6.b, z3.b[2]\n"
+ "udot z18.s, z7.b, z0.b[2]\n"
+ "udot z22.s, z7.b, z1.b[2]\n"
+ "udot z26.s, z7.b, z2.b[2]\n"
+ "udot z30.s, z7.b, z3.b[2]\n"
+ "udot z19.s, z8.b, z0.b[2]\n"
+ "udot z23.s, z8.b, z1.b[2]\n"
+ "udot z27.s, z8.b, z2.b[2]\n"
+ "udot z31.s, z8.b, z3.b[2]\n"
+ "ble 52f\n"
+ "ld1b { z9.b }, p2/Z, [x28]\n"
+ "udot z16.s, z9.b, z0.b[3]\n"
+ "ld1b { z10.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "udot z20.s, z9.b, z1.b[3]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "udot z24.s, z9.b, z2.b[3]\n"
+ "ld1b { z5.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "udot z28.s, z9.b, z3.b[3]\n"
+ "udot z17.s, z10.b, z0.b[3]\n"
+ "udot z21.s, z10.b, z1.b[3]\n"
+ "udot z25.s, z10.b, z2.b[3]\n"
+ "udot z29.s, z10.b, z3.b[3]\n"
+ "udot z18.s, z4.b, z0.b[3]\n"
+ "udot z22.s, z4.b, z1.b[3]\n"
+ "udot z26.s, z4.b, z2.b[3]\n"
+ "udot z30.s, z4.b, z3.b[3]\n"
+ "udot z19.s, z5.b, z0.b[3]\n"
+ "udot z23.s, z5.b, z1.b[3]\n"
+ "udot z27.s, z5.b, z2.b[3]\n"
+ "udot z31.s, z5.b, z3.b[3]\n"
+ "52:" // Height 4: Multiply loop: multiply skip
+ "tbnz %x[flags], #31, 53f\n"
"udot z11.s, z0.b, z15.b\n"
"udot z12.s, z1.b, z15.b\n"
"udot z13.s, z2.b, z15.b\n"
"udot z14.s, z3.b, z15.b\n"
- "57:" // Height 4: Multiply loop: unique 8: skip row sum
- "prfm pldl1keep, [x26, #0x80]\n"
- "add x28, x28, #0x1\n"
- "prfm pldl1keep, [x24, #0x80]\n"
+ "53:" // Height 4: Multiply loop: unique 8: skip row sum
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "add x25, x25, #0x1\n"
"prfm pldl1keep, [x22, #0x80]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
"prfm pldl1keep, [x20, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x28, x19\n"
- "bne 50b\n"
- "tbnz %x[flags], #31, 58f\n"
+ "cmp x25, x19\n"
+ "bne 46b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x22, x26, x19\n"
+ "add x21, x22, x19\n"
+ "add x20, x21, x19\n"
+ "tbnz %x[flags], #31, 54f\n"
"add x19, %x[qp], %[b_offset]\n"
"ld1rw { z4.s }, p2/Z, [x19]\n"
"neg z4.s, p2/M, z4.s\n"
@@ -1350,19 +1309,19 @@ void sve_hybrid_u8qa_dot_4x4VL (
"mul z12.s, p2/M, z12.s, z4.s\n"
"mul z13.s, p2/M, z13.s, z4.s\n"
"mul z14.s, p2/M, z14.s, z4.s\n"
- "58:" // Height 4: skip row sum fixup
+ "54:" // Height 4: skip row sum fixup
"add z16.s, z16.s, z11.s\n"
- "ld1w { z0.s }, p2/Z, [x10]\n"
+ "ld1w { z0.s }, p2/Z, [x27]\n"
"orr %x[flags], %x[flags], #0x80000000\n"
"add z17.s, z17.s, z11.s\n"
- "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n"
- "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1w { z1.s }, p2/Z, [x27, #1, MUL VL]\n"
+ "add x23, %x[qp], %[per_layer_right_shift]\n"
"add z18.s, z18.s, z11.s\n"
- "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z2.s }, p2/Z, [x27, #2, MUL VL]\n"
"add x19, %x[qp], %[per_layer_mul]\n"
"add z19.s, z19.s, z11.s\n"
- "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n"
- "addvl x10, x10, #4\n"
+ "ld1w { z3.s }, p2/Z, [x27, #3, MUL VL]\n"
+ "addvl x27, x27, #4\n"
"add z20.s, z20.s, z12.s\n"
"ld1rw { z4.s }, p2/Z, [x19]\n"
"add z21.s, z21.s, z12.s\n"
@@ -1389,7 +1348,7 @@ void sve_hybrid_u8qa_dot_4x4VL (
"add z26.s, z26.s, z2.s\n"
"add z27.s, z27.s, z3.s\n"
"add z28.s, z28.s, z0.s\n"
- "ld1rw { z0.s }, p2/Z, [x20]\n"
+ "ld1rw { z0.s }, p2/Z, [x23]\n"
"add z29.s, z29.s, z1.s\n"
"add z30.s, z30.s, z2.s\n"
"add z31.s, z31.s, z3.s\n"
@@ -1409,7 +1368,7 @@ void sve_hybrid_u8qa_dot_4x4VL (
".inst 0x04a477bd // sqrdmulh z29.s, z29.s, z4.s\n"
".inst 0x04a477de // sqrdmulh z30.s, z30.s, z4.s\n"
".inst 0x04a477ff // sqrdmulh z31.s, z31.s, z4.s\n"
- "tbz %x[flags], #5, 59f\n"
+ "tbz %x[flags], #5, 55f\n"
"and z4.d, z16.d, z0.d\n"
"asr z4.s, z4.s, #0x1f\n"
"and z5.d, z17.d, z0.d\n"
@@ -1458,7 +1417,7 @@ void sve_hybrid_u8qa_dot_4x4VL (
"sqadd z29.s, z29.s, z10.s\n"
"sqadd z30.s, z30.s, z4.s\n"
"sqadd z31.s, z31.s, z5.s\n"
- "59:" // Height 4: no shift correction
+ "55:" // Height 4: no shift correction
".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
"add x19, %x[qp], %[c_offset]\n"
"ld1rw { z4.s }, p2/Z, [x19]\n"
@@ -1489,9 +1448,9 @@ void sve_hybrid_u8qa_dot_4x4VL (
"uzp1 z17.h, z18.h, z19.h\n"
"smax z20.s, p2/M, z20.s, z5.s\n"
"uzp1 z16.b, z16.b, z17.b\n"
- "st1b { z16.b }, p1, [x9]\n"
+ "st1b { z16.b }, p1, [x26]\n"
"add z21.s, z21.s, z4.s\n"
- "addvl x9, x9, #1\n"
+ "addvl x26, x26, #1\n"
".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n"
".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n"
".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n"
@@ -1514,61 +1473,58 @@ void sve_hybrid_u8qa_dot_4x4VL (
"uzp1 z21.h, z22.h, z23.h\n"
".inst 0x4482881b // srshl z27.s, p2/M, z27.s, z0.s\n"
"uzp1 z20.b, z20.b, z21.b\n"
- "st1b { z20.b }, p1, [x25]\n"
+ "st1b { z20.b }, p1, [x22]\n"
"add z26.s, z26.s, z4.s\n"
- "addvl x25, x25, #1\n"
- "add z27.s, z27.s, z4.s\n"
"smax z25.s, p2/M, z25.s, z5.s\n"
+ "add z27.s, z27.s, z4.s\n"
".inst 0x4482881c // srshl z28.s, p2/M, z28.s, z0.s\n"
"smin z26.s, p2/M, z26.s, z6.s\n"
- "smin z27.s, p2/M, z27.s, z6.s\n"
"uzp1 z24.h, z24.h, z25.h\n"
+ "smin z27.s, p2/M, z27.s, z6.s\n"
"add z28.s, z28.s, z4.s\n"
"smax z26.s, p2/M, z26.s, z5.s\n"
+ ".inst 0x4482881d // srshl z29.s, p2/M, z29.s, z0.s\n"
"smax z27.s, p2/M, z27.s, z5.s\n"
"smin z28.s, p2/M, z28.s, z6.s\n"
- ".inst 0x4482881d // srshl z29.s, p2/M, z29.s, z0.s\n"
".inst 0x4482881e // srshl z30.s, p2/M, z30.s, z0.s\n"
+ "add z29.s, z29.s, z4.s\n"
"uzp1 z25.h, z26.h, z27.h\n"
"smax z28.s, p2/M, z28.s, z5.s\n"
- "add z29.s, z29.s, z4.s\n"
"add z30.s, z30.s, z4.s\n"
"uzp1 z24.b, z24.b, z25.b\n"
- "st1b { z24.b }, p1, [x23]\n"
+ "st1b { z24.b }, p1, [x21]\n"
"smin z29.s, p2/M, z29.s, z6.s\n"
- "addvl x23, x23, #1\n"
"smin z30.s, p2/M, z30.s, z6.s\n"
".inst 0x4482881f // srshl z31.s, p2/M, z31.s, z0.s\n"
"smax z29.s, p2/M, z29.s, z5.s\n"
- "add z31.s, z31.s, z4.s\n"
"smax z30.s, p2/M, z30.s, z5.s\n"
+ "add z31.s, z31.s, z4.s\n"
"uzp1 z28.h, z28.h, z29.h\n"
"smin z31.s, p2/M, z31.s, z6.s\n"
"smax z31.s, p2/M, z31.s, z5.s\n"
"uzp1 z29.h, z30.h, z31.h\n"
"uzp1 z28.b, z28.b, z29.b\n"
- "st1b { z28.b }, p1, [x21]\n"
- "addvl x21, x21, #1\n"
- "60:" // Height 4: Writeback done
- "decw x12, ALL, MUL #4\n"
- "cmp x12, XZR\n"
- "bgt 48b\n"
+ "st1b { z28.b }, p1, [x20]\n"
+ "56:" // Height 4: Writeback done
+ "decw x9, ALL, MUL #4\n"
+ "cmp x9, XZR\n"
+ "bgt 44b\n"
"subs %x[M], %x[M], #0x4\n"
- "beq 62f\n"
+ "beq 58f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "tbz %x[flags], #3, 61f\n"
+ "tbz %x[flags], #3, 57f\n"
"add x20, x20, #0x4\n"
"str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
"b 1b\n"
- "61:" // Update direct input
+ "57:" // Update direct input
"mov x19, #0x4\n"
"madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
"b 1b\n"
- "62:" // Exit
+ "58:" // Exit
: [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
- : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL.hpp
index 8433fa605e..4ea1d17c4e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL.hpp
@@ -36,7 +36,6 @@
namespace arm_gemm
{
-
// Actual kernel implementations
void sve_hybrid_u8u32_dot_6x4VL( ARGLIST );
@@ -73,7 +72,6 @@ public:
// Default to the generic kernel
kern_type kernel=sve_hybrid_u8u32_dot_6x4VL;
-
cls_sve_hybrid_u8u32_dot_6x4VL(const CPUInfo *)
{
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/generic.cpp
index 6ee636d7f6..97f6665d85 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/generic.cpp
@@ -80,197 +80,184 @@ void sve_hybrid_u8u32_dot_6x4VL (
"ptrue p5.b\n"
"1:" // Row loop
"cmp %x[M], #0x6\n"
- "bge 61f\n"
+ "bge 56f\n"
"cmp %x[M], #0x4\n"
- "bgt 49f\n"
- "beq 37f\n"
+ "bgt 45f\n"
+ "beq 34f\n"
"cmp %x[M], #0x2\n"
- "bgt 25f\n"
- "beq 13f\n"
- "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "tbz %x[flags], #2, 2f\n"
- "ldr x13, [%x[output_ptr], #0x0]\n"
- "add x13, x13, x19, LSL #2\n"
- "b 3f\n"
- "2:" // Height 1: setup direct output
- "mov x13, %x[output_ptr]\n"
- "3:" // Height 1: Column loop
+ "bgt 23f\n"
+ "beq 12f\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "2:" // Height 1: Column loop
"mov x19, #0x0\n"
- "whilelt p4.s, x19, x15\n"
+ "whilelt p4.s, x19, x10\n"
"incw x19\n"
- "whilelt p3.s, x19, x15\n"
+ "whilelt p3.s, x19, x10\n"
"incw x19\n"
- "whilelt p2.s, x19, x15\n"
+ "whilelt p2.s, x19, x10\n"
"incw x19\n"
- "whilelt p1.s, x19, x15\n"
- "tbz %x[flags], #0, 4f\n"
- "ld1w { z8.s }, p4/Z, [x13]\n"
- "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
- "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
- "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
- "b 5f\n"
- "4:" // Height 1: no accumulate
+ "whilelt p1.s, x19, x10\n"
+ "tbz %x[flags], #0, 3f\n"
+ "ld1w { z8.s }, p4/Z, [x28]\n"
+ "ld1w { z9.s }, p3/Z, [x28, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1w { z11.s }, p1/Z, [x28, #3, MUL VL]\n"
+ "b 4f\n"
+ "3:" // Height 1: no accumulate
"mov z8.s, #0x0\n"
"mov z9.s, #0x0\n"
"mov z10.s, #0x0\n"
"mov z11.s, #0x0\n"
- "5:" // Height 1: setup done
- "mov x12, #0x0\n"
- "6:" // Height 1: String loop
+ "4:" // Height 1: setup done
+ "mov x27, #0x0\n"
+ "5:" // Height 1: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w11, [x20, x12, LSL #0x2]\n"
- "tbz %x[flags], #3, 7f\n"
- "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 6f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x10, [x20, #0x0]\n"
- "cbnz x12, 8f\n"
+ "ldr x25, [x20, #0x0]\n"
+ "cbnz x27, 7f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x10, x10, x19\n"
- "b 8f\n"
- "7:" // Height 1: setup direct input
- "mov x10, %x[input_ptr]\n"
- "8:" // Height 1: input setup done
- "cmp x11, #0x10\n"
- "ble 10f\n"
- "9:" // Height 1: Multiply loop: Main loop head
- "ld1b { z6.b }, p5/Z, [x14]\n"
- "whilelt p0.b, XZR, x11\n"
- "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
- "sub x11, x11, #0x10\n"
- "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "add x25, x25, x19\n"
+ "b 7f\n"
+ "6:" // Height 1: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "7:" // Height 1: input setup done
+ "cmp x26, #0x10\n"
+ "ble 9f\n"
+ "8:" // Height 1: Multiply loop: Main loop head
+ "ld1b { z6.b }, p5/Z, [x9]\n"
+ "whilelt p0.b, XZR, x26\n"
+ "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
+ "sub x26, x26, #0x10\n"
+ "ld1rqb { z0.b }, p0/Z, [x25]\n"
"udot z8.s, z6.b, z0.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
- "add x10, x10, #0x10\n"
+ "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
+ "add x25, x25, #0x10\n"
"udot z9.s, z7.b, z0.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
- "cmp x11, #0x10\n"
+ "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
+ "cmp x26, #0x10\n"
"udot z10.s, z6.b, z0.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x14, #4, MUL VL]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #4, MUL VL]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
"udot z11.s, z7.b, z0.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x14, #5, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x9, #5, MUL VL]\n"
"udot z8.s, z6.b, z0.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x14, #6, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #6, MUL VL]\n"
"udot z9.s, z7.b, z0.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x14, #7, MUL VL]\n"
- "addvl x14, x14, #16\n"
+ "ld1b { z7.b }, p5/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #16\n"
"udot z10.s, z6.b, z0.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x14, #-8, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-8, MUL VL]\n"
"udot z11.s, z7.b, z0.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x14, #-7, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-7, MUL VL]\n"
"udot z8.s, z6.b, z0.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x14, #-6, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-6, MUL VL]\n"
"udot z9.s, z7.b, z0.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x14, #-5, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-5, MUL VL]\n"
"udot z10.s, z6.b, z0.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x14, #-4, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-4, MUL VL]\n"
"udot z11.s, z7.b, z0.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x14, #-3, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-3, MUL VL]\n"
"udot z8.s, z6.b, z0.b[3]\n"
- "ld1b { z6.b }, p5/Z, [x14, #-2, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-2, MUL VL]\n"
"udot z9.s, z7.b, z0.b[3]\n"
- "ld1b { z7.b }, p5/Z, [x14, #-1, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-1, MUL VL]\n"
"udot z10.s, z6.b, z0.b[3]\n"
"udot z11.s, z7.b, z0.b[3]\n"
- "bgt 9b\n"
- "10:" // Height 1: Multiply loop: Single iteration only
- "ld1b { z6.b }, p5/Z, [x14]\n"
- "whilelt p0.b, XZR, x11\n"
- "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
- "subs x11, x11, #0x4\n"
- "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "bgt 8b\n"
+ "9:" // Height 1: Multiply loop: Single iteration only
+ "ld1b { z6.b }, p5/Z, [x9]\n"
+ "whilelt p0.b, XZR, x26\n"
+ "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
+ "subs x26, x26, #0x4\n"
+ "ld1rqb { z0.b }, p0/Z, [x25]\n"
"udot z8.s, z6.b, z0.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
- "add x10, x10, #0x10\n"
+ "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
+ "add x25, x25, #0x10\n"
"udot z9.s, z7.b, z0.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
+ "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
"udot z10.s, z6.b, z0.b[0]\n"
"udot z11.s, z7.b, z0.b[0]\n"
- "ble 11f\n"
- "ld1b { z6.b }, p5/Z, [x14]\n"
+ "ble 10f\n"
+ "ld1b { z6.b }, p5/Z, [x9]\n"
"udot z8.s, z6.b, z0.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
- "subs x11, x11, #0x4\n"
+ "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
+ "subs x26, x26, #0x4\n"
"udot z9.s, z7.b, z0.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
- "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
"udot z10.s, z6.b, z0.b[1]\n"
- "addvl x14, x14, #4\n"
+ "addvl x9, x9, #4\n"
"udot z11.s, z7.b, z0.b[1]\n"
- "ble 11f\n"
- "ld1b { z6.b }, p5/Z, [x14]\n"
+ "ble 10f\n"
+ "ld1b { z6.b }, p5/Z, [x9]\n"
"udot z8.s, z6.b, z0.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
- "subs x11, x11, #0x4\n"
+ "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
+ "subs x26, x26, #0x4\n"
"udot z9.s, z7.b, z0.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
- "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
"udot z10.s, z6.b, z0.b[2]\n"
- "addvl x14, x14, #4\n"
+ "addvl x9, x9, #4\n"
"udot z11.s, z7.b, z0.b[2]\n"
- "ble 11f\n"
- "ld1b { z6.b }, p5/Z, [x14]\n"
+ "ble 10f\n"
+ "ld1b { z6.b }, p5/Z, [x9]\n"
"udot z8.s, z6.b, z0.b[3]\n"
- "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
- "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
"udot z9.s, z7.b, z0.b[3]\n"
- "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
+ "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
"udot z10.s, z6.b, z0.b[3]\n"
"udot z11.s, z7.b, z0.b[3]\n"
- "11:" // Height 1: Multiply loop: multiply skip
- "prfm pldl1keep, [x10, #0x80]\n"
- "add x12, x12, #0x1\n"
+ "10:" // Height 1: Multiply loop: multiply skip
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x27, x27, #0x1\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x12, x19\n"
- "bne 6b\n"
- "st1w { z8.s }, p4, [x13]\n"
- "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
- "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
- "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
- "addvl x13, x13, #4\n"
- "12:" // Height 1: Writeback done
- "decw x15, ALL, MUL #4\n"
- "cmp x15, XZR\n"
- "bgt 3b\n"
- "b 74f\n"
- "13:" // Height 2
- "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
- "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "tbz %x[flags], #2, 14f\n"
- "ldr x13, [%x[output_ptr], #0x0]\n"
- "add x13, x13, x19, LSL #2\n"
- "ldr x9, [%x[output_ptr], #0x8]\n"
- "add x9, x9, x19, LSL #2\n"
- "b 15f\n"
- "14:" // Height 2: setup direct output
- "mov x13, %x[output_ptr]\n"
- "add x9, x13, x19, LSL #2\n"
- "15:" // Height 2: Column loop
+ "cmp x27, x19\n"
+ "bne 5b\n"
+ "st1w { z8.s }, p4, [x28]\n"
+ "st1w { z9.s }, p3, [x28, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x28, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "11:" // Height 1: Writeback done
+ "decw x10, ALL, MUL #4\n"
+ "cmp x10, XZR\n"
+ "bgt 2b\n"
+ "b 68f\n"
+ "12:" // Height 2
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "13:" // Height 2: Column loop
"mov x19, #0x0\n"
- "whilelt p4.s, x19, x15\n"
+ "whilelt p4.s, x19, x10\n"
"incw x19\n"
- "whilelt p3.s, x19, x15\n"
+ "whilelt p3.s, x19, x10\n"
"incw x19\n"
- "whilelt p2.s, x19, x15\n"
+ "whilelt p2.s, x19, x10\n"
"incw x19\n"
- "whilelt p1.s, x19, x15\n"
- "tbz %x[flags], #0, 16f\n"
- "ld1w { z8.s }, p4/Z, [x13]\n"
- "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
- "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
- "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
- "ld1w { z12.s }, p4/Z, [x9]\n"
- "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n"
- "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n"
- "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n"
- "b 17f\n"
- "16:" // Height 2: no accumulate
+ "whilelt p1.s, x19, x10\n"
+ "tbz %x[flags], #0, 14f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ld1w { z8.s }, p4/Z, [x28]\n"
+ "add x23, x28, x19, LSL #2\n"
+ "ld1w { z9.s }, p3/Z, [x28, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1w { z11.s }, p1/Z, [x28, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x23]\n"
+ "ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "b 15f\n"
+ "14:" // Height 2: no accumulate
"mov z8.s, #0x0\n"
"mov z9.s, #0x0\n"
"mov z10.s, #0x0\n"
@@ -279,214 +266,206 @@ void sve_hybrid_u8u32_dot_6x4VL (
"mov z13.s, #0x0\n"
"mov z14.s, #0x0\n"
"mov z15.s, #0x0\n"
- "17:" // Height 2: setup done
- "mov x12, #0x0\n"
- "18:" // Height 2: String loop
+ "15:" // Height 2: setup done
+ "mov x27, #0x0\n"
+ "16:" // Height 2: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w11, [x20, x12, LSL #0x2]\n"
- "tbz %x[flags], #3, 19f\n"
- "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 17f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x10, [x20, #0x0]\n"
- "ldr x28, [x20, #0x8]\n"
- "cbnz x12, 20f\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "cbnz x27, 18f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x10, x10, x19\n"
- "add x28, x28, x19\n"
- "b 20f\n"
- "19:" // Height 2: setup direct input
- "mov x10, %x[input_ptr]\n"
- "add x28, x10, x19\n"
- "20:" // Height 2: input setup done
- "cmp x11, #0x10\n"
- "ble 22f\n"
- "21:" // Height 2: Multiply loop: Main loop head
- "ld1b { z6.b }, p5/Z, [x14]\n"
- "whilelt p0.b, XZR, x11\n"
- "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
- "sub x11, x11, #0x10\n"
- "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "add x25, x25, x19\n"
+ "add x24, x24, x19\n"
+ "b 18f\n"
+ "17:" // Height 2: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19\n"
+ "18:" // Height 2: input setup done
+ "cmp x26, #0x10\n"
+ "ble 20f\n"
+ "19:" // Height 2: Multiply loop: Main loop head
+ "ld1b { z6.b }, p5/Z, [x9]\n"
+ "whilelt p0.b, XZR, x26\n"
+ "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
+ "sub x26, x26, #0x10\n"
+ "ld1rqb { z0.b }, p0/Z, [x25]\n"
"udot z8.s, z6.b, z0.b[0]\n"
- "ld1rqb { z1.b }, p0/Z, [x28]\n"
- "add x10, x10, #0x10\n"
+ "ld1rqb { z1.b }, p0/Z, [x24]\n"
+ "add x25, x25, #0x10\n"
"udot z9.s, z7.b, z0.b[0]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
- "add x28, x28, #0x10\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x24, x24, #0x10\n"
"udot z12.s, z6.b, z1.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
- "cmp x11, #0x10\n"
+ "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
+ "cmp x26, #0x10\n"
"udot z13.s, z7.b, z1.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
+ "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
"udot z10.s, z6.b, z0.b[0]\n"
"udot z14.s, z6.b, z1.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x14, #4, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #4, MUL VL]\n"
"udot z11.s, z7.b, z0.b[0]\n"
"udot z15.s, z7.b, z1.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x14, #5, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x9, #5, MUL VL]\n"
"udot z8.s, z6.b, z0.b[1]\n"
"udot z12.s, z6.b, z1.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x14, #6, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #6, MUL VL]\n"
"udot z9.s, z7.b, z0.b[1]\n"
"udot z13.s, z7.b, z1.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x14, #7, MUL VL]\n"
- "addvl x14, x14, #16\n"
+ "ld1b { z7.b }, p5/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #16\n"
"udot z10.s, z6.b, z0.b[1]\n"
"udot z14.s, z6.b, z1.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x14, #-8, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-8, MUL VL]\n"
"udot z11.s, z7.b, z0.b[1]\n"
"udot z15.s, z7.b, z1.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x14, #-7, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-7, MUL VL]\n"
"udot z8.s, z6.b, z0.b[2]\n"
"udot z12.s, z6.b, z1.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x14, #-6, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-6, MUL VL]\n"
"udot z9.s, z7.b, z0.b[2]\n"
"udot z13.s, z7.b, z1.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x14, #-5, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-5, MUL VL]\n"
"udot z10.s, z6.b, z0.b[2]\n"
"udot z14.s, z6.b, z1.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x14, #-4, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-4, MUL VL]\n"
"udot z11.s, z7.b, z0.b[2]\n"
"udot z15.s, z7.b, z1.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x14, #-3, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-3, MUL VL]\n"
"udot z8.s, z6.b, z0.b[3]\n"
"udot z12.s, z6.b, z1.b[3]\n"
- "ld1b { z6.b }, p5/Z, [x14, #-2, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-2, MUL VL]\n"
"udot z9.s, z7.b, z0.b[3]\n"
"udot z13.s, z7.b, z1.b[3]\n"
- "ld1b { z7.b }, p5/Z, [x14, #-1, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-1, MUL VL]\n"
"udot z10.s, z6.b, z0.b[3]\n"
"udot z14.s, z6.b, z1.b[3]\n"
"udot z11.s, z7.b, z0.b[3]\n"
"udot z15.s, z7.b, z1.b[3]\n"
- "bgt 21b\n"
- "22:" // Height 2: Multiply loop: Single iteration only
- "ld1b { z6.b }, p5/Z, [x14]\n"
- "whilelt p0.b, XZR, x11\n"
- "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
- "subs x11, x11, #0x4\n"
- "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "bgt 19b\n"
+ "20:" // Height 2: Multiply loop: Single iteration only
+ "ld1b { z6.b }, p5/Z, [x9]\n"
+ "whilelt p0.b, XZR, x26\n"
+ "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
+ "subs x26, x26, #0x4\n"
+ "ld1rqb { z0.b }, p0/Z, [x25]\n"
"udot z8.s, z6.b, z0.b[0]\n"
- "ld1rqb { z1.b }, p0/Z, [x28]\n"
- "add x10, x10, #0x10\n"
+ "ld1rqb { z1.b }, p0/Z, [x24]\n"
+ "add x25, x25, #0x10\n"
"udot z9.s, z7.b, z0.b[0]\n"
- "add x28, x28, #0x10\n"
+ "add x24, x24, #0x10\n"
"udot z12.s, z6.b, z1.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
"udot z13.s, z7.b, z1.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
+ "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
"udot z10.s, z6.b, z0.b[0]\n"
"udot z14.s, z6.b, z1.b[0]\n"
"udot z11.s, z7.b, z0.b[0]\n"
"udot z15.s, z7.b, z1.b[0]\n"
- "ble 23f\n"
- "ld1b { z6.b }, p5/Z, [x14]\n"
+ "ble 21f\n"
+ "ld1b { z6.b }, p5/Z, [x9]\n"
"udot z8.s, z6.b, z0.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
- "subs x11, x11, #0x4\n"
+ "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
+ "subs x26, x26, #0x4\n"
"udot z12.s, z6.b, z1.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
"udot z9.s, z7.b, z0.b[1]\n"
"udot z13.s, z7.b, z1.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
+ "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
"udot z10.s, z6.b, z0.b[1]\n"
"udot z14.s, z6.b, z1.b[1]\n"
"udot z11.s, z7.b, z0.b[1]\n"
"udot z15.s, z7.b, z1.b[1]\n"
- "ble 23f\n"
- "ld1b { z6.b }, p5/Z, [x14]\n"
+ "ble 21f\n"
+ "ld1b { z6.b }, p5/Z, [x9]\n"
"udot z8.s, z6.b, z0.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
- "subs x11, x11, #0x4\n"
+ "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
+ "subs x26, x26, #0x4\n"
"udot z12.s, z6.b, z1.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
"udot z9.s, z7.b, z0.b[2]\n"
"udot z13.s, z7.b, z1.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
+ "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
"udot z10.s, z6.b, z0.b[2]\n"
"udot z14.s, z6.b, z1.b[2]\n"
"udot z11.s, z7.b, z0.b[2]\n"
"udot z15.s, z7.b, z1.b[2]\n"
- "ble 23f\n"
- "ld1b { z6.b }, p5/Z, [x14]\n"
+ "ble 21f\n"
+ "ld1b { z6.b }, p5/Z, [x9]\n"
"udot z8.s, z6.b, z0.b[3]\n"
- "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
"udot z12.s, z6.b, z1.b[3]\n"
- "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
"udot z9.s, z7.b, z0.b[3]\n"
"udot z13.s, z7.b, z1.b[3]\n"
- "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
+ "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
"udot z10.s, z6.b, z0.b[3]\n"
"udot z14.s, z6.b, z1.b[3]\n"
"udot z11.s, z7.b, z0.b[3]\n"
"udot z15.s, z7.b, z1.b[3]\n"
- "23:" // Height 2: Multiply loop: multiply skip
- "prfm pldl1keep, [x10, #0x80]\n"
- "add x12, x12, #0x1\n"
- "prfm pldl1keep, [x28, #0x80]\n"
+ "21:" // Height 2: Multiply loop: multiply skip
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x27, x27, #0x1\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x12, x19\n"
- "bne 18b\n"
- "st1w { z8.s }, p4, [x13]\n"
- "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
- "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
- "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
- "addvl x13, x13, #4\n"
- "st1w { z12.s }, p4, [x9]\n"
- "st1w { z13.s }, p3, [x9, #1, MUL VL]\n"
- "st1w { z14.s }, p2, [x9, #2, MUL VL]\n"
- "st1w { z15.s }, p1, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
- "24:" // Height 2: Writeback done
- "decw x15, ALL, MUL #4\n"
- "cmp x15, XZR\n"
- "bgt 15b\n"
- "b 74f\n"
- "25:" // Height 3
- "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "cmp x27, x19\n"
+ "bne 16b\n"
"ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "tbz %x[flags], #2, 26f\n"
- "ldr x13, [%x[output_ptr], #0x0]\n"
- "add x13, x13, x19, LSL #2\n"
- "ldr x9, [%x[output_ptr], #0x8]\n"
- "ldr x27, [%x[output_ptr], #0x10]\n"
- "add x9, x9, x19, LSL #2\n"
- "add x27, x27, x19, LSL #2\n"
- "b 27f\n"
- "26:" // Height 3: setup direct output
- "mov x13, %x[output_ptr]\n"
- "add x9, x13, x19, LSL #2\n"
- "add x27, x9, x19, LSL #2\n"
- "27:" // Height 3: Column loop
+ "st1w { z8.s }, p4, [x28]\n"
+ "add x23, x28, x19, LSL #2\n"
+ "st1w { z9.s }, p3, [x28, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x28, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "st1w { z12.s }, p4, [x23]\n"
+ "st1w { z13.s }, p3, [x23, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x23, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x23, #3, MUL VL]\n"
+ "22:" // Height 2: Writeback done
+ "decw x10, ALL, MUL #4\n"
+ "cmp x10, XZR\n"
+ "bgt 13b\n"
+ "b 68f\n"
+ "23:" // Height 3
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "24:" // Height 3: Column loop
"mov x19, #0x0\n"
- "whilelt p4.s, x19, x15\n"
+ "whilelt p4.s, x19, x10\n"
"incw x19\n"
- "whilelt p3.s, x19, x15\n"
+ "whilelt p3.s, x19, x10\n"
"incw x19\n"
- "whilelt p2.s, x19, x15\n"
+ "whilelt p2.s, x19, x10\n"
"incw x19\n"
- "whilelt p1.s, x19, x15\n"
- "tbz %x[flags], #0, 28f\n"
- "ld1w { z8.s }, p4/Z, [x13]\n"
- "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
- "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
- "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
- "ld1w { z12.s }, p4/Z, [x9]\n"
- "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n"
- "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n"
- "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n"
- "ld1w { z16.s }, p4/Z, [x27]\n"
- "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n"
- "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n"
- "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n"
- "b 29f\n"
- "28:" // Height 3: no accumulate
+ "whilelt p1.s, x19, x10\n"
+ "tbz %x[flags], #0, 25f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ld1w { z8.s }, p4/Z, [x28]\n"
+ "add x23, x28, x19, LSL #2\n"
+ "ld1w { z9.s }, p3/Z, [x28, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x28, #2, MUL VL]\n"
+ "add x22, x23, x19, LSL #2\n"
+ "ld1w { z11.s }, p1/Z, [x28, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x23]\n"
+ "ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x22]\n"
+ "ld1w { z17.s }, p3/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "b 26f\n"
+ "25:" // Height 3: no accumulate
"mov z8.s, #0x0\n"
"mov z9.s, #0x0\n"
"mov z10.s, #0x0\n"
@@ -499,267 +478,257 @@ void sve_hybrid_u8u32_dot_6x4VL (
"mov z17.s, #0x0\n"
"mov z18.s, #0x0\n"
"mov z19.s, #0x0\n"
- "29:" // Height 3: setup done
- "mov x12, #0x0\n"
- "30:" // Height 3: String loop
+ "26:" // Height 3: setup done
+ "mov x27, #0x0\n"
+ "27:" // Height 3: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w11, [x20, x12, LSL #0x2]\n"
- "tbz %x[flags], #3, 31f\n"
- "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 28f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x10, [x20, #0x0]\n"
- "ldr x28, [x20, #0x8]\n"
- "ldr x26, [x20, #0x10]\n"
- "cbnz x12, 32f\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "cbnz x27, 29f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x10, x10, x19\n"
- "add x28, x28, x19\n"
- "add x26, x26, x19\n"
- "b 32f\n"
- "31:" // Height 3: setup direct input
- "mov x10, %x[input_ptr]\n"
- "add x28, x10, x19\n"
- "add x26, x28, x19\n"
- "32:" // Height 3: input setup done
- "cmp x11, #0x10\n"
- "ble 34f\n"
- "33:" // Height 3: Multiply loop: Main loop head
- "ld1b { z6.b }, p5/Z, [x14]\n"
- "whilelt p0.b, XZR, x11\n"
- "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
- "sub x11, x11, #0x10\n"
- "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "add x25, x25, x19\n"
+ "add x24, x24, x19\n"
+ "add x23, x23, x19\n"
+ "b 29f\n"
+ "28:" // Height 3: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19\n"
+ "add x23, x24, x19\n"
+ "29:" // Height 3: input setup done
+ "cmp x26, #0x10\n"
+ "ble 31f\n"
+ "30:" // Height 3: Multiply loop: Main loop head
+ "ld1b { z6.b }, p5/Z, [x9]\n"
+ "whilelt p0.b, XZR, x26\n"
+ "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
+ "sub x26, x26, #0x10\n"
+ "ld1rqb { z0.b }, p0/Z, [x25]\n"
"udot z8.s, z6.b, z0.b[0]\n"
- "ld1rqb { z1.b }, p0/Z, [x28]\n"
- "add x10, x10, #0x10\n"
+ "ld1rqb { z1.b }, p0/Z, [x24]\n"
+ "add x25, x25, #0x10\n"
"udot z9.s, z7.b, z0.b[0]\n"
- "ld1rqb { z2.b }, p0/Z, [x26]\n"
- "add x28, x28, #0x10\n"
+ "ld1rqb { z2.b }, p0/Z, [x23]\n"
+ "add x24, x24, #0x10\n"
"udot z12.s, z6.b, z1.b[0]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
- "add x26, x26, #0x10\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x23, x23, #0x10\n"
"udot z16.s, z6.b, z2.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
- "cmp x11, #0x10\n"
+ "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
+ "cmp x26, #0x10\n"
"udot z13.s, z7.b, z1.b[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
"udot z17.s, z7.b, z2.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
"udot z10.s, z6.b, z0.b[0]\n"
"udot z14.s, z6.b, z1.b[0]\n"
"udot z18.s, z6.b, z2.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x14, #4, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #4, MUL VL]\n"
"udot z11.s, z7.b, z0.b[0]\n"
"udot z15.s, z7.b, z1.b[0]\n"
"udot z19.s, z7.b, z2.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x14, #5, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x9, #5, MUL VL]\n"
"udot z8.s, z6.b, z0.b[1]\n"
"udot z12.s, z6.b, z1.b[1]\n"
"udot z16.s, z6.b, z2.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x14, #6, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #6, MUL VL]\n"
"udot z9.s, z7.b, z0.b[1]\n"
"udot z13.s, z7.b, z1.b[1]\n"
"udot z17.s, z7.b, z2.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x14, #7, MUL VL]\n"
- "addvl x14, x14, #16\n"
+ "ld1b { z7.b }, p5/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #16\n"
"udot z10.s, z6.b, z0.b[1]\n"
"udot z14.s, z6.b, z1.b[1]\n"
"udot z18.s, z6.b, z2.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x14, #-8, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-8, MUL VL]\n"
"udot z11.s, z7.b, z0.b[1]\n"
"udot z15.s, z7.b, z1.b[1]\n"
"udot z19.s, z7.b, z2.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x14, #-7, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-7, MUL VL]\n"
"udot z8.s, z6.b, z0.b[2]\n"
"udot z12.s, z6.b, z1.b[2]\n"
"udot z16.s, z6.b, z2.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x14, #-6, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-6, MUL VL]\n"
"udot z9.s, z7.b, z0.b[2]\n"
"udot z13.s, z7.b, z1.b[2]\n"
"udot z17.s, z7.b, z2.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x14, #-5, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-5, MUL VL]\n"
"udot z10.s, z6.b, z0.b[2]\n"
"udot z14.s, z6.b, z1.b[2]\n"
"udot z18.s, z6.b, z2.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x14, #-4, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-4, MUL VL]\n"
"udot z11.s, z7.b, z0.b[2]\n"
"udot z15.s, z7.b, z1.b[2]\n"
"udot z19.s, z7.b, z2.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x14, #-3, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-3, MUL VL]\n"
"udot z8.s, z6.b, z0.b[3]\n"
"udot z12.s, z6.b, z1.b[3]\n"
"udot z16.s, z6.b, z2.b[3]\n"
- "ld1b { z6.b }, p5/Z, [x14, #-2, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-2, MUL VL]\n"
"udot z9.s, z7.b, z0.b[3]\n"
"udot z13.s, z7.b, z1.b[3]\n"
"udot z17.s, z7.b, z2.b[3]\n"
- "ld1b { z7.b }, p5/Z, [x14, #-1, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-1, MUL VL]\n"
"udot z10.s, z6.b, z0.b[3]\n"
"udot z14.s, z6.b, z1.b[3]\n"
"udot z18.s, z6.b, z2.b[3]\n"
"udot z11.s, z7.b, z0.b[3]\n"
"udot z15.s, z7.b, z1.b[3]\n"
"udot z19.s, z7.b, z2.b[3]\n"
- "bgt 33b\n"
- "34:" // Height 3: Multiply loop: Single iteration only
- "ld1b { z6.b }, p5/Z, [x14]\n"
- "whilelt p0.b, XZR, x11\n"
- "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
- "subs x11, x11, #0x4\n"
- "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "bgt 30b\n"
+ "31:" // Height 3: Multiply loop: Single iteration only
+ "ld1b { z6.b }, p5/Z, [x9]\n"
+ "whilelt p0.b, XZR, x26\n"
+ "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
+ "subs x26, x26, #0x4\n"
+ "ld1rqb { z0.b }, p0/Z, [x25]\n"
"udot z8.s, z6.b, z0.b[0]\n"
- "ld1rqb { z1.b }, p0/Z, [x28]\n"
- "add x10, x10, #0x10\n"
+ "ld1rqb { z1.b }, p0/Z, [x24]\n"
+ "add x25, x25, #0x10\n"
"udot z9.s, z7.b, z0.b[0]\n"
- "ld1rqb { z2.b }, p0/Z, [x26]\n"
- "add x28, x28, #0x10\n"
+ "ld1rqb { z2.b }, p0/Z, [x23]\n"
+ "add x24, x24, #0x10\n"
"udot z12.s, z6.b, z1.b[0]\n"
- "add x26, x26, #0x10\n"
+ "add x23, x23, #0x10\n"
"udot z13.s, z7.b, z1.b[0]\n"
"udot z16.s, z6.b, z2.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
"udot z17.s, z7.b, z2.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
+ "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
"udot z10.s, z6.b, z0.b[0]\n"
"udot z14.s, z6.b, z1.b[0]\n"
"udot z18.s, z6.b, z2.b[0]\n"
"udot z11.s, z7.b, z0.b[0]\n"
"udot z15.s, z7.b, z1.b[0]\n"
"udot z19.s, z7.b, z2.b[0]\n"
- "ble 35f\n"
- "ld1b { z6.b }, p5/Z, [x14]\n"
+ "ble 32f\n"
+ "ld1b { z6.b }, p5/Z, [x9]\n"
"udot z8.s, z6.b, z0.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
- "subs x11, x11, #0x4\n"
+ "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
+ "subs x26, x26, #0x4\n"
"udot z12.s, z6.b, z1.b[1]\n"
"udot z16.s, z6.b, z2.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
"udot z9.s, z7.b, z0.b[1]\n"
"udot z13.s, z7.b, z1.b[1]\n"
"udot z17.s, z7.b, z2.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
+ "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
"udot z10.s, z6.b, z0.b[1]\n"
"udot z14.s, z6.b, z1.b[1]\n"
"udot z18.s, z6.b, z2.b[1]\n"
"udot z11.s, z7.b, z0.b[1]\n"
"udot z15.s, z7.b, z1.b[1]\n"
"udot z19.s, z7.b, z2.b[1]\n"
- "ble 35f\n"
- "ld1b { z6.b }, p5/Z, [x14]\n"
+ "ble 32f\n"
+ "ld1b { z6.b }, p5/Z, [x9]\n"
"udot z8.s, z6.b, z0.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
- "subs x11, x11, #0x4\n"
+ "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
+ "subs x26, x26, #0x4\n"
"udot z12.s, z6.b, z1.b[2]\n"
"udot z16.s, z6.b, z2.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
"udot z9.s, z7.b, z0.b[2]\n"
"udot z13.s, z7.b, z1.b[2]\n"
"udot z17.s, z7.b, z2.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
+ "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
"udot z10.s, z6.b, z0.b[2]\n"
"udot z14.s, z6.b, z1.b[2]\n"
"udot z18.s, z6.b, z2.b[2]\n"
"udot z11.s, z7.b, z0.b[2]\n"
"udot z15.s, z7.b, z1.b[2]\n"
"udot z19.s, z7.b, z2.b[2]\n"
- "ble 35f\n"
- "ld1b { z6.b }, p5/Z, [x14]\n"
+ "ble 32f\n"
+ "ld1b { z6.b }, p5/Z, [x9]\n"
"udot z8.s, z6.b, z0.b[3]\n"
- "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
"udot z12.s, z6.b, z1.b[3]\n"
"udot z16.s, z6.b, z2.b[3]\n"
- "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
"udot z9.s, z7.b, z0.b[3]\n"
"udot z13.s, z7.b, z1.b[3]\n"
"udot z17.s, z7.b, z2.b[3]\n"
- "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
+ "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
"udot z10.s, z6.b, z0.b[3]\n"
"udot z14.s, z6.b, z1.b[3]\n"
"udot z18.s, z6.b, z2.b[3]\n"
"udot z11.s, z7.b, z0.b[3]\n"
"udot z15.s, z7.b, z1.b[3]\n"
"udot z19.s, z7.b, z2.b[3]\n"
- "35:" // Height 3: Multiply loop: multiply skip
- "prfm pldl1keep, [x10, #0x80]\n"
- "add x12, x12, #0x1\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "32:" // Height 3: Multiply loop: multiply skip
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x27, x27, #0x1\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x12, x19\n"
- "bne 30b\n"
- "st1w { z8.s }, p4, [x13]\n"
- "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
- "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
- "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
- "addvl x13, x13, #4\n"
- "st1w { z12.s }, p4, [x9]\n"
- "st1w { z13.s }, p3, [x9, #1, MUL VL]\n"
- "st1w { z14.s }, p2, [x9, #2, MUL VL]\n"
- "st1w { z15.s }, p1, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
- "st1w { z16.s }, p4, [x27]\n"
- "st1w { z17.s }, p3, [x27, #1, MUL VL]\n"
- "st1w { z18.s }, p2, [x27, #2, MUL VL]\n"
- "st1w { z19.s }, p1, [x27, #3, MUL VL]\n"
- "addvl x27, x27, #4\n"
- "36:" // Height 3: Writeback done
- "decw x15, ALL, MUL #4\n"
- "cmp x15, XZR\n"
- "bgt 27b\n"
- "b 74f\n"
- "37:" // Height 4
- "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "cmp x27, x19\n"
+ "bne 27b\n"
"ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "tbz %x[flags], #2, 38f\n"
- "ldr x13, [%x[output_ptr], #0x0]\n"
- "add x13, x13, x19, LSL #2\n"
- "ldr x9, [%x[output_ptr], #0x8]\n"
- "ldr x27, [%x[output_ptr], #0x10]\n"
- "add x9, x9, x19, LSL #2\n"
- "ldr x25, [%x[output_ptr], #0x18]\n"
- "add x27, x27, x19, LSL #2\n"
- "add x25, x25, x19, LSL #2\n"
- "b 39f\n"
- "38:" // Height 4: setup direct output
- "mov x13, %x[output_ptr]\n"
- "add x9, x13, x19, LSL #2\n"
- "add x27, x9, x19, LSL #2\n"
- "add x25, x27, x19, LSL #2\n"
- "39:" // Height 4: Column loop
+ "st1w { z8.s }, p4, [x28]\n"
+ "add x23, x28, x19, LSL #2\n"
+ "st1w { z9.s }, p3, [x28, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x28, #2, MUL VL]\n"
+ "add x22, x23, x19, LSL #2\n"
+ "st1w { z11.s }, p1, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "st1w { z12.s }, p4, [x23]\n"
+ "st1w { z13.s }, p3, [x23, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x23, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x23, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x22]\n"
+ "st1w { z17.s }, p3, [x22, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x22, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x22, #3, MUL VL]\n"
+ "33:" // Height 3: Writeback done
+ "decw x10, ALL, MUL #4\n"
+ "cmp x10, XZR\n"
+ "bgt 24b\n"
+ "b 68f\n"
+ "34:" // Height 4
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "35:" // Height 4: Column loop
"mov x19, #0x0\n"
- "whilelt p4.s, x19, x15\n"
+ "whilelt p4.s, x19, x10\n"
"incw x19\n"
- "whilelt p3.s, x19, x15\n"
+ "whilelt p3.s, x19, x10\n"
"incw x19\n"
- "whilelt p2.s, x19, x15\n"
+ "whilelt p2.s, x19, x10\n"
"incw x19\n"
- "whilelt p1.s, x19, x15\n"
- "tbz %x[flags], #0, 40f\n"
- "ld1w { z8.s }, p4/Z, [x13]\n"
- "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
- "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
- "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
- "ld1w { z12.s }, p4/Z, [x9]\n"
- "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n"
- "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n"
- "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n"
- "ld1w { z16.s }, p4/Z, [x27]\n"
- "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n"
- "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n"
- "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n"
- "ld1w { z20.s }, p4/Z, [x25]\n"
- "ld1w { z21.s }, p3/Z, [x25, #1, MUL VL]\n"
- "ld1w { z22.s }, p2/Z, [x25, #2, MUL VL]\n"
- "ld1w { z23.s }, p1/Z, [x25, #3, MUL VL]\n"
- "b 41f\n"
- "40:" // Height 4: no accumulate
+ "whilelt p1.s, x19, x10\n"
+ "tbz %x[flags], #0, 36f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ld1w { z8.s }, p4/Z, [x28]\n"
+ "add x23, x28, x19, LSL #2\n"
+ "ld1w { z9.s }, p3/Z, [x28, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x28, #2, MUL VL]\n"
+ "add x22, x23, x19, LSL #2\n"
+ "ld1w { z11.s }, p1/Z, [x28, #3, MUL VL]\n"
+ "add x21, x22, x19, LSL #2\n"
+ "ld1w { z12.s }, p4/Z, [x23]\n"
+ "ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x22]\n"
+ "ld1w { z17.s }, p3/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x21]\n"
+ "ld1w { z21.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z23.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "b 37f\n"
+ "36:" // Height 4: no accumulate
"mov z8.s, #0x0\n"
"mov z9.s, #0x0\n"
"mov z10.s, #0x0\n"
@@ -776,123 +745,123 @@ void sve_hybrid_u8u32_dot_6x4VL (
"mov z21.s, #0x0\n"
"mov z22.s, #0x0\n"
"mov z23.s, #0x0\n"
- "41:" // Height 4: setup done
- "mov x12, #0x0\n"
- "42:" // Height 4: String loop
+ "37:" // Height 4: setup done
+ "mov x27, #0x0\n"
+ "38:" // Height 4: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w11, [x20, x12, LSL #0x2]\n"
- "tbz %x[flags], #3, 43f\n"
- "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 39f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x10, [x20, #0x0]\n"
- "ldr x28, [x20, #0x8]\n"
- "ldr x26, [x20, #0x10]\n"
- "ldr x24, [x20, #0x18]\n"
- "cbnz x12, 44f\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "ldr x22, [x20, #0x18]\n"
+ "cbnz x27, 40f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x10, x10, x19\n"
- "add x28, x28, x19\n"
- "add x26, x26, x19\n"
+ "add x25, x25, x19\n"
"add x24, x24, x19\n"
- "b 44f\n"
- "43:" // Height 4: setup direct input
- "mov x10, %x[input_ptr]\n"
- "add x28, x10, x19\n"
- "add x26, x28, x19\n"
- "add x24, x26, x19\n"
- "44:" // Height 4: input setup done
- "cmp x11, #0x10\n"
- "ble 46f\n"
- "45:" // Height 4: Multiply loop: Main loop head
- "ld1b { z6.b }, p5/Z, [x14]\n"
- "whilelt p0.b, XZR, x11\n"
- "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
- "sub x11, x11, #0x10\n"
- "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "add x23, x23, x19\n"
+ "add x22, x22, x19\n"
+ "b 40f\n"
+ "39:" // Height 4: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19\n"
+ "add x23, x24, x19\n"
+ "add x22, x23, x19\n"
+ "40:" // Height 4: input setup done
+ "cmp x26, #0x10\n"
+ "ble 42f\n"
+ "41:" // Height 4: Multiply loop: Main loop head
+ "ld1b { z6.b }, p5/Z, [x9]\n"
+ "whilelt p0.b, XZR, x26\n"
+ "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
+ "sub x26, x26, #0x10\n"
+ "ld1rqb { z0.b }, p0/Z, [x25]\n"
"udot z8.s, z6.b, z0.b[0]\n"
- "ld1rqb { z1.b }, p0/Z, [x28]\n"
- "add x10, x10, #0x10\n"
+ "ld1rqb { z1.b }, p0/Z, [x24]\n"
+ "add x25, x25, #0x10\n"
"udot z9.s, z7.b, z0.b[0]\n"
- "ld1rqb { z2.b }, p0/Z, [x26]\n"
- "add x28, x28, #0x10\n"
+ "ld1rqb { z2.b }, p0/Z, [x23]\n"
+ "add x24, x24, #0x10\n"
"udot z12.s, z6.b, z1.b[0]\n"
- "ld1rqb { z3.b }, p0/Z, [x24]\n"
- "add x26, x26, #0x10\n"
+ "ld1rqb { z3.b }, p0/Z, [x22]\n"
+ "add x23, x23, #0x10\n"
"udot z16.s, z6.b, z2.b[0]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
- "add x24, x24, #0x10\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x22, x22, #0x10\n"
"udot z13.s, z7.b, z1.b[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "cmp x11, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "cmp x26, #0x10\n"
"udot z20.s, z6.b, z3.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
"udot z17.s, z7.b, z2.b[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
"udot z21.s, z7.b, z3.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
+ "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
"udot z10.s, z6.b, z0.b[0]\n"
"udot z14.s, z6.b, z1.b[0]\n"
"udot z18.s, z6.b, z2.b[0]\n"
"udot z22.s, z6.b, z3.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x14, #4, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #4, MUL VL]\n"
"udot z11.s, z7.b, z0.b[0]\n"
"udot z15.s, z7.b, z1.b[0]\n"
"udot z19.s, z7.b, z2.b[0]\n"
"udot z23.s, z7.b, z3.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x14, #5, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x9, #5, MUL VL]\n"
"udot z8.s, z6.b, z0.b[1]\n"
"udot z12.s, z6.b, z1.b[1]\n"
"udot z16.s, z6.b, z2.b[1]\n"
"udot z20.s, z6.b, z3.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x14, #6, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #6, MUL VL]\n"
"udot z9.s, z7.b, z0.b[1]\n"
"udot z13.s, z7.b, z1.b[1]\n"
"udot z17.s, z7.b, z2.b[1]\n"
"udot z21.s, z7.b, z3.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x14, #7, MUL VL]\n"
- "addvl x14, x14, #16\n"
+ "ld1b { z7.b }, p5/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #16\n"
"udot z10.s, z6.b, z0.b[1]\n"
"udot z14.s, z6.b, z1.b[1]\n"
"udot z18.s, z6.b, z2.b[1]\n"
"udot z22.s, z6.b, z3.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x14, #-8, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-8, MUL VL]\n"
"udot z11.s, z7.b, z0.b[1]\n"
"udot z15.s, z7.b, z1.b[1]\n"
"udot z19.s, z7.b, z2.b[1]\n"
"udot z23.s, z7.b, z3.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x14, #-7, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-7, MUL VL]\n"
"udot z8.s, z6.b, z0.b[2]\n"
"udot z12.s, z6.b, z1.b[2]\n"
"udot z16.s, z6.b, z2.b[2]\n"
"udot z20.s, z6.b, z3.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x14, #-6, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-6, MUL VL]\n"
"udot z9.s, z7.b, z0.b[2]\n"
"udot z13.s, z7.b, z1.b[2]\n"
"udot z17.s, z7.b, z2.b[2]\n"
"udot z21.s, z7.b, z3.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x14, #-5, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-5, MUL VL]\n"
"udot z10.s, z6.b, z0.b[2]\n"
"udot z14.s, z6.b, z1.b[2]\n"
"udot z18.s, z6.b, z2.b[2]\n"
"udot z22.s, z6.b, z3.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x14, #-4, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-4, MUL VL]\n"
"udot z11.s, z7.b, z0.b[2]\n"
"udot z15.s, z7.b, z1.b[2]\n"
"udot z19.s, z7.b, z2.b[2]\n"
"udot z23.s, z7.b, z3.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x14, #-3, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-3, MUL VL]\n"
"udot z8.s, z6.b, z0.b[3]\n"
"udot z12.s, z6.b, z1.b[3]\n"
"udot z16.s, z6.b, z2.b[3]\n"
"udot z20.s, z6.b, z3.b[3]\n"
- "ld1b { z6.b }, p5/Z, [x14, #-2, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-2, MUL VL]\n"
"udot z9.s, z7.b, z0.b[3]\n"
"udot z13.s, z7.b, z1.b[3]\n"
"udot z17.s, z7.b, z2.b[3]\n"
"udot z21.s, z7.b, z3.b[3]\n"
- "ld1b { z7.b }, p5/Z, [x14, #-1, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-1, MUL VL]\n"
"udot z10.s, z6.b, z0.b[3]\n"
"udot z14.s, z6.b, z1.b[3]\n"
"udot z18.s, z6.b, z2.b[3]\n"
@@ -901,31 +870,31 @@ void sve_hybrid_u8u32_dot_6x4VL (
"udot z15.s, z7.b, z1.b[3]\n"
"udot z19.s, z7.b, z2.b[3]\n"
"udot z23.s, z7.b, z3.b[3]\n"
- "bgt 45b\n"
- "46:" // Height 4: Multiply loop: Single iteration only
- "ld1b { z6.b }, p5/Z, [x14]\n"
- "whilelt p0.b, XZR, x11\n"
- "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
- "subs x11, x11, #0x4\n"
- "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "bgt 41b\n"
+ "42:" // Height 4: Multiply loop: Single iteration only
+ "ld1b { z6.b }, p5/Z, [x9]\n"
+ "whilelt p0.b, XZR, x26\n"
+ "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
+ "subs x26, x26, #0x4\n"
+ "ld1rqb { z0.b }, p0/Z, [x25]\n"
"udot z8.s, z6.b, z0.b[0]\n"
- "ld1rqb { z1.b }, p0/Z, [x28]\n"
- "add x10, x10, #0x10\n"
+ "ld1rqb { z1.b }, p0/Z, [x24]\n"
+ "add x25, x25, #0x10\n"
"udot z9.s, z7.b, z0.b[0]\n"
- "ld1rqb { z2.b }, p0/Z, [x26]\n"
- "add x28, x28, #0x10\n"
+ "ld1rqb { z2.b }, p0/Z, [x23]\n"
+ "add x24, x24, #0x10\n"
"udot z12.s, z6.b, z1.b[0]\n"
- "ld1rqb { z3.b }, p0/Z, [x24]\n"
- "add x26, x26, #0x10\n"
+ "ld1rqb { z3.b }, p0/Z, [x22]\n"
+ "add x23, x23, #0x10\n"
"udot z16.s, z6.b, z2.b[0]\n"
- "add x24, x24, #0x10\n"
+ "add x22, x22, #0x10\n"
"udot z13.s, z7.b, z1.b[0]\n"
"udot z17.s, z7.b, z2.b[0]\n"
"udot z20.s, z6.b, z3.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
"udot z21.s, z7.b, z3.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
+ "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
"udot z10.s, z6.b, z0.b[0]\n"
"udot z14.s, z6.b, z1.b[0]\n"
"udot z18.s, z6.b, z2.b[0]\n"
@@ -934,21 +903,21 @@ void sve_hybrid_u8u32_dot_6x4VL (
"udot z15.s, z7.b, z1.b[0]\n"
"udot z19.s, z7.b, z2.b[0]\n"
"udot z23.s, z7.b, z3.b[0]\n"
- "ble 47f\n"
- "ld1b { z6.b }, p5/Z, [x14]\n"
+ "ble 43f\n"
+ "ld1b { z6.b }, p5/Z, [x9]\n"
"udot z8.s, z6.b, z0.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
- "subs x11, x11, #0x4\n"
+ "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
+ "subs x26, x26, #0x4\n"
"udot z12.s, z6.b, z1.b[1]\n"
"udot z16.s, z6.b, z2.b[1]\n"
"udot z20.s, z6.b, z3.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
"udot z9.s, z7.b, z0.b[1]\n"
"udot z13.s, z7.b, z1.b[1]\n"
"udot z17.s, z7.b, z2.b[1]\n"
"udot z21.s, z7.b, z3.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
+ "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
"udot z10.s, z6.b, z0.b[1]\n"
"udot z14.s, z6.b, z1.b[1]\n"
"udot z18.s, z6.b, z2.b[1]\n"
@@ -957,21 +926,21 @@ void sve_hybrid_u8u32_dot_6x4VL (
"udot z15.s, z7.b, z1.b[1]\n"
"udot z19.s, z7.b, z2.b[1]\n"
"udot z23.s, z7.b, z3.b[1]\n"
- "ble 47f\n"
- "ld1b { z6.b }, p5/Z, [x14]\n"
+ "ble 43f\n"
+ "ld1b { z6.b }, p5/Z, [x9]\n"
"udot z8.s, z6.b, z0.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
- "subs x11, x11, #0x4\n"
+ "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
+ "subs x26, x26, #0x4\n"
"udot z12.s, z6.b, z1.b[2]\n"
"udot z16.s, z6.b, z2.b[2]\n"
"udot z20.s, z6.b, z3.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
"udot z9.s, z7.b, z0.b[2]\n"
"udot z13.s, z7.b, z1.b[2]\n"
"udot z17.s, z7.b, z2.b[2]\n"
"udot z21.s, z7.b, z3.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
+ "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
"udot z10.s, z6.b, z0.b[2]\n"
"udot z14.s, z6.b, z1.b[2]\n"
"udot z18.s, z6.b, z2.b[2]\n"
@@ -980,20 +949,20 @@ void sve_hybrid_u8u32_dot_6x4VL (
"udot z15.s, z7.b, z1.b[2]\n"
"udot z19.s, z7.b, z2.b[2]\n"
"udot z23.s, z7.b, z3.b[2]\n"
- "ble 47f\n"
- "ld1b { z6.b }, p5/Z, [x14]\n"
+ "ble 43f\n"
+ "ld1b { z6.b }, p5/Z, [x9]\n"
"udot z8.s, z6.b, z0.b[3]\n"
- "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
"udot z12.s, z6.b, z1.b[3]\n"
"udot z16.s, z6.b, z2.b[3]\n"
"udot z20.s, z6.b, z3.b[3]\n"
- "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
"udot z9.s, z7.b, z0.b[3]\n"
"udot z13.s, z7.b, z1.b[3]\n"
"udot z17.s, z7.b, z2.b[3]\n"
"udot z21.s, z7.b, z3.b[3]\n"
- "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
+ "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
"udot z10.s, z6.b, z0.b[3]\n"
"udot z14.s, z6.b, z1.b[3]\n"
"udot z18.s, z6.b, z2.b[3]\n"
@@ -1002,94 +971,82 @@ void sve_hybrid_u8u32_dot_6x4VL (
"udot z15.s, z7.b, z1.b[3]\n"
"udot z19.s, z7.b, z2.b[3]\n"
"udot z23.s, z7.b, z3.b[3]\n"
- "47:" // Height 4: Multiply loop: multiply skip
- "prfm pldl1keep, [x10, #0x80]\n"
- "add x12, x12, #0x1\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "43:" // Height 4: Multiply loop: multiply skip
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x27, x27, #0x1\n"
"prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x12, x19\n"
- "bne 42b\n"
- "st1w { z8.s }, p4, [x13]\n"
- "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
- "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
- "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
- "addvl x13, x13, #4\n"
- "st1w { z12.s }, p4, [x9]\n"
- "st1w { z13.s }, p3, [x9, #1, MUL VL]\n"
- "st1w { z14.s }, p2, [x9, #2, MUL VL]\n"
- "st1w { z15.s }, p1, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
- "st1w { z16.s }, p4, [x27]\n"
- "st1w { z17.s }, p3, [x27, #1, MUL VL]\n"
- "st1w { z18.s }, p2, [x27, #2, MUL VL]\n"
- "st1w { z19.s }, p1, [x27, #3, MUL VL]\n"
- "addvl x27, x27, #4\n"
- "st1w { z20.s }, p4, [x25]\n"
- "st1w { z21.s }, p3, [x25, #1, MUL VL]\n"
- "st1w { z22.s }, p2, [x25, #2, MUL VL]\n"
- "st1w { z23.s }, p1, [x25, #3, MUL VL]\n"
- "addvl x25, x25, #4\n"
- "48:" // Height 4: Writeback done
- "decw x15, ALL, MUL #4\n"
- "cmp x15, XZR\n"
- "bgt 39b\n"
- "b 74f\n"
- "49:" // Height 5
- "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "cmp x27, x19\n"
+ "bne 38b\n"
"ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "tbz %x[flags], #2, 50f\n"
- "ldr x13, [%x[output_ptr], #0x0]\n"
- "add x13, x13, x19, LSL #2\n"
- "ldr x9, [%x[output_ptr], #0x8]\n"
- "ldr x27, [%x[output_ptr], #0x10]\n"
- "add x9, x9, x19, LSL #2\n"
- "ldr x25, [%x[output_ptr], #0x18]\n"
- "ldr x23, [%x[output_ptr], #0x20]\n"
- "add x27, x27, x19, LSL #2\n"
- "add x25, x25, x19, LSL #2\n"
- "add x23, x23, x19, LSL #2\n"
- "b 51f\n"
- "50:" // Height 5: setup direct output
- "mov x13, %x[output_ptr]\n"
- "add x9, x13, x19, LSL #2\n"
- "add x27, x9, x19, LSL #2\n"
- "add x25, x27, x19, LSL #2\n"
- "add x23, x25, x19, LSL #2\n"
- "51:" // Height 5: Column loop
+ "st1w { z8.s }, p4, [x28]\n"
+ "add x23, x28, x19, LSL #2\n"
+ "st1w { z9.s }, p3, [x28, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x28, #2, MUL VL]\n"
+ "add x22, x23, x19, LSL #2\n"
+ "st1w { z11.s }, p1, [x28, #3, MUL VL]\n"
+ "add x21, x22, x19, LSL #2\n"
+ "st1w { z12.s }, p4, [x23]\n"
+ "addvl x28, x28, #4\n"
+ "st1w { z13.s }, p3, [x23, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x23, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x23, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x22]\n"
+ "st1w { z17.s }, p3, [x22, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x22, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x22, #3, MUL VL]\n"
+ "st1w { z20.s }, p4, [x21]\n"
+ "st1w { z21.s }, p3, [x21, #1, MUL VL]\n"
+ "st1w { z22.s }, p2, [x21, #2, MUL VL]\n"
+ "st1w { z23.s }, p1, [x21, #3, MUL VL]\n"
+ "44:" // Height 4: Writeback done
+ "decw x10, ALL, MUL #4\n"
+ "cmp x10, XZR\n"
+ "bgt 35b\n"
+ "b 68f\n"
+ "45:" // Height 5
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "46:" // Height 5: Column loop
"mov x19, #0x0\n"
- "whilelt p4.s, x19, x15\n"
+ "whilelt p4.s, x19, x10\n"
"incw x19\n"
- "whilelt p3.s, x19, x15\n"
+ "whilelt p3.s, x19, x10\n"
"incw x19\n"
- "whilelt p2.s, x19, x15\n"
+ "whilelt p2.s, x19, x10\n"
"incw x19\n"
- "whilelt p1.s, x19, x15\n"
- "tbz %x[flags], #0, 52f\n"
- "ld1w { z8.s }, p4/Z, [x13]\n"
- "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
- "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
- "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
- "ld1w { z12.s }, p4/Z, [x9]\n"
- "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n"
- "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n"
- "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n"
- "ld1w { z16.s }, p4/Z, [x27]\n"
- "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n"
- "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n"
- "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n"
- "ld1w { z20.s }, p4/Z, [x25]\n"
- "ld1w { z21.s }, p3/Z, [x25, #1, MUL VL]\n"
- "ld1w { z22.s }, p2/Z, [x25, #2, MUL VL]\n"
- "ld1w { z23.s }, p1/Z, [x25, #3, MUL VL]\n"
- "ld1w { z24.s }, p4/Z, [x23]\n"
- "ld1w { z25.s }, p3/Z, [x23, #1, MUL VL]\n"
- "ld1w { z26.s }, p2/Z, [x23, #2, MUL VL]\n"
- "ld1w { z27.s }, p1/Z, [x23, #3, MUL VL]\n"
- "b 53f\n"
- "52:" // Height 5: no accumulate
+ "whilelt p1.s, x19, x10\n"
+ "tbz %x[flags], #0, 47f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ld1w { z8.s }, p4/Z, [x28]\n"
+ "add x23, x28, x19, LSL #2\n"
+ "ld1w { z9.s }, p3/Z, [x28, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x28, #2, MUL VL]\n"
+ "add x22, x23, x19, LSL #2\n"
+ "ld1w { z11.s }, p1/Z, [x28, #3, MUL VL]\n"
+ "add x21, x22, x19, LSL #2\n"
+ "ld1w { z12.s }, p4/Z, [x23]\n"
+ "add x20, x21, x19, LSL #2\n"
+ "ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x22]\n"
+ "ld1w { z17.s }, p3/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x21]\n"
+ "ld1w { z21.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z23.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z24.s }, p4/Z, [x20]\n"
+ "ld1w { z25.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z26.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z27.s }, p1/Z, [x20, #3, MUL VL]\n"
+ "b 48f\n"
+ "47:" // Height 5: no accumulate
"mov z8.s, #0x0\n"
"mov z9.s, #0x0\n"
"mov z10.s, #0x0\n"
@@ -1110,143 +1067,143 @@ void sve_hybrid_u8u32_dot_6x4VL (
"mov z25.s, #0x0\n"
"mov z26.s, #0x0\n"
"mov z27.s, #0x0\n"
- "53:" // Height 5: setup done
- "mov x12, #0x0\n"
- "54:" // Height 5: String loop
+ "48:" // Height 5: setup done
+ "mov x27, #0x0\n"
+ "49:" // Height 5: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w11, [x20, x12, LSL #0x2]\n"
- "tbz %x[flags], #3, 55f\n"
- "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 50f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x10, [x20, #0x0]\n"
- "ldr x28, [x20, #0x8]\n"
- "ldr x26, [x20, #0x10]\n"
- "ldr x24, [x20, #0x18]\n"
- "ldr x22, [x20, #0x20]\n"
- "cbnz x12, 56f\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "ldr x22, [x20, #0x18]\n"
+ "ldr x21, [x20, #0x20]\n"
+ "cbnz x27, 51f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x10, x10, x19\n"
- "add x28, x28, x19\n"
- "add x26, x26, x19\n"
+ "add x25, x25, x19\n"
"add x24, x24, x19\n"
+ "add x23, x23, x19\n"
"add x22, x22, x19\n"
- "b 56f\n"
- "55:" // Height 5: setup direct input
- "mov x10, %x[input_ptr]\n"
- "add x28, x10, x19\n"
- "add x26, x28, x19\n"
- "add x24, x26, x19\n"
- "add x22, x24, x19\n"
- "56:" // Height 5: input setup done
- "cmp x11, #0x10\n"
- "ble 58f\n"
- "57:" // Height 5: Multiply loop: Main loop head
- "ld1b { z6.b }, p5/Z, [x14]\n"
- "whilelt p0.b, XZR, x11\n"
- "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
- "sub x11, x11, #0x10\n"
- "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "add x21, x21, x19\n"
+ "b 51f\n"
+ "50:" // Height 5: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19\n"
+ "add x23, x24, x19\n"
+ "add x22, x23, x19\n"
+ "add x21, x22, x19\n"
+ "51:" // Height 5: input setup done
+ "cmp x26, #0x10\n"
+ "ble 53f\n"
+ "52:" // Height 5: Multiply loop: Main loop head
+ "ld1b { z6.b }, p5/Z, [x9]\n"
+ "whilelt p0.b, XZR, x26\n"
+ "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
+ "sub x26, x26, #0x10\n"
+ "ld1rqb { z0.b }, p0/Z, [x25]\n"
"udot z8.s, z6.b, z0.b[0]\n"
- "ld1rqb { z1.b }, p0/Z, [x28]\n"
- "add x10, x10, #0x10\n"
+ "ld1rqb { z1.b }, p0/Z, [x24]\n"
+ "add x25, x25, #0x10\n"
"udot z9.s, z7.b, z0.b[0]\n"
- "ld1rqb { z2.b }, p0/Z, [x26]\n"
- "add x28, x28, #0x10\n"
+ "ld1rqb { z2.b }, p0/Z, [x23]\n"
+ "add x24, x24, #0x10\n"
"udot z12.s, z6.b, z1.b[0]\n"
- "ld1rqb { z3.b }, p0/Z, [x24]\n"
- "add x26, x26, #0x10\n"
+ "ld1rqb { z3.b }, p0/Z, [x22]\n"
+ "add x23, x23, #0x10\n"
"udot z16.s, z6.b, z2.b[0]\n"
- "ld1rqb { z4.b }, p0/Z, [x22]\n"
- "add x24, x24, #0x10\n"
- "udot z13.s, z7.b, z1.b[0]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "ld1rqb { z4.b }, p0/Z, [x21]\n"
"add x22, x22, #0x10\n"
+ "udot z13.s, z7.b, z1.b[0]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x21, x21, #0x10\n"
"udot z20.s, z6.b, z3.b[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "cmp x11, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "cmp x26, #0x10\n"
"udot z24.s, z6.b, z4.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
"udot z17.s, z7.b, z2.b[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
"udot z21.s, z7.b, z3.b[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
"udot z25.s, z7.b, z4.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
"udot z10.s, z6.b, z0.b[0]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
"udot z14.s, z6.b, z1.b[0]\n"
"udot z18.s, z6.b, z2.b[0]\n"
"udot z22.s, z6.b, z3.b[0]\n"
"udot z26.s, z6.b, z4.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x14, #4, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #4, MUL VL]\n"
"udot z11.s, z7.b, z0.b[0]\n"
"udot z15.s, z7.b, z1.b[0]\n"
"udot z19.s, z7.b, z2.b[0]\n"
"udot z23.s, z7.b, z3.b[0]\n"
"udot z27.s, z7.b, z4.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x14, #5, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x9, #5, MUL VL]\n"
"udot z8.s, z6.b, z0.b[1]\n"
"udot z12.s, z6.b, z1.b[1]\n"
"udot z16.s, z6.b, z2.b[1]\n"
"udot z20.s, z6.b, z3.b[1]\n"
"udot z24.s, z6.b, z4.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x14, #6, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #6, MUL VL]\n"
"udot z9.s, z7.b, z0.b[1]\n"
"udot z13.s, z7.b, z1.b[1]\n"
"udot z17.s, z7.b, z2.b[1]\n"
"udot z21.s, z7.b, z3.b[1]\n"
"udot z25.s, z7.b, z4.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x14, #7, MUL VL]\n"
- "addvl x14, x14, #16\n"
+ "ld1b { z7.b }, p5/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #16\n"
"udot z10.s, z6.b, z0.b[1]\n"
"udot z14.s, z6.b, z1.b[1]\n"
"udot z18.s, z6.b, z2.b[1]\n"
"udot z22.s, z6.b, z3.b[1]\n"
"udot z26.s, z6.b, z4.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x14, #-8, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-8, MUL VL]\n"
"udot z11.s, z7.b, z0.b[1]\n"
"udot z15.s, z7.b, z1.b[1]\n"
"udot z19.s, z7.b, z2.b[1]\n"
"udot z23.s, z7.b, z3.b[1]\n"
"udot z27.s, z7.b, z4.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x14, #-7, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-7, MUL VL]\n"
"udot z8.s, z6.b, z0.b[2]\n"
"udot z12.s, z6.b, z1.b[2]\n"
"udot z16.s, z6.b, z2.b[2]\n"
"udot z20.s, z6.b, z3.b[2]\n"
"udot z24.s, z6.b, z4.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x14, #-6, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-6, MUL VL]\n"
"udot z9.s, z7.b, z0.b[2]\n"
"udot z13.s, z7.b, z1.b[2]\n"
"udot z17.s, z7.b, z2.b[2]\n"
"udot z21.s, z7.b, z3.b[2]\n"
"udot z25.s, z7.b, z4.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x14, #-5, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-5, MUL VL]\n"
"udot z10.s, z6.b, z0.b[2]\n"
"udot z14.s, z6.b, z1.b[2]\n"
"udot z18.s, z6.b, z2.b[2]\n"
"udot z22.s, z6.b, z3.b[2]\n"
"udot z26.s, z6.b, z4.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x14, #-4, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-4, MUL VL]\n"
"udot z11.s, z7.b, z0.b[2]\n"
"udot z15.s, z7.b, z1.b[2]\n"
"udot z19.s, z7.b, z2.b[2]\n"
"udot z23.s, z7.b, z3.b[2]\n"
"udot z27.s, z7.b, z4.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x14, #-3, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-3, MUL VL]\n"
"udot z8.s, z6.b, z0.b[3]\n"
"udot z12.s, z6.b, z1.b[3]\n"
"udot z16.s, z6.b, z2.b[3]\n"
"udot z20.s, z6.b, z3.b[3]\n"
"udot z24.s, z6.b, z4.b[3]\n"
- "ld1b { z6.b }, p5/Z, [x14, #-2, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-2, MUL VL]\n"
"udot z9.s, z7.b, z0.b[3]\n"
"udot z13.s, z7.b, z1.b[3]\n"
"udot z17.s, z7.b, z2.b[3]\n"
"udot z21.s, z7.b, z3.b[3]\n"
"udot z25.s, z7.b, z4.b[3]\n"
- "ld1b { z7.b }, p5/Z, [x14, #-1, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-1, MUL VL]\n"
"udot z10.s, z6.b, z0.b[3]\n"
"udot z14.s, z6.b, z1.b[3]\n"
"udot z18.s, z6.b, z2.b[3]\n"
@@ -1257,35 +1214,35 @@ void sve_hybrid_u8u32_dot_6x4VL (
"udot z19.s, z7.b, z2.b[3]\n"
"udot z23.s, z7.b, z3.b[3]\n"
"udot z27.s, z7.b, z4.b[3]\n"
- "bgt 57b\n"
- "58:" // Height 5: Multiply loop: Single iteration only
- "ld1b { z6.b }, p5/Z, [x14]\n"
- "whilelt p0.b, XZR, x11\n"
- "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
- "subs x11, x11, #0x4\n"
- "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "bgt 52b\n"
+ "53:" // Height 5: Multiply loop: Single iteration only
+ "ld1b { z6.b }, p5/Z, [x9]\n"
+ "whilelt p0.b, XZR, x26\n"
+ "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
+ "subs x26, x26, #0x4\n"
+ "ld1rqb { z0.b }, p0/Z, [x25]\n"
"udot z8.s, z6.b, z0.b[0]\n"
- "ld1rqb { z1.b }, p0/Z, [x28]\n"
- "add x10, x10, #0x10\n"
+ "ld1rqb { z1.b }, p0/Z, [x24]\n"
+ "add x25, x25, #0x10\n"
"udot z9.s, z7.b, z0.b[0]\n"
- "ld1rqb { z2.b }, p0/Z, [x26]\n"
- "add x28, x28, #0x10\n"
+ "ld1rqb { z2.b }, p0/Z, [x23]\n"
+ "add x24, x24, #0x10\n"
"udot z12.s, z6.b, z1.b[0]\n"
- "ld1rqb { z3.b }, p0/Z, [x24]\n"
- "add x26, x26, #0x10\n"
+ "ld1rqb { z3.b }, p0/Z, [x22]\n"
+ "add x23, x23, #0x10\n"
"udot z16.s, z6.b, z2.b[0]\n"
- "ld1rqb { z4.b }, p0/Z, [x22]\n"
- "add x24, x24, #0x10\n"
- "udot z13.s, z7.b, z1.b[0]\n"
+ "ld1rqb { z4.b }, p0/Z, [x21]\n"
"add x22, x22, #0x10\n"
+ "udot z13.s, z7.b, z1.b[0]\n"
+ "add x21, x21, #0x10\n"
"udot z17.s, z7.b, z2.b[0]\n"
"udot z20.s, z6.b, z3.b[0]\n"
"udot z24.s, z6.b, z4.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
"udot z21.s, z7.b, z3.b[0]\n"
"udot z25.s, z7.b, z4.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
+ "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
"udot z10.s, z6.b, z0.b[0]\n"
"udot z14.s, z6.b, z1.b[0]\n"
"udot z18.s, z6.b, z2.b[0]\n"
@@ -1296,23 +1253,23 @@ void sve_hybrid_u8u32_dot_6x4VL (
"udot z19.s, z7.b, z2.b[0]\n"
"udot z23.s, z7.b, z3.b[0]\n"
"udot z27.s, z7.b, z4.b[0]\n"
- "ble 59f\n"
- "ld1b { z6.b }, p5/Z, [x14]\n"
+ "ble 54f\n"
+ "ld1b { z6.b }, p5/Z, [x9]\n"
"udot z8.s, z6.b, z0.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
- "subs x11, x11, #0x4\n"
+ "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
+ "subs x26, x26, #0x4\n"
"udot z12.s, z6.b, z1.b[1]\n"
"udot z16.s, z6.b, z2.b[1]\n"
"udot z20.s, z6.b, z3.b[1]\n"
"udot z24.s, z6.b, z4.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
"udot z9.s, z7.b, z0.b[1]\n"
"udot z13.s, z7.b, z1.b[1]\n"
"udot z17.s, z7.b, z2.b[1]\n"
"udot z21.s, z7.b, z3.b[1]\n"
"udot z25.s, z7.b, z4.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
+ "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
"udot z10.s, z6.b, z0.b[1]\n"
"udot z14.s, z6.b, z1.b[1]\n"
"udot z18.s, z6.b, z2.b[1]\n"
@@ -1323,23 +1280,23 @@ void sve_hybrid_u8u32_dot_6x4VL (
"udot z19.s, z7.b, z2.b[1]\n"
"udot z23.s, z7.b, z3.b[1]\n"
"udot z27.s, z7.b, z4.b[1]\n"
- "ble 59f\n"
- "ld1b { z6.b }, p5/Z, [x14]\n"
+ "ble 54f\n"
+ "ld1b { z6.b }, p5/Z, [x9]\n"
"udot z8.s, z6.b, z0.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
- "subs x11, x11, #0x4\n"
+ "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
+ "subs x26, x26, #0x4\n"
"udot z12.s, z6.b, z1.b[2]\n"
"udot z16.s, z6.b, z2.b[2]\n"
"udot z20.s, z6.b, z3.b[2]\n"
"udot z24.s, z6.b, z4.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
"udot z9.s, z7.b, z0.b[2]\n"
"udot z13.s, z7.b, z1.b[2]\n"
"udot z17.s, z7.b, z2.b[2]\n"
"udot z21.s, z7.b, z3.b[2]\n"
"udot z25.s, z7.b, z4.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
+ "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
"udot z10.s, z6.b, z0.b[2]\n"
"udot z14.s, z6.b, z1.b[2]\n"
"udot z18.s, z6.b, z2.b[2]\n"
@@ -1350,22 +1307,22 @@ void sve_hybrid_u8u32_dot_6x4VL (
"udot z19.s, z7.b, z2.b[2]\n"
"udot z23.s, z7.b, z3.b[2]\n"
"udot z27.s, z7.b, z4.b[2]\n"
- "ble 59f\n"
- "ld1b { z6.b }, p5/Z, [x14]\n"
+ "ble 54f\n"
+ "ld1b { z6.b }, p5/Z, [x9]\n"
"udot z8.s, z6.b, z0.b[3]\n"
- "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
"udot z12.s, z6.b, z1.b[3]\n"
"udot z16.s, z6.b, z2.b[3]\n"
"udot z20.s, z6.b, z3.b[3]\n"
"udot z24.s, z6.b, z4.b[3]\n"
- "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
"udot z9.s, z7.b, z0.b[3]\n"
"udot z13.s, z7.b, z1.b[3]\n"
"udot z17.s, z7.b, z2.b[3]\n"
"udot z21.s, z7.b, z3.b[3]\n"
"udot z25.s, z7.b, z4.b[3]\n"
- "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
+ "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
"udot z10.s, z6.b, z0.b[3]\n"
"udot z14.s, z6.b, z1.b[3]\n"
"udot z18.s, z6.b, z2.b[3]\n"
@@ -1376,109 +1333,96 @@ void sve_hybrid_u8u32_dot_6x4VL (
"udot z19.s, z7.b, z2.b[3]\n"
"udot z23.s, z7.b, z3.b[3]\n"
"udot z27.s, z7.b, z4.b[3]\n"
- "59:" // Height 5: Multiply loop: multiply skip
- "prfm pldl1keep, [x10, #0x80]\n"
- "add x12, x12, #0x1\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "54:" // Height 5: Multiply loop: multiply skip
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x27, x27, #0x1\n"
"prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
"prfm pldl1keep, [x22, #0x80]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x12, x19\n"
- "bne 54b\n"
- "st1w { z8.s }, p4, [x13]\n"
- "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
- "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
- "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
- "addvl x13, x13, #4\n"
- "st1w { z12.s }, p4, [x9]\n"
- "st1w { z13.s }, p3, [x9, #1, MUL VL]\n"
- "st1w { z14.s }, p2, [x9, #2, MUL VL]\n"
- "st1w { z15.s }, p1, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
- "st1w { z16.s }, p4, [x27]\n"
- "st1w { z17.s }, p3, [x27, #1, MUL VL]\n"
- "st1w { z18.s }, p2, [x27, #2, MUL VL]\n"
- "st1w { z19.s }, p1, [x27, #3, MUL VL]\n"
- "addvl x27, x27, #4\n"
- "st1w { z20.s }, p4, [x25]\n"
- "st1w { z21.s }, p3, [x25, #1, MUL VL]\n"
- "st1w { z22.s }, p2, [x25, #2, MUL VL]\n"
- "st1w { z23.s }, p1, [x25, #3, MUL VL]\n"
- "addvl x25, x25, #4\n"
- "st1w { z24.s }, p4, [x23]\n"
- "st1w { z25.s }, p3, [x23, #1, MUL VL]\n"
- "st1w { z26.s }, p2, [x23, #2, MUL VL]\n"
- "st1w { z27.s }, p1, [x23, #3, MUL VL]\n"
- "addvl x23, x23, #4\n"
- "60:" // Height 5: Writeback done
- "decw x15, ALL, MUL #4\n"
- "cmp x15, XZR\n"
- "bgt 51b\n"
- "b 74f\n"
- "61:" // Height 6
- "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "cmp x27, x19\n"
+ "bne 49b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "st1w { z8.s }, p4, [x28]\n"
+ "add x23, x28, x19, LSL #2\n"
+ "st1w { z9.s }, p3, [x28, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x28, #2, MUL VL]\n"
+ "add x22, x23, x19, LSL #2\n"
+ "st1w { z11.s }, p1, [x28, #3, MUL VL]\n"
+ "add x21, x22, x19, LSL #2\n"
+ "st1w { z12.s }, p4, [x23]\n"
+ "add x20, x21, x19, LSL #2\n"
+ "st1w { z13.s }, p3, [x23, #1, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "st1w { z14.s }, p2, [x23, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x23, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x22]\n"
+ "st1w { z17.s }, p3, [x22, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x22, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x22, #3, MUL VL]\n"
+ "st1w { z20.s }, p4, [x21]\n"
+ "st1w { z21.s }, p3, [x21, #1, MUL VL]\n"
+ "st1w { z22.s }, p2, [x21, #2, MUL VL]\n"
+ "st1w { z23.s }, p1, [x21, #3, MUL VL]\n"
+ "st1w { z24.s }, p4, [x20]\n"
+ "st1w { z25.s }, p3, [x20, #1, MUL VL]\n"
+ "st1w { z26.s }, p2, [x20, #2, MUL VL]\n"
+ "st1w { z27.s }, p1, [x20, #3, MUL VL]\n"
+ "55:" // Height 5: Writeback done
+ "decw x10, ALL, MUL #4\n"
+ "cmp x10, XZR\n"
+ "bgt 46b\n"
+ "b 68f\n"
+ "56:" // Height 6
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x20, #0x18\n"
"ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
- "tbz %x[flags], #2, 62f\n"
- "ldr x13, [%x[output_ptr], #0x0]\n"
- "add x13, x13, x19, LSL #2\n"
- "ldr x9, [%x[output_ptr], #0x8]\n"
- "ldr x27, [%x[output_ptr], #0x10]\n"
- "add x9, x9, x19, LSL #2\n"
- "ldr x25, [%x[output_ptr], #0x18]\n"
- "ldr x23, [%x[output_ptr], #0x20]\n"
- "add x27, x27, x19, LSL #2\n"
- "ldr x21, [%x[output_ptr], #0x28]\n"
- "add %x[output_ptr], %x[output_ptr], #0x30\n"
- "add x25, x25, x19, LSL #2\n"
- "add x23, x23, x19, LSL #2\n"
- "add x21, x21, x19, LSL #2\n"
- "b 63f\n"
- "62:" // Height 6: setup direct output
- "mov x13, %x[output_ptr]\n"
- "add x9, x13, x19, LSL #2\n"
- "add x27, x9, x19, LSL #2\n"
- "add x25, x27, x19, LSL #2\n"
- "add x23, x25, x19, LSL #2\n"
- "add x21, x23, x19, LSL #2\n"
- "add %x[output_ptr], x21, x19, LSL #2\n"
- "63:" // Height 6: Column loop
+ "madd %x[output_ptr], x19, x20, %x[output_ptr]\n"
+ "57:" // Height 6: Column loop
"mov x19, #0x0\n"
- "whilelt p4.s, x19, x15\n"
+ "whilelt p4.s, x19, x10\n"
"incw x19\n"
- "whilelt p3.s, x19, x15\n"
+ "whilelt p3.s, x19, x10\n"
"incw x19\n"
- "whilelt p2.s, x19, x15\n"
+ "whilelt p2.s, x19, x10\n"
"incw x19\n"
- "whilelt p1.s, x19, x15\n"
- "tbz %x[flags], #0, 64f\n"
- "ld1w { z8.s }, p4/Z, [x13]\n"
- "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
- "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
- "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
- "ld1w { z12.s }, p4/Z, [x9]\n"
- "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n"
- "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n"
- "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n"
- "ld1w { z16.s }, p4/Z, [x27]\n"
- "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n"
- "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n"
- "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n"
- "ld1w { z20.s }, p4/Z, [x25]\n"
- "ld1w { z21.s }, p3/Z, [x25, #1, MUL VL]\n"
- "ld1w { z22.s }, p2/Z, [x25, #2, MUL VL]\n"
- "ld1w { z23.s }, p1/Z, [x25, #3, MUL VL]\n"
- "ld1w { z24.s }, p4/Z, [x23]\n"
- "ld1w { z25.s }, p3/Z, [x23, #1, MUL VL]\n"
- "ld1w { z26.s }, p2/Z, [x23, #2, MUL VL]\n"
- "ld1w { z27.s }, p1/Z, [x23, #3, MUL VL]\n"
- "ld1w { z28.s }, p4/Z, [x21]\n"
- "ld1w { z29.s }, p3/Z, [x21, #1, MUL VL]\n"
- "ld1w { z30.s }, p2/Z, [x21, #2, MUL VL]\n"
- "ld1w { z31.s }, p1/Z, [x21, #3, MUL VL]\n"
- "b 65f\n"
- "64:" // Height 6: no accumulate
+ "whilelt p1.s, x19, x10\n"
+ "tbz %x[flags], #0, 58f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ld1w { z8.s }, p4/Z, [x28]\n"
+ "add x23, x28, x19, LSL #2\n"
+ "ld1w { z9.s }, p3/Z, [x28, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x28, #2, MUL VL]\n"
+ "add x22, x23, x19, LSL #2\n"
+ "ld1w { z11.s }, p1/Z, [x28, #3, MUL VL]\n"
+ "add x21, x22, x19, LSL #2\n"
+ "ld1w { z12.s }, p4/Z, [x23]\n"
+ "add x20, x21, x19, LSL #2\n"
+ "ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n"
+ "add x19, x20, x19, LSL #2\n"
+ "ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x22]\n"
+ "ld1w { z17.s }, p3/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x21]\n"
+ "ld1w { z21.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z23.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z24.s }, p4/Z, [x20]\n"
+ "ld1w { z25.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z26.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z27.s }, p1/Z, [x20, #3, MUL VL]\n"
+ "ld1w { z28.s }, p4/Z, [x19]\n"
+ "ld1w { z29.s }, p3/Z, [x19, #1, MUL VL]\n"
+ "ld1w { z30.s }, p2/Z, [x19, #2, MUL VL]\n"
+ "ld1w { z31.s }, p1/Z, [x19, #3, MUL VL]\n"
+ "b 59f\n"
+ "58:" // Height 6: no accumulate
"mov z8.s, #0x0\n"
"mov z9.s, #0x0\n"
"mov z10.s, #0x0\n"
@@ -1503,77 +1447,77 @@ void sve_hybrid_u8u32_dot_6x4VL (
"mov z29.s, #0x0\n"
"mov z30.s, #0x0\n"
"mov z31.s, #0x0\n"
- "65:" // Height 6: setup done
- "mov x12, #0x0\n"
- "66:" // Height 6: String loop
+ "59:" // Height 6: setup done
+ "mov x27, #0x0\n"
+ "60:" // Height 6: String loop
"ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "ldr w11, [x20, x12, LSL #0x2]\n"
- "tbz %x[flags], #3, 67f\n"
- "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 61f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
"add x20, x20, x19, LSL #3\n"
- "ldr x10, [x20, #0x0]\n"
- "ldr x28, [x20, #0x8]\n"
- "ldr x26, [x20, #0x10]\n"
- "ldr x24, [x20, #0x18]\n"
- "ldr x22, [x20, #0x20]\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "ldr x22, [x20, #0x18]\n"
+ "ldr x21, [x20, #0x20]\n"
"ldr x20, [x20, #0x28]\n"
- "cbnz x12, 68f\n"
+ "cbnz x27, 62f\n"
"ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
- "add x10, x10, x19\n"
- "add x28, x28, x19\n"
- "add x26, x26, x19\n"
+ "add x25, x25, x19\n"
"add x24, x24, x19\n"
+ "add x23, x23, x19\n"
"add x22, x22, x19\n"
+ "add x21, x21, x19\n"
"add x20, x20, x19\n"
- "b 68f\n"
- "67:" // Height 6: setup direct input
- "mov x10, %x[input_ptr]\n"
- "add x28, x10, x19\n"
- "add x26, x28, x19\n"
- "add x24, x26, x19\n"
- "add x22, x24, x19\n"
- "add x20, x22, x19\n"
- "68:" // Height 6: input setup done
- "cmp x11, #0x10\n"
- "ble 70f\n"
- "69:" // Height 6: Multiply loop: Main loop head
- "ld1b { z6.b }, p5/Z, [x14]\n"
- "whilelt p0.b, XZR, x11\n"
- "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
- "sub x11, x11, #0x10\n"
- "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "b 62f\n"
+ "61:" // Height 6: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19\n"
+ "add x23, x24, x19\n"
+ "add x22, x23, x19\n"
+ "add x21, x22, x19\n"
+ "add x20, x21, x19\n"
+ "62:" // Height 6: input setup done
+ "cmp x26, #0x10\n"
+ "ble 64f\n"
+ "63:" // Height 6: Multiply loop: Main loop head
+ "ld1b { z6.b }, p5/Z, [x9]\n"
+ "whilelt p0.b, XZR, x26\n"
+ "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
+ "sub x26, x26, #0x10\n"
+ "ld1rqb { z0.b }, p0/Z, [x25]\n"
"udot z8.s, z6.b, z0.b[0]\n"
- "ld1rqb { z1.b }, p0/Z, [x28]\n"
- "add x10, x10, #0x10\n"
+ "ld1rqb { z1.b }, p0/Z, [x24]\n"
+ "add x25, x25, #0x10\n"
"udot z9.s, z7.b, z0.b[0]\n"
- "ld1rqb { z2.b }, p0/Z, [x26]\n"
- "add x28, x28, #0x10\n"
+ "ld1rqb { z2.b }, p0/Z, [x23]\n"
+ "add x24, x24, #0x10\n"
"udot z12.s, z6.b, z1.b[0]\n"
- "ld1rqb { z3.b }, p0/Z, [x24]\n"
- "add x26, x26, #0x10\n"
+ "ld1rqb { z3.b }, p0/Z, [x22]\n"
+ "add x23, x23, #0x10\n"
"udot z16.s, z6.b, z2.b[0]\n"
- "ld1rqb { z4.b }, p0/Z, [x22]\n"
- "add x24, x24, #0x10\n"
+ "ld1rqb { z4.b }, p0/Z, [x21]\n"
+ "add x22, x22, #0x10\n"
"udot z13.s, z7.b, z1.b[0]\n"
"ld1rqb { z5.b }, p0/Z, [x20]\n"
- "add x22, x22, #0x10\n"
+ "add x21, x21, #0x10\n"
"udot z20.s, z6.b, z3.b[0]\n"
- "prfm pldl1keep, [x10, #0x80]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
"add x20, x20, #0x10\n"
"udot z24.s, z6.b, z4.b[0]\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "cmp x11, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "cmp x26, #0x10\n"
"udot z28.s, z6.b, z5.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
"udot z17.s, z7.b, z2.b[0]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
"udot z21.s, z7.b, z3.b[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "udot z25.s, z7.b, z4.b[0]\n"
"prfm pldl1keep, [x22, #0x80]\n"
+ "udot z25.s, z7.b, z4.b[0]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
"udot z29.s, z7.b, z5.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
"udot z10.s, z6.b, z0.b[0]\n"
"prfm pldl1keep, [x20, #0x80]\n"
"udot z14.s, z6.b, z1.b[0]\n"
@@ -1581,85 +1525,85 @@ void sve_hybrid_u8u32_dot_6x4VL (
"udot z22.s, z6.b, z3.b[0]\n"
"udot z26.s, z6.b, z4.b[0]\n"
"udot z30.s, z6.b, z5.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x14, #4, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #4, MUL VL]\n"
"udot z11.s, z7.b, z0.b[0]\n"
"udot z15.s, z7.b, z1.b[0]\n"
"udot z19.s, z7.b, z2.b[0]\n"
"udot z23.s, z7.b, z3.b[0]\n"
"udot z27.s, z7.b, z4.b[0]\n"
"udot z31.s, z7.b, z5.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x14, #5, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x9, #5, MUL VL]\n"
"udot z8.s, z6.b, z0.b[1]\n"
"udot z12.s, z6.b, z1.b[1]\n"
"udot z16.s, z6.b, z2.b[1]\n"
"udot z20.s, z6.b, z3.b[1]\n"
"udot z24.s, z6.b, z4.b[1]\n"
"udot z28.s, z6.b, z5.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x14, #6, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #6, MUL VL]\n"
"udot z9.s, z7.b, z0.b[1]\n"
"udot z13.s, z7.b, z1.b[1]\n"
"udot z17.s, z7.b, z2.b[1]\n"
"udot z21.s, z7.b, z3.b[1]\n"
"udot z25.s, z7.b, z4.b[1]\n"
"udot z29.s, z7.b, z5.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x14, #7, MUL VL]\n"
- "addvl x14, x14, #16\n"
+ "ld1b { z7.b }, p5/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #16\n"
"udot z10.s, z6.b, z0.b[1]\n"
"udot z14.s, z6.b, z1.b[1]\n"
"udot z18.s, z6.b, z2.b[1]\n"
"udot z22.s, z6.b, z3.b[1]\n"
"udot z26.s, z6.b, z4.b[1]\n"
"udot z30.s, z6.b, z5.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x14, #-8, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-8, MUL VL]\n"
"udot z11.s, z7.b, z0.b[1]\n"
"udot z15.s, z7.b, z1.b[1]\n"
"udot z19.s, z7.b, z2.b[1]\n"
"udot z23.s, z7.b, z3.b[1]\n"
"udot z27.s, z7.b, z4.b[1]\n"
"udot z31.s, z7.b, z5.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x14, #-7, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-7, MUL VL]\n"
"udot z8.s, z6.b, z0.b[2]\n"
"udot z12.s, z6.b, z1.b[2]\n"
"udot z16.s, z6.b, z2.b[2]\n"
"udot z20.s, z6.b, z3.b[2]\n"
"udot z24.s, z6.b, z4.b[2]\n"
"udot z28.s, z6.b, z5.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x14, #-6, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-6, MUL VL]\n"
"udot z9.s, z7.b, z0.b[2]\n"
"udot z13.s, z7.b, z1.b[2]\n"
"udot z17.s, z7.b, z2.b[2]\n"
"udot z21.s, z7.b, z3.b[2]\n"
"udot z25.s, z7.b, z4.b[2]\n"
"udot z29.s, z7.b, z5.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x14, #-5, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-5, MUL VL]\n"
"udot z10.s, z6.b, z0.b[2]\n"
"udot z14.s, z6.b, z1.b[2]\n"
"udot z18.s, z6.b, z2.b[2]\n"
"udot z22.s, z6.b, z3.b[2]\n"
"udot z26.s, z6.b, z4.b[2]\n"
"udot z30.s, z6.b, z5.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x14, #-4, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-4, MUL VL]\n"
"udot z11.s, z7.b, z0.b[2]\n"
"udot z15.s, z7.b, z1.b[2]\n"
"udot z19.s, z7.b, z2.b[2]\n"
"udot z23.s, z7.b, z3.b[2]\n"
"udot z27.s, z7.b, z4.b[2]\n"
"udot z31.s, z7.b, z5.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x14, #-3, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-3, MUL VL]\n"
"udot z8.s, z6.b, z0.b[3]\n"
"udot z12.s, z6.b, z1.b[3]\n"
"udot z16.s, z6.b, z2.b[3]\n"
"udot z20.s, z6.b, z3.b[3]\n"
"udot z24.s, z6.b, z4.b[3]\n"
"udot z28.s, z6.b, z5.b[3]\n"
- "ld1b { z6.b }, p5/Z, [x14, #-2, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-2, MUL VL]\n"
"udot z9.s, z7.b, z0.b[3]\n"
"udot z13.s, z7.b, z1.b[3]\n"
"udot z17.s, z7.b, z2.b[3]\n"
"udot z21.s, z7.b, z3.b[3]\n"
"udot z25.s, z7.b, z4.b[3]\n"
"udot z29.s, z7.b, z5.b[3]\n"
- "ld1b { z7.b }, p5/Z, [x14, #-1, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-1, MUL VL]\n"
"udot z10.s, z6.b, z0.b[3]\n"
"udot z14.s, z6.b, z1.b[3]\n"
"udot z18.s, z6.b, z2.b[3]\n"
@@ -1672,39 +1616,39 @@ void sve_hybrid_u8u32_dot_6x4VL (
"udot z23.s, z7.b, z3.b[3]\n"
"udot z27.s, z7.b, z4.b[3]\n"
"udot z31.s, z7.b, z5.b[3]\n"
- "bgt 69b\n"
- "70:" // Height 6: Multiply loop: Single iteration only
- "ld1b { z6.b }, p5/Z, [x14]\n"
- "whilelt p0.b, XZR, x11\n"
- "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
- "subs x11, x11, #0x4\n"
- "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "bgt 63b\n"
+ "64:" // Height 6: Multiply loop: Single iteration only
+ "ld1b { z6.b }, p5/Z, [x9]\n"
+ "whilelt p0.b, XZR, x26\n"
+ "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
+ "subs x26, x26, #0x4\n"
+ "ld1rqb { z0.b }, p0/Z, [x25]\n"
"udot z8.s, z6.b, z0.b[0]\n"
- "ld1rqb { z1.b }, p0/Z, [x28]\n"
- "add x10, x10, #0x10\n"
+ "ld1rqb { z1.b }, p0/Z, [x24]\n"
+ "add x25, x25, #0x10\n"
"udot z9.s, z7.b, z0.b[0]\n"
- "ld1rqb { z2.b }, p0/Z, [x26]\n"
- "add x28, x28, #0x10\n"
+ "ld1rqb { z2.b }, p0/Z, [x23]\n"
+ "add x24, x24, #0x10\n"
"udot z12.s, z6.b, z1.b[0]\n"
- "ld1rqb { z3.b }, p0/Z, [x24]\n"
- "add x26, x26, #0x10\n"
+ "ld1rqb { z3.b }, p0/Z, [x22]\n"
+ "add x23, x23, #0x10\n"
"udot z16.s, z6.b, z2.b[0]\n"
- "ld1rqb { z4.b }, p0/Z, [x22]\n"
- "add x24, x24, #0x10\n"
+ "ld1rqb { z4.b }, p0/Z, [x21]\n"
+ "add x22, x22, #0x10\n"
"udot z13.s, z7.b, z1.b[0]\n"
"ld1rqb { z5.b }, p0/Z, [x20]\n"
- "add x22, x22, #0x10\n"
+ "add x21, x21, #0x10\n"
"udot z20.s, z6.b, z3.b[0]\n"
"add x20, x20, #0x10\n"
"udot z17.s, z7.b, z2.b[0]\n"
"udot z24.s, z6.b, z4.b[0]\n"
"udot z28.s, z6.b, z5.b[0]\n"
- "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
"udot z21.s, z7.b, z3.b[0]\n"
"udot z25.s, z7.b, z4.b[0]\n"
"udot z29.s, z7.b, z5.b[0]\n"
- "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
+ "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
"udot z10.s, z6.b, z0.b[0]\n"
"udot z14.s, z6.b, z1.b[0]\n"
"udot z18.s, z6.b, z2.b[0]\n"
@@ -1717,25 +1661,25 @@ void sve_hybrid_u8u32_dot_6x4VL (
"udot z23.s, z7.b, z3.b[0]\n"
"udot z27.s, z7.b, z4.b[0]\n"
"udot z31.s, z7.b, z5.b[0]\n"
- "ble 71f\n"
- "ld1b { z6.b }, p5/Z, [x14]\n"
+ "ble 65f\n"
+ "ld1b { z6.b }, p5/Z, [x9]\n"
"udot z8.s, z6.b, z0.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
- "subs x11, x11, #0x4\n"
+ "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
+ "subs x26, x26, #0x4\n"
"udot z12.s, z6.b, z1.b[1]\n"
"udot z16.s, z6.b, z2.b[1]\n"
"udot z20.s, z6.b, z3.b[1]\n"
"udot z24.s, z6.b, z4.b[1]\n"
"udot z28.s, z6.b, z5.b[1]\n"
- "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
"udot z9.s, z7.b, z0.b[1]\n"
"udot z13.s, z7.b, z1.b[1]\n"
"udot z17.s, z7.b, z2.b[1]\n"
"udot z21.s, z7.b, z3.b[1]\n"
"udot z25.s, z7.b, z4.b[1]\n"
"udot z29.s, z7.b, z5.b[1]\n"
- "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
+ "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
"udot z10.s, z6.b, z0.b[1]\n"
"udot z14.s, z6.b, z1.b[1]\n"
"udot z18.s, z6.b, z2.b[1]\n"
@@ -1748,25 +1692,25 @@ void sve_hybrid_u8u32_dot_6x4VL (
"udot z23.s, z7.b, z3.b[1]\n"
"udot z27.s, z7.b, z4.b[1]\n"
"udot z31.s, z7.b, z5.b[1]\n"
- "ble 71f\n"
- "ld1b { z6.b }, p5/Z, [x14]\n"
+ "ble 65f\n"
+ "ld1b { z6.b }, p5/Z, [x9]\n"
"udot z8.s, z6.b, z0.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
- "subs x11, x11, #0x4\n"
+ "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
+ "subs x26, x26, #0x4\n"
"udot z12.s, z6.b, z1.b[2]\n"
"udot z16.s, z6.b, z2.b[2]\n"
"udot z20.s, z6.b, z3.b[2]\n"
"udot z24.s, z6.b, z4.b[2]\n"
"udot z28.s, z6.b, z5.b[2]\n"
- "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
"udot z9.s, z7.b, z0.b[2]\n"
"udot z13.s, z7.b, z1.b[2]\n"
"udot z17.s, z7.b, z2.b[2]\n"
"udot z21.s, z7.b, z3.b[2]\n"
"udot z25.s, z7.b, z4.b[2]\n"
"udot z29.s, z7.b, z5.b[2]\n"
- "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
+ "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
"udot z10.s, z6.b, z0.b[2]\n"
"udot z14.s, z6.b, z1.b[2]\n"
"udot z18.s, z6.b, z2.b[2]\n"
@@ -1779,24 +1723,24 @@ void sve_hybrid_u8u32_dot_6x4VL (
"udot z23.s, z7.b, z3.b[2]\n"
"udot z27.s, z7.b, z4.b[2]\n"
"udot z31.s, z7.b, z5.b[2]\n"
- "ble 71f\n"
- "ld1b { z6.b }, p5/Z, [x14]\n"
+ "ble 65f\n"
+ "ld1b { z6.b }, p5/Z, [x9]\n"
"udot z8.s, z6.b, z0.b[3]\n"
- "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x9, #1, MUL VL]\n"
"udot z12.s, z6.b, z1.b[3]\n"
"udot z16.s, z6.b, z2.b[3]\n"
"udot z20.s, z6.b, z3.b[3]\n"
"udot z24.s, z6.b, z4.b[3]\n"
"udot z28.s, z6.b, z5.b[3]\n"
- "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
"udot z9.s, z7.b, z0.b[3]\n"
"udot z13.s, z7.b, z1.b[3]\n"
"udot z17.s, z7.b, z2.b[3]\n"
"udot z21.s, z7.b, z3.b[3]\n"
"udot z25.s, z7.b, z4.b[3]\n"
"udot z29.s, z7.b, z5.b[3]\n"
- "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
- "addvl x14, x14, #4\n"
+ "ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
"udot z10.s, z6.b, z0.b[3]\n"
"udot z14.s, z6.b, z1.b[3]\n"
"udot z18.s, z6.b, z2.b[3]\n"
@@ -1809,67 +1753,68 @@ void sve_hybrid_u8u32_dot_6x4VL (
"udot z23.s, z7.b, z3.b[3]\n"
"udot z27.s, z7.b, z4.b[3]\n"
"udot z31.s, z7.b, z5.b[3]\n"
- "71:" // Height 6: Multiply loop: multiply skip
- "prfm pldl1keep, [x10, #0x80]\n"
- "add x12, x12, #0x1\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
+ "65:" // Height 6: Multiply loop: multiply skip
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x27, x27, #0x1\n"
"prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
"prfm pldl1keep, [x22, #0x80]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
"prfm pldl1keep, [x20, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
- "cmp x12, x19\n"
- "bne 66b\n"
- "st1w { z8.s }, p4, [x13]\n"
- "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
- "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
- "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
- "addvl x13, x13, #4\n"
- "st1w { z12.s }, p4, [x9]\n"
- "st1w { z13.s }, p3, [x9, #1, MUL VL]\n"
- "st1w { z14.s }, p2, [x9, #2, MUL VL]\n"
- "st1w { z15.s }, p1, [x9, #3, MUL VL]\n"
- "addvl x9, x9, #4\n"
- "st1w { z16.s }, p4, [x27]\n"
- "st1w { z17.s }, p3, [x27, #1, MUL VL]\n"
- "st1w { z18.s }, p2, [x27, #2, MUL VL]\n"
- "st1w { z19.s }, p1, [x27, #3, MUL VL]\n"
- "addvl x27, x27, #4\n"
- "st1w { z20.s }, p4, [x25]\n"
- "st1w { z21.s }, p3, [x25, #1, MUL VL]\n"
- "st1w { z22.s }, p2, [x25, #2, MUL VL]\n"
- "st1w { z23.s }, p1, [x25, #3, MUL VL]\n"
- "addvl x25, x25, #4\n"
- "st1w { z24.s }, p4, [x23]\n"
- "st1w { z25.s }, p3, [x23, #1, MUL VL]\n"
- "st1w { z26.s }, p2, [x23, #2, MUL VL]\n"
- "st1w { z27.s }, p1, [x23, #3, MUL VL]\n"
- "addvl x23, x23, #4\n"
- "st1w { z28.s }, p4, [x21]\n"
- "st1w { z29.s }, p3, [x21, #1, MUL VL]\n"
- "st1w { z30.s }, p2, [x21, #2, MUL VL]\n"
- "st1w { z31.s }, p1, [x21, #3, MUL VL]\n"
- "addvl x21, x21, #4\n"
- "72:" // Height 6: Writeback done
- "decw x15, ALL, MUL #4\n"
- "cmp x15, XZR\n"
- "bgt 63b\n"
+ "cmp x27, x19\n"
+ "bne 60b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "st1w { z8.s }, p4, [x28]\n"
+ "add x23, x28, x19, LSL #2\n"
+ "st1w { z9.s }, p3, [x28, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x28, #2, MUL VL]\n"
+ "add x22, x23, x19, LSL #2\n"
+ "st1w { z11.s }, p1, [x28, #3, MUL VL]\n"
+ "add x21, x22, x19, LSL #2\n"
+ "st1w { z12.s }, p4, [x23]\n"
+ "add x20, x21, x19, LSL #2\n"
+ "st1w { z13.s }, p3, [x23, #1, MUL VL]\n"
+ "add x19, x20, x19, LSL #2\n"
+ "st1w { z14.s }, p2, [x23, #2, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "st1w { z15.s }, p1, [x23, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x22]\n"
+ "st1w { z17.s }, p3, [x22, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x22, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x22, #3, MUL VL]\n"
+ "st1w { z20.s }, p4, [x21]\n"
+ "st1w { z21.s }, p3, [x21, #1, MUL VL]\n"
+ "st1w { z22.s }, p2, [x21, #2, MUL VL]\n"
+ "st1w { z23.s }, p1, [x21, #3, MUL VL]\n"
+ "st1w { z24.s }, p4, [x20]\n"
+ "st1w { z25.s }, p3, [x20, #1, MUL VL]\n"
+ "st1w { z26.s }, p2, [x20, #2, MUL VL]\n"
+ "st1w { z27.s }, p1, [x20, #3, MUL VL]\n"
+ "st1w { z28.s }, p4, [x19]\n"
+ "st1w { z29.s }, p3, [x19, #1, MUL VL]\n"
+ "st1w { z30.s }, p2, [x19, #2, MUL VL]\n"
+ "st1w { z31.s }, p1, [x19, #3, MUL VL]\n"
+ "66:" // Height 6: Writeback done
+ "decw x10, ALL, MUL #4\n"
+ "cmp x10, XZR\n"
+ "bgt 57b\n"
"subs %x[M], %x[M], #0x6\n"
- "beq 74f\n"
+ "beq 68f\n"
"ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
- "tbz %x[flags], #3, 73f\n"
+ "tbz %x[flags], #3, 67f\n"
"add x20, x20, #0x6\n"
"str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
"b 1b\n"
- "73:" // Update direct input
+ "67:" // Update direct input
"mov x19, #0x6\n"
"madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
"b 1b\n"
- "74:" // Exit
+ "68:" // Exit
: [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
: [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/merges/a32_merge_float_8x6.hpp b/src/core/NEON/kernels/arm_gemm/merges/a32_merge_float_8x6.hpp
index 8fdd2c920d..bea455ca67 100644
--- a/src/core/NEON/kernels/arm_gemm/merges/a32_merge_float_8x6.hpp
+++ b/src/core/NEON/kernels/arm_gemm/merges/a32_merge_float_8x6.hpp
@@ -22,9 +22,6 @@
* SOFTWARE.
*/
#pragma once
-#if (defined(__GNUC__) && (__GNUC__ >= 7))
-#pragma GCC diagnostic ignored "-Wimplicit-fallthrough"
-#endif
#ifdef __arm__