aboutsummaryrefslogtreecommitdiff
path: root/src/core/NEON/kernels/arm_gemm/kernels
diff options
context:
space:
mode:
author Georgios Pinitas <georgios.pinitas@arm.com> 2019-01-09 18:35:17 +0000
committer Georgios Pinitas <georgios.pinitas@arm.com> 2019-01-18 13:41:40 +0000
commit 7cd26d4a1b14bc4bf7c61496803416ab3d84791f (patch)
tree 12cc4a27d7ecebc69a43e96b1f46c7eb05437978 /src/core/NEON/kernels/arm_gemm/kernels
parent 3ac2f3a1d9297220d1b0ce920dd13fdd4edcc187 (diff)
download ComputeLibrary-7cd26d4a1b14bc4bf7c61496803416ab3d84791f.tar.gz
COMPMID-1867: Add NEON/SVE GEMM Hybrid kernels.
Change-Id: Ib40a9921e7f9a6a8be6c38872d6b3a0f24ed0cd3 Reviewed-on: https://review.mlplatform.org/515 Reviewed-by: Anthony Barbier <Anthony.barbier@arm.com> Tested-by: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src/core/NEON/kernels/arm_gemm/kernels')
-rw-r--r-- src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/generic.cpp | 6
-rw-r--r-- src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_nativeA_pretransposeB_16x4.hpp | 78
-rw-r--r-- src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_nativeA_pretransposeB_16x4/generic.cpp | 970
-rw-r--r-- src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4.hpp | 74
-rw-r--r-- src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4/generic.cpp | 2005
-rw-r--r-- src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8.hpp | 4
-rw-r--r-- src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8/generic.cpp | 48
-rw-r--r-- src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8.hpp | 4
-rw-r--r-- src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8/generic.cpp | 46
-rw-r--r-- src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8.hpp | 4
-rw-r--r-- src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8/generic.cpp | 46
-rw-r--r-- src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8.hpp | 4
-rw-r--r-- src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4.hpp | 73
-rw-r--r-- src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4/generic.cpp | 2066
-rw-r--r-- src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4.hpp | 73
-rw-r--r-- src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4/generic.cpp | 4632
-rw-r--r-- src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4.hpp | 74
-rw-r--r-- src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4/generic.cpp | 4632
-rw-r--r-- src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_fp32_mla_1VLx4.hpp | 73
-rw-r--r-- src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_fp32_mla_1VLx4/generic.cpp | 4264
-rw-r--r-- src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx4.hpp | 73
-rw-r--r-- src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx4/generic.cpp | 4004
22 files changed, 23163 insertions, 90 deletions
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/generic.cpp
index 418a375a61..4ad38cbf62 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -32,9 +32,9 @@
// Kernel implementation.
//
// Assume that "Apanel" points to a chunk of A blocks (each size 8xK) in read-order.
-// Assume that "Bpanel" points to a chunk of B blocks (each size 12xK) in read-order.
+// Assume that "Bpanel" points to a chunk of B blocks (each size 24xK) in read-order.
// Assume that "Cpanel" points to a chunk of C output blocks (each size
-// 12x8), the chunks being arranged in a row major fashion.
+// 24x8), the chunks being arranged in a row major fashion.
//
// Note that the intent of this is that either ablocks or bblocks will be 1
// - this construction allows the output loop to proceed in either order.
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_nativeA_pretransposeB_16x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_nativeA_pretransposeB_16x4.hpp
new file mode 100644
index 0000000000..0c387ff6df
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_nativeA_pretransposeB_16x4.hpp
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+namespace arm_gemm {
+
+// Actual kernel implementation. Arguments, in order: A, lda, B_panel, C, ldc, beta, numrows, numcols, K.
+void a64_sgemm_nativeA_pretransposeB_16x4(const float *, int, const float *, float *, int, float, unsigned int, unsigned int, unsigned int);
+
+// Native A/Pretranspose B SGEMM "strategy" class.
+//
+// Describes a family of kernels in terms of the B buffer layout constants
+// (B_interleave/B_block/B_transpose) and the output block size (16 wide, 4 high).
+//
+// All kernels in the family must share these characteristics. The kernel
+// could be chosen at runtime from the CPUInfo structure, but the constructor
+// below ignores it, so 'kernel' always stays at the generic implementation.
+class sgemm_nativeA_pretransposeB_16x4 {
+public:
+ typedef float operand_type;
+ typedef float result_type;
+ // Kernel entry point type; same parameter list as a64_sgemm_nativeA_pretransposeB_16x4.
+ typedef void (*kern_type)(const float *, int, const float *, float *, int, float, unsigned int, unsigned int, unsigned int);
+
+ /* Desired data layout for B buffer (used for pretranspose) */
+ static const int B_interleave = 16;
+ static const int B_block = 1;
+ static const bool B_transpose = true;
+
+ /* Kernel blocking parameters: output width 16, output height 4, K unroll 1 */
+ static int out_width() {
+ return 16;
+ }
+
+ static int out_height() {
+ return 4;
+ }
+
+ static int k_unroll() {
+ return 1;
+ }
+ // NOTE(review): the 4 and 16 arguments presumably mirror out_height()/out_width() - confirm against StdTransformsFixed.
+ StdTransformsFixed<operand_type, result_type, 4, 16> transforms = {};
+
+ // Default to the generic kernel (nothing in this class reassigns it)
+ kern_type kernel=a64_sgemm_nativeA_pretransposeB_16x4;
+
+ sgemm_nativeA_pretransposeB_16x4(const CPUInfo *ci) {
+ // Empty body: 'ci' is unused, so no CPU-specific kernel is selected here.
+ }
+};
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_nativeA_pretransposeB_16x4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_nativeA_pretransposeB_16x4/generic.cpp
new file mode 100644
index 0000000000..b2516f8797
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_nativeA_pretransposeB_16x4/generic.cpp
@@ -0,0 +1,970 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+#include <algorithm>
+#include <cstddef>
+#include <cstring>
+
+#include <arm_neon.h>
+
+namespace arm_gemm {
+
+void a64_sgemm_nativeA_pretransposeB_16x4(const float *A, int lda, const float *B_panel, float *C, int ldc, float beta, unsigned int numrows, unsigned int numcols, unsigned int K) {
+ const bool oddk = ((K % 8) >= 4);
+ const bool beta0 = (beta == 0.0f);
+ const unsigned int oddones = (K % 4);
+
+ /* Use some small temporary arrays to cope with "ragged" M/N sizes.
+ *
+ * "dummy_A_buf" is used to avoid overreading the A input for ragged M,
+ * and also for output if N is not ragged.
+ *
+ * Since the B input is pretransposed it will be padded as needed, so no
+ * need to worry about overreading that.
+ *
+ * "C_buf" is used to avoid overreading or overwriting the output for
+ * ragged N cases.
+ */
+ float dummy_A_buf[16];
+ float C_buf[64];
+
+ std::memset(dummy_A_buf, 0, sizeof(dummy_A_buf));
+ std::memset(C_buf, 0, sizeof(C_buf));
+
+ for (unsigned int y=0; y<numrows; y+=4) {
+ const float *b_ptr = B_panel;
+ const unsigned int active_rows = std::min(numrows - y, 4U);
+
+ /* Increment values to be used to advance A pointers - these get set
+ * to zero when the corresponding row isn't being used due to ragged
+ * M, so it will just read the dummy buffer repeatedly. Values are
+ * in bytes (8x sizeof(float)). */
+ const unsigned long a_incr1 = (active_rows > 1) ? 32 : 0;
+ const unsigned long a_incr2 = (active_rows > 2) ? 32 : 0;
+ const unsigned long a_incr3 = (active_rows > 3) ? 32 : 0;
+
+ /* Starting points for A pointers on this loop */
+ const float * const a_ptr0_base = A + (y * lda);
+ const float * const a_ptr1_base = (active_rows > 1) ? (a_ptr0_base + lda) : dummy_A_buf;
+ const float * const a_ptr2_base = (active_rows > 2) ? (a_ptr1_base + lda) : dummy_A_buf;
+ const float * const a_ptr3_base = (active_rows > 3) ? (a_ptr2_base + lda) : dummy_A_buf;
+
+ /* Starting points for C pointers on this loop */
+ float *c_ptr0 = C + (y * ldc);
+ float *c_ptr1 = (active_rows > 1) ? (c_ptr0 + ldc) : dummy_A_buf;
+ float *c_ptr2 = (active_rows > 2) ? (c_ptr1 + ldc) : dummy_A_buf;
+ float *c_ptr3 = (active_rows > 3) ? (c_ptr2 + ldc) : dummy_A_buf;
+
+ for (unsigned int x0=0; x0<numcols; x0+=16) {
+ const unsigned int active_cols = std::min(numcols - x0, 16U);
+ const bool use_result_buf = (active_cols < 16);
+
+ /* Reset the A pointers for this loop. */
+ const float *a_ptr0 = a_ptr0_base;
+ const float *a_ptr1 = a_ptr1_base;
+ const float *a_ptr2 = a_ptr2_base;
+ const float *a_ptr3 = a_ptr3_base;
+
+ /* Override C pointers if the result buffer is in use. */
+ if (use_result_buf) {
+ c_ptr0 = C_buf;
+ c_ptr1 = C_buf + 16;
+ c_ptr2 = C_buf + 32;
+ c_ptr3 = C_buf + 48;
+
+ /* If beta is non-zero, prepopulate the result buffer */
+ if (!beta0) {
+ for (unsigned int row=0; row<active_rows; row++) {
+ for (unsigned int col=0; col<active_cols; col++) {
+ C_buf[row * 16 + col] = C[((y + row) * ldc) + (x0 + col)];
+ }
+ }
+ }
+ }
+
+ unsigned int loops = ((K+4)/8) - 1;
+ unsigned int odds = oddones;
+
+ __asm __volatile (
+ "a0 .req v0\n"
+ "a1 .req v1\n"
+ "a2 .req v2\n"
+ "a3 .req v3\n"
+ "a0a .req v4\n"
+ "a1a .req v5\n"
+ "a2a .req v6\n"
+ "a3a .req v7\n"
+ "bb0 .req v8\n"
+ "bb1 .req v9\n"
+ "bb2 .req v10\n"
+ "bb3 .req v11\n"
+ "b0a .req v12\n"
+ "b1a .req v13\n"
+ "b2a .req v14\n"
+ "b3a .req v15\n"
+
+ "a0q .req q0\n"
+ "a1q .req q1\n"
+ "a2q .req q2\n"
+ "a3q .req q3\n"
+ "a0aq .req q4\n"
+ "a1aq .req q5\n"
+ "a2aq .req q6\n"
+ "a3aq .req q7\n"
+ "b0q .req q8\n"
+ "b1q .req q9\n"
+ "b2q .req q10\n"
+ "b3q .req q11\n"
+ "b0aq .req q12\n"
+ "b1aq .req q13\n"
+ "b2aq .req q14\n"
+ "b3aq .req q15\n"
+
+ "movi v16.4s, #0x0\n"
+ "ldr a0q, [%[a_ptr0]]\n"
+ "movi v17.4s, #0x0\n"
+ "ldr b0q, [%[b_ptr]]\n"
+ "movi v18.4s, #0x0\n"
+ "ldr b1q, [%[b_ptr], #16]\n"
+ "movi v19.4s, #0x0\n"
+ "ldr b2q, [%[b_ptr], #32]\n"
+ "movi v20.4s, #0x0\n"
+ "ldr b3q, [%[b_ptr], #48]\n"
+ "add %[b_ptr], %[b_ptr], #64\n"
+ "movi v21.4s, #0x0\n"
+ "ldr a1q, [%[a_ptr1]]\n"
+ "movi v22.4s, #0x0\n"
+ "ldr a2q, [%[a_ptr2]]\n"
+ "movi v23.4s, #0x0\n"
+ "ldr a3q, [%[a_ptr3]]\n"
+ "movi v24.4s, #0x0\n"
+ "ldr b0aq, [%[b_ptr]]\n"
+ "movi v25.4s, #0x0\n"
+ "ldr b1aq, [%[b_ptr], #16]\n"
+ "movi v26.4s, #0x0\n"
+ "ldr b2aq, [%[b_ptr], #32]\n"
+ "cbz %w[beta0], 5f\n"
+ "movi v27.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #0x40]")
+ "movi v28.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #0x80]")
+ "movi v29.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #0xC0]")
+ "movi v30.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #0x100]")
+ "movi v31.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #0x140]")
+ ASM_PREFETCH("[%[b_ptr], #0x180]")
+ ASM_PREFETCH("[%[b_ptr], #0x1C0]")
+ ASM_PREFETCH("[%[b_ptr], #0x200]")
+
+ // Skip if no complete loops.
+ "cbz %w[loops], 4f\n"
+ "b 1f\n"
+
+ // If beta is non-zero, need to load and multiply by beta
+ "5:\n"
+ "ld1r {v4.4s}, [%[betaptr]]\n"
+ "ldr q16, [%[c_ptr0]]\n"
+ "ldr q17, [%[c_ptr0], #16]\n"
+ "ldr q18, [%[c_ptr0], #32]\n"
+ "ldr q19, [%[c_ptr0], #48]\n"
+
+ "ldr q20, [%[c_ptr1]]\n"
+ "fmul v16.4s, v16.4s, v4.4s\n"
+ "ldr q21, [%[c_ptr1], #16]\n"
+ "fmul v17.4s, v17.4s, v4.4s\n"
+ "ldr q22, [%[c_ptr1], #32]\n"
+ "fmul v18.4s, v18.4s, v4.4s\n"
+ "ldr q23, [%[c_ptr1], #48]\n"
+ "fmul v19.4s, v19.4s, v4.4s\n"
+
+ "ldr q24, [%[c_ptr2]]\n"
+ "fmul v20.4s, v20.4s, v4.4s\n"
+ "ldr q25, [%[c_ptr2], #16]\n"
+ "fmul v21.4s, v21.4s, v4.4s\n"
+ "ldr q26, [%[c_ptr2], #32]\n"
+ "fmul v22.4s, v22.4s, v4.4s\n"
+ "ldr q27, [%[c_ptr2], #48]\n"
+ "fmul v23.4s, v23.4s, v4.4s\n"
+
+ "ldr q28, [%[c_ptr3]]\n"
+ "fmul v24.4s, v24.4s, v4.4s\n"
+ ASM_PREFETCH("[%[b_ptr], #0x40]")
+ "ldr q29, [%[c_ptr3], #16]\n"
+ "fmul v25.4s, v25.4s, v4.4s\n"
+ ASM_PREFETCH("[%[b_ptr], #0x80]")
+ "ldr q30, [%[c_ptr3], #32]\n"
+ "fmul v26.4s, v26.4s, v4.4s\n"
+ ASM_PREFETCH("[%[b_ptr], #0xC0]")
+ "ldr q31, [%[c_ptr3], #48]\n"
+ "fmul v27.4s, v27.4s, v4.4s\n"
+ ASM_PREFETCH("[%[b_ptr], #0x100]")
+
+ "fmul v28.4s, v28.4s, v4.4s\n"
+ ASM_PREFETCH("[%[b_ptr], #0x140]")
+ "fmul v29.4s, v29.4s, v4.4s\n"
+ ASM_PREFETCH("[%[b_ptr], #0x180]")
+ "fmul v30.4s, v30.4s, v4.4s\n"
+ ASM_PREFETCH("[%[b_ptr], #0x1C0]")
+ "fmul v31.4s, v31.4s, v4.4s\n"
+ ASM_PREFETCH("[%[b_ptr], #0x200]")
+
+ "cbz %w[loops], 4f\n"
+
+ "1:\n"
+ // Unroll 0
+ "fmla v16.4s, bb0.4s, a0.s[0]\n"
+ ASM_PREFETCH("[%[b_ptr], #0x240]")
+ "fmla v20.4s, bb0.4s, a1.s[0]\n"
+ "ldr b3aq, [%[b_ptr], #48]\n"
+ "fmla v24.4s, bb0.4s, a2.s[0]\n"
+ "fmla v28.4s, bb0.4s, a3.s[0]\n"
+ "ldr b0q, [%[b_ptr], #64]\n"
+
+ "fmla v17.4s, bb1.4s, a0.s[0]\n"
+ "fmla v21.4s, bb1.4s, a1.s[0]\n"
+ "ldr a0aq, [%[a_ptr0], #16]\n"
+ "fmla v25.4s, bb1.4s, a2.s[0]\n"
+ "fmla v29.4s, bb1.4s, a3.s[0]\n"
+ "ldr b1q, [%[b_ptr], #80]\n"
+
+ "fmla v18.4s, bb2.4s, a0.s[0]\n"
+ "fmla v22.4s, bb2.4s, a1.s[0]\n"
+ "ldr a1aq, [%[a_ptr1], #16]\n"
+ "fmla v26.4s, bb2.4s, a2.s[0]\n"
+ "fmla v30.4s, bb2.4s, a3.s[0]\n"
+ "ldr b2q, [%[b_ptr], #96]\n"
+
+ "fmla v19.4s, bb3.4s, a0.s[0]\n"
+ "fmla v23.4s, bb3.4s, a1.s[0]\n"
+ "ldr a2aq, [%[a_ptr2], #16]\n"
+ "fmla v27.4s, bb3.4s, a2.s[0]\n"
+ "fmla v31.4s, bb3.4s, a3.s[0]\n"
+ "ldr b3q, [%[b_ptr], #112]\n"
+
+ // Unroll 1
+ "fmla v16.4s, b0a.4s, a0.s[1]\n"
+ ASM_PREFETCH("[%[b_ptr], #0x280]")
+ "fmla v20.4s, b0a.4s, a1.s[1]\n"
+ "ldr a3aq, [%[a_ptr3], #16]\n"
+ "fmla v24.4s, b0a.4s, a2.s[1]\n"
+ "fmla v28.4s, b0a.4s, a3.s[1]\n"
+ "ldr b0aq, [%[b_ptr], #128]\n"
+
+ "fmla v17.4s, b1a.4s, a0.s[1]\n"
+ "fmla v21.4s, b1a.4s, a1.s[1]\n"
+ "subs %w[loops], %w[loops], #1\n"
+ "fmla v25.4s, b1a.4s, a2.s[1]\n"
+ "fmla v29.4s, b1a.4s, a3.s[1]\n"
+ "ldr b1aq, [%[b_ptr], #144]\n"
+
+ "fmla v18.4s, b2a.4s, a0.s[1]\n"
+ "fmla v22.4s, b2a.4s, a1.s[1]\n"
+ "fmla v26.4s, b2a.4s, a2.s[1]\n"
+ "fmla v30.4s, b2a.4s, a3.s[1]\n"
+ "ldr b2aq, [%[b_ptr], #160]\n"
+
+ "fmla v19.4s, b3a.4s, a0.s[1]\n"
+ "fmla v23.4s, b3a.4s, a1.s[1]\n"
+ "fmla v27.4s, b3a.4s, a2.s[1]\n"
+ "fmla v31.4s, b3a.4s, a3.s[1]\n"
+ "ldr b3aq, [%[b_ptr], #176]\n"
+
+ // Unroll 2
+ "fmla v16.4s, bb0.4s, a0.s[2]\n"
+ ASM_PREFETCH("[%[b_ptr], #0x2C0]")
+ "fmla v20.4s, bb0.4s, a1.s[2]\n"
+ "fmla v24.4s, bb0.4s, a2.s[2]\n"
+ "fmla v28.4s, bb0.4s, a3.s[2]\n"
+ "ldr b0q, [%[b_ptr], #192]\n"
+
+ "fmla v17.4s, bb1.4s, a0.s[2]\n"
+ "add %[a_ptr0], %[a_ptr0], #32\n"
+ "fmla v21.4s, bb1.4s, a1.s[2]\n"
+ "add %[a_ptr1], %[a_ptr1], %[a_incr1]\n"
+ "fmla v25.4s, bb1.4s, a2.s[2]\n"
+ "add %[a_ptr2], %[a_ptr2], %[a_incr2]\n"
+ "fmla v29.4s, bb1.4s, a3.s[2]\n"
+ "ldr b1q, [%[b_ptr], #208]\n"
+
+ "fmla v18.4s, bb2.4s, a0.s[2]\n"
+ "add %[a_ptr3], %[a_ptr3], %[a_incr3]\n"
+ "fmla v22.4s, bb2.4s, a1.s[2]\n"
+ ASM_PREFETCH("[%[a_ptr0], #0x40]")
+ "fmla v26.4s, bb2.4s, a2.s[2]\n"
+ "fmla v30.4s, bb2.4s, a3.s[2]\n"
+ "ldr b2q, [%[b_ptr], #224]\n"
+
+ "fmla v19.4s, bb3.4s, a0.s[2]\n"
+ "fmla v23.4s, bb3.4s, a1.s[2]\n"
+ ASM_PREFETCH("[%[a_ptr1], #0x40]")
+ "fmla v27.4s, bb3.4s, a2.s[2]\n"
+ "fmla v31.4s, bb3.4s, a3.s[2]\n"
+ "ldr b3q, [%[b_ptr], #240]\n"
+
+ // Unroll 3
+ "fmla v16.4s, b0a.4s, a0.s[3]\n"
+ "fmla v20.4s, b0a.4s, a1.s[3]\n"
+ "add %[b_ptr], %[b_ptr], #512\n"
+ "fmla v24.4s, b0a.4s, a2.s[3]\n"
+ "fmla v28.4s, b0a.4s, a3.s[3]\n"
+ "ldr b0aq, [%[b_ptr], #-256]\n"
+
+ "fmla v17.4s, b1a.4s, a0.s[3]\n"
+ ASM_PREFETCH("[%[b_ptr], #0x100]")
+ "fmla v21.4s, b1a.4s, a1.s[3]\n"
+ "fmla v25.4s, b1a.4s, a2.s[3]\n"
+ "fmla v29.4s, b1a.4s, a3.s[3]\n"
+ "ldr b1aq, [%[b_ptr], #-240]\n"
+
+ "fmla v18.4s, b2a.4s, a0.s[3]\n"
+ "fmla v22.4s, b2a.4s, a1.s[3]\n"
+ ASM_PREFETCH("[%[a_ptr2], #0x40]")
+ "fmla v26.4s, b2a.4s, a2.s[3]\n"
+ "fmla v30.4s, b2a.4s, a3.s[3]\n"
+ "ldr b2aq, [%[b_ptr], #-224]\n"
+
+ "fmla v19.4s, b3a.4s, a0.s[3]\n"
+ "fmla v23.4s, b3a.4s, a1.s[3]\n"
+ "ldr a0q, [%[a_ptr0]]\n"
+ "fmla v27.4s, b3a.4s, a2.s[3]\n"
+ "fmla v31.4s, b3a.4s, a3.s[3]\n"
+ "ldr b3aq, [%[b_ptr], #-208]\n"
+
+ // Unroll 4
+ "fmla v16.4s, bb0.4s, a0a.s[0]\n"
+ "fmla v20.4s, bb0.4s, a1a.s[0]\n"
+ ASM_PREFETCH("[%[b_ptr], #0x140]")
+ "fmla v24.4s, bb0.4s, a2a.s[0]\n"
+ "fmla v28.4s, bb0.4s, a3a.s[0]\n"
+ "ldr b0q, [%[b_ptr], #-192]\n"
+
+ "fmla v17.4s, bb1.4s, a0a.s[0]\n"
+ "fmla v21.4s, bb1.4s, a1a.s[0]\n"
+ "ldr a1q, [%[a_ptr1]]\n"
+ "fmla v25.4s, bb1.4s, a2a.s[0]\n"
+ "fmla v29.4s, bb1.4s, a3a.s[0]\n"
+ "ldr b1q, [%[b_ptr], #-176]\n"
+
+ "fmla v18.4s, bb2.4s, a0a.s[0]\n"
+ "fmla v22.4s, bb2.4s, a1a.s[0]\n"
+ "ldr a2q, [%[a_ptr2]]\n"
+ "fmla v26.4s, bb2.4s, a2a.s[0]\n"
+ "fmla v30.4s, bb2.4s, a3a.s[0]\n"
+ "ldr b2q, [%[b_ptr], #-160]\n"
+
+ "fmla v19.4s, bb3.4s, a0a.s[0]\n"
+ "fmla v23.4s, bb3.4s, a1a.s[0]\n"
+ "ldr a3q, [%[a_ptr3]]\n"
+ "fmla v27.4s, bb3.4s, a2a.s[0]\n"
+ "fmla v31.4s, bb3.4s, a3a.s[0]\n"
+ "ldr b3q, [%[b_ptr], #-144]\n"
+
+ // Unroll 5
+ "fmla v16.4s, b0a.4s, a0a.s[1]\n"
+ "fmla v20.4s, b0a.4s, a1a.s[1]\n"
+ ASM_PREFETCH("[%[b_ptr], #0x180]")
+ "fmla v24.4s, b0a.4s, a2a.s[1]\n"
+ "fmla v28.4s, b0a.4s, a3a.s[1]\n"
+ "ldr b0aq, [%[b_ptr], #-128]\n"
+
+ "fmla v17.4s, b1a.4s, a0a.s[1]\n"
+ "fmla v21.4s, b1a.4s, a1a.s[1]\n"
+ ASM_PREFETCH("[%[a_ptr3], #0x40]")
+ "fmla v25.4s, b1a.4s, a2a.s[1]\n"
+ "fmla v29.4s, b1a.4s, a3a.s[1]\n"
+ "ldr b1aq, [%[b_ptr], #-112]\n"
+
+ "fmla v18.4s, b2a.4s, a0a.s[1]\n"
+ "fmla v22.4s, b2a.4s, a1a.s[1]\n"
+ "fmla v26.4s, b2a.4s, a2a.s[1]\n"
+ "fmla v30.4s, b2a.4s, a3a.s[1]\n"
+ "ldr b2aq, [%[b_ptr], #-96]\n"
+
+ "fmla v19.4s, b3a.4s, a0a.s[1]\n"
+ "fmla v23.4s, b3a.4s, a1a.s[1]\n"
+ "fmla v27.4s, b3a.4s, a2a.s[1]\n"
+ "fmla v31.4s, b3a.4s, a3a.s[1]\n"
+ "ldr b3aq, [%[b_ptr], #-80]\n"
+
+ // Unroll 6
+ "fmla v16.4s, bb0.4s, a0a.s[2]\n"
+ "fmla v20.4s, bb0.4s, a1a.s[2]\n"
+ ASM_PREFETCH("[%[b_ptr], #0x1C0]")
+ "fmla v24.4s, bb0.4s, a2a.s[2]\n"
+ "fmla v28.4s, bb0.4s, a3a.s[2]\n"
+ "ldr b0q, [%[b_ptr], #-64]\n"
+
+ "fmla v17.4s, bb1.4s, a0a.s[2]\n"
+ "fmla v21.4s, bb1.4s, a1a.s[2]\n"
+ "fmla v25.4s, bb1.4s, a2a.s[2]\n"
+ "fmla v29.4s, bb1.4s, a3a.s[2]\n"
+ "ldr b1q, [%[b_ptr], #-48]\n"
+
+ "fmla v18.4s, bb2.4s, a0a.s[2]\n"
+ "fmla v22.4s, bb2.4s, a1a.s[2]\n"
+ "fmla v26.4s, bb2.4s, a2a.s[2]\n"
+ "fmla v30.4s, bb2.4s, a3a.s[2]\n"
+ "ldr b2q, [%[b_ptr], #-32]\n"
+
+ "fmla v19.4s, bb3.4s, a0a.s[2]\n"
+ "fmla v23.4s, bb3.4s, a1a.s[2]\n"
+ "fmla v27.4s, bb3.4s, a2a.s[2]\n"
+ "fmla v31.4s, bb3.4s, a3a.s[2]\n"
+ "ldr b3q, [%[b_ptr], #-16]\n"
+
+ // Unroll 7
+ "fmla v16.4s, b0a.4s, a0a.s[3]\n"
+ "fmla v20.4s, b0a.4s, a1a.s[3]\n"
+ "fmla v24.4s, b0a.4s, a2a.s[3]\n"
+ "fmla v28.4s, b0a.4s, a3a.s[3]\n"
+ "ldr b0aq, [%[b_ptr]]\n"
+
+ "fmla v17.4s, b1a.4s, a0a.s[3]\n"
+ "fmla v21.4s, b1a.4s, a1a.s[3]\n"
+ ASM_PREFETCH("[%[b_ptr], #0x200]")
+ "fmla v25.4s, b1a.4s, a2a.s[3]\n"
+ "fmla v29.4s, b1a.4s, a3a.s[3]\n"
+ "ldr b1aq, [%[b_ptr], #16]\n"
+
+ "fmla v18.4s, b2a.4s, a0a.s[3]\n"
+ "fmla v22.4s, b2a.4s, a1a.s[3]\n"
+ "fmla v26.4s, b2a.4s, a2a.s[3]\n"
+ "fmla v30.4s, b2a.4s, a3a.s[3]\n"
+ "ldr b2aq, [%[b_ptr], #32]\n"
+
+ "fmla v19.4s, b3a.4s, a0a.s[3]\n"
+ "fmla v23.4s, b3a.4s, a1a.s[3]\n"
+ "fmla v27.4s, b3a.4s, a2a.s[3]\n"
+ "fmla v31.4s, b3a.4s, a3a.s[3]\n"
+ "bne 1b\n"
+
+ // Skip to here
+ "4:\n"
+
+ // Detached final iteration
+ // Unroll 0
+ "fmla v16.4s, bb0.4s, a0.s[0]\n"
+ "fmla v20.4s, bb0.4s, a1.s[0]\n"
+ "ldr b3aq, [%[b_ptr], #48]\n"
+ "fmla v24.4s, bb0.4s, a2.s[0]\n"
+ "add %[b_ptr], %[b_ptr], #64\n"
+ "fmla v28.4s, bb0.4s, a3.s[0]\n"
+ "ldr b0q, [%[b_ptr]]\n"
+
+ "fmla v17.4s, bb1.4s, a0.s[0]\n"
+ "cbnz %w[oddk], 2f\n" // Deal with odd K before we load a0a
+ "fmla v21.4s, bb1.4s, a1.s[0]\n"
+ "ldr a0aq, [%[a_ptr0], #16]\n"
+ "fmla v25.4s, bb1.4s, a2.s[0]\n"
+ "fmla v29.4s, bb1.4s, a3.s[0]\n"
+ "ldr b1q, [%[b_ptr], #16]\n"
+
+ "fmla v18.4s, bb2.4s, a0.s[0]\n"
+ "fmla v22.4s, bb2.4s, a1.s[0]\n"
+ "ldr a1aq, [%[a_ptr1], #16]\n"
+ "fmla v26.4s, bb2.4s, a2.s[0]\n"
+ "fmla v30.4s, bb2.4s, a3.s[0]\n"
+ "ldr b2q, [%[b_ptr], #32]\n"
+
+ "fmla v19.4s, bb3.4s, a0.s[0]\n"
+ "fmla v23.4s, bb3.4s, a1.s[0]\n"
+ "ldr a2aq, [%[a_ptr2], #16]\n"
+ "fmla v27.4s, bb3.4s, a2.s[0]\n"
+ "fmla v31.4s, bb3.4s, a3.s[0]\n"
+ "ldr b3q, [%[b_ptr], #48]\n"
+
+ // Unroll 1
+ "fmla v16.4s, b0a.4s, a0.s[1]\n"
+ "add %[b_ptr], %[b_ptr], #64\n"
+ "fmla v20.4s, b0a.4s, a1.s[1]\n"
+ "ldr a3aq, [%[a_ptr3], #16]\n"
+ "fmla v24.4s, b0a.4s, a2.s[1]\n"
+ "fmla v28.4s, b0a.4s, a3.s[1]\n"
+ "ldr b0aq, [%[b_ptr]]\n"
+
+ "fmla v17.4s, b1a.4s, a0.s[1]\n"
+ "add %[a_ptr0], %[a_ptr0], #32\n"
+ "fmla v21.4s, b1a.4s, a1.s[1]\n"
+ "add %[a_ptr1], %[a_ptr1], %[a_incr1]\n"
+ "fmla v25.4s, b1a.4s, a2.s[1]\n"
+ "add %[a_ptr2], %[a_ptr2], %[a_incr2]\n"
+ "fmla v29.4s, b1a.4s, a3.s[1]\n"
+ "ldr b1aq, [%[b_ptr], #16]\n"
+
+ "fmla v18.4s, b2a.4s, a0.s[1]\n"
+ "fmla v22.4s, b2a.4s, a1.s[1]\n"
+ "add %[a_ptr3], %[a_ptr3], %[a_incr3]\n"
+ "fmla v26.4s, b2a.4s, a2.s[1]\n"
+ "fmla v30.4s, b2a.4s, a3.s[1]\n"
+ "ldr b2aq, [%[b_ptr], #32]\n"
+
+ "fmla v19.4s, b3a.4s, a0.s[1]\n"
+ "fmla v23.4s, b3a.4s, a1.s[1]\n"
+ "fmla v27.4s, b3a.4s, a2.s[1]\n"
+ "fmla v31.4s, b3a.4s, a3.s[1]\n"
+ "ldr b3aq, [%[b_ptr], #48]\n"
+
+ // Unroll 2
+ "fmla v16.4s, bb0.4s, a0.s[2]\n"
+ "fmla v20.4s, bb0.4s, a1.s[2]\n"
+ "add %[b_ptr], %[b_ptr], #64\n"
+ "fmla v24.4s, bb0.4s, a2.s[2]\n"
+ "fmla v28.4s, bb0.4s, a3.s[2]\n"
+ "ldr b0q, [%[b_ptr]]\n"
+
+ "fmla v17.4s, bb1.4s, a0.s[2]\n"
+ "fmla v21.4s, bb1.4s, a1.s[2]\n"
+ "fmla v25.4s, bb1.4s, a2.s[2]\n"
+ "fmla v29.4s, bb1.4s, a3.s[2]\n"
+ "ldr b1q, [%[b_ptr], #16]\n"
+
+ "fmla v18.4s, bb2.4s, a0.s[2]\n"
+ "fmla v22.4s, bb2.4s, a1.s[2]\n"
+ "fmla v26.4s, bb2.4s, a2.s[2]\n"
+ "fmla v30.4s, bb2.4s, a3.s[2]\n"
+ "ldr b2q, [%[b_ptr], #32]\n"
+
+ "fmla v19.4s, bb3.4s, a0.s[2]\n"
+ "fmla v23.4s, bb3.4s, a1.s[2]\n"
+ "fmla v27.4s, bb3.4s, a2.s[2]\n"
+ "fmla v31.4s, bb3.4s, a3.s[2]\n"
+ "ldr b3q, [%[b_ptr], #48]\n"
+
+ // Unroll 3
+ "fmla v16.4s, b0a.4s, a0.s[3]\n"
+ "fmla v20.4s, b0a.4s, a1.s[3]\n"
+ "add %[b_ptr], %[b_ptr], #64\n"
+ "fmla v24.4s, b0a.4s, a2.s[3]\n"
+ "fmla v28.4s, b0a.4s, a3.s[3]\n"
+ "ldr b0aq, [%[b_ptr]]\n"
+
+ "fmla v17.4s, b1a.4s, a0.s[3]\n"
+ "fmla v21.4s, b1a.4s, a1.s[3]\n"
+ "fmla v25.4s, b1a.4s, a2.s[3]\n"
+ "fmla v29.4s, b1a.4s, a3.s[3]\n"
+ "ldr b1aq, [%[b_ptr], #16]\n"
+
+ "fmla v18.4s, b2a.4s, a0.s[3]\n"
+ "fmla v22.4s, b2a.4s, a1.s[3]\n"
+ "fmla v26.4s, b2a.4s, a2.s[3]\n"
+ "fmla v30.4s, b2a.4s, a3.s[3]\n"
+ "ldr b2aq, [%[b_ptr], #32]\n"
+
+ "fmla v19.4s, b3a.4s, a0.s[3]\n"
+ "fmla v23.4s, b3a.4s, a1.s[3]\n"
+ "fmla v27.4s, b3a.4s, a2.s[3]\n"
+ "fmla v31.4s, b3a.4s, a3.s[3]\n"
+ "ldr b3aq, [%[b_ptr], #48]\n"
+
+ // Unroll 4
+ "fmla v16.4s, bb0.4s, a0a.s[0]\n"
+ "fmla v20.4s, bb0.4s, a1a.s[0]\n"
+ "add %[b_ptr], %[b_ptr], #64\n"
+ "fmla v24.4s, bb0.4s, a2a.s[0]\n"
+ "fmla v28.4s, bb0.4s, a3a.s[0]\n"
+ "ldr b0q, [%[b_ptr]]\n"
+
+ "fmla v17.4s, bb1.4s, a0a.s[0]\n"
+ "fmla v21.4s, bb1.4s, a1a.s[0]\n"
+ "fmla v25.4s, bb1.4s, a2a.s[0]\n"
+ "fmla v29.4s, bb1.4s, a3a.s[0]\n"
+ "ldr b1q, [%[b_ptr], #16]\n"
+
+ "fmla v18.4s, bb2.4s, a0a.s[0]\n"
+ "fmla v22.4s, bb2.4s, a1a.s[0]\n"
+ "fmla v26.4s, bb2.4s, a2a.s[0]\n"
+ "fmla v30.4s, bb2.4s, a3a.s[0]\n"
+ "ldr b2q, [%[b_ptr], #32]\n"
+
+ "fmla v19.4s, bb3.4s, a0a.s[0]\n"
+ "fmla v23.4s, bb3.4s, a1a.s[0]\n"
+ "fmla v27.4s, bb3.4s, a2a.s[0]\n"
+ "fmla v31.4s, bb3.4s, a3a.s[0]\n"
+ "ldr b3q, [%[b_ptr], #48]\n"
+
+ // Unroll 5
+ "fmla v16.4s, b0a.4s, a0a.s[1]\n"
+ "fmla v20.4s, b0a.4s, a1a.s[1]\n"
+ "add %[b_ptr], %[b_ptr], #64\n"
+ "fmla v24.4s, b0a.4s, a2a.s[1]\n"
+ "fmla v28.4s, b0a.4s, a3a.s[1]\n"
+ "ldr b0aq, [%[b_ptr]]\n"
+
+ "fmla v17.4s, b1a.4s, a0a.s[1]\n"
+ "fmla v21.4s, b1a.4s, a1a.s[1]\n"
+ "fmla v25.4s, b1a.4s, a2a.s[1]\n"
+ "fmla v29.4s, b1a.4s, a3a.s[1]\n"
+ "ldr b1aq, [%[b_ptr], #16]\n"
+
+ "fmla v18.4s, b2a.4s, a0a.s[1]\n"
+ "fmla v22.4s, b2a.4s, a1a.s[1]\n"
+ "fmla v26.4s, b2a.4s, a2a.s[1]\n"
+ "fmla v30.4s, b2a.4s, a3a.s[1]\n"
+ "ldr b2aq, [%[b_ptr], #32]\n"
+
+ "fmla v19.4s, b3a.4s, a0a.s[1]\n"
+ "fmla v23.4s, b3a.4s, a1a.s[1]\n"
+ "fmla v27.4s, b3a.4s, a2a.s[1]\n"
+ "fmla v31.4s, b3a.4s, a3a.s[1]\n"
+ "ldr b3aq, [%[b_ptr], #48]\n"
+
+ // Unroll 6
+ "fmla v16.4s, bb0.4s, a0a.s[2]\n"
+ "add %[b_ptr], %[b_ptr], #64\n"
+ "fmla v20.4s, bb0.4s, a1a.s[2]\n"
+ ASM_PREFETCH("[%[c_ptr0], #0x40]")
+ "fmla v24.4s, bb0.4s, a2a.s[2]\n"
+ "fmla v28.4s, bb0.4s, a3a.s[2]\n"
+
+ "fmla v17.4s, bb1.4s, a0a.s[2]\n"
+ "fmla v21.4s, bb1.4s, a1a.s[2]\n"
+ ASM_PREFETCH("[%[c_ptr1], #0x40]")
+ "fmla v25.4s, bb1.4s, a2a.s[2]\n"
+ "fmla v29.4s, bb1.4s, a3a.s[2]\n"
+
+ "fmla v18.4s, bb2.4s, a0a.s[2]\n"
+ "fmla v22.4s, bb2.4s, a1a.s[2]\n"
+ ASM_PREFETCH("[%[c_ptr2], #0x40]")
+ "fmla v26.4s, bb2.4s, a2a.s[2]\n"
+ "fmla v30.4s, bb2.4s, a3a.s[2]\n"
+
+ "fmla v19.4s, bb3.4s, a0a.s[2]\n"
+ "fmla v23.4s, bb3.4s, a1a.s[2]\n"
+ ASM_PREFETCH("[%[c_ptr3], #0x40]")
+ "fmla v27.4s, bb3.4s, a2a.s[2]\n"
+ "fmla v31.4s, bb3.4s, a3a.s[2]\n"
+
+ // Unroll 7
+ "fmla v16.4s, b0a.4s, a0a.s[3]\n"
+ "fmla v17.4s, b1a.4s, a0a.s[3]\n"
+ "fmla v18.4s, b2a.4s, a0a.s[3]\n"
+ "fmla v19.4s, b3a.4s, a0a.s[3]\n"
+ "cbnz %w[odds], 6f\n"
+
+ "fmla v20.4s, b0a.4s, a1a.s[3]\n"
+ "str q16, [%[c_ptr0]]\n"
+ "fmla v21.4s, b1a.4s, a1a.s[3]\n"
+ "str q17, [%[c_ptr0], #16]\n"
+ "fmla v22.4s, b2a.4s, a1a.s[3]\n"
+ "str q18, [%[c_ptr0], #32]\n"
+ "fmla v23.4s, b3a.4s, a1a.s[3]\n"
+ "str q19, [%[c_ptr0], #48]\n"
+
+ "fmla v24.4s, b0a.4s, a2a.s[3]\n"
+ "str q20, [%[c_ptr1]]\n"
+ "fmla v25.4s, b1a.4s, a2a.s[3]\n"
+ "str q21, [%[c_ptr1], #16]\n"
+ "fmla v26.4s, b2a.4s, a2a.s[3]\n"
+ "str q22, [%[c_ptr1], #32]\n"
+ "fmla v27.4s, b3a.4s, a2a.s[3]\n"
+ "str q23, [%[c_ptr1], #48]\n"
+
+ "fmla v28.4s, b0a.4s, a3a.s[3]\n"
+ "str q24, [%[c_ptr2]]\n"
+ "fmla v29.4s, b1a.4s, a3a.s[3]\n"
+ "str q25, [%[c_ptr2], #16]\n"
+ "fmla v30.4s, b2a.4s, a3a.s[3]\n"
+ "str q26, [%[c_ptr2], #32]\n"
+ "fmla v31.4s, b3a.4s, a3a.s[3]\n"
+ "str q27, [%[c_ptr2], #48]\n"
+ "b 3f\n"
+
+ // Odd K case: Just do 4 more.
+ "2:\n"
+ "fmla v21.4s, bb1.4s, a1.s[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #16\n"
+ "fmla v25.4s, bb1.4s, a2.s[0]\n"
+ "add %[a_ptr1], %[a_ptr1], #16\n"
+ "fmla v29.4s, bb1.4s, a3.s[0]\n"
+ "ldr b1q, [%[b_ptr], #16]\n"
+
+ "fmla v18.4s, bb2.4s, a0.s[0]\n"
+ "add %[a_ptr2], %[a_ptr2], #16\n"
+ "fmla v22.4s, bb2.4s, a1.s[0]\n"
+ "add %[a_ptr3], %[a_ptr3], #16\n"
+ "fmla v26.4s, bb2.4s, a2.s[0]\n"
+ "fmla v30.4s, bb2.4s, a3.s[0]\n"
+ "ldr b2q, [%[b_ptr], #32]\n"
+
+ "fmla v19.4s, bb3.4s, a0.s[0]\n"
+ "fmla v23.4s, bb3.4s, a1.s[0]\n"
+ "fmla v27.4s, bb3.4s, a2.s[0]\n"
+ "fmla v31.4s, bb3.4s, a3.s[0]\n"
+ "ldr b3q, [%[b_ptr], #48]\n"
+
+ // Unroll 1
+ "fmla v16.4s, b0a.4s, a0.s[1]\n"
+ "add %[b_ptr], %[b_ptr], #64\n"
+ "fmla v20.4s, b0a.4s, a1.s[1]\n"
+ "fmla v24.4s, b0a.4s, a2.s[1]\n"
+ "fmla v28.4s, b0a.4s, a3.s[1]\n"
+ "ldr b0aq, [%[b_ptr]]\n"
+
+ "fmla v17.4s, b1a.4s, a0.s[1]\n"
+ "fmla v21.4s, b1a.4s, a1.s[1]\n"
+ "fmla v25.4s, b1a.4s, a2.s[1]\n"
+ "fmla v29.4s, b1a.4s, a3.s[1]\n"
+ "ldr b1aq, [%[b_ptr], #16]\n"
+
+ "fmla v18.4s, b2a.4s, a0.s[1]\n"
+ "fmla v22.4s, b2a.4s, a1.s[1]\n"
+ "fmla v26.4s, b2a.4s, a2.s[1]\n"
+ "fmla v30.4s, b2a.4s, a3.s[1]\n"
+ "ldr b2aq, [%[b_ptr], #32]\n"
+
+ "fmla v19.4s, b3a.4s, a0.s[1]\n"
+ "fmla v23.4s, b3a.4s, a1.s[1]\n"
+ "fmla v27.4s, b3a.4s, a2.s[1]\n"
+ "fmla v31.4s, b3a.4s, a3.s[1]\n"
+ "ldr b3aq, [%[b_ptr], #48]\n"
+
+ // Unroll 2
+ "fmla v16.4s, bb0.4s, a0.s[2]\n"
+ "add %[b_ptr], %[b_ptr], #64\n"
+ "fmla v20.4s, bb0.4s, a1.s[2]\n"
+ ASM_PREFETCH("[%[c_ptr0], #0x40]")
+ "fmla v24.4s, bb0.4s, a2.s[2]\n"
+ "fmla v28.4s, bb0.4s, a3.s[2]\n"
+
+ "fmla v17.4s, bb1.4s, a0.s[2]\n"
+ "fmla v21.4s, bb1.4s, a1.s[2]\n"
+ ASM_PREFETCH("[%[c_ptr1], #0x40]")
+ "fmla v25.4s, bb1.4s, a2.s[2]\n"
+ "fmla v29.4s, bb1.4s, a3.s[2]\n"
+
+ "fmla v18.4s, bb2.4s, a0.s[2]\n"
+ "fmla v22.4s, bb2.4s, a1.s[2]\n"
+ ASM_PREFETCH("[%[c_ptr2], #0x40]")
+ "fmla v26.4s, bb2.4s, a2.s[2]\n"
+ "fmla v30.4s, bb2.4s, a3.s[2]\n"
+
+ "fmla v19.4s, bb3.4s, a0.s[2]\n"
+ "fmla v23.4s, bb3.4s, a1.s[2]\n"
+ ASM_PREFETCH("[%[c_ptr3], #0x40]")
+ "fmla v27.4s, bb3.4s, a2.s[2]\n"
+ "fmla v31.4s, bb3.4s, a3.s[2]\n"
+
+ // Unroll 3
+ "fmla v16.4s, b0a.4s, a0.s[3]\n"
+ "fmla v17.4s, b1a.4s, a0.s[3]\n"
+ "fmla v18.4s, b2a.4s, a0.s[3]\n"
+ "fmla v19.4s, b3a.4s, a0.s[3]\n"
+ "cbnz %w[odds], 7f\n"
+
+ "fmla v20.4s, b0a.4s, a1.s[3]\n"
+ "str q16, [%[c_ptr0]]\n"
+ "fmla v21.4s, b1a.4s, a1.s[3]\n"
+ "str q17, [%[c_ptr0], #16]\n"
+ "fmla v22.4s, b2a.4s, a1.s[3]\n"
+ "str q18, [%[c_ptr0], #32]\n"
+ "fmla v23.4s, b3a.4s, a1.s[3]\n"
+ "str q19, [%[c_ptr0], #48]\n"
+
+ "fmla v24.4s, b0a.4s, a2.s[3]\n"
+ "str q20, [%[c_ptr1]]\n"
+ "fmla v25.4s, b1a.4s, a2.s[3]\n"
+ "str q21, [%[c_ptr1], #16]\n"
+ "fmla v26.4s, b2a.4s, a2.s[3]\n"
+ "str q22, [%[c_ptr1], #32]\n"
+ "fmla v27.4s, b3a.4s, a2.s[3]\n"
+ "str q23, [%[c_ptr1], #48]\n"
+
+ "fmla v28.4s, b0a.4s, a3.s[3]\n"
+ "str q24, [%[c_ptr2]]\n"
+ "fmla v29.4s, b1a.4s, a3.s[3]\n"
+ "str q25, [%[c_ptr2], #16]\n"
+ "fmla v30.4s, b2a.4s, a3.s[3]\n"
+ "str q26, [%[c_ptr2], #32]\n"
+ "fmla v31.4s, b3a.4s, a3.s[3]\n"
+ "str q27, [%[c_ptr2], #48]\n"
+ "b 3f\n"
+
+ // "Odd ones" - lead in from even
+ "6:\n"
+ "fmla v20.4s, b0a.4s, a1a.s[3]\n"
+ "fmla v21.4s, b1a.4s, a1a.s[3]\n"
+ "ldr b0q, [%[b_ptr]]\n"
+ "fmla v22.4s, b2a.4s, a1a.s[3]\n"
+ "subs %w[odds], %w[odds], #1\n"
+ "fmla v23.4s, b3a.4s, a1a.s[3]\n"
+ "ldr b1q, [%[b_ptr], #16]\n"
+
+ "fmla v24.4s, b0a.4s, a2a.s[3]\n"
+ "fmla v25.4s, b1a.4s, a2a.s[3]\n"
+ "ldr b2q, [%[b_ptr], #32]\n"
+ "fmla v26.4s, b2a.4s, a2a.s[3]\n"
+ "fmla v27.4s, b3a.4s, a2a.s[3]\n"
+ "ldr b3q, [%[b_ptr], #48]\n"
+
+ "fmla v28.4s, b0a.4s, a3a.s[3]\n"
+ "ld1r {a0.4s}, [%[a_ptr0]], #4\n"
+ "fmla v29.4s, b1a.4s, a3a.s[3]\n"
+ "fmla v30.4s, b2a.4s, a3a.s[3]\n"
+ "ld1r {a1.4s}, [%[a_ptr1]], #4\n"
+ "fmla v31.4s, b3a.4s, a3a.s[3]\n"
+
+ "fmla v16.4s, bb0.4s, a0.4s\n"
+ "beq 9f\n"
+ "b 8f\n"
+
+ // "Odd ones" - lead in from odd
+ "7:\n"
+ "fmla v20.4s, b0a.4s, a1.s[3]\n"
+ "subs %w[odds], %w[odds], #1\n"
+ "fmla v21.4s, b1a.4s, a1.s[3]\n"
+ "ldr b0q, [%[b_ptr]]\n"
+ "fmla v22.4s, b2a.4s, a1.s[3]\n"
+ "fmla v23.4s, b3a.4s, a1.s[3]\n"
+ "ldr b1q, [%[b_ptr], #16]\n"
+
+ "fmla v24.4s, b0a.4s, a2.s[3]\n"
+ "fmla v25.4s, b1a.4s, a2.s[3]\n"
+ "ldr b2q, [%[b_ptr], #32]\n"
+ "fmla v26.4s, b2a.4s, a2.s[3]\n"
+ "fmla v27.4s, b3a.4s, a2.s[3]\n"
+ "ldr b3q, [%[b_ptr], #48]\n"
+
+ "fmla v28.4s, b0a.4s, a3.s[3]\n"
+ "ld1r {a0.4s}, [%[a_ptr0]], #4\n"
+ "fmla v29.4s, b1a.4s, a3.s[3]\n"
+ "fmla v30.4s, b2a.4s, a3.s[3]\n"
+ "ld1r {a1.4s}, [%[a_ptr1]], #4\n"
+ "fmla v31.4s, b3a.4s, a3.s[3]\n"
+
+ "fmla v16.4s, bb0.4s, a0.4s\n"
+ "beq 9f\n"
+
+ // "Odd ones" - loop
+ "8:\n"
+ "fmla v17.4s, bb1.4s, a0.4s\n"
+ "ld1r {a2.4s}, [%[a_ptr2]], #4\n"
+ "fmla v18.4s, bb2.4s, a0.4s\n"
+ "add %[b_ptr], %[b_ptr], #64\n"
+ "fmla v19.4s, bb3.4s, a0.4s\n"
+ "ld1r {a3.4s}, [%[a_ptr3]], #4\n"
+
+ "fmla v20.4s, bb0.4s, a1.4s\n"
+ "subs %w[odds], %w[odds], #1\n"
+ "fmla v21.4s, bb1.4s, a1.4s\n"
+ "ld1r {a0.4s}, [%[a_ptr0]], #4\n"
+ "fmla v22.4s, bb2.4s, a1.4s\n"
+ "fmla v23.4s, bb3.4s, a1.4s\n"
+ "ld1r {a1.4s}, [%[a_ptr1]], #4\n"
+
+ "fmla v24.4s, bb0.4s, a2.4s\n"
+ "fmla v28.4s, bb0.4s, a3.4s\n"
+ "ldr b0q, [%[b_ptr]]\n"
+ "fmla v25.4s, bb1.4s, a2.4s\n"
+ "fmla v29.4s, bb1.4s, a3.4s\n"
+ "ldr b1q, [%[b_ptr], #16]\n"
+
+ "fmla v26.4s, bb2.4s, a2.4s\n"
+ "fmla v30.4s, bb2.4s, a3.4s\n"
+ "ldr b2q, [%[b_ptr], #32]\n"
+ "fmla v27.4s, bb3.4s, a2.4s\n"
+ "fmla v31.4s, bb3.4s, a3.4s\n"
+ "ldr b3q, [%[b_ptr], #48]\n"
+ "fmla v16.4s, bb0.4s, a0.4s\n"
+ "bne 8b\n"
+
+ // "Odd ones" - detached final iteration
+ "9:\n"
+ "fmla v17.4s, bb1.4s, a0.4s\n"
+ "ld1r {a2.4s}, [%[a_ptr2]], #4\n"
+ "fmla v18.4s, bb2.4s, a0.4s\n"
+ "add %[b_ptr], %[b_ptr], #64\n"
+ "fmla v19.4s, bb3.4s, a0.4s\n"
+ "ld1r {a3.4s}, [%[a_ptr3]], #4\n"
+
+ "fmla v20.4s, bb0.4s, a1.4s\n"
+ "str q16, [%[c_ptr0]]\n"
+ "fmla v21.4s, bb1.4s, a1.4s\n"
+ "str q17, [%[c_ptr0], #16]\n"
+ "fmla v22.4s, bb2.4s, a1.4s\n"
+ "str q18, [%[c_ptr0], #32]\n"
+ "fmla v23.4s, bb3.4s, a1.4s\n"
+ "str q19, [%[c_ptr0], #48]\n"
+
+ "fmla v24.4s, bb0.4s, a2.4s\n"
+ "str q20, [%[c_ptr1]]\n"
+ "fmla v25.4s, bb1.4s, a2.4s\n"
+ "str q21, [%[c_ptr1], #16]\n"
+ "fmla v26.4s, bb2.4s, a2.4s\n"
+ "str q22, [%[c_ptr1], #32]\n"
+ "fmla v27.4s, bb3.4s, a2.4s\n"
+ "str q23, [%[c_ptr1], #48]\n"
+
+ "fmla v28.4s, bb0.4s, a3.4s\n"
+ "str q24, [%[c_ptr2]]\n"
+ "fmla v29.4s, bb1.4s, a3.4s\n"
+ "str q25, [%[c_ptr2], #16]\n"
+ "fmla v30.4s, bb2.4s, a3.4s\n"
+ "str q26, [%[c_ptr2], #32]\n"
+ "fmla v31.4s, bb3.4s, a3.4s\n"
+ "str q27, [%[c_ptr2], #48]\n"
+
+ "3:\n"
+ "str q28, [%[c_ptr3]]\n"
+ // Increment C pointers for next loop - this looks odd if we
+ // are using the result buffer, but it's OK as using the
+ // result buffer implies there will be no next loop.
+ "add %[c_ptr0], %[c_ptr0], #64\n"
+ "str q29, [%[c_ptr3], #16]\n"
+ "add %[c_ptr1], %[c_ptr1], %[a_incr1], LSL #1\n"
+ "str q30, [%[c_ptr3], #32]\n"
+ "add %[c_ptr2], %[c_ptr2], %[a_incr2], LSL #1\n"
+ "str q31, [%[c_ptr3], #48]\n"
+ "add %[c_ptr3], %[c_ptr3], %[a_incr3], LSL #1\n"
+
+ : [a_ptr0] "+r" (a_ptr0), [a_ptr1] "+r" (a_ptr1), [a_ptr2] "+r" (a_ptr2), [a_ptr3] "+r" (a_ptr3),
+ [b_ptr] "+r" (b_ptr), [loops] "+r" (loops), [odds] "+r" (odds),
+ [c_ptr0] "+r" (c_ptr0), [c_ptr1] "+r" (c_ptr1), [c_ptr2] "+r" (c_ptr2), [c_ptr3] "+r" (c_ptr3)
+ : [oddk] "r" (oddk), [beta0] "r" (beta0), [betaptr] "r" (&beta),
+ [a_incr1] "r" (a_incr1), [a_incr2] "r" (a_incr2), [a_incr3] "r" (a_incr3)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15",
+ "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31",
+ "cc", "memory"
+ );
+
+ /* Copy results from result buffer if needed. */
+ if (use_result_buf) {
+ for (unsigned int row=0; row<active_rows; row++) {
+ for (unsigned int col=0; col<active_cols; col++) {
+ C[((y + row) * ldc) + (x0 + col)] = C_buf[row * 16 + col];
+ }
+ }
+ }
+ }
+ }
+}
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
\ No newline at end of file
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4.hpp
new file mode 100644
index 0000000000..2b58b110c0
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4.hpp
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __ARM_FEATURE_SVE
+
+
+#include "../std_transforms_sve.hpp"
+
+namespace arm_gemm
+{
+
+/* Actual kernel implementations.
+ *
+ * Generic SVE kernel; the definition lives in sve_hybrid_fp32_mla_4VLx4/generic.cpp.
+ * Arguments, in order, using the names from that definition:
+ *   A, lda  - input matrix A and its row stride in elements
+ *   B       - second operand; the definition reads it starting at B + K * x0
+ *             for the column block beginning at x0
+ *   C, ldc  - output matrix C and its row stride in elements
+ *   beta    - when exactly 0 the accumulators start from zero; otherwise the
+ *             existing contents of C are loaded and multiplied by beta first
+ *   M, N, K - problem dimensions; the definition steps over M in blocks of 4
+ *             rows and over N in blocks of 4 * get_vector_length<float>() columns
+ */
+void sve_hybrid_fp32_mla_4VLx4(const float *, int, const float *, float *, int, float, int, int, int);
+
+/* Strategy descriptor for the SVE hybrid FP32 MLA kernel.
+ *
+ * Groups together the operand/result element types, the kernel function
+ * pointer type, the output blocking parameters and the kernel entry point
+ * that is used by default.
+ */
+class hybrid_fp32_mla_4VLx4
+{
+public:
+    typedef float operand_type;
+    typedef float result_type;
+
+    /* Function pointer type matching the sve_hybrid_fp32_mla_4VLx4 prototype. */
+    typedef void (*kern_type)(const float *, int, const float *, float *, int, float, int, int, int);
+
+    /* Kernel blocking parameters */
+
+    /* Number of output rows handled per block. */
+    static int out_height()
+    {
+        return 4;
+    }
+
+    /* Number of output columns handled per block: four times
+     * get_vector_length<float>() (presumably the number of floats in one SVE
+     * vector - confirm against the helper's definition). */
+    static int out_width()
+    {
+        return get_vector_length<float>() * 4;
+    }
+
+    /* Unroll factor in the K dimension. */
+    static int k_unroll()
+    {
+        return 1;
+    }
+
+    /* NOTE(review): the 4, 4, 1 arguments look like they mirror out_height(),
+     * the width in vectors and k_unroll() - confirm against
+     * std_transforms_sve.hpp. */
+    StdTransformsSVE<operand_type, result_type, 4, 4, 1> transforms = {};
+
+    // Default to the generic kernel
+    kern_type kernel = sve_hybrid_fp32_mla_4VLx4;
+
+    /* The CPUInfo pointer is not consulted, so 'kernel' always keeps its
+     * default value. The parameter is left unnamed so that the unused
+     * argument does not raise -Wunused-parameter. */
+    hybrid_fp32_mla_4VLx4(const CPUInfo *)
+    {
+    }
+};
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4/generic.cpp
new file mode 100644
index 0000000000..b8aa8252d1
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4/generic.cpp
@@ -0,0 +1,2005 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+
+#include <algorithm>
+
+
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void sve_hybrid_fp32_mla_4VLx4(const float *A, int lda, const float *B, float *C, int ldc, float beta, int M, int N, int K) {
+ const long beta0 = (beta == 0.0f);
+ const int K_stride = K;
+ const long loops_count = ((K + 4) / 8) - 1;
+ K -= loops_count * 8;
+ const long regs_count = (K / 4) - 1;
+ K -= (regs_count + 1) * 4;
+ const long leftovers = K;
+
+ for (int y=0; y<M; y+=4) {
+ const float * const a_ptr0_base = A + (y * lda);
+ const unsigned long ldab = lda * sizeof(float);
+
+ float *c_ptr0 = C + (y * ldc);
+ const unsigned long ldcb = ldc * sizeof(float);
+
+ for (int x0=0; x0<N; x0+=(4 * get_vector_length<float>())) {
+ const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<float>()));
+ const float *betaptr = &beta;
+ long loops = loops_count;
+ long regs = regs_count;
+ long temp = 0;
+ long blocks = leftovers;
+ const float *a_ptr0 = a_ptr0_base;
+ const float *b_ptr0 = B + (K_stride * x0);
+
+ switch(M-y) {
+ case 1:
+ __asm __volatile (
+ "whilelt p6.s, %[temp], %[leftovers]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "ptrue p7.s\n"
+ "whilelt p1.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p2.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p3.s, %[temp], %[width]\n"
+ "cbz %[beta0], 1f\n"
+ "mov z16.s, #0\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "mov z17.s, #0\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "mov z18.s, #0\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "mov z19.s, #0\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "b 2f\n"
+ "1:\n"
+ "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+ "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+ "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+ "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "fmul z16.s, p7/m, z16.s, z15.s\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "fmul z17.s, p7/m, z17.s, z15.s\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmul z18.s, p7/m, z18.s, z15.s\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmul z19.s, p7/m, z19.s, z15.s\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "2:\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 3f\n"
+ "4:\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[3]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z17.s, z13.s, z0.s[3]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z18.s, z14.s, z0.s[3]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[3]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[0]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n"
+ "fmla z17.s, z9.s, z4.s[0]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "fmla z18.s, z10.s, z4.s[0]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z19.s, z11.s, z4.s[0]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z16.s, z12.s, z4.s[1]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.s, z13.s, z4.s[1]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.s, z14.s, z4.s[1]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.s, z15.s, z4.s[1]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z17.s, z9.s, z4.s[2]\n"
+ "fmla z18.s, z10.s, z4.s[2]\n"
+ "fmla z19.s, z11.s, z4.s[2]\n"
+ "fmla z16.s, z12.s, z4.s[3]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "fmla z17.s, z13.s, z4.s[3]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "fmla z18.s, z14.s, z4.s[3]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z19.s, z15.s, z4.s[3]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "b.ne 4b\n"
+ "3:\n"
+ "cbz %[regs], 5f\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[3]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "fmla z17.s, z13.s, z0.s[3]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z18.s, z14.s, z0.s[3]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[3]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[0]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z17.s, z9.s, z4.s[0]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
+ "fmla z18.s, z10.s, z4.s[0]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "fmla z19.s, z11.s, z4.s[0]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z16.s, z12.s, z4.s[1]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z17.s, z13.s, z4.s[1]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z18.s, z14.s, z4.s[1]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z19.s, z15.s, z4.s[1]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[2]\n"
+ "fmla z17.s, z9.s, z4.s[2]\n"
+ "fmla z18.s, z10.s, z4.s[2]\n"
+ "fmla z19.s, z11.s, z4.s[2]\n"
+ "fmla z16.s, z12.s, z4.s[3]\n"
+ "fmla z17.s, z13.s, z4.s[3]\n"
+ "fmla z18.s, z14.s, z4.s[3]\n"
+ "fmla z19.s, z15.s, z4.s[3]\n"
+ "cbz %[blocks], 6f\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "b.eq 6f\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "b.eq 6f\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "b 6f\n"
+ "5:\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "fmla z16.s, z12.s, z0.s[3]\n"
+ "fmla z17.s, z13.s, z0.s[3]\n"
+ "fmla z18.s, z14.s, z0.s[3]\n"
+ "fmla z19.s, z15.s, z0.s[3]\n"
+ "cbz %[blocks], 6f\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[0]\n"
+ "fmla z17.s, z9.s, z4.s[0]\n"
+ "fmla z18.s, z10.s, z4.s[0]\n"
+ "fmla z19.s, z11.s, z4.s[0]\n"
+ "b.eq 6f\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.s, z12.s, z4.s[1]\n"
+ "fmla z17.s, z13.s, z4.s[1]\n"
+ "fmla z18.s, z14.s, z4.s[1]\n"
+ "fmla z19.s, z15.s, z4.s[1]\n"
+ "b.eq 6f\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[2]\n"
+ "fmla z17.s, z9.s, z4.s[2]\n"
+ "fmla z18.s, z10.s, z4.s[2]\n"
+ "fmla z19.s, z11.s, z4.s[2]\n"
+ "6:\n"
+ "st1w z16.s, p0, [%[c_ptr0]]\n"
+ "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+ "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+ "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #4\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 2:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "c_ptr1 .req X1\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[leftovers]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "ptrue p7.s\n"
+ "whilelt p1.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p2.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p3.s, %[temp], %[width]\n"
+ "cbz %[beta0], 1f\n"
+ "mov z16.s, #0\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "mov z17.s, #0\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "mov z18.s, #0\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "mov z19.s, #0\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "mov z20.s, #0\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "mov z21.s, #0\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "mov z22.s, #0\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "mov z23.s, #0\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "b 2f\n"
+ "1:\n"
+ "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+ "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+ "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+ "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+ "ld1w z20.s, p0/z, [c_ptr1]\n"
+ "fmul z16.s, p7/m, z16.s, z15.s\n"
+ "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+ "fmul z17.s, p7/m, z17.s, z15.s\n"
+ "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+ "fmul z18.s, p7/m, z18.s, z15.s\n"
+ "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+ "fmul z19.s, p7/m, z19.s, z15.s\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "fmul z20.s, p7/m, z20.s, z15.s\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "fmul z21.s, p7/m, z21.s, z15.s\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "fmul z22.s, p7/m, z22.s, z15.s\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmul z23.s, p7/m, z23.s, z15.s\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "2:\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 3f\n"
+ "4:\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+ "fmla z20.s, z8.s, z1.s[0]\n"
+ "ld1rqw z5.s, p7/z, [a_ptr1]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "fmla z21.s, z9.s, z1.s[0]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "fmla z22.s, z10.s, z1.s[0]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ "fmla z23.s, z11.s, z1.s[0]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "fmla z20.s, z12.s, z1.s[1]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "fmla z21.s, z13.s, z1.s[1]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "fmla z22.s, z14.s, z1.s[1]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "fmla z23.s, z15.s, z1.s[1]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z20.s, z8.s, z1.s[2]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "fmla z21.s, z9.s, z1.s[2]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "fmla z22.s, z10.s, z1.s[2]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "fmla z23.s, z11.s, z1.s[2]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[3]\n"
+ "fmla z20.s, z12.s, z1.s[3]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z17.s, z13.s, z0.s[3]\n"
+ "fmla z21.s, z13.s, z1.s[3]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z18.s, z14.s, z0.s[3]\n"
+ "fmla z22.s, z14.s, z1.s[3]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n"
+ "fmla z23.s, z15.s, z1.s[3]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[0]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n"
+ "fmla z20.s, z8.s, z5.s[0]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "fmla z17.s, z9.s, z4.s[0]\n"
+ "fmla z21.s, z9.s, z5.s[0]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z10.s, z4.s[0]\n"
+ "fmla z22.s, z10.s, z5.s[0]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z11.s, z4.s[0]\n"
+ "fmla z23.s, z11.s, z5.s[0]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z4.s[1]\n"
+ "fmla z20.s, z12.s, z5.s[1]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.s, z13.s, z4.s[1]\n"
+ "fmla z21.s, z13.s, z5.s[1]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.s, z14.s, z4.s[1]\n"
+ "fmla z22.s, z14.s, z5.s[1]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.s, z15.s, z4.s[1]\n"
+ "fmla z23.s, z15.s, z5.s[1]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z20.s, z8.s, z5.s[2]\n"
+ "fmla z17.s, z9.s, z4.s[2]\n"
+ "fmla z21.s, z9.s, z5.s[2]\n"
+ "fmla z18.s, z10.s, z4.s[2]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "fmla z22.s, z10.s, z5.s[2]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "fmla z19.s, z11.s, z4.s[2]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "fmla z23.s, z11.s, z5.s[2]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.s, z12.s, z4.s[3]\n"
+ "fmla z20.s, z12.s, z5.s[3]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z17.s, z13.s, z4.s[3]\n"
+ "fmla z21.s, z13.s, z5.s[3]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z18.s, z14.s, z4.s[3]\n"
+ "fmla z22.s, z14.s, z5.s[3]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z19.s, z15.s, z4.s[3]\n"
+ "fmla z23.s, z15.s, z5.s[3]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "b.ne 4b\n"
+ "3:\n"
+ "cbz %[regs], 5f\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+ "fmla z20.s, z8.s, z1.s[0]\n"
+ "ld1rqw z5.s, p7/z, [a_ptr1]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "fmla z21.s, z9.s, z1.s[0]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "fmla z22.s, z10.s, z1.s[0]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "fmla z23.s, z11.s, z1.s[0]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "fmla z20.s, z12.s, z1.s[1]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "fmla z21.s, z13.s, z1.s[1]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "fmla z22.s, z14.s, z1.s[1]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "fmla z23.s, z15.s, z1.s[1]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z20.s, z8.s, z1.s[2]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "fmla z21.s, z9.s, z1.s[2]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "fmla z22.s, z10.s, z1.s[2]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "fmla z23.s, z11.s, z1.s[2]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[3]\n"
+ "fmla z20.s, z12.s, z1.s[3]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z17.s, z13.s, z0.s[3]\n"
+ "fmla z21.s, z13.s, z1.s[3]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z18.s, z14.s, z0.s[3]\n"
+ "fmla z22.s, z14.s, z1.s[3]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
+ "fmla z23.s, z15.s, z1.s[3]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[0]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
+ "fmla z20.s, z8.s, z5.s[0]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "fmla z17.s, z9.s, z4.s[0]\n"
+ "fmla z21.s, z9.s, z5.s[0]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z10.s, z4.s[0]\n"
+ "fmla z22.s, z10.s, z5.s[0]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z11.s, z4.s[0]\n"
+ "fmla z23.s, z11.s, z5.s[0]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z4.s[1]\n"
+ "fmla z20.s, z12.s, z5.s[1]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.s, z13.s, z4.s[1]\n"
+ "fmla z21.s, z13.s, z5.s[1]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.s, z14.s, z4.s[1]\n"
+ "fmla z22.s, z14.s, z5.s[1]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.s, z15.s, z4.s[1]\n"
+ "fmla z23.s, z15.s, z5.s[1]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[2]\n"
+ "fmla z20.s, z8.s, z5.s[2]\n"
+ "fmla z17.s, z9.s, z4.s[2]\n"
+ "fmla z21.s, z9.s, z5.s[2]\n"
+ "fmla z18.s, z10.s, z4.s[2]\n"
+ "fmla z22.s, z10.s, z5.s[2]\n"
+ "fmla z19.s, z11.s, z4.s[2]\n"
+ "fmla z23.s, z11.s, z5.s[2]\n"
+ "fmla z16.s, z12.s, z4.s[3]\n"
+ "fmla z20.s, z12.s, z5.s[3]\n"
+ "fmla z17.s, z13.s, z4.s[3]\n"
+ "fmla z21.s, z13.s, z5.s[3]\n"
+ "fmla z18.s, z14.s, z4.s[3]\n"
+ "fmla z22.s, z14.s, z5.s[3]\n"
+ "fmla z19.s, z15.s, z4.s[3]\n"
+ "fmla z23.s, z15.s, z5.s[3]\n"
+ "cbz %[blocks], 6f\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "fmla z20.s, z8.s, z1.s[0]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "fmla z21.s, z9.s, z1.s[0]\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "fmla z22.s, z10.s, z1.s[0]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "fmla z23.s, z11.s, z1.s[0]\n"
+ "b.eq 6f\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "fmla z20.s, z12.s, z1.s[1]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "fmla z21.s, z13.s, z1.s[1]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "fmla z22.s, z14.s, z1.s[1]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "fmla z23.s, z15.s, z1.s[1]\n"
+ "b.eq 6f\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "fmla z20.s, z8.s, z1.s[2]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "fmla z21.s, z9.s, z1.s[2]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "fmla z22.s, z10.s, z1.s[2]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "fmla z23.s, z11.s, z1.s[2]\n"
+ "b 6f\n"
+ "5:\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
+ "fmla z20.s, z8.s, z1.s[0]\n"
+ "ld1rqw z5.s, p6/z, [a_ptr1]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "fmla z21.s, z9.s, z1.s[0]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "fmla z22.s, z10.s, z1.s[0]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "fmla z23.s, z11.s, z1.s[0]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "fmla z20.s, z12.s, z1.s[1]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "fmla z21.s, z13.s, z1.s[1]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "fmla z22.s, z14.s, z1.s[1]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "fmla z23.s, z15.s, z1.s[1]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "fmla z20.s, z8.s, z1.s[2]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "fmla z21.s, z9.s, z1.s[2]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "fmla z22.s, z10.s, z1.s[2]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "fmla z23.s, z11.s, z1.s[2]\n"
+ "fmla z16.s, z12.s, z0.s[3]\n"
+ "fmla z20.s, z12.s, z1.s[3]\n"
+ "fmla z17.s, z13.s, z0.s[3]\n"
+ "fmla z21.s, z13.s, z1.s[3]\n"
+ "fmla z18.s, z14.s, z0.s[3]\n"
+ "fmla z22.s, z14.s, z1.s[3]\n"
+ "fmla z19.s, z15.s, z0.s[3]\n"
+ "fmla z23.s, z15.s, z1.s[3]\n"
+ "cbz %[blocks], 6f\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[0]\n"
+ "fmla z20.s, z8.s, z5.s[0]\n"
+ "fmla z17.s, z9.s, z4.s[0]\n"
+ "fmla z21.s, z9.s, z5.s[0]\n"
+ "fmla z18.s, z10.s, z4.s[0]\n"
+ "fmla z22.s, z10.s, z5.s[0]\n"
+ "fmla z19.s, z11.s, z4.s[0]\n"
+ "fmla z23.s, z11.s, z5.s[0]\n"
+ "b.eq 6f\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.s, z12.s, z4.s[1]\n"
+ "fmla z20.s, z12.s, z5.s[1]\n"
+ "fmla z17.s, z13.s, z4.s[1]\n"
+ "fmla z21.s, z13.s, z5.s[1]\n"
+ "fmla z18.s, z14.s, z4.s[1]\n"
+ "fmla z22.s, z14.s, z5.s[1]\n"
+ "fmla z19.s, z15.s, z4.s[1]\n"
+ "fmla z23.s, z15.s, z5.s[1]\n"
+ "b.eq 6f\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[2]\n"
+ "fmla z20.s, z8.s, z5.s[2]\n"
+ "fmla z17.s, z9.s, z4.s[2]\n"
+ "fmla z21.s, z9.s, z5.s[2]\n"
+ "fmla z18.s, z10.s, z4.s[2]\n"
+ "fmla z22.s, z10.s, z5.s[2]\n"
+ "fmla z19.s, z11.s, z4.s[2]\n"
+ "fmla z23.s, z11.s, z5.s[2]\n"
+ "6:\n"
+ "st1w z16.s, p0, [%[c_ptr0]]\n"
+ "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+ "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+ "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #4\n"
+ "st1w z20.s, p0, [c_ptr1]\n"
+ "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
+ "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
+ "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
+ ".unreq a_ptr1\n"
+ ".unreq c_ptr1\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory"
+ );
+ break;
+ case 3: // Three-row variant: accumulates three rows of C (z16-z19, z20-z23, z24-z27), four SVE vectors wide.
+ __asm __volatile ( // Predicates written below (p0-p3, p6, p7) are declared in the clobber list at the end.
+ "a_ptr1 .req X0\n" // A pointer for row 1 (row 0 uses %[a_ptr0])
+ "a_ptr2 .req X1\n" // A pointer for row 2
+ "c_ptr1 .req X2\n" // C pointer for row 1 (row 0 uses %[c_ptr0])
+ "c_ptr2 .req X3\n" // C pointer for row 2
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[leftovers]\n" // p6: predicate for the final, possibly partial, quad of A
+ "whilelt p0.s, %[temp], %[width]\n" // p0-p3: column predicates for the four output vectors
+ "incw %[temp], all, mul #1\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ptrue p7.s\n" // p7: all-true predicate for full-width loads
+ "whilelt p1.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p2.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p3.s, %[temp], %[width]\n"
+ "cbz %[beta0], 1f\n" // beta0 == 0: go scale existing C by *betaptr; otherwise start from zeroed accumulators
+ "mov z16.s, #0\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "mov z17.s, #0\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "mov z18.s, #0\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "mov z19.s, #0\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "mov z20.s, #0\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "mov z21.s, #0\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "mov z22.s, #0\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "mov z23.s, #0\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "mov z24.s, #0\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "mov z25.s, #0\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "mov z26.s, #0\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "mov z27.s, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "b 2f\n"
+ "1:\n" // Load the three C rows and multiply them by the broadcast value at betaptr
+ "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+ "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+ "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+ "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+ "ld1w z20.s, p0/z, [c_ptr1]\n"
+ "fmul z16.s, p7/m, z16.s, z15.s\n"
+ "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+ "fmul z17.s, p7/m, z17.s, z15.s\n"
+ "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+ "fmul z18.s, p7/m, z18.s, z15.s\n"
+ "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+ "fmul z19.s, p7/m, z19.s, z15.s\n"
+ "ld1w z24.s, p0/z, [c_ptr2]\n"
+ "fmul z20.s, p7/m, z20.s, z15.s\n"
+ "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
+ "fmul z21.s, p7/m, z21.s, z15.s\n"
+ "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
+ "fmul z22.s, p7/m, z22.s, z15.s\n"
+ "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
+ "fmul z23.s, p7/m, z23.s, z15.s\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "fmul z24.s, p7/m, z24.s, z15.s\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "fmul z25.s, p7/m, z25.s, z15.s\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "fmul z26.s, p7/m, z26.s, z15.s\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "fmul z27.s, p7/m, z27.s, z15.s\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" // z15 is reused for B data once the beta scaling is done
+ "2:\n" // Both entry paths join here with z0-z2 (A quads) and z8-z15 (first eight B vectors) loaded
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 3f\n"
+ "4:\n" // Main loop: each iteration consumes 0x20 bytes of every A row and 32 B vectors
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+ "fmla z20.s, z8.s, z1.s[0]\n"
+ "ld1rqw z5.s, p7/z, [a_ptr1]\n"
+ "fmla z24.s, z8.s, z2.s[0]\n"
+ "ld1rqw z6.s, p7/z, [a_ptr2]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "fmla z21.s, z9.s, z1.s[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "fmla z25.s, z9.s, z2.s[0]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ "fmla z22.s, z10.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
+ "fmla z26.s, z10.s, z2.s[0]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "subs %[loops], %[loops], #0x1\n" // Sets the flags tested by the b.ne at the bottom of the loop
+ "fmla z23.s, z11.s, z1.s[0]\n"
+ "fmla z27.s, z11.s, z2.s[0]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "fmla z20.s, z12.s, z1.s[1]\n"
+ "fmla z24.s, z12.s, z2.s[1]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "fmla z21.s, z13.s, z1.s[1]\n"
+ "fmla z25.s, z13.s, z2.s[1]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "fmla z22.s, z14.s, z1.s[1]\n"
+ "fmla z26.s, z14.s, z2.s[1]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "fmla z23.s, z15.s, z1.s[1]\n"
+ "fmla z27.s, z15.s, z2.s[1]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z20.s, z8.s, z1.s[2]\n"
+ "fmla z24.s, z8.s, z2.s[2]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "fmla z21.s, z9.s, z1.s[2]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "fmla z25.s, z9.s, z2.s[2]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "fmla z22.s, z10.s, z1.s[2]\n"
+ "fmla z26.s, z10.s, z2.s[2]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "fmla z23.s, z11.s, z1.s[2]\n"
+ "fmla z27.s, z11.s, z2.s[2]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[3]\n"
+ "fmla z20.s, z12.s, z1.s[3]\n"
+ "fmla z24.s, z12.s, z2.s[3]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z17.s, z13.s, z0.s[3]\n"
+ "fmla z21.s, z13.s, z1.s[3]\n"
+ "fmla z25.s, z13.s, z2.s[3]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z18.s, z14.s, z0.s[3]\n"
+ "fmla z22.s, z14.s, z1.s[3]\n"
+ "fmla z26.s, z14.s, z2.s[3]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n" // Reload z0-z2 for the next iteration (pointers were already advanced by 0x20)
+ "fmla z23.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n"
+ "fmla z27.s, z15.s, z2.s[3]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[0]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #-0x10]\n"
+ "fmla z20.s, z8.s, z5.s[0]\n"
+ "fmla z24.s, z8.s, z6.s[0]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "fmla z17.s, z9.s, z4.s[0]\n"
+ "fmla z21.s, z9.s, z5.s[0]\n"
+ "fmla z25.s, z9.s, z6.s[0]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z10.s, z4.s[0]\n"
+ "fmla z22.s, z10.s, z5.s[0]\n"
+ "fmla z26.s, z10.s, z6.s[0]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z11.s, z4.s[0]\n"
+ "fmla z23.s, z11.s, z5.s[0]\n"
+ "fmla z27.s, z11.s, z6.s[0]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z4.s[1]\n"
+ "fmla z20.s, z12.s, z5.s[1]\n"
+ "fmla z24.s, z12.s, z6.s[1]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.s, z13.s, z4.s[1]\n"
+ "fmla z21.s, z13.s, z5.s[1]\n"
+ "fmla z25.s, z13.s, z6.s[1]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.s, z14.s, z4.s[1]\n"
+ "fmla z22.s, z14.s, z5.s[1]\n"
+ "fmla z26.s, z14.s, z6.s[1]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.s, z15.s, z4.s[1]\n"
+ "fmla z23.s, z15.s, z5.s[1]\n"
+ "fmla z27.s, z15.s, z6.s[1]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z20.s, z8.s, z5.s[2]\n"
+ "fmla z24.s, z8.s, z6.s[2]\n"
+ "fmla z17.s, z9.s, z4.s[2]\n"
+ "fmla z21.s, z9.s, z5.s[2]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "fmla z25.s, z9.s, z6.s[2]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "fmla z18.s, z10.s, z4.s[2]\n"
+ "fmla z22.s, z10.s, z5.s[2]\n"
+ "fmla z26.s, z10.s, z6.s[2]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "fmla z19.s, z11.s, z4.s[2]\n"
+ "fmla z23.s, z11.s, z5.s[2]\n"
+ "fmla z27.s, z11.s, z6.s[2]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.s, z12.s, z4.s[3]\n"
+ "fmla z20.s, z12.s, z5.s[3]\n"
+ "fmla z24.s, z12.s, z6.s[3]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z17.s, z13.s, z4.s[3]\n"
+ "fmla z21.s, z13.s, z5.s[3]\n"
+ "fmla z25.s, z13.s, z6.s[3]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z18.s, z14.s, z4.s[3]\n"
+ "fmla z22.s, z14.s, z5.s[3]\n"
+ "fmla z26.s, z14.s, z6.s[3]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z19.s, z15.s, z4.s[3]\n"
+ "fmla z23.s, z15.s, z5.s[3]\n"
+ "fmla z27.s, z15.s, z6.s[3]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "b.ne 4b\n"
+ "3:\n" // Tail: regs != 0 means one more full quad (loaded with p7) precedes the p6-predicated quad
+ "cbz %[regs], 5f\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+ "fmla z20.s, z8.s, z1.s[0]\n"
+ "ld1rqw z5.s, p7/z, [a_ptr1]\n"
+ "fmla z24.s, z8.s, z2.s[0]\n"
+ "ld1rqw z6.s, p7/z, [a_ptr2]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "fmla z21.s, z9.s, z1.s[0]\n"
+ "fmla z25.s, z9.s, z2.s[0]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "fmla z22.s, z10.s, z1.s[0]\n"
+ "fmla z26.s, z10.s, z2.s[0]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "fmla z23.s, z11.s, z1.s[0]\n"
+ "fmla z27.s, z11.s, z2.s[0]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "fmla z20.s, z12.s, z1.s[1]\n"
+ "fmla z24.s, z12.s, z2.s[1]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "fmla z21.s, z13.s, z1.s[1]\n"
+ "fmla z25.s, z13.s, z2.s[1]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "fmla z22.s, z14.s, z1.s[1]\n"
+ "fmla z26.s, z14.s, z2.s[1]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "fmla z23.s, z15.s, z1.s[1]\n"
+ "fmla z27.s, z15.s, z2.s[1]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z20.s, z8.s, z1.s[2]\n"
+ "fmla z24.s, z8.s, z2.s[2]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "fmla z21.s, z9.s, z1.s[2]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "fmla z25.s, z9.s, z2.s[2]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "fmla z22.s, z10.s, z1.s[2]\n"
+ "fmla z26.s, z10.s, z2.s[2]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "fmla z23.s, z11.s, z1.s[2]\n"
+ "fmla z27.s, z11.s, z2.s[2]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[3]\n"
+ "fmla z20.s, z12.s, z1.s[3]\n"
+ "fmla z24.s, z12.s, z2.s[3]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z17.s, z13.s, z0.s[3]\n"
+ "fmla z21.s, z13.s, z1.s[3]\n"
+ "fmla z25.s, z13.s, z2.s[3]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z18.s, z14.s, z0.s[3]\n"
+ "fmla z22.s, z14.s, z1.s[3]\n"
+ "fmla z26.s, z14.s, z2.s[3]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" // Final quad of each A row, zero-filled beyond %[leftovers] via p6
+ "fmla z23.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
+ "fmla z27.s, z15.s, z2.s[3]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[0]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
+ "fmla z20.s, z8.s, z5.s[0]\n"
+ "fmla z24.s, z8.s, z6.s[0]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "fmla z17.s, z9.s, z4.s[0]\n"
+ "fmla z21.s, z9.s, z5.s[0]\n"
+ "fmla z25.s, z9.s, z6.s[0]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z10.s, z4.s[0]\n"
+ "fmla z22.s, z10.s, z5.s[0]\n"
+ "fmla z26.s, z10.s, z6.s[0]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z11.s, z4.s[0]\n"
+ "fmla z23.s, z11.s, z5.s[0]\n"
+ "fmla z27.s, z11.s, z6.s[0]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z4.s[1]\n"
+ "fmla z20.s, z12.s, z5.s[1]\n"
+ "fmla z24.s, z12.s, z6.s[1]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.s, z13.s, z4.s[1]\n"
+ "fmla z21.s, z13.s, z5.s[1]\n"
+ "fmla z25.s, z13.s, z6.s[1]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.s, z14.s, z4.s[1]\n"
+ "fmla z22.s, z14.s, z5.s[1]\n"
+ "fmla z26.s, z14.s, z6.s[1]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.s, z15.s, z4.s[1]\n"
+ "fmla z23.s, z15.s, z5.s[1]\n"
+ "fmla z27.s, z15.s, z6.s[1]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[2]\n"
+ "fmla z20.s, z8.s, z5.s[2]\n"
+ "fmla z24.s, z8.s, z6.s[2]\n"
+ "fmla z17.s, z9.s, z4.s[2]\n"
+ "fmla z21.s, z9.s, z5.s[2]\n"
+ "fmla z25.s, z9.s, z6.s[2]\n"
+ "fmla z18.s, z10.s, z4.s[2]\n"
+ "fmla z22.s, z10.s, z5.s[2]\n"
+ "fmla z26.s, z10.s, z6.s[2]\n"
+ "fmla z19.s, z11.s, z4.s[2]\n"
+ "fmla z23.s, z11.s, z5.s[2]\n"
+ "fmla z27.s, z11.s, z6.s[2]\n"
+ "fmla z16.s, z12.s, z4.s[3]\n"
+ "fmla z20.s, z12.s, z5.s[3]\n"
+ "fmla z24.s, z12.s, z6.s[3]\n"
+ "fmla z17.s, z13.s, z4.s[3]\n"
+ "fmla z21.s, z13.s, z5.s[3]\n"
+ "fmla z25.s, z13.s, z6.s[3]\n"
+ "fmla z18.s, z14.s, z4.s[3]\n"
+ "fmla z22.s, z14.s, z5.s[3]\n"
+ "fmla z26.s, z14.s, z6.s[3]\n"
+ "fmla z19.s, z15.s, z4.s[3]\n"
+ "fmla z23.s, z15.s, z5.s[3]\n"
+ "fmla z27.s, z15.s, z6.s[3]\n"
+ "cbz %[blocks], 6f\n" // blocks == 0: no odd single steps remain, go straight to the stores
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "fmla z20.s, z8.s, z1.s[0]\n"
+ "fmla z24.s, z8.s, z2.s[0]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "fmla z21.s, z9.s, z1.s[0]\n"
+ "fmla z25.s, z9.s, z2.s[0]\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "fmla z22.s, z10.s, z1.s[0]\n"
+ "fmla z26.s, z10.s, z2.s[0]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "fmla z23.s, z11.s, z1.s[0]\n"
+ "fmla z27.s, z11.s, z2.s[0]\n"
+ "b.eq 6f\n" // Flags come from the preceding subs (ld1w/fmla do not alter them)
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "fmla z20.s, z12.s, z1.s[1]\n"
+ "fmla z24.s, z12.s, z2.s[1]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "fmla z21.s, z13.s, z1.s[1]\n"
+ "fmla z25.s, z13.s, z2.s[1]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "fmla z22.s, z14.s, z1.s[1]\n"
+ "fmla z26.s, z14.s, z2.s[1]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "fmla z23.s, z15.s, z1.s[1]\n"
+ "fmla z27.s, z15.s, z2.s[1]\n"
+ "b.eq 6f\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "fmla z20.s, z8.s, z1.s[2]\n"
+ "fmla z24.s, z8.s, z2.s[2]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "fmla z21.s, z9.s, z1.s[2]\n"
+ "fmla z25.s, z9.s, z2.s[2]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "fmla z22.s, z10.s, z1.s[2]\n"
+ "fmla z26.s, z10.s, z2.s[2]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "fmla z23.s, z11.s, z1.s[2]\n"
+ "fmla z27.s, z11.s, z2.s[2]\n"
+ "b 6f\n"
+ "5:\n" // Tail with regs == 0: z0-z2 hold the last full quad; the p6-predicated quad goes into z4-z6
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
+ "fmla z20.s, z8.s, z1.s[0]\n"
+ "ld1rqw z5.s, p6/z, [a_ptr1]\n"
+ "fmla z24.s, z8.s, z2.s[0]\n"
+ "ld1rqw z6.s, p6/z, [a_ptr2]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "fmla z21.s, z9.s, z1.s[0]\n"
+ "fmla z25.s, z9.s, z2.s[0]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "fmla z22.s, z10.s, z1.s[0]\n"
+ "fmla z26.s, z10.s, z2.s[0]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "fmla z23.s, z11.s, z1.s[0]\n"
+ "fmla z27.s, z11.s, z2.s[0]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "fmla z20.s, z12.s, z1.s[1]\n"
+ "fmla z24.s, z12.s, z2.s[1]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "fmla z21.s, z13.s, z1.s[1]\n"
+ "fmla z25.s, z13.s, z2.s[1]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "fmla z22.s, z14.s, z1.s[1]\n"
+ "fmla z26.s, z14.s, z2.s[1]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "fmla z23.s, z15.s, z1.s[1]\n"
+ "fmla z27.s, z15.s, z2.s[1]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "fmla z20.s, z8.s, z1.s[2]\n"
+ "fmla z24.s, z8.s, z2.s[2]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "fmla z21.s, z9.s, z1.s[2]\n"
+ "fmla z25.s, z9.s, z2.s[2]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "fmla z22.s, z10.s, z1.s[2]\n"
+ "fmla z26.s, z10.s, z2.s[2]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "fmla z23.s, z11.s, z1.s[2]\n"
+ "fmla z27.s, z11.s, z2.s[2]\n"
+ "fmla z16.s, z12.s, z0.s[3]\n"
+ "fmla z20.s, z12.s, z1.s[3]\n"
+ "fmla z24.s, z12.s, z2.s[3]\n"
+ "fmla z17.s, z13.s, z0.s[3]\n"
+ "fmla z21.s, z13.s, z1.s[3]\n"
+ "fmla z25.s, z13.s, z2.s[3]\n"
+ "fmla z18.s, z14.s, z0.s[3]\n"
+ "fmla z22.s, z14.s, z1.s[3]\n"
+ "fmla z26.s, z14.s, z2.s[3]\n"
+ "fmla z19.s, z15.s, z0.s[3]\n"
+ "fmla z23.s, z15.s, z1.s[3]\n"
+ "fmla z27.s, z15.s, z2.s[3]\n"
+ "cbz %[blocks], 6f\n" // Same up-to-three single-step ladder as above, but sourced from z4-z6
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[0]\n"
+ "fmla z20.s, z8.s, z5.s[0]\n"
+ "fmla z24.s, z8.s, z6.s[0]\n"
+ "fmla z17.s, z9.s, z4.s[0]\n"
+ "fmla z21.s, z9.s, z5.s[0]\n"
+ "fmla z25.s, z9.s, z6.s[0]\n"
+ "fmla z18.s, z10.s, z4.s[0]\n"
+ "fmla z22.s, z10.s, z5.s[0]\n"
+ "fmla z26.s, z10.s, z6.s[0]\n"
+ "fmla z19.s, z11.s, z4.s[0]\n"
+ "fmla z23.s, z11.s, z5.s[0]\n"
+ "fmla z27.s, z11.s, z6.s[0]\n"
+ "b.eq 6f\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.s, z12.s, z4.s[1]\n"
+ "fmla z20.s, z12.s, z5.s[1]\n"
+ "fmla z24.s, z12.s, z6.s[1]\n"
+ "fmla z17.s, z13.s, z4.s[1]\n"
+ "fmla z21.s, z13.s, z5.s[1]\n"
+ "fmla z25.s, z13.s, z6.s[1]\n"
+ "fmla z18.s, z14.s, z4.s[1]\n"
+ "fmla z22.s, z14.s, z5.s[1]\n"
+ "fmla z26.s, z14.s, z6.s[1]\n"
+ "fmla z19.s, z15.s, z4.s[1]\n"
+ "fmla z23.s, z15.s, z5.s[1]\n"
+ "fmla z27.s, z15.s, z6.s[1]\n"
+ "b.eq 6f\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[2]\n"
+ "fmla z20.s, z8.s, z5.s[2]\n"
+ "fmla z24.s, z8.s, z6.s[2]\n"
+ "fmla z17.s, z9.s, z4.s[2]\n"
+ "fmla z21.s, z9.s, z5.s[2]\n"
+ "fmla z25.s, z9.s, z6.s[2]\n"
+ "fmla z18.s, z10.s, z4.s[2]\n"
+ "fmla z22.s, z10.s, z5.s[2]\n"
+ "fmla z26.s, z10.s, z6.s[2]\n"
+ "fmla z19.s, z11.s, z4.s[2]\n"
+ "fmla z23.s, z11.s, z5.s[2]\n"
+ "fmla z27.s, z11.s, z6.s[2]\n"
+ "6:\n" // Write the three accumulated rows back to C under the column predicates p0-p3
+ "st1w z16.s, p0, [%[c_ptr0]]\n"
+ "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+ "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+ "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #4\n" // Only c_ptr0 is advanced; c_ptr1/c_ptr2 are re-derived from it at the top of the asm
+ "st1w z20.s, p0, [c_ptr1]\n"
+ "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
+ "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
+ "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
+ "st1w z24.s, p0, [c_ptr2]\n"
+ "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n"
+ "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n"
+ "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "p0", "p1", "p2", "p3", "p6", "p7", "x0", "x1", "x2", "x3", "cc", "memory" // p0-p3, p6, p7 are written by whilelt/ptrue above, so the compiler must not keep live predicates in them
+ );
+ break;
+ default:
+ case 4:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[leftovers]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ptrue p7.s\n"
+ "whilelt p1.s, %[temp], %[width]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p2.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p3.s, %[temp], %[width]\n"
+ "cbz %[beta0], 1f\n"
+ "mov z16.s, #0\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "mov z17.s, #0\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "mov z18.s, #0\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "mov z19.s, #0\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "mov z20.s, #0\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "mov z21.s, #0\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "mov z22.s, #0\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "mov z23.s, #0\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "mov z24.s, #0\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "mov z25.s, #0\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "mov z26.s, #0\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "mov z27.s, #0\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "mov z28.s, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "mov z29.s, #0\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "mov z30.s, #0\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "mov z31.s, #0\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "b 2f\n"
+ "1:\n"
+ "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+ "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+ "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+ "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+ "ld1w z20.s, p0/z, [c_ptr1]\n"
+ "fmul z16.s, p7/m, z16.s, z15.s\n"
+ "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+ "fmul z17.s, p7/m, z17.s, z15.s\n"
+ "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+ "fmul z18.s, p7/m, z18.s, z15.s\n"
+ "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+ "fmul z19.s, p7/m, z19.s, z15.s\n"
+ "ld1w z24.s, p0/z, [c_ptr2]\n"
+ "fmul z20.s, p7/m, z20.s, z15.s\n"
+ "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
+ "fmul z21.s, p7/m, z21.s, z15.s\n"
+ "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
+ "fmul z22.s, p7/m, z22.s, z15.s\n"
+ "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
+ "fmul z23.s, p7/m, z23.s, z15.s\n"
+ "ld1w z28.s, p0/z, [c_ptr3]\n"
+ "fmul z24.s, p7/m, z24.s, z15.s\n"
+ "ld1w z29.s, p1/z, [c_ptr3, #1, MUL VL]\n"
+ "fmul z25.s, p7/m, z25.s, z15.s\n"
+ "ld1w z30.s, p2/z, [c_ptr3, #2, MUL VL]\n"
+ "fmul z26.s, p7/m, z26.s, z15.s\n"
+ "ld1w z31.s, p3/z, [c_ptr3, #3, MUL VL]\n"
+ "fmul z27.s, p7/m, z27.s, z15.s\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "fmul z28.s, p7/m, z28.s, z15.s\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "fmul z29.s, p7/m, z29.s, z15.s\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "fmul z30.s, p7/m, z30.s, z15.s\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmul z31.s, p7/m, z31.s, z15.s\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "2:\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 3f\n"
+ "4:\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+ "fmla z20.s, z8.s, z1.s[0]\n"
+ "ld1rqw z5.s, p7/z, [a_ptr1]\n"
+ "fmla z24.s, z8.s, z2.s[0]\n"
+ "ld1rqw z6.s, p7/z, [a_ptr2]\n"
+ "fmla z28.s, z8.s, z3.s[0]\n"
+ "ld1rqw z7.s, p7/z, [a_ptr3]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "fmla z21.s, z9.s, z1.s[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "fmla z25.s, z9.s, z2.s[0]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ "fmla z29.s, z9.s, z3.s[0]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
+ "fmla z22.s, z10.s, z1.s[0]\n"
+ "add a_ptr3, a_ptr3, #0x20\n"
+ "fmla z26.s, z10.s, z2.s[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "fmla z30.s, z10.s, z3.s[0]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "fmla z23.s, z11.s, z1.s[0]\n"
+ "fmla z27.s, z11.s, z2.s[0]\n"
+ "fmla z31.s, z11.s, z3.s[0]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "fmla z20.s, z12.s, z1.s[1]\n"
+ "fmla z24.s, z12.s, z2.s[1]\n"
+ "fmla z28.s, z12.s, z3.s[1]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "fmla z21.s, z13.s, z1.s[1]\n"
+ "fmla z25.s, z13.s, z2.s[1]\n"
+ "fmla z29.s, z13.s, z3.s[1]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "fmla z22.s, z14.s, z1.s[1]\n"
+ "fmla z26.s, z14.s, z2.s[1]\n"
+ "fmla z30.s, z14.s, z3.s[1]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "fmla z23.s, z15.s, z1.s[1]\n"
+ "fmla z27.s, z15.s, z2.s[1]\n"
+ "fmla z31.s, z15.s, z3.s[1]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z20.s, z8.s, z1.s[2]\n"
+ "fmla z24.s, z8.s, z2.s[2]\n"
+ "fmla z28.s, z8.s, z3.s[2]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "fmla z21.s, z9.s, z1.s[2]\n"
+ "fmla z25.s, z9.s, z2.s[2]\n"
+ "fmla z29.s, z9.s, z3.s[2]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "fmla z22.s, z10.s, z1.s[2]\n"
+ "fmla z26.s, z10.s, z2.s[2]\n"
+ "fmla z30.s, z10.s, z3.s[2]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "fmla z23.s, z11.s, z1.s[2]\n"
+ "fmla z27.s, z11.s, z2.s[2]\n"
+ "fmla z31.s, z11.s, z3.s[2]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[3]\n"
+ "fmla z20.s, z12.s, z1.s[3]\n"
+ "fmla z24.s, z12.s, z2.s[3]\n"
+ "fmla z28.s, z12.s, z3.s[3]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z17.s, z13.s, z0.s[3]\n"
+ "fmla z21.s, z13.s, z1.s[3]\n"
+ "fmla z25.s, z13.s, z2.s[3]\n"
+ "fmla z29.s, z13.s, z3.s[3]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z18.s, z14.s, z0.s[3]\n"
+ "fmla z22.s, z14.s, z1.s[3]\n"
+ "fmla z26.s, z14.s, z2.s[3]\n"
+ "fmla z30.s, z14.s, z3.s[3]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n"
+ "fmla z23.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n"
+ "fmla z27.s, z15.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #-0x10]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[0]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #-0x10]\n"
+ "fmla z20.s, z8.s, z5.s[0]\n"
+ "fmla z24.s, z8.s, z6.s[0]\n"
+ "fmla z28.s, z8.s, z7.s[0]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "fmla z17.s, z9.s, z4.s[0]\n"
+ "fmla z21.s, z9.s, z5.s[0]\n"
+ "fmla z25.s, z9.s, z6.s[0]\n"
+ "fmla z29.s, z9.s, z7.s[0]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z10.s, z4.s[0]\n"
+ "fmla z22.s, z10.s, z5.s[0]\n"
+ "fmla z26.s, z10.s, z6.s[0]\n"
+ "fmla z30.s, z10.s, z7.s[0]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z11.s, z4.s[0]\n"
+ "fmla z23.s, z11.s, z5.s[0]\n"
+ "fmla z27.s, z11.s, z6.s[0]\n"
+ "fmla z31.s, z11.s, z7.s[0]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z4.s[1]\n"
+ "fmla z20.s, z12.s, z5.s[1]\n"
+ "fmla z24.s, z12.s, z6.s[1]\n"
+ "fmla z28.s, z12.s, z7.s[1]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.s, z13.s, z4.s[1]\n"
+ "fmla z21.s, z13.s, z5.s[1]\n"
+ "fmla z25.s, z13.s, z6.s[1]\n"
+ "fmla z29.s, z13.s, z7.s[1]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.s, z14.s, z4.s[1]\n"
+ "fmla z22.s, z14.s, z5.s[1]\n"
+ "fmla z26.s, z14.s, z6.s[1]\n"
+ "fmla z30.s, z14.s, z7.s[1]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.s, z15.s, z4.s[1]\n"
+ "fmla z23.s, z15.s, z5.s[1]\n"
+ "fmla z27.s, z15.s, z6.s[1]\n"
+ "fmla z31.s, z15.s, z7.s[1]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z20.s, z8.s, z5.s[2]\n"
+ "fmla z24.s, z8.s, z6.s[2]\n"
+ "fmla z28.s, z8.s, z7.s[2]\n"
+ "fmla z17.s, z9.s, z4.s[2]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "fmla z21.s, z9.s, z5.s[2]\n"
+ "fmla z25.s, z9.s, z6.s[2]\n"
+ "fmla z29.s, z9.s, z7.s[2]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "fmla z18.s, z10.s, z4.s[2]\n"
+ "fmla z22.s, z10.s, z5.s[2]\n"
+ "fmla z26.s, z10.s, z6.s[2]\n"
+ "fmla z30.s, z10.s, z7.s[2]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "fmla z19.s, z11.s, z4.s[2]\n"
+ "fmla z23.s, z11.s, z5.s[2]\n"
+ "fmla z27.s, z11.s, z6.s[2]\n"
+ "fmla z31.s, z11.s, z7.s[2]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.s, z12.s, z4.s[3]\n"
+ "fmla z20.s, z12.s, z5.s[3]\n"
+ "fmla z24.s, z12.s, z6.s[3]\n"
+ "fmla z28.s, z12.s, z7.s[3]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z17.s, z13.s, z4.s[3]\n"
+ "fmla z21.s, z13.s, z5.s[3]\n"
+ "fmla z25.s, z13.s, z6.s[3]\n"
+ "fmla z29.s, z13.s, z7.s[3]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z18.s, z14.s, z4.s[3]\n"
+ "fmla z22.s, z14.s, z5.s[3]\n"
+ "fmla z26.s, z14.s, z6.s[3]\n"
+ "fmla z30.s, z14.s, z7.s[3]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z19.s, z15.s, z4.s[3]\n"
+ "fmla z23.s, z15.s, z5.s[3]\n"
+ "fmla z27.s, z15.s, z6.s[3]\n"
+ "fmla z31.s, z15.s, z7.s[3]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "b.ne 4b\n"
+ "3:\n"
+ "cbz %[regs], 5f\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+ "fmla z20.s, z8.s, z1.s[0]\n"
+ "ld1rqw z5.s, p7/z, [a_ptr1]\n"
+ "fmla z24.s, z8.s, z2.s[0]\n"
+ "ld1rqw z6.s, p7/z, [a_ptr2]\n"
+ "fmla z28.s, z8.s, z3.s[0]\n"
+ "ld1rqw z7.s, p7/z, [a_ptr3]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "fmla z21.s, z9.s, z1.s[0]\n"
+ "fmla z25.s, z9.s, z2.s[0]\n"
+ "fmla z29.s, z9.s, z3.s[0]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "fmla z22.s, z10.s, z1.s[0]\n"
+ "fmla z26.s, z10.s, z2.s[0]\n"
+ "fmla z30.s, z10.s, z3.s[0]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "fmla z23.s, z11.s, z1.s[0]\n"
+ "fmla z27.s, z11.s, z2.s[0]\n"
+ "fmla z31.s, z11.s, z3.s[0]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "fmla z20.s, z12.s, z1.s[1]\n"
+ "fmla z24.s, z12.s, z2.s[1]\n"
+ "fmla z28.s, z12.s, z3.s[1]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "fmla z21.s, z13.s, z1.s[1]\n"
+ "fmla z25.s, z13.s, z2.s[1]\n"
+ "fmla z29.s, z13.s, z3.s[1]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "fmla z22.s, z14.s, z1.s[1]\n"
+ "fmla z26.s, z14.s, z2.s[1]\n"
+ "fmla z30.s, z14.s, z3.s[1]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "fmla z23.s, z15.s, z1.s[1]\n"
+ "fmla z27.s, z15.s, z2.s[1]\n"
+ "fmla z31.s, z15.s, z3.s[1]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z20.s, z8.s, z1.s[2]\n"
+ "fmla z24.s, z8.s, z2.s[2]\n"
+ "fmla z28.s, z8.s, z3.s[2]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "fmla z21.s, z9.s, z1.s[2]\n"
+ "fmla z25.s, z9.s, z2.s[2]\n"
+ "fmla z29.s, z9.s, z3.s[2]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "fmla z22.s, z10.s, z1.s[2]\n"
+ "fmla z26.s, z10.s, z2.s[2]\n"
+ "fmla z30.s, z10.s, z3.s[2]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "fmla z23.s, z11.s, z1.s[2]\n"
+ "fmla z27.s, z11.s, z2.s[2]\n"
+ "fmla z31.s, z11.s, z3.s[2]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[3]\n"
+ "fmla z20.s, z12.s, z1.s[3]\n"
+ "fmla z24.s, z12.s, z2.s[3]\n"
+ "fmla z28.s, z12.s, z3.s[3]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z17.s, z13.s, z0.s[3]\n"
+ "fmla z21.s, z13.s, z1.s[3]\n"
+ "fmla z25.s, z13.s, z2.s[3]\n"
+ "fmla z29.s, z13.s, z3.s[3]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z18.s, z14.s, z0.s[3]\n"
+ "fmla z22.s, z14.s, z1.s[3]\n"
+ "fmla z26.s, z14.s, z2.s[3]\n"
+ "fmla z30.s, z14.s, z3.s[3]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
+ "fmla z23.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
+ "fmla z27.s, z15.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[0]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n"
+ "fmla z20.s, z8.s, z5.s[0]\n"
+ "fmla z24.s, z8.s, z6.s[0]\n"
+ "fmla z28.s, z8.s, z7.s[0]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "fmla z17.s, z9.s, z4.s[0]\n"
+ "fmla z21.s, z9.s, z5.s[0]\n"
+ "fmla z25.s, z9.s, z6.s[0]\n"
+ "fmla z29.s, z9.s, z7.s[0]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z10.s, z4.s[0]\n"
+ "fmla z22.s, z10.s, z5.s[0]\n"
+ "fmla z26.s, z10.s, z6.s[0]\n"
+ "fmla z30.s, z10.s, z7.s[0]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z11.s, z4.s[0]\n"
+ "fmla z23.s, z11.s, z5.s[0]\n"
+ "fmla z27.s, z11.s, z6.s[0]\n"
+ "fmla z31.s, z11.s, z7.s[0]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z4.s[1]\n"
+ "fmla z20.s, z12.s, z5.s[1]\n"
+ "fmla z24.s, z12.s, z6.s[1]\n"
+ "fmla z28.s, z12.s, z7.s[1]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.s, z13.s, z4.s[1]\n"
+ "fmla z21.s, z13.s, z5.s[1]\n"
+ "fmla z25.s, z13.s, z6.s[1]\n"
+ "fmla z29.s, z13.s, z7.s[1]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.s, z14.s, z4.s[1]\n"
+ "fmla z22.s, z14.s, z5.s[1]\n"
+ "fmla z26.s, z14.s, z6.s[1]\n"
+ "fmla z30.s, z14.s, z7.s[1]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.s, z15.s, z4.s[1]\n"
+ "fmla z23.s, z15.s, z5.s[1]\n"
+ "fmla z27.s, z15.s, z6.s[1]\n"
+ "fmla z31.s, z15.s, z7.s[1]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[2]\n"
+ "fmla z20.s, z8.s, z5.s[2]\n"
+ "fmla z24.s, z8.s, z6.s[2]\n"
+ "fmla z28.s, z8.s, z7.s[2]\n"
+ "fmla z17.s, z9.s, z4.s[2]\n"
+ "fmla z21.s, z9.s, z5.s[2]\n"
+ "fmla z25.s, z9.s, z6.s[2]\n"
+ "fmla z29.s, z9.s, z7.s[2]\n"
+ "fmla z18.s, z10.s, z4.s[2]\n"
+ "fmla z22.s, z10.s, z5.s[2]\n"
+ "fmla z26.s, z10.s, z6.s[2]\n"
+ "fmla z30.s, z10.s, z7.s[2]\n"
+ "fmla z19.s, z11.s, z4.s[2]\n"
+ "fmla z23.s, z11.s, z5.s[2]\n"
+ "fmla z27.s, z11.s, z6.s[2]\n"
+ "fmla z31.s, z11.s, z7.s[2]\n"
+ "fmla z16.s, z12.s, z4.s[3]\n"
+ "fmla z20.s, z12.s, z5.s[3]\n"
+ "fmla z24.s, z12.s, z6.s[3]\n"
+ "fmla z28.s, z12.s, z7.s[3]\n"
+ "fmla z17.s, z13.s, z4.s[3]\n"
+ "fmla z21.s, z13.s, z5.s[3]\n"
+ "fmla z25.s, z13.s, z6.s[3]\n"
+ "fmla z29.s, z13.s, z7.s[3]\n"
+ "fmla z18.s, z14.s, z4.s[3]\n"
+ "fmla z22.s, z14.s, z5.s[3]\n"
+ "fmla z26.s, z14.s, z6.s[3]\n"
+ "fmla z30.s, z14.s, z7.s[3]\n"
+ "fmla z19.s, z15.s, z4.s[3]\n"
+ "fmla z23.s, z15.s, z5.s[3]\n"
+ "fmla z27.s, z15.s, z6.s[3]\n"
+ "fmla z31.s, z15.s, z7.s[3]\n"
+ "cbz %[blocks], 6f\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "fmla z20.s, z8.s, z1.s[0]\n"
+ "fmla z24.s, z8.s, z2.s[0]\n"
+ "fmla z28.s, z8.s, z3.s[0]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "fmla z21.s, z9.s, z1.s[0]\n"
+ "fmla z25.s, z9.s, z2.s[0]\n"
+ "fmla z29.s, z9.s, z3.s[0]\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "fmla z22.s, z10.s, z1.s[0]\n"
+ "fmla z26.s, z10.s, z2.s[0]\n"
+ "fmla z30.s, z10.s, z3.s[0]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "fmla z23.s, z11.s, z1.s[0]\n"
+ "fmla z27.s, z11.s, z2.s[0]\n"
+ "fmla z31.s, z11.s, z3.s[0]\n"
+ "b.eq 6f\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "fmla z20.s, z12.s, z1.s[1]\n"
+ "fmla z24.s, z12.s, z2.s[1]\n"
+ "fmla z28.s, z12.s, z3.s[1]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "fmla z21.s, z13.s, z1.s[1]\n"
+ "fmla z25.s, z13.s, z2.s[1]\n"
+ "fmla z29.s, z13.s, z3.s[1]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "fmla z22.s, z14.s, z1.s[1]\n"
+ "fmla z26.s, z14.s, z2.s[1]\n"
+ "fmla z30.s, z14.s, z3.s[1]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "fmla z23.s, z15.s, z1.s[1]\n"
+ "fmla z27.s, z15.s, z2.s[1]\n"
+ "fmla z31.s, z15.s, z3.s[1]\n"
+ "b.eq 6f\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "fmla z20.s, z8.s, z1.s[2]\n"
+ "fmla z24.s, z8.s, z2.s[2]\n"
+ "fmla z28.s, z8.s, z3.s[2]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "fmla z21.s, z9.s, z1.s[2]\n"
+ "fmla z25.s, z9.s, z2.s[2]\n"
+ "fmla z29.s, z9.s, z3.s[2]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "fmla z22.s, z10.s, z1.s[2]\n"
+ "fmla z26.s, z10.s, z2.s[2]\n"
+ "fmla z30.s, z10.s, z3.s[2]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "fmla z23.s, z11.s, z1.s[2]\n"
+ "fmla z27.s, z11.s, z2.s[2]\n"
+ "fmla z31.s, z11.s, z3.s[2]\n"
+ "b 6f\n"
+ "5:\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
+ "fmla z20.s, z8.s, z1.s[0]\n"
+ "ld1rqw z5.s, p6/z, [a_ptr1]\n"
+ "fmla z24.s, z8.s, z2.s[0]\n"
+ "ld1rqw z6.s, p6/z, [a_ptr2]\n"
+ "fmla z28.s, z8.s, z3.s[0]\n"
+ "ld1rqw z7.s, p6/z, [a_ptr3]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "fmla z21.s, z9.s, z1.s[0]\n"
+ "fmla z25.s, z9.s, z2.s[0]\n"
+ "fmla z29.s, z9.s, z3.s[0]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "fmla z22.s, z10.s, z1.s[0]\n"
+ "fmla z26.s, z10.s, z2.s[0]\n"
+ "fmla z30.s, z10.s, z3.s[0]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "fmla z23.s, z11.s, z1.s[0]\n"
+ "fmla z27.s, z11.s, z2.s[0]\n"
+ "fmla z31.s, z11.s, z3.s[0]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "fmla z20.s, z12.s, z1.s[1]\n"
+ "fmla z24.s, z12.s, z2.s[1]\n"
+ "fmla z28.s, z12.s, z3.s[1]\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "fmla z21.s, z13.s, z1.s[1]\n"
+ "fmla z25.s, z13.s, z2.s[1]\n"
+ "fmla z29.s, z13.s, z3.s[1]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "fmla z22.s, z14.s, z1.s[1]\n"
+ "fmla z26.s, z14.s, z2.s[1]\n"
+ "fmla z30.s, z14.s, z3.s[1]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "fmla z23.s, z15.s, z1.s[1]\n"
+ "fmla z27.s, z15.s, z2.s[1]\n"
+ "fmla z31.s, z15.s, z3.s[1]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "fmla z20.s, z8.s, z1.s[2]\n"
+ "fmla z24.s, z8.s, z2.s[2]\n"
+ "fmla z28.s, z8.s, z3.s[2]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "fmla z21.s, z9.s, z1.s[2]\n"
+ "fmla z25.s, z9.s, z2.s[2]\n"
+ "fmla z29.s, z9.s, z3.s[2]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "fmla z22.s, z10.s, z1.s[2]\n"
+ "fmla z26.s, z10.s, z2.s[2]\n"
+ "fmla z30.s, z10.s, z3.s[2]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "fmla z23.s, z11.s, z1.s[2]\n"
+ "fmla z27.s, z11.s, z2.s[2]\n"
+ "fmla z31.s, z11.s, z3.s[2]\n"
+ "fmla z16.s, z12.s, z0.s[3]\n"
+ "fmla z20.s, z12.s, z1.s[3]\n"
+ "fmla z24.s, z12.s, z2.s[3]\n"
+ "fmla z28.s, z12.s, z3.s[3]\n"
+ "fmla z17.s, z13.s, z0.s[3]\n"
+ "fmla z21.s, z13.s, z1.s[3]\n"
+ "fmla z25.s, z13.s, z2.s[3]\n"
+ "fmla z29.s, z13.s, z3.s[3]\n"
+ "fmla z18.s, z14.s, z0.s[3]\n"
+ "fmla z22.s, z14.s, z1.s[3]\n"
+ "fmla z26.s, z14.s, z2.s[3]\n"
+ "fmla z30.s, z14.s, z3.s[3]\n"
+ "fmla z19.s, z15.s, z0.s[3]\n"
+ "fmla z23.s, z15.s, z1.s[3]\n"
+ "fmla z27.s, z15.s, z2.s[3]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "cbz %[blocks], 6f\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[0]\n"
+ "fmla z20.s, z8.s, z5.s[0]\n"
+ "fmla z24.s, z8.s, z6.s[0]\n"
+ "fmla z28.s, z8.s, z7.s[0]\n"
+ "fmla z17.s, z9.s, z4.s[0]\n"
+ "fmla z21.s, z9.s, z5.s[0]\n"
+ "fmla z25.s, z9.s, z6.s[0]\n"
+ "fmla z29.s, z9.s, z7.s[0]\n"
+ "fmla z18.s, z10.s, z4.s[0]\n"
+ "fmla z22.s, z10.s, z5.s[0]\n"
+ "fmla z26.s, z10.s, z6.s[0]\n"
+ "fmla z30.s, z10.s, z7.s[0]\n"
+ "fmla z19.s, z11.s, z4.s[0]\n"
+ "fmla z23.s, z11.s, z5.s[0]\n"
+ "fmla z27.s, z11.s, z6.s[0]\n"
+ "fmla z31.s, z11.s, z7.s[0]\n"
+ "b.eq 6f\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.s, z12.s, z4.s[1]\n"
+ "fmla z20.s, z12.s, z5.s[1]\n"
+ "fmla z24.s, z12.s, z6.s[1]\n"
+ "fmla z28.s, z12.s, z7.s[1]\n"
+ "fmla z17.s, z13.s, z4.s[1]\n"
+ "fmla z21.s, z13.s, z5.s[1]\n"
+ "fmla z25.s, z13.s, z6.s[1]\n"
+ "fmla z29.s, z13.s, z7.s[1]\n"
+ "fmla z18.s, z14.s, z4.s[1]\n"
+ "fmla z22.s, z14.s, z5.s[1]\n"
+ "fmla z26.s, z14.s, z6.s[1]\n"
+ "fmla z30.s, z14.s, z7.s[1]\n"
+ "fmla z19.s, z15.s, z4.s[1]\n"
+ "fmla z23.s, z15.s, z5.s[1]\n"
+ "fmla z27.s, z15.s, z6.s[1]\n"
+ "fmla z31.s, z15.s, z7.s[1]\n"
+ "b.eq 6f\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[2]\n"
+ "fmla z20.s, z8.s, z5.s[2]\n"
+ "fmla z24.s, z8.s, z6.s[2]\n"
+ "fmla z28.s, z8.s, z7.s[2]\n"
+ "fmla z17.s, z9.s, z4.s[2]\n"
+ "fmla z21.s, z9.s, z5.s[2]\n"
+ "fmla z25.s, z9.s, z6.s[2]\n"
+ "fmla z29.s, z9.s, z7.s[2]\n"
+ "fmla z18.s, z10.s, z4.s[2]\n"
+ "fmla z22.s, z10.s, z5.s[2]\n"
+ "fmla z26.s, z10.s, z6.s[2]\n"
+ "fmla z30.s, z10.s, z7.s[2]\n"
+ "fmla z19.s, z11.s, z4.s[2]\n"
+ "fmla z23.s, z11.s, z5.s[2]\n"
+ "fmla z27.s, z11.s, z6.s[2]\n"
+ "fmla z31.s, z11.s, z7.s[2]\n"
+ "6:\n"
+ "st1w z16.s, p0, [%[c_ptr0]]\n"
+ "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+ "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+ "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #4\n"
+ "st1w z20.s, p0, [c_ptr1]\n"
+ "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
+ "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
+ "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
+ "st1w z24.s, p0, [c_ptr2]\n"
+ "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n"
+ "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n"
+ "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n"
+ "st1w z28.s, p0, [c_ptr3]\n"
+ "st1w z29.s, p1, [c_ptr3, #1, MUL VL]\n"
+ "st1w z30.s, p2, [c_ptr3, #2, MUL VL]\n"
+ "st1w z31.s, p3, [c_ptr3, #3, MUL VL]\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
+ );
+ break;
+ }
+ }
+ }
+}
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
\ No newline at end of file
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8.hpp
index 3fd738e673..9d88b60cee 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 Arm Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -43,7 +43,7 @@ public:
/* Kernel blocking parameters */
static int out_width()
{
- return svcnth() * 3;
+ return get_vector_length<__fp16>() * 3;
}
static int out_height()
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8/generic.cpp
index 92ec888244..517895ca7f 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 Arm Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -48,24 +48,24 @@ void sve_interleaved_fp16_mla_3VLx8(const __fp16 *Apanel, const __fp16 *Bpanel,
"mov z8.h, #0\n"
"ptrue p0.h\n"
"mov z9.h, #0\n"
- "ld1rqh z0.h, p0/z, [%[a_ptr]]\n"
"mov z10.h, #0\n"
- "ld1h z2.h, p0/z, [%[b_ptr]]\n"
"mov z11.h, #0\n"
- "ld1h z3.h, p0/z, [%[b_ptr], #1, MUL VL]\n"
"mov z12.h, #0\n"
- "ld1h z4.h, p0/z, [%[b_ptr], #2, MUL VL]\n"
+ "ld1rqh z0.h, p0/z, [%[a_ptr]]\n"
"mov z13.h, #0\n"
- "ld1h z5.h, p0/z, [%[b_ptr], #3, MUL VL]\n"
+ "ld1h z2.h, p0/z, [%[b_ptr]]\n"
"mov z14.h, #0\n"
- "ld1h z6.h, p0/z, [%[b_ptr], #4, MUL VL]\n"
+ "ld1h z3.h, p0/z, [%[b_ptr], #1, MUL VL]\n"
"mov z15.h, #0\n"
- "add %[a_ptr], %[a_ptr], #0x20\n"
+ "ld1h z4.h, p0/z, [%[b_ptr], #2, MUL VL]\n"
"mov z16.h, #0\n"
- "addvl %[b_ptr], %[b_ptr], #6\n"
+ "ld1h z5.h, p0/z, [%[b_ptr], #3, MUL VL]\n"
"mov z17.h, #0\n"
+ "ld1h z6.h, p0/z, [%[b_ptr], #4, MUL VL]\n"
"mov z18.h, #0\n"
+ "add %[a_ptr], %[a_ptr], #0x20\n"
"mov z19.h, #0\n"
+ "addvl %[b_ptr], %[b_ptr], #6\n"
"mov z20.h, #0\n"
"mov z21.h, #0\n"
"mov z22.h, #0\n"
@@ -199,37 +199,31 @@ void sve_interleaved_fp16_mla_3VLx8(const __fp16 *Apanel, const __fp16 *Bpanel,
"fmla z30.h, z7.h, z1.h[6]\n"
"fmla z31.h, z7.h, z1.h[7]\n"
"fmla z8.h, z2.h, z0.h[0]\n"
- "st1h z8.h, p0, [%[c_ptr]]\n"
"fmla z9.h, z2.h, z0.h[1]\n"
"fmla z10.h, z2.h, z0.h[2]\n"
"fmla z11.h, z2.h, z0.h[3]\n"
"fmla z12.h, z2.h, z0.h[4]\n"
+ "st1h z8.h, p0, [%[c_ptr]]\n"
"fmla z13.h, z2.h, z0.h[5]\n"
"fmla z14.h, z2.h, z0.h[6]\n"
"fmla z15.h, z2.h, z0.h[7]\n"
"fmla z16.h, z3.h, z0.h[0]\n"
- "st1h z16.h, p0, [%[c_ptr], #1, MUL VL]\n"
"fmla z17.h, z3.h, z0.h[1]\n"
"fmla z18.h, z3.h, z0.h[2]\n"
"fmla z19.h, z3.h, z0.h[3]\n"
"fmla z20.h, z3.h, z0.h[4]\n"
+ "st1h z16.h, p0, [%[c_ptr], #1, MUL VL]\n"
"fmla z21.h, z3.h, z0.h[5]\n"
"fmla z22.h, z3.h, z0.h[6]\n"
"fmla z23.h, z3.h, z0.h[7]\n"
"fmla z24.h, z4.h, z0.h[0]\n"
- "st1h z24.h, p0, [%[c_ptr], #2, MUL VL]\n"
"fmla z25.h, z4.h, z0.h[1]\n"
- "st1h z9.h, p0, [%[c_ptr], #3, MUL VL]\n"
"fmla z26.h, z4.h, z0.h[2]\n"
- "st1h z17.h, p0, [%[c_ptr], #4, MUL VL]\n"
"fmla z27.h, z4.h, z0.h[3]\n"
- "st1h z25.h, p0, [%[c_ptr], #5, MUL VL]\n"
"fmla z28.h, z4.h, z0.h[4]\n"
- "st1h z10.h, p0, [%[c_ptr], #6, MUL VL]\n"
+ "st1h z24.h, p0, [%[c_ptr], #2, MUL VL]\n"
"fmla z29.h, z4.h, z0.h[5]\n"
- "st1h z18.h, p0, [%[c_ptr], #7, MUL VL]\n"
"fmla z30.h, z4.h, z0.h[6]\n"
- "addvl %[c_ptr], %[c_ptr], #16\n"
"fmla z31.h, z4.h, z0.h[7]\n"
"b 4f\n"
"3:\n"
@@ -260,39 +254,39 @@ void sve_interleaved_fp16_mla_3VLx8(const __fp16 *Apanel, const __fp16 *Bpanel,
"fmla z30.h, z4.h, z0.h[6]\n"
"fmla z31.h, z4.h, z0.h[7]\n"
"fmla z8.h, z5.h, z1.h[0]\n"
- "st1h z8.h, p0, [%[c_ptr]]\n"
"fmla z9.h, z5.h, z1.h[1]\n"
"fmla z10.h, z5.h, z1.h[2]\n"
"fmla z11.h, z5.h, z1.h[3]\n"
"fmla z12.h, z5.h, z1.h[4]\n"
+ "st1h z8.h, p0, [%[c_ptr]]\n"
"fmla z13.h, z5.h, z1.h[5]\n"
"fmla z14.h, z5.h, z1.h[6]\n"
"fmla z15.h, z5.h, z1.h[7]\n"
"fmla z16.h, z6.h, z1.h[0]\n"
- "st1h z16.h, p0, [%[c_ptr], #1, MUL VL]\n"
"fmla z17.h, z6.h, z1.h[1]\n"
"fmla z18.h, z6.h, z1.h[2]\n"
"fmla z19.h, z6.h, z1.h[3]\n"
"fmla z20.h, z6.h, z1.h[4]\n"
+ "st1h z16.h, p0, [%[c_ptr], #1, MUL VL]\n"
"fmla z21.h, z6.h, z1.h[5]\n"
"fmla z22.h, z6.h, z1.h[6]\n"
"fmla z23.h, z6.h, z1.h[7]\n"
"fmla z24.h, z7.h, z1.h[0]\n"
- "st1h z24.h, p0, [%[c_ptr], #2, MUL VL]\n"
"fmla z25.h, z7.h, z1.h[1]\n"
- "st1h z9.h, p0, [%[c_ptr], #3, MUL VL]\n"
"fmla z26.h, z7.h, z1.h[2]\n"
- "st1h z17.h, p0, [%[c_ptr], #4, MUL VL]\n"
"fmla z27.h, z7.h, z1.h[3]\n"
- "st1h z25.h, p0, [%[c_ptr], #5, MUL VL]\n"
"fmla z28.h, z7.h, z1.h[4]\n"
- "st1h z10.h, p0, [%[c_ptr], #6, MUL VL]\n"
+ "st1h z24.h, p0, [%[c_ptr], #2, MUL VL]\n"
"fmla z29.h, z7.h, z1.h[5]\n"
- "st1h z18.h, p0, [%[c_ptr], #7, MUL VL]\n"
"fmla z30.h, z7.h, z1.h[6]\n"
- "addvl %[c_ptr], %[c_ptr], #16\n"
"fmla z31.h, z7.h, z1.h[7]\n"
"4:\n"
+ "st1h z9.h, p0, [%[c_ptr], #3, MUL VL]\n"
+ "st1h z17.h, p0, [%[c_ptr], #4, MUL VL]\n"
+ "st1h z25.h, p0, [%[c_ptr], #5, MUL VL]\n"
+ "st1h z10.h, p0, [%[c_ptr], #6, MUL VL]\n"
+ "st1h z18.h, p0, [%[c_ptr], #7, MUL VL]\n"
+ "addvl %[c_ptr], %[c_ptr], #16\n"
"st1h z26.h, p0, [%[c_ptr], #-8, MUL VL]\n"
"st1h z11.h, p0, [%[c_ptr], #-7, MUL VL]\n"
"st1h z19.h, p0, [%[c_ptr], #-6, MUL VL]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8.hpp
index b2327f3070..2e8f261fe1 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 Arm Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -43,7 +43,7 @@ public:
/* Kernel blocking parameters */
static int out_width()
{
- return svcntw() * 3;
+ return get_vector_length<float>() * 3;
}
static int out_height()
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8/generic.cpp
index bb08fc7cb0..88c984018e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 Arm Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -48,22 +48,22 @@ void sve_interleaved_fp32_mla_3VLx8(const float *Apanel, const float *Bpanel, fl
"mov z8.s, #0\n"
"ptrue p0.s\n"
"mov z9.s, #0\n"
- "ld1rqw z0.s, p0/z, [%[a_ptr]]\n"
"mov z10.s, #0\n"
- "ld1w z4.s, p0/z, [%[b_ptr]]\n"
"mov z11.s, #0\n"
- "ld1rqw z1.s, p0/z, [%[a_ptr], #0x10]\n"
"mov z12.s, #0\n"
- "ld1w z5.s, p0/z, [%[b_ptr], #1, MUL VL]\n"
+ "ld1rqw z0.s, p0/z, [%[a_ptr]]\n"
"mov z13.s, #0\n"
- "ld1rqw z2.s, p0/z, [%[a_ptr], #0x20]\n"
+ "ld1w z4.s, p0/z, [%[b_ptr]]\n"
"mov z14.s, #0\n"
- "add %[a_ptr], %[a_ptr], #0x40\n"
+ "ld1rqw z1.s, p0/z, [%[a_ptr], #0x10]\n"
"mov z15.s, #0\n"
- "addvl %[b_ptr], %[b_ptr], #3\n"
+ "ld1w z5.s, p0/z, [%[b_ptr], #1, MUL VL]\n"
"mov z16.s, #0\n"
+ "ld1rqw z2.s, p0/z, [%[a_ptr], #0x20]\n"
"mov z17.s, #0\n"
+ "add %[a_ptr], %[a_ptr], #0x40\n"
"mov z18.s, #0\n"
+ "addvl %[b_ptr], %[b_ptr], #3\n"
"mov z19.s, #0\n"
"mov z20.s, #0\n"
"mov z21.s, #0\n"
@@ -204,37 +204,31 @@ void sve_interleaved_fp32_mla_3VLx8(const float *Apanel, const float *Bpanel, fl
"fmla z31.s, z6.s, z3.s[3]\n"
"ld1w z6.s, p0/z, [%[b_ptr], #-1, MUL VL]\n"
"fmla z8.s, z4.s, z0.s[0]\n"
- "st1w z8.s, p0, [%[c_ptr]]\n"
"fmla z9.s, z4.s, z0.s[1]\n"
"fmla z10.s, z4.s, z0.s[2]\n"
"fmla z11.s, z4.s, z0.s[3]\n"
"fmla z20.s, z4.s, z1.s[0]\n"
+ "st1w z8.s, p0, [%[c_ptr]]\n"
"fmla z21.s, z4.s, z1.s[1]\n"
"fmla z22.s, z4.s, z1.s[2]\n"
"fmla z23.s, z4.s, z1.s[3]\n"
"fmla z12.s, z5.s, z0.s[0]\n"
- "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n"
"fmla z13.s, z5.s, z0.s[1]\n"
"fmla z14.s, z5.s, z0.s[2]\n"
"fmla z15.s, z5.s, z0.s[3]\n"
"fmla z24.s, z5.s, z1.s[0]\n"
+ "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n"
"fmla z25.s, z5.s, z1.s[1]\n"
"fmla z26.s, z5.s, z1.s[2]\n"
"fmla z27.s, z5.s, z1.s[3]\n"
"fmla z16.s, z6.s, z0.s[0]\n"
- "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n"
"fmla z17.s, z6.s, z0.s[1]\n"
- "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n"
"fmla z18.s, z6.s, z0.s[2]\n"
- "st1w z13.s, p0, [%[c_ptr], #4, MUL VL]\n"
"fmla z19.s, z6.s, z0.s[3]\n"
- "st1w z17.s, p0, [%[c_ptr], #5, MUL VL]\n"
"fmla z28.s, z6.s, z1.s[0]\n"
- "st1w z10.s, p0, [%[c_ptr], #6, MUL VL]\n"
+ "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n"
"fmla z29.s, z6.s, z1.s[1]\n"
- "st1w z14.s, p0, [%[c_ptr], #7, MUL VL]\n"
"fmla z30.s, z6.s, z1.s[2]\n"
- "addvl %[c_ptr], %[c_ptr], #16\n"
"fmla z31.s, z6.s, z1.s[3]\n"
"b 4f\n"
"3:\n"
@@ -269,39 +263,39 @@ void sve_interleaved_fp32_mla_3VLx8(const float *Apanel, const float *Bpanel, fl
"fmla z31.s, z6.s, z1.s[3]\n"
"ld1w z6.s, p0/z, [%[b_ptr], #-1, MUL VL]\n"
"fmla z8.s, z4.s, z2.s[0]\n"
- "st1w z8.s, p0, [%[c_ptr]]\n"
"fmla z9.s, z4.s, z2.s[1]\n"
"fmla z10.s, z4.s, z2.s[2]\n"
"fmla z11.s, z4.s, z2.s[3]\n"
"fmla z20.s, z4.s, z3.s[0]\n"
+ "st1w z8.s, p0, [%[c_ptr]]\n"
"fmla z21.s, z4.s, z3.s[1]\n"
"fmla z22.s, z4.s, z3.s[2]\n"
"fmla z23.s, z4.s, z3.s[3]\n"
"fmla z12.s, z5.s, z2.s[0]\n"
- "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n"
"fmla z13.s, z5.s, z2.s[1]\n"
"fmla z14.s, z5.s, z2.s[2]\n"
"fmla z15.s, z5.s, z2.s[3]\n"
"fmla z24.s, z5.s, z3.s[0]\n"
+ "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n"
"fmla z25.s, z5.s, z3.s[1]\n"
"fmla z26.s, z5.s, z3.s[2]\n"
"fmla z27.s, z5.s, z3.s[3]\n"
"fmla z16.s, z6.s, z2.s[0]\n"
- "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n"
"fmla z17.s, z6.s, z2.s[1]\n"
- "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n"
"fmla z18.s, z6.s, z2.s[2]\n"
- "st1w z13.s, p0, [%[c_ptr], #4, MUL VL]\n"
"fmla z19.s, z6.s, z2.s[3]\n"
- "st1w z17.s, p0, [%[c_ptr], #5, MUL VL]\n"
"fmla z28.s, z6.s, z3.s[0]\n"
- "st1w z10.s, p0, [%[c_ptr], #6, MUL VL]\n"
+ "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n"
"fmla z29.s, z6.s, z3.s[1]\n"
- "st1w z14.s, p0, [%[c_ptr], #7, MUL VL]\n"
"fmla z30.s, z6.s, z3.s[2]\n"
- "addvl %[c_ptr], %[c_ptr], #16\n"
"fmla z31.s, z6.s, z3.s[3]\n"
"4:\n"
+ "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n"
+ "st1w z13.s, p0, [%[c_ptr], #4, MUL VL]\n"
+ "st1w z17.s, p0, [%[c_ptr], #5, MUL VL]\n"
+ "st1w z10.s, p0, [%[c_ptr], #6, MUL VL]\n"
+ "st1w z14.s, p0, [%[c_ptr], #7, MUL VL]\n"
+ "addvl %[c_ptr], %[c_ptr], #16\n"
"st1w z18.s, p0, [%[c_ptr], #-8, MUL VL]\n"
"st1w z11.s, p0, [%[c_ptr], #-7, MUL VL]\n"
"st1w z15.s, p0, [%[c_ptr], #-6, MUL VL]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8.hpp
index 91aa567d4a..67154e6a3f 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 Arm Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -43,7 +43,7 @@ public:
/* Kernel blocking parameters */
static int out_width()
{
- return svcntw() * 3;
+ return get_vector_length<int32_t>() * 3;
}
static int out_height()
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8/generic.cpp
index 2e994a13f3..d679c211ef 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 Arm Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -49,22 +49,22 @@ void sve_interleaved_s8s32_dot_3VLx8(const int8_t *Apanel, const int8_t *Bpanel,
"mov z8.s, #0\n"
"ptrue p0.b\n"
"mov z9.s, #0\n"
- "ld1rqb z0.b, p0/z, [%[a_ptr]]\n"
"mov z10.s, #0\n"
- "ld1b z4.b, p0/z, [%[b_ptr]]\n"
"mov z11.s, #0\n"
- "ld1rqb z1.b, p0/z, [%[a_ptr], #0x10]\n"
"mov z12.s, #0\n"
- "ld1b z5.b, p0/z, [%[b_ptr], #1, MUL VL]\n"
+ "ld1rqb z0.b, p0/z, [%[a_ptr]]\n"
"mov z13.s, #0\n"
- "ld1rqb z2.b, p0/z, [%[a_ptr], #0x20]\n"
+ "ld1b z4.b, p0/z, [%[b_ptr]]\n"
"mov z14.s, #0\n"
- "add %[a_ptr], %[a_ptr], #0x40\n"
+ "ld1rqb z1.b, p0/z, [%[a_ptr], #0x10]\n"
"mov z15.s, #0\n"
- "addvl %[b_ptr], %[b_ptr], #3\n"
+ "ld1b z5.b, p0/z, [%[b_ptr], #1, MUL VL]\n"
"mov z16.s, #0\n"
+ "ld1rqb z2.b, p0/z, [%[a_ptr], #0x20]\n"
"mov z17.s, #0\n"
+ "add %[a_ptr], %[a_ptr], #0x40\n"
"mov z18.s, #0\n"
+ "addvl %[b_ptr], %[b_ptr], #3\n"
"mov z19.s, #0\n"
"mov z20.s, #0\n"
"mov z21.s, #0\n"
@@ -205,37 +205,31 @@ void sve_interleaved_s8s32_dot_3VLx8(const int8_t *Apanel, const int8_t *Bpanel,
"sdot z31.s, z6.b, z3.b[3]\n"
"ld1b z6.b, p0/z, [%[b_ptr], #-1, MUL VL]\n"
"sdot z8.s, z4.b, z0.b[0]\n"
- "st1w z8.s, p0, [%[c_ptr]]\n"
"sdot z9.s, z4.b, z0.b[1]\n"
"sdot z10.s, z4.b, z0.b[2]\n"
"sdot z11.s, z4.b, z0.b[3]\n"
"sdot z20.s, z4.b, z1.b[0]\n"
+ "st1w z8.s, p0, [%[c_ptr]]\n"
"sdot z21.s, z4.b, z1.b[1]\n"
"sdot z22.s, z4.b, z1.b[2]\n"
"sdot z23.s, z4.b, z1.b[3]\n"
"sdot z12.s, z5.b, z0.b[0]\n"
- "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n"
"sdot z13.s, z5.b, z0.b[1]\n"
"sdot z14.s, z5.b, z0.b[2]\n"
"sdot z15.s, z5.b, z0.b[3]\n"
"sdot z24.s, z5.b, z1.b[0]\n"
+ "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n"
"sdot z25.s, z5.b, z1.b[1]\n"
"sdot z26.s, z5.b, z1.b[2]\n"
"sdot z27.s, z5.b, z1.b[3]\n"
"sdot z16.s, z6.b, z0.b[0]\n"
- "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n"
"sdot z17.s, z6.b, z0.b[1]\n"
- "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n"
"sdot z18.s, z6.b, z0.b[2]\n"
- "st1w z13.s, p0, [%[c_ptr], #4, MUL VL]\n"
"sdot z19.s, z6.b, z0.b[3]\n"
- "st1w z17.s, p0, [%[c_ptr], #5, MUL VL]\n"
"sdot z28.s, z6.b, z1.b[0]\n"
- "st1w z10.s, p0, [%[c_ptr], #6, MUL VL]\n"
+ "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n"
"sdot z29.s, z6.b, z1.b[1]\n"
- "st1w z14.s, p0, [%[c_ptr], #7, MUL VL]\n"
"sdot z30.s, z6.b, z1.b[2]\n"
- "addvl %[c_ptr], %[c_ptr], #16\n"
"sdot z31.s, z6.b, z1.b[3]\n"
"b 4f\n"
"3:\n"
@@ -270,39 +264,39 @@ void sve_interleaved_s8s32_dot_3VLx8(const int8_t *Apanel, const int8_t *Bpanel,
"sdot z31.s, z6.b, z1.b[3]\n"
"ld1b z6.b, p0/z, [%[b_ptr], #-1, MUL VL]\n"
"sdot z8.s, z4.b, z2.b[0]\n"
- "st1w z8.s, p0, [%[c_ptr]]\n"
"sdot z9.s, z4.b, z2.b[1]\n"
"sdot z10.s, z4.b, z2.b[2]\n"
"sdot z11.s, z4.b, z2.b[3]\n"
"sdot z20.s, z4.b, z3.b[0]\n"
+ "st1w z8.s, p0, [%[c_ptr]]\n"
"sdot z21.s, z4.b, z3.b[1]\n"
"sdot z22.s, z4.b, z3.b[2]\n"
"sdot z23.s, z4.b, z3.b[3]\n"
"sdot z12.s, z5.b, z2.b[0]\n"
- "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n"
"sdot z13.s, z5.b, z2.b[1]\n"
"sdot z14.s, z5.b, z2.b[2]\n"
"sdot z15.s, z5.b, z2.b[3]\n"
"sdot z24.s, z5.b, z3.b[0]\n"
+ "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n"
"sdot z25.s, z5.b, z3.b[1]\n"
"sdot z26.s, z5.b, z3.b[2]\n"
"sdot z27.s, z5.b, z3.b[3]\n"
"sdot z16.s, z6.b, z2.b[0]\n"
- "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n"
"sdot z17.s, z6.b, z2.b[1]\n"
- "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n"
"sdot z18.s, z6.b, z2.b[2]\n"
- "st1w z13.s, p0, [%[c_ptr], #4, MUL VL]\n"
"sdot z19.s, z6.b, z2.b[3]\n"
- "st1w z17.s, p0, [%[c_ptr], #5, MUL VL]\n"
"sdot z28.s, z6.b, z3.b[0]\n"
- "st1w z10.s, p0, [%[c_ptr], #6, MUL VL]\n"
+ "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n"
"sdot z29.s, z6.b, z3.b[1]\n"
- "st1w z14.s, p0, [%[c_ptr], #7, MUL VL]\n"
"sdot z30.s, z6.b, z3.b[2]\n"
- "addvl %[c_ptr], %[c_ptr], #16\n"
"sdot z31.s, z6.b, z3.b[3]\n"
"4:\n"
+ "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n"
+ "st1w z13.s, p0, [%[c_ptr], #4, MUL VL]\n"
+ "st1w z17.s, p0, [%[c_ptr], #5, MUL VL]\n"
+ "st1w z10.s, p0, [%[c_ptr], #6, MUL VL]\n"
+ "st1w z14.s, p0, [%[c_ptr], #7, MUL VL]\n"
+ "addvl %[c_ptr], %[c_ptr], #16\n"
"st1w z18.s, p0, [%[c_ptr], #-8, MUL VL]\n"
"st1w z11.s, p0, [%[c_ptr], #-7, MUL VL]\n"
"st1w z15.s, p0, [%[c_ptr], #-6, MUL VL]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8.hpp
index ef457e454f..628c5a868e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 Arm Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -43,7 +43,7 @@ public:
/* Kernel blocking parameters */
static int out_width()
{
- return svcntw() * 3;
+ return get_vector_length<uint32_t>() * 3;
}
static int out_height()
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4.hpp
new file mode 100644
index 0000000000..fcc80d9fe5
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4.hpp
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __ARM_FEATURE_SVE
+
+
+
+namespace arm_gemm
+{
+
+// Actual kernel implementations
+void sve_native_fp32_mla_4VLx4(const float *, int, const float *, int ldb, float *, int, float, int, int, int);
+
+class native_fp32_mla_4VLx4 // Descriptor for the fp32 "native" SVE kernel: bundles its operand/result types, blocking parameters and entry point.
+{
+public:
+ typedef float operand_type; // Element type of the input matrices.
+ typedef float result_type; // Element type of the output matrix.
+
+ typedef void (*kern_type)(const float *, int, const float *, int ldb, float *, int, float, int, int, int); // Arguments in order, as named by the definition in generic.cpp: A, lda, B, ldb, C, ldc, beta, M, N, K.
+
+ /* Kernel blocking parameters: the output tile shape the kernel processes per step. */
+ static int out_height() // Output rows per block; generic.cpp advances its row loop by the same 4.
+ {
+ return 4;
+ }
+
+ static int out_width() // Output columns per block; matches the 4 * get_vector_length<float>() column step in generic.cpp.
+ {
+ return get_vector_length<float>() * 4; // presumably float lanes per SVE vector, times four vectors — confirm against utils.hpp
+ }
+
+ static int k_unroll() // NOTE(review): presumably the multiple K must be padded to; 1 would mean no padding — confirm with the consumer of this value.
+ {
+ return 1;
+ }
+
+
+
+ // Default to the generic kernel (per-instance pointer; no other implementation is selected in this class).
+ kern_type kernel=sve_native_fp32_mla_4VLx4;
+
+ native_fp32_mla_4VLx4(const CPUInfo *ci) // 'ci' is accepted but not read here: the body is empty, so 'kernel' keeps its default.
+ {
+
+ }
+};
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4/generic.cpp
new file mode 100644
index 0000000000..6e225669fc
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4/generic.cpp
@@ -0,0 +1,2066 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+
+#include <algorithm>
+
+
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void sve_native_fp32_mla_4VLx4(const float *A, int lda, const float *B, int ldb, float *C, int ldc, float beta, int M, int N, int K) {
+ const long beta0 = (beta == 0.0f);
+ const long loops_count = ((K + 4) / 8) - 1;
+ K -= loops_count * 8;
+ const long regs_count = (K / 4) - 1;
+ K -= (regs_count + 1) * 4;
+ const long leftovers = K;
+
+ for (int y=0; y<M; y+=4) {
+ const float * const a_ptr0_base = A + (y * lda);
+ const unsigned long ldab = lda * sizeof(float);
+
+ float *c_ptr0 = C + (y * ldc);
+ const unsigned long ldcb = ldc * sizeof(float);
+
+ for (int x0=0; x0<N; x0+=(4 * get_vector_length<float>())) {
+ const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<float>()));
+ const float *betaptr = &beta;
+ long loops = loops_count;
+ long regs = regs_count;
+ long temp = 0;
+ long blocks = leftovers;
+ const float *a_ptr0 = a_ptr0_base;
+ const float *b_ptr0 = B + x0;
+ long ldbb = ldb * sizeof(float);
+
+ switch(M-y) {
+ case 1:
+ __asm __volatile (
+ "whilelt p6.s, %[temp], %[leftovers]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "ptrue p7.s\n"
+ "whilelt p1.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p2.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p3.s, %[temp], %[width]\n"
+ "cbz %[beta0], 1f\n"
+ "mov z16.s, #0\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "mov z17.s, #0\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "mov z18.s, #0\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "mov z19.s, #0\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "b 2f\n"
+ "1:\n"
+ "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+ "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+ "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+ "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "fmul z16.s, p7/m, z16.s, z15.s\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "fmul z17.s, p7/m, z17.s, z15.s\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmul z18.s, p7/m, z18.s, z15.s\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmul z19.s, p7/m, z19.s, z15.s\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "2:\n"
+ "cbz %[loops], 3f\n"
+ "4:\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z16.s, z12.s, z0.s[3]\n"
+ "fmla z17.s, z13.s, z0.s[3]\n"
+ "fmla z18.s, z14.s, z0.s[3]\n"
+ "fmla z19.s, z15.s, z0.s[3]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z16.s, z8.s, z4.s[0]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n"
+ "fmla z17.s, z9.s, z4.s[0]\n"
+ "fmla z18.s, z10.s, z4.s[0]\n"
+ "fmla z19.s, z11.s, z4.s[0]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z16.s, z12.s, z4.s[1]\n"
+ "fmla z17.s, z13.s, z4.s[1]\n"
+ "fmla z18.s, z14.s, z4.s[1]\n"
+ "fmla z19.s, z15.s, z4.s[1]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z16.s, z8.s, z4.s[2]\n"
+ "fmla z17.s, z9.s, z4.s[2]\n"
+ "fmla z18.s, z10.s, z4.s[2]\n"
+ "fmla z19.s, z11.s, z4.s[2]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z16.s, z12.s, z4.s[3]\n"
+ "fmla z17.s, z13.s, z4.s[3]\n"
+ "fmla z18.s, z14.s, z4.s[3]\n"
+ "fmla z19.s, z15.s, z4.s[3]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "b.ne 4b\n"
+ "3:\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "cbz %[regs], 5f\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z16.s, z12.s, z0.s[3]\n"
+ "fmla z17.s, z13.s, z0.s[3]\n"
+ "fmla z18.s, z14.s, z0.s[3]\n"
+ "fmla z19.s, z15.s, z0.s[3]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z16.s, z8.s, z4.s[0]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
+ "fmla z17.s, z9.s, z4.s[0]\n"
+ "fmla z18.s, z10.s, z4.s[0]\n"
+ "fmla z19.s, z11.s, z4.s[0]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z16.s, z12.s, z4.s[1]\n"
+ "fmla z17.s, z13.s, z4.s[1]\n"
+ "fmla z18.s, z14.s, z4.s[1]\n"
+ "fmla z19.s, z15.s, z4.s[1]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z16.s, z8.s, z4.s[2]\n"
+ "fmla z17.s, z9.s, z4.s[2]\n"
+ "fmla z18.s, z10.s, z4.s[2]\n"
+ "fmla z19.s, z11.s, z4.s[2]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z4.s[3]\n"
+ "fmla z17.s, z13.s, z4.s[3]\n"
+ "fmla z18.s, z14.s, z4.s[3]\n"
+ "fmla z19.s, z15.s, z4.s[3]\n"
+ "cbz %[blocks], 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "b.eq 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "b.eq 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "b 6f\n"
+ "5:\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[3]\n"
+ "fmla z17.s, z13.s, z0.s[3]\n"
+ "fmla z18.s, z14.s, z0.s[3]\n"
+ "fmla z19.s, z15.s, z0.s[3]\n"
+ "cbz %[blocks], 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[0]\n"
+ "fmla z17.s, z9.s, z4.s[0]\n"
+ "fmla z18.s, z10.s, z4.s[0]\n"
+ "fmla z19.s, z11.s, z4.s[0]\n"
+ "b.eq 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z4.s[1]\n"
+ "fmla z17.s, z13.s, z4.s[1]\n"
+ "fmla z18.s, z14.s, z4.s[1]\n"
+ "fmla z19.s, z15.s, z4.s[1]\n"
+ "b.eq 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[2]\n"
+ "fmla z17.s, z9.s, z4.s[2]\n"
+ "fmla z18.s, z10.s, z4.s[2]\n"
+ "fmla z19.s, z11.s, z4.s[2]\n"
+ "6:\n"
+ "st1w z16.s, p0, [%[c_ptr0]]\n"
+ "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+ "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+ "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #4\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 2:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "c_ptr1 .req X1\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[leftovers]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "ptrue p7.s\n"
+ "whilelt p1.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p2.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p3.s, %[temp], %[width]\n"
+ "cbz %[beta0], 1f\n"
+ "mov z16.s, #0\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "mov z17.s, #0\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "mov z18.s, #0\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "mov z19.s, #0\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "mov z20.s, #0\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "mov z21.s, #0\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "mov z22.s, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "mov z23.s, #0\n"
+ "b 2f\n"
+ "1:\n"
+ "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+ "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+ "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+ "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+ "ld1w z20.s, p0/z, [c_ptr1]\n"
+ "fmul z16.s, p7/m, z16.s, z15.s\n"
+ "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+ "fmul z17.s, p7/m, z17.s, z15.s\n"
+ "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+ "fmul z18.s, p7/m, z18.s, z15.s\n"
+ "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+ "fmul z19.s, p7/m, z19.s, z15.s\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "fmul z20.s, p7/m, z20.s, z15.s\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "fmul z21.s, p7/m, z21.s, z15.s\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "fmul z22.s, p7/m, z22.s, z15.s\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmul z23.s, p7/m, z23.s, z15.s\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "2:\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "cbz %[loops], 3f\n"
+ "4:\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z20.s, z8.s, z1.s[0]\n"
+ "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "ld1rqw z5.s, p7/z, [a_ptr1]\n"
+ "fmla z21.s, z9.s, z1.s[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "fmla z22.s, z10.s, z1.s[0]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z23.s, z11.s, z1.s[0]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z20.s, z12.s, z1.s[1]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z21.s, z13.s, z1.s[1]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "fmla z22.s, z14.s, z1.s[1]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z23.s, z15.s, z1.s[1]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z20.s, z8.s, z1.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "fmla z21.s, z9.s, z1.s[2]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "fmla z22.s, z10.s, z1.s[2]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z23.s, z11.s, z1.s[2]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[3]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z20.s, z12.s, z1.s[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z17.s, z13.s, z0.s[3]\n"
+ "fmla z21.s, z13.s, z1.s[3]\n"
+ "fmla z18.s, z14.s, z0.s[3]\n"
+ "fmla z22.s, z14.s, z1.s[3]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z19.s, z15.s, z0.s[3]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z23.s, z15.s, z1.s[3]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[0]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z20.s, z8.s, z5.s[0]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n"
+ "fmla z17.s, z9.s, z4.s[0]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n"
+ "fmla z21.s, z9.s, z5.s[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z18.s, z10.s, z4.s[0]\n"
+ "fmla z22.s, z10.s, z5.s[0]\n"
+ "fmla z19.s, z11.s, z4.s[0]\n"
+ "fmla z23.s, z11.s, z5.s[0]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z16.s, z12.s, z4.s[1]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z20.s, z12.s, z5.s[1]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z17.s, z13.s, z4.s[1]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z21.s, z13.s, z5.s[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z18.s, z14.s, z4.s[1]\n"
+ "fmla z22.s, z14.s, z5.s[1]\n"
+ "fmla z19.s, z15.s, z4.s[1]\n"
+ "fmla z23.s, z15.s, z5.s[1]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z16.s, z8.s, z4.s[2]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z20.s, z8.s, z5.s[2]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z17.s, z9.s, z4.s[2]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z21.s, z9.s, z5.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z18.s, z10.s, z4.s[2]\n"
+ "fmla z22.s, z10.s, z5.s[2]\n"
+ "fmla z19.s, z11.s, z4.s[2]\n"
+ "fmla z23.s, z11.s, z5.s[2]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z16.s, z12.s, z4.s[3]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z20.s, z12.s, z5.s[3]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z17.s, z13.s, z4.s[3]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z21.s, z13.s, z5.s[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z18.s, z14.s, z4.s[3]\n"
+ "fmla z22.s, z14.s, z5.s[3]\n"
+ "fmla z19.s, z15.s, z4.s[3]\n"
+ "fmla z23.s, z15.s, z5.s[3]\n"
+ "b.ne 4b\n"
+ "3:\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "cbz %[regs], 5f\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z20.s, z8.s, z1.s[0]\n"
+ "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "ld1rqw z5.s, p7/z, [a_ptr1]\n"
+ "fmla z21.s, z9.s, z1.s[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "fmla z22.s, z10.s, z1.s[0]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "fmla z23.s, z11.s, z1.s[0]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z20.s, z12.s, z1.s[1]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z21.s, z13.s, z1.s[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "fmla z22.s, z14.s, z1.s[1]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "fmla z23.s, z15.s, z1.s[1]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z20.s, z8.s, z1.s[2]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z21.s, z9.s, z1.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "fmla z22.s, z10.s, z1.s[2]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "fmla z23.s, z11.s, z1.s[2]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z16.s, z12.s, z0.s[3]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z20.s, z12.s, z1.s[3]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z17.s, z13.s, z0.s[3]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z21.s, z13.s, z1.s[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z18.s, z14.s, z0.s[3]\n"
+ "fmla z22.s, z14.s, z1.s[3]\n"
+ "fmla z19.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
+ "fmla z23.s, z15.s, z1.s[3]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z16.s, z8.s, z4.s[0]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z20.s, z8.s, z5.s[0]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z17.s, z9.s, z4.s[0]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z21.s, z9.s, z5.s[0]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
+ "fmla z18.s, z10.s, z4.s[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z22.s, z10.s, z5.s[0]\n"
+ "fmla z19.s, z11.s, z4.s[0]\n"
+ "fmla z23.s, z11.s, z5.s[0]\n"
+ "fmla z16.s, z12.s, z4.s[1]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z20.s, z12.s, z5.s[1]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z17.s, z13.s, z4.s[1]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z21.s, z13.s, z5.s[1]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z18.s, z14.s, z4.s[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z22.s, z14.s, z5.s[1]\n"
+ "fmla z19.s, z15.s, z4.s[1]\n"
+ "fmla z23.s, z15.s, z5.s[1]\n"
+ "fmla z16.s, z8.s, z4.s[2]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z20.s, z8.s, z5.s[2]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z17.s, z9.s, z4.s[2]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z21.s, z9.s, z5.s[2]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z18.s, z10.s, z4.s[2]\n"
+ "fmla z22.s, z10.s, z5.s[2]\n"
+ "fmla z19.s, z11.s, z4.s[2]\n"
+ "fmla z23.s, z11.s, z5.s[2]\n"
+ "fmla z16.s, z12.s, z4.s[3]\n"
+ "fmla z20.s, z12.s, z5.s[3]\n"
+ "fmla z17.s, z13.s, z4.s[3]\n"
+ "fmla z21.s, z13.s, z5.s[3]\n"
+ "fmla z18.s, z14.s, z4.s[3]\n"
+ "fmla z22.s, z14.s, z5.s[3]\n"
+ "fmla z19.s, z15.s, z4.s[3]\n"
+ "fmla z23.s, z15.s, z5.s[3]\n"
+ "cbz %[blocks], 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "fmla z20.s, z8.s, z1.s[0]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "fmla z21.s, z9.s, z1.s[0]\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "fmla z22.s, z10.s, z1.s[0]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "fmla z23.s, z11.s, z1.s[0]\n"
+ "b.eq 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "fmla z20.s, z12.s, z1.s[1]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "fmla z21.s, z13.s, z1.s[1]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "fmla z22.s, z14.s, z1.s[1]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "fmla z23.s, z15.s, z1.s[1]\n"
+ "b.eq 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "fmla z20.s, z8.s, z1.s[2]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "fmla z21.s, z9.s, z1.s[2]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "fmla z22.s, z10.s, z1.s[2]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "fmla z23.s, z11.s, z1.s[2]\n"
+ "b 6f\n"
+ "5:\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z20.s, z8.s, z1.s[0]\n"
+ "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "ld1rqw z5.s, p6/z, [a_ptr1]\n"
+ "fmla z21.s, z9.s, z1.s[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "fmla z22.s, z10.s, z1.s[0]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "fmla z23.s, z11.s, z1.s[0]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z20.s, z12.s, z1.s[1]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z21.s, z13.s, z1.s[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "fmla z22.s, z14.s, z1.s[1]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "fmla z23.s, z15.s, z1.s[1]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z20.s, z8.s, z1.s[2]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z21.s, z9.s, z1.s[2]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "fmla z22.s, z10.s, z1.s[2]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "fmla z23.s, z11.s, z1.s[2]\n"
+ "fmla z16.s, z12.s, z0.s[3]\n"
+ "fmla z20.s, z12.s, z1.s[3]\n"
+ "fmla z17.s, z13.s, z0.s[3]\n"
+ "fmla z21.s, z13.s, z1.s[3]\n"
+ "fmla z18.s, z14.s, z0.s[3]\n"
+ "fmla z22.s, z14.s, z1.s[3]\n"
+ "fmla z19.s, z15.s, z0.s[3]\n"
+ "fmla z23.s, z15.s, z1.s[3]\n"
+ "cbz %[blocks], 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[0]\n"
+ "fmla z20.s, z8.s, z5.s[0]\n"
+ "fmla z17.s, z9.s, z4.s[0]\n"
+ "fmla z21.s, z9.s, z5.s[0]\n"
+ "fmla z18.s, z10.s, z4.s[0]\n"
+ "fmla z22.s, z10.s, z5.s[0]\n"
+ "fmla z19.s, z11.s, z4.s[0]\n"
+ "fmla z23.s, z11.s, z5.s[0]\n"
+ "b.eq 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z4.s[1]\n"
+ "fmla z20.s, z12.s, z5.s[1]\n"
+ "fmla z17.s, z13.s, z4.s[1]\n"
+ "fmla z21.s, z13.s, z5.s[1]\n"
+ "fmla z18.s, z14.s, z4.s[1]\n"
+ "fmla z22.s, z14.s, z5.s[1]\n"
+ "fmla z19.s, z15.s, z4.s[1]\n"
+ "fmla z23.s, z15.s, z5.s[1]\n"
+ "b.eq 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[2]\n"
+ "fmla z20.s, z8.s, z5.s[2]\n"
+ "fmla z17.s, z9.s, z4.s[2]\n"
+ "fmla z21.s, z9.s, z5.s[2]\n"
+ "fmla z18.s, z10.s, z4.s[2]\n"
+ "fmla z22.s, z10.s, z5.s[2]\n"
+ "fmla z19.s, z11.s, z4.s[2]\n"
+ "fmla z23.s, z11.s, z5.s[2]\n"
+ "6:\n"
+ "st1w z16.s, p0, [%[c_ptr0]]\n"
+ "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+ "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+ "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #4\n"
+ "st1w z20.s, p0, [c_ptr1]\n"
+ "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
+ "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
+ "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
+ ".unreq a_ptr1\n"
+ ".unreq c_ptr1\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory"
+ );
+ break;
+ case 3:
+ // Three-row variant of the kernel. The asm below:
+ //   * derives the row-1/row-2 A and C pointers from a_ptr0/c_ptr0 using the lda/ldc operands,
+ //   * builds four column predicates (p0-p3) from temp/width, one per vector-length of columns,
+ //   * initialises twelve accumulators z16-z27 (3 rows x 4 vectors) either to zero or to
+ //     C scaled by the value at betaptr, depending on beta0,
+ //   * accumulates B-row vectors multiplied by individual A elements (indexed FMLA),
+ //   * stores the accumulators back to the three C rows under p0-p3.
+ // NOTE(review): temp is presumably the starting column index (0) on entry - confirm against
+ // the enclosing function, which is outside this view.
+ __asm __volatile (
+ // Scratch general-purpose registers for the extra row pointers; all four are listed as clobbers.
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "c_ptr1 .req X2\n"
+ "c_ptr2 .req X3\n"
+ // Row pointers and predicates: p6 covers lanes below `leftovers` (used for the final partial
+ // A quad load), p0-p3 cover lanes below `width` for four consecutive vectors, p7 is all-true.
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[leftovers]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ptrue p7.s\n"
+ "whilelt p1.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p2.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p3.s, %[temp], %[width]\n"
+ // beta0 == 0 -> label 1 (load C and scale it); otherwise fall through and zero the accumulators.
+ "cbz %[beta0], 1f\n"
+ // Zero accumulators, interleaved with the first A quad loads (z0-z2) and first B row (z8-z11).
+ "mov z16.s, #0\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "mov z17.s, #0\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "mov z18.s, #0\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "mov z19.s, #0\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "mov z20.s, #0\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "mov z21.s, #0\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "mov z22.s, #0\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "mov z23.s, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "mov z24.s, #0\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "mov z25.s, #0\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "mov z26.s, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z27.s, #0\n"
+ "b 2f\n"
+ // Label 1: broadcast the scalar at betaptr into z15, load the existing C tile and scale it,
+ // then perform the same first A/B loads and pointer advances as the zeroing path.
+ "1:\n"
+ "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+ "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+ "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+ "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+ "ld1w z20.s, p0/z, [c_ptr1]\n"
+ "fmul z16.s, p7/m, z16.s, z15.s\n"
+ "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+ "fmul z17.s, p7/m, z17.s, z15.s\n"
+ "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+ "fmul z18.s, p7/m, z18.s, z15.s\n"
+ "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+ "fmul z19.s, p7/m, z19.s, z15.s\n"
+ "ld1w z24.s, p0/z, [c_ptr2]\n"
+ "fmul z20.s, p7/m, z20.s, z15.s\n"
+ "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
+ "fmul z21.s, p7/m, z21.s, z15.s\n"
+ "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
+ "fmul z22.s, p7/m, z22.s, z15.s\n"
+ "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
+ "fmul z23.s, p7/m, z23.s, z15.s\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "fmul z24.s, p7/m, z24.s, z15.s\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "fmul z25.s, p7/m, z25.s, z15.s\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "fmul z26.s, p7/m, z26.s, z15.s\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "fmul z27.s, p7/m, z27.s, z15.s\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ // Label 2: common entry. Preload three of the four vectors of the second B row (z12-z14);
+ // the fourth (z15) is loaded at the top of whichever path runs next.
+ "2:\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "cbz %[loops], 3f\n"
+ // Label 4: main loop, executed `loops` times. Each pass consumes eight B rows: lanes 0-3 of
+ // the current A quads (z0-z2) then lanes 0-3 of the next A quads (z4-z6), while z8-z11 and
+ // z12-z15 alternate as B-row buffers. A pointers advance by 0x20 bytes per pass.
+ "4:\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z20.s, z8.s, z1.s[0]\n"
+ "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+ "fmla z24.s, z8.s, z2.s[0]\n"
+ "ld1rqw z5.s, p7/z, [a_ptr1]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "ld1rqw z6.s, p7/z, [a_ptr2]\n"
+ "fmla z21.s, z9.s, z1.s[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z25.s, z9.s, z2.s[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ "fmla z22.s, z10.s, z1.s[0]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z26.s, z10.s, z2.s[0]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z23.s, z11.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
+ "fmla z27.s, z11.s, z2.s[0]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.s, z12.s, z1.s[1]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "fmla z24.s, z12.s, z2.s[1]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z21.s, z13.s, z1.s[1]\n"
+ "fmla z25.s, z13.s, z2.s[1]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "fmla z22.s, z14.s, z1.s[1]\n"
+ "fmla z26.s, z14.s, z2.s[1]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "fmla z23.s, z15.s, z1.s[1]\n"
+ "fmla z27.s, z15.s, z2.s[1]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.s, z8.s, z1.s[2]\n"
+ "fmla z24.s, z8.s, z2.s[2]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "fmla z21.s, z9.s, z1.s[2]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z25.s, z9.s, z2.s[2]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "fmla z22.s, z10.s, z1.s[2]\n"
+ "fmla z26.s, z10.s, z2.s[2]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "fmla z23.s, z11.s, z1.s[2]\n"
+ "fmla z27.s, z11.s, z2.s[2]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.s, z12.s, z1.s[3]\n"
+ "fmla z24.s, z12.s, z2.s[3]\n"
+ "fmla z17.s, z13.s, z0.s[3]\n"
+ "fmla z21.s, z13.s, z1.s[3]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z25.s, z13.s, z2.s[3]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z14.s, z0.s[3]\n"
+ "fmla z22.s, z14.s, z1.s[3]\n"
+ "fmla z26.s, z14.s, z2.s[3]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n"
+ "fmla z23.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n"
+ "fmla z27.s, z15.s, z2.s[3]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[0]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #-0x10]\n"
+ "fmla z20.s, z8.s, z5.s[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z24.s, z8.s, z6.s[0]\n"
+ "fmla z17.s, z9.s, z4.s[0]\n"
+ "fmla z21.s, z9.s, z5.s[0]\n"
+ "fmla z25.s, z9.s, z6.s[0]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z18.s, z10.s, z4.s[0]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z22.s, z10.s, z5.s[0]\n"
+ "fmla z26.s, z10.s, z6.s[0]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z11.s, z4.s[0]\n"
+ "fmla z23.s, z11.s, z5.s[0]\n"
+ "fmla z27.s, z11.s, z6.s[0]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z4.s[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.s, z12.s, z5.s[1]\n"
+ "fmla z24.s, z12.s, z6.s[1]\n"
+ "fmla z17.s, z13.s, z4.s[1]\n"
+ "fmla z21.s, z13.s, z5.s[1]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z25.s, z13.s, z6.s[1]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z14.s, z4.s[1]\n"
+ "fmla z22.s, z14.s, z5.s[1]\n"
+ "fmla z26.s, z14.s, z6.s[1]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z15.s, z4.s[1]\n"
+ "fmla z23.s, z15.s, z5.s[1]\n"
+ "fmla z27.s, z15.s, z6.s[1]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.s, z8.s, z5.s[2]\n"
+ "fmla z24.s, z8.s, z6.s[2]\n"
+ "fmla z17.s, z9.s, z4.s[2]\n"
+ "fmla z21.s, z9.s, z5.s[2]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z25.s, z9.s, z6.s[2]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z10.s, z4.s[2]\n"
+ "fmla z22.s, z10.s, z5.s[2]\n"
+ "fmla z26.s, z10.s, z6.s[2]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z11.s, z4.s[2]\n"
+ "fmla z23.s, z11.s, z5.s[2]\n"
+ "fmla z27.s, z11.s, z6.s[2]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z4.s[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.s, z12.s, z5.s[3]\n"
+ "fmla z24.s, z12.s, z6.s[3]\n"
+ "fmla z17.s, z13.s, z4.s[3]\n"
+ "fmla z21.s, z13.s, z5.s[3]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z25.s, z13.s, z6.s[3]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z14.s, z4.s[3]\n"
+ "fmla z22.s, z14.s, z5.s[3]\n"
+ "fmla z26.s, z14.s, z6.s[3]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z15.s, z4.s[3]\n"
+ "fmla z23.s, z15.s, z5.s[3]\n"
+ "fmla z27.s, z15.s, z6.s[3]\n"
+ "b.ne 4b\n"
+ // Label 3: tail. regs != 0: consume two more full A quads (z0-z2 then z4-z6, eight B rows),
+ // loading the final, possibly partial, quad into z0-z2 under p6 along the way.
+ // regs == 0: branch to label 5, which consumes only one full quad.
+ "3:\n"
+ "cbz %[regs], 5f\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z20.s, z8.s, z1.s[0]\n"
+ "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+ "fmla z24.s, z8.s, z2.s[0]\n"
+ "ld1rqw z5.s, p7/z, [a_ptr1]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "ld1rqw z6.s, p7/z, [a_ptr2]\n"
+ "fmla z21.s, z9.s, z1.s[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z25.s, z9.s, z2.s[0]\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "fmla z22.s, z10.s, z1.s[0]\n"
+ "fmla z26.s, z10.s, z2.s[0]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z23.s, z11.s, z1.s[0]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z27.s, z11.s, z2.s[0]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.s, z12.s, z1.s[1]\n"
+ "fmla z24.s, z12.s, z2.s[1]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "fmla z21.s, z13.s, z1.s[1]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z25.s, z13.s, z2.s[1]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "fmla z22.s, z14.s, z1.s[1]\n"
+ "fmla z26.s, z14.s, z2.s[1]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "fmla z23.s, z15.s, z1.s[1]\n"
+ "fmla z27.s, z15.s, z2.s[1]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.s, z8.s, z1.s[2]\n"
+ "fmla z24.s, z8.s, z2.s[2]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "fmla z21.s, z9.s, z1.s[2]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z25.s, z9.s, z2.s[2]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "fmla z22.s, z10.s, z1.s[2]\n"
+ "fmla z26.s, z10.s, z2.s[2]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "fmla z23.s, z11.s, z1.s[2]\n"
+ "fmla z27.s, z11.s, z2.s[2]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.s, z12.s, z1.s[3]\n"
+ "fmla z24.s, z12.s, z2.s[3]\n"
+ "fmla z17.s, z13.s, z0.s[3]\n"
+ "fmla z21.s, z13.s, z1.s[3]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z25.s, z13.s, z2.s[3]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z14.s, z0.s[3]\n"
+ "fmla z22.s, z14.s, z1.s[3]\n"
+ "fmla z26.s, z14.s, z2.s[3]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
+ "fmla z23.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
+ "fmla z27.s, z15.s, z2.s[3]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[0]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
+ "fmla z20.s, z8.s, z5.s[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z24.s, z8.s, z6.s[0]\n"
+ "fmla z17.s, z9.s, z4.s[0]\n"
+ "fmla z21.s, z9.s, z5.s[0]\n"
+ "fmla z25.s, z9.s, z6.s[0]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z18.s, z10.s, z4.s[0]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z22.s, z10.s, z5.s[0]\n"
+ "fmla z26.s, z10.s, z6.s[0]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z11.s, z4.s[0]\n"
+ "fmla z23.s, z11.s, z5.s[0]\n"
+ "fmla z27.s, z11.s, z6.s[0]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z4.s[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.s, z12.s, z5.s[1]\n"
+ "fmla z24.s, z12.s, z6.s[1]\n"
+ "fmla z17.s, z13.s, z4.s[1]\n"
+ "fmla z21.s, z13.s, z5.s[1]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z25.s, z13.s, z6.s[1]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z14.s, z4.s[1]\n"
+ "fmla z22.s, z14.s, z5.s[1]\n"
+ "fmla z26.s, z14.s, z6.s[1]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z15.s, z4.s[1]\n"
+ "fmla z23.s, z15.s, z5.s[1]\n"
+ "fmla z27.s, z15.s, z6.s[1]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[2]\n"
+ "fmla z20.s, z8.s, z5.s[2]\n"
+ "fmla z24.s, z8.s, z6.s[2]\n"
+ "fmla z17.s, z9.s, z4.s[2]\n"
+ "fmla z21.s, z9.s, z5.s[2]\n"
+ "fmla z25.s, z9.s, z6.s[2]\n"
+ "fmla z18.s, z10.s, z4.s[2]\n"
+ "fmla z22.s, z10.s, z5.s[2]\n"
+ "fmla z26.s, z10.s, z6.s[2]\n"
+ "fmla z19.s, z11.s, z4.s[2]\n"
+ "fmla z23.s, z11.s, z5.s[2]\n"
+ "fmla z27.s, z11.s, z6.s[2]\n"
+ "fmla z16.s, z12.s, z4.s[3]\n"
+ "fmla z20.s, z12.s, z5.s[3]\n"
+ "fmla z24.s, z12.s, z6.s[3]\n"
+ "fmla z17.s, z13.s, z4.s[3]\n"
+ "fmla z21.s, z13.s, z5.s[3]\n"
+ "fmla z25.s, z13.s, z6.s[3]\n"
+ "fmla z18.s, z14.s, z4.s[3]\n"
+ "fmla z22.s, z14.s, z5.s[3]\n"
+ "fmla z26.s, z14.s, z6.s[3]\n"
+ "fmla z19.s, z15.s, z4.s[3]\n"
+ "fmla z23.s, z15.s, z5.s[3]\n"
+ "fmla z27.s, z15.s, z6.s[3]\n"
+ // Up to three remaining single steps (`blocks`), taken from lanes 0-2 of the p6-masked
+ // quads in z0-z2; the flags from each `subs` decide whether another step follows.
+ "cbz %[blocks], 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "fmla z20.s, z8.s, z1.s[0]\n"
+ "fmla z24.s, z8.s, z2.s[0]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "fmla z21.s, z9.s, z1.s[0]\n"
+ "fmla z25.s, z9.s, z2.s[0]\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "fmla z22.s, z10.s, z1.s[0]\n"
+ "fmla z26.s, z10.s, z2.s[0]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "fmla z23.s, z11.s, z1.s[0]\n"
+ "fmla z27.s, z11.s, z2.s[0]\n"
+ "b.eq 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "fmla z20.s, z12.s, z1.s[1]\n"
+ "fmla z24.s, z12.s, z2.s[1]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "fmla z21.s, z13.s, z1.s[1]\n"
+ "fmla z25.s, z13.s, z2.s[1]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "fmla z22.s, z14.s, z1.s[1]\n"
+ "fmla z26.s, z14.s, z2.s[1]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "fmla z23.s, z15.s, z1.s[1]\n"
+ "fmla z27.s, z15.s, z2.s[1]\n"
+ "b.eq 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "fmla z20.s, z8.s, z1.s[2]\n"
+ "fmla z24.s, z8.s, z2.s[2]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "fmla z21.s, z9.s, z1.s[2]\n"
+ "fmla z25.s, z9.s, z2.s[2]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "fmla z22.s, z10.s, z1.s[2]\n"
+ "fmla z26.s, z10.s, z2.s[2]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "fmla z23.s, z11.s, z1.s[2]\n"
+ "fmla z27.s, z11.s, z2.s[2]\n"
+ "b 6f\n"
+ // Label 5: tail when regs == 0. Consume lanes 0-3 of the already-loaded quads z0-z2
+ // (four B rows) while loading the final, possibly partial, quads into z4-z6 under p6.
+ "5:\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z20.s, z8.s, z1.s[0]\n"
+ "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
+ "fmla z24.s, z8.s, z2.s[0]\n"
+ "ld1rqw z5.s, p6/z, [a_ptr1]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "ld1rqw z6.s, p6/z, [a_ptr2]\n"
+ "fmla z21.s, z9.s, z1.s[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z25.s, z9.s, z2.s[0]\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "fmla z22.s, z10.s, z1.s[0]\n"
+ "fmla z26.s, z10.s, z2.s[0]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z23.s, z11.s, z1.s[0]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z27.s, z11.s, z2.s[0]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.s, z12.s, z1.s[1]\n"
+ "fmla z24.s, z12.s, z2.s[1]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "fmla z21.s, z13.s, z1.s[1]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z25.s, z13.s, z2.s[1]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "fmla z22.s, z14.s, z1.s[1]\n"
+ "fmla z26.s, z14.s, z2.s[1]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "fmla z23.s, z15.s, z1.s[1]\n"
+ "fmla z27.s, z15.s, z2.s[1]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "fmla z20.s, z8.s, z1.s[2]\n"
+ "fmla z24.s, z8.s, z2.s[2]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "fmla z21.s, z9.s, z1.s[2]\n"
+ "fmla z25.s, z9.s, z2.s[2]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "fmla z22.s, z10.s, z1.s[2]\n"
+ "fmla z26.s, z10.s, z2.s[2]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "fmla z23.s, z11.s, z1.s[2]\n"
+ "fmla z27.s, z11.s, z2.s[2]\n"
+ "fmla z16.s, z12.s, z0.s[3]\n"
+ "fmla z20.s, z12.s, z1.s[3]\n"
+ "fmla z24.s, z12.s, z2.s[3]\n"
+ "fmla z17.s, z13.s, z0.s[3]\n"
+ "fmla z21.s, z13.s, z1.s[3]\n"
+ "fmla z25.s, z13.s, z2.s[3]\n"
+ "fmla z18.s, z14.s, z0.s[3]\n"
+ "fmla z22.s, z14.s, z1.s[3]\n"
+ "fmla z26.s, z14.s, z2.s[3]\n"
+ "fmla z19.s, z15.s, z0.s[3]\n"
+ "fmla z23.s, z15.s, z1.s[3]\n"
+ "fmla z27.s, z15.s, z2.s[3]\n"
+ // Up to three remaining single steps (`blocks`), this time from lanes 0-2 of z4-z6.
+ "cbz %[blocks], 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[0]\n"
+ "fmla z20.s, z8.s, z5.s[0]\n"
+ "fmla z24.s, z8.s, z6.s[0]\n"
+ "fmla z17.s, z9.s, z4.s[0]\n"
+ "fmla z21.s, z9.s, z5.s[0]\n"
+ "fmla z25.s, z9.s, z6.s[0]\n"
+ "fmla z18.s, z10.s, z4.s[0]\n"
+ "fmla z22.s, z10.s, z5.s[0]\n"
+ "fmla z26.s, z10.s, z6.s[0]\n"
+ "fmla z19.s, z11.s, z4.s[0]\n"
+ "fmla z23.s, z11.s, z5.s[0]\n"
+ "fmla z27.s, z11.s, z6.s[0]\n"
+ "b.eq 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z4.s[1]\n"
+ "fmla z20.s, z12.s, z5.s[1]\n"
+ "fmla z24.s, z12.s, z6.s[1]\n"
+ "fmla z17.s, z13.s, z4.s[1]\n"
+ "fmla z21.s, z13.s, z5.s[1]\n"
+ "fmla z25.s, z13.s, z6.s[1]\n"
+ "fmla z18.s, z14.s, z4.s[1]\n"
+ "fmla z22.s, z14.s, z5.s[1]\n"
+ "fmla z26.s, z14.s, z6.s[1]\n"
+ "fmla z19.s, z15.s, z4.s[1]\n"
+ "fmla z23.s, z15.s, z5.s[1]\n"
+ "fmla z27.s, z15.s, z6.s[1]\n"
+ "b.eq 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[2]\n"
+ "fmla z20.s, z8.s, z5.s[2]\n"
+ "fmla z24.s, z8.s, z6.s[2]\n"
+ "fmla z17.s, z9.s, z4.s[2]\n"
+ "fmla z21.s, z9.s, z5.s[2]\n"
+ "fmla z25.s, z9.s, z6.s[2]\n"
+ "fmla z18.s, z10.s, z4.s[2]\n"
+ "fmla z22.s, z10.s, z5.s[2]\n"
+ "fmla z26.s, z10.s, z6.s[2]\n"
+ "fmla z19.s, z11.s, z4.s[2]\n"
+ "fmla z23.s, z11.s, z5.s[2]\n"
+ "fmla z27.s, z11.s, z6.s[2]\n"
+ // Label 6: write the 3 x 4-vector accumulator tile back to C under the column predicates
+ // and advance c_ptr0 by four vector lengths; then drop the register aliases.
+ "6:\n"
+ "st1w z16.s, p0, [%[c_ptr0]]\n"
+ "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+ "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+ "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #4\n"
+ "st1w z20.s, p0, [c_ptr1]\n"
+ "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
+ "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
+ "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
+ "st1w z24.s, p0, [c_ptr2]\n"
+ "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n"
+ "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n"
+ "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ // The predicate registers written above (p0-p3 and p6 by whilelt, p7 by ptrue) must be declared
+ // as clobbers too, otherwise the compiler may assume they survive this statement unchanged.
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "p0", "p1", "p2", "p3", "p6", "p7", "x0", "x1", "x2", "x3", "cc", "memory"
+ );
+ break;
+ default:
+ case 4:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[leftovers]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ptrue p7.s\n"
+ "whilelt p1.s, %[temp], %[width]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p2.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p3.s, %[temp], %[width]\n"
+ "cbz %[beta0], 1f\n"
+ "mov z16.s, #0\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "mov z17.s, #0\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "mov z18.s, #0\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "mov z19.s, #0\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "mov z20.s, #0\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "mov z21.s, #0\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "mov z22.s, #0\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "mov z23.s, #0\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "mov z24.s, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "mov z25.s, #0\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "mov z26.s, #0\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "mov z27.s, #0\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "mov z28.s, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 2f\n"
+ "1:\n"
+ "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+ "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+ "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+ "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+ "ld1w z20.s, p0/z, [c_ptr1]\n"
+ "fmul z16.s, p7/m, z16.s, z15.s\n"
+ "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+ "fmul z17.s, p7/m, z17.s, z15.s\n"
+ "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+ "fmul z18.s, p7/m, z18.s, z15.s\n"
+ "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+ "fmul z19.s, p7/m, z19.s, z15.s\n"
+ "ld1w z24.s, p0/z, [c_ptr2]\n"
+ "fmul z20.s, p7/m, z20.s, z15.s\n"
+ "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
+ "fmul z21.s, p7/m, z21.s, z15.s\n"
+ "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
+ "fmul z22.s, p7/m, z22.s, z15.s\n"
+ "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
+ "fmul z23.s, p7/m, z23.s, z15.s\n"
+ "ld1w z28.s, p0/z, [c_ptr3]\n"
+ "fmul z24.s, p7/m, z24.s, z15.s\n"
+ "ld1w z29.s, p1/z, [c_ptr3, #1, MUL VL]\n"
+ "fmul z25.s, p7/m, z25.s, z15.s\n"
+ "ld1w z30.s, p2/z, [c_ptr3, #2, MUL VL]\n"
+ "fmul z26.s, p7/m, z26.s, z15.s\n"
+ "ld1w z31.s, p3/z, [c_ptr3, #3, MUL VL]\n"
+ "fmul z27.s, p7/m, z27.s, z15.s\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "fmul z28.s, p7/m, z28.s, z15.s\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "fmul z29.s, p7/m, z29.s, z15.s\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "fmul z30.s, p7/m, z30.s, z15.s\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmul z31.s, p7/m, z31.s, z15.s\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "2:\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "cbz %[loops], 3f\n"
+ "4:\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z20.s, z8.s, z1.s[0]\n"
+ "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+ "fmla z24.s, z8.s, z2.s[0]\n"
+ "ld1rqw z5.s, p7/z, [a_ptr1]\n"
+ "fmla z28.s, z8.s, z3.s[0]\n"
+ "ld1rqw z6.s, p7/z, [a_ptr2]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "ld1rqw z7.s, p7/z, [a_ptr3]\n"
+ "fmla z21.s, z9.s, z1.s[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z25.s, z9.s, z2.s[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "fmla z29.s, z9.s, z3.s[0]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z22.s, z10.s, z1.s[0]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z26.s, z10.s, z2.s[0]\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
+ "fmla z30.s, z10.s, z3.s[0]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "add a_ptr3, a_ptr3, #0x20\n"
+ "fmla z23.s, z11.s, z1.s[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "fmla z27.s, z11.s, z2.s[0]\n"
+ "fmla z31.s, z11.s, z3.s[0]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.s, z12.s, z1.s[1]\n"
+ "fmla z24.s, z12.s, z2.s[1]\n"
+ "fmla z28.s, z12.s, z3.s[1]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z21.s, z13.s, z1.s[1]\n"
+ "fmla z25.s, z13.s, z2.s[1]\n"
+ "fmla z29.s, z13.s, z3.s[1]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "fmla z22.s, z14.s, z1.s[1]\n"
+ "fmla z26.s, z14.s, z2.s[1]\n"
+ "fmla z30.s, z14.s, z3.s[1]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "fmla z23.s, z15.s, z1.s[1]\n"
+ "fmla z27.s, z15.s, z2.s[1]\n"
+ "fmla z31.s, z15.s, z3.s[1]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.s, z8.s, z1.s[2]\n"
+ "fmla z24.s, z8.s, z2.s[2]\n"
+ "fmla z28.s, z8.s, z3.s[2]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z21.s, z9.s, z1.s[2]\n"
+ "fmla z25.s, z9.s, z2.s[2]\n"
+ "fmla z29.s, z9.s, z3.s[2]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "fmla z22.s, z10.s, z1.s[2]\n"
+ "fmla z26.s, z10.s, z2.s[2]\n"
+ "fmla z30.s, z10.s, z3.s[2]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "fmla z23.s, z11.s, z1.s[2]\n"
+ "fmla z27.s, z11.s, z2.s[2]\n"
+ "fmla z31.s, z11.s, z3.s[2]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.s, z12.s, z1.s[3]\n"
+ "fmla z24.s, z12.s, z2.s[3]\n"
+ "fmla z28.s, z12.s, z3.s[3]\n"
+ "fmla z17.s, z13.s, z0.s[3]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z21.s, z13.s, z1.s[3]\n"
+ "fmla z25.s, z13.s, z2.s[3]\n"
+ "fmla z29.s, z13.s, z3.s[3]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z14.s, z0.s[3]\n"
+ "fmla z22.s, z14.s, z1.s[3]\n"
+ "fmla z26.s, z14.s, z2.s[3]\n"
+ "fmla z30.s, z14.s, z3.s[3]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n"
+ "fmla z23.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n"
+ "fmla z27.s, z15.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #-0x10]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[0]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #-0x10]\n"
+ "fmla z20.s, z8.s, z5.s[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z24.s, z8.s, z6.s[0]\n"
+ "fmla z28.s, z8.s, z7.s[0]\n"
+ "fmla z17.s, z9.s, z4.s[0]\n"
+ "fmla z21.s, z9.s, z5.s[0]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z25.s, z9.s, z6.s[0]\n"
+ "fmla z29.s, z9.s, z7.s[0]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z10.s, z4.s[0]\n"
+ "fmla z22.s, z10.s, z5.s[0]\n"
+ "fmla z26.s, z10.s, z6.s[0]\n"
+ "fmla z30.s, z10.s, z7.s[0]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z11.s, z4.s[0]\n"
+ "fmla z23.s, z11.s, z5.s[0]\n"
+ "fmla z27.s, z11.s, z6.s[0]\n"
+ "fmla z31.s, z11.s, z7.s[0]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z4.s[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.s, z12.s, z5.s[1]\n"
+ "fmla z24.s, z12.s, z6.s[1]\n"
+ "fmla z28.s, z12.s, z7.s[1]\n"
+ "fmla z17.s, z13.s, z4.s[1]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z21.s, z13.s, z5.s[1]\n"
+ "fmla z25.s, z13.s, z6.s[1]\n"
+ "fmla z29.s, z13.s, z7.s[1]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z14.s, z4.s[1]\n"
+ "fmla z22.s, z14.s, z5.s[1]\n"
+ "fmla z26.s, z14.s, z6.s[1]\n"
+ "fmla z30.s, z14.s, z7.s[1]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z15.s, z4.s[1]\n"
+ "fmla z23.s, z15.s, z5.s[1]\n"
+ "fmla z27.s, z15.s, z6.s[1]\n"
+ "fmla z31.s, z15.s, z7.s[1]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.s, z8.s, z5.s[2]\n"
+ "fmla z24.s, z8.s, z6.s[2]\n"
+ "fmla z28.s, z8.s, z7.s[2]\n"
+ "fmla z17.s, z9.s, z4.s[2]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z21.s, z9.s, z5.s[2]\n"
+ "fmla z25.s, z9.s, z6.s[2]\n"
+ "fmla z29.s, z9.s, z7.s[2]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z10.s, z4.s[2]\n"
+ "fmla z22.s, z10.s, z5.s[2]\n"
+ "fmla z26.s, z10.s, z6.s[2]\n"
+ "fmla z30.s, z10.s, z7.s[2]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z11.s, z4.s[2]\n"
+ "fmla z23.s, z11.s, z5.s[2]\n"
+ "fmla z27.s, z11.s, z6.s[2]\n"
+ "fmla z31.s, z11.s, z7.s[2]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z4.s[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.s, z12.s, z5.s[3]\n"
+ "fmla z24.s, z12.s, z6.s[3]\n"
+ "fmla z28.s, z12.s, z7.s[3]\n"
+ "fmla z17.s, z13.s, z4.s[3]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z21.s, z13.s, z5.s[3]\n"
+ "fmla z25.s, z13.s, z6.s[3]\n"
+ "fmla z29.s, z13.s, z7.s[3]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z14.s, z4.s[3]\n"
+ "fmla z22.s, z14.s, z5.s[3]\n"
+ "fmla z26.s, z14.s, z6.s[3]\n"
+ "fmla z30.s, z14.s, z7.s[3]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z15.s, z4.s[3]\n"
+ "fmla z23.s, z15.s, z5.s[3]\n"
+ "fmla z27.s, z15.s, z6.s[3]\n"
+ "fmla z31.s, z15.s, z7.s[3]\n"
+ "b.ne 4b\n"
+ "3:\n"
+ "cbz %[regs], 5f\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z20.s, z8.s, z1.s[0]\n"
+ "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+ "fmla z24.s, z8.s, z2.s[0]\n"
+ "ld1rqw z5.s, p7/z, [a_ptr1]\n"
+ "fmla z28.s, z8.s, z3.s[0]\n"
+ "ld1rqw z6.s, p7/z, [a_ptr2]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "ld1rqw z7.s, p7/z, [a_ptr3]\n"
+ "fmla z21.s, z9.s, z1.s[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z25.s, z9.s, z2.s[0]\n"
+ "fmla z29.s, z9.s, z3.s[0]\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "fmla z22.s, z10.s, z1.s[0]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z26.s, z10.s, z2.s[0]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z30.s, z10.s, z3.s[0]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "fmla z23.s, z11.s, z1.s[0]\n"
+ "fmla z27.s, z11.s, z2.s[0]\n"
+ "fmla z31.s, z11.s, z3.s[0]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.s, z12.s, z1.s[1]\n"
+ "fmla z24.s, z12.s, z2.s[1]\n"
+ "fmla z28.s, z12.s, z3.s[1]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z21.s, z13.s, z1.s[1]\n"
+ "fmla z25.s, z13.s, z2.s[1]\n"
+ "fmla z29.s, z13.s, z3.s[1]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "fmla z22.s, z14.s, z1.s[1]\n"
+ "fmla z26.s, z14.s, z2.s[1]\n"
+ "fmla z30.s, z14.s, z3.s[1]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "fmla z23.s, z15.s, z1.s[1]\n"
+ "fmla z27.s, z15.s, z2.s[1]\n"
+ "fmla z31.s, z15.s, z3.s[1]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.s, z8.s, z1.s[2]\n"
+ "fmla z24.s, z8.s, z2.s[2]\n"
+ "fmla z28.s, z8.s, z3.s[2]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z21.s, z9.s, z1.s[2]\n"
+ "fmla z25.s, z9.s, z2.s[2]\n"
+ "fmla z29.s, z9.s, z3.s[2]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "fmla z22.s, z10.s, z1.s[2]\n"
+ "fmla z26.s, z10.s, z2.s[2]\n"
+ "fmla z30.s, z10.s, z3.s[2]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "fmla z23.s, z11.s, z1.s[2]\n"
+ "fmla z27.s, z11.s, z2.s[2]\n"
+ "fmla z31.s, z11.s, z3.s[2]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.s, z12.s, z1.s[3]\n"
+ "fmla z24.s, z12.s, z2.s[3]\n"
+ "fmla z28.s, z12.s, z3.s[3]\n"
+ "fmla z17.s, z13.s, z0.s[3]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z21.s, z13.s, z1.s[3]\n"
+ "fmla z25.s, z13.s, z2.s[3]\n"
+ "fmla z29.s, z13.s, z3.s[3]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z14.s, z0.s[3]\n"
+ "fmla z22.s, z14.s, z1.s[3]\n"
+ "fmla z26.s, z14.s, z2.s[3]\n"
+ "fmla z30.s, z14.s, z3.s[3]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
+ "fmla z23.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
+ "fmla z27.s, z15.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[0]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n"
+ "fmla z20.s, z8.s, z5.s[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z24.s, z8.s, z6.s[0]\n"
+ "fmla z28.s, z8.s, z7.s[0]\n"
+ "fmla z17.s, z9.s, z4.s[0]\n"
+ "fmla z21.s, z9.s, z5.s[0]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z25.s, z9.s, z6.s[0]\n"
+ "fmla z29.s, z9.s, z7.s[0]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z10.s, z4.s[0]\n"
+ "fmla z22.s, z10.s, z5.s[0]\n"
+ "fmla z26.s, z10.s, z6.s[0]\n"
+ "fmla z30.s, z10.s, z7.s[0]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z11.s, z4.s[0]\n"
+ "fmla z23.s, z11.s, z5.s[0]\n"
+ "fmla z27.s, z11.s, z6.s[0]\n"
+ "fmla z31.s, z11.s, z7.s[0]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z4.s[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.s, z12.s, z5.s[1]\n"
+ "fmla z24.s, z12.s, z6.s[1]\n"
+ "fmla z28.s, z12.s, z7.s[1]\n"
+ "fmla z17.s, z13.s, z4.s[1]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z21.s, z13.s, z5.s[1]\n"
+ "fmla z25.s, z13.s, z6.s[1]\n"
+ "fmla z29.s, z13.s, z7.s[1]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z14.s, z4.s[1]\n"
+ "fmla z22.s, z14.s, z5.s[1]\n"
+ "fmla z26.s, z14.s, z6.s[1]\n"
+ "fmla z30.s, z14.s, z7.s[1]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z15.s, z4.s[1]\n"
+ "fmla z23.s, z15.s, z5.s[1]\n"
+ "fmla z27.s, z15.s, z6.s[1]\n"
+ "fmla z31.s, z15.s, z7.s[1]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[2]\n"
+ "fmla z20.s, z8.s, z5.s[2]\n"
+ "fmla z24.s, z8.s, z6.s[2]\n"
+ "fmla z28.s, z8.s, z7.s[2]\n"
+ "fmla z17.s, z9.s, z4.s[2]\n"
+ "fmla z21.s, z9.s, z5.s[2]\n"
+ "fmla z25.s, z9.s, z6.s[2]\n"
+ "fmla z29.s, z9.s, z7.s[2]\n"
+ "fmla z18.s, z10.s, z4.s[2]\n"
+ "fmla z22.s, z10.s, z5.s[2]\n"
+ "fmla z26.s, z10.s, z6.s[2]\n"
+ "fmla z30.s, z10.s, z7.s[2]\n"
+ "fmla z19.s, z11.s, z4.s[2]\n"
+ "fmla z23.s, z11.s, z5.s[2]\n"
+ "fmla z27.s, z11.s, z6.s[2]\n"
+ "fmla z31.s, z11.s, z7.s[2]\n"
+ "fmla z16.s, z12.s, z4.s[3]\n"
+ "fmla z20.s, z12.s, z5.s[3]\n"
+ "fmla z24.s, z12.s, z6.s[3]\n"
+ "fmla z28.s, z12.s, z7.s[3]\n"
+ "fmla z17.s, z13.s, z4.s[3]\n"
+ "fmla z21.s, z13.s, z5.s[3]\n"
+ "fmla z25.s, z13.s, z6.s[3]\n"
+ "fmla z29.s, z13.s, z7.s[3]\n"
+ "fmla z18.s, z14.s, z4.s[3]\n"
+ "fmla z22.s, z14.s, z5.s[3]\n"
+ "fmla z26.s, z14.s, z6.s[3]\n"
+ "fmla z30.s, z14.s, z7.s[3]\n"
+ "fmla z19.s, z15.s, z4.s[3]\n"
+ "fmla z23.s, z15.s, z5.s[3]\n"
+ "fmla z27.s, z15.s, z6.s[3]\n"
+ "fmla z31.s, z15.s, z7.s[3]\n"
+ "cbz %[blocks], 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "fmla z20.s, z8.s, z1.s[0]\n"
+ "fmla z24.s, z8.s, z2.s[0]\n"
+ "fmla z28.s, z8.s, z3.s[0]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "fmla z21.s, z9.s, z1.s[0]\n"
+ "fmla z25.s, z9.s, z2.s[0]\n"
+ "fmla z29.s, z9.s, z3.s[0]\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "fmla z22.s, z10.s, z1.s[0]\n"
+ "fmla z26.s, z10.s, z2.s[0]\n"
+ "fmla z30.s, z10.s, z3.s[0]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "fmla z23.s, z11.s, z1.s[0]\n"
+ "fmla z27.s, z11.s, z2.s[0]\n"
+ "fmla z31.s, z11.s, z3.s[0]\n"
+ "b.eq 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "fmla z20.s, z12.s, z1.s[1]\n"
+ "fmla z24.s, z12.s, z2.s[1]\n"
+ "fmla z28.s, z12.s, z3.s[1]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "fmla z21.s, z13.s, z1.s[1]\n"
+ "fmla z25.s, z13.s, z2.s[1]\n"
+ "fmla z29.s, z13.s, z3.s[1]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "fmla z22.s, z14.s, z1.s[1]\n"
+ "fmla z26.s, z14.s, z2.s[1]\n"
+ "fmla z30.s, z14.s, z3.s[1]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "fmla z23.s, z15.s, z1.s[1]\n"
+ "fmla z27.s, z15.s, z2.s[1]\n"
+ "fmla z31.s, z15.s, z3.s[1]\n"
+ "b.eq 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "fmla z20.s, z8.s, z1.s[2]\n"
+ "fmla z24.s, z8.s, z2.s[2]\n"
+ "fmla z28.s, z8.s, z3.s[2]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "fmla z21.s, z9.s, z1.s[2]\n"
+ "fmla z25.s, z9.s, z2.s[2]\n"
+ "fmla z29.s, z9.s, z3.s[2]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "fmla z22.s, z10.s, z1.s[2]\n"
+ "fmla z26.s, z10.s, z2.s[2]\n"
+ "fmla z30.s, z10.s, z3.s[2]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "fmla z23.s, z11.s, z1.s[2]\n"
+ "fmla z27.s, z11.s, z2.s[2]\n"
+ "fmla z31.s, z11.s, z3.s[2]\n"
+ "b 6f\n"
+ "5:\n"
+ "fmla z16.s, z8.s, z0.s[0]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z20.s, z8.s, z1.s[0]\n"
+ "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
+ "fmla z24.s, z8.s, z2.s[0]\n"
+ "ld1rqw z5.s, p6/z, [a_ptr1]\n"
+ "fmla z28.s, z8.s, z3.s[0]\n"
+ "ld1rqw z6.s, p6/z, [a_ptr2]\n"
+ "fmla z17.s, z9.s, z0.s[0]\n"
+ "ld1rqw z7.s, p6/z, [a_ptr3]\n"
+ "fmla z21.s, z9.s, z1.s[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z25.s, z9.s, z2.s[0]\n"
+ "fmla z29.s, z9.s, z3.s[0]\n"
+ "fmla z18.s, z10.s, z0.s[0]\n"
+ "fmla z22.s, z10.s, z1.s[0]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z26.s, z10.s, z2.s[0]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z30.s, z10.s, z3.s[0]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z11.s, z0.s[0]\n"
+ "fmla z23.s, z11.s, z1.s[0]\n"
+ "fmla z27.s, z11.s, z2.s[0]\n"
+ "fmla z31.s, z11.s, z3.s[0]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z0.s[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.s, z12.s, z1.s[1]\n"
+ "fmla z24.s, z12.s, z2.s[1]\n"
+ "fmla z28.s, z12.s, z3.s[1]\n"
+ "fmla z17.s, z13.s, z0.s[1]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "fmla z21.s, z13.s, z1.s[1]\n"
+ "fmla z25.s, z13.s, z2.s[1]\n"
+ "fmla z29.s, z13.s, z3.s[1]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.s, z14.s, z0.s[1]\n"
+ "fmla z22.s, z14.s, z1.s[1]\n"
+ "fmla z26.s, z14.s, z2.s[1]\n"
+ "fmla z30.s, z14.s, z3.s[1]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.s, z15.s, z0.s[1]\n"
+ "fmla z23.s, z15.s, z1.s[1]\n"
+ "fmla z27.s, z15.s, z2.s[1]\n"
+ "fmla z31.s, z15.s, z3.s[1]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z0.s[2]\n"
+ "fmla z20.s, z8.s, z1.s[2]\n"
+ "fmla z24.s, z8.s, z2.s[2]\n"
+ "fmla z28.s, z8.s, z3.s[2]\n"
+ "fmla z17.s, z9.s, z0.s[2]\n"
+ "fmla z21.s, z9.s, z1.s[2]\n"
+ "fmla z25.s, z9.s, z2.s[2]\n"
+ "fmla z29.s, z9.s, z3.s[2]\n"
+ "fmla z18.s, z10.s, z0.s[2]\n"
+ "fmla z22.s, z10.s, z1.s[2]\n"
+ "fmla z26.s, z10.s, z2.s[2]\n"
+ "fmla z30.s, z10.s, z3.s[2]\n"
+ "fmla z19.s, z11.s, z0.s[2]\n"
+ "fmla z23.s, z11.s, z1.s[2]\n"
+ "fmla z27.s, z11.s, z2.s[2]\n"
+ "fmla z31.s, z11.s, z3.s[2]\n"
+ "fmla z16.s, z12.s, z0.s[3]\n"
+ "fmla z20.s, z12.s, z1.s[3]\n"
+ "fmla z24.s, z12.s, z2.s[3]\n"
+ "fmla z28.s, z12.s, z3.s[3]\n"
+ "fmla z17.s, z13.s, z0.s[3]\n"
+ "fmla z21.s, z13.s, z1.s[3]\n"
+ "fmla z25.s, z13.s, z2.s[3]\n"
+ "fmla z29.s, z13.s, z3.s[3]\n"
+ "fmla z18.s, z14.s, z0.s[3]\n"
+ "fmla z22.s, z14.s, z1.s[3]\n"
+ "fmla z26.s, z14.s, z2.s[3]\n"
+ "fmla z30.s, z14.s, z3.s[3]\n"
+ "fmla z19.s, z15.s, z0.s[3]\n"
+ "fmla z23.s, z15.s, z1.s[3]\n"
+ "fmla z27.s, z15.s, z2.s[3]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "cbz %[blocks], 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[0]\n"
+ "fmla z20.s, z8.s, z5.s[0]\n"
+ "fmla z24.s, z8.s, z6.s[0]\n"
+ "fmla z28.s, z8.s, z7.s[0]\n"
+ "fmla z17.s, z9.s, z4.s[0]\n"
+ "fmla z21.s, z9.s, z5.s[0]\n"
+ "fmla z25.s, z9.s, z6.s[0]\n"
+ "fmla z29.s, z9.s, z7.s[0]\n"
+ "fmla z18.s, z10.s, z4.s[0]\n"
+ "fmla z22.s, z10.s, z5.s[0]\n"
+ "fmla z26.s, z10.s, z6.s[0]\n"
+ "fmla z30.s, z10.s, z7.s[0]\n"
+ "fmla z19.s, z11.s, z4.s[0]\n"
+ "fmla z23.s, z11.s, z5.s[0]\n"
+ "fmla z27.s, z11.s, z6.s[0]\n"
+ "fmla z31.s, z11.s, z7.s[0]\n"
+ "b.eq 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z12.s, z4.s[1]\n"
+ "fmla z20.s, z12.s, z5.s[1]\n"
+ "fmla z24.s, z12.s, z6.s[1]\n"
+ "fmla z28.s, z12.s, z7.s[1]\n"
+ "fmla z17.s, z13.s, z4.s[1]\n"
+ "fmla z21.s, z13.s, z5.s[1]\n"
+ "fmla z25.s, z13.s, z6.s[1]\n"
+ "fmla z29.s, z13.s, z7.s[1]\n"
+ "fmla z18.s, z14.s, z4.s[1]\n"
+ "fmla z22.s, z14.s, z5.s[1]\n"
+ "fmla z26.s, z14.s, z6.s[1]\n"
+ "fmla z30.s, z14.s, z7.s[1]\n"
+ "fmla z19.s, z15.s, z4.s[1]\n"
+ "fmla z23.s, z15.s, z5.s[1]\n"
+ "fmla z27.s, z15.s, z6.s[1]\n"
+ "fmla z31.s, z15.s, z7.s[1]\n"
+ "b.eq 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.s, z8.s, z4.s[2]\n"
+ "fmla z20.s, z8.s, z5.s[2]\n"
+ "fmla z24.s, z8.s, z6.s[2]\n"
+ "fmla z28.s, z8.s, z7.s[2]\n"
+ "fmla z17.s, z9.s, z4.s[2]\n"
+ "fmla z21.s, z9.s, z5.s[2]\n"
+ "fmla z25.s, z9.s, z6.s[2]\n"
+ "fmla z29.s, z9.s, z7.s[2]\n"
+ "fmla z18.s, z10.s, z4.s[2]\n"
+ "fmla z22.s, z10.s, z5.s[2]\n"
+ "fmla z26.s, z10.s, z6.s[2]\n"
+ "fmla z30.s, z10.s, z7.s[2]\n"
+ "fmla z19.s, z11.s, z4.s[2]\n"
+ "fmla z23.s, z11.s, z5.s[2]\n"
+ "fmla z27.s, z11.s, z6.s[2]\n"
+ "fmla z31.s, z11.s, z7.s[2]\n"
+ "6:\n"
+ "st1w z16.s, p0, [%[c_ptr0]]\n"
+ "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+ "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+ "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #4\n"
+ "st1w z20.s, p0, [c_ptr1]\n"
+ "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
+ "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
+ "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
+ "st1w z24.s, p0, [c_ptr2]\n"
+ "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n"
+ "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n"
+ "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n"
+ "st1w z28.s, p0, [c_ptr3]\n"
+ "st1w z29.s, p1, [c_ptr3, #1, MUL VL]\n"
+ "st1w z30.s, p2, [c_ptr3, #2, MUL VL]\n"
+ "st1w z31.s, p3, [c_ptr3, #3, MUL VL]\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
+ );
+ break;
+ }
+ }
+ }
+}
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4.hpp
new file mode 100644
index 0000000000..f5634e3618
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4.hpp
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __ARM_FEATURE_SVE
+
+#include <cstdint>
+
+namespace arm_gemm
+{
+
+// Actual kernel implementation; its definition in generic.cpp names the arguments (A, lda, B, ldb, C, ldc, beta, M, N, K).
+void sve_native_s8s32_dot_4VLx4(const int8_t *, int, const int8_t *, int ldb, int32_t *, int, int32_t, int, int, int);
+
+class native_s8s32_dot_4VLx4 // Bundles the types, blocking parameters and kernel pointer for the SVE native s8->s32 dot-product kernel.
+{
+public:
+ typedef int8_t operand_type; // matches the const int8_t * pointers taken by kern_type
+ typedef int32_t result_type; // matches the int32_t * pointer taken by kern_type
+ // Function-pointer type with the same signature as sve_native_s8s32_dot_4VLx4.
+ typedef void (*kern_type)(const int8_t *, int, const int8_t *, int ldb, int32_t *, int, int32_t, int, int, int);
+
+ /* Kernel blocking parameters */
+ static int out_height()
+ {
+ return 4; // NOTE(review): presumably the number of output rows handled per kernel pass - confirm
+ }
+
+ static int out_width()
+ {
+ return get_vector_length<int32_t>() * 4; // four times the int32 vector length reported by get_vector_length
+ }
+
+ static int k_unroll()
+ {
+ return 4; // NOTE(review): presumably the K granularity the kernel expects - confirm against callers
+ }
+
+
+
+ // Default to the generic kernel
+ kern_type kernel=sve_native_s8s32_dot_4VLx4;
+
+ native_s8s32_dot_4VLx4(const CPUInfo *) // parameter left unnamed: it is never read, and naming it trips -Wunused-parameter
+ {
+ // Nothing to choose between: kernel already holds its default initialiser.
+ }
+};
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4/generic.cpp
new file mode 100644
index 0000000000..9c02d95044
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4/generic.cpp
@@ -0,0 +1,4632 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+
+#include <algorithm>
+
+#include <cstdint>
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int ldb, int32_t *C, int ldc, int32_t beta, int M, int N, int K) {
+ const long beta0 = (beta == 0);
+ const long loops_count = ((K + 16) / 32) - 1;
+ K -= loops_count * 32;
+ const long regs_count = (K / 16) - 1;
+ K -= (regs_count + 1) * 16;
+ const long leftovers = K;
+ const long blocks_count = K / 4;
+ const long odds_count = K - (blocks_count * 4);
+
+ for (int y=0; y<M; y+=4) {
+ const int8_t * const a_ptr0_base = A + (y * lda);
+ const unsigned long ldab = lda * sizeof(int8_t);
+
+ int32_t *c_ptr0 = C + (y * ldc);
+ const unsigned long ldcb = ldc * sizeof(int32_t);
+
+ for (int x0=0; x0<N; x0+=(4 * get_vector_length<int32_t>())) {
+ const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<int32_t>()));
+ const int32_t *betaptr = &beta;
+ long loops = loops_count;
+ long regs = regs_count;
+ long temp = 0;
+ long blocks = blocks_count;
+ long odds = odds_count;
+ const int8_t *a_ptr0 = a_ptr0_base;
+ const int8_t *b_ptr0 = B + x0;
+ const int8_t *b_ptr1 = b_ptr0 + ldb;
+ const int8_t *b_ptr2 = b_ptr1 + ldb;
+ const int8_t *b_ptr3 = b_ptr2 + ldb;
+ long ldbb = ldb * sizeof(int8_t) * 4;
+
+ switch(M-y) {
+ case 1:
+ __asm __volatile (
+ "whilelt p6.b, %[temp], %[leftovers]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "whilelt p4.b, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "ptrue p7.b\n"
+ "whilelt p1.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p2.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p3.s, %[temp], %[width]\n"
+ "cbz %[beta0], 1f\n"
+ "mov z16.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mov z17.s, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "mov z18.s, #0\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "mov z19.s, #0\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "b 2f\n"
+ "1:\n"
+ "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+ "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+ "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+ "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "mul z16.s, p7/m, z16.s, z15.s\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "mul z17.s, p7/m, z17.s, z15.s\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "mul z18.s, p7/m, z18.s, z15.s\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "mul z19.s, p7/m, z19.s, z15.s\n"
+ "2:\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "cbz %[loops], 3f\n"
+ "4:\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z16.s, z12.b, z0.b[3]\n"
+ "sdot z17.s, z13.b, z0.b[3]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z14.b, z0.b[3]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z19.s, z15.b, z0.b[3]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z16.s, z8.b, z4.b[0]\n"
+ "sdot z17.s, z9.b, z4.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z10.b, z4.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z19.s, z11.b, z4.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z16.s, z12.b, z4.b[1]\n"
+ "sdot z17.s, z13.b, z4.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z14.b, z4.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z19.s, z15.b, z4.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z16.s, z8.b, z4.b[2]\n"
+ "sdot z17.s, z9.b, z4.b[2]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z10.b, z4.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z19.s, z11.b, z4.b[2]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z16.s, z12.b, z4.b[3]\n"
+ "sdot z17.s, z13.b, z4.b[3]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z14.b, z4.b[3]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z19.s, z15.b, z4.b[3]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "b.ne 4b\n"
+ "3:\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "cbz %[regs], 5f\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z16.s, z12.b, z0.b[3]\n"
+ "sdot z17.s, z13.b, z0.b[3]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z14.b, z0.b[3]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z19.s, z15.b, z0.b[3]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z16.s, z8.b, z4.b[0]\n"
+ "sdot z17.s, z9.b, z4.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z10.b, z4.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z19.s, z11.b, z4.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z16.s, z12.b, z4.b[1]\n"
+ "sdot z17.s, z13.b, z4.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z14.b, z4.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z19.s, z15.b, z4.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z16.s, z8.b, z4.b[2]\n"
+ "sdot z17.s, z9.b, z4.b[2]\n"
+ "sdot z18.s, z10.b, z4.b[2]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z19.s, z11.b, z4.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z16.s, z12.b, z4.b[3]\n"
+ "sdot z17.s, z13.b, z4.b[3]\n"
+ "sdot z18.s, z14.b, z4.b[3]\n"
+ "sdot z19.s, z15.b, z4.b[3]\n"
+ "cbz %[blocks], 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "b.eq 7f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "b.eq 8f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 10f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 11f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 12f\n"
+ "11:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 12f\n"
+ "10:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "12:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z16.s, z12.b, z0.b[3]\n"
+ "sdot z17.s, z13.b, z0.b[3]\n"
+ "sdot z18.s, z14.b, z0.b[3]\n"
+ "sdot z19.s, z15.b, z0.b[3]\n"
+ "b 9f\n"
+ "8:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 13f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 14f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 15f\n"
+ "14:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 15f\n"
+ "13:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "15:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "b 9f\n"
+ "7:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 16f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 17f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 18f\n"
+ "17:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 18f\n"
+ "16:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "18:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "b 9f\n"
+ "6:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 19f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 20f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 21f\n"
+ "20:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 21f\n"
+ "19:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "21:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "b 9f\n"
+ "5:\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z16.s, z12.b, z0.b[3]\n"
+ "sdot z17.s, z13.b, z0.b[3]\n"
+ "sdot z18.s, z14.b, z0.b[3]\n"
+ "sdot z19.s, z15.b, z0.b[3]\n"
+ "cbz %[blocks], 22f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z4.b[0]\n"
+ "sdot z17.s, z9.b, z4.b[0]\n"
+ "sdot z18.s, z10.b, z4.b[0]\n"
+ "sdot z19.s, z11.b, z4.b[0]\n"
+ "b.eq 23f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z16.s, z12.b, z4.b[1]\n"
+ "sdot z17.s, z13.b, z4.b[1]\n"
+ "sdot z18.s, z14.b, z4.b[1]\n"
+ "sdot z19.s, z15.b, z4.b[1]\n"
+ "b.eq 24f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z4.b[2]\n"
+ "sdot z17.s, z9.b, z4.b[2]\n"
+ "sdot z18.s, z10.b, z4.b[2]\n"
+ "sdot z19.s, z11.b, z4.b[2]\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 25f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 26f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 27f\n"
+ "26:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 27f\n"
+ "25:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "27:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z16.s, z12.b, z4.b[3]\n"
+ "sdot z17.s, z13.b, z4.b[3]\n"
+ "sdot z18.s, z14.b, z4.b[3]\n"
+ "sdot z19.s, z15.b, z4.b[3]\n"
+ "b 9f\n"
+ "24:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 28f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 29f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 30f\n"
+ "29:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 30f\n"
+ "28:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "30:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z4.b[2]\n"
+ "sdot z17.s, z9.b, z4.b[2]\n"
+ "sdot z18.s, z10.b, z4.b[2]\n"
+ "sdot z19.s, z11.b, z4.b[2]\n"
+ "b 9f\n"
+ "23:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 31f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 32f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 33f\n"
+ "32:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 33f\n"
+ "31:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "33:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z16.s, z12.b, z4.b[1]\n"
+ "sdot z17.s, z13.b, z4.b[1]\n"
+ "sdot z18.s, z14.b, z4.b[1]\n"
+ "sdot z19.s, z15.b, z4.b[1]\n"
+ "b 9f\n"
+ "22:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 34f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 35f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 36f\n"
+ "35:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 36f\n"
+ "34:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "36:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z4.b[0]\n"
+ "sdot z17.s, z9.b, z4.b[0]\n"
+ "sdot z18.s, z10.b, z4.b[0]\n"
+ "sdot z19.s, z11.b, z4.b[0]\n"
+ "9:\n"
+ "st1w z16.s, p0, [%[c_ptr0]]\n"
+ "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+ "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+ "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #4\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [b_ptr1] "+r" (b_ptr1), [b_ptr2] "+r" (b_ptr2), [b_ptr3] "+r" (b_ptr3), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 2:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "c_ptr1 .req X1\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.b, %[temp], %[leftovers]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "whilelt p4.b, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "ptrue p7.b\n"
+ "whilelt p1.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p2.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p3.s, %[temp], %[width]\n"
+ "cbz %[beta0], 1f\n"
+ "mov z16.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mov z17.s, #0\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mov z18.s, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "mov z19.s, #0\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "mov z20.s, #0\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "mov z21.s, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "mov z22.s, #0\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "mov z23.s, #0\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 2f\n"
+ "1:\n"
+ "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+ "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+ "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+ "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+ "ld1w z20.s, p0/z, [c_ptr1]\n"
+ "mul z16.s, p7/m, z16.s, z15.s\n"
+ "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+ "mul z17.s, p7/m, z17.s, z15.s\n"
+ "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+ "mul z18.s, p7/m, z18.s, z15.s\n"
+ "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+ "mul z19.s, p7/m, z19.s, z15.s\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mul z20.s, p7/m, z20.s, z15.s\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mul z21.s, p7/m, z21.s, z15.s\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "mul z22.s, p7/m, z22.s, z15.s\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "mul z23.s, p7/m, z23.s, z15.s\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "2:\n"
+ "cbz %[loops], 3f\n"
+ "4:\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+ "sdot z21.s, z9.b, z1.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z22.s, z10.b, z1.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z23.s, z11.b, z1.b[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z12.b, z1.b[1]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "sdot z21.s, z13.b, z1.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z22.s, z14.b, z1.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z23.s, z15.b, z1.b[1]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z8.b, z1.b[2]\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "sdot z21.s, z9.b, z1.b[2]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "sdot z22.s, z10.b, z1.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z23.s, z11.b, z1.b[2]\n"
+ "sdot z16.s, z12.b, z0.b[3]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z20.s, z12.b, z1.b[3]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z17.s, z13.b, z0.b[3]\n"
+ "sdot z21.s, z13.b, z1.b[3]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "sdot z18.s, z14.b, z0.b[3]\n"
+ "sdot z22.s, z14.b, z1.b[3]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z15.b, z0.b[3]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
+ "sdot z23.s, z15.b, z1.b[3]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z16.s, z8.b, z4.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z20.s, z8.b, z5.b[0]\n"
+ "sdot z17.s, z9.b, z4.b[0]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z21.s, z9.b, z5.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z10.b, z4.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z22.s, z10.b, z5.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z19.s, z11.b, z4.b[0]\n"
+ "sdot z23.s, z11.b, z5.b[0]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z16.s, z12.b, z4.b[1]\n"
+ "sdot z20.s, z12.b, z5.b[1]\n"
+ "sdot z17.s, z13.b, z4.b[1]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z21.s, z13.b, z5.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z14.b, z4.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z22.s, z14.b, z5.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z19.s, z15.b, z4.b[1]\n"
+ "sdot z23.s, z15.b, z5.b[1]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z16.s, z8.b, z4.b[2]\n"
+ "sdot z20.s, z8.b, z5.b[2]\n"
+ "sdot z17.s, z9.b, z4.b[2]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z21.s, z9.b, z5.b[2]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z10.b, z4.b[2]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z22.s, z10.b, z5.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z19.s, z11.b, z4.b[2]\n"
+ "sdot z23.s, z11.b, z5.b[2]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z16.s, z12.b, z4.b[3]\n"
+ "sdot z20.s, z12.b, z5.b[3]\n"
+ "sdot z17.s, z13.b, z4.b[3]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z21.s, z13.b, z5.b[3]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z14.b, z4.b[3]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z22.s, z14.b, z5.b[3]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "sdot z19.s, z15.b, z4.b[3]\n"
+ "sdot z23.s, z15.b, z5.b[3]\n"
+ "b.ne 4b\n"
+ "3:\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "cbz %[regs], 5f\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+ "sdot z21.s, z9.b, z1.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z22.s, z10.b, z1.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z23.s, z11.b, z1.b[0]\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z20.s, z12.b, z1.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "sdot z21.s, z13.b, z1.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "sdot z22.s, z14.b, z1.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z23.s, z15.b, z1.b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "sdot z20.s, z8.b, z1.b[2]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z21.s, z9.b, z1.b[2]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "sdot z22.s, z10.b, z1.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z23.s, z11.b, z1.b[2]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z16.s, z12.b, z0.b[3]\n"
+ "sdot z20.s, z12.b, z1.b[3]\n"
+ "sdot z17.s, z13.b, z0.b[3]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z21.s, z13.b, z1.b[3]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z14.b, z0.b[3]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z22.s, z14.b, z1.b[3]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z19.s, z15.b, z0.b[3]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
+ "sdot z23.s, z15.b, z1.b[3]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z16.s, z8.b, z4.b[0]\n"
+ "sdot z20.s, z8.b, z5.b[0]\n"
+ "sdot z17.s, z9.b, z4.b[0]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z21.s, z9.b, z5.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z10.b, z4.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z22.s, z10.b, z5.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z19.s, z11.b, z4.b[0]\n"
+ "sdot z23.s, z11.b, z5.b[0]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z16.s, z12.b, z4.b[1]\n"
+ "sdot z20.s, z12.b, z5.b[1]\n"
+ "sdot z17.s, z13.b, z4.b[1]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z21.s, z13.b, z5.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z14.b, z4.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z22.s, z14.b, z5.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "sdot z19.s, z15.b, z4.b[1]\n"
+ "sdot z23.s, z15.b, z5.b[1]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z16.s, z8.b, z4.b[2]\n"
+ "sdot z20.s, z8.b, z5.b[2]\n"
+ "sdot z17.s, z9.b, z4.b[2]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z21.s, z9.b, z5.b[2]\n"
+ "sdot z18.s, z10.b, z4.b[2]\n"
+ "sdot z22.s, z10.b, z5.b[2]\n"
+ "sdot z19.s, z11.b, z4.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "sdot z23.s, z11.b, z5.b[2]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z16.s, z12.b, z4.b[3]\n"
+ "sdot z20.s, z12.b, z5.b[3]\n"
+ "sdot z17.s, z13.b, z4.b[3]\n"
+ "sdot z21.s, z13.b, z5.b[3]\n"
+ "sdot z18.s, z14.b, z4.b[3]\n"
+ "sdot z22.s, z14.b, z5.b[3]\n"
+ "sdot z19.s, z15.b, z4.b[3]\n"
+ "sdot z23.s, z15.b, z5.b[3]\n"
+ "cbz %[blocks], 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "sdot z20.s, z8.b, z1.b[0]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "sdot z21.s, z9.b, z1.b[0]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "sdot z22.s, z10.b, z1.b[0]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "sdot z23.s, z11.b, z1.b[0]\n"
+ "b.eq 7f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "sdot z20.s, z12.b, z1.b[1]\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "sdot z21.s, z13.b, z1.b[1]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "sdot z22.s, z14.b, z1.b[1]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "sdot z23.s, z15.b, z1.b[1]\n"
+ "b.eq 8f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "sdot z20.s, z8.b, z1.b[2]\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "sdot z21.s, z9.b, z1.b[2]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "sdot z22.s, z10.b, z1.b[2]\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "sdot z23.s, z11.b, z1.b[2]\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 10f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 11f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 12f\n"
+ "11:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 12f\n"
+ "10:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "12:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z16.s, z12.b, z0.b[3]\n"
+ "sdot z20.s, z12.b, z1.b[3]\n"
+ "sdot z17.s, z13.b, z0.b[3]\n"
+ "sdot z21.s, z13.b, z1.b[3]\n"
+ "sdot z18.s, z14.b, z0.b[3]\n"
+ "sdot z22.s, z14.b, z1.b[3]\n"
+ "sdot z19.s, z15.b, z0.b[3]\n"
+ "sdot z23.s, z15.b, z1.b[3]\n"
+ "b 9f\n"
+ "8:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 13f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 14f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 15f\n"
+ "14:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 15f\n"
+ "13:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "15:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "sdot z20.s, z8.b, z1.b[2]\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "sdot z21.s, z9.b, z1.b[2]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "sdot z22.s, z10.b, z1.b[2]\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "sdot z23.s, z11.b, z1.b[2]\n"
+ "b 9f\n"
+ "7:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 16f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 17f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 18f\n"
+ "17:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 18f\n"
+ "16:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "18:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "sdot z20.s, z12.b, z1.b[1]\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "sdot z21.s, z13.b, z1.b[1]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "sdot z22.s, z14.b, z1.b[1]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "sdot z23.s, z15.b, z1.b[1]\n"
+ "b 9f\n"
+ "6:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 19f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 20f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 21f\n"
+ "20:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 21f\n"
+ "19:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "21:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "sdot z20.s, z8.b, z1.b[0]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "sdot z21.s, z9.b, z1.b[0]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "sdot z22.s, z10.b, z1.b[0]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "sdot z23.s, z11.b, z1.b[0]\n"
+ "b 9f\n"
+ "5:\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr1]\n"
+ "sdot z21.s, z9.b, z1.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z22.s, z10.b, z1.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z23.s, z11.b, z1.b[0]\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z20.s, z12.b, z1.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "sdot z21.s, z13.b, z1.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "sdot z22.s, z14.b, z1.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "sdot z23.s, z15.b, z1.b[1]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "sdot z20.s, z8.b, z1.b[2]\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z21.s, z9.b, z1.b[2]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "sdot z22.s, z10.b, z1.b[2]\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "sdot z23.s, z11.b, z1.b[2]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z16.s, z12.b, z0.b[3]\n"
+ "sdot z20.s, z12.b, z1.b[3]\n"
+ "sdot z17.s, z13.b, z0.b[3]\n"
+ "sdot z21.s, z13.b, z1.b[3]\n"
+ "sdot z18.s, z14.b, z0.b[3]\n"
+ "sdot z22.s, z14.b, z1.b[3]\n"
+ "sdot z19.s, z15.b, z0.b[3]\n"
+ "sdot z23.s, z15.b, z1.b[3]\n"
+ "cbz %[blocks], 22f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z4.b[0]\n"
+ "sdot z20.s, z8.b, z5.b[0]\n"
+ "sdot z17.s, z9.b, z4.b[0]\n"
+ "sdot z21.s, z9.b, z5.b[0]\n"
+ "sdot z18.s, z10.b, z4.b[0]\n"
+ "sdot z22.s, z10.b, z5.b[0]\n"
+ "sdot z19.s, z11.b, z4.b[0]\n"
+ "sdot z23.s, z11.b, z5.b[0]\n"
+ "b.eq 23f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z16.s, z12.b, z4.b[1]\n"
+ "sdot z20.s, z12.b, z5.b[1]\n"
+ "sdot z17.s, z13.b, z4.b[1]\n"
+ "sdot z21.s, z13.b, z5.b[1]\n"
+ "sdot z18.s, z14.b, z4.b[1]\n"
+ "sdot z22.s, z14.b, z5.b[1]\n"
+ "sdot z19.s, z15.b, z4.b[1]\n"
+ "sdot z23.s, z15.b, z5.b[1]\n"
+ "b.eq 24f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z4.b[2]\n"
+ "sdot z20.s, z8.b, z5.b[2]\n"
+ "sdot z17.s, z9.b, z4.b[2]\n"
+ "sdot z21.s, z9.b, z5.b[2]\n"
+ "sdot z18.s, z10.b, z4.b[2]\n"
+ "sdot z22.s, z10.b, z5.b[2]\n"
+ "sdot z19.s, z11.b, z4.b[2]\n"
+ "sdot z23.s, z11.b, z5.b[2]\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 25f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 26f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 27f\n"
+ "26:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 27f\n"
+ "25:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "27:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z16.s, z12.b, z4.b[3]\n"
+ "sdot z20.s, z12.b, z5.b[3]\n"
+ "sdot z17.s, z13.b, z4.b[3]\n"
+ "sdot z21.s, z13.b, z5.b[3]\n"
+ "sdot z18.s, z14.b, z4.b[3]\n"
+ "sdot z22.s, z14.b, z5.b[3]\n"
+ "sdot z19.s, z15.b, z4.b[3]\n"
+ "sdot z23.s, z15.b, z5.b[3]\n"
+ "b 9f\n"
+ "24:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 28f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 29f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 30f\n"
+ "29:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 30f\n"
+ "28:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "30:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z4.b[2]\n"
+ "sdot z20.s, z8.b, z5.b[2]\n"
+ "sdot z17.s, z9.b, z4.b[2]\n"
+ "sdot z21.s, z9.b, z5.b[2]\n"
+ "sdot z18.s, z10.b, z4.b[2]\n"
+ "sdot z22.s, z10.b, z5.b[2]\n"
+ "sdot z19.s, z11.b, z4.b[2]\n"
+ "sdot z23.s, z11.b, z5.b[2]\n"
+ "b 9f\n"
+ "23:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 31f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 32f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 33f\n"
+ "32:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 33f\n"
+ "31:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "33:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z16.s, z12.b, z4.b[1]\n"
+ "sdot z20.s, z12.b, z5.b[1]\n"
+ "sdot z17.s, z13.b, z4.b[1]\n"
+ "sdot z21.s, z13.b, z5.b[1]\n"
+ "sdot z18.s, z14.b, z4.b[1]\n"
+ "sdot z22.s, z14.b, z5.b[1]\n"
+ "sdot z19.s, z15.b, z4.b[1]\n"
+ "sdot z23.s, z15.b, z5.b[1]\n"
+ "b 9f\n"
+ "22:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 34f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 35f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 36f\n"
+ "35:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 36f\n"
+ "34:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "36:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z4.b[0]\n"
+ "sdot z20.s, z8.b, z5.b[0]\n"
+ "sdot z17.s, z9.b, z4.b[0]\n"
+ "sdot z21.s, z9.b, z5.b[0]\n"
+ "sdot z18.s, z10.b, z4.b[0]\n"
+ "sdot z22.s, z10.b, z5.b[0]\n"
+ "sdot z19.s, z11.b, z4.b[0]\n"
+ "sdot z23.s, z11.b, z5.b[0]\n"
+ "9:\n"
+ "st1w z16.s, p0, [%[c_ptr0]]\n"
+ "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+ "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+ "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #4\n"
+ "st1w z20.s, p0, [c_ptr1]\n"
+ "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
+ "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
+ "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
+ ".unreq a_ptr1\n"
+ ".unreq c_ptr1\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [b_ptr1] "+r" (b_ptr1), [b_ptr2] "+r" (b_ptr2), [b_ptr3] "+r" (b_ptr3), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory"
+ );
+ break;
+ case 3:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "c_ptr1 .req X2\n"
+ "c_ptr2 .req X3\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.b, %[temp], %[leftovers]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "whilelt p4.b, %[temp], %[width]\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "incw %[temp], all, mul #1\n"
+ "ptrue p7.b\n"
+ "whilelt p1.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p2.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p3.s, %[temp], %[width]\n"
+ "cbz %[beta0], 1f\n"
+ "mov z16.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mov z17.s, #0\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mov z18.s, #0\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z19.s, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "mov z20.s, #0\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "mov z21.s, #0\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "mov z22.s, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "mov z23.s, #0\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "mov z24.s, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z25.s, #0\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "mov z26.s, #0\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "mov z27.s, #0\n"
+ "b 2f\n"
+ "1:\n"
+ "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+ "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+ "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+ "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+ "ld1w z20.s, p0/z, [c_ptr1]\n"
+ "mul z16.s, p7/m, z16.s, z15.s\n"
+ "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+ "mul z17.s, p7/m, z17.s, z15.s\n"
+ "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+ "mul z18.s, p7/m, z18.s, z15.s\n"
+ "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+ "mul z19.s, p7/m, z19.s, z15.s\n"
+ "ld1w z24.s, p0/z, [c_ptr2]\n"
+ "mul z20.s, p7/m, z20.s, z15.s\n"
+ "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
+ "mul z21.s, p7/m, z21.s, z15.s\n"
+ "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
+ "mul z22.s, p7/m, z22.s, z15.s\n"
+ "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
+ "mul z23.s, p7/m, z23.s, z15.s\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mul z24.s, p7/m, z24.s, z15.s\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mul z25.s, p7/m, z25.s, z15.s\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mul z26.s, p7/m, z26.s, z15.s\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "mul z27.s, p7/m, z27.s, z15.s\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "2:\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "cbz %[loops], 3f\n"
+ "4:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "sdot z24.s, z8.b, z2.b[0]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr2]\n"
+ "sdot z21.s, z9.b, z1.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z25.s, z9.b, z2.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z22.s, z10.b, z1.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z26.s, z10.b, z2.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z23.s, z11.b, z1.b[0]\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
+ "sdot z27.s, z11.b, z2.b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z12.b, z1.b[1]\n"
+ "sdot z24.s, z12.b, z2.b[1]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "sdot z21.s, z13.b, z1.b[1]\n"
+ "sdot z25.s, z13.b, z2.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z22.s, z14.b, z1.b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z26.s, z14.b, z2.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z23.s, z15.b, z1.b[1]\n"
+ "sdot z27.s, z15.b, z2.b[1]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z8.b, z1.b[2]\n"
+ "sdot z24.s, z8.b, z2.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "sdot z21.s, z9.b, z1.b[2]\n"
+ "sdot z25.s, z9.b, z2.b[2]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z22.s, z10.b, z1.b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z26.s, z10.b, z2.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z23.s, z11.b, z1.b[2]\n"
+ "sdot z27.s, z11.b, z2.b[2]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z12.b, z0.b[3]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z12.b, z1.b[3]\n"
+ "sdot z24.s, z12.b, z2.b[3]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "sdot z17.s, z13.b, z0.b[3]\n"
+ "sdot z21.s, z13.b, z1.b[3]\n"
+ "sdot z25.s, z13.b, z2.b[3]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z14.b, z0.b[3]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z22.s, z14.b, z1.b[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z26.s, z14.b, z2.b[3]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z15.b, z0.b[3]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
+ "sdot z23.s, z15.b, z1.b[3]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n"
+ "sdot z27.s, z15.b, z2.b[3]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2, #-0x10]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z8.b, z4.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z8.b, z5.b[0]\n"
+ "sdot z24.s, z8.b, z6.b[0]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "sdot z17.s, z9.b, z4.b[0]\n"
+ "sdot z21.s, z9.b, z5.b[0]\n"
+ "sdot z25.s, z9.b, z6.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z10.b, z4.b[0]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z22.s, z10.b, z5.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z26.s, z10.b, z6.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z11.b, z4.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z23.s, z11.b, z5.b[0]\n"
+ "sdot z27.s, z11.b, z6.b[0]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z12.b, z4.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z12.b, z5.b[1]\n"
+ "sdot z24.s, z12.b, z6.b[1]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "sdot z17.s, z13.b, z4.b[1]\n"
+ "sdot z21.s, z13.b, z5.b[1]\n"
+ "sdot z25.s, z13.b, z6.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z14.b, z4.b[1]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z22.s, z14.b, z5.b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z26.s, z14.b, z6.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z15.b, z4.b[1]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z23.s, z15.b, z5.b[1]\n"
+ "sdot z27.s, z15.b, z6.b[1]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z8.b, z4.b[2]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z8.b, z5.b[2]\n"
+ "sdot z24.s, z8.b, z6.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "sdot z17.s, z9.b, z4.b[2]\n"
+ "sdot z21.s, z9.b, z5.b[2]\n"
+ "sdot z25.s, z9.b, z6.b[2]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z10.b, z4.b[2]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z22.s, z10.b, z5.b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z26.s, z10.b, z6.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z11.b, z4.b[2]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z23.s, z11.b, z5.b[2]\n"
+ "sdot z27.s, z11.b, z6.b[2]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z12.b, z4.b[3]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z12.b, z5.b[3]\n"
+ "sdot z24.s, z12.b, z6.b[3]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "sdot z17.s, z13.b, z4.b[3]\n"
+ "sdot z21.s, z13.b, z5.b[3]\n"
+ "sdot z25.s, z13.b, z6.b[3]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z14.b, z4.b[3]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z22.s, z14.b, z5.b[3]\n"
+ "sdot z26.s, z14.b, z6.b[3]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z15.b, z4.b[3]\n"
+ "sdot z23.s, z15.b, z5.b[3]\n"
+ "sdot z27.s, z15.b, z6.b[3]\n"
+ "b.ne 4b\n"
+ "3:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "cbz %[regs], 5f\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "sdot z24.s, z8.b, z2.b[0]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr2]\n"
+ "sdot z21.s, z9.b, z1.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z25.s, z9.b, z2.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z22.s, z10.b, z1.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z26.s, z10.b, z2.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z23.s, z11.b, z1.b[0]\n"
+ "sdot z27.s, z11.b, z2.b[0]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z12.b, z1.b[1]\n"
+ "sdot z24.s, z12.b, z2.b[1]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "sdot z21.s, z13.b, z1.b[1]\n"
+ "sdot z25.s, z13.b, z2.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z22.s, z14.b, z1.b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z26.s, z14.b, z2.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z23.s, z15.b, z1.b[1]\n"
+ "sdot z27.s, z15.b, z2.b[1]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z8.b, z1.b[2]\n"
+ "sdot z24.s, z8.b, z2.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "sdot z21.s, z9.b, z1.b[2]\n"
+ "sdot z25.s, z9.b, z2.b[2]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z22.s, z10.b, z1.b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z26.s, z10.b, z2.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z23.s, z11.b, z1.b[2]\n"
+ "sdot z27.s, z11.b, z2.b[2]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z12.b, z0.b[3]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z12.b, z1.b[3]\n"
+ "sdot z24.s, z12.b, z2.b[3]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "sdot z17.s, z13.b, z0.b[3]\n"
+ "sdot z21.s, z13.b, z1.b[3]\n"
+ "sdot z25.s, z13.b, z2.b[3]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z14.b, z0.b[3]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z22.s, z14.b, z1.b[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z26.s, z14.b, z2.b[3]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z15.b, z0.b[3]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
+ "sdot z23.s, z15.b, z1.b[3]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
+ "sdot z27.s, z15.b, z2.b[3]\n"
+ "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z8.b, z4.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z8.b, z5.b[0]\n"
+ "sdot z24.s, z8.b, z6.b[0]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "sdot z17.s, z9.b, z4.b[0]\n"
+ "sdot z21.s, z9.b, z5.b[0]\n"
+ "sdot z25.s, z9.b, z6.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z10.b, z4.b[0]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z22.s, z10.b, z5.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z26.s, z10.b, z6.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z11.b, z4.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z23.s, z11.b, z5.b[0]\n"
+ "sdot z27.s, z11.b, z6.b[0]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z12.b, z4.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z12.b, z5.b[1]\n"
+ "sdot z24.s, z12.b, z6.b[1]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "sdot z17.s, z13.b, z4.b[1]\n"
+ "sdot z21.s, z13.b, z5.b[1]\n"
+ "sdot z25.s, z13.b, z6.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z14.b, z4.b[1]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z22.s, z14.b, z5.b[1]\n"
+ "sdot z26.s, z14.b, z6.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z15.b, z4.b[1]\n"
+ "sdot z23.s, z15.b, z5.b[1]\n"
+ "sdot z27.s, z15.b, z6.b[1]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z8.b, z4.b[2]\n"
+ "sdot z20.s, z8.b, z5.b[2]\n"
+ "sdot z24.s, z8.b, z6.b[2]\n"
+ "sdot z17.s, z9.b, z4.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "sdot z21.s, z9.b, z5.b[2]\n"
+ "sdot z25.s, z9.b, z6.b[2]\n"
+ "sdot z18.s, z10.b, z4.b[2]\n"
+ "sdot z22.s, z10.b, z5.b[2]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z26.s, z10.b, z6.b[2]\n"
+ "sdot z19.s, z11.b, z4.b[2]\n"
+ "sdot z23.s, z11.b, z5.b[2]\n"
+ "sdot z27.s, z11.b, z6.b[2]\n"
+ "sdot z16.s, z12.b, z4.b[3]\n"
+ "sdot z20.s, z12.b, z5.b[3]\n"
+ "sdot z24.s, z12.b, z6.b[3]\n"
+ "sdot z17.s, z13.b, z4.b[3]\n"
+ "sdot z21.s, z13.b, z5.b[3]\n"
+ "sdot z25.s, z13.b, z6.b[3]\n"
+ "sdot z18.s, z14.b, z4.b[3]\n"
+ "sdot z22.s, z14.b, z5.b[3]\n"
+ "sdot z26.s, z14.b, z6.b[3]\n"
+ "sdot z19.s, z15.b, z4.b[3]\n"
+ "sdot z23.s, z15.b, z5.b[3]\n"
+ "sdot z27.s, z15.b, z6.b[3]\n"
+ "cbz %[blocks], 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "sdot z20.s, z8.b, z1.b[0]\n"
+ "sdot z24.s, z8.b, z2.b[0]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "sdot z21.s, z9.b, z1.b[0]\n"
+ "sdot z25.s, z9.b, z2.b[0]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "sdot z22.s, z10.b, z1.b[0]\n"
+ "sdot z26.s, z10.b, z2.b[0]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "sdot z23.s, z11.b, z1.b[0]\n"
+ "sdot z27.s, z11.b, z2.b[0]\n"
+ "b.eq 7f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "sdot z20.s, z12.b, z1.b[1]\n"
+ "sdot z24.s, z12.b, z2.b[1]\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "sdot z21.s, z13.b, z1.b[1]\n"
+ "sdot z25.s, z13.b, z2.b[1]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "sdot z22.s, z14.b, z1.b[1]\n"
+ "sdot z26.s, z14.b, z2.b[1]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "sdot z23.s, z15.b, z1.b[1]\n"
+ "sdot z27.s, z15.b, z2.b[1]\n"
+ "b.eq 8f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "sdot z20.s, z8.b, z1.b[2]\n"
+ "sdot z24.s, z8.b, z2.b[2]\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "sdot z21.s, z9.b, z1.b[2]\n"
+ "sdot z25.s, z9.b, z2.b[2]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "sdot z22.s, z10.b, z1.b[2]\n"
+ "sdot z26.s, z10.b, z2.b[2]\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "sdot z23.s, z11.b, z1.b[2]\n"
+ "sdot z27.s, z11.b, z2.b[2]\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 10f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 11f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 12f\n"
+ "11:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 12f\n"
+ "10:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "12:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z16.s, z12.b, z0.b[3]\n"
+ "sdot z20.s, z12.b, z1.b[3]\n"
+ "sdot z24.s, z12.b, z2.b[3]\n"
+ "sdot z17.s, z13.b, z0.b[3]\n"
+ "sdot z21.s, z13.b, z1.b[3]\n"
+ "sdot z25.s, z13.b, z2.b[3]\n"
+ "sdot z18.s, z14.b, z0.b[3]\n"
+ "sdot z22.s, z14.b, z1.b[3]\n"
+ "sdot z26.s, z14.b, z2.b[3]\n"
+ "sdot z19.s, z15.b, z0.b[3]\n"
+ "sdot z23.s, z15.b, z1.b[3]\n"
+ "sdot z27.s, z15.b, z2.b[3]\n"
+ "b 9f\n"
+ "8:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 13f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 14f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 15f\n"
+ "14:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 15f\n"
+ "13:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "15:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "sdot z20.s, z8.b, z1.b[2]\n"
+ "sdot z24.s, z8.b, z2.b[2]\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "sdot z21.s, z9.b, z1.b[2]\n"
+ "sdot z25.s, z9.b, z2.b[2]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "sdot z22.s, z10.b, z1.b[2]\n"
+ "sdot z26.s, z10.b, z2.b[2]\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "sdot z23.s, z11.b, z1.b[2]\n"
+ "sdot z27.s, z11.b, z2.b[2]\n"
+ "b 9f\n"
+ "7:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 16f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 17f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 18f\n"
+ "17:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 18f\n"
+ "16:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "18:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "sdot z20.s, z12.b, z1.b[1]\n"
+ "sdot z24.s, z12.b, z2.b[1]\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "sdot z21.s, z13.b, z1.b[1]\n"
+ "sdot z25.s, z13.b, z2.b[1]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "sdot z22.s, z14.b, z1.b[1]\n"
+ "sdot z26.s, z14.b, z2.b[1]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "sdot z23.s, z15.b, z1.b[1]\n"
+ "sdot z27.s, z15.b, z2.b[1]\n"
+ "b 9f\n"
+ "6:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 19f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 20f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 21f\n"
+ "20:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 21f\n"
+ "19:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "21:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "sdot z20.s, z8.b, z1.b[0]\n"
+ "sdot z24.s, z8.b, z2.b[0]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "sdot z21.s, z9.b, z1.b[0]\n"
+ "sdot z25.s, z9.b, z2.b[0]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "sdot z22.s, z10.b, z1.b[0]\n"
+ "sdot z26.s, z10.b, z2.b[0]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "sdot z23.s, z11.b, z1.b[0]\n"
+ "sdot z27.s, z11.b, z2.b[0]\n"
+ "b 9f\n"
+ "5:\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
+ "sdot z24.s, z8.b, z2.b[0]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr1]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z6.b, p6/z, [a_ptr2]\n"
+ "sdot z21.s, z9.b, z1.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z25.s, z9.b, z2.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z22.s, z10.b, z1.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z26.s, z10.b, z2.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z23.s, z11.b, z1.b[0]\n"
+ "sdot z27.s, z11.b, z2.b[0]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z12.b, z1.b[1]\n"
+ "sdot z24.s, z12.b, z2.b[1]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "sdot z21.s, z13.b, z1.b[1]\n"
+ "sdot z25.s, z13.b, z2.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z22.s, z14.b, z1.b[1]\n"
+ "sdot z26.s, z14.b, z2.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "sdot z23.s, z15.b, z1.b[1]\n"
+ "sdot z27.s, z15.b, z2.b[1]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "sdot z20.s, z8.b, z1.b[2]\n"
+ "sdot z24.s, z8.b, z2.b[2]\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "sdot z21.s, z9.b, z1.b[2]\n"
+ "sdot z25.s, z9.b, z2.b[2]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "sdot z22.s, z10.b, z1.b[2]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z26.s, z10.b, z2.b[2]\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "sdot z23.s, z11.b, z1.b[2]\n"
+ "sdot z27.s, z11.b, z2.b[2]\n"
+ "sdot z16.s, z12.b, z0.b[3]\n"
+ "sdot z20.s, z12.b, z1.b[3]\n"
+ "sdot z24.s, z12.b, z2.b[3]\n"
+ "sdot z17.s, z13.b, z0.b[3]\n"
+ "sdot z21.s, z13.b, z1.b[3]\n"
+ "sdot z25.s, z13.b, z2.b[3]\n"
+ "sdot z18.s, z14.b, z0.b[3]\n"
+ "sdot z22.s, z14.b, z1.b[3]\n"
+ "sdot z26.s, z14.b, z2.b[3]\n"
+ "sdot z19.s, z15.b, z0.b[3]\n"
+ "sdot z23.s, z15.b, z1.b[3]\n"
+ "sdot z27.s, z15.b, z2.b[3]\n"
+ "cbz %[blocks], 22f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z4.b[0]\n"
+ "sdot z20.s, z8.b, z5.b[0]\n"
+ "sdot z24.s, z8.b, z6.b[0]\n"
+ "sdot z17.s, z9.b, z4.b[0]\n"
+ "sdot z21.s, z9.b, z5.b[0]\n"
+ "sdot z25.s, z9.b, z6.b[0]\n"
+ "sdot z18.s, z10.b, z4.b[0]\n"
+ "sdot z22.s, z10.b, z5.b[0]\n"
+ "sdot z26.s, z10.b, z6.b[0]\n"
+ "sdot z19.s, z11.b, z4.b[0]\n"
+ "sdot z23.s, z11.b, z5.b[0]\n"
+ "sdot z27.s, z11.b, z6.b[0]\n"
+ "b.eq 23f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z16.s, z12.b, z4.b[1]\n"
+ "sdot z20.s, z12.b, z5.b[1]\n"
+ "sdot z24.s, z12.b, z6.b[1]\n"
+ "sdot z17.s, z13.b, z4.b[1]\n"
+ "sdot z21.s, z13.b, z5.b[1]\n"
+ "sdot z25.s, z13.b, z6.b[1]\n"
+ "sdot z18.s, z14.b, z4.b[1]\n"
+ "sdot z22.s, z14.b, z5.b[1]\n"
+ "sdot z26.s, z14.b, z6.b[1]\n"
+ "sdot z19.s, z15.b, z4.b[1]\n"
+ "sdot z23.s, z15.b, z5.b[1]\n"
+ "sdot z27.s, z15.b, z6.b[1]\n"
+ "b.eq 24f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z4.b[2]\n"
+ "sdot z20.s, z8.b, z5.b[2]\n"
+ "sdot z24.s, z8.b, z6.b[2]\n"
+ "sdot z17.s, z9.b, z4.b[2]\n"
+ "sdot z21.s, z9.b, z5.b[2]\n"
+ "sdot z25.s, z9.b, z6.b[2]\n"
+ "sdot z18.s, z10.b, z4.b[2]\n"
+ "sdot z22.s, z10.b, z5.b[2]\n"
+ "sdot z26.s, z10.b, z6.b[2]\n"
+ "sdot z19.s, z11.b, z4.b[2]\n"
+ "sdot z23.s, z11.b, z5.b[2]\n"
+ "sdot z27.s, z11.b, z6.b[2]\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 25f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 26f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 27f\n"
+ "26:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 27f\n"
+ "25:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "27:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z16.s, z12.b, z4.b[3]\n"
+ "sdot z20.s, z12.b, z5.b[3]\n"
+ "sdot z24.s, z12.b, z6.b[3]\n"
+ "sdot z17.s, z13.b, z4.b[3]\n"
+ "sdot z21.s, z13.b, z5.b[3]\n"
+ "sdot z25.s, z13.b, z6.b[3]\n"
+ "sdot z18.s, z14.b, z4.b[3]\n"
+ "sdot z22.s, z14.b, z5.b[3]\n"
+ "sdot z26.s, z14.b, z6.b[3]\n"
+ "sdot z19.s, z15.b, z4.b[3]\n"
+ "sdot z23.s, z15.b, z5.b[3]\n"
+ "sdot z27.s, z15.b, z6.b[3]\n"
+ "b 9f\n"
+ "24:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 28f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 29f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 30f\n"
+ "29:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 30f\n"
+ "28:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "30:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z4.b[2]\n"
+ "sdot z20.s, z8.b, z5.b[2]\n"
+ "sdot z24.s, z8.b, z6.b[2]\n"
+ "sdot z17.s, z9.b, z4.b[2]\n"
+ "sdot z21.s, z9.b, z5.b[2]\n"
+ "sdot z25.s, z9.b, z6.b[2]\n"
+ "sdot z18.s, z10.b, z4.b[2]\n"
+ "sdot z22.s, z10.b, z5.b[2]\n"
+ "sdot z26.s, z10.b, z6.b[2]\n"
+ "sdot z19.s, z11.b, z4.b[2]\n"
+ "sdot z23.s, z11.b, z5.b[2]\n"
+ "sdot z27.s, z11.b, z6.b[2]\n"
+ "b 9f\n"
+ "23:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 31f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 32f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 33f\n"
+ "32:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 33f\n"
+ "31:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "33:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z16.s, z12.b, z4.b[1]\n"
+ "sdot z20.s, z12.b, z5.b[1]\n"
+ "sdot z24.s, z12.b, z6.b[1]\n"
+ "sdot z17.s, z13.b, z4.b[1]\n"
+ "sdot z21.s, z13.b, z5.b[1]\n"
+ "sdot z25.s, z13.b, z6.b[1]\n"
+ "sdot z18.s, z14.b, z4.b[1]\n"
+ "sdot z22.s, z14.b, z5.b[1]\n"
+ "sdot z26.s, z14.b, z6.b[1]\n"
+ "sdot z19.s, z15.b, z4.b[1]\n"
+ "sdot z23.s, z15.b, z5.b[1]\n"
+ "sdot z27.s, z15.b, z6.b[1]\n"
+ "b 9f\n"
+ "22:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 34f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 35f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 36f\n"
+ "35:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 36f\n"
+ "34:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "36:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z4.b[0]\n"
+ "sdot z20.s, z8.b, z5.b[0]\n"
+ "sdot z24.s, z8.b, z6.b[0]\n"
+ "sdot z17.s, z9.b, z4.b[0]\n"
+ "sdot z21.s, z9.b, z5.b[0]\n"
+ "sdot z25.s, z9.b, z6.b[0]\n"
+ "sdot z18.s, z10.b, z4.b[0]\n"
+ "sdot z22.s, z10.b, z5.b[0]\n"
+ "sdot z26.s, z10.b, z6.b[0]\n"
+ "sdot z19.s, z11.b, z4.b[0]\n"
+ "sdot z23.s, z11.b, z5.b[0]\n"
+ "sdot z27.s, z11.b, z6.b[0]\n"
+ "9:\n"
+ "st1w z16.s, p0, [%[c_ptr0]]\n"
+ "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+ "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+ "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #4\n"
+ "st1w z20.s, p0, [c_ptr1]\n"
+ "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
+ "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
+ "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
+ "st1w z24.s, p0, [c_ptr2]\n"
+ "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n"
+ "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n"
+ "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [b_ptr1] "+r" (b_ptr1), [b_ptr2] "+r" (b_ptr2), [b_ptr3] "+r" (b_ptr3), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory"
+ );
+ break;
+ default:
+ case 4:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.b, %[temp], %[leftovers]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "whilelt p4.b, %[temp], %[width]\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "incw %[temp], all, mul #1\n"
+ "ptrue p7.b\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "whilelt p1.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p2.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p3.s, %[temp], %[width]\n"
+ "cbz %[beta0], 1f\n"
+ "mov z16.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mov z17.s, #0\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mov z18.s, #0\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z19.s, #0\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "mov z20.s, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "mov z21.s, #0\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "mov z22.s, #0\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "mov z23.s, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "mov z24.s, #0\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "mov z25.s, #0\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "mov z26.s, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "mov z27.s, #0\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "mov z28.s, #0\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 2f\n"
+ "1:\n"
+ "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+ "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+ "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+ "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+ "ld1w z20.s, p0/z, [c_ptr1]\n"
+ "mul z16.s, p7/m, z16.s, z15.s\n"
+ "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+ "mul z17.s, p7/m, z17.s, z15.s\n"
+ "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+ "mul z18.s, p7/m, z18.s, z15.s\n"
+ "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+ "mul z19.s, p7/m, z19.s, z15.s\n"
+ "ld1w z24.s, p0/z, [c_ptr2]\n"
+ "mul z20.s, p7/m, z20.s, z15.s\n"
+ "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
+ "mul z21.s, p7/m, z21.s, z15.s\n"
+ "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
+ "mul z22.s, p7/m, z22.s, z15.s\n"
+ "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
+ "mul z23.s, p7/m, z23.s, z15.s\n"
+ "ld1w z28.s, p0/z, [c_ptr3]\n"
+ "mul z24.s, p7/m, z24.s, z15.s\n"
+ "ld1w z29.s, p1/z, [c_ptr3, #1, MUL VL]\n"
+ "mul z25.s, p7/m, z25.s, z15.s\n"
+ "ld1w z30.s, p2/z, [c_ptr3, #2, MUL VL]\n"
+ "mul z26.s, p7/m, z26.s, z15.s\n"
+ "ld1w z31.s, p3/z, [c_ptr3, #3, MUL VL]\n"
+ "mul z27.s, p7/m, z27.s, z15.s\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mul z28.s, p7/m, z28.s, z15.s\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mul z29.s, p7/m, z29.s, z15.s\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mul z30.s, p7/m, z30.s, z15.s\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "mul z31.s, p7/m, z31.s, z15.s\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "2:\n"
+ "cbz %[loops], 3f\n"
+ "4:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "sdot z24.s, z8.b, z2.b[0]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+ "sdot z28.s, z8.b, z3.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr2]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr3]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "sdot z21.s, z9.b, z1.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z25.s, z9.b, z2.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z29.s, z9.b, z3.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z22.s, z10.b, z1.b[0]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "sdot z26.s, z10.b, z2.b[0]\n"
+ "add a_ptr3, a_ptr3, #0x20\n"
+ "sdot z30.s, z10.b, z3.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z23.s, z11.b, z1.b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "sdot z27.s, z11.b, z2.b[0]\n"
+ "sdot z31.s, z11.b, z3.b[0]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z12.b, z1.b[1]\n"
+ "sdot z24.s, z12.b, z2.b[1]\n"
+ "sdot z28.s, z12.b, z3.b[1]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "sdot z21.s, z13.b, z1.b[1]\n"
+ "sdot z25.s, z13.b, z2.b[1]\n"
+ "sdot z29.s, z13.b, z3.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z22.s, z14.b, z1.b[1]\n"
+ "sdot z26.s, z14.b, z2.b[1]\n"
+ "sdot z30.s, z14.b, z3.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z23.s, z15.b, z1.b[1]\n"
+ "sdot z27.s, z15.b, z2.b[1]\n"
+ "sdot z31.s, z15.b, z3.b[1]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z8.b, z1.b[2]\n"
+ "sdot z24.s, z8.b, z2.b[2]\n"
+ "sdot z28.s, z8.b, z3.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "sdot z21.s, z9.b, z1.b[2]\n"
+ "sdot z25.s, z9.b, z2.b[2]\n"
+ "sdot z29.s, z9.b, z3.b[2]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z22.s, z10.b, z1.b[2]\n"
+ "sdot z26.s, z10.b, z2.b[2]\n"
+ "sdot z30.s, z10.b, z3.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z23.s, z11.b, z1.b[2]\n"
+ "sdot z27.s, z11.b, z2.b[2]\n"
+ "sdot z31.s, z11.b, z3.b[2]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z12.b, z0.b[3]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z12.b, z1.b[3]\n"
+ "sdot z24.s, z12.b, z2.b[3]\n"
+ "sdot z28.s, z12.b, z3.b[3]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "sdot z17.s, z13.b, z0.b[3]\n"
+ "sdot z21.s, z13.b, z1.b[3]\n"
+ "sdot z25.s, z13.b, z2.b[3]\n"
+ "sdot z29.s, z13.b, z3.b[3]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z18.s, z14.b, z0.b[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z22.s, z14.b, z1.b[3]\n"
+ "sdot z26.s, z14.b, z2.b[3]\n"
+ "sdot z30.s, z14.b, z3.b[3]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z15.b, z0.b[3]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
+ "sdot z23.s, z15.b, z1.b[3]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n"
+ "sdot z27.s, z15.b, z2.b[3]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2, #-0x10]\n"
+ "sdot z31.s, z15.b, z3.b[3]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3, #-0x10]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z8.b, z4.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z8.b, z5.b[0]\n"
+ "sdot z24.s, z8.b, z6.b[0]\n"
+ "sdot z28.s, z8.b, z7.b[0]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "sdot z17.s, z9.b, z4.b[0]\n"
+ "sdot z21.s, z9.b, z5.b[0]\n"
+ "sdot z25.s, z9.b, z6.b[0]\n"
+ "sdot z29.s, z9.b, z7.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z18.s, z10.b, z4.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z22.s, z10.b, z5.b[0]\n"
+ "sdot z26.s, z10.b, z6.b[0]\n"
+ "sdot z30.s, z10.b, z7.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z11.b, z4.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z23.s, z11.b, z5.b[0]\n"
+ "sdot z27.s, z11.b, z6.b[0]\n"
+ "sdot z31.s, z11.b, z7.b[0]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z12.b, z4.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z12.b, z5.b[1]\n"
+ "sdot z24.s, z12.b, z6.b[1]\n"
+ "sdot z28.s, z12.b, z7.b[1]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "sdot z17.s, z13.b, z4.b[1]\n"
+ "sdot z21.s, z13.b, z5.b[1]\n"
+ "sdot z25.s, z13.b, z6.b[1]\n"
+ "sdot z29.s, z13.b, z7.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z18.s, z14.b, z4.b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z22.s, z14.b, z5.b[1]\n"
+ "sdot z26.s, z14.b, z6.b[1]\n"
+ "sdot z30.s, z14.b, z7.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z15.b, z4.b[1]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z23.s, z15.b, z5.b[1]\n"
+ "sdot z27.s, z15.b, z6.b[1]\n"
+ "sdot z31.s, z15.b, z7.b[1]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z8.b, z4.b[2]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z8.b, z5.b[2]\n"
+ "sdot z24.s, z8.b, z6.b[2]\n"
+ "sdot z28.s, z8.b, z7.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "sdot z17.s, z9.b, z4.b[2]\n"
+ "sdot z21.s, z9.b, z5.b[2]\n"
+ "sdot z25.s, z9.b, z6.b[2]\n"
+ "sdot z29.s, z9.b, z7.b[2]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z18.s, z10.b, z4.b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z22.s, z10.b, z5.b[2]\n"
+ "sdot z26.s, z10.b, z6.b[2]\n"
+ "sdot z30.s, z10.b, z7.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z11.b, z4.b[2]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z23.s, z11.b, z5.b[2]\n"
+ "sdot z27.s, z11.b, z6.b[2]\n"
+ "sdot z31.s, z11.b, z7.b[2]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z12.b, z4.b[3]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z12.b, z5.b[3]\n"
+ "sdot z24.s, z12.b, z6.b[3]\n"
+ "sdot z28.s, z12.b, z7.b[3]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "sdot z17.s, z13.b, z4.b[3]\n"
+ "sdot z21.s, z13.b, z5.b[3]\n"
+ "sdot z25.s, z13.b, z6.b[3]\n"
+ "sdot z29.s, z13.b, z7.b[3]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z18.s, z14.b, z4.b[3]\n"
+ "sdot z22.s, z14.b, z5.b[3]\n"
+ "sdot z26.s, z14.b, z6.b[3]\n"
+ "sdot z30.s, z14.b, z7.b[3]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z15.b, z4.b[3]\n"
+ "sdot z23.s, z15.b, z5.b[3]\n"
+ "sdot z27.s, z15.b, z6.b[3]\n"
+ "sdot z31.s, z15.b, z7.b[3]\n"
+ "b.ne 4b\n"
+ "3:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "cbz %[regs], 5f\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "sdot z24.s, z8.b, z2.b[0]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+ "sdot z28.s, z8.b, z3.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr2]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr3]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "sdot z21.s, z9.b, z1.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z25.s, z9.b, z2.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z29.s, z9.b, z3.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z22.s, z10.b, z1.b[0]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "sdot z26.s, z10.b, z2.b[0]\n"
+ "sdot z30.s, z10.b, z3.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z23.s, z11.b, z1.b[0]\n"
+ "sdot z27.s, z11.b, z2.b[0]\n"
+ "sdot z31.s, z11.b, z3.b[0]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z12.b, z1.b[1]\n"
+ "sdot z24.s, z12.b, z2.b[1]\n"
+ "sdot z28.s, z12.b, z3.b[1]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "sdot z21.s, z13.b, z1.b[1]\n"
+ "sdot z25.s, z13.b, z2.b[1]\n"
+ "sdot z29.s, z13.b, z3.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z22.s, z14.b, z1.b[1]\n"
+ "sdot z26.s, z14.b, z2.b[1]\n"
+ "sdot z30.s, z14.b, z3.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z23.s, z15.b, z1.b[1]\n"
+ "sdot z27.s, z15.b, z2.b[1]\n"
+ "sdot z31.s, z15.b, z3.b[1]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z8.b, z1.b[2]\n"
+ "sdot z24.s, z8.b, z2.b[2]\n"
+ "sdot z28.s, z8.b, z3.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "sdot z21.s, z9.b, z1.b[2]\n"
+ "sdot z25.s, z9.b, z2.b[2]\n"
+ "sdot z29.s, z9.b, z3.b[2]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z22.s, z10.b, z1.b[2]\n"
+ "sdot z26.s, z10.b, z2.b[2]\n"
+ "sdot z30.s, z10.b, z3.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z23.s, z11.b, z1.b[2]\n"
+ "sdot z27.s, z11.b, z2.b[2]\n"
+ "sdot z31.s, z11.b, z3.b[2]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z12.b, z0.b[3]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z12.b, z1.b[3]\n"
+ "sdot z24.s, z12.b, z2.b[3]\n"
+ "sdot z28.s, z12.b, z3.b[3]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "sdot z17.s, z13.b, z0.b[3]\n"
+ "sdot z21.s, z13.b, z1.b[3]\n"
+ "sdot z25.s, z13.b, z2.b[3]\n"
+ "sdot z29.s, z13.b, z3.b[3]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z18.s, z14.b, z0.b[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z22.s, z14.b, z1.b[3]\n"
+ "sdot z26.s, z14.b, z2.b[3]\n"
+ "sdot z30.s, z14.b, z3.b[3]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z15.b, z0.b[3]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
+ "sdot z23.s, z15.b, z1.b[3]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
+ "sdot z27.s, z15.b, z2.b[3]\n"
+ "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
+ "sdot z31.s, z15.b, z3.b[3]\n"
+ "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z8.b, z4.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z8.b, z5.b[0]\n"
+ "sdot z24.s, z8.b, z6.b[0]\n"
+ "sdot z28.s, z8.b, z7.b[0]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "sdot z17.s, z9.b, z4.b[0]\n"
+ "sdot z21.s, z9.b, z5.b[0]\n"
+ "sdot z25.s, z9.b, z6.b[0]\n"
+ "sdot z29.s, z9.b, z7.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z18.s, z10.b, z4.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z22.s, z10.b, z5.b[0]\n"
+ "sdot z26.s, z10.b, z6.b[0]\n"
+ "sdot z30.s, z10.b, z7.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z11.b, z4.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z23.s, z11.b, z5.b[0]\n"
+ "sdot z27.s, z11.b, z6.b[0]\n"
+ "sdot z31.s, z11.b, z7.b[0]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z12.b, z4.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z12.b, z5.b[1]\n"
+ "sdot z24.s, z12.b, z6.b[1]\n"
+ "sdot z28.s, z12.b, z7.b[1]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "sdot z17.s, z13.b, z4.b[1]\n"
+ "sdot z21.s, z13.b, z5.b[1]\n"
+ "sdot z25.s, z13.b, z6.b[1]\n"
+ "sdot z29.s, z13.b, z7.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z18.s, z14.b, z4.b[1]\n"
+ "sdot z22.s, z14.b, z5.b[1]\n"
+ "sdot z26.s, z14.b, z6.b[1]\n"
+ "sdot z30.s, z14.b, z7.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z15.b, z4.b[1]\n"
+ "sdot z23.s, z15.b, z5.b[1]\n"
+ "sdot z27.s, z15.b, z6.b[1]\n"
+ "sdot z31.s, z15.b, z7.b[1]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z8.b, z4.b[2]\n"
+ "sdot z20.s, z8.b, z5.b[2]\n"
+ "sdot z24.s, z8.b, z6.b[2]\n"
+ "sdot z28.s, z8.b, z7.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "sdot z17.s, z9.b, z4.b[2]\n"
+ "sdot z21.s, z9.b, z5.b[2]\n"
+ "sdot z25.s, z9.b, z6.b[2]\n"
+ "sdot z29.s, z9.b, z7.b[2]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z18.s, z10.b, z4.b[2]\n"
+ "sdot z22.s, z10.b, z5.b[2]\n"
+ "sdot z26.s, z10.b, z6.b[2]\n"
+ "sdot z30.s, z10.b, z7.b[2]\n"
+ "sdot z19.s, z11.b, z4.b[2]\n"
+ "sdot z23.s, z11.b, z5.b[2]\n"
+ "sdot z27.s, z11.b, z6.b[2]\n"
+ "sdot z31.s, z11.b, z7.b[2]\n"
+ "sdot z16.s, z12.b, z4.b[3]\n"
+ "sdot z20.s, z12.b, z5.b[3]\n"
+ "sdot z24.s, z12.b, z6.b[3]\n"
+ "sdot z28.s, z12.b, z7.b[3]\n"
+ "sdot z17.s, z13.b, z4.b[3]\n"
+ "sdot z21.s, z13.b, z5.b[3]\n"
+ "sdot z25.s, z13.b, z6.b[3]\n"
+ "sdot z29.s, z13.b, z7.b[3]\n"
+ "sdot z18.s, z14.b, z4.b[3]\n"
+ "sdot z22.s, z14.b, z5.b[3]\n"
+ "sdot z26.s, z14.b, z6.b[3]\n"
+ "sdot z30.s, z14.b, z7.b[3]\n"
+ "sdot z19.s, z15.b, z4.b[3]\n"
+ "sdot z23.s, z15.b, z5.b[3]\n"
+ "sdot z27.s, z15.b, z6.b[3]\n"
+ "sdot z31.s, z15.b, z7.b[3]\n"
+ "cbz %[blocks], 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "sdot z20.s, z8.b, z1.b[0]\n"
+ "sdot z24.s, z8.b, z2.b[0]\n"
+ "sdot z28.s, z8.b, z3.b[0]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "sdot z21.s, z9.b, z1.b[0]\n"
+ "sdot z25.s, z9.b, z2.b[0]\n"
+ "sdot z29.s, z9.b, z3.b[0]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "sdot z22.s, z10.b, z1.b[0]\n"
+ "sdot z26.s, z10.b, z2.b[0]\n"
+ "sdot z30.s, z10.b, z3.b[0]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "sdot z23.s, z11.b, z1.b[0]\n"
+ "sdot z27.s, z11.b, z2.b[0]\n"
+ "sdot z31.s, z11.b, z3.b[0]\n"
+ "b.eq 7f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "sdot z20.s, z12.b, z1.b[1]\n"
+ "sdot z24.s, z12.b, z2.b[1]\n"
+ "sdot z28.s, z12.b, z3.b[1]\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "sdot z21.s, z13.b, z1.b[1]\n"
+ "sdot z25.s, z13.b, z2.b[1]\n"
+ "sdot z29.s, z13.b, z3.b[1]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "sdot z22.s, z14.b, z1.b[1]\n"
+ "sdot z26.s, z14.b, z2.b[1]\n"
+ "sdot z30.s, z14.b, z3.b[1]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "sdot z23.s, z15.b, z1.b[1]\n"
+ "sdot z27.s, z15.b, z2.b[1]\n"
+ "sdot z31.s, z15.b, z3.b[1]\n"
+ "b.eq 8f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "sdot z20.s, z8.b, z1.b[2]\n"
+ "sdot z24.s, z8.b, z2.b[2]\n"
+ "sdot z28.s, z8.b, z3.b[2]\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "sdot z21.s, z9.b, z1.b[2]\n"
+ "sdot z25.s, z9.b, z2.b[2]\n"
+ "sdot z29.s, z9.b, z3.b[2]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "sdot z22.s, z10.b, z1.b[2]\n"
+ "sdot z26.s, z10.b, z2.b[2]\n"
+ "sdot z30.s, z10.b, z3.b[2]\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "sdot z23.s, z11.b, z1.b[2]\n"
+ "sdot z27.s, z11.b, z2.b[2]\n"
+ "sdot z31.s, z11.b, z3.b[2]\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 10f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 11f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 12f\n"
+ "11:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 12f\n"
+ "10:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "12:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z16.s, z12.b, z0.b[3]\n"
+ "sdot z20.s, z12.b, z1.b[3]\n"
+ "sdot z24.s, z12.b, z2.b[3]\n"
+ "sdot z28.s, z12.b, z3.b[3]\n"
+ "sdot z17.s, z13.b, z0.b[3]\n"
+ "sdot z21.s, z13.b, z1.b[3]\n"
+ "sdot z25.s, z13.b, z2.b[3]\n"
+ "sdot z29.s, z13.b, z3.b[3]\n"
+ "sdot z18.s, z14.b, z0.b[3]\n"
+ "sdot z22.s, z14.b, z1.b[3]\n"
+ "sdot z26.s, z14.b, z2.b[3]\n"
+ "sdot z30.s, z14.b, z3.b[3]\n"
+ "sdot z19.s, z15.b, z0.b[3]\n"
+ "sdot z23.s, z15.b, z1.b[3]\n"
+ "sdot z27.s, z15.b, z2.b[3]\n"
+ "sdot z31.s, z15.b, z3.b[3]\n"
+ "b 9f\n"
+ "8:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 13f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 14f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 15f\n"
+ "14:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 15f\n"
+ "13:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "15:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "sdot z20.s, z8.b, z1.b[2]\n"
+ "sdot z24.s, z8.b, z2.b[2]\n"
+ "sdot z28.s, z8.b, z3.b[2]\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "sdot z21.s, z9.b, z1.b[2]\n"
+ "sdot z25.s, z9.b, z2.b[2]\n"
+ "sdot z29.s, z9.b, z3.b[2]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "sdot z22.s, z10.b, z1.b[2]\n"
+ "sdot z26.s, z10.b, z2.b[2]\n"
+ "sdot z30.s, z10.b, z3.b[2]\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "sdot z23.s, z11.b, z1.b[2]\n"
+ "sdot z27.s, z11.b, z2.b[2]\n"
+ "sdot z31.s, z11.b, z3.b[2]\n"
+ "b 9f\n"
+ "7:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 16f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 17f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 18f\n"
+ "17:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 18f\n"
+ "16:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "18:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "sdot z20.s, z12.b, z1.b[1]\n"
+ "sdot z24.s, z12.b, z2.b[1]\n"
+ "sdot z28.s, z12.b, z3.b[1]\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "sdot z21.s, z13.b, z1.b[1]\n"
+ "sdot z25.s, z13.b, z2.b[1]\n"
+ "sdot z29.s, z13.b, z3.b[1]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "sdot z22.s, z14.b, z1.b[1]\n"
+ "sdot z26.s, z14.b, z2.b[1]\n"
+ "sdot z30.s, z14.b, z3.b[1]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "sdot z23.s, z15.b, z1.b[1]\n"
+ "sdot z27.s, z15.b, z2.b[1]\n"
+ "sdot z31.s, z15.b, z3.b[1]\n"
+ "b 9f\n"
+ "6:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 19f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 20f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 21f\n"
+ "20:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 21f\n"
+ "19:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "21:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "sdot z20.s, z8.b, z1.b[0]\n"
+ "sdot z24.s, z8.b, z2.b[0]\n"
+ "sdot z28.s, z8.b, z3.b[0]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "sdot z21.s, z9.b, z1.b[0]\n"
+ "sdot z25.s, z9.b, z2.b[0]\n"
+ "sdot z29.s, z9.b, z3.b[0]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "sdot z22.s, z10.b, z1.b[0]\n"
+ "sdot z26.s, z10.b, z2.b[0]\n"
+ "sdot z30.s, z10.b, z3.b[0]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "sdot z23.s, z11.b, z1.b[0]\n"
+ "sdot z27.s, z11.b, z2.b[0]\n"
+ "sdot z31.s, z11.b, z3.b[0]\n"
+ "b 9f\n"
+ "5:\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
+ "sdot z24.s, z8.b, z2.b[0]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr1]\n"
+ "sdot z28.s, z8.b, z3.b[0]\n"
+ "ld1rqb z6.b, p6/z, [a_ptr2]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z7.b, p6/z, [a_ptr3]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "sdot z21.s, z9.b, z1.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z25.s, z9.b, z2.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z29.s, z9.b, z3.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z22.s, z10.b, z1.b[0]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "sdot z26.s, z10.b, z2.b[0]\n"
+ "sdot z30.s, z10.b, z3.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z23.s, z11.b, z1.b[0]\n"
+ "sdot z27.s, z11.b, z2.b[0]\n"
+ "sdot z31.s, z11.b, z3.b[0]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z12.b, z1.b[1]\n"
+ "sdot z24.s, z12.b, z2.b[1]\n"
+ "sdot z28.s, z12.b, z3.b[1]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "sdot z21.s, z13.b, z1.b[1]\n"
+ "sdot z25.s, z13.b, z2.b[1]\n"
+ "sdot z29.s, z13.b, z3.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "sdot z22.s, z14.b, z1.b[1]\n"
+ "sdot z26.s, z14.b, z2.b[1]\n"
+ "sdot z30.s, z14.b, z3.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "sdot z23.s, z15.b, z1.b[1]\n"
+ "sdot z27.s, z15.b, z2.b[1]\n"
+ "sdot z31.s, z15.b, z3.b[1]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "sdot z20.s, z8.b, z1.b[2]\n"
+ "sdot z24.s, z8.b, z2.b[2]\n"
+ "sdot z28.s, z8.b, z3.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "sdot z21.s, z9.b, z1.b[2]\n"
+ "sdot z25.s, z9.b, z2.b[2]\n"
+ "sdot z29.s, z9.b, z3.b[2]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "sdot z22.s, z10.b, z1.b[2]\n"
+ "sdot z26.s, z10.b, z2.b[2]\n"
+ "sdot z30.s, z10.b, z3.b[2]\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "sdot z23.s, z11.b, z1.b[2]\n"
+ "sdot z27.s, z11.b, z2.b[2]\n"
+ "sdot z31.s, z11.b, z3.b[2]\n"
+ "sdot z16.s, z12.b, z0.b[3]\n"
+ "sdot z20.s, z12.b, z1.b[3]\n"
+ "sdot z24.s, z12.b, z2.b[3]\n"
+ "sdot z28.s, z12.b, z3.b[3]\n"
+ "sdot z17.s, z13.b, z0.b[3]\n"
+ "sdot z21.s, z13.b, z1.b[3]\n"
+ "sdot z25.s, z13.b, z2.b[3]\n"
+ "sdot z29.s, z13.b, z3.b[3]\n"
+ "sdot z18.s, z14.b, z0.b[3]\n"
+ "sdot z22.s, z14.b, z1.b[3]\n"
+ "sdot z26.s, z14.b, z2.b[3]\n"
+ "sdot z30.s, z14.b, z3.b[3]\n"
+ "sdot z19.s, z15.b, z0.b[3]\n"
+ "sdot z23.s, z15.b, z1.b[3]\n"
+ "sdot z27.s, z15.b, z2.b[3]\n"
+ "sdot z31.s, z15.b, z3.b[3]\n"
+ "cbz %[blocks], 22f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z4.b[0]\n"
+ "sdot z20.s, z8.b, z5.b[0]\n"
+ "sdot z24.s, z8.b, z6.b[0]\n"
+ "sdot z28.s, z8.b, z7.b[0]\n"
+ "sdot z17.s, z9.b, z4.b[0]\n"
+ "sdot z21.s, z9.b, z5.b[0]\n"
+ "sdot z25.s, z9.b, z6.b[0]\n"
+ "sdot z29.s, z9.b, z7.b[0]\n"
+ "sdot z18.s, z10.b, z4.b[0]\n"
+ "sdot z22.s, z10.b, z5.b[0]\n"
+ "sdot z26.s, z10.b, z6.b[0]\n"
+ "sdot z30.s, z10.b, z7.b[0]\n"
+ "sdot z19.s, z11.b, z4.b[0]\n"
+ "sdot z23.s, z11.b, z5.b[0]\n"
+ "sdot z27.s, z11.b, z6.b[0]\n"
+ "sdot z31.s, z11.b, z7.b[0]\n"
+ "b.eq 23f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z16.s, z12.b, z4.b[1]\n"
+ "sdot z20.s, z12.b, z5.b[1]\n"
+ "sdot z24.s, z12.b, z6.b[1]\n"
+ "sdot z28.s, z12.b, z7.b[1]\n"
+ "sdot z17.s, z13.b, z4.b[1]\n"
+ "sdot z21.s, z13.b, z5.b[1]\n"
+ "sdot z25.s, z13.b, z6.b[1]\n"
+ "sdot z29.s, z13.b, z7.b[1]\n"
+ "sdot z18.s, z14.b, z4.b[1]\n"
+ "sdot z22.s, z14.b, z5.b[1]\n"
+ "sdot z26.s, z14.b, z6.b[1]\n"
+ "sdot z30.s, z14.b, z7.b[1]\n"
+ "sdot z19.s, z15.b, z4.b[1]\n"
+ "sdot z23.s, z15.b, z5.b[1]\n"
+ "sdot z27.s, z15.b, z6.b[1]\n"
+ "sdot z31.s, z15.b, z7.b[1]\n"
+ "b.eq 24f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z4.b[2]\n"
+ "sdot z20.s, z8.b, z5.b[2]\n"
+ "sdot z24.s, z8.b, z6.b[2]\n"
+ "sdot z28.s, z8.b, z7.b[2]\n"
+ "sdot z17.s, z9.b, z4.b[2]\n"
+ "sdot z21.s, z9.b, z5.b[2]\n"
+ "sdot z25.s, z9.b, z6.b[2]\n"
+ "sdot z29.s, z9.b, z7.b[2]\n"
+ "sdot z18.s, z10.b, z4.b[2]\n"
+ "sdot z22.s, z10.b, z5.b[2]\n"
+ "sdot z26.s, z10.b, z6.b[2]\n"
+ "sdot z30.s, z10.b, z7.b[2]\n"
+ "sdot z19.s, z11.b, z4.b[2]\n"
+ "sdot z23.s, z11.b, z5.b[2]\n"
+ "sdot z27.s, z11.b, z6.b[2]\n"
+ "sdot z31.s, z11.b, z7.b[2]\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 25f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 26f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 27f\n"
+ "26:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 27f\n"
+ "25:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "27:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z16.s, z12.b, z4.b[3]\n"
+ "sdot z20.s, z12.b, z5.b[3]\n"
+ "sdot z24.s, z12.b, z6.b[3]\n"
+ "sdot z28.s, z12.b, z7.b[3]\n"
+ "sdot z17.s, z13.b, z4.b[3]\n"
+ "sdot z21.s, z13.b, z5.b[3]\n"
+ "sdot z25.s, z13.b, z6.b[3]\n"
+ "sdot z29.s, z13.b, z7.b[3]\n"
+ "sdot z18.s, z14.b, z4.b[3]\n"
+ "sdot z22.s, z14.b, z5.b[3]\n"
+ "sdot z26.s, z14.b, z6.b[3]\n"
+ "sdot z30.s, z14.b, z7.b[3]\n"
+ "sdot z19.s, z15.b, z4.b[3]\n"
+ "sdot z23.s, z15.b, z5.b[3]\n"
+ "sdot z27.s, z15.b, z6.b[3]\n"
+ "sdot z31.s, z15.b, z7.b[3]\n"
+ "b 9f\n"
+ "24:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 28f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 29f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 30f\n"
+ "29:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 30f\n"
+ "28:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "30:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z4.b[2]\n"
+ "sdot z20.s, z8.b, z5.b[2]\n"
+ "sdot z24.s, z8.b, z6.b[2]\n"
+ "sdot z28.s, z8.b, z7.b[2]\n"
+ "sdot z17.s, z9.b, z4.b[2]\n"
+ "sdot z21.s, z9.b, z5.b[2]\n"
+ "sdot z25.s, z9.b, z6.b[2]\n"
+ "sdot z29.s, z9.b, z7.b[2]\n"
+ "sdot z18.s, z10.b, z4.b[2]\n"
+ "sdot z22.s, z10.b, z5.b[2]\n"
+ "sdot z26.s, z10.b, z6.b[2]\n"
+ "sdot z30.s, z10.b, z7.b[2]\n"
+ "sdot z19.s, z11.b, z4.b[2]\n"
+ "sdot z23.s, z11.b, z5.b[2]\n"
+ "sdot z27.s, z11.b, z6.b[2]\n"
+ "sdot z31.s, z11.b, z7.b[2]\n"
+ "b 9f\n"
+ "23:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 31f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 32f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 33f\n"
+ "32:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 33f\n"
+ "31:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "33:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "sdot z16.s, z12.b, z4.b[1]\n"
+ "sdot z20.s, z12.b, z5.b[1]\n"
+ "sdot z24.s, z12.b, z6.b[1]\n"
+ "sdot z28.s, z12.b, z7.b[1]\n"
+ "sdot z17.s, z13.b, z4.b[1]\n"
+ "sdot z21.s, z13.b, z5.b[1]\n"
+ "sdot z25.s, z13.b, z6.b[1]\n"
+ "sdot z29.s, z13.b, z7.b[1]\n"
+ "sdot z18.s, z14.b, z4.b[1]\n"
+ "sdot z22.s, z14.b, z5.b[1]\n"
+ "sdot z26.s, z14.b, z6.b[1]\n"
+ "sdot z30.s, z14.b, z7.b[1]\n"
+ "sdot z19.s, z15.b, z4.b[1]\n"
+ "sdot z23.s, z15.b, z5.b[1]\n"
+ "sdot z27.s, z15.b, z6.b[1]\n"
+ "sdot z31.s, z15.b, z7.b[1]\n"
+ "b 9f\n"
+ "22:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 34f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 35f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 36f\n"
+ "35:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 36f\n"
+ "34:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "36:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "sdot z16.s, z8.b, z4.b[0]\n"
+ "sdot z20.s, z8.b, z5.b[0]\n"
+ "sdot z24.s, z8.b, z6.b[0]\n"
+ "sdot z28.s, z8.b, z7.b[0]\n"
+ "sdot z17.s, z9.b, z4.b[0]\n"
+ "sdot z21.s, z9.b, z5.b[0]\n"
+ "sdot z25.s, z9.b, z6.b[0]\n"
+ "sdot z29.s, z9.b, z7.b[0]\n"
+ "sdot z18.s, z10.b, z4.b[0]\n"
+ "sdot z22.s, z10.b, z5.b[0]\n"
+ "sdot z26.s, z10.b, z6.b[0]\n"
+ "sdot z30.s, z10.b, z7.b[0]\n"
+ "sdot z19.s, z11.b, z4.b[0]\n"
+ "sdot z23.s, z11.b, z5.b[0]\n"
+ "sdot z27.s, z11.b, z6.b[0]\n"
+ "sdot z31.s, z11.b, z7.b[0]\n"
+ "9:\n"
+ "st1w z16.s, p0, [%[c_ptr0]]\n"
+ "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+ "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+ "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #4\n"
+ "st1w z20.s, p0, [c_ptr1]\n"
+ "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
+ "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
+ "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
+ "st1w z24.s, p0, [c_ptr2]\n"
+ "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n"
+ "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n"
+ "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n"
+ "st1w z28.s, p0, [c_ptr3]\n"
+ "st1w z29.s, p1, [c_ptr3, #1, MUL VL]\n"
+ "st1w z30.s, p2, [c_ptr3, #2, MUL VL]\n"
+ "st1w z31.s, p3, [c_ptr3, #3, MUL VL]\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [b_ptr1] "+r" (b_ptr1), [b_ptr2] "+r" (b_ptr2), [b_ptr3] "+r" (b_ptr3), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
+ );
+ break;
+ }
+ }
+ }
+}
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4.hpp
new file mode 100644
index 0000000000..f5ebad8565
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4.hpp
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __ARM_FEATURE_SVE
+
+#include <cstdint>
+
+
+namespace arm_gemm
+{
+
+// Generic kernel entry point; defined in sve_native_u8u32_dot_4VLx4/generic.cpp with parameters (A, lda, B, ldb, C, ldc, beta, M, N, K).
+void sve_native_u8u32_dot_4VLx4(const uint8_t *, int, const uint8_t *, int ldb, uint32_t *, int, uint32_t, int, int, int);
+// Descriptor for the kernel declared above: operand/result types, blocking parameters and the kernel function pointer.
+class native_u8u32_dot_4VLx4
+{
+public:
+ typedef uint8_t operand_type; // uint8_t, as the A and B pointers in the kernel signature
+ typedef uint32_t result_type; // uint32_t, as the C pointer in the kernel signature
+
+ typedef void (*kern_type)(const uint8_t *, int, const uint8_t *, int ldb, uint32_t *, int, uint32_t, int, int, int); // same signature as sve_native_u8u32_dot_4VLx4
+
+ /* Kernel blocking parameters */
+ static int out_height()
+ {
+ return 4; // matches the 4-row step of the y loop in generic.cpp
+ }
+
+ static int out_width()
+ {
+ return get_vector_length<uint32_t>() * 4; // matches the x0 step of the column loop in generic.cpp
+ }
+
+ static int k_unroll()
+ {
+ return 4; // NOTE(review): presumably the 4 bytes each udot folds into one 32-bit lane - confirm
+ }
+
+
+
+ // Default to the generic kernel
+ kern_type kernel=sve_native_u8u32_dot_4VLx4;
+
+ native_u8u32_dot_4VLx4(const CPUInfo *ci)
+ {
+ // ci is unused here; kernel keeps its default value (the generic kernel)
+ }
+};
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4/generic.cpp
new file mode 100644
index 0000000000..7d89948dc1
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4/generic.cpp
@@ -0,0 +1,4632 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+
+#include <algorithm>
+
+#include <cstdint>
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int ldb, uint32_t *C, int ldc, uint32_t beta, int M, int N, int K) {
+ const long beta0 = (beta == 0u);
+ const long loops_count = ((K + 16) / 32) - 1;
+ K -= loops_count * 32;
+ const long regs_count = (K / 16) - 1;
+ K -= (regs_count + 1) * 16;
+ const long leftovers = K;
+ const long blocks_count = K / 4;
+ const long odds_count = K - (blocks_count * 4);
+
+ for (int y=0; y<M; y+=4) {
+ const uint8_t * const a_ptr0_base = A + (y * lda);
+ const unsigned long ldab = lda * sizeof(uint8_t);
+
+ uint32_t *c_ptr0 = C + (y * ldc);
+ const unsigned long ldcb = ldc * sizeof(uint32_t);
+
+ for (int x0=0; x0<N; x0+=(4 * get_vector_length<uint32_t>())) {
+ const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<uint32_t>()));
+ const uint32_t *betaptr = &beta;
+ long loops = loops_count;
+ long regs = regs_count;
+ long temp = 0;
+ long blocks = blocks_count;
+ long odds = odds_count;
+ const uint8_t *a_ptr0 = a_ptr0_base;
+ const uint8_t *b_ptr0 = B + x0;
+ const uint8_t *b_ptr1 = b_ptr0 + ldb;
+ const uint8_t *b_ptr2 = b_ptr1 + ldb;
+ const uint8_t *b_ptr3 = b_ptr2 + ldb;
+ long ldbb = ldb * sizeof(uint8_t) * 4;
+
+ switch(M-y) {
+ case 1:
+ __asm __volatile (
+ "whilelt p6.b, %[temp], %[leftovers]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "whilelt p4.b, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "ptrue p7.b\n"
+ "whilelt p1.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p2.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p3.s, %[temp], %[width]\n"
+ "cbz %[beta0], 1f\n"
+ "mov z16.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mov z17.s, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "mov z18.s, #0\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "mov z19.s, #0\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "b 2f\n"
+ "1:\n"
+ "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+ "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+ "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+ "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "mul z16.s, p7/m, z16.s, z15.s\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "mul z17.s, p7/m, z17.s, z15.s\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "mul z18.s, p7/m, z18.s, z15.s\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "mul z19.s, p7/m, z19.s, z15.s\n"
+ "2:\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "cbz %[loops], 3f\n"
+ "4:\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z16.s, z12.b, z0.b[3]\n"
+ "udot z17.s, z13.b, z0.b[3]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z14.b, z0.b[3]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z19.s, z15.b, z0.b[3]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z16.s, z8.b, z4.b[0]\n"
+ "udot z17.s, z9.b, z4.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z10.b, z4.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z19.s, z11.b, z4.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z16.s, z12.b, z4.b[1]\n"
+ "udot z17.s, z13.b, z4.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z14.b, z4.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z19.s, z15.b, z4.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z16.s, z8.b, z4.b[2]\n"
+ "udot z17.s, z9.b, z4.b[2]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z10.b, z4.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z19.s, z11.b, z4.b[2]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z16.s, z12.b, z4.b[3]\n"
+ "udot z17.s, z13.b, z4.b[3]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z14.b, z4.b[3]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z19.s, z15.b, z4.b[3]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "b.ne 4b\n"
+ "3:\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "cbz %[regs], 5f\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z16.s, z12.b, z0.b[3]\n"
+ "udot z17.s, z13.b, z0.b[3]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z14.b, z0.b[3]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z19.s, z15.b, z0.b[3]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z16.s, z8.b, z4.b[0]\n"
+ "udot z17.s, z9.b, z4.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z10.b, z4.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z19.s, z11.b, z4.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z16.s, z12.b, z4.b[1]\n"
+ "udot z17.s, z13.b, z4.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z14.b, z4.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z19.s, z15.b, z4.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z16.s, z8.b, z4.b[2]\n"
+ "udot z17.s, z9.b, z4.b[2]\n"
+ "udot z18.s, z10.b, z4.b[2]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z19.s, z11.b, z4.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z16.s, z12.b, z4.b[3]\n"
+ "udot z17.s, z13.b, z4.b[3]\n"
+ "udot z18.s, z14.b, z4.b[3]\n"
+ "udot z19.s, z15.b, z4.b[3]\n"
+ "cbz %[blocks], 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "b.eq 7f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "b.eq 8f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 10f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 11f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 12f\n"
+ "11:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 12f\n"
+ "10:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "12:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z16.s, z12.b, z0.b[3]\n"
+ "udot z17.s, z13.b, z0.b[3]\n"
+ "udot z18.s, z14.b, z0.b[3]\n"
+ "udot z19.s, z15.b, z0.b[3]\n"
+ "b 9f\n"
+ "8:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 13f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 14f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 15f\n"
+ "14:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 15f\n"
+ "13:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "15:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "b 9f\n"
+ "7:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 16f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 17f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 18f\n"
+ "17:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 18f\n"
+ "16:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "18:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "b 9f\n"
+ "6:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 19f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 20f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 21f\n"
+ "20:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 21f\n"
+ "19:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "21:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "b 9f\n"
+ "5:\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z16.s, z12.b, z0.b[3]\n"
+ "udot z17.s, z13.b, z0.b[3]\n"
+ "udot z18.s, z14.b, z0.b[3]\n"
+ "udot z19.s, z15.b, z0.b[3]\n"
+ "cbz %[blocks], 22f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z4.b[0]\n"
+ "udot z17.s, z9.b, z4.b[0]\n"
+ "udot z18.s, z10.b, z4.b[0]\n"
+ "udot z19.s, z11.b, z4.b[0]\n"
+ "b.eq 23f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z16.s, z12.b, z4.b[1]\n"
+ "udot z17.s, z13.b, z4.b[1]\n"
+ "udot z18.s, z14.b, z4.b[1]\n"
+ "udot z19.s, z15.b, z4.b[1]\n"
+ "b.eq 24f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z4.b[2]\n"
+ "udot z17.s, z9.b, z4.b[2]\n"
+ "udot z18.s, z10.b, z4.b[2]\n"
+ "udot z19.s, z11.b, z4.b[2]\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 25f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 26f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 27f\n"
+ "26:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 27f\n"
+ "25:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "27:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z16.s, z12.b, z4.b[3]\n"
+ "udot z17.s, z13.b, z4.b[3]\n"
+ "udot z18.s, z14.b, z4.b[3]\n"
+ "udot z19.s, z15.b, z4.b[3]\n"
+ "b 9f\n"
+ "24:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 28f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 29f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 30f\n"
+ "29:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 30f\n"
+ "28:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "30:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z4.b[2]\n"
+ "udot z17.s, z9.b, z4.b[2]\n"
+ "udot z18.s, z10.b, z4.b[2]\n"
+ "udot z19.s, z11.b, z4.b[2]\n"
+ "b 9f\n"
+ "23:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 31f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 32f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 33f\n"
+ "32:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 33f\n"
+ "31:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "33:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z16.s, z12.b, z4.b[1]\n"
+ "udot z17.s, z13.b, z4.b[1]\n"
+ "udot z18.s, z14.b, z4.b[1]\n"
+ "udot z19.s, z15.b, z4.b[1]\n"
+ "b 9f\n"
+ "22:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 34f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 35f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 36f\n"
+ "35:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 36f\n"
+ "34:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "36:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z4.b[0]\n"
+ "udot z17.s, z9.b, z4.b[0]\n"
+ "udot z18.s, z10.b, z4.b[0]\n"
+ "udot z19.s, z11.b, z4.b[0]\n"
+ "9:\n"
+ "st1w z16.s, p0, [%[c_ptr0]]\n"
+ "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+ "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+ "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #4\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [b_ptr1] "+r" (b_ptr1), [b_ptr2] "+r" (b_ptr2), [b_ptr3] "+r" (b_ptr3), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 2:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "c_ptr1 .req X1\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.b, %[temp], %[leftovers]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "whilelt p4.b, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "ptrue p7.b\n"
+ "whilelt p1.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p2.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p3.s, %[temp], %[width]\n"
+ "cbz %[beta0], 1f\n"
+ "mov z16.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mov z17.s, #0\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mov z18.s, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "mov z19.s, #0\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "mov z20.s, #0\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "mov z21.s, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "mov z22.s, #0\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "mov z23.s, #0\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 2f\n"
+ "1:\n"
+ "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+ "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+ "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+ "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+ "ld1w z20.s, p0/z, [c_ptr1]\n"
+ "mul z16.s, p7/m, z16.s, z15.s\n"
+ "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+ "mul z17.s, p7/m, z17.s, z15.s\n"
+ "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+ "mul z18.s, p7/m, z18.s, z15.s\n"
+ "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+ "mul z19.s, p7/m, z19.s, z15.s\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mul z20.s, p7/m, z20.s, z15.s\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mul z21.s, p7/m, z21.s, z15.s\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "mul z22.s, p7/m, z22.s, z15.s\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "mul z23.s, p7/m, z23.s, z15.s\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "2:\n"
+ "cbz %[loops], 3f\n"
+ "4:\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+ "udot z21.s, z9.b, z1.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z22.s, z10.b, z1.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z23.s, z11.b, z1.b[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z12.b, z1.b[1]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "udot z21.s, z13.b, z1.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z22.s, z14.b, z1.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z23.s, z15.b, z1.b[1]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z8.b, z1.b[2]\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "udot z21.s, z9.b, z1.b[2]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "udot z22.s, z10.b, z1.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z23.s, z11.b, z1.b[2]\n"
+ "udot z16.s, z12.b, z0.b[3]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z20.s, z12.b, z1.b[3]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z17.s, z13.b, z0.b[3]\n"
+ "udot z21.s, z13.b, z1.b[3]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "udot z18.s, z14.b, z0.b[3]\n"
+ "udot z22.s, z14.b, z1.b[3]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z15.b, z0.b[3]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
+ "udot z23.s, z15.b, z1.b[3]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z16.s, z8.b, z4.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z20.s, z8.b, z5.b[0]\n"
+ "udot z17.s, z9.b, z4.b[0]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z21.s, z9.b, z5.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z10.b, z4.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z22.s, z10.b, z5.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z19.s, z11.b, z4.b[0]\n"
+ "udot z23.s, z11.b, z5.b[0]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z16.s, z12.b, z4.b[1]\n"
+ "udot z20.s, z12.b, z5.b[1]\n"
+ "udot z17.s, z13.b, z4.b[1]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z21.s, z13.b, z5.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z14.b, z4.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z22.s, z14.b, z5.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z19.s, z15.b, z4.b[1]\n"
+ "udot z23.s, z15.b, z5.b[1]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z16.s, z8.b, z4.b[2]\n"
+ "udot z20.s, z8.b, z5.b[2]\n"
+ "udot z17.s, z9.b, z4.b[2]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z21.s, z9.b, z5.b[2]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z10.b, z4.b[2]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z22.s, z10.b, z5.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z19.s, z11.b, z4.b[2]\n"
+ "udot z23.s, z11.b, z5.b[2]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z16.s, z12.b, z4.b[3]\n"
+ "udot z20.s, z12.b, z5.b[3]\n"
+ "udot z17.s, z13.b, z4.b[3]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z21.s, z13.b, z5.b[3]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z14.b, z4.b[3]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z22.s, z14.b, z5.b[3]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "udot z19.s, z15.b, z4.b[3]\n"
+ "udot z23.s, z15.b, z5.b[3]\n"
+ "b.ne 4b\n"
+ "3:\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "cbz %[regs], 5f\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+ "udot z21.s, z9.b, z1.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z22.s, z10.b, z1.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z23.s, z11.b, z1.b[0]\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z20.s, z12.b, z1.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "udot z21.s, z13.b, z1.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "udot z22.s, z14.b, z1.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z23.s, z15.b, z1.b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "udot z20.s, z8.b, z1.b[2]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z21.s, z9.b, z1.b[2]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "udot z22.s, z10.b, z1.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z23.s, z11.b, z1.b[2]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z16.s, z12.b, z0.b[3]\n"
+ "udot z20.s, z12.b, z1.b[3]\n"
+ "udot z17.s, z13.b, z0.b[3]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z21.s, z13.b, z1.b[3]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z14.b, z0.b[3]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z22.s, z14.b, z1.b[3]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z19.s, z15.b, z0.b[3]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
+ "udot z23.s, z15.b, z1.b[3]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z16.s, z8.b, z4.b[0]\n"
+ "udot z20.s, z8.b, z5.b[0]\n"
+ "udot z17.s, z9.b, z4.b[0]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z21.s, z9.b, z5.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z10.b, z4.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z22.s, z10.b, z5.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z19.s, z11.b, z4.b[0]\n"
+ "udot z23.s, z11.b, z5.b[0]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z16.s, z12.b, z4.b[1]\n"
+ "udot z20.s, z12.b, z5.b[1]\n"
+ "udot z17.s, z13.b, z4.b[1]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z21.s, z13.b, z5.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z14.b, z4.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z22.s, z14.b, z5.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "udot z19.s, z15.b, z4.b[1]\n"
+ "udot z23.s, z15.b, z5.b[1]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z16.s, z8.b, z4.b[2]\n"
+ "udot z20.s, z8.b, z5.b[2]\n"
+ "udot z17.s, z9.b, z4.b[2]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z21.s, z9.b, z5.b[2]\n"
+ "udot z18.s, z10.b, z4.b[2]\n"
+ "udot z22.s, z10.b, z5.b[2]\n"
+ "udot z19.s, z11.b, z4.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "udot z23.s, z11.b, z5.b[2]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z16.s, z12.b, z4.b[3]\n"
+ "udot z20.s, z12.b, z5.b[3]\n"
+ "udot z17.s, z13.b, z4.b[3]\n"
+ "udot z21.s, z13.b, z5.b[3]\n"
+ "udot z18.s, z14.b, z4.b[3]\n"
+ "udot z22.s, z14.b, z5.b[3]\n"
+ "udot z19.s, z15.b, z4.b[3]\n"
+ "udot z23.s, z15.b, z5.b[3]\n"
+ "cbz %[blocks], 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "udot z20.s, z8.b, z1.b[0]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "udot z21.s, z9.b, z1.b[0]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "udot z22.s, z10.b, z1.b[0]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "udot z23.s, z11.b, z1.b[0]\n"
+ "b.eq 7f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "udot z20.s, z12.b, z1.b[1]\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "udot z21.s, z13.b, z1.b[1]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "udot z22.s, z14.b, z1.b[1]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "udot z23.s, z15.b, z1.b[1]\n"
+ "b.eq 8f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "udot z20.s, z8.b, z1.b[2]\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "udot z21.s, z9.b, z1.b[2]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "udot z22.s, z10.b, z1.b[2]\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "udot z23.s, z11.b, z1.b[2]\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 10f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 11f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 12f\n"
+ "11:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 12f\n"
+ "10:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "12:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z16.s, z12.b, z0.b[3]\n"
+ "udot z20.s, z12.b, z1.b[3]\n"
+ "udot z17.s, z13.b, z0.b[3]\n"
+ "udot z21.s, z13.b, z1.b[3]\n"
+ "udot z18.s, z14.b, z0.b[3]\n"
+ "udot z22.s, z14.b, z1.b[3]\n"
+ "udot z19.s, z15.b, z0.b[3]\n"
+ "udot z23.s, z15.b, z1.b[3]\n"
+ "b 9f\n"
+ "8:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 13f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 14f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 15f\n"
+ "14:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 15f\n"
+ "13:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "15:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "udot z20.s, z8.b, z1.b[2]\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "udot z21.s, z9.b, z1.b[2]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "udot z22.s, z10.b, z1.b[2]\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "udot z23.s, z11.b, z1.b[2]\n"
+ "b 9f\n"
+ "7:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 16f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 17f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 18f\n"
+ "17:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 18f\n"
+ "16:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "18:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "udot z20.s, z12.b, z1.b[1]\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "udot z21.s, z13.b, z1.b[1]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "udot z22.s, z14.b, z1.b[1]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "udot z23.s, z15.b, z1.b[1]\n"
+ "b 9f\n"
+ "6:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 19f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 20f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 21f\n"
+ "20:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 21f\n"
+ "19:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "21:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "udot z20.s, z8.b, z1.b[0]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "udot z21.s, z9.b, z1.b[0]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "udot z22.s, z10.b, z1.b[0]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "udot z23.s, z11.b, z1.b[0]\n"
+ "b 9f\n"
+ "5:\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr1]\n"
+ "udot z21.s, z9.b, z1.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z22.s, z10.b, z1.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z23.s, z11.b, z1.b[0]\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z20.s, z12.b, z1.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "udot z21.s, z13.b, z1.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "udot z22.s, z14.b, z1.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "udot z23.s, z15.b, z1.b[1]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "udot z20.s, z8.b, z1.b[2]\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z21.s, z9.b, z1.b[2]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "udot z22.s, z10.b, z1.b[2]\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "udot z23.s, z11.b, z1.b[2]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z16.s, z12.b, z0.b[3]\n"
+ "udot z20.s, z12.b, z1.b[3]\n"
+ "udot z17.s, z13.b, z0.b[3]\n"
+ "udot z21.s, z13.b, z1.b[3]\n"
+ "udot z18.s, z14.b, z0.b[3]\n"
+ "udot z22.s, z14.b, z1.b[3]\n"
+ "udot z19.s, z15.b, z0.b[3]\n"
+ "udot z23.s, z15.b, z1.b[3]\n"
+ "cbz %[blocks], 22f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z4.b[0]\n"
+ "udot z20.s, z8.b, z5.b[0]\n"
+ "udot z17.s, z9.b, z4.b[0]\n"
+ "udot z21.s, z9.b, z5.b[0]\n"
+ "udot z18.s, z10.b, z4.b[0]\n"
+ "udot z22.s, z10.b, z5.b[0]\n"
+ "udot z19.s, z11.b, z4.b[0]\n"
+ "udot z23.s, z11.b, z5.b[0]\n"
+ "b.eq 23f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z16.s, z12.b, z4.b[1]\n"
+ "udot z20.s, z12.b, z5.b[1]\n"
+ "udot z17.s, z13.b, z4.b[1]\n"
+ "udot z21.s, z13.b, z5.b[1]\n"
+ "udot z18.s, z14.b, z4.b[1]\n"
+ "udot z22.s, z14.b, z5.b[1]\n"
+ "udot z19.s, z15.b, z4.b[1]\n"
+ "udot z23.s, z15.b, z5.b[1]\n"
+ "b.eq 24f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z4.b[2]\n"
+ "udot z20.s, z8.b, z5.b[2]\n"
+ "udot z17.s, z9.b, z4.b[2]\n"
+ "udot z21.s, z9.b, z5.b[2]\n"
+ "udot z18.s, z10.b, z4.b[2]\n"
+ "udot z22.s, z10.b, z5.b[2]\n"
+ "udot z19.s, z11.b, z4.b[2]\n"
+ "udot z23.s, z11.b, z5.b[2]\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 25f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 26f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 27f\n"
+ "26:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 27f\n"
+ "25:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "27:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z16.s, z12.b, z4.b[3]\n"
+ "udot z20.s, z12.b, z5.b[3]\n"
+ "udot z17.s, z13.b, z4.b[3]\n"
+ "udot z21.s, z13.b, z5.b[3]\n"
+ "udot z18.s, z14.b, z4.b[3]\n"
+ "udot z22.s, z14.b, z5.b[3]\n"
+ "udot z19.s, z15.b, z4.b[3]\n"
+ "udot z23.s, z15.b, z5.b[3]\n"
+ "b 9f\n"
+ "24:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 28f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 29f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 30f\n"
+ "29:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 30f\n"
+ "28:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "30:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z4.b[2]\n"
+ "udot z20.s, z8.b, z5.b[2]\n"
+ "udot z17.s, z9.b, z4.b[2]\n"
+ "udot z21.s, z9.b, z5.b[2]\n"
+ "udot z18.s, z10.b, z4.b[2]\n"
+ "udot z22.s, z10.b, z5.b[2]\n"
+ "udot z19.s, z11.b, z4.b[2]\n"
+ "udot z23.s, z11.b, z5.b[2]\n"
+ "b 9f\n"
+ "23:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 31f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 32f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 33f\n"
+ "32:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 33f\n"
+ "31:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "33:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z16.s, z12.b, z4.b[1]\n"
+ "udot z20.s, z12.b, z5.b[1]\n"
+ "udot z17.s, z13.b, z4.b[1]\n"
+ "udot z21.s, z13.b, z5.b[1]\n"
+ "udot z18.s, z14.b, z4.b[1]\n"
+ "udot z22.s, z14.b, z5.b[1]\n"
+ "udot z19.s, z15.b, z4.b[1]\n"
+ "udot z23.s, z15.b, z5.b[1]\n"
+ "b 9f\n"
+ "22:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 34f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 35f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 36f\n"
+ "35:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 36f\n"
+ "34:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "36:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z4.b[0]\n"
+ "udot z20.s, z8.b, z5.b[0]\n"
+ "udot z17.s, z9.b, z4.b[0]\n"
+ "udot z21.s, z9.b, z5.b[0]\n"
+ "udot z18.s, z10.b, z4.b[0]\n"
+ "udot z22.s, z10.b, z5.b[0]\n"
+ "udot z19.s, z11.b, z4.b[0]\n"
+ "udot z23.s, z11.b, z5.b[0]\n"
+ "9:\n"
+ "st1w z16.s, p0, [%[c_ptr0]]\n"
+ "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+ "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+ "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #4\n"
+ "st1w z20.s, p0, [c_ptr1]\n"
+ "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
+ "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
+ "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
+ ".unreq a_ptr1\n"
+ ".unreq c_ptr1\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [b_ptr1] "+r" (b_ptr1), [b_ptr2] "+r" (b_ptr2), [b_ptr3] "+r" (b_ptr3), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory"
+ );
+ break;
+ case 3:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "c_ptr1 .req X2\n"
+ "c_ptr2 .req X3\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.b, %[temp], %[leftovers]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "whilelt p4.b, %[temp], %[width]\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "incw %[temp], all, mul #1\n"
+ "ptrue p7.b\n"
+ "whilelt p1.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p2.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p3.s, %[temp], %[width]\n"
+ "cbz %[beta0], 1f\n"
+ "mov z16.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mov z17.s, #0\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mov z18.s, #0\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z19.s, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "mov z20.s, #0\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "mov z21.s, #0\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "mov z22.s, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "mov z23.s, #0\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "mov z24.s, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z25.s, #0\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "mov z26.s, #0\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "mov z27.s, #0\n"
+ "b 2f\n"
+ "1:\n"
+ "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+ "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+ "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+ "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+ "ld1w z20.s, p0/z, [c_ptr1]\n"
+ "mul z16.s, p7/m, z16.s, z15.s\n"
+ "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+ "mul z17.s, p7/m, z17.s, z15.s\n"
+ "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+ "mul z18.s, p7/m, z18.s, z15.s\n"
+ "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+ "mul z19.s, p7/m, z19.s, z15.s\n"
+ "ld1w z24.s, p0/z, [c_ptr2]\n"
+ "mul z20.s, p7/m, z20.s, z15.s\n"
+ "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
+ "mul z21.s, p7/m, z21.s, z15.s\n"
+ "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
+ "mul z22.s, p7/m, z22.s, z15.s\n"
+ "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
+ "mul z23.s, p7/m, z23.s, z15.s\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mul z24.s, p7/m, z24.s, z15.s\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mul z25.s, p7/m, z25.s, z15.s\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mul z26.s, p7/m, z26.s, z15.s\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "mul z27.s, p7/m, z27.s, z15.s\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "2:\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "cbz %[loops], 3f\n"
+ "4:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "udot z24.s, z8.b, z2.b[0]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr2]\n"
+ "udot z21.s, z9.b, z1.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z25.s, z9.b, z2.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z22.s, z10.b, z1.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z26.s, z10.b, z2.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z23.s, z11.b, z1.b[0]\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
+ "udot z27.s, z11.b, z2.b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z12.b, z1.b[1]\n"
+ "udot z24.s, z12.b, z2.b[1]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "udot z21.s, z13.b, z1.b[1]\n"
+ "udot z25.s, z13.b, z2.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z22.s, z14.b, z1.b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z26.s, z14.b, z2.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z23.s, z15.b, z1.b[1]\n"
+ "udot z27.s, z15.b, z2.b[1]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z8.b, z1.b[2]\n"
+ "udot z24.s, z8.b, z2.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "udot z21.s, z9.b, z1.b[2]\n"
+ "udot z25.s, z9.b, z2.b[2]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z22.s, z10.b, z1.b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z26.s, z10.b, z2.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z23.s, z11.b, z1.b[2]\n"
+ "udot z27.s, z11.b, z2.b[2]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z12.b, z0.b[3]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z12.b, z1.b[3]\n"
+ "udot z24.s, z12.b, z2.b[3]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "udot z17.s, z13.b, z0.b[3]\n"
+ "udot z21.s, z13.b, z1.b[3]\n"
+ "udot z25.s, z13.b, z2.b[3]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z14.b, z0.b[3]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z22.s, z14.b, z1.b[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z26.s, z14.b, z2.b[3]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z15.b, z0.b[3]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
+ "udot z23.s, z15.b, z1.b[3]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n"
+ "udot z27.s, z15.b, z2.b[3]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2, #-0x10]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z8.b, z4.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z8.b, z5.b[0]\n"
+ "udot z24.s, z8.b, z6.b[0]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "udot z17.s, z9.b, z4.b[0]\n"
+ "udot z21.s, z9.b, z5.b[0]\n"
+ "udot z25.s, z9.b, z6.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z10.b, z4.b[0]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z22.s, z10.b, z5.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z26.s, z10.b, z6.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z11.b, z4.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z23.s, z11.b, z5.b[0]\n"
+ "udot z27.s, z11.b, z6.b[0]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z12.b, z4.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z12.b, z5.b[1]\n"
+ "udot z24.s, z12.b, z6.b[1]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "udot z17.s, z13.b, z4.b[1]\n"
+ "udot z21.s, z13.b, z5.b[1]\n"
+ "udot z25.s, z13.b, z6.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z14.b, z4.b[1]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z22.s, z14.b, z5.b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z26.s, z14.b, z6.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z15.b, z4.b[1]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z23.s, z15.b, z5.b[1]\n"
+ "udot z27.s, z15.b, z6.b[1]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z8.b, z4.b[2]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z8.b, z5.b[2]\n"
+ "udot z24.s, z8.b, z6.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "udot z17.s, z9.b, z4.b[2]\n"
+ "udot z21.s, z9.b, z5.b[2]\n"
+ "udot z25.s, z9.b, z6.b[2]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z10.b, z4.b[2]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z22.s, z10.b, z5.b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z26.s, z10.b, z6.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z11.b, z4.b[2]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z23.s, z11.b, z5.b[2]\n"
+ "udot z27.s, z11.b, z6.b[2]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z12.b, z4.b[3]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z12.b, z5.b[3]\n"
+ "udot z24.s, z12.b, z6.b[3]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "udot z17.s, z13.b, z4.b[3]\n"
+ "udot z21.s, z13.b, z5.b[3]\n"
+ "udot z25.s, z13.b, z6.b[3]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z14.b, z4.b[3]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z22.s, z14.b, z5.b[3]\n"
+ "udot z26.s, z14.b, z6.b[3]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z15.b, z4.b[3]\n"
+ "udot z23.s, z15.b, z5.b[3]\n"
+ "udot z27.s, z15.b, z6.b[3]\n"
+ "b.ne 4b\n"
+ "3:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "cbz %[regs], 5f\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "udot z24.s, z8.b, z2.b[0]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr2]\n"
+ "udot z21.s, z9.b, z1.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z25.s, z9.b, z2.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z22.s, z10.b, z1.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z26.s, z10.b, z2.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z23.s, z11.b, z1.b[0]\n"
+ "udot z27.s, z11.b, z2.b[0]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z12.b, z1.b[1]\n"
+ "udot z24.s, z12.b, z2.b[1]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "udot z21.s, z13.b, z1.b[1]\n"
+ "udot z25.s, z13.b, z2.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z22.s, z14.b, z1.b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z26.s, z14.b, z2.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z23.s, z15.b, z1.b[1]\n"
+ "udot z27.s, z15.b, z2.b[1]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z8.b, z1.b[2]\n"
+ "udot z24.s, z8.b, z2.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "udot z21.s, z9.b, z1.b[2]\n"
+ "udot z25.s, z9.b, z2.b[2]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z22.s, z10.b, z1.b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z26.s, z10.b, z2.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z23.s, z11.b, z1.b[2]\n"
+ "udot z27.s, z11.b, z2.b[2]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z12.b, z0.b[3]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z12.b, z1.b[3]\n"
+ "udot z24.s, z12.b, z2.b[3]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "udot z17.s, z13.b, z0.b[3]\n"
+ "udot z21.s, z13.b, z1.b[3]\n"
+ "udot z25.s, z13.b, z2.b[3]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z14.b, z0.b[3]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z22.s, z14.b, z1.b[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z26.s, z14.b, z2.b[3]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z15.b, z0.b[3]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
+ "udot z23.s, z15.b, z1.b[3]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
+ "udot z27.s, z15.b, z2.b[3]\n"
+ "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z8.b, z4.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z8.b, z5.b[0]\n"
+ "udot z24.s, z8.b, z6.b[0]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "udot z17.s, z9.b, z4.b[0]\n"
+ "udot z21.s, z9.b, z5.b[0]\n"
+ "udot z25.s, z9.b, z6.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z10.b, z4.b[0]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z22.s, z10.b, z5.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z26.s, z10.b, z6.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z11.b, z4.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z23.s, z11.b, z5.b[0]\n"
+ "udot z27.s, z11.b, z6.b[0]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z12.b, z4.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z12.b, z5.b[1]\n"
+ "udot z24.s, z12.b, z6.b[1]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "udot z17.s, z13.b, z4.b[1]\n"
+ "udot z21.s, z13.b, z5.b[1]\n"
+ "udot z25.s, z13.b, z6.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z14.b, z4.b[1]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z22.s, z14.b, z5.b[1]\n"
+ "udot z26.s, z14.b, z6.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z15.b, z4.b[1]\n"
+ "udot z23.s, z15.b, z5.b[1]\n"
+ "udot z27.s, z15.b, z6.b[1]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z8.b, z4.b[2]\n"
+ "udot z20.s, z8.b, z5.b[2]\n"
+ "udot z24.s, z8.b, z6.b[2]\n"
+ "udot z17.s, z9.b, z4.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "udot z21.s, z9.b, z5.b[2]\n"
+ "udot z25.s, z9.b, z6.b[2]\n"
+ "udot z18.s, z10.b, z4.b[2]\n"
+ "udot z22.s, z10.b, z5.b[2]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z26.s, z10.b, z6.b[2]\n"
+ "udot z19.s, z11.b, z4.b[2]\n"
+ "udot z23.s, z11.b, z5.b[2]\n"
+ "udot z27.s, z11.b, z6.b[2]\n"
+ "udot z16.s, z12.b, z4.b[3]\n"
+ "udot z20.s, z12.b, z5.b[3]\n"
+ "udot z24.s, z12.b, z6.b[3]\n"
+ "udot z17.s, z13.b, z4.b[3]\n"
+ "udot z21.s, z13.b, z5.b[3]\n"
+ "udot z25.s, z13.b, z6.b[3]\n"
+ "udot z18.s, z14.b, z4.b[3]\n"
+ "udot z22.s, z14.b, z5.b[3]\n"
+ "udot z26.s, z14.b, z6.b[3]\n"
+ "udot z19.s, z15.b, z4.b[3]\n"
+ "udot z23.s, z15.b, z5.b[3]\n"
+ "udot z27.s, z15.b, z6.b[3]\n"
+ "cbz %[blocks], 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "udot z20.s, z8.b, z1.b[0]\n"
+ "udot z24.s, z8.b, z2.b[0]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "udot z21.s, z9.b, z1.b[0]\n"
+ "udot z25.s, z9.b, z2.b[0]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "udot z22.s, z10.b, z1.b[0]\n"
+ "udot z26.s, z10.b, z2.b[0]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "udot z23.s, z11.b, z1.b[0]\n"
+ "udot z27.s, z11.b, z2.b[0]\n"
+ "b.eq 7f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "udot z20.s, z12.b, z1.b[1]\n"
+ "udot z24.s, z12.b, z2.b[1]\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "udot z21.s, z13.b, z1.b[1]\n"
+ "udot z25.s, z13.b, z2.b[1]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "udot z22.s, z14.b, z1.b[1]\n"
+ "udot z26.s, z14.b, z2.b[1]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "udot z23.s, z15.b, z1.b[1]\n"
+ "udot z27.s, z15.b, z2.b[1]\n"
+ "b.eq 8f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "udot z20.s, z8.b, z1.b[2]\n"
+ "udot z24.s, z8.b, z2.b[2]\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "udot z21.s, z9.b, z1.b[2]\n"
+ "udot z25.s, z9.b, z2.b[2]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "udot z22.s, z10.b, z1.b[2]\n"
+ "udot z26.s, z10.b, z2.b[2]\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "udot z23.s, z11.b, z1.b[2]\n"
+ "udot z27.s, z11.b, z2.b[2]\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 10f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 11f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 12f\n"
+ "11:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 12f\n"
+ "10:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "12:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z16.s, z12.b, z0.b[3]\n"
+ "udot z20.s, z12.b, z1.b[3]\n"
+ "udot z24.s, z12.b, z2.b[3]\n"
+ "udot z17.s, z13.b, z0.b[3]\n"
+ "udot z21.s, z13.b, z1.b[3]\n"
+ "udot z25.s, z13.b, z2.b[3]\n"
+ "udot z18.s, z14.b, z0.b[3]\n"
+ "udot z22.s, z14.b, z1.b[3]\n"
+ "udot z26.s, z14.b, z2.b[3]\n"
+ "udot z19.s, z15.b, z0.b[3]\n"
+ "udot z23.s, z15.b, z1.b[3]\n"
+ "udot z27.s, z15.b, z2.b[3]\n"
+ "b 9f\n"
+ "8:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 13f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 14f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 15f\n"
+ "14:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 15f\n"
+ "13:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "15:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "udot z20.s, z8.b, z1.b[2]\n"
+ "udot z24.s, z8.b, z2.b[2]\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "udot z21.s, z9.b, z1.b[2]\n"
+ "udot z25.s, z9.b, z2.b[2]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "udot z22.s, z10.b, z1.b[2]\n"
+ "udot z26.s, z10.b, z2.b[2]\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "udot z23.s, z11.b, z1.b[2]\n"
+ "udot z27.s, z11.b, z2.b[2]\n"
+ "b 9f\n"
+ "7:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 16f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 17f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 18f\n"
+ "17:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 18f\n"
+ "16:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "18:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "udot z20.s, z12.b, z1.b[1]\n"
+ "udot z24.s, z12.b, z2.b[1]\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "udot z21.s, z13.b, z1.b[1]\n"
+ "udot z25.s, z13.b, z2.b[1]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "udot z22.s, z14.b, z1.b[1]\n"
+ "udot z26.s, z14.b, z2.b[1]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "udot z23.s, z15.b, z1.b[1]\n"
+ "udot z27.s, z15.b, z2.b[1]\n"
+ "b 9f\n"
+ "6:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 19f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 20f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 21f\n"
+ "20:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 21f\n"
+ "19:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "21:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "udot z20.s, z8.b, z1.b[0]\n"
+ "udot z24.s, z8.b, z2.b[0]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "udot z21.s, z9.b, z1.b[0]\n"
+ "udot z25.s, z9.b, z2.b[0]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "udot z22.s, z10.b, z1.b[0]\n"
+ "udot z26.s, z10.b, z2.b[0]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "udot z23.s, z11.b, z1.b[0]\n"
+ "udot z27.s, z11.b, z2.b[0]\n"
+ "b 9f\n"
+ "5:\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
+ "udot z24.s, z8.b, z2.b[0]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr1]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z6.b, p6/z, [a_ptr2]\n"
+ "udot z21.s, z9.b, z1.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z25.s, z9.b, z2.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z22.s, z10.b, z1.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z26.s, z10.b, z2.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z23.s, z11.b, z1.b[0]\n"
+ "udot z27.s, z11.b, z2.b[0]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z12.b, z1.b[1]\n"
+ "udot z24.s, z12.b, z2.b[1]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "udot z21.s, z13.b, z1.b[1]\n"
+ "udot z25.s, z13.b, z2.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z22.s, z14.b, z1.b[1]\n"
+ "udot z26.s, z14.b, z2.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "udot z23.s, z15.b, z1.b[1]\n"
+ "udot z27.s, z15.b, z2.b[1]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "udot z20.s, z8.b, z1.b[2]\n"
+ "udot z24.s, z8.b, z2.b[2]\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "udot z21.s, z9.b, z1.b[2]\n"
+ "udot z25.s, z9.b, z2.b[2]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "udot z22.s, z10.b, z1.b[2]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z26.s, z10.b, z2.b[2]\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "udot z23.s, z11.b, z1.b[2]\n"
+ "udot z27.s, z11.b, z2.b[2]\n"
+ "udot z16.s, z12.b, z0.b[3]\n"
+ "udot z20.s, z12.b, z1.b[3]\n"
+ "udot z24.s, z12.b, z2.b[3]\n"
+ "udot z17.s, z13.b, z0.b[3]\n"
+ "udot z21.s, z13.b, z1.b[3]\n"
+ "udot z25.s, z13.b, z2.b[3]\n"
+ "udot z18.s, z14.b, z0.b[3]\n"
+ "udot z22.s, z14.b, z1.b[3]\n"
+ "udot z26.s, z14.b, z2.b[3]\n"
+ "udot z19.s, z15.b, z0.b[3]\n"
+ "udot z23.s, z15.b, z1.b[3]\n"
+ "udot z27.s, z15.b, z2.b[3]\n"
+ "cbz %[blocks], 22f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z4.b[0]\n"
+ "udot z20.s, z8.b, z5.b[0]\n"
+ "udot z24.s, z8.b, z6.b[0]\n"
+ "udot z17.s, z9.b, z4.b[0]\n"
+ "udot z21.s, z9.b, z5.b[0]\n"
+ "udot z25.s, z9.b, z6.b[0]\n"
+ "udot z18.s, z10.b, z4.b[0]\n"
+ "udot z22.s, z10.b, z5.b[0]\n"
+ "udot z26.s, z10.b, z6.b[0]\n"
+ "udot z19.s, z11.b, z4.b[0]\n"
+ "udot z23.s, z11.b, z5.b[0]\n"
+ "udot z27.s, z11.b, z6.b[0]\n"
+ "b.eq 23f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z16.s, z12.b, z4.b[1]\n"
+ "udot z20.s, z12.b, z5.b[1]\n"
+ "udot z24.s, z12.b, z6.b[1]\n"
+ "udot z17.s, z13.b, z4.b[1]\n"
+ "udot z21.s, z13.b, z5.b[1]\n"
+ "udot z25.s, z13.b, z6.b[1]\n"
+ "udot z18.s, z14.b, z4.b[1]\n"
+ "udot z22.s, z14.b, z5.b[1]\n"
+ "udot z26.s, z14.b, z6.b[1]\n"
+ "udot z19.s, z15.b, z4.b[1]\n"
+ "udot z23.s, z15.b, z5.b[1]\n"
+ "udot z27.s, z15.b, z6.b[1]\n"
+ "b.eq 24f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z4.b[2]\n"
+ "udot z20.s, z8.b, z5.b[2]\n"
+ "udot z24.s, z8.b, z6.b[2]\n"
+ "udot z17.s, z9.b, z4.b[2]\n"
+ "udot z21.s, z9.b, z5.b[2]\n"
+ "udot z25.s, z9.b, z6.b[2]\n"
+ "udot z18.s, z10.b, z4.b[2]\n"
+ "udot z22.s, z10.b, z5.b[2]\n"
+ "udot z26.s, z10.b, z6.b[2]\n"
+ "udot z19.s, z11.b, z4.b[2]\n"
+ "udot z23.s, z11.b, z5.b[2]\n"
+ "udot z27.s, z11.b, z6.b[2]\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 25f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 26f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 27f\n"
+ "26:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 27f\n"
+ "25:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "27:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z16.s, z12.b, z4.b[3]\n"
+ "udot z20.s, z12.b, z5.b[3]\n"
+ "udot z24.s, z12.b, z6.b[3]\n"
+ "udot z17.s, z13.b, z4.b[3]\n"
+ "udot z21.s, z13.b, z5.b[3]\n"
+ "udot z25.s, z13.b, z6.b[3]\n"
+ "udot z18.s, z14.b, z4.b[3]\n"
+ "udot z22.s, z14.b, z5.b[3]\n"
+ "udot z26.s, z14.b, z6.b[3]\n"
+ "udot z19.s, z15.b, z4.b[3]\n"
+ "udot z23.s, z15.b, z5.b[3]\n"
+ "udot z27.s, z15.b, z6.b[3]\n"
+ "b 9f\n"
+ "24:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 28f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 29f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 30f\n"
+ "29:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 30f\n"
+ "28:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "30:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z4.b[2]\n"
+ "udot z20.s, z8.b, z5.b[2]\n"
+ "udot z24.s, z8.b, z6.b[2]\n"
+ "udot z17.s, z9.b, z4.b[2]\n"
+ "udot z21.s, z9.b, z5.b[2]\n"
+ "udot z25.s, z9.b, z6.b[2]\n"
+ "udot z18.s, z10.b, z4.b[2]\n"
+ "udot z22.s, z10.b, z5.b[2]\n"
+ "udot z26.s, z10.b, z6.b[2]\n"
+ "udot z19.s, z11.b, z4.b[2]\n"
+ "udot z23.s, z11.b, z5.b[2]\n"
+ "udot z27.s, z11.b, z6.b[2]\n"
+ "b 9f\n"
+ "23:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 31f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 32f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 33f\n"
+ "32:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 33f\n"
+ "31:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "33:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z16.s, z12.b, z4.b[1]\n"
+ "udot z20.s, z12.b, z5.b[1]\n"
+ "udot z24.s, z12.b, z6.b[1]\n"
+ "udot z17.s, z13.b, z4.b[1]\n"
+ "udot z21.s, z13.b, z5.b[1]\n"
+ "udot z25.s, z13.b, z6.b[1]\n"
+ "udot z18.s, z14.b, z4.b[1]\n"
+ "udot z22.s, z14.b, z5.b[1]\n"
+ "udot z26.s, z14.b, z6.b[1]\n"
+ "udot z19.s, z15.b, z4.b[1]\n"
+ "udot z23.s, z15.b, z5.b[1]\n"
+ "udot z27.s, z15.b, z6.b[1]\n"
+ "b 9f\n"
+ "22:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 34f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 35f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 36f\n"
+ "35:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 36f\n"
+ "34:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "36:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z4.b[0]\n"
+ "udot z20.s, z8.b, z5.b[0]\n"
+ "udot z24.s, z8.b, z6.b[0]\n"
+ "udot z17.s, z9.b, z4.b[0]\n"
+ "udot z21.s, z9.b, z5.b[0]\n"
+ "udot z25.s, z9.b, z6.b[0]\n"
+ "udot z18.s, z10.b, z4.b[0]\n"
+ "udot z22.s, z10.b, z5.b[0]\n"
+ "udot z26.s, z10.b, z6.b[0]\n"
+ "udot z19.s, z11.b, z4.b[0]\n"
+ "udot z23.s, z11.b, z5.b[0]\n"
+ "udot z27.s, z11.b, z6.b[0]\n"
+ "9:\n"
+ "st1w z16.s, p0, [%[c_ptr0]]\n"
+ "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+ "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+ "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #4\n"
+ "st1w z20.s, p0, [c_ptr1]\n"
+ "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
+ "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
+ "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
+ "st1w z24.s, p0, [c_ptr2]\n"
+ "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n"
+ "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n"
+ "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [b_ptr1] "+r" (b_ptr1), [b_ptr2] "+r" (b_ptr2), [b_ptr3] "+r" (b_ptr3), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory"
+ );
+ break;
+ default:
+ case 4:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.b, %[temp], %[leftovers]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "whilelt p4.b, %[temp], %[width]\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "incw %[temp], all, mul #1\n"
+ "ptrue p7.b\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "whilelt p1.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p2.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p3.s, %[temp], %[width]\n"
+ "cbz %[beta0], 1f\n"
+ "mov z16.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mov z17.s, #0\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mov z18.s, #0\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z19.s, #0\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "mov z20.s, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "mov z21.s, #0\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "mov z22.s, #0\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "mov z23.s, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "mov z24.s, #0\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "mov z25.s, #0\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "mov z26.s, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "mov z27.s, #0\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "mov z28.s, #0\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 2f\n"
+ "1:\n"
+ "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+ "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+ "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+ "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+ "ld1w z20.s, p0/z, [c_ptr1]\n"
+ "mul z16.s, p7/m, z16.s, z15.s\n"
+ "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+ "mul z17.s, p7/m, z17.s, z15.s\n"
+ "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+ "mul z18.s, p7/m, z18.s, z15.s\n"
+ "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+ "mul z19.s, p7/m, z19.s, z15.s\n"
+ "ld1w z24.s, p0/z, [c_ptr2]\n"
+ "mul z20.s, p7/m, z20.s, z15.s\n"
+ "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
+ "mul z21.s, p7/m, z21.s, z15.s\n"
+ "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
+ "mul z22.s, p7/m, z22.s, z15.s\n"
+ "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
+ "mul z23.s, p7/m, z23.s, z15.s\n"
+ "ld1w z28.s, p0/z, [c_ptr3]\n"
+ "mul z24.s, p7/m, z24.s, z15.s\n"
+ "ld1w z29.s, p1/z, [c_ptr3, #1, MUL VL]\n"
+ "mul z25.s, p7/m, z25.s, z15.s\n"
+ "ld1w z30.s, p2/z, [c_ptr3, #2, MUL VL]\n"
+ "mul z26.s, p7/m, z26.s, z15.s\n"
+ "ld1w z31.s, p3/z, [c_ptr3, #3, MUL VL]\n"
+ "mul z27.s, p7/m, z27.s, z15.s\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mul z28.s, p7/m, z28.s, z15.s\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mul z29.s, p7/m, z29.s, z15.s\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mul z30.s, p7/m, z30.s, z15.s\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "mul z31.s, p7/m, z31.s, z15.s\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "2:\n"
+ "cbz %[loops], 3f\n"
+ "4:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "udot z24.s, z8.b, z2.b[0]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+ "udot z28.s, z8.b, z3.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr2]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr3]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "udot z21.s, z9.b, z1.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z25.s, z9.b, z2.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z29.s, z9.b, z3.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z22.s, z10.b, z1.b[0]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "udot z26.s, z10.b, z2.b[0]\n"
+ "add a_ptr3, a_ptr3, #0x20\n"
+ "udot z30.s, z10.b, z3.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z23.s, z11.b, z1.b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "udot z27.s, z11.b, z2.b[0]\n"
+ "udot z31.s, z11.b, z3.b[0]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z12.b, z1.b[1]\n"
+ "udot z24.s, z12.b, z2.b[1]\n"
+ "udot z28.s, z12.b, z3.b[1]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "udot z21.s, z13.b, z1.b[1]\n"
+ "udot z25.s, z13.b, z2.b[1]\n"
+ "udot z29.s, z13.b, z3.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z22.s, z14.b, z1.b[1]\n"
+ "udot z26.s, z14.b, z2.b[1]\n"
+ "udot z30.s, z14.b, z3.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z23.s, z15.b, z1.b[1]\n"
+ "udot z27.s, z15.b, z2.b[1]\n"
+ "udot z31.s, z15.b, z3.b[1]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z8.b, z1.b[2]\n"
+ "udot z24.s, z8.b, z2.b[2]\n"
+ "udot z28.s, z8.b, z3.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "udot z21.s, z9.b, z1.b[2]\n"
+ "udot z25.s, z9.b, z2.b[2]\n"
+ "udot z29.s, z9.b, z3.b[2]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z22.s, z10.b, z1.b[2]\n"
+ "udot z26.s, z10.b, z2.b[2]\n"
+ "udot z30.s, z10.b, z3.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z23.s, z11.b, z1.b[2]\n"
+ "udot z27.s, z11.b, z2.b[2]\n"
+ "udot z31.s, z11.b, z3.b[2]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z12.b, z0.b[3]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z12.b, z1.b[3]\n"
+ "udot z24.s, z12.b, z2.b[3]\n"
+ "udot z28.s, z12.b, z3.b[3]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "udot z17.s, z13.b, z0.b[3]\n"
+ "udot z21.s, z13.b, z1.b[3]\n"
+ "udot z25.s, z13.b, z2.b[3]\n"
+ "udot z29.s, z13.b, z3.b[3]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z18.s, z14.b, z0.b[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z22.s, z14.b, z1.b[3]\n"
+ "udot z26.s, z14.b, z2.b[3]\n"
+ "udot z30.s, z14.b, z3.b[3]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z15.b, z0.b[3]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
+ "udot z23.s, z15.b, z1.b[3]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n"
+ "udot z27.s, z15.b, z2.b[3]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2, #-0x10]\n"
+ "udot z31.s, z15.b, z3.b[3]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3, #-0x10]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z8.b, z4.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z8.b, z5.b[0]\n"
+ "udot z24.s, z8.b, z6.b[0]\n"
+ "udot z28.s, z8.b, z7.b[0]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "udot z17.s, z9.b, z4.b[0]\n"
+ "udot z21.s, z9.b, z5.b[0]\n"
+ "udot z25.s, z9.b, z6.b[0]\n"
+ "udot z29.s, z9.b, z7.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z18.s, z10.b, z4.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z22.s, z10.b, z5.b[0]\n"
+ "udot z26.s, z10.b, z6.b[0]\n"
+ "udot z30.s, z10.b, z7.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z11.b, z4.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z23.s, z11.b, z5.b[0]\n"
+ "udot z27.s, z11.b, z6.b[0]\n"
+ "udot z31.s, z11.b, z7.b[0]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z12.b, z4.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z12.b, z5.b[1]\n"
+ "udot z24.s, z12.b, z6.b[1]\n"
+ "udot z28.s, z12.b, z7.b[1]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "udot z17.s, z13.b, z4.b[1]\n"
+ "udot z21.s, z13.b, z5.b[1]\n"
+ "udot z25.s, z13.b, z6.b[1]\n"
+ "udot z29.s, z13.b, z7.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z18.s, z14.b, z4.b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z22.s, z14.b, z5.b[1]\n"
+ "udot z26.s, z14.b, z6.b[1]\n"
+ "udot z30.s, z14.b, z7.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z15.b, z4.b[1]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z23.s, z15.b, z5.b[1]\n"
+ "udot z27.s, z15.b, z6.b[1]\n"
+ "udot z31.s, z15.b, z7.b[1]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z8.b, z4.b[2]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z8.b, z5.b[2]\n"
+ "udot z24.s, z8.b, z6.b[2]\n"
+ "udot z28.s, z8.b, z7.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "udot z17.s, z9.b, z4.b[2]\n"
+ "udot z21.s, z9.b, z5.b[2]\n"
+ "udot z25.s, z9.b, z6.b[2]\n"
+ "udot z29.s, z9.b, z7.b[2]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z18.s, z10.b, z4.b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z22.s, z10.b, z5.b[2]\n"
+ "udot z26.s, z10.b, z6.b[2]\n"
+ "udot z30.s, z10.b, z7.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z11.b, z4.b[2]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z23.s, z11.b, z5.b[2]\n"
+ "udot z27.s, z11.b, z6.b[2]\n"
+ "udot z31.s, z11.b, z7.b[2]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z12.b, z4.b[3]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z12.b, z5.b[3]\n"
+ "udot z24.s, z12.b, z6.b[3]\n"
+ "udot z28.s, z12.b, z7.b[3]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "udot z17.s, z13.b, z4.b[3]\n"
+ "udot z21.s, z13.b, z5.b[3]\n"
+ "udot z25.s, z13.b, z6.b[3]\n"
+ "udot z29.s, z13.b, z7.b[3]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z18.s, z14.b, z4.b[3]\n"
+ "udot z22.s, z14.b, z5.b[3]\n"
+ "udot z26.s, z14.b, z6.b[3]\n"
+ "udot z30.s, z14.b, z7.b[3]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z15.b, z4.b[3]\n"
+ "udot z23.s, z15.b, z5.b[3]\n"
+ "udot z27.s, z15.b, z6.b[3]\n"
+ "udot z31.s, z15.b, z7.b[3]\n"
+ "b.ne 4b\n"
+ "3:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "cbz %[regs], 5f\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "udot z24.s, z8.b, z2.b[0]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+ "udot z28.s, z8.b, z3.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr2]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr3]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "udot z21.s, z9.b, z1.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z25.s, z9.b, z2.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z29.s, z9.b, z3.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z22.s, z10.b, z1.b[0]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "udot z26.s, z10.b, z2.b[0]\n"
+ "udot z30.s, z10.b, z3.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z23.s, z11.b, z1.b[0]\n"
+ "udot z27.s, z11.b, z2.b[0]\n"
+ "udot z31.s, z11.b, z3.b[0]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z12.b, z1.b[1]\n"
+ "udot z24.s, z12.b, z2.b[1]\n"
+ "udot z28.s, z12.b, z3.b[1]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "udot z21.s, z13.b, z1.b[1]\n"
+ "udot z25.s, z13.b, z2.b[1]\n"
+ "udot z29.s, z13.b, z3.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z22.s, z14.b, z1.b[1]\n"
+ "udot z26.s, z14.b, z2.b[1]\n"
+ "udot z30.s, z14.b, z3.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z23.s, z15.b, z1.b[1]\n"
+ "udot z27.s, z15.b, z2.b[1]\n"
+ "udot z31.s, z15.b, z3.b[1]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z8.b, z1.b[2]\n"
+ "udot z24.s, z8.b, z2.b[2]\n"
+ "udot z28.s, z8.b, z3.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "udot z21.s, z9.b, z1.b[2]\n"
+ "udot z25.s, z9.b, z2.b[2]\n"
+ "udot z29.s, z9.b, z3.b[2]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z22.s, z10.b, z1.b[2]\n"
+ "udot z26.s, z10.b, z2.b[2]\n"
+ "udot z30.s, z10.b, z3.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z23.s, z11.b, z1.b[2]\n"
+ "udot z27.s, z11.b, z2.b[2]\n"
+ "udot z31.s, z11.b, z3.b[2]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z12.b, z0.b[3]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z12.b, z1.b[3]\n"
+ "udot z24.s, z12.b, z2.b[3]\n"
+ "udot z28.s, z12.b, z3.b[3]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "udot z17.s, z13.b, z0.b[3]\n"
+ "udot z21.s, z13.b, z1.b[3]\n"
+ "udot z25.s, z13.b, z2.b[3]\n"
+ "udot z29.s, z13.b, z3.b[3]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z18.s, z14.b, z0.b[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z22.s, z14.b, z1.b[3]\n"
+ "udot z26.s, z14.b, z2.b[3]\n"
+ "udot z30.s, z14.b, z3.b[3]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z15.b, z0.b[3]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
+ "udot z23.s, z15.b, z1.b[3]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
+ "udot z27.s, z15.b, z2.b[3]\n"
+ "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
+ "udot z31.s, z15.b, z3.b[3]\n"
+ "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z8.b, z4.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z8.b, z5.b[0]\n"
+ "udot z24.s, z8.b, z6.b[0]\n"
+ "udot z28.s, z8.b, z7.b[0]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "udot z17.s, z9.b, z4.b[0]\n"
+ "udot z21.s, z9.b, z5.b[0]\n"
+ "udot z25.s, z9.b, z6.b[0]\n"
+ "udot z29.s, z9.b, z7.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z18.s, z10.b, z4.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z22.s, z10.b, z5.b[0]\n"
+ "udot z26.s, z10.b, z6.b[0]\n"
+ "udot z30.s, z10.b, z7.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z11.b, z4.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z23.s, z11.b, z5.b[0]\n"
+ "udot z27.s, z11.b, z6.b[0]\n"
+ "udot z31.s, z11.b, z7.b[0]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z12.b, z4.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z12.b, z5.b[1]\n"
+ "udot z24.s, z12.b, z6.b[1]\n"
+ "udot z28.s, z12.b, z7.b[1]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "udot z17.s, z13.b, z4.b[1]\n"
+ "udot z21.s, z13.b, z5.b[1]\n"
+ "udot z25.s, z13.b, z6.b[1]\n"
+ "udot z29.s, z13.b, z7.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z18.s, z14.b, z4.b[1]\n"
+ "udot z22.s, z14.b, z5.b[1]\n"
+ "udot z26.s, z14.b, z6.b[1]\n"
+ "udot z30.s, z14.b, z7.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z15.b, z4.b[1]\n"
+ "udot z23.s, z15.b, z5.b[1]\n"
+ "udot z27.s, z15.b, z6.b[1]\n"
+ "udot z31.s, z15.b, z7.b[1]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z8.b, z4.b[2]\n"
+ "udot z20.s, z8.b, z5.b[2]\n"
+ "udot z24.s, z8.b, z6.b[2]\n"
+ "udot z28.s, z8.b, z7.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "udot z17.s, z9.b, z4.b[2]\n"
+ "udot z21.s, z9.b, z5.b[2]\n"
+ "udot z25.s, z9.b, z6.b[2]\n"
+ "udot z29.s, z9.b, z7.b[2]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z18.s, z10.b, z4.b[2]\n"
+ "udot z22.s, z10.b, z5.b[2]\n"
+ "udot z26.s, z10.b, z6.b[2]\n"
+ "udot z30.s, z10.b, z7.b[2]\n"
+ "udot z19.s, z11.b, z4.b[2]\n"
+ "udot z23.s, z11.b, z5.b[2]\n"
+ "udot z27.s, z11.b, z6.b[2]\n"
+ "udot z31.s, z11.b, z7.b[2]\n"
+ "udot z16.s, z12.b, z4.b[3]\n"
+ "udot z20.s, z12.b, z5.b[3]\n"
+ "udot z24.s, z12.b, z6.b[3]\n"
+ "udot z28.s, z12.b, z7.b[3]\n"
+ "udot z17.s, z13.b, z4.b[3]\n"
+ "udot z21.s, z13.b, z5.b[3]\n"
+ "udot z25.s, z13.b, z6.b[3]\n"
+ "udot z29.s, z13.b, z7.b[3]\n"
+ "udot z18.s, z14.b, z4.b[3]\n"
+ "udot z22.s, z14.b, z5.b[3]\n"
+ "udot z26.s, z14.b, z6.b[3]\n"
+ "udot z30.s, z14.b, z7.b[3]\n"
+ "udot z19.s, z15.b, z4.b[3]\n"
+ "udot z23.s, z15.b, z5.b[3]\n"
+ "udot z27.s, z15.b, z6.b[3]\n"
+ "udot z31.s, z15.b, z7.b[3]\n"
+ "cbz %[blocks], 6f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "udot z20.s, z8.b, z1.b[0]\n"
+ "udot z24.s, z8.b, z2.b[0]\n"
+ "udot z28.s, z8.b, z3.b[0]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "udot z21.s, z9.b, z1.b[0]\n"
+ "udot z25.s, z9.b, z2.b[0]\n"
+ "udot z29.s, z9.b, z3.b[0]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "udot z22.s, z10.b, z1.b[0]\n"
+ "udot z26.s, z10.b, z2.b[0]\n"
+ "udot z30.s, z10.b, z3.b[0]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "udot z23.s, z11.b, z1.b[0]\n"
+ "udot z27.s, z11.b, z2.b[0]\n"
+ "udot z31.s, z11.b, z3.b[0]\n"
+ "b.eq 7f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "udot z20.s, z12.b, z1.b[1]\n"
+ "udot z24.s, z12.b, z2.b[1]\n"
+ "udot z28.s, z12.b, z3.b[1]\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "udot z21.s, z13.b, z1.b[1]\n"
+ "udot z25.s, z13.b, z2.b[1]\n"
+ "udot z29.s, z13.b, z3.b[1]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "udot z22.s, z14.b, z1.b[1]\n"
+ "udot z26.s, z14.b, z2.b[1]\n"
+ "udot z30.s, z14.b, z3.b[1]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "udot z23.s, z15.b, z1.b[1]\n"
+ "udot z27.s, z15.b, z2.b[1]\n"
+ "udot z31.s, z15.b, z3.b[1]\n"
+ "b.eq 8f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "udot z20.s, z8.b, z1.b[2]\n"
+ "udot z24.s, z8.b, z2.b[2]\n"
+ "udot z28.s, z8.b, z3.b[2]\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "udot z21.s, z9.b, z1.b[2]\n"
+ "udot z25.s, z9.b, z2.b[2]\n"
+ "udot z29.s, z9.b, z3.b[2]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "udot z22.s, z10.b, z1.b[2]\n"
+ "udot z26.s, z10.b, z2.b[2]\n"
+ "udot z30.s, z10.b, z3.b[2]\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "udot z23.s, z11.b, z1.b[2]\n"
+ "udot z27.s, z11.b, z2.b[2]\n"
+ "udot z31.s, z11.b, z3.b[2]\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 10f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 11f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 12f\n"
+ "11:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 12f\n"
+ "10:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "12:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z16.s, z12.b, z0.b[3]\n"
+ "udot z20.s, z12.b, z1.b[3]\n"
+ "udot z24.s, z12.b, z2.b[3]\n"
+ "udot z28.s, z12.b, z3.b[3]\n"
+ "udot z17.s, z13.b, z0.b[3]\n"
+ "udot z21.s, z13.b, z1.b[3]\n"
+ "udot z25.s, z13.b, z2.b[3]\n"
+ "udot z29.s, z13.b, z3.b[3]\n"
+ "udot z18.s, z14.b, z0.b[3]\n"
+ "udot z22.s, z14.b, z1.b[3]\n"
+ "udot z26.s, z14.b, z2.b[3]\n"
+ "udot z30.s, z14.b, z3.b[3]\n"
+ "udot z19.s, z15.b, z0.b[3]\n"
+ "udot z23.s, z15.b, z1.b[3]\n"
+ "udot z27.s, z15.b, z2.b[3]\n"
+ "udot z31.s, z15.b, z3.b[3]\n"
+ "b 9f\n"
+ "8:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 13f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 14f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 15f\n"
+ "14:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 15f\n"
+ "13:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "15:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "udot z20.s, z8.b, z1.b[2]\n"
+ "udot z24.s, z8.b, z2.b[2]\n"
+ "udot z28.s, z8.b, z3.b[2]\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "udot z21.s, z9.b, z1.b[2]\n"
+ "udot z25.s, z9.b, z2.b[2]\n"
+ "udot z29.s, z9.b, z3.b[2]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "udot z22.s, z10.b, z1.b[2]\n"
+ "udot z26.s, z10.b, z2.b[2]\n"
+ "udot z30.s, z10.b, z3.b[2]\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "udot z23.s, z11.b, z1.b[2]\n"
+ "udot z27.s, z11.b, z2.b[2]\n"
+ "udot z31.s, z11.b, z3.b[2]\n"
+ "b 9f\n"
+ "7:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 16f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 17f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 18f\n"
+ "17:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 18f\n"
+ "16:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "18:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "udot z20.s, z12.b, z1.b[1]\n"
+ "udot z24.s, z12.b, z2.b[1]\n"
+ "udot z28.s, z12.b, z3.b[1]\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "udot z21.s, z13.b, z1.b[1]\n"
+ "udot z25.s, z13.b, z2.b[1]\n"
+ "udot z29.s, z13.b, z3.b[1]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "udot z22.s, z14.b, z1.b[1]\n"
+ "udot z26.s, z14.b, z2.b[1]\n"
+ "udot z30.s, z14.b, z3.b[1]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "udot z23.s, z15.b, z1.b[1]\n"
+ "udot z27.s, z15.b, z2.b[1]\n"
+ "udot z31.s, z15.b, z3.b[1]\n"
+ "b 9f\n"
+ "6:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 19f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 20f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 21f\n"
+ "20:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 21f\n"
+ "19:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "21:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "udot z20.s, z8.b, z1.b[0]\n"
+ "udot z24.s, z8.b, z2.b[0]\n"
+ "udot z28.s, z8.b, z3.b[0]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "udot z21.s, z9.b, z1.b[0]\n"
+ "udot z25.s, z9.b, z2.b[0]\n"
+ "udot z29.s, z9.b, z3.b[0]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "udot z22.s, z10.b, z1.b[0]\n"
+ "udot z26.s, z10.b, z2.b[0]\n"
+ "udot z30.s, z10.b, z3.b[0]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "udot z23.s, z11.b, z1.b[0]\n"
+ "udot z27.s, z11.b, z2.b[0]\n"
+ "udot z31.s, z11.b, z3.b[0]\n"
+ "b 9f\n"
+ "5:\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
+ "udot z24.s, z8.b, z2.b[0]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr1]\n"
+ "udot z28.s, z8.b, z3.b[0]\n"
+ "ld1rqb z6.b, p6/z, [a_ptr2]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z7.b, p6/z, [a_ptr3]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "udot z21.s, z9.b, z1.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z25.s, z9.b, z2.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z29.s, z9.b, z3.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z22.s, z10.b, z1.b[0]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "udot z26.s, z10.b, z2.b[0]\n"
+ "udot z30.s, z10.b, z3.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z23.s, z11.b, z1.b[0]\n"
+ "udot z27.s, z11.b, z2.b[0]\n"
+ "udot z31.s, z11.b, z3.b[0]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z12.b, z1.b[1]\n"
+ "udot z24.s, z12.b, z2.b[1]\n"
+ "udot z28.s, z12.b, z3.b[1]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "udot z21.s, z13.b, z1.b[1]\n"
+ "udot z25.s, z13.b, z2.b[1]\n"
+ "udot z29.s, z13.b, z3.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "udot z22.s, z14.b, z1.b[1]\n"
+ "udot z26.s, z14.b, z2.b[1]\n"
+ "udot z30.s, z14.b, z3.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "udot z23.s, z15.b, z1.b[1]\n"
+ "udot z27.s, z15.b, z2.b[1]\n"
+ "udot z31.s, z15.b, z3.b[1]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "udot z20.s, z8.b, z1.b[2]\n"
+ "udot z24.s, z8.b, z2.b[2]\n"
+ "udot z28.s, z8.b, z3.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "udot z21.s, z9.b, z1.b[2]\n"
+ "udot z25.s, z9.b, z2.b[2]\n"
+ "udot z29.s, z9.b, z3.b[2]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "udot z22.s, z10.b, z1.b[2]\n"
+ "udot z26.s, z10.b, z2.b[2]\n"
+ "udot z30.s, z10.b, z3.b[2]\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "udot z23.s, z11.b, z1.b[2]\n"
+ "udot z27.s, z11.b, z2.b[2]\n"
+ "udot z31.s, z11.b, z3.b[2]\n"
+ "udot z16.s, z12.b, z0.b[3]\n"
+ "udot z20.s, z12.b, z1.b[3]\n"
+ "udot z24.s, z12.b, z2.b[3]\n"
+ "udot z28.s, z12.b, z3.b[3]\n"
+ "udot z17.s, z13.b, z0.b[3]\n"
+ "udot z21.s, z13.b, z1.b[3]\n"
+ "udot z25.s, z13.b, z2.b[3]\n"
+ "udot z29.s, z13.b, z3.b[3]\n"
+ "udot z18.s, z14.b, z0.b[3]\n"
+ "udot z22.s, z14.b, z1.b[3]\n"
+ "udot z26.s, z14.b, z2.b[3]\n"
+ "udot z30.s, z14.b, z3.b[3]\n"
+ "udot z19.s, z15.b, z0.b[3]\n"
+ "udot z23.s, z15.b, z1.b[3]\n"
+ "udot z27.s, z15.b, z2.b[3]\n"
+ "udot z31.s, z15.b, z3.b[3]\n"
+ "cbz %[blocks], 22f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z4.b[0]\n"
+ "udot z20.s, z8.b, z5.b[0]\n"
+ "udot z24.s, z8.b, z6.b[0]\n"
+ "udot z28.s, z8.b, z7.b[0]\n"
+ "udot z17.s, z9.b, z4.b[0]\n"
+ "udot z21.s, z9.b, z5.b[0]\n"
+ "udot z25.s, z9.b, z6.b[0]\n"
+ "udot z29.s, z9.b, z7.b[0]\n"
+ "udot z18.s, z10.b, z4.b[0]\n"
+ "udot z22.s, z10.b, z5.b[0]\n"
+ "udot z26.s, z10.b, z6.b[0]\n"
+ "udot z30.s, z10.b, z7.b[0]\n"
+ "udot z19.s, z11.b, z4.b[0]\n"
+ "udot z23.s, z11.b, z5.b[0]\n"
+ "udot z27.s, z11.b, z6.b[0]\n"
+ "udot z31.s, z11.b, z7.b[0]\n"
+ "b.eq 23f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z16.s, z12.b, z4.b[1]\n"
+ "udot z20.s, z12.b, z5.b[1]\n"
+ "udot z24.s, z12.b, z6.b[1]\n"
+ "udot z28.s, z12.b, z7.b[1]\n"
+ "udot z17.s, z13.b, z4.b[1]\n"
+ "udot z21.s, z13.b, z5.b[1]\n"
+ "udot z25.s, z13.b, z6.b[1]\n"
+ "udot z29.s, z13.b, z7.b[1]\n"
+ "udot z18.s, z14.b, z4.b[1]\n"
+ "udot z22.s, z14.b, z5.b[1]\n"
+ "udot z26.s, z14.b, z6.b[1]\n"
+ "udot z30.s, z14.b, z7.b[1]\n"
+ "udot z19.s, z15.b, z4.b[1]\n"
+ "udot z23.s, z15.b, z5.b[1]\n"
+ "udot z27.s, z15.b, z6.b[1]\n"
+ "udot z31.s, z15.b, z7.b[1]\n"
+ "b.eq 24f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z4.b[2]\n"
+ "udot z20.s, z8.b, z5.b[2]\n"
+ "udot z24.s, z8.b, z6.b[2]\n"
+ "udot z28.s, z8.b, z7.b[2]\n"
+ "udot z17.s, z9.b, z4.b[2]\n"
+ "udot z21.s, z9.b, z5.b[2]\n"
+ "udot z25.s, z9.b, z6.b[2]\n"
+ "udot z29.s, z9.b, z7.b[2]\n"
+ "udot z18.s, z10.b, z4.b[2]\n"
+ "udot z22.s, z10.b, z5.b[2]\n"
+ "udot z26.s, z10.b, z6.b[2]\n"
+ "udot z30.s, z10.b, z7.b[2]\n"
+ "udot z19.s, z11.b, z4.b[2]\n"
+ "udot z23.s, z11.b, z5.b[2]\n"
+ "udot z27.s, z11.b, z6.b[2]\n"
+ "udot z31.s, z11.b, z7.b[2]\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 25f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 26f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 27f\n"
+ "26:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 27f\n"
+ "25:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "27:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z16.s, z12.b, z4.b[3]\n"
+ "udot z20.s, z12.b, z5.b[3]\n"
+ "udot z24.s, z12.b, z6.b[3]\n"
+ "udot z28.s, z12.b, z7.b[3]\n"
+ "udot z17.s, z13.b, z4.b[3]\n"
+ "udot z21.s, z13.b, z5.b[3]\n"
+ "udot z25.s, z13.b, z6.b[3]\n"
+ "udot z29.s, z13.b, z7.b[3]\n"
+ "udot z18.s, z14.b, z4.b[3]\n"
+ "udot z22.s, z14.b, z5.b[3]\n"
+ "udot z26.s, z14.b, z6.b[3]\n"
+ "udot z30.s, z14.b, z7.b[3]\n"
+ "udot z19.s, z15.b, z4.b[3]\n"
+ "udot z23.s, z15.b, z5.b[3]\n"
+ "udot z27.s, z15.b, z6.b[3]\n"
+ "udot z31.s, z15.b, z7.b[3]\n"
+ "b 9f\n"
+ "24:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 28f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 29f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 30f\n"
+ "29:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 30f\n"
+ "28:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "30:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z4.b[2]\n"
+ "udot z20.s, z8.b, z5.b[2]\n"
+ "udot z24.s, z8.b, z6.b[2]\n"
+ "udot z28.s, z8.b, z7.b[2]\n"
+ "udot z17.s, z9.b, z4.b[2]\n"
+ "udot z21.s, z9.b, z5.b[2]\n"
+ "udot z25.s, z9.b, z6.b[2]\n"
+ "udot z29.s, z9.b, z7.b[2]\n"
+ "udot z18.s, z10.b, z4.b[2]\n"
+ "udot z22.s, z10.b, z5.b[2]\n"
+ "udot z26.s, z10.b, z6.b[2]\n"
+ "udot z30.s, z10.b, z7.b[2]\n"
+ "udot z19.s, z11.b, z4.b[2]\n"
+ "udot z23.s, z11.b, z5.b[2]\n"
+ "udot z27.s, z11.b, z6.b[2]\n"
+ "udot z31.s, z11.b, z7.b[2]\n"
+ "b 9f\n"
+ "23:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 31f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 32f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 33f\n"
+ "32:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "b 33f\n"
+ "31:\n"
+ "mov z13.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z14.b, #0\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "33:\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "mov z12.b, #0\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "zip1 z14.b, z15.b, z8.b\n"
+ "zip2 z15.b, z15.b, z8.b\n"
+ "udot z16.s, z12.b, z4.b[1]\n"
+ "udot z20.s, z12.b, z5.b[1]\n"
+ "udot z24.s, z12.b, z6.b[1]\n"
+ "udot z28.s, z12.b, z7.b[1]\n"
+ "udot z17.s, z13.b, z4.b[1]\n"
+ "udot z21.s, z13.b, z5.b[1]\n"
+ "udot z25.s, z13.b, z6.b[1]\n"
+ "udot z29.s, z13.b, z7.b[1]\n"
+ "udot z18.s, z14.b, z4.b[1]\n"
+ "udot z22.s, z14.b, z5.b[1]\n"
+ "udot z26.s, z14.b, z6.b[1]\n"
+ "udot z30.s, z14.b, z7.b[1]\n"
+ "udot z19.s, z15.b, z4.b[1]\n"
+ "udot z23.s, z15.b, z5.b[1]\n"
+ "udot z27.s, z15.b, z6.b[1]\n"
+ "udot z31.s, z15.b, z7.b[1]\n"
+ "b 9f\n"
+ "22:\n"
+ "cbz %[odds], 9f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 34f\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 35f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 36f\n"
+ "35:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "b 36f\n"
+ "34:\n"
+ "mov z9.b, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z10.b, #0\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "36:\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "mov z8.b, #0\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "udot z16.s, z8.b, z4.b[0]\n"
+ "udot z20.s, z8.b, z5.b[0]\n"
+ "udot z24.s, z8.b, z6.b[0]\n"
+ "udot z28.s, z8.b, z7.b[0]\n"
+ "udot z17.s, z9.b, z4.b[0]\n"
+ "udot z21.s, z9.b, z5.b[0]\n"
+ "udot z25.s, z9.b, z6.b[0]\n"
+ "udot z29.s, z9.b, z7.b[0]\n"
+ "udot z18.s, z10.b, z4.b[0]\n"
+ "udot z22.s, z10.b, z5.b[0]\n"
+ "udot z26.s, z10.b, z6.b[0]\n"
+ "udot z30.s, z10.b, z7.b[0]\n"
+ "udot z19.s, z11.b, z4.b[0]\n"
+ "udot z23.s, z11.b, z5.b[0]\n"
+ "udot z27.s, z11.b, z6.b[0]\n"
+ "udot z31.s, z11.b, z7.b[0]\n"
+ "9:\n"
+ "st1w z16.s, p0, [%[c_ptr0]]\n"
+ "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+ "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+ "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #4\n"
+ "st1w z20.s, p0, [c_ptr1]\n"
+ "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
+ "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
+ "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
+ "st1w z24.s, p0, [c_ptr2]\n"
+ "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n"
+ "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n"
+ "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n"
+ "st1w z28.s, p0, [c_ptr3]\n"
+ "st1w z29.s, p1, [c_ptr3, #1, MUL VL]\n"
+ "st1w z30.s, p2, [c_ptr3, #2, MUL VL]\n"
+ "st1w z31.s, p3, [c_ptr3, #3, MUL VL]\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [b_ptr1] "+r" (b_ptr1), [b_ptr2] "+r" (b_ptr2), [b_ptr3] "+r" (b_ptr3), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
+ );
+ break;
+ }
+ }
+ }
+}
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_fp32_mla_1VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_fp32_mla_1VLx4.hpp
new file mode 100644
index 0000000000..80b216ca14
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_fp32_mla_1VLx4.hpp
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __ARM_FEATURE_SVE
+
+
+
+namespace arm_gemm
+{
+
+// Actual kernel implementation, defined in sve_smallK_fp32_mla_1VLx4/generic.cpp; arguments are (A, lda, B, ldb, C, ldc, beta, M, N, K).
+void sve_smallK_fp32_mla_1VLx4(const float *, int, const float *, int ldb, float *, int, float, int, int, int);
+
+class smallK_fp32_mla_1VLx4 // Descriptor for the SVE smallK fp32 kernel: element types, kernel pointer type, blocking parameters and the kernel pointer itself.
+{
+public:
+ typedef float operand_type;
+ typedef float result_type;
+ // Function-pointer type with the same parameter list as sve_smallK_fp32_mla_1VLx4() declared above.
+ typedef void (*kern_type)(const float *, int, const float *, int ldb, float *, int, float, int, int, int);
+
+ /* Kernel blocking parameters (static, so they can be queried without an instance) */
+ static int out_height()
+ {
+ return 4; // constant; NOTE(review): presumably tied to generic.cpp splitting M into groups of 4 rows - confirm
+ }
+ // One vector of floats wide; get_vector_length<float>() is declared elsewhere and generic.cpp steps over N by the same expression.
+ static int out_width()
+ {
+ return get_vector_length<float>() * 1;
+ }
+ // NOTE(review): presumably the unroll factor along K expected by the caller - confirm against the code that reads it.
+ static int k_unroll()
+ {
+ return 1;
+ }
+
+
+
+ // Default to the generic kernel
+ kern_type kernel=sve_smallK_fp32_mla_1VLx4;
+ // ci is accepted but never read: the body is empty, so kernel keeps its default value.
+ smallK_fp32_mla_1VLx4(const CPUInfo *ci)
+ {
+
+ }
+};
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_fp32_mla_1VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_fp32_mla_1VLx4/generic.cpp
new file mode 100644
index 0000000000..e2cc1d14e2
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_fp32_mla_1VLx4/generic.cpp
@@ -0,0 +1,4264 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+
+#include <algorithm>
+
+
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void sve_smallK_fp32_mla_1VLx4(const float *A, int lda, const float *B, int ldb, float *C, int ldc, float beta, int M, int N, int K) {
+ const long beta0 = (beta == 0.0f);
+
+ const long loops_count = M / 4;
+ const long oddrow_count = M % 4;
+ const long ldab = lda * sizeof(float);
+ const long ldcb = ldc * sizeof(float);
+ const long odd_depth = K % 4;
+ const float *betaptr = &beta;
+ long ldbb = ldb * sizeof(float);
+
+ for (int x0=0; x0<N; x0+=(get_vector_length<float>() * 1)) {
+ const long width = std::min((unsigned long)N-x0, (get_vector_length<float>() * 1));
+ long loops = loops_count;
+ long oddrows = oddrow_count;
+ long temp = 0;
+ const float *b_ptr0 = B + x0;
+
+ const float *a_ptr0 = A;
+
+ float *c_ptr0 = C + x0;
+
+ switch(K) {
+ case 1:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 2:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 3:
+ // Three-vector variant: three vectors of B (consecutive loads separated by %[ldb]) are
+ // loaded once into z4-z6 and stay in registers. Each pass of the main loop accumulates
+ // one vector of C for four rows (rows separated by %[lda] in A and %[ldc] in C), then a
+ // second loop handles rows one at a time.
+ // NOTE(review): the switch selector is outside this view - presumably the depth (K) of
+ // the multiply; confirm against the enclosing function.
+ __asm __volatile (
+ // x0-x5 serve as scratch pointers for rows 1-3 of A and C; they are listed as clobbers.
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ // p6: lanes where (temp + lane) < odd_depth, used for the A quad loads.
+ // p0: lanes where (temp + lane) < width, used for B loads and C loads/stores.
+ // p7: all lanes; set here but not referenced again in this variant.
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z6.s, p0/z, [%[b_ptr0]]\n"
+ // Skip the four-row loop entirely when %[loops] is zero.
+ "cbz %[loops], 1f\n"
+ // 2: four-row loop. Accumulators z28-z31 start at zero when %[beta0] is non-zero,
+ // otherwise they are loaded from the existing C rows (label 3).
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ // 4: load one quad of A per row (z0-z3), step each A pointer on by four rows
+ // (lda LSL #2), and accumulate B vector k scaled by A element k of that row.
+ "4:\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ // Sets the flags consumed by "b.ne 2b" below; nothing in between alters them.
+ "subs %[loops], %[loops], #0x1\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ // Write the four result vectors back and step each C pointer on by four rows.
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ // 1: single-row loop, run %[oddrows] times using only a_ptr0 / c_ptr0 and z28.
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ // 5: done - release the register aliases so later asm blocks can redefine them.
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ // p0, p6 and p7 are overwritten by whilelt/ptrue above, so they must be declared as
+ // clobbers just like the x0-x5 scratch pointers and the vector registers.
+ : "x0", "x1", "x2", "x3", "x4", "x5", "p0", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 4:
+ // Four-vector variant: four vectors of B (consecutive loads separated by %[ldb]) are
+ // loaded once into z4-z7 and stay in registers. Each pass of the main loop accumulates
+ // one vector of C for four rows (rows separated by %[lda] in A and %[ldc] in C), then a
+ // second loop handles rows one at a time.
+ // NOTE(review): the switch selector is outside this view - presumably the depth (K) of
+ // the multiply; confirm against the enclosing function.
+ __asm __volatile (
+ // x0-x5 serve as scratch pointers for rows 1-3 of A and C; they are listed as clobbers.
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ // p6: lanes where (temp + lane) < odd_depth; set here but not referenced again in
+ // this variant, because every A quad is loaded under the all-true p7.
+ // p0: lanes where (temp + lane) < width, used for B loads and C loads/stores.
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z6.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z7.s, p0/z, [%[b_ptr0]]\n"
+ // Skip the four-row loop entirely when %[loops] is zero.
+ "cbz %[loops], 1f\n"
+ // 2: four-row loop. Accumulators z28-z31 start at zero when %[beta0] is non-zero,
+ // otherwise they are loaded from the existing C rows (label 3).
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ // 4: load one full quad of A per row (z0-z3), step each A pointer on by four rows
+ // (lda LSL #2), and accumulate B vector k scaled by A element k of that row.
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ // Sets the flags consumed by "b.ne 2b" below; nothing in between alters them.
+ "subs %[loops], %[loops], #0x1\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ // Write the four result vectors back and step each C pointer on by four rows.
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ // 1: single-row loop, run %[oddrows] times using only a_ptr0 / c_ptr0 and z28.
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ // 5: done - release the register aliases so later asm blocks can redefine them.
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ // p0, p6 and p7 are overwritten by whilelt/ptrue above, so they must be declared as
+ // clobbers just like the x0-x5 scratch pointers and the vector registers.
+ : "x0", "x1", "x2", "x3", "x4", "x5", "p0", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 5:
+ // Five-vector variant: five vectors of B (consecutive loads separated by %[ldb]) are
+ // loaded once into z4-z8 and stay in registers. Each pass of the main loop accumulates
+ // one vector of C for four rows (rows separated by %[lda] in A and %[ldc] in C), then a
+ // second loop handles rows one at a time.
+ // NOTE(review): the switch selector is outside this view - presumably the depth (K) of
+ // the multiply; confirm against the enclosing function.
+ __asm __volatile (
+ // x0-x5 serve as scratch pointers for rows 1-3 of A and C; they are listed as clobbers.
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ // p6: lanes where (temp + lane) < odd_depth, used for the second A quad (offset 0x10).
+ // p0: lanes where (temp + lane) < width, used for B loads and C loads/stores.
+ // p7: all lanes, used for the first (full) A quad.
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z6.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z7.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ // Skip the four-row loop entirely when %[loops] is zero.
+ "cbz %[loops], 1f\n"
+ // 2: four-row loop. Accumulators z28-z31 start at zero when %[beta0] is non-zero,
+ // otherwise they are loaded from the existing C rows (label 3).
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ // 4: first A quad of each row goes into z0-z3 and feeds B vectors z4-z7.
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ // Sets the flags consumed by "b.ne 2b" below; nothing in between alters them.
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ // Each z0-z3 is reloaded with the row's second quad (offset 0x10, predicate p6) as soon
+ // as its last use of the first quad has issued; the row pointer advances by four rows
+ // (lda LSL #2) only after that second load.
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ // z28 received its last update above, so its store is interleaved with the remaining
+ // fmla for z31.
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ // 1: single-row loop, run %[oddrows] times using only a_ptr0 / c_ptr0 and z28;
+ // the two A quads of the row are held in z0 and z1.
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p6/z, [%[a_ptr0], #0x10]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ // 5: done - release the register aliases so later asm blocks can redefine them.
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ // p0, p6 and p7 are overwritten by whilelt/ptrue above, so they must be declared as
+ // clobbers just like the x0-x5 scratch pointers and the vector registers.
+ : "x0", "x1", "x2", "x3", "x4", "x5", "p0", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 6:
+ // Six-vector variant: six vectors of B (consecutive loads separated by %[ldb]) are
+ // loaded once into z4-z9 and stay in registers. Each pass of the main loop accumulates
+ // one vector of C for four rows (rows separated by %[lda] in A and %[ldc] in C), then a
+ // second loop handles rows one at a time.
+ // NOTE(review): the switch selector is outside this view - presumably the depth (K) of
+ // the multiply; confirm against the enclosing function.
+ __asm __volatile (
+ // x0-x5 serve as scratch pointers for rows 1-3 of A and C; they are listed as clobbers.
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ // p6: lanes where (temp + lane) < odd_depth, used for the second A quad (offset 0x10).
+ // p0: lanes where (temp + lane) < width, used for B loads and C loads/stores.
+ // p7: all lanes, used for the first (full) A quad.
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z6.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z7.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z9.s, p0/z, [%[b_ptr0]]\n"
+ // Skip the four-row loop entirely when %[loops] is zero.
+ "cbz %[loops], 1f\n"
+ // 2: four-row loop. Accumulators z28-z31 start at zero when %[beta0] is non-zero,
+ // otherwise they are loaded from the existing C rows (label 3).
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ // 4: first A quad of each row goes into z0-z3 and feeds B vectors z4-z7.
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ // Sets the flags consumed by "b.ne 2b" below; nothing in between alters them.
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ // Each z0-z3 is reloaded with the row's second quad (offset 0x10, predicate p6) as soon
+ // as its last use of the first quad has issued; the row pointer advances by four rows
+ // (lda LSL #2) only after that second load.
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ // Write the four result vectors back and step each C pointer on by four rows.
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ // 1: single-row loop, run %[oddrows] times using only a_ptr0 / c_ptr0 and z28;
+ // the two A quads of the row are held in z0 and z1.
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p6/z, [%[a_ptr0], #0x10]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ // 5: done - release the register aliases so later asm blocks can redefine them.
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ // p0, p6 and p7 are overwritten by whilelt/ptrue above, so they must be declared as
+ // clobbers just like the x0-x5 scratch pointers and the vector registers.
+ : "x0", "x1", "x2", "x3", "x4", "x5", "p0", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 7:
+ // Seven-vector variant: seven vectors of B (consecutive loads separated by %[ldb]) are
+ // loaded once into z4-z10 and stay in registers. Each pass of the main loop accumulates
+ // one vector of C for four rows (rows separated by %[lda] in A and %[ldc] in C), then a
+ // second loop handles rows one at a time.
+ // NOTE(review): the switch selector is outside this view - presumably the depth (K) of
+ // the multiply; confirm against the enclosing function.
+ __asm __volatile (
+ // x0-x5 serve as scratch pointers for rows 1-3 of A and C; they are listed as clobbers.
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ // p6: lanes where (temp + lane) < odd_depth, used for the second A quad (offset 0x10).
+ // p0: lanes where (temp + lane) < width, used for B loads and C loads/stores.
+ // p7: all lanes, used for the first (full) A quad.
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z6.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z7.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z9.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z10.s, p0/z, [%[b_ptr0]]\n"
+ // Skip the four-row loop entirely when %[loops] is zero.
+ "cbz %[loops], 1f\n"
+ // 2: four-row loop. Accumulators z28-z31 start at zero when %[beta0] is non-zero,
+ // otherwise they are loaded from the existing C rows (label 3).
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ // 4: first A quad of each row goes into z0-z3 and feeds B vectors z4-z7.
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ // Sets the flags consumed by "b.ne 2b" below; nothing in between alters them.
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ // Each z0-z3 is reloaded with the row's second quad (offset 0x10, predicate p6) as soon
+ // as its last use of the first quad has issued; the row pointer advances by four rows
+ // (lda LSL #2) only after that second load.
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ // Write the four result vectors back and step each C pointer on by four rows.
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ // 1: single-row loop, run %[oddrows] times using only a_ptr0 / c_ptr0 and z28;
+ // the two A quads of the row are held in z0 and z1.
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p6/z, [%[a_ptr0], #0x10]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ // 5: done - release the register aliases so later asm blocks can redefine them.
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ // p0, p6 and p7 are overwritten by whilelt/ptrue above, so they must be declared as
+ // clobbers just like the x0-x5 scratch pointers and the vector registers.
+ : "x0", "x1", "x2", "x3", "x4", "x5", "p0", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 8:
+ // Eight-vector variant: eight vectors of B (consecutive loads separated by %[ldb]) are
+ // loaded once into z4-z11 and stay in registers. Each pass of the main loop accumulates
+ // one vector of C for four rows (rows separated by %[lda] in A and %[ldc] in C), then a
+ // second loop handles rows one at a time.
+ // NOTE(review): the switch selector is outside this view - presumably the depth (K) of
+ // the multiply; confirm against the enclosing function.
+ __asm __volatile (
+ // x0-x5 serve as scratch pointers for rows 1-3 of A and C; they are listed as clobbers.
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ // p6: lanes where (temp + lane) < odd_depth; set here but not referenced again in
+ // this variant, because both A quads are loaded under the all-true p7.
+ // p0: lanes where (temp + lane) < width, used for B loads and C loads/stores.
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z6.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z7.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z9.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z10.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z11.s, p0/z, [%[b_ptr0]]\n"
+ // Skip the four-row loop entirely when %[loops] is zero.
+ "cbz %[loops], 1f\n"
+ // 2: four-row loop. Accumulators z28-z31 start at zero when %[beta0] is non-zero,
+ // otherwise they are loaded from the existing C rows (label 3).
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ // 4: first A quad of each row goes into z0-z3 and feeds B vectors z4-z7.
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ // Sets the flags consumed by "b.ne 2b" below; nothing in between alters them.
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ // Each z0-z3 is reloaded with the row's second quad (offset 0x10, full predicate p7)
+ // as soon as its last use of the first quad has issued; the row pointer advances by
+ // four rows (lda LSL #2) only after that second load.
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ // Write the four result vectors back and step each C pointer on by four rows.
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ // 1: single-row loop, run %[oddrows] times using only a_ptr0 / c_ptr0 and z28;
+ // the two A quads of the row are held in z0 and z1.
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ // 5: done - release the register aliases so later asm blocks can redefine them.
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ // p0, p6 and p7 are overwritten by whilelt/ptrue above, so they must be declared as
+ // clobbers just like the x0-x5 scratch pointers and the vector registers.
+ : "x0", "x1", "x2", "x3", "x4", "x5", "p0", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 9:
+ // Nine-vector variant: nine vectors of B (consecutive loads separated by %[ldb]) are
+ // loaded once into z4-z12 and stay in registers. Each pass of the main loop accumulates
+ // one vector of C for four rows (rows separated by %[lda] in A and %[ldc] in C), then a
+ // second loop handles rows one at a time.
+ // NOTE(review): the switch selector is outside this view - presumably the depth (K) of
+ // the multiply; confirm against the enclosing function.
+ __asm __volatile (
+ // x0-x5 serve as scratch pointers for rows 1-3 of A and C; they are listed as clobbers.
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ // p6: lanes where (temp + lane) < odd_depth, used for the third A quad (offset 0x20).
+ // p0: lanes where (temp + lane) < width, used for B loads and C loads/stores.
+ // p7: all lanes, used for the first two (full) A quads.
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z6.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z7.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z9.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z10.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z11.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ // Skip the four-row loop entirely when %[loops] is zero.
+ "cbz %[loops], 1f\n"
+ // 2: four-row loop. Accumulators z28-z31 start at zero when %[beta0] is non-zero,
+ // otherwise they are loaded from the existing C rows (label 3).
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ // 4: first A quad of each row goes into z0-z3 and feeds B vectors z4-z7.
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ // Sets the flags consumed by "b.ne 2b" below; nothing in between alters them.
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ // Second quad (offset 0x10, full predicate p7) replaces the first in z0-z3 as soon as
+ // each register's last use of the first quad has issued; it feeds B vectors z8-z11.
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ // Third quad (offset 0x20, predicate p6) feeds B vector z12; each row pointer advances
+ // by four rows (lda LSL #2) only after its third load.
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ // z28 received its last update above, so its store is interleaved with the remaining
+ // fmla for z31.
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ // 1: single-row loop, run %[oddrows] times using only a_ptr0 / c_ptr0 and z28;
+ // the three A quads of the row are held in z0, z1 and z2.
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p6/z, [%[a_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ // 5: done - release the register aliases so later asm blocks can redefine them.
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ // p0, p6 and p7 are overwritten by whilelt/ptrue above, so they must be declared as
+ // clobbers just like the x0-x5 scratch pointers and the vector registers.
+ : "x0", "x1", "x2", "x3", "x4", "x5", "p0", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 10:
+ // Ten-vector variant: ten vectors of B (consecutive loads separated by %[ldb]) are
+ // loaded once into z4-z13 and stay in registers. Each pass of the main loop accumulates
+ // one vector of C for four rows (rows separated by %[lda] in A and %[ldc] in C), then a
+ // second loop handles rows one at a time.
+ // NOTE(review): the switch selector is outside this view - presumably the depth (K) of
+ // the multiply; confirm against the enclosing function.
+ __asm __volatile (
+ // x0-x5 serve as scratch pointers for rows 1-3 of A and C; they are listed as clobbers.
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ // p6: lanes where (temp + lane) < odd_depth, used for the third A quad (offset 0x20).
+ // p0: lanes where (temp + lane) < width, used for B loads and C loads/stores.
+ // p7: all lanes, used for the first two (full) A quads.
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z6.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z7.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z9.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z10.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z11.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z13.s, p0/z, [%[b_ptr0]]\n"
+ // Skip the four-row loop entirely when %[loops] is zero.
+ "cbz %[loops], 1f\n"
+ // 2: four-row loop. Accumulators z28-z31 start at zero when %[beta0] is non-zero,
+ // otherwise they are loaded from the existing C rows (label 3).
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ // 4: first A quad of each row goes into z0-z3 and feeds B vectors z4-z7.
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ // Sets the flags consumed by "b.ne 2b" below; nothing in between alters them.
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ // Second quad (offset 0x10, full predicate p7) replaces the first in z0-z3 as soon as
+ // each register's last use of the first quad has issued; it feeds B vectors z8-z11.
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ // Third quad (offset 0x20, predicate p6) feeds B vectors z12-z13; each row pointer
+ // advances by four rows (lda LSL #2) only after its third load.
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z1.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[1]\n"
+ "fmla z31.s, z13.s, z3.s[1]\n"
+ // Write the four result vectors back and step each C pointer on by four rows.
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ // 1: single-row loop, run %[oddrows] times using only a_ptr0 / c_ptr0 and z28;
+ // the three A quads of the row are held in z0, z1 and z2.
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p6/z, [%[a_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "fmla z28.s, z13.s, z2.s[1]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ // 5: done - release the register aliases so later asm blocks can redefine them.
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ // p0, p6 and p7 are overwritten by whilelt/ptrue above, so they must be declared as
+ // clobbers just like the x0-x5 scratch pointers and the vector registers.
+ : "x0", "x1", "x2", "x3", "x4", "x5", "p0", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 11:
+ // Variant with 11 B rows: the 11 vectors loaded from b_ptr0 (stepping by ldb) stay in
+ // z4-z14 for the whole block, while A/C rows are processed four at a time (label 2)
+ // and then one at a time for the `oddrows` remainder (label 6).
+ // NOTE(review): presumably the enclosing switch selects on the depth (K) - confirm.
+ __asm __volatile (
+ // x0-x5 hold the pointers for rows 1-3 of A and C; row 0 uses the a_ptr0/c_ptr0 operands.
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ // p6: lanes counted from temp up to odd_depth (guards the last, partial A quadword).
+ // p0: lanes counted from temp up to width (guards the B/C columns). p7: all lanes.
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z6.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z7.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z9.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z10.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z11.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z13.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z14.s, p0/z, [%[b_ptr0]]\n"
+ // Skip the four-row loop when loops is zero.
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ // beta0 != 0: accumulators start from zero; beta0 == 0: they start from the current C rows.
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ // The flags set here are consumed by the b.ne at the bottom of the loop.
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ // Last A quadword: only the lanes enabled by p6 are loaded, the others are zeroed (/z).
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x20]\n"
+ // Each A row pointer moves on by lda << 2, i.e. four rows.
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z1.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[1]\n"
+ "fmla z31.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z14.s, z0.s[2]\n"
+ "fmla z29.s, z14.s, z1.s[2]\n"
+ "fmla z30.s, z14.s, z2.s[2]\n"
+ "fmla z31.s, z14.s, z3.s[2]\n"
+ // Write the four result rows back and step every C pointer by ldc << 2.
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ // Remainder: same computation, one A/C row per iteration, oddrows times.
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p6/z, [%[a_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "fmla z28.s, z13.s, z2.s[1]\n"
+ "fmla z28.s, z14.s, z2.s[2]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ // p0, p6 and p7 are written above (whilelt/ptrue), so they are listed as clobbers as well.
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "p0", "p6", "p7", "cc", "memory"
+ );
+ break;
+ case 12:
+ // Variant with 12 B rows: the 12 vectors loaded from b_ptr0 (stepping by ldb) stay in
+ // z4-z15 for the whole block, while A/C rows are processed four at a time (label 2)
+ // and then one at a time for the `oddrows` remainder (label 6).
+ // NOTE(review): presumably the enclosing switch selects on the depth (K) - confirm.
+ __asm __volatile (
+ // x0-x5 hold the pointers for rows 1-3 of A and C; row 0 uses the a_ptr0/c_ptr0 operands.
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ // p6 is computed here but never read in this variant: every A load below uses p7.
+ // p0: lanes counted from temp up to width (guards the B/C columns). p7: all lanes.
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z6.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z7.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z9.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z10.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z11.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z13.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z14.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z15.s, p0/z, [%[b_ptr0]]\n"
+ // Skip the four-row loop when loops is zero.
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ // beta0 != 0: accumulators start from zero; beta0 == 0: they start from the current C rows.
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ // The flags set here are consumed by the b.ne at the bottom of the loop.
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ // Third (last) A quadword; all four of its lanes are used, hence the all-true p7.
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+ // Each A row pointer moves on by lda << 2, i.e. four rows.
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z1.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[1]\n"
+ "fmla z31.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z14.s, z0.s[2]\n"
+ "fmla z29.s, z14.s, z1.s[2]\n"
+ "fmla z30.s, z14.s, z2.s[2]\n"
+ "fmla z31.s, z14.s, z3.s[2]\n"
+ "fmla z28.s, z15.s, z0.s[3]\n"
+ "fmla z29.s, z15.s, z1.s[3]\n"
+ "fmla z30.s, z15.s, z2.s[3]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ // Write the four result rows back and step every C pointer by ldc << 2.
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ // Remainder: same computation, one A/C row per iteration, oddrows times.
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "fmla z28.s, z13.s, z2.s[1]\n"
+ "fmla z28.s, z14.s, z2.s[2]\n"
+ "fmla z28.s, z15.s, z2.s[3]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ // p0, p6 and p7 are written above (whilelt/ptrue), so they are listed as clobbers as well.
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "p0", "p6", "p7", "cc", "memory"
+ );
+ break;
+ case 13:
+ // Variant with 13 B rows: the 13 vectors loaded from b_ptr0 (stepping by ldb) stay in
+ // z4-z16 for the whole block, while A/C rows are processed four at a time (label 2)
+ // and then one at a time for the `oddrows` remainder (label 6).
+ // NOTE(review): presumably the enclosing switch selects on the depth (K) - confirm.
+ __asm __volatile (
+ // x0-x5 hold the pointers for rows 1-3 of A and C; row 0 uses the a_ptr0/c_ptr0 operands.
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ // p6: lanes counted from temp up to odd_depth (guards the last, partial A quadword).
+ // p0: lanes counted from temp up to width (guards the B/C columns). p7: all lanes.
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z6.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z7.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z9.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z10.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z11.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z13.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z14.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z15.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z16.s, p0/z, [%[b_ptr0]]\n"
+ // Skip the four-row loop when loops is zero.
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ // beta0 != 0: accumulators start from zero; beta0 == 0: they start from the current C rows.
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ // The flags set here are consumed by the b.ne at the bottom of the loop.
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z1.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[1]\n"
+ "fmla z31.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z14.s, z0.s[2]\n"
+ "fmla z29.s, z14.s, z1.s[2]\n"
+ "fmla z30.s, z14.s, z2.s[2]\n"
+ "fmla z31.s, z14.s, z3.s[2]\n"
+ "fmla z28.s, z15.s, z0.s[3]\n"
+ // Last A quadword: only the lanes enabled by p6 are loaded, the others are zeroed (/z).
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x30]\n"
+ // Each A row pointer moves on by lda << 2, i.e. four rows.
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x30]\n"
+ "fmla z30.s, z15.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x30]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x30]\n"
+ "fmla z28.s, z16.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z16.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z16.s, z2.s[0]\n"
+ // z28 received its final accumulation above, so row 0 can be stored while z31 is still being updated.
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "fmla z31.s, z16.s, z3.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ // Remainder: same computation, one A/C row per iteration, oddrows times.
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "ld1rqw z3.s, p6/z, [%[a_ptr0], #0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "fmla z28.s, z13.s, z2.s[1]\n"
+ "fmla z28.s, z14.s, z2.s[2]\n"
+ "fmla z28.s, z15.s, z2.s[3]\n"
+ "fmla z28.s, z16.s, z3.s[0]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ // p0, p6 and p7 are written above (whilelt/ptrue), so they are listed as clobbers as well.
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "p0", "p6", "p7", "cc", "memory"
+ );
+ break;
+ case 14:
+ // Variant with 14 B rows: the 14 vectors loaded from b_ptr0 (stepping by ldb) stay in
+ // z4-z17 for the whole block, while A/C rows are processed four at a time (label 2)
+ // and then one at a time for the `oddrows` remainder (label 6).
+ // NOTE(review): presumably the enclosing switch selects on the depth (K) - confirm.
+ __asm __volatile (
+ // x0-x5 hold the pointers for rows 1-3 of A and C; row 0 uses the a_ptr0/c_ptr0 operands.
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ // p6: lanes counted from temp up to odd_depth (guards the last, partial A quadword).
+ // p0: lanes counted from temp up to width (guards the B/C columns). p7: all lanes.
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z6.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z7.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z9.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z10.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z11.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z13.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z14.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z15.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z16.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z17.s, p0/z, [%[b_ptr0]]\n"
+ // Skip the four-row loop when loops is zero.
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ // beta0 != 0: accumulators start from zero; beta0 == 0: they start from the current C rows.
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ // The flags set here are consumed by the b.ne at the bottom of the loop.
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z1.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[1]\n"
+ "fmla z31.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z14.s, z0.s[2]\n"
+ "fmla z29.s, z14.s, z1.s[2]\n"
+ "fmla z30.s, z14.s, z2.s[2]\n"
+ "fmla z31.s, z14.s, z3.s[2]\n"
+ "fmla z28.s, z15.s, z0.s[3]\n"
+ // Last A quadword: only the lanes enabled by p6 are loaded, the others are zeroed (/z).
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x30]\n"
+ // Each A row pointer moves on by lda << 2, i.e. four rows.
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x30]\n"
+ "fmla z30.s, z15.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x30]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x30]\n"
+ "fmla z28.s, z16.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z16.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z16.s, z2.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z31.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z0.s[1]\n"
+ "fmla z29.s, z17.s, z1.s[1]\n"
+ "fmla z30.s, z17.s, z2.s[1]\n"
+ "fmla z31.s, z17.s, z3.s[1]\n"
+ // Write the four result rows back and step every C pointer by ldc << 2.
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ // Remainder: same computation, one A/C row per iteration, oddrows times.
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "ld1rqw z3.s, p6/z, [%[a_ptr0], #0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "fmla z28.s, z13.s, z2.s[1]\n"
+ "fmla z28.s, z14.s, z2.s[2]\n"
+ "fmla z28.s, z15.s, z2.s[3]\n"
+ "fmla z28.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z3.s[1]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ // p0, p6 and p7 are written above (whilelt/ptrue), so they are listed as clobbers as well.
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "p0", "p6", "p7", "cc", "memory"
+ );
+ break;
+ case 15:
+ // Variant with 15 B rows: the 15 vectors loaded from b_ptr0 (stepping by ldb) stay in
+ // z4-z18 for the whole block, while A/C rows are processed four at a time (label 2)
+ // and then one at a time for the `oddrows` remainder (label 6).
+ // NOTE(review): presumably the enclosing switch selects on the depth (K) - confirm.
+ __asm __volatile (
+ // x0-x5 hold the pointers for rows 1-3 of A and C; row 0 uses the a_ptr0/c_ptr0 operands.
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ // p6: lanes counted from temp up to odd_depth (guards the last, partial A quadword).
+ // p0: lanes counted from temp up to width (guards the B/C columns). p7: all lanes.
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z6.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z7.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z9.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z10.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z11.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z13.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z14.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z15.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z16.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z17.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z18.s, p0/z, [%[b_ptr0]]\n"
+ // Skip the four-row loop when loops is zero.
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ // beta0 != 0: accumulators start from zero; beta0 == 0: they start from the current C rows.
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ // The flags set here are consumed by the b.ne at the bottom of the loop.
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z1.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[1]\n"
+ "fmla z31.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z14.s, z0.s[2]\n"
+ "fmla z29.s, z14.s, z1.s[2]\n"
+ "fmla z30.s, z14.s, z2.s[2]\n"
+ "fmla z31.s, z14.s, z3.s[2]\n"
+ "fmla z28.s, z15.s, z0.s[3]\n"
+ // Last A quadword: only the lanes enabled by p6 are loaded, the others are zeroed (/z).
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x30]\n"
+ // Each A row pointer moves on by lda << 2, i.e. four rows.
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x30]\n"
+ "fmla z30.s, z15.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x30]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x30]\n"
+ "fmla z28.s, z16.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z16.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z16.s, z2.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z31.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z0.s[1]\n"
+ "fmla z29.s, z17.s, z1.s[1]\n"
+ "fmla z30.s, z17.s, z2.s[1]\n"
+ "fmla z31.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z0.s[2]\n"
+ "fmla z29.s, z18.s, z1.s[2]\n"
+ "fmla z30.s, z18.s, z2.s[2]\n"
+ "fmla z31.s, z18.s, z3.s[2]\n"
+ // Write the four result rows back and step every C pointer by ldc << 2.
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ // Remainder: same computation, one A/C row per iteration, oddrows times.
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "ld1rqw z3.s, p6/z, [%[a_ptr0], #0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "fmla z28.s, z13.s, z2.s[1]\n"
+ "fmla z28.s, z14.s, z2.s[2]\n"
+ "fmla z28.s, z15.s, z2.s[3]\n"
+ "fmla z28.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z3.s[2]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ // p0, p6 and p7 are written above (whilelt/ptrue), so they are listed as clobbers as well.
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "p0", "p6", "p7", "cc", "memory"
+ );
+ break;
+ case 16:
+ // Variant that consumes 16 values from each A row and 16 B vectors.
+ // NOTE(review): the switch selector is outside this view; presumably it is
+ // the depth (K) handled by this call - confirm against the enclosing code.
+ //
+ // Layout of the asm body:
+ //   prologue : derive row pointers, build predicates, load B into z4-z19
+ //   label 2  : main loop, four A/C rows per iteration, counted by "loops"
+ //   label 6  : tail loop, one A/C row per iteration, counted by "oddrows"
+ // The betaptr operand is bound below but never referenced by the asm text.
+ __asm __volatile (
+ // Symbolic names for the scratch registers; x0-x5 are declared as clobbers.
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ // lda/ldc/ldb are added straight to addresses, i.e. they are byte strides.
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ // p6 is generated like in the sibling variants but is not used in this one.
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ // p0 governs every B load and every C load/store (column mask from width).
+ "whilelt p0.s, %[temp], %[width]\n"
+ // p7 is all-true and governs the A quadword loads.
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ // B is loaded once, one vector per ldb step, and kept in z4-z19 throughout.
+ "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z6.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z7.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z9.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z10.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z11.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z13.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z14.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z15.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z16.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z17.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z18.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z19.s, p0/z, [%[b_ptr0]]\n"
+ // No four-row groups to process: go straight to the single-row tail.
+ "cbz %[loops], 1f\n"
+ // ---- Main loop: four rows of A/C per iteration ----
+ "2:\n"
+ // beta0 != 0: accumulators start from zero; beta0 == 0: start from existing C.
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ // Sets the flags tested by "b.ne 2b"; nothing in between modifies them.
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ // Each ld1rqw replicates four A values; the indexed fmla picks one per B vector.
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ // The next A quad is fetched as soon as the last lane of the current one is used.
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z1.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[1]\n"
+ "fmla z31.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z14.s, z0.s[2]\n"
+ "fmla z29.s, z14.s, z1.s[2]\n"
+ "fmla z30.s, z14.s, z2.s[2]\n"
+ "fmla z31.s, z14.s, z3.s[2]\n"
+ "fmla z28.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
+ // Last read of this row is done, so each A pointer moves on by four rows (lda << 2).
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
+ "fmla z30.s, z15.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
+ "fmla z28.s, z16.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z16.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z16.s, z2.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z31.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z0.s[1]\n"
+ "fmla z29.s, z17.s, z1.s[1]\n"
+ "fmla z30.s, z17.s, z2.s[1]\n"
+ "fmla z31.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z0.s[2]\n"
+ "fmla z29.s, z18.s, z1.s[2]\n"
+ "fmla z30.s, z18.s, z2.s[2]\n"
+ "fmla z31.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z0.s[3]\n"
+ "fmla z29.s, z19.s, z1.s[3]\n"
+ "fmla z30.s, z19.s, z2.s[3]\n"
+ "fmla z31.s, z19.s, z3.s[3]\n"
+ // Write the four result rows back and step each C pointer by four rows.
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ // ---- Tail loop: one row of A/C per iteration ----
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ // With a single row, all four A quads fit in z0-z3 at once.
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "fmla z28.s, z13.s, z2.s[1]\n"
+ "fmla z28.s, z14.s, z2.s[2]\n"
+ "fmla z28.s, z15.s, z2.s[3]\n"
+ "fmla z28.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z3.s[3]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ // Drop the aliases so the next asm statement can define them again.
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ // p0, p6 and p7 are overwritten by whilelt/ptrue above, so they must be listed as clobbers.
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "p0", "p6", "p7", "cc", "memory"
+ );
+ break;
+ case 17:
+ // Variant that consumes 17 values from each A row and 17 B vectors:
+ // four full quads of A plus one partial quad of which only lane 0 is used.
+ // NOTE(review): the switch selector is outside this view; presumably it is
+ // the depth (K) handled by this call - confirm against the enclosing code.
+ //
+ // Layout of the asm body:
+ //   prologue : derive row pointers, build predicates, load B into z4-z20
+ //   label 2  : main loop, four A/C rows per iteration, counted by "loops"
+ //   label 6  : tail loop, one A/C row per iteration, counted by "oddrows"
+ // The betaptr operand is bound below but never referenced by the asm text.
+ __asm __volatile (
+ // Symbolic names for the scratch registers; x0-x5 are declared as clobbers.
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ // lda/ldc/ldb are added straight to addresses, i.e. they are byte strides.
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ // p6 governs the final, partial A quad at offset #0x40 (built from odd_depth).
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ // p0 governs every B load and every C load/store (column mask from width).
+ "whilelt p0.s, %[temp], %[width]\n"
+ // p7 is all-true and governs the full A quadword loads.
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ // B is loaded once, one vector per ldb step, and kept in z4-z20 throughout.
+ "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z6.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z7.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z9.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z10.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z11.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z13.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z14.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z15.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z16.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z17.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z18.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z19.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z20.s, p0/z, [%[b_ptr0]]\n"
+ // No four-row groups to process: go straight to the single-row tail.
+ "cbz %[loops], 1f\n"
+ // ---- Main loop: four rows of A/C per iteration ----
+ "2:\n"
+ // beta0 != 0: accumulators start from zero; beta0 == 0: start from existing C.
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ // Sets the flags tested by "b.ne 2b"; nothing in between modifies them.
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ // Each ld1rqw replicates four A values; the indexed fmla picks one per B vector.
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ // The next A quad is fetched as soon as the last lane of the current one is used.
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z1.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[1]\n"
+ "fmla z31.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z14.s, z0.s[2]\n"
+ "fmla z29.s, z14.s, z1.s[2]\n"
+ "fmla z30.s, z14.s, z2.s[2]\n"
+ "fmla z31.s, z14.s, z3.s[2]\n"
+ "fmla z28.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z29.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
+ "fmla z30.s, z15.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
+ "fmla z28.s, z16.s, z0.s[0]\n"
+ "fmla z29.s, z16.s, z1.s[0]\n"
+ "fmla z30.s, z16.s, z2.s[0]\n"
+ "fmla z31.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z0.s[1]\n"
+ "fmla z29.s, z17.s, z1.s[1]\n"
+ "fmla z30.s, z17.s, z2.s[1]\n"
+ "fmla z31.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z0.s[2]\n"
+ "fmla z29.s, z18.s, z1.s[2]\n"
+ "fmla z30.s, z18.s, z2.s[2]\n"
+ "fmla z31.s, z18.s, z3.s[2]\n"
+ // Partial final quad: loaded under p6, so lanes outside the predicate read as zero.
+ "fmla z28.s, z19.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n"
+ // Last read of this row is done, so each A pointer moves on by four rows (lda << 2).
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z19.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x40]\n"
+ "fmla z30.s, z19.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x40]\n"
+ "fmla z31.s, z19.s, z3.s[3]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x40]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z20.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z20.s, z2.s[0]\n"
+ // z28 is already final here, so its store is interleaved with the remaining FMAs.
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "fmla z31.s, z20.s, z3.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ // ---- Tail loop: one row of A/C per iteration ----
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ // z0 is free again: reuse it for the partial fifth quad before advancing a_ptr0.
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "fmla z28.s, z13.s, z2.s[1]\n"
+ "fmla z28.s, z14.s, z2.s[2]\n"
+ "fmla z28.s, z15.s, z2.s[3]\n"
+ "fmla z28.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z3.s[3]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ // Drop the aliases so the next asm statement can define them again.
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ // p0, p6 and p7 are overwritten by whilelt/ptrue above, so they must be listed as clobbers.
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "p0", "p6", "p7", "cc", "memory"
+ );
+ break;
+ case 18:
+ // Variant that consumes 18 values from each A row and 18 B vectors:
+ // four full quads of A plus one partial quad of which lanes 0-1 are used.
+ // NOTE(review): the switch selector is outside this view; presumably it is
+ // the depth (K) handled by this call - confirm against the enclosing code.
+ //
+ // Layout of the asm body:
+ //   prologue : derive row pointers, build predicates, load B into z4-z21
+ //   label 2  : main loop, four A/C rows per iteration, counted by "loops"
+ //   label 6  : tail loop, one A/C row per iteration, counted by "oddrows"
+ // The betaptr operand is bound below but never referenced by the asm text.
+ __asm __volatile (
+ // Symbolic names for the scratch registers; x0-x5 are declared as clobbers.
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ // lda/ldc/ldb are added straight to addresses, i.e. they are byte strides.
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ // p6 governs the final, partial A quad at offset #0x40 (built from odd_depth).
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ // p0 governs every B load and every C load/store (column mask from width).
+ "whilelt p0.s, %[temp], %[width]\n"
+ // p7 is all-true and governs the full A quadword loads.
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ // B is loaded once, one vector per ldb step, and kept in z4-z21 throughout.
+ "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z6.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z7.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z9.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z10.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z11.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z13.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z14.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z15.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z16.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z17.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z18.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z19.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z20.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z21.s, p0/z, [%[b_ptr0]]\n"
+ // No four-row groups to process: go straight to the single-row tail.
+ "cbz %[loops], 1f\n"
+ // ---- Main loop: four rows of A/C per iteration ----
+ "2:\n"
+ // beta0 != 0: accumulators start from zero; beta0 == 0: start from existing C.
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ // Sets the flags tested by "b.ne 2b"; nothing in between modifies them.
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ // Each ld1rqw replicates four A values; the indexed fmla picks one per B vector.
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ // The next A quad is fetched as soon as the last lane of the current one is used.
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z1.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[1]\n"
+ "fmla z31.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z14.s, z0.s[2]\n"
+ "fmla z29.s, z14.s, z1.s[2]\n"
+ "fmla z30.s, z14.s, z2.s[2]\n"
+ "fmla z31.s, z14.s, z3.s[2]\n"
+ "fmla z28.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z29.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
+ "fmla z30.s, z15.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
+ "fmla z28.s, z16.s, z0.s[0]\n"
+ "fmla z29.s, z16.s, z1.s[0]\n"
+ "fmla z30.s, z16.s, z2.s[0]\n"
+ "fmla z31.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z0.s[1]\n"
+ "fmla z29.s, z17.s, z1.s[1]\n"
+ "fmla z30.s, z17.s, z2.s[1]\n"
+ "fmla z31.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z0.s[2]\n"
+ "fmla z29.s, z18.s, z1.s[2]\n"
+ "fmla z30.s, z18.s, z2.s[2]\n"
+ "fmla z31.s, z18.s, z3.s[2]\n"
+ // Partial final quad: loaded under p6, so lanes outside the predicate read as zero.
+ "fmla z28.s, z19.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n"
+ // Last read of this row is done, so each A pointer moves on by four rows (lda << 2).
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z19.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x40]\n"
+ "fmla z30.s, z19.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x40]\n"
+ "fmla z31.s, z19.s, z3.s[3]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x40]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z20.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z20.s, z2.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z31.s, z20.s, z3.s[0]\n"
+ "fmla z28.s, z21.s, z0.s[1]\n"
+ "fmla z29.s, z21.s, z1.s[1]\n"
+ "fmla z30.s, z21.s, z2.s[1]\n"
+ "fmla z31.s, z21.s, z3.s[1]\n"
+ // Write the four result rows back and step each C pointer by four rows.
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ // ---- Tail loop: one row of A/C per iteration ----
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ // z0 is free again: reuse it for the partial fifth quad before advancing a_ptr0.
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "fmla z28.s, z13.s, z2.s[1]\n"
+ "fmla z28.s, z14.s, z2.s[2]\n"
+ "fmla z28.s, z15.s, z2.s[3]\n"
+ "fmla z28.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z3.s[3]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "fmla z28.s, z21.s, z0.s[1]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ // Drop the aliases so the next asm statement can define them again.
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ // p0, p6 and p7 are overwritten by whilelt/ptrue above, so they must be listed as clobbers.
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "p0", "p6", "p7", "cc", "memory"
+ );
+ break;
+ case 19:
+ // Variant that consumes 19 values from each A row and 19 B vectors:
+ // four full quads of A plus one partial quad of which lanes 0-2 are used.
+ // NOTE(review): the switch selector is outside this view; presumably it is
+ // the depth (K) handled by this call - confirm against the enclosing code.
+ //
+ // Layout of the asm body:
+ //   prologue : derive row pointers, build predicates, load B into z4-z22
+ //   label 2  : main loop, four A/C rows per iteration, counted by "loops"
+ //   label 6  : tail loop, one A/C row per iteration, counted by "oddrows"
+ // The betaptr operand is bound below but never referenced by the asm text.
+ __asm __volatile (
+ // Symbolic names for the scratch registers; x0-x5 are declared as clobbers.
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ // lda/ldc/ldb are added straight to addresses, i.e. they are byte strides.
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ // p6 governs the final, partial A quad at offset #0x40 (built from odd_depth).
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ // p0 governs every B load and every C load/store (column mask from width).
+ "whilelt p0.s, %[temp], %[width]\n"
+ // p7 is all-true and governs the full A quadword loads.
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ // B is loaded once, one vector per ldb step, and kept in z4-z22 throughout.
+ "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z6.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z7.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z9.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z10.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z11.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z13.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z14.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z15.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z16.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z17.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z18.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z19.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z20.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z21.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z22.s, p0/z, [%[b_ptr0]]\n"
+ // No four-row groups to process: go straight to the single-row tail.
+ "cbz %[loops], 1f\n"
+ // ---- Main loop: four rows of A/C per iteration ----
+ "2:\n"
+ // beta0 != 0: accumulators start from zero; beta0 == 0: start from existing C.
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ // Sets the flags tested by "b.ne 2b"; nothing in between modifies them.
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ // Each ld1rqw replicates four A values; the indexed fmla picks one per B vector.
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ // The next A quad is fetched as soon as the last lane of the current one is used.
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z1.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[1]\n"
+ "fmla z31.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z14.s, z0.s[2]\n"
+ "fmla z29.s, z14.s, z1.s[2]\n"
+ "fmla z30.s, z14.s, z2.s[2]\n"
+ "fmla z31.s, z14.s, z3.s[2]\n"
+ "fmla z28.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z29.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
+ "fmla z30.s, z15.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
+ "fmla z28.s, z16.s, z0.s[0]\n"
+ "fmla z29.s, z16.s, z1.s[0]\n"
+ "fmla z30.s, z16.s, z2.s[0]\n"
+ "fmla z31.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z0.s[1]\n"
+ "fmla z29.s, z17.s, z1.s[1]\n"
+ "fmla z30.s, z17.s, z2.s[1]\n"
+ "fmla z31.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z0.s[2]\n"
+ "fmla z29.s, z18.s, z1.s[2]\n"
+ "fmla z30.s, z18.s, z2.s[2]\n"
+ "fmla z31.s, z18.s, z3.s[2]\n"
+ // Partial final quad: loaded under p6, so lanes outside the predicate read as zero.
+ "fmla z28.s, z19.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n"
+ // Last read of this row is done, so each A pointer moves on by four rows (lda << 2).
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z19.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x40]\n"
+ "fmla z30.s, z19.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x40]\n"
+ "fmla z31.s, z19.s, z3.s[3]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x40]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z20.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z20.s, z2.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z31.s, z20.s, z3.s[0]\n"
+ "fmla z28.s, z21.s, z0.s[1]\n"
+ "fmla z29.s, z21.s, z1.s[1]\n"
+ "fmla z30.s, z21.s, z2.s[1]\n"
+ "fmla z31.s, z21.s, z3.s[1]\n"
+ "fmla z28.s, z22.s, z0.s[2]\n"
+ "fmla z29.s, z22.s, z1.s[2]\n"
+ "fmla z30.s, z22.s, z2.s[2]\n"
+ "fmla z31.s, z22.s, z3.s[2]\n"
+ // Write the four result rows back and step each C pointer by four rows.
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ // ---- Tail loop: one row of A/C per iteration ----
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ // z0 is free again: reuse it for the partial fifth quad before advancing a_ptr0.
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "fmla z28.s, z13.s, z2.s[1]\n"
+ "fmla z28.s, z14.s, z2.s[2]\n"
+ "fmla z28.s, z15.s, z2.s[3]\n"
+ "fmla z28.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z3.s[3]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "fmla z28.s, z21.s, z0.s[1]\n"
+ "fmla z28.s, z22.s, z0.s[2]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ // Drop the aliases so the next asm statement can define them again.
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ // p0, p6 and p7 are overwritten by whilelt/ptrue above, so they must be listed as clobbers.
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "p0", "p6", "p7", "cc", "memory"
+ );
+ break;
+ case 20:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z6.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z7.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z9.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z10.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z11.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z13.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z14.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z15.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z16.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z17.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z18.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z19.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z20.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z21.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z22.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z23.s, p0/z, [%[b_ptr0]]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z1.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[1]\n"
+ "fmla z31.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z14.s, z0.s[2]\n"
+ "fmla z29.s, z14.s, z1.s[2]\n"
+ "fmla z30.s, z14.s, z2.s[2]\n"
+ "fmla z31.s, z14.s, z3.s[2]\n"
+ "fmla z28.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z29.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
+ "fmla z30.s, z15.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
+ "fmla z28.s, z16.s, z0.s[0]\n"
+ "fmla z29.s, z16.s, z1.s[0]\n"
+ "fmla z30.s, z16.s, z2.s[0]\n"
+ "fmla z31.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z0.s[1]\n"
+ "fmla z29.s, z17.s, z1.s[1]\n"
+ "fmla z30.s, z17.s, z2.s[1]\n"
+ "fmla z31.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z0.s[2]\n"
+ "fmla z29.s, z18.s, z1.s[2]\n"
+ "fmla z30.s, z18.s, z2.s[2]\n"
+ "fmla z31.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z19.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n"
+ "fmla z30.s, z19.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n"
+ "fmla z31.s, z19.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z20.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z20.s, z2.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z31.s, z20.s, z3.s[0]\n"
+ "fmla z28.s, z21.s, z0.s[1]\n"
+ "fmla z29.s, z21.s, z1.s[1]\n"
+ "fmla z30.s, z21.s, z2.s[1]\n"
+ "fmla z31.s, z21.s, z3.s[1]\n"
+ "fmla z28.s, z22.s, z0.s[2]\n"
+ "fmla z29.s, z22.s, z1.s[2]\n"
+ "fmla z30.s, z22.s, z2.s[2]\n"
+ "fmla z31.s, z22.s, z3.s[2]\n"
+ "fmla z28.s, z23.s, z0.s[3]\n"
+ "fmla z29.s, z23.s, z1.s[3]\n"
+ "fmla z30.s, z23.s, z2.s[3]\n"
+ "fmla z31.s, z23.s, z3.s[3]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "fmla z28.s, z13.s, z2.s[1]\n"
+ "fmla z28.s, z14.s, z2.s[2]\n"
+ "fmla z28.s, z15.s, z2.s[3]\n"
+ "fmla z28.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z3.s[3]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "fmla z28.s, z21.s, z0.s[1]\n"
+ "fmla z28.s, z22.s, z0.s[2]\n"
+ "fmla z28.s, z23.s, z0.s[3]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 21:
+ // Variant that multiplies 21 B vectors (z4-z24) against 21 consecutive
+ // elements of each A row: five full 4-element quads plus lane 0 of a sixth.
+ // (Presumably the enclosing switch selects on the depth K - confirm.)
+ //
+ // Structure:
+ //  * p0 = whilelt(temp, width) predicates every B load and C load/store.
+ //  * p6 = whilelt(temp, odd_depth) predicates the load of the final,
+ //    partially used quad of A (offset 0x50).
+ //  * p7 is all-true and predicates the full-quad A loads.
+ //  * The main loop (label 2) handles four rows per iteration, `loops` times;
+ //    the tail loop (label 6) handles one row per iteration, `oddrows` times.
+ //  * lda/ldb/ldc are added directly to addresses, i.e. they are byte strides.
+ //  * When beta0 is non-zero the accumulators start at zero; otherwise C is
+ //    loaded and accumulated into as-is. NOTE(review): betaptr is passed as an
+ //    operand but never referenced by the template - confirm this is intended.
+ __asm __volatile (
+ // x0-x5 are used as the pointers to rows 1-3 of A and of C.
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ // Load the 21 B vectors into z4-z24, stepping b_ptr0 by ldb between loads.
+ "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z6.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z7.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z9.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z10.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z11.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z13.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z14.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z15.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z16.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z17.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z18.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z19.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z20.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z21.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z22.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z23.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z24.s, p0/z, [%[b_ptr0]]\n"
+ "cbz %[loops], 1f\n"
+ // Main loop: four rows of A/C per iteration, accumulating in z28-z31.
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ // The flags set here are consumed by the "b.ne 2b" at the end of the loop.
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z1.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[1]\n"
+ "fmla z31.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z14.s, z0.s[2]\n"
+ "fmla z29.s, z14.s, z1.s[2]\n"
+ "fmla z30.s, z14.s, z2.s[2]\n"
+ "fmla z31.s, z14.s, z3.s[2]\n"
+ "fmla z28.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z29.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
+ "fmla z30.s, z15.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
+ "fmla z28.s, z16.s, z0.s[0]\n"
+ "fmla z29.s, z16.s, z1.s[0]\n"
+ "fmla z30.s, z16.s, z2.s[0]\n"
+ "fmla z31.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z0.s[1]\n"
+ "fmla z29.s, z17.s, z1.s[1]\n"
+ "fmla z30.s, z17.s, z2.s[1]\n"
+ "fmla z31.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z0.s[2]\n"
+ "fmla z29.s, z18.s, z1.s[2]\n"
+ "fmla z30.s, z18.s, z2.s[2]\n"
+ "fmla z31.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
+ "fmla z29.s, z19.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n"
+ "fmla z30.s, z19.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n"
+ "fmla z31.s, z19.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "fmla z29.s, z20.s, z1.s[0]\n"
+ "fmla z30.s, z20.s, z2.s[0]\n"
+ "fmla z31.s, z20.s, z3.s[0]\n"
+ "fmla z28.s, z21.s, z0.s[1]\n"
+ "fmla z29.s, z21.s, z1.s[1]\n"
+ "fmla z30.s, z21.s, z2.s[1]\n"
+ "fmla z31.s, z21.s, z3.s[1]\n"
+ "fmla z28.s, z22.s, z0.s[2]\n"
+ "fmla z29.s, z22.s, z1.s[2]\n"
+ "fmla z30.s, z22.s, z2.s[2]\n"
+ "fmla z31.s, z22.s, z3.s[2]\n"
+ "fmla z28.s, z23.s, z0.s[3]\n"
+ // Final quad is loaded under p6 (odd_depth); only lane 0 is consumed.
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x50]\n"
+ // "LSL #2" advances each row pointer by four row strides.
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z23.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x50]\n"
+ "fmla z30.s, z23.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x50]\n"
+ "fmla z31.s, z23.s, z3.s[3]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x50]\n"
+ "fmla z28.s, z24.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z24.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z24.s, z2.s[0]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "fmla z31.s, z24.s, z3.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ // Tail loop: one row of A/C per iteration, accumulating in z28 only.
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [%[a_ptr0], #0x50]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "fmla z28.s, z13.s, z2.s[1]\n"
+ "fmla z28.s, z14.s, z2.s[2]\n"
+ "fmla z28.s, z15.s, z2.s[3]\n"
+ "fmla z28.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z3.s[3]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "fmla z28.s, z21.s, z0.s[1]\n"
+ "fmla z28.s, z22.s, z0.s[2]\n"
+ "fmla z28.s, z23.s, z0.s[3]\n"
+ "fmla z28.s, z24.s, z1.s[0]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ // p0, p6 and p7 are written by whilelt/ptrue above, so they must be declared
+ // as clobbered alongside the scratch X and Z registers.
+ : "x0", "x1", "x2", "x3", "x4", "x5", "p0", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 22:
+ // Variant that multiplies 22 B vectors (z4-z25) against 22 consecutive
+ // elements of each A row: five full 4-element quads plus lanes 0-1 of a sixth.
+ // (Presumably the enclosing switch selects on the depth K - confirm.)
+ //
+ // Structure:
+ //  * p0 = whilelt(temp, width) predicates every B load and C load/store.
+ //  * p6 = whilelt(temp, odd_depth) predicates the load of the final,
+ //    partially used quad of A (offset 0x50).
+ //  * p7 is all-true and predicates the full-quad A loads.
+ //  * The main loop (label 2) handles four rows per iteration, `loops` times;
+ //    the tail loop (label 6) handles one row per iteration, `oddrows` times.
+ //  * lda/ldb/ldc are added directly to addresses, i.e. they are byte strides.
+ //  * When beta0 is non-zero the accumulators start at zero; otherwise C is
+ //    loaded and accumulated into as-is. NOTE(review): betaptr is passed as an
+ //    operand but never referenced by the template - confirm this is intended.
+ __asm __volatile (
+ // x0-x5 are used as the pointers to rows 1-3 of A and of C.
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ // Load the 22 B vectors into z4-z25, stepping b_ptr0 by ldb between loads.
+ "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z6.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z7.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z9.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z10.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z11.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z13.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z14.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z15.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z16.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z17.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z18.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z19.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z20.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z21.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z22.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z23.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z24.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z25.s, p0/z, [%[b_ptr0]]\n"
+ "cbz %[loops], 1f\n"
+ // Main loop: four rows of A/C per iteration, accumulating in z28-z31.
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ // The flags set here are consumed by the "b.ne 2b" at the end of the loop.
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z1.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[1]\n"
+ "fmla z31.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z14.s, z0.s[2]\n"
+ "fmla z29.s, z14.s, z1.s[2]\n"
+ "fmla z30.s, z14.s, z2.s[2]\n"
+ "fmla z31.s, z14.s, z3.s[2]\n"
+ "fmla z28.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z29.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
+ "fmla z30.s, z15.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
+ "fmla z28.s, z16.s, z0.s[0]\n"
+ "fmla z29.s, z16.s, z1.s[0]\n"
+ "fmla z30.s, z16.s, z2.s[0]\n"
+ "fmla z31.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z0.s[1]\n"
+ "fmla z29.s, z17.s, z1.s[1]\n"
+ "fmla z30.s, z17.s, z2.s[1]\n"
+ "fmla z31.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z0.s[2]\n"
+ "fmla z29.s, z18.s, z1.s[2]\n"
+ "fmla z30.s, z18.s, z2.s[2]\n"
+ "fmla z31.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
+ "fmla z29.s, z19.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n"
+ "fmla z30.s, z19.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n"
+ "fmla z31.s, z19.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "fmla z29.s, z20.s, z1.s[0]\n"
+ "fmla z30.s, z20.s, z2.s[0]\n"
+ "fmla z31.s, z20.s, z3.s[0]\n"
+ "fmla z28.s, z21.s, z0.s[1]\n"
+ "fmla z29.s, z21.s, z1.s[1]\n"
+ "fmla z30.s, z21.s, z2.s[1]\n"
+ "fmla z31.s, z21.s, z3.s[1]\n"
+ "fmla z28.s, z22.s, z0.s[2]\n"
+ "fmla z29.s, z22.s, z1.s[2]\n"
+ "fmla z30.s, z22.s, z2.s[2]\n"
+ "fmla z31.s, z22.s, z3.s[2]\n"
+ "fmla z28.s, z23.s, z0.s[3]\n"
+ // Final quad is loaded under p6 (odd_depth); only lanes 0-1 are consumed.
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x50]\n"
+ // "LSL #2" advances each row pointer by four row strides.
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z23.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x50]\n"
+ "fmla z30.s, z23.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x50]\n"
+ "fmla z31.s, z23.s, z3.s[3]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x50]\n"
+ "fmla z28.s, z24.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z24.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z24.s, z2.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z31.s, z24.s, z3.s[0]\n"
+ "fmla z28.s, z25.s, z0.s[1]\n"
+ "fmla z29.s, z25.s, z1.s[1]\n"
+ "fmla z30.s, z25.s, z2.s[1]\n"
+ "fmla z31.s, z25.s, z3.s[1]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ // Tail loop: one row of A/C per iteration, accumulating in z28 only.
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [%[a_ptr0], #0x50]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "fmla z28.s, z13.s, z2.s[1]\n"
+ "fmla z28.s, z14.s, z2.s[2]\n"
+ "fmla z28.s, z15.s, z2.s[3]\n"
+ "fmla z28.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z3.s[3]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "fmla z28.s, z21.s, z0.s[1]\n"
+ "fmla z28.s, z22.s, z0.s[2]\n"
+ "fmla z28.s, z23.s, z0.s[3]\n"
+ "fmla z28.s, z24.s, z1.s[0]\n"
+ "fmla z28.s, z25.s, z1.s[1]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ // p0, p6 and p7 are written by whilelt/ptrue above, so they must be declared
+ // as clobbered alongside the scratch X and Z registers.
+ : "x0", "x1", "x2", "x3", "x4", "x5", "p0", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 23:
+ // Variant that multiplies 23 B vectors (z4-z26) against 23 consecutive
+ // elements of each A row: five full 4-element quads plus lanes 0-2 of a sixth.
+ // (Presumably the enclosing switch selects on the depth K - confirm.)
+ //
+ // Structure:
+ //  * p0 = whilelt(temp, width) predicates every B load and C load/store.
+ //  * p6 = whilelt(temp, odd_depth) predicates the load of the final,
+ //    partially used quad of A (offset 0x50).
+ //  * p7 is all-true and predicates the full-quad A loads.
+ //  * The main loop (label 2) handles four rows per iteration, `loops` times;
+ //    the tail loop (label 6) handles one row per iteration, `oddrows` times.
+ //  * lda/ldb/ldc are added directly to addresses, i.e. they are byte strides.
+ //  * When beta0 is non-zero the accumulators start at zero; otherwise C is
+ //    loaded and accumulated into as-is. NOTE(review): betaptr is passed as an
+ //    operand but never referenced by the template - confirm this is intended.
+ __asm __volatile (
+ // x0-x5 are used as the pointers to rows 1-3 of A and of C.
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ // Load the 23 B vectors into z4-z26, stepping b_ptr0 by ldb between loads.
+ "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z6.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z7.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z9.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z10.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z11.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z13.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z14.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z15.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z16.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z17.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z18.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z19.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z20.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z21.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z22.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z23.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z24.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z25.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z26.s, p0/z, [%[b_ptr0]]\n"
+ "cbz %[loops], 1f\n"
+ // Main loop: four rows of A/C per iteration, accumulating in z28-z31.
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ // The flags set here are consumed by the "b.ne 2b" at the end of the loop.
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z1.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[1]\n"
+ "fmla z31.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z14.s, z0.s[2]\n"
+ "fmla z29.s, z14.s, z1.s[2]\n"
+ "fmla z30.s, z14.s, z2.s[2]\n"
+ "fmla z31.s, z14.s, z3.s[2]\n"
+ "fmla z28.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z29.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
+ "fmla z30.s, z15.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
+ "fmla z28.s, z16.s, z0.s[0]\n"
+ "fmla z29.s, z16.s, z1.s[0]\n"
+ "fmla z30.s, z16.s, z2.s[0]\n"
+ "fmla z31.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z0.s[1]\n"
+ "fmla z29.s, z17.s, z1.s[1]\n"
+ "fmla z30.s, z17.s, z2.s[1]\n"
+ "fmla z31.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z0.s[2]\n"
+ "fmla z29.s, z18.s, z1.s[2]\n"
+ "fmla z30.s, z18.s, z2.s[2]\n"
+ "fmla z31.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
+ "fmla z29.s, z19.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n"
+ "fmla z30.s, z19.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n"
+ "fmla z31.s, z19.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "fmla z29.s, z20.s, z1.s[0]\n"
+ "fmla z30.s, z20.s, z2.s[0]\n"
+ "fmla z31.s, z20.s, z3.s[0]\n"
+ "fmla z28.s, z21.s, z0.s[1]\n"
+ "fmla z29.s, z21.s, z1.s[1]\n"
+ "fmla z30.s, z21.s, z2.s[1]\n"
+ "fmla z31.s, z21.s, z3.s[1]\n"
+ "fmla z28.s, z22.s, z0.s[2]\n"
+ "fmla z29.s, z22.s, z1.s[2]\n"
+ "fmla z30.s, z22.s, z2.s[2]\n"
+ "fmla z31.s, z22.s, z3.s[2]\n"
+ "fmla z28.s, z23.s, z0.s[3]\n"
+ // Final quad is loaded under p6 (odd_depth); only lanes 0-2 are consumed.
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x50]\n"
+ // "LSL #2" advances each row pointer by four row strides.
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z23.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x50]\n"
+ "fmla z30.s, z23.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x50]\n"
+ "fmla z31.s, z23.s, z3.s[3]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x50]\n"
+ "fmla z28.s, z24.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z24.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z24.s, z2.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z31.s, z24.s, z3.s[0]\n"
+ "fmla z28.s, z25.s, z0.s[1]\n"
+ "fmla z29.s, z25.s, z1.s[1]\n"
+ "fmla z30.s, z25.s, z2.s[1]\n"
+ "fmla z31.s, z25.s, z3.s[1]\n"
+ "fmla z28.s, z26.s, z0.s[2]\n"
+ "fmla z29.s, z26.s, z1.s[2]\n"
+ "fmla z30.s, z26.s, z2.s[2]\n"
+ "fmla z31.s, z26.s, z3.s[2]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ // Tail loop: one row of A/C per iteration, accumulating in z28 only.
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [%[a_ptr0], #0x50]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "fmla z28.s, z13.s, z2.s[1]\n"
+ "fmla z28.s, z14.s, z2.s[2]\n"
+ "fmla z28.s, z15.s, z2.s[3]\n"
+ "fmla z28.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z3.s[3]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "fmla z28.s, z21.s, z0.s[1]\n"
+ "fmla z28.s, z22.s, z0.s[2]\n"
+ "fmla z28.s, z23.s, z0.s[3]\n"
+ "fmla z28.s, z24.s, z1.s[0]\n"
+ "fmla z28.s, z25.s, z1.s[1]\n"
+ "fmla z28.s, z26.s, z1.s[2]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ // p0, p6 and p7 are written by whilelt/ptrue above, so they must be declared
+ // as clobbered alongside the scratch X and Z registers.
+ : "x0", "x1", "x2", "x3", "x4", "x5", "p0", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ default:
+ case 24:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z5.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z6.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z7.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z9.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z10.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z11.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z13.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z14.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z15.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z16.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z17.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z18.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z19.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z20.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z21.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z22.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z23.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z24.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z25.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z26.s, p0/z, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1w z27.s, p0/z, [%[b_ptr0]]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z1.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[1]\n"
+ "fmla z31.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z14.s, z0.s[2]\n"
+ "fmla z29.s, z14.s, z1.s[2]\n"
+ "fmla z30.s, z14.s, z2.s[2]\n"
+ "fmla z31.s, z14.s, z3.s[2]\n"
+ "fmla z28.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z29.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
+ "fmla z30.s, z15.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
+ "fmla z28.s, z16.s, z0.s[0]\n"
+ "fmla z29.s, z16.s, z1.s[0]\n"
+ "fmla z30.s, z16.s, z2.s[0]\n"
+ "fmla z31.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z0.s[1]\n"
+ "fmla z29.s, z17.s, z1.s[1]\n"
+ "fmla z30.s, z17.s, z2.s[1]\n"
+ "fmla z31.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z0.s[2]\n"
+ "fmla z29.s, z18.s, z1.s[2]\n"
+ "fmla z30.s, z18.s, z2.s[2]\n"
+ "fmla z31.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
+ "fmla z29.s, z19.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n"
+ "fmla z30.s, z19.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n"
+ "fmla z31.s, z19.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "fmla z29.s, z20.s, z1.s[0]\n"
+ "fmla z30.s, z20.s, z2.s[0]\n"
+ "fmla z31.s, z20.s, z3.s[0]\n"
+ "fmla z28.s, z21.s, z0.s[1]\n"
+ "fmla z29.s, z21.s, z1.s[1]\n"
+ "fmla z30.s, z21.s, z2.s[1]\n"
+ "fmla z31.s, z21.s, z3.s[1]\n"
+ "fmla z28.s, z22.s, z0.s[2]\n"
+ "fmla z29.s, z22.s, z1.s[2]\n"
+ "fmla z30.s, z22.s, z2.s[2]\n"
+ "fmla z31.s, z22.s, z3.s[2]\n"
+ "fmla z28.s, z23.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x50]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z23.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x50]\n"
+ "fmla z30.s, z23.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x50]\n"
+ "fmla z31.s, z23.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x50]\n"
+ "fmla z28.s, z24.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z24.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z24.s, z2.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z31.s, z24.s, z3.s[0]\n"
+ "fmla z28.s, z25.s, z0.s[1]\n"
+ "fmla z29.s, z25.s, z1.s[1]\n"
+ "fmla z30.s, z25.s, z2.s[1]\n"
+ "fmla z31.s, z25.s, z3.s[1]\n"
+ "fmla z28.s, z26.s, z0.s[2]\n"
+ "fmla z29.s, z26.s, z1.s[2]\n"
+ "fmla z30.s, z26.s, z2.s[2]\n"
+ "fmla z31.s, z26.s, z3.s[2]\n"
+ "fmla z28.s, z27.s, z0.s[3]\n"
+ "fmla z29.s, z27.s, z1.s[3]\n"
+ "fmla z30.s, z27.s, z2.s[3]\n"
+ "fmla z31.s, z27.s, z3.s[3]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x50]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "fmla z28.s, z13.s, z2.s[1]\n"
+ "fmla z28.s, z14.s, z2.s[2]\n"
+ "fmla z28.s, z15.s, z2.s[3]\n"
+ "fmla z28.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z3.s[3]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "fmla z28.s, z21.s, z0.s[1]\n"
+ "fmla z28.s, z22.s, z0.s[2]\n"
+ "fmla z28.s, z23.s, z0.s[3]\n"
+ "fmla z28.s, z24.s, z1.s[0]\n"
+ "fmla z28.s, z25.s, z1.s[1]\n"
+ "fmla z28.s, z26.s, z1.s[2]\n"
+ "fmla z28.s, z27.s, z1.s[3]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ }
+ }
+}
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx4.hpp
new file mode 100644
index 0000000000..aa2c522382
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx4.hpp
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __ARM_FEATURE_SVE
+
+
+
+namespace arm_gemm
+{
+
+// Actual kernel implementation, defined in sve_smallK_hybrid_fp32_mla_1VLx4/generic.cpp. Arguments, in order, per that definition: A, lda, B, C, ldc, beta, M, N, K.
+void sve_smallK_hybrid_fp32_mla_1VLx4(const float *, int, const float *, float *, int, float, int, int, int);
+
+class smallK_hybrid_fp32_mla_1VLx4 // Bundles the kernel's element typedefs, blocking parameters, transforms object and entry-point pointer.
+{
+public:
+ typedef float operand_type; // presumably the input element type (the kernel takes const float * inputs) - confirm against users of this typedef
+ typedef float result_type; // presumably the output element type (the kernel writes through a float *) - confirm against users of this typedef
+
+ typedef void (*kern_type)(const float *, int, const float *, float *, int, float, int, int, int); // same parameter list as the sve_smallK_hybrid_fp32_mla_1VLx4 prototype above
+
+ /* Kernel blocking parameters: output block height, output block width and K unroll factor. */
+ static int out_height() // generic.cpp handles rows in groups of 4 (loops_count = M / 4), leftovers one at a time
+ {
+ return 4;
+ }
+
+ static int out_width() // generic.cpp steps its loop over N by this same expression
+ {
+ return get_vector_length<float>() * 1; // presumably the number of float lanes in one SVE vector - get_vector_length is not declared in this header
+ }
+
+ static int k_unroll() // K unroll factor; NOTE(review): presumably 1 means K needs no rounding up - confirm with the caller
+ {
+ return 1;
+ }
+
+ StdTransformsSVE<operand_type, result_type, 4, 1, 1> transforms = {}; // NOTE(review): the 4, 1, 1 look like out_height(), the multiplier in out_width() and k_unroll() - confirm they must stay in sync. This header has no #include lines, so StdTransformsSVE must already be declared by the including file.
+
+ // Default to the generic kernel (the only implementation declared in this header)
+ kern_type kernel=sve_smallK_hybrid_fp32_mla_1VLx4;
+
+ smallK_hybrid_fp32_mla_1VLx4(const CPUInfo *ci) // ci is not used: the body is empty, so kernel keeps its default initializer
+ {
+
+ }
+};
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx4/generic.cpp
new file mode 100644
index 0000000000..3e7e713106
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx4/generic.cpp
@@ -0,0 +1,4004 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+
+#include <algorithm>
+
+
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void sve_smallK_hybrid_fp32_mla_1VLx4(const float *A, int lda, const float *B, float *C, int ldc, float beta, int M, int N, int K) {
+ const long beta0 = (beta == 0.0f);
+
+ const long loops_count = M / 4;
+ const long oddrow_count = M % 4;
+ const long ldab = lda * sizeof(float);
+ const long ldcb = ldc * sizeof(float);
+ const int K_stride = K;
+ const long odd_depth = K % 4;
+ const float *betaptr = &beta;
+
+ for (int x0=0; x0<N; x0+=(get_vector_length<float>() * 1)) {
+ const long width = std::min((unsigned long)N-x0, (get_vector_length<float>() * 1));
+ long loops = loops_count;
+ long oddrows = oddrow_count;
+ long temp = 0;
+ const float *b_ptr0 = B + (K_stride * x0);
+
+ const float *a_ptr0 = A;
+
+ float *c_ptr0 = C + x0;
+
+ switch(K) {
+ case 1:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 2:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 3:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 4:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 5:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p6/z, [%[a_ptr0], #0x10]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 6:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p6/z, [%[a_ptr0], #0x10]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 7:
+ // Seven B vectors (z4-z10) are loaded once and kept resident; each output row
+ // accumulates seven indexed FMLAs against them.
+ // The main loop handles four rows of A/C per iteration (%[loops] times); the
+ // tail loop handles one row per iteration (%[oddrows] times).
+ // NOTE: %[betaptr] is passed as an operand but is not referenced by this asm.
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ // Row pointers 1..3 are derived from row 0 by repeatedly adding lda / ldc.
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ // p6: predicate for the last (partial) quadword of A; p0: predicate for C columns; p7: all-true.
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "cbz %[loops], 1f\n"
+ // Main loop: four rows per iteration.
+ "2:\n"
+ // beta0 != 0: accumulators start at zero; beta0 == 0: accumulators start from the existing C values.
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ // First quadword of each A row (lanes 0-3).
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ // The flags set here are consumed by the "b.ne 2b" at the bottom of the loop.
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ // Second quadword of each A row, loaded under p6; only lanes 0-2 are used.
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ // Store the four result rows under p0 and step every C pointer forward by four rows.
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ // Tail loop: one row per iteration, same accumulator initialisation rule as above.
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p6/z, [%[a_ptr0], #0x10]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ // p0, p6 and p7 are written by whilelt/ptrue above, so they must be declared as clobbered.
+ : "x0", "x1", "x2", "x3", "x4", "x5", "p0", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 8:
+ // Eight B vectors (z4-z11) are loaded once and kept resident; each output row
+ // accumulates eight indexed FMLAs against them (two full quadwords of A).
+ // The main loop handles four rows of A/C per iteration (%[loops] times); the
+ // tail loop handles one row per iteration (%[oddrows] times).
+ // NOTE: %[betaptr] is passed as an operand but is not referenced by this asm.
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ // Row pointers 1..3 are derived from row 0 by repeatedly adding lda / ldc.
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ // p6 is written here but not read anywhere in this case (both A quadwords use p7).
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ // p0: predicate for C columns; p7: all-true.
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "cbz %[loops], 1f\n"
+ // Main loop: four rows per iteration.
+ "2:\n"
+ // beta0 != 0: accumulators start at zero; beta0 == 0: accumulators start from the existing C values.
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ // First quadword of each A row (lanes 0-3).
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ // The flags set here are consumed by the "b.ne 2b" at the bottom of the loop.
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ // Second quadword of each A row, loaded with the all-true predicate; all four lanes are used.
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ // Store the four result rows under p0 and step every C pointer forward by four rows.
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ // Tail loop: one row per iteration, same accumulator initialisation rule as above.
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ // p0, p6 and p7 are written by whilelt/ptrue above, so they must be declared as clobbered.
+ : "x0", "x1", "x2", "x3", "x4", "x5", "p0", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 9:
+ // Nine B vectors (z4-z12) are loaded once and kept resident; each output row
+ // accumulates nine indexed FMLAs against them.
+ // The main loop handles four rows of A/C per iteration (%[loops] times); the
+ // tail loop handles one row per iteration (%[oddrows] times).
+ // NOTE: %[betaptr] is passed as an operand but is not referenced by this asm.
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ // Row pointers 1..3 are derived from row 0 by repeatedly adding lda / ldc.
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ // p6: predicate for the last (partial) quadword of A; p0: predicate for C columns; p7: all-true.
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ // b_ptr0 is moved forward by 16 vector lengths, so the next B vector sits at offset -8.
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "cbz %[loops], 1f\n"
+ // Main loop: four rows per iteration.
+ "2:\n"
+ // beta0 != 0: accumulators start at zero; beta0 == 0: accumulators start from the existing C values.
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ // First quadword of each A row (lanes 0-3).
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ // The flags set here are consumed by the "b.ne 2b" at the bottom of the loop.
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ // Second quadword of each A row (full, all-true predicate).
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ // Third quadword of each A row, loaded under p6; only lane 0 is used.
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ // Row 0 is complete at this point, so its store is issued before the final FMLA of row 3.
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ // Tail loop: one row per iteration, same accumulator initialisation rule as above.
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p6/z, [%[a_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ // p0, p6 and p7 are written by whilelt/ptrue above, so they must be declared as clobbered.
+ : "x0", "x1", "x2", "x3", "x4", "x5", "p0", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 10:
+ // Ten B vectors (z4-z13) are loaded once and kept resident; each output row
+ // accumulates ten indexed FMLAs against them.
+ // The main loop handles four rows of A/C per iteration (%[loops] times); the
+ // tail loop handles one row per iteration (%[oddrows] times).
+ // NOTE: %[betaptr] is passed as an operand but is not referenced by this asm.
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ // Row pointers 1..3 are derived from row 0 by repeatedly adding lda / ldc.
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ // p6: predicate for the last (partial) quadword of A; p0: predicate for C columns; p7: all-true.
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ // b_ptr0 is moved forward by 16 vector lengths, so the following B vectors sit at offsets -8, -7.
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "cbz %[loops], 1f\n"
+ // Main loop: four rows per iteration.
+ "2:\n"
+ // beta0 != 0: accumulators start at zero; beta0 == 0: accumulators start from the existing C values.
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ // First quadword of each A row (lanes 0-3).
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ // The flags set here are consumed by the "b.ne 2b" at the bottom of the loop.
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ // Second quadword of each A row (full, all-true predicate).
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ // Third quadword of each A row, loaded under p6; only lanes 0-1 are used.
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z1.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[1]\n"
+ "fmla z31.s, z13.s, z3.s[1]\n"
+ // Store the four result rows under p0 and step every C pointer forward by four rows.
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ // Tail loop: one row per iteration, same accumulator initialisation rule as above.
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p6/z, [%[a_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "fmla z28.s, z13.s, z2.s[1]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ // p0, p6 and p7 are written by whilelt/ptrue above, so they must be declared as clobbered.
+ : "x0", "x1", "x2", "x3", "x4", "x5", "p0", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 11:
+ // Eleven B vectors (z4-z14) are loaded once and kept resident; each output row
+ // accumulates eleven indexed FMLAs against them.
+ // The main loop handles four rows of A/C per iteration (%[loops] times); the
+ // tail loop handles one row per iteration (%[oddrows] times).
+ // NOTE: %[betaptr] is passed as an operand but is not referenced by this asm.
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ // Row pointers 1..3 are derived from row 0 by repeatedly adding lda / ldc.
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ // p6: predicate for the last (partial) quadword of A; p0: predicate for C columns; p7: all-true.
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ // b_ptr0 is moved forward by 16 vector lengths, so the following B vectors sit at offsets -8..-6.
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "cbz %[loops], 1f\n"
+ // Main loop: four rows per iteration.
+ "2:\n"
+ // beta0 != 0: accumulators start at zero; beta0 == 0: accumulators start from the existing C values.
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ // First quadword of each A row (lanes 0-3).
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ // The flags set here are consumed by the "b.ne 2b" at the bottom of the loop.
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ // Second quadword of each A row (full, all-true predicate).
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ // Third quadword of each A row, loaded under p6; only lanes 0-2 are used.
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z1.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[1]\n"
+ "fmla z31.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z14.s, z0.s[2]\n"
+ "fmla z29.s, z14.s, z1.s[2]\n"
+ "fmla z30.s, z14.s, z2.s[2]\n"
+ "fmla z31.s, z14.s, z3.s[2]\n"
+ // Store the four result rows under p0 and step every C pointer forward by four rows.
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ // Tail loop: one row per iteration, same accumulator initialisation rule as above.
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p6/z, [%[a_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "fmla z28.s, z13.s, z2.s[1]\n"
+ "fmla z28.s, z14.s, z2.s[2]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ // p0, p6 and p7 are written by whilelt/ptrue above, so they must be declared as clobbered.
+ : "x0", "x1", "x2", "x3", "x4", "x5", "p0", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 12:
+ // Twelve B vectors (z4-z15) are loaded once and kept resident; each output row
+ // accumulates twelve indexed FMLAs against them (three full quadwords of A).
+ // The main loop handles four rows of A/C per iteration (%[loops] times); the
+ // tail loop handles one row per iteration (%[oddrows] times).
+ // NOTE: %[betaptr] is passed as an operand but is not referenced by this asm.
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ // Row pointers 1..3 are derived from row 0 by repeatedly adding lda / ldc.
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ // p6 is written here but not read anywhere in this case (all three A quadwords use p7).
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ // p0: predicate for C columns; p7: all-true.
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ // b_ptr0 is moved forward by 16 vector lengths, so the following B vectors sit at offsets -8..-5.
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "cbz %[loops], 1f\n"
+ // Main loop: four rows per iteration.
+ "2:\n"
+ // beta0 != 0: accumulators start at zero; beta0 == 0: accumulators start from the existing C values.
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ // First quadword of each A row (lanes 0-3).
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ // The flags set here are consumed by the "b.ne 2b" at the bottom of the loop.
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ // Second quadword of each A row (full, all-true predicate).
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ // Third quadword of each A row, loaded with the all-true predicate; all four lanes are used.
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z1.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[1]\n"
+ "fmla z31.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z14.s, z0.s[2]\n"
+ "fmla z29.s, z14.s, z1.s[2]\n"
+ "fmla z30.s, z14.s, z2.s[2]\n"
+ "fmla z31.s, z14.s, z3.s[2]\n"
+ "fmla z28.s, z15.s, z0.s[3]\n"
+ "fmla z29.s, z15.s, z1.s[3]\n"
+ "fmla z30.s, z15.s, z2.s[3]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ // Store the four result rows under p0 and step every C pointer forward by four rows.
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ // Tail loop: one row per iteration, same accumulator initialisation rule as above.
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "fmla z28.s, z13.s, z2.s[1]\n"
+ "fmla z28.s, z14.s, z2.s[2]\n"
+ "fmla z28.s, z15.s, z2.s[3]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ // p0, p6 and p7 are written by whilelt/ptrue above, so they must be declared as clobbered.
+ : "x0", "x1", "x2", "x3", "x4", "x5", "p0", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 13:
+ // Kernel variant that keeps 13 vectors of B resident in z4-z16 (loaded
+ // once from b_ptr0) and issues 13 indexed FMLAs per output row.
+ // Presumably selected when the depth (K) is 13 - the switch header is
+ // outside this excerpt, TODO confirm.
+ //  - p7: all-true. p0: whilelt(temp, width), guards every C load/store.
+ //    p6: whilelt(temp, odd_depth), guards the last 16-byte A load per row.
+ //  - Label 2: main loop, four rows of A/C per iteration, `loops` times.
+ //  - Label 6: tail loop, one row per iteration, `oddrows` times.
+ //  - beta0 != 0: accumulators start at zero. Otherwise they are loaded
+ //    from C and accumulated into unscaled (the betaptr operand is never
+ //    referenced by this template).
+ //  - lda/ldc are added to the pointers unscaled, so they appear to be
+ //    byte strides (bound to ldab/ldcb) - confirm at the definition.
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ // b_ptr0 moves forward 16 vectors; the remaining B vectors are addressed
+ // with negative MUL VL offsets from the new position.
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "ld1w z16.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "cbz %[loops], 1f\n"
+ // Main loop: rows a_ptr0..a_ptr3 accumulate into z28..z31.
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ // Flags set here are consumed by the "b.ne 2b" at the bottom of the loop.
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z1.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[1]\n"
+ "fmla z31.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z14.s, z0.s[2]\n"
+ "fmla z29.s, z14.s, z1.s[2]\n"
+ "fmla z30.s, z14.s, z2.s[2]\n"
+ "fmla z31.s, z14.s, z3.s[2]\n"
+ "fmla z28.s, z15.s, z0.s[3]\n"
+ // Last A quadword of each row is loaded under p6 (zeroing inactive lanes).
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x30]\n"
+ // Row pointers step by 4 * stride because four rows are done per pass.
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x30]\n"
+ "fmla z30.s, z15.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x30]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x30]\n"
+ "fmla z28.s, z16.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z16.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z16.s, z2.s[0]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "fmla z31.s, z16.s, z3.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ // Tail loop: one remaining row per iteration, accumulated in z28 only.
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "ld1rqw z3.s, p6/z, [%[a_ptr0], #0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "fmla z28.s, z13.s, z2.s[1]\n"
+ "fmla z28.s, z14.s, z2.s[2]\n"
+ "fmla z28.s, z15.s, z2.s[3]\n"
+ "fmla z28.s, z16.s, z3.s[0]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ // p0, p6 and p7 are written by whilelt/ptrue above, so they must be
+ // declared as clobbers alongside the scratch X and Z registers.
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "p0", "p6", "p7", "cc", "memory"
+ );
+ break;
+ case 14:
+ // Kernel variant that keeps 14 vectors of B resident in z4-z17 (loaded
+ // once from b_ptr0) and issues 14 indexed FMLAs per output row.
+ // Presumably selected when the depth (K) is 14 - the switch header is
+ // outside this excerpt, TODO confirm.
+ //  - p7: all-true. p0: whilelt(temp, width), guards every C load/store.
+ //    p6: whilelt(temp, odd_depth), guards the last 16-byte A load per row.
+ //  - Label 2: main loop, four rows of A/C per iteration, `loops` times.
+ //  - Label 6: tail loop, one row per iteration, `oddrows` times.
+ //  - beta0 != 0: accumulators start at zero. Otherwise they are loaded
+ //    from C and accumulated into unscaled (the betaptr operand is never
+ //    referenced by this template).
+ //  - lda/ldc are added to the pointers unscaled, so they appear to be
+ //    byte strides (bound to ldab/ldcb) - confirm at the definition.
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ // b_ptr0 moves forward 16 vectors; the remaining B vectors are addressed
+ // with negative MUL VL offsets from the new position.
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "ld1w z16.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "ld1w z17.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "cbz %[loops], 1f\n"
+ // Main loop: rows a_ptr0..a_ptr3 accumulate into z28..z31.
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ // Flags set here are consumed by the "b.ne 2b" at the bottom of the loop.
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z1.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[1]\n"
+ "fmla z31.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z14.s, z0.s[2]\n"
+ "fmla z29.s, z14.s, z1.s[2]\n"
+ "fmla z30.s, z14.s, z2.s[2]\n"
+ "fmla z31.s, z14.s, z3.s[2]\n"
+ "fmla z28.s, z15.s, z0.s[3]\n"
+ // Last A quadword of each row is loaded under p6 (zeroing inactive lanes).
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x30]\n"
+ // Row pointers step by 4 * stride because four rows are done per pass.
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x30]\n"
+ "fmla z30.s, z15.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x30]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x30]\n"
+ "fmla z28.s, z16.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z16.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z16.s, z2.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z31.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z0.s[1]\n"
+ "fmla z29.s, z17.s, z1.s[1]\n"
+ "fmla z30.s, z17.s, z2.s[1]\n"
+ "fmla z31.s, z17.s, z3.s[1]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ // Tail loop: one remaining row per iteration, accumulated in z28 only.
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "ld1rqw z3.s, p6/z, [%[a_ptr0], #0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "fmla z28.s, z13.s, z2.s[1]\n"
+ "fmla z28.s, z14.s, z2.s[2]\n"
+ "fmla z28.s, z15.s, z2.s[3]\n"
+ "fmla z28.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z3.s[1]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ // p0, p6 and p7 are written by whilelt/ptrue above, so they must be
+ // declared as clobbers alongside the scratch X and Z registers.
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "p0", "p6", "p7", "cc", "memory"
+ );
+ break;
+ case 15:
+ // Kernel variant that keeps 15 vectors of B resident in z4-z18 (loaded
+ // once from b_ptr0) and issues 15 indexed FMLAs per output row.
+ // Presumably selected when the depth (K) is 15 - the switch header is
+ // outside this excerpt, TODO confirm.
+ //  - p7: all-true. p0: whilelt(temp, width), guards every C load/store.
+ //    p6: whilelt(temp, odd_depth), guards the last 16-byte A load per row.
+ //  - Label 2: main loop, four rows of A/C per iteration, `loops` times.
+ //  - Label 6: tail loop, one row per iteration, `oddrows` times.
+ //  - beta0 != 0: accumulators start at zero. Otherwise they are loaded
+ //    from C and accumulated into unscaled (the betaptr operand is never
+ //    referenced by this template).
+ //  - lda/ldc are added to the pointers unscaled, so they appear to be
+ //    byte strides (bound to ldab/ldcb) - confirm at the definition.
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ // b_ptr0 moves forward 16 vectors; the remaining B vectors are addressed
+ // with negative MUL VL offsets from the new position.
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "ld1w z16.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "ld1w z17.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1w z18.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "cbz %[loops], 1f\n"
+ // Main loop: rows a_ptr0..a_ptr3 accumulate into z28..z31.
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ // Flags set here are consumed by the "b.ne 2b" at the bottom of the loop.
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z1.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[1]\n"
+ "fmla z31.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z14.s, z0.s[2]\n"
+ "fmla z29.s, z14.s, z1.s[2]\n"
+ "fmla z30.s, z14.s, z2.s[2]\n"
+ "fmla z31.s, z14.s, z3.s[2]\n"
+ "fmla z28.s, z15.s, z0.s[3]\n"
+ // Last A quadword of each row is loaded under p6 (zeroing inactive lanes).
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x30]\n"
+ // Row pointers step by 4 * stride because four rows are done per pass.
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x30]\n"
+ "fmla z30.s, z15.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x30]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x30]\n"
+ "fmla z28.s, z16.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z16.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z16.s, z2.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z31.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z0.s[1]\n"
+ "fmla z29.s, z17.s, z1.s[1]\n"
+ "fmla z30.s, z17.s, z2.s[1]\n"
+ "fmla z31.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z0.s[2]\n"
+ "fmla z29.s, z18.s, z1.s[2]\n"
+ "fmla z30.s, z18.s, z2.s[2]\n"
+ "fmla z31.s, z18.s, z3.s[2]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ // Tail loop: one remaining row per iteration, accumulated in z28 only.
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "ld1rqw z3.s, p6/z, [%[a_ptr0], #0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "fmla z28.s, z13.s, z2.s[1]\n"
+ "fmla z28.s, z14.s, z2.s[2]\n"
+ "fmla z28.s, z15.s, z2.s[3]\n"
+ "fmla z28.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z3.s[2]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ // p0, p6 and p7 are written by whilelt/ptrue above, so they must be
+ // declared as clobbers alongside the scratch X and Z registers.
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "p0", "p6", "p7", "cc", "memory"
+ );
+ break;
+ case 16:
+ // Kernel variant that keeps 16 vectors of B resident in z4-z19 (loaded
+ // once from b_ptr0) and issues 16 indexed FMLAs per output row.
+ // Presumably selected when the depth (K) is 16 - the switch header is
+ // outside this excerpt, TODO confirm.
+ //  - p7: all-true, used for every A and B load in this variant.
+ //    p0: whilelt(temp, width), guards every C load/store.
+ //    p6: whilelt(temp, odd_depth) is still computed (and therefore
+ //    clobbered) but no load in this variant is predicated on it.
+ //  - Label 2: main loop, four rows of A/C per iteration, `loops` times.
+ //  - Label 6: tail loop, one row per iteration, `oddrows` times.
+ //  - beta0 != 0: accumulators start at zero. Otherwise they are loaded
+ //    from C and accumulated into unscaled (the betaptr operand is never
+ //    referenced by this template).
+ //  - lda/ldc are added to the pointers unscaled, so they appear to be
+ //    byte strides (bound to ldab/ldcb) - confirm at the definition.
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ // b_ptr0 moves forward 16 vectors; the remaining B vectors are addressed
+ // with negative MUL VL offsets from the new position.
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "ld1w z16.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "ld1w z17.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1w z18.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "ld1w z19.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "cbz %[loops], 1f\n"
+ // Main loop: rows a_ptr0..a_ptr3 accumulate into z28..z31.
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ // Flags set here are consumed by the "b.ne 2b" at the bottom of the loop.
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z1.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[1]\n"
+ "fmla z31.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z14.s, z0.s[2]\n"
+ "fmla z29.s, z14.s, z1.s[2]\n"
+ "fmla z30.s, z14.s, z2.s[2]\n"
+ "fmla z31.s, z14.s, z3.s[2]\n"
+ "fmla z28.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
+ // Row pointers step by 4 * stride because four rows are done per pass.
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
+ "fmla z30.s, z15.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
+ "fmla z28.s, z16.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z16.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z16.s, z2.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z31.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z0.s[1]\n"
+ "fmla z29.s, z17.s, z1.s[1]\n"
+ "fmla z30.s, z17.s, z2.s[1]\n"
+ "fmla z31.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z0.s[2]\n"
+ "fmla z29.s, z18.s, z1.s[2]\n"
+ "fmla z30.s, z18.s, z2.s[2]\n"
+ "fmla z31.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z0.s[3]\n"
+ "fmla z29.s, z19.s, z1.s[3]\n"
+ "fmla z30.s, z19.s, z2.s[3]\n"
+ "fmla z31.s, z19.s, z3.s[3]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ // Tail loop: one remaining row per iteration, accumulated in z28 only.
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "fmla z28.s, z13.s, z2.s[1]\n"
+ "fmla z28.s, z14.s, z2.s[2]\n"
+ "fmla z28.s, z15.s, z2.s[3]\n"
+ "fmla z28.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z3.s[3]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ // p0, p6 and p7 are written by whilelt/ptrue above, so they must be
+ // declared as clobbers alongside the scratch X and Z registers.
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "p0", "p6", "p7", "cc", "memory"
+ );
+ break;
+ case 17:
+ // Kernel variant that keeps 17 vectors of B resident in z4-z20 (loaded
+ // once from b_ptr0) and issues 17 indexed FMLAs per output row.
+ // Presumably selected when the depth (K) is 17 - the switch header is
+ // outside this excerpt, TODO confirm.
+ //  - p7: all-true. p0: whilelt(temp, width), guards every C load/store.
+ //    p6: whilelt(temp, odd_depth), guards the fifth (last) 16-byte A
+ //    load per row at offset 0x40.
+ //  - Label 2: main loop, four rows of A/C per iteration, `loops` times.
+ //  - Label 6: tail loop, one row per iteration, `oddrows` times.
+ //  - beta0 != 0: accumulators start at zero. Otherwise they are loaded
+ //    from C and accumulated into unscaled (the betaptr operand is never
+ //    referenced by this template).
+ //  - lda/ldc are added to the pointers unscaled, so they appear to be
+ //    byte strides (bound to ldab/ldcb) - confirm at the definition.
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ // b_ptr0 moves forward 16 vectors; B vectors 8-15 are then addressed with
+ // negative MUL VL offsets and vector 16 with offset zero.
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "ld1w z16.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "ld1w z17.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1w z18.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "ld1w z19.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "ld1w z20.s, p7/z, [%[b_ptr0]]\n"
+ "cbz %[loops], 1f\n"
+ // Main loop: rows a_ptr0..a_ptr3 accumulate into z28..z31.
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ // Flags set here are consumed by the "b.ne 2b" at the bottom of the loop.
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z1.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[1]\n"
+ "fmla z31.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z14.s, z0.s[2]\n"
+ "fmla z29.s, z14.s, z1.s[2]\n"
+ "fmla z30.s, z14.s, z2.s[2]\n"
+ "fmla z31.s, z14.s, z3.s[2]\n"
+ "fmla z28.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z29.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
+ "fmla z30.s, z15.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
+ "fmla z28.s, z16.s, z0.s[0]\n"
+ "fmla z29.s, z16.s, z1.s[0]\n"
+ "fmla z30.s, z16.s, z2.s[0]\n"
+ "fmla z31.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z0.s[1]\n"
+ "fmla z29.s, z17.s, z1.s[1]\n"
+ "fmla z30.s, z17.s, z2.s[1]\n"
+ "fmla z31.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z0.s[2]\n"
+ "fmla z29.s, z18.s, z1.s[2]\n"
+ "fmla z30.s, z18.s, z2.s[2]\n"
+ "fmla z31.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z0.s[3]\n"
+ // Last A quadword of each row is loaded under p6 (zeroing inactive lanes).
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n"
+ // Row pointers step by 4 * stride because four rows are done per pass.
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z19.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x40]\n"
+ "fmla z30.s, z19.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x40]\n"
+ "fmla z31.s, z19.s, z3.s[3]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x40]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z20.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z20.s, z2.s[0]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "fmla z31.s, z20.s, z3.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ // Tail loop: one remaining row per iteration, accumulated in z28 only.
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ // z0 is reused for the fifth A quadword once its first four lanes are consumed.
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "fmla z28.s, z13.s, z2.s[1]\n"
+ "fmla z28.s, z14.s, z2.s[2]\n"
+ "fmla z28.s, z15.s, z2.s[3]\n"
+ "fmla z28.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z3.s[3]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ // p0, p6 and p7 are written by whilelt/ptrue above, so they must be
+ // declared as clobbers alongside the scratch X and Z registers.
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "p0", "p6", "p7", "cc", "memory"
+ );
+ break;
+ case 18:
+ // Variant that keeps 18 vectors of B resident in z4-z21 and, for every output
+ // row, multiplies them by 18 consecutive fp32 values of A (lanes of z0-z3).
+ //   p7 : all-true predicate (B loads and full quad-word loads of A)
+ //   p0 : WHILELT(temp, width), guards every load/store of C
+ //   p6 : WHILELT(temp, odd_depth), guards the last, partial quad-word load of A
+ // Label 2 is the main loop: `loops` iterations, four rows of A/C per iteration
+ // (row pointers sit lda/ldc apart and advance by lda/ldc << 2 each pass).
+ // Label 6 is the tail loop: `oddrows` iterations, one row per iteration.
+ // A non-zero beta0 starts the accumulators at zero; otherwise they are loaded from C.
+ // NOTE(review): [betaptr] is passed as an input but never referenced in the asm text.
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ // Derive the row 1-3 pointers, build the predicates and preload all of B.
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ // Rebase b_ptr0 by 16 vectors so the remaining loads fit the -8..7 MUL VL immediate range.
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "ld1w z16.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "ld1w z17.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1w z18.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "ld1w z19.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "ld1w z20.s, p7/z, [%[b_ptr0]]\n"
+ "ld1w z21.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "cbz %[loops], 1f\n"
+ // Main loop: four rows per iteration, accumulators z28-z31.
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ // The flags set here are consumed by the b.ne at the bottom of the loop;
+ // nothing in between writes the condition flags.
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z1.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[1]\n"
+ "fmla z31.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z14.s, z0.s[2]\n"
+ "fmla z29.s, z14.s, z1.s[2]\n"
+ "fmla z30.s, z14.s, z2.s[2]\n"
+ "fmla z31.s, z14.s, z3.s[2]\n"
+ "fmla z28.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z29.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
+ "fmla z30.s, z15.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
+ "fmla z28.s, z16.s, z0.s[0]\n"
+ "fmla z29.s, z16.s, z1.s[0]\n"
+ "fmla z30.s, z16.s, z2.s[0]\n"
+ "fmla z31.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z0.s[1]\n"
+ "fmla z29.s, z17.s, z1.s[1]\n"
+ "fmla z30.s, z17.s, z2.s[1]\n"
+ "fmla z31.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z0.s[2]\n"
+ "fmla z29.s, z18.s, z1.s[2]\n"
+ "fmla z30.s, z18.s, z2.s[2]\n"
+ "fmla z31.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z0.s[3]\n"
+ // Final quad-word of each row is loaded under p6, so inactive lanes read as zero.
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z19.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x40]\n"
+ "fmla z30.s, z19.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x40]\n"
+ "fmla z31.s, z19.s, z3.s[3]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x40]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z20.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z20.s, z2.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z31.s, z20.s, z3.s[0]\n"
+ "fmla z28.s, z21.s, z0.s[1]\n"
+ "fmla z29.s, z21.s, z1.s[1]\n"
+ "fmla z30.s, z21.s, z2.s[1]\n"
+ "fmla z31.s, z21.s, z3.s[1]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ // Tail loop: one leftover row per iteration, accumulator z28 only.
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "fmla z28.s, z13.s, z2.s[1]\n"
+ "fmla z28.s, z14.s, z2.s[2]\n"
+ "fmla z28.s, z15.s, z2.s[3]\n"
+ "fmla z28.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z3.s[3]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "fmla z28.s, z21.s, z0.s[1]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ // p0, p6 and p7 are written by whilelt/ptrue above, so they are declared as clobbered
+ // alongside the scratch X registers and the Z registers.
+ : "x0", "x1", "x2", "x3", "x4", "x5", "p0", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 19:
+ // Variant that keeps 19 vectors of B resident in z4-z22 and, for every output
+ // row, multiplies them by 19 consecutive fp32 values of A (lanes of z0-z3).
+ //   p7 : all-true predicate (B loads and full quad-word loads of A)
+ //   p0 : WHILELT(temp, width), guards every load/store of C
+ //   p6 : WHILELT(temp, odd_depth), guards the last, partial quad-word load of A
+ // Label 2 is the main loop: `loops` iterations, four rows of A/C per iteration
+ // (row pointers sit lda/ldc apart and advance by lda/ldc << 2 each pass).
+ // Label 6 is the tail loop: `oddrows` iterations, one row per iteration.
+ // A non-zero beta0 starts the accumulators at zero; otherwise they are loaded from C.
+ // NOTE(review): [betaptr] is passed as an input but never referenced in the asm text.
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ // Derive the row 1-3 pointers, build the predicates and preload all of B.
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ // Rebase b_ptr0 by 16 vectors so the remaining loads fit the -8..7 MUL VL immediate range.
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "ld1w z16.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "ld1w z17.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1w z18.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "ld1w z19.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "ld1w z20.s, p7/z, [%[b_ptr0]]\n"
+ "ld1w z21.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z22.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "cbz %[loops], 1f\n"
+ // Main loop: four rows per iteration, accumulators z28-z31.
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ // The flags set here are consumed by the b.ne at the bottom of the loop;
+ // nothing in between writes the condition flags.
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z1.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[1]\n"
+ "fmla z31.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z14.s, z0.s[2]\n"
+ "fmla z29.s, z14.s, z1.s[2]\n"
+ "fmla z30.s, z14.s, z2.s[2]\n"
+ "fmla z31.s, z14.s, z3.s[2]\n"
+ "fmla z28.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z29.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
+ "fmla z30.s, z15.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
+ "fmla z28.s, z16.s, z0.s[0]\n"
+ "fmla z29.s, z16.s, z1.s[0]\n"
+ "fmla z30.s, z16.s, z2.s[0]\n"
+ "fmla z31.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z0.s[1]\n"
+ "fmla z29.s, z17.s, z1.s[1]\n"
+ "fmla z30.s, z17.s, z2.s[1]\n"
+ "fmla z31.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z0.s[2]\n"
+ "fmla z29.s, z18.s, z1.s[2]\n"
+ "fmla z30.s, z18.s, z2.s[2]\n"
+ "fmla z31.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z0.s[3]\n"
+ // Final quad-word of each row is loaded under p6, so inactive lanes read as zero.
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z19.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x40]\n"
+ "fmla z30.s, z19.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x40]\n"
+ "fmla z31.s, z19.s, z3.s[3]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x40]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z20.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z20.s, z2.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z31.s, z20.s, z3.s[0]\n"
+ "fmla z28.s, z21.s, z0.s[1]\n"
+ "fmla z29.s, z21.s, z1.s[1]\n"
+ "fmla z30.s, z21.s, z2.s[1]\n"
+ "fmla z31.s, z21.s, z3.s[1]\n"
+ "fmla z28.s, z22.s, z0.s[2]\n"
+ "fmla z29.s, z22.s, z1.s[2]\n"
+ "fmla z30.s, z22.s, z2.s[2]\n"
+ "fmla z31.s, z22.s, z3.s[2]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ // Tail loop: one leftover row per iteration, accumulator z28 only.
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "fmla z28.s, z13.s, z2.s[1]\n"
+ "fmla z28.s, z14.s, z2.s[2]\n"
+ "fmla z28.s, z15.s, z2.s[3]\n"
+ "fmla z28.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z3.s[3]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "fmla z28.s, z21.s, z0.s[1]\n"
+ "fmla z28.s, z22.s, z0.s[2]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ // p0, p6 and p7 are written by whilelt/ptrue above, so they are declared as clobbered
+ // alongside the scratch X registers and the Z registers.
+ : "x0", "x1", "x2", "x3", "x4", "x5", "p0", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 20:
+ // Variant that keeps 20 vectors of B resident in z4-z23 and, for every output
+ // row, multiplies them by 20 consecutive fp32 values of A (lanes of z0-z3).
+ //   p7 : all-true predicate (B loads and every quad-word load of A)
+ //   p0 : WHILELT(temp, width), guards every load/store of C
+ //   p6 : WHILELT(temp, odd_depth); computed but not used by any load in this variant
+ // Label 2 is the main loop: `loops` iterations, four rows of A/C per iteration
+ // (row pointers sit lda/ldc apart and advance by lda/ldc << 2 each pass).
+ // Label 6 is the tail loop: `oddrows` iterations, one row per iteration.
+ // A non-zero beta0 starts the accumulators at zero; otherwise they are loaded from C.
+ // NOTE(review): [betaptr] is passed as an input but never referenced in the asm text.
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ // Derive the row 1-3 pointers, build the predicates and preload all of B.
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ // Rebase b_ptr0 by 16 vectors so the remaining loads fit the -8..7 MUL VL immediate range.
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "ld1w z16.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "ld1w z17.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1w z18.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "ld1w z19.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "ld1w z20.s, p7/z, [%[b_ptr0]]\n"
+ "ld1w z21.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z22.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z23.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "cbz %[loops], 1f\n"
+ // Main loop: four rows per iteration, accumulators z28-z31.
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ // The flags set here are consumed by the b.ne at the bottom of the loop;
+ // nothing in between writes the condition flags.
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z1.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[1]\n"
+ "fmla z31.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z14.s, z0.s[2]\n"
+ "fmla z29.s, z14.s, z1.s[2]\n"
+ "fmla z30.s, z14.s, z2.s[2]\n"
+ "fmla z31.s, z14.s, z3.s[2]\n"
+ "fmla z28.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z29.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
+ "fmla z30.s, z15.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
+ "fmla z28.s, z16.s, z0.s[0]\n"
+ "fmla z29.s, z16.s, z1.s[0]\n"
+ "fmla z30.s, z16.s, z2.s[0]\n"
+ "fmla z31.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z0.s[1]\n"
+ "fmla z29.s, z17.s, z1.s[1]\n"
+ "fmla z30.s, z17.s, z2.s[1]\n"
+ "fmla z31.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z0.s[2]\n"
+ "fmla z29.s, z18.s, z1.s[2]\n"
+ "fmla z30.s, z18.s, z2.s[2]\n"
+ "fmla z31.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z0.s[3]\n"
+ // Fifth quad-word of each row: loaded under the all-true p7 in this variant.
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z19.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n"
+ "fmla z30.s, z19.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n"
+ "fmla z31.s, z19.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z20.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z20.s, z2.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z31.s, z20.s, z3.s[0]\n"
+ "fmla z28.s, z21.s, z0.s[1]\n"
+ "fmla z29.s, z21.s, z1.s[1]\n"
+ "fmla z30.s, z21.s, z2.s[1]\n"
+ "fmla z31.s, z21.s, z3.s[1]\n"
+ "fmla z28.s, z22.s, z0.s[2]\n"
+ "fmla z29.s, z22.s, z1.s[2]\n"
+ "fmla z30.s, z22.s, z2.s[2]\n"
+ "fmla z31.s, z22.s, z3.s[2]\n"
+ "fmla z28.s, z23.s, z0.s[3]\n"
+ "fmla z29.s, z23.s, z1.s[3]\n"
+ "fmla z30.s, z23.s, z2.s[3]\n"
+ "fmla z31.s, z23.s, z3.s[3]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ // Tail loop: one leftover row per iteration, accumulator z28 only.
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "fmla z28.s, z13.s, z2.s[1]\n"
+ "fmla z28.s, z14.s, z2.s[2]\n"
+ "fmla z28.s, z15.s, z2.s[3]\n"
+ "fmla z28.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z3.s[3]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "fmla z28.s, z21.s, z0.s[1]\n"
+ "fmla z28.s, z22.s, z0.s[2]\n"
+ "fmla z28.s, z23.s, z0.s[3]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ // p0, p6 and p7 are written by whilelt/ptrue above (p6 even though it is unused here),
+ // so they are declared as clobbered alongside the scratch X registers and the Z registers.
+ : "x0", "x1", "x2", "x3", "x4", "x5", "p0", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 21:
+ // Variant that keeps 21 vectors of B resident in z4-z24 and, for every output
+ // row, multiplies them by 21 consecutive fp32 values of A (lanes of z0-z3).
+ //   p7 : all-true predicate (B loads and the five full quad-word loads of A)
+ //   p0 : WHILELT(temp, width), guards every load/store of C
+ //   p6 : WHILELT(temp, odd_depth), guards the sixth, partial quad-word load of A
+ // Label 2 is the main loop: `loops` iterations, four rows of A/C per iteration
+ // (row pointers sit lda/ldc apart and advance by lda/ldc << 2 each pass).
+ // Label 6 is the tail loop: `oddrows` iterations, one row per iteration.
+ // A non-zero beta0 starts the accumulators at zero; otherwise they are loaded from C.
+ // NOTE(review): [betaptr] is passed as an input but never referenced in the asm text.
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ // Derive the row 1-3 pointers, build the predicates and preload all of B.
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ // Rebase b_ptr0 by 16 vectors so the remaining loads fit the -8..7 MUL VL immediate range.
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "ld1w z16.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "ld1w z17.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1w z18.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "ld1w z19.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "ld1w z20.s, p7/z, [%[b_ptr0]]\n"
+ "ld1w z21.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z22.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z23.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "ld1w z24.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "cbz %[loops], 1f\n"
+ // Main loop: four rows per iteration, accumulators z28-z31.
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ // The flags set here are consumed by the b.ne at the bottom of the loop;
+ // nothing in between writes the condition flags.
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z1.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[1]\n"
+ "fmla z31.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z14.s, z0.s[2]\n"
+ "fmla z29.s, z14.s, z1.s[2]\n"
+ "fmla z30.s, z14.s, z2.s[2]\n"
+ "fmla z31.s, z14.s, z3.s[2]\n"
+ "fmla z28.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z29.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
+ "fmla z30.s, z15.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
+ "fmla z28.s, z16.s, z0.s[0]\n"
+ "fmla z29.s, z16.s, z1.s[0]\n"
+ "fmla z30.s, z16.s, z2.s[0]\n"
+ "fmla z31.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z0.s[1]\n"
+ "fmla z29.s, z17.s, z1.s[1]\n"
+ "fmla z30.s, z17.s, z2.s[1]\n"
+ "fmla z31.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z0.s[2]\n"
+ "fmla z29.s, z18.s, z1.s[2]\n"
+ "fmla z30.s, z18.s, z2.s[2]\n"
+ "fmla z31.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
+ "fmla z29.s, z19.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n"
+ "fmla z30.s, z19.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n"
+ "fmla z31.s, z19.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "fmla z29.s, z20.s, z1.s[0]\n"
+ "fmla z30.s, z20.s, z2.s[0]\n"
+ "fmla z31.s, z20.s, z3.s[0]\n"
+ "fmla z28.s, z21.s, z0.s[1]\n"
+ "fmla z29.s, z21.s, z1.s[1]\n"
+ "fmla z30.s, z21.s, z2.s[1]\n"
+ "fmla z31.s, z21.s, z3.s[1]\n"
+ "fmla z28.s, z22.s, z0.s[2]\n"
+ "fmla z29.s, z22.s, z1.s[2]\n"
+ "fmla z30.s, z22.s, z2.s[2]\n"
+ "fmla z31.s, z22.s, z3.s[2]\n"
+ "fmla z28.s, z23.s, z0.s[3]\n"
+ // Final quad-word of each row is loaded under p6, so inactive lanes read as zero.
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x50]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z23.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x50]\n"
+ "fmla z30.s, z23.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x50]\n"
+ "fmla z31.s, z23.s, z3.s[3]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x50]\n"
+ "fmla z28.s, z24.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z24.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z24.s, z2.s[0]\n"
+ // z28 received its last accumulation three instructions earlier, so it can be stored now.
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "fmla z31.s, z24.s, z3.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ // Tail loop: one leftover row per iteration, accumulator z28 only.
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ // z0 and z1 are reloaded with the fifth and sixth quad-words once their first contents are consumed.
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [%[a_ptr0], #0x50]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "fmla z28.s, z13.s, z2.s[1]\n"
+ "fmla z28.s, z14.s, z2.s[2]\n"
+ "fmla z28.s, z15.s, z2.s[3]\n"
+ "fmla z28.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z3.s[3]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "fmla z28.s, z21.s, z0.s[1]\n"
+ "fmla z28.s, z22.s, z0.s[2]\n"
+ "fmla z28.s, z23.s, z0.s[3]\n"
+ "fmla z28.s, z24.s, z1.s[0]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ // p0, p6 and p7 are written by whilelt/ptrue above, so they are declared as clobbered
+ // alongside the scratch X registers and the Z registers.
+ : "x0", "x1", "x2", "x3", "x4", "x5", "p0", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 22:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "ld1w z16.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "ld1w z17.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1w z18.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "ld1w z19.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "ld1w z20.s, p7/z, [%[b_ptr0]]\n"
+ "ld1w z21.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z22.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z23.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "ld1w z24.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1w z25.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z1.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[1]\n"
+ "fmla z31.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z14.s, z0.s[2]\n"
+ "fmla z29.s, z14.s, z1.s[2]\n"
+ "fmla z30.s, z14.s, z2.s[2]\n"
+ "fmla z31.s, z14.s, z3.s[2]\n"
+ "fmla z28.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z29.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
+ "fmla z30.s, z15.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
+ "fmla z28.s, z16.s, z0.s[0]\n"
+ "fmla z29.s, z16.s, z1.s[0]\n"
+ "fmla z30.s, z16.s, z2.s[0]\n"
+ "fmla z31.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z0.s[1]\n"
+ "fmla z29.s, z17.s, z1.s[1]\n"
+ "fmla z30.s, z17.s, z2.s[1]\n"
+ "fmla z31.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z0.s[2]\n"
+ "fmla z29.s, z18.s, z1.s[2]\n"
+ "fmla z30.s, z18.s, z2.s[2]\n"
+ "fmla z31.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
+ "fmla z29.s, z19.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n"
+ "fmla z30.s, z19.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n"
+ "fmla z31.s, z19.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "fmla z29.s, z20.s, z1.s[0]\n"
+ "fmla z30.s, z20.s, z2.s[0]\n"
+ "fmla z31.s, z20.s, z3.s[0]\n"
+ "fmla z28.s, z21.s, z0.s[1]\n"
+ "fmla z29.s, z21.s, z1.s[1]\n"
+ "fmla z30.s, z21.s, z2.s[1]\n"
+ "fmla z31.s, z21.s, z3.s[1]\n"
+ "fmla z28.s, z22.s, z0.s[2]\n"
+ "fmla z29.s, z22.s, z1.s[2]\n"
+ "fmla z30.s, z22.s, z2.s[2]\n"
+ "fmla z31.s, z22.s, z3.s[2]\n"
+ "fmla z28.s, z23.s, z0.s[3]\n"
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x50]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z23.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x50]\n"
+ "fmla z30.s, z23.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x50]\n"
+ "fmla z31.s, z23.s, z3.s[3]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x50]\n"
+ "fmla z28.s, z24.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z24.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z24.s, z2.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z31.s, z24.s, z3.s[0]\n"
+ "fmla z28.s, z25.s, z0.s[1]\n"
+ "fmla z29.s, z25.s, z1.s[1]\n"
+ "fmla z30.s, z25.s, z2.s[1]\n"
+ "fmla z31.s, z25.s, z3.s[1]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [%[a_ptr0], #0x50]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "fmla z28.s, z13.s, z2.s[1]\n"
+ "fmla z28.s, z14.s, z2.s[2]\n"
+ "fmla z28.s, z15.s, z2.s[3]\n"
+ "fmla z28.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z3.s[3]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "fmla z28.s, z21.s, z0.s[1]\n"
+ "fmla z28.s, z22.s, z0.s[2]\n"
+ "fmla z28.s, z23.s, z0.s[3]\n"
+ "fmla z28.s, z24.s, z1.s[0]\n"
+ "fmla z28.s, z25.s, z1.s[1]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 23:
+ // Switch arm for a depth of 23: every output row accumulates 23 products,
+ // i.e. five full 4-float quads of A plus lanes 0-2 of a sixth quad, against
+ // the 23 B vectors preloaded into z4-z26.
+ //
+ // Operand roles as used by this asm body:
+ //   a_ptr0 / c_ptr0 : row 0 of A / C; advanced in place as rows are consumed.
+ //   b_ptr0          : start of the B vectors; left advanced by 16 vector lengths.
+ //   lda / ldc       : bound to ldab / ldcb and added directly to addresses, so
+ //                     they act as byte distances between consecutive rows.
+ //   loops           : number of 4-row iterations; oddrows: number of 1-row iterations.
+ //   beta0           : non-zero -> accumulators start at zero;
+ //                     zero     -> accumulators start from the current C values (unscaled).
+ //   width           : upper bound for predicate p0 (C loads and stores).
+ //   odd_depth       : upper bound for predicate p6 (final, partial A quad).
+ //   temp            : lower bound of both whilelt instructions (presumably 0 - TODO confirm).
+ //   betaptr         : passed in but never referenced by this asm body.
+ __asm __volatile (
+ // Scratch aliases for rows 1-3 of A and C (x0-x5, listed as clobbers below).
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ // Prologue: derive the row 1-3 pointers, build the predicates
+ // (p6: temp..odd_depth, p0: temp..width, p7: all lanes) and load the 23
+ // B vectors into z4-z26. The addvl re-bases b_ptr0 so that the
+ // "#imm, MUL VL" offsets stay within the -8..7 range.
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "ld1w z16.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "ld1w z17.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1w z18.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "ld1w z19.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "ld1w z20.s, p7/z, [%[b_ptr0]]\n"
+ "ld1w z21.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z22.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z23.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "ld1w z24.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1w z25.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z26.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ // Skip the 4-row loop entirely when there are no full groups of four rows.
+ "cbz %[loops], 1f\n"
+ // ---- Main loop: four rows of C per iteration, accumulated in z28-z31. ----
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ // The subs below sets the flags tested by the b.ne at the bottom of the
+ // loop; nothing in between is a flag-setting instruction.
+ // Each ld1rqw replicates one 16-byte quad of a row of A across the vector,
+ // and the indexed fmla picks one float of that quad per B vector.
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z1.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[1]\n"
+ "fmla z31.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z14.s, z0.s[2]\n"
+ "fmla z29.s, z14.s, z1.s[2]\n"
+ "fmla z30.s, z14.s, z2.s[2]\n"
+ "fmla z31.s, z14.s, z3.s[2]\n"
+ "fmla z28.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z29.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
+ "fmla z30.s, z15.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
+ "fmla z28.s, z16.s, z0.s[0]\n"
+ "fmla z29.s, z16.s, z1.s[0]\n"
+ "fmla z30.s, z16.s, z2.s[0]\n"
+ "fmla z31.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z0.s[1]\n"
+ "fmla z29.s, z17.s, z1.s[1]\n"
+ "fmla z30.s, z17.s, z2.s[1]\n"
+ "fmla z31.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z0.s[2]\n"
+ "fmla z29.s, z18.s, z1.s[2]\n"
+ "fmla z30.s, z18.s, z2.s[2]\n"
+ "fmla z31.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
+ "fmla z29.s, z19.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n"
+ "fmla z30.s, z19.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n"
+ "fmla z31.s, z19.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "fmla z29.s, z20.s, z1.s[0]\n"
+ "fmla z30.s, z20.s, z2.s[0]\n"
+ "fmla z31.s, z20.s, z3.s[0]\n"
+ "fmla z28.s, z21.s, z0.s[1]\n"
+ "fmla z29.s, z21.s, z1.s[1]\n"
+ "fmla z30.s, z21.s, z2.s[1]\n"
+ "fmla z31.s, z21.s, z3.s[1]\n"
+ "fmla z28.s, z22.s, z0.s[2]\n"
+ "fmla z29.s, z22.s, z1.s[2]\n"
+ "fmla z30.s, z22.s, z2.s[2]\n"
+ "fmla z31.s, z22.s, z3.s[2]\n"
+ "fmla z28.s, z23.s, z0.s[3]\n"
+ // Final quad (offset 0x50) is loaded under p6 rather than p7; only lanes
+ // 0-2 are consumed. The row pointers are stepped by 4 * lda in the gaps,
+ // after each pointer's last load of this iteration.
+ "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x50]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z23.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [a_ptr1, #0x50]\n"
+ "fmla z30.s, z23.s, z2.s[3]\n"
+ "ld1rqw z2.s, p6/z, [a_ptr2, #0x50]\n"
+ "fmla z31.s, z23.s, z3.s[3]\n"
+ "ld1rqw z3.s, p6/z, [a_ptr3, #0x50]\n"
+ "fmla z28.s, z24.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z24.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z24.s, z2.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z31.s, z24.s, z3.s[0]\n"
+ "fmla z28.s, z25.s, z0.s[1]\n"
+ "fmla z29.s, z25.s, z1.s[1]\n"
+ "fmla z30.s, z25.s, z2.s[1]\n"
+ "fmla z31.s, z25.s, z3.s[1]\n"
+ "fmla z28.s, z26.s, z0.s[2]\n"
+ "fmla z29.s, z26.s, z1.s[2]\n"
+ "fmla z30.s, z26.s, z2.s[2]\n"
+ "fmla z31.s, z26.s, z3.s[2]\n"
+ // Write the four result rows back under p0 and step each C pointer by 4 * ldc.
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ // ---- Remainder loop: one row of C per iteration, accumulated in z28. ----
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ // With a single row there are spare registers, so quads 0-3 are loaded into
+ // z0-z3 up front; z0 and z1 are then reused for quads 4 and 5 (the latter
+ // under p6). The subs flags are again consumed by the closing b.ne.
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p6/z, [%[a_ptr0], #0x50]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "fmla z28.s, z13.s, z2.s[1]\n"
+ "fmla z28.s, z14.s, z2.s[2]\n"
+ "fmla z28.s, z15.s, z2.s[3]\n"
+ "fmla z28.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z3.s[3]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "fmla z28.s, z21.s, z0.s[1]\n"
+ "fmla z28.s, z22.s, z0.s[2]\n"
+ "fmla z28.s, z23.s, z0.s[3]\n"
+ "fmla z28.s, z24.s, z1.s[0]\n"
+ "fmla z28.s, z25.s, z1.s[1]\n"
+ "fmla z28.s, z26.s, z1.s[2]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ // Release the aliases so later asm blocks can declare them again.
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ // p0, p6 and p7 are overwritten by the whilelt/ptrue instructions above, so
+ // they must be declared as clobbered alongside the x and z registers.
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "p0", "p6", "p7", "cc", "memory"
+ );
+ break;
+ default:
+ case 24:
+ // Switch arm for a depth of 24, also taken for any value without its own
+ // case label: every output row accumulates 24 products, i.e. six full
+ // 4-float quads of A, against the 24 B vectors preloaded into z4-z27.
+ //
+ // Operand roles as used by this asm body:
+ //   a_ptr0 / c_ptr0 : row 0 of A / C; advanced in place as rows are consumed.
+ //   b_ptr0          : start of the B vectors; left advanced by 16 vector lengths.
+ //   lda / ldc       : bound to ldab / ldcb and added directly to addresses, so
+ //                     they act as byte distances between consecutive rows.
+ //   loops           : number of 4-row iterations; oddrows: number of 1-row iterations.
+ //   beta0           : non-zero -> accumulators start at zero;
+ //                     zero     -> accumulators start from the current C values (unscaled).
+ //   width           : upper bound for predicate p0 (C loads and stores).
+ //   odd_depth       : upper bound for predicate p6, which this arm computes but
+ //                     never uses as a governing predicate.
+ //   temp            : lower bound of both whilelt instructions (presumably 0 - TODO confirm).
+ //   betaptr         : passed in but never referenced by this asm body.
+ __asm __volatile (
+ // Scratch aliases for rows 1-3 of A and C (x0-x5, listed as clobbers below).
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ // Prologue: derive the row 1-3 pointers, build the predicates
+ // (p6: temp..odd_depth, p0: temp..width, p7: all lanes) and load the 24
+ // B vectors into z4-z27. The addvl re-bases b_ptr0 so that the
+ // "#imm, MUL VL" offsets stay within the -8..7 range.
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.s, %[temp], %[odd_depth]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "ptrue p7.s\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "ld1w z4.s, p7/z, [%[b_ptr0]]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "ld1w z16.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "ld1w z17.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1w z18.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "ld1w z19.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "ld1w z20.s, p7/z, [%[b_ptr0]]\n"
+ "ld1w z21.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1w z22.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1w z23.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "ld1w z24.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1w z25.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1w z26.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1w z27.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ // Skip the 4-row loop entirely when there are no full groups of four rows.
+ "cbz %[loops], 1f\n"
+ // ---- Main loop: four rows of C per iteration, accumulated in z28-z31. ----
+ "2:\n"
+ "cbz %[beta0], 3f\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "b 4f\n"
+ "3:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z29.s, p0/z, [c_ptr1]\n"
+ "ld1w z30.s, p0/z, [c_ptr2]\n"
+ "ld1w z31.s, p0/z, [c_ptr3]\n"
+ "4:\n"
+ // The subs below sets the flags tested by the b.ne at the bottom of the
+ // loop; nothing in between is a flag-setting instruction.
+ // Each ld1rqw replicates one 16-byte quad of a row of A across the vector,
+ // and the indexed fmla picks one float of that quad per B vector.
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z29.s, z4.s, z1.s[0]\n"
+ "fmla z30.s, z4.s, z2.s[0]\n"
+ "fmla z31.s, z4.s, z3.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z29.s, z5.s, z1.s[1]\n"
+ "fmla z30.s, z5.s, z2.s[1]\n"
+ "fmla z31.s, z5.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z29.s, z6.s, z1.s[2]\n"
+ "fmla z30.s, z6.s, z2.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n"
+ "fmla z30.s, z7.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n"
+ "fmla z31.s, z7.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n"
+ "fmla z28.s, z8.s, z0.s[0]\n"
+ "fmla z29.s, z8.s, z1.s[0]\n"
+ "fmla z30.s, z8.s, z2.s[0]\n"
+ "fmla z31.s, z8.s, z3.s[0]\n"
+ "fmla z28.s, z9.s, z0.s[1]\n"
+ "fmla z29.s, z9.s, z1.s[1]\n"
+ "fmla z30.s, z9.s, z2.s[1]\n"
+ "fmla z31.s, z9.s, z3.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[2]\n"
+ "fmla z29.s, z10.s, z1.s[2]\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z11.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "fmla z29.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n"
+ "fmla z30.s, z11.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n"
+ "fmla z28.s, z12.s, z0.s[0]\n"
+ "fmla z29.s, z12.s, z1.s[0]\n"
+ "fmla z30.s, z12.s, z2.s[0]\n"
+ "fmla z31.s, z12.s, z3.s[0]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z1.s[1]\n"
+ "fmla z30.s, z13.s, z2.s[1]\n"
+ "fmla z31.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z14.s, z0.s[2]\n"
+ "fmla z29.s, z14.s, z1.s[2]\n"
+ "fmla z30.s, z14.s, z2.s[2]\n"
+ "fmla z31.s, z14.s, z3.s[2]\n"
+ "fmla z28.s, z15.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z29.s, z15.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n"
+ "fmla z30.s, z15.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n"
+ "fmla z31.s, z15.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n"
+ "fmla z28.s, z16.s, z0.s[0]\n"
+ "fmla z29.s, z16.s, z1.s[0]\n"
+ "fmla z30.s, z16.s, z2.s[0]\n"
+ "fmla z31.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z0.s[1]\n"
+ "fmla z29.s, z17.s, z1.s[1]\n"
+ "fmla z30.s, z17.s, z2.s[1]\n"
+ "fmla z31.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z0.s[2]\n"
+ "fmla z29.s, z18.s, z1.s[2]\n"
+ "fmla z30.s, z18.s, z2.s[2]\n"
+ "fmla z31.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
+ "fmla z29.s, z19.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n"
+ "fmla z30.s, z19.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n"
+ "fmla z31.s, z19.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "fmla z29.s, z20.s, z1.s[0]\n"
+ "fmla z30.s, z20.s, z2.s[0]\n"
+ "fmla z31.s, z20.s, z3.s[0]\n"
+ "fmla z28.s, z21.s, z0.s[1]\n"
+ "fmla z29.s, z21.s, z1.s[1]\n"
+ "fmla z30.s, z21.s, z2.s[1]\n"
+ "fmla z31.s, z21.s, z3.s[1]\n"
+ "fmla z28.s, z22.s, z0.s[2]\n"
+ "fmla z29.s, z22.s, z1.s[2]\n"
+ "fmla z30.s, z22.s, z2.s[2]\n"
+ "fmla z31.s, z22.s, z3.s[2]\n"
+ "fmla z28.s, z23.s, z0.s[3]\n"
+ // Sixth quad (offset 0x50) is loaded under the all-true p7 and all four
+ // lanes are consumed. The row pointers are stepped by 4 * lda in the gaps,
+ // after each pointer's last load of this iteration.
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x50]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n"
+ "fmla z29.s, z23.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [a_ptr1, #0x50]\n"
+ "fmla z30.s, z23.s, z2.s[3]\n"
+ "ld1rqw z2.s, p7/z, [a_ptr2, #0x50]\n"
+ "fmla z31.s, z23.s, z3.s[3]\n"
+ "ld1rqw z3.s, p7/z, [a_ptr3, #0x50]\n"
+ "fmla z28.s, z24.s, z0.s[0]\n"
+ "add a_ptr1, a_ptr1, %[lda], LSL #2\n"
+ "fmla z29.s, z24.s, z1.s[0]\n"
+ "add a_ptr2, a_ptr2, %[lda], LSL #2\n"
+ "fmla z30.s, z24.s, z2.s[0]\n"
+ "add a_ptr3, a_ptr3, %[lda], LSL #2\n"
+ "fmla z31.s, z24.s, z3.s[0]\n"
+ "fmla z28.s, z25.s, z0.s[1]\n"
+ "fmla z29.s, z25.s, z1.s[1]\n"
+ "fmla z30.s, z25.s, z2.s[1]\n"
+ "fmla z31.s, z25.s, z3.s[1]\n"
+ "fmla z28.s, z26.s, z0.s[2]\n"
+ "fmla z29.s, z26.s, z1.s[2]\n"
+ "fmla z30.s, z26.s, z2.s[2]\n"
+ "fmla z31.s, z26.s, z3.s[2]\n"
+ "fmla z28.s, z27.s, z0.s[3]\n"
+ "fmla z29.s, z27.s, z1.s[3]\n"
+ "fmla z30.s, z27.s, z2.s[3]\n"
+ "fmla z31.s, z27.s, z3.s[3]\n"
+ // Write the four result rows back under p0 and step each C pointer by 4 * ldc.
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n"
+ "st1w z29.s, p0, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, %[ldc], LSL #2\n"
+ "st1w z30.s, p0, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, %[ldc], LSL #2\n"
+ "st1w z31.s, p0, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, %[ldc], LSL #2\n"
+ "b.ne 2b\n"
+ // ---- Remainder loop: one row of C per iteration, accumulated in z28. ----
+ "1:\n"
+ "cbz %[oddrows], 5f\n"
+ "6:\n"
+ "cbz %[beta0], 7f\n"
+ "mov z28.s, #0\n"
+ "b 8f\n"
+ "7:\n"
+ "ld1w z28.s, p0/z, [%[c_ptr0]]\n"
+ "8:\n"
+ // With a single row there are spare registers, so quads 0-3 are loaded into
+ // z0-z3 up front; z0 and z1 are then reused for quads 4 and 5.
+ // The subs flags are again consumed by the closing b.ne.
+ "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+ "subs %[oddrows], %[oddrows], #0x1\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n"
+ "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n"
+ "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n"
+ "fmla z28.s, z4.s, z0.s[0]\n"
+ "fmla z28.s, z5.s, z0.s[1]\n"
+ "fmla z28.s, z6.s, z0.s[2]\n"
+ "fmla z28.s, z7.s, z0.s[3]\n"
+ "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n"
+ "fmla z28.s, z8.s, z1.s[0]\n"
+ "fmla z28.s, z9.s, z1.s[1]\n"
+ "fmla z28.s, z10.s, z1.s[2]\n"
+ "fmla z28.s, z11.s, z1.s[3]\n"
+ "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x50]\n"
+ "add %[a_ptr0], %[a_ptr0], %[lda]\n"
+ "fmla z28.s, z12.s, z2.s[0]\n"
+ "fmla z28.s, z13.s, z2.s[1]\n"
+ "fmla z28.s, z14.s, z2.s[2]\n"
+ "fmla z28.s, z15.s, z2.s[3]\n"
+ "fmla z28.s, z16.s, z3.s[0]\n"
+ "fmla z28.s, z17.s, z3.s[1]\n"
+ "fmla z28.s, z18.s, z3.s[2]\n"
+ "fmla z28.s, z19.s, z3.s[3]\n"
+ "fmla z28.s, z20.s, z0.s[0]\n"
+ "fmla z28.s, z21.s, z0.s[1]\n"
+ "fmla z28.s, z22.s, z0.s[2]\n"
+ "fmla z28.s, z23.s, z0.s[3]\n"
+ "fmla z28.s, z24.s, z1.s[0]\n"
+ "fmla z28.s, z25.s, z1.s[1]\n"
+ "fmla z28.s, z26.s, z1.s[2]\n"
+ "fmla z28.s, z27.s, z1.s[3]\n"
+ "st1w z28.s, p0, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], %[ldc]\n"
+ "b.ne 6b\n"
+ "5:\n"
+ // Release the aliases so later asm blocks can declare them again.
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ // p0, p6 and p7 are overwritten by the whilelt/ptrue instructions above
+ // (p6 even though it is not used afterwards), so they must be declared as
+ // clobbered alongside the x and z registers.
+ : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "p0", "p6", "p7", "cc", "memory"
+ );
+ break;
+ }
+ }
+}
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE