From 7cd26d4a1b14bc4bf7c61496803416ab3d84791f Mon Sep 17 00:00:00 2001 From: Georgios Pinitas Date: Wed, 9 Jan 2019 18:35:17 +0000 Subject: COMPMID-1867: Add NEON/SVE GEMM Hybrid kernels. Change-Id: Ib40a9921e7f9a6a8be6c38872d6b3a0f24ed0cd3 Reviewed-on: https://review.mlplatform.org/515 Reviewed-by: Anthony Barbier Tested-by: Arm Jenkins --- .../arm_gemm/kernels/a64_hgemm_24x8/generic.cpp | 6 +- .../a64_sgemm_nativeA_pretransposeB_16x4.hpp | 78 + .../generic.cpp | 970 ++++ .../arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4.hpp | 74 + .../kernels/sve_hybrid_fp32_mla_4VLx4/generic.cpp | 2005 +++++++++ .../kernels/sve_interleaved_fp16_mla_3VLx8.hpp | 4 +- .../sve_interleaved_fp16_mla_3VLx8/generic.cpp | 48 +- .../kernels/sve_interleaved_fp32_mla_3VLx8.hpp | 4 +- .../sve_interleaved_fp32_mla_3VLx8/generic.cpp | 46 +- .../kernels/sve_interleaved_s8s32_dot_3VLx8.hpp | 4 +- .../sve_interleaved_s8s32_dot_3VLx8/generic.cpp | 46 +- .../kernels/sve_interleaved_u8u32_dot_3VLx8.hpp | 4 +- .../arm_gemm/kernels/sve_native_fp32_mla_4VLx4.hpp | 73 + .../kernels/sve_native_fp32_mla_4VLx4/generic.cpp | 2066 +++++++++ .../kernels/sve_native_s8s32_dot_4VLx4.hpp | 73 + .../kernels/sve_native_s8s32_dot_4VLx4/generic.cpp | 4632 ++++++++++++++++++++ .../kernels/sve_native_u8u32_dot_4VLx4.hpp | 74 + .../kernels/sve_native_u8u32_dot_4VLx4/generic.cpp | 4632 ++++++++++++++++++++ .../arm_gemm/kernels/sve_smallK_fp32_mla_1VLx4.hpp | 73 + .../kernels/sve_smallK_fp32_mla_1VLx4/generic.cpp | 4264 ++++++++++++++++++ .../kernels/sve_smallK_hybrid_fp32_mla_1VLx4.hpp | 73 + .../sve_smallK_hybrid_fp32_mla_1VLx4/generic.cpp | 4004 +++++++++++++++++ 22 files changed, 23163 insertions(+), 90 deletions(-) create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_nativeA_pretransposeB_16x4.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_nativeA_pretransposeB_16x4/generic.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4.hpp create mode 
100644 src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4/generic.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4/generic.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4/generic.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4/generic.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_fp32_mla_1VLx4.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_fp32_mla_1VLx4/generic.cpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx4.hpp create mode 100644 src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx4/generic.cpp (limited to 'src/core/NEON/kernels/arm_gemm/kernels') diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/generic.cpp index 418a375a61..4ad38cbf62 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2019 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -32,9 +32,9 @@ // Kernel implementation. // // Assume that "Apanel" points to a chunk of A blocks (each size 8xK) in read-order. -// Assume that "Bpanel" points to a chunk of B blocks (each size 12xK) in read-order. +// Assume that "Bpanel" points to a chunk of B blocks (each size 24xK) in read-order. // Assume that "Cpanel" points to a chunk of C output blocks (each size -// 12x8), the chunks being arranged in a row major fashion. 
+// 24x8), the chunks being arranged in a row major fashion. // // Note that the intent of this is that either ablocks or bblocks will be 1 // - this construction allows the output loop to proceed in either order. diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_nativeA_pretransposeB_16x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_nativeA_pretransposeB_16x4.hpp new file mode 100644 index 0000000000..0c387ff6df --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_nativeA_pretransposeB_16x4.hpp @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#pragma once + +#ifdef __aarch64__ + +namespace arm_gemm { + +// Actual kernel implementations +void a64_sgemm_nativeA_pretransposeB_16x4(const float *, int, const float *, float *, int, float, unsigned int, unsigned int, unsigned int); + +// Native A/Pretranspose B SGEMM "strategy" class. +// +// This describes the characteristics of a family of kernels, in terms of +// the required interleave properties and the output block size. +// +// All kernels in the family must share these characteristics. The actual +// kernel to be used can be chosen at runtime, based on the CPUInfo +// structure. +class sgemm_nativeA_pretransposeB_16x4 { +public: + typedef float operand_type; + typedef float result_type; + + typedef void (*kern_type)(const float *, int, const float *, float *, int, float, unsigned int, unsigned int, unsigned int); + + /* Desired data layout for B buffer (used for pretranspose) */ + static const int B_interleave = 16; + static const int B_block = 1; + static const bool B_transpose = true; + + /* Kernel blocking parameters */ + static int out_width() { + return 16; + } + + static int out_height() { + return 4; + } + + static int k_unroll() { + return 1; + } + + StdTransformsFixed transforms = {}; + + // Default to the generic kernel + kern_type kernel=a64_sgemm_nativeA_pretransposeB_16x4; + + sgemm_nativeA_pretransposeB_16x4(const CPUInfo *ci) { + + } +}; + +} // namespace arm_gemm + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_nativeA_pretransposeB_16x4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_nativeA_pretransposeB_16x4/generic.cpp new file mode 100644 index 0000000000..b2516f8797 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_nativeA_pretransposeB_16x4/generic.cpp @@ -0,0 +1,970 @@ +/* + * Copyright (c) 2019 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifdef __aarch64__ + +#include "../../asmlib.hpp" +#include "../../utils.hpp" + +#include +#include +#include + +#include + +namespace arm_gemm { + +void a64_sgemm_nativeA_pretransposeB_16x4(const float *A, int lda, const float *B_panel, float *C, int ldc, float beta, unsigned int numrows, unsigned int numcols, unsigned int K) { + const bool oddk = ((K % 8) >= 4); + const bool beta0 = (beta == 0.0f); + const unsigned int oddones = (K % 4); + + /* Use some small temporary arrays to cope with "ragged" M/N sizes. + * + * "dummy_A_buf" is used to avoid overreading the A input for ragged M, + * and also for output if N is not ragged. + * + * Since the B input is pretransposed it will be padded as needed, so no + * need to worry about overreading that. + * + * "C_buf" is used to avoid overreading or overwriting the output for + * ragged N cases. 
+ */ + float dummy_A_buf[16]; + float C_buf[64]; + + std::memset(dummy_A_buf, 0, sizeof(dummy_A_buf)); + std::memset(C_buf, 0, sizeof(C_buf)); + + for (unsigned int y=0; y 1) ? 32 : 0; + const unsigned long a_incr2 = (active_rows > 2) ? 32 : 0; + const unsigned long a_incr3 = (active_rows > 3) ? 32 : 0; + + /* Starting points for A pointers on this loop */ + const float * const a_ptr0_base = A + (y * lda); + const float * const a_ptr1_base = (active_rows > 1) ? (a_ptr0_base + lda) : dummy_A_buf; + const float * const a_ptr2_base = (active_rows > 2) ? (a_ptr1_base + lda) : dummy_A_buf; + const float * const a_ptr3_base = (active_rows > 3) ? (a_ptr2_base + lda) : dummy_A_buf; + + /* Starting points for C pointers on this loop */ + float *c_ptr0 = C + (y * ldc); + float *c_ptr1 = (active_rows > 1) ? (c_ptr0 + ldc) : dummy_A_buf; + float *c_ptr2 = (active_rows > 2) ? (c_ptr1 + ldc) : dummy_A_buf; + float *c_ptr3 = (active_rows > 3) ? (c_ptr2 + ldc) : dummy_A_buf; + + for (unsigned int x0=0; x0() * 4; + } + + static int k_unroll() + { + return 1; + } + + StdTransformsSVE transforms = {}; + + // Default to the generic kernel + kern_type kernel=sve_hybrid_fp32_mla_4VLx4; + + hybrid_fp32_mla_4VLx4(const CPUInfo *ci) + { + + } +}; + +} // namespace arm_gemm + +#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4/generic.cpp new file mode 100644 index 0000000000..b8aa8252d1 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4/generic.cpp @@ -0,0 +1,2005 @@ +/* + * Copyright (c) 2019 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifdef __ARM_FEATURE_SVE + +#include + + +#include "../../asmlib.hpp" +#include "../../utils.hpp" + +namespace arm_gemm { + +void sve_hybrid_fp32_mla_4VLx4(const float *A, int lda, const float *B, float *C, int ldc, float beta, int M, int N, int K) { + const long beta0 = (beta == 0.0f); + const int K_stride = K; + const long loops_count = ((K + 4) / 8) - 1; + K -= loops_count * 8; + const long regs_count = (K / 4) - 1; + K -= (regs_count + 1) * 4; + const long leftovers = K; + + for (int y=0; y())) { + const long width = std::min((unsigned long)N-x0, (4 * get_vector_length())); + const float *betaptr = β + long loops = loops_count; + long regs = regs_count; + long temp = 0; + long blocks = leftovers; + const float *a_ptr0 = a_ptr0_base; + const float *b_ptr0 = B + (K_stride * x0); + + switch(M-y) { + case 1: + __asm __volatile ( + "whilelt p6.s, %[temp], %[leftovers]\n" + "whilelt p0.s, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "ptrue p7.s\n" + "whilelt p1.s, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "whilelt p2.s, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "whilelt p3.s, %[temp], %[width]\n" + "cbz %[beta0], 1f\n" + "mov z16.s, #0\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "mov z17.s, #0\n" + "ld1w z8.s, p7/z, [%[b_ptr0]]\n" + "mov z18.s, #0\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "mov z19.s, #0\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "b 2f\n" + "1:\n" + "ld1rw z15.s, p7/z, [%[betaptr]]\n" + "ld1w z16.s, p0/z, [%[c_ptr0]]\n" + "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" + "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n" + "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "fmul z16.s, p7/m, z16.s, z15.s\n" + "ld1w z8.s, p7/z, [%[b_ptr0]]\n" + "fmul z17.s, p7/m, z17.s, z15.s\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmul z18.s, p7/m, 
z18.s, z15.s\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmul z19.s, p7/m, z19.s, z15.s\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "2:\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "cbz %[loops], 3f\n" + "4:\n" + "fmla z16.s, z8.s, z0.s[0]\n" + "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" + "fmla z17.s, z9.s, z0.s[0]\n" + "ld1w z8.s, p7/z, [%[b_ptr0]]\n" + "fmla z18.s, z10.s, z0.s[0]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z19.s, z11.s, z0.s[0]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "add %[a_ptr0], %[a_ptr0], #0x20\n" + "fmla z16.s, z12.s, z0.s[1]\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z17.s, z13.s, z0.s[1]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z18.s, z14.s, z0.s[1]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmla z19.s, z15.s, z0.s[1]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "fmla z16.s, z8.s, z0.s[2]\n" + "subs %[loops], %[loops], #0x1\n" + "fmla z17.s, z9.s, z0.s[2]\n" + "fmla z18.s, z10.s, z0.s[2]\n" + "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + "fmla z19.s, z11.s, z0.s[2]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + "fmla z16.s, z12.s, z0.s[3]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + "fmla z17.s, z13.s, z0.s[3]\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" + "fmla z18.s, z14.s, z0.s[3]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" + "fmla z19.s, z15.s, z0.s[3]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" + "fmla z16.s, z8.s, z4.s[0]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n" + "fmla z17.s, z9.s, z4.s[0]\n" + "ld1w z8.s, p7/z, [%[b_ptr0]]\n" + 
"fmla z18.s, z10.s, z4.s[0]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z19.s, z11.s, z4.s[0]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z16.s, z12.s, z4.s[1]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z17.s, z13.s, z4.s[1]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z18.s, z14.s, z4.s[1]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmla z19.s, z15.s, z4.s[1]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "fmla z16.s, z8.s, z4.s[2]\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "fmla z17.s, z9.s, z4.s[2]\n" + "fmla z18.s, z10.s, z4.s[2]\n" + "fmla z19.s, z11.s, z4.s[2]\n" + "fmla z16.s, z12.s, z4.s[3]\n" + "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + "fmla z17.s, z13.s, z4.s[3]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + "fmla z18.s, z14.s, z4.s[3]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + "fmla z19.s, z15.s, z4.s[3]\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" + "b.ne 4b\n" + "3:\n" + "cbz %[regs], 5f\n" + "fmla z16.s, z8.s, z0.s[0]\n" + "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" + "fmla z17.s, z9.s, z0.s[0]\n" + "ld1w z8.s, p7/z, [%[b_ptr0]]\n" + "fmla z18.s, z10.s, z0.s[0]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z19.s, z11.s, z0.s[0]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z12.s, z0.s[1]\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z17.s, z13.s, z0.s[1]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z18.s, z14.s, z0.s[1]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmla z19.s, z15.s, z0.s[1]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "fmla z16.s, z8.s, 
z0.s[2]\n" + "fmla z17.s, z9.s, z0.s[2]\n" + "fmla z18.s, z10.s, z0.s[2]\n" + "fmla z19.s, z11.s, z0.s[2]\n" + "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + "fmla z16.s, z12.s, z0.s[3]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + "fmla z17.s, z13.s, z0.s[3]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + "fmla z18.s, z14.s, z0.s[3]\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" + "fmla z19.s, z15.s, z0.s[3]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" + "fmla z16.s, z8.s, z4.s[0]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" + "fmla z17.s, z9.s, z4.s[0]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" + "fmla z18.s, z10.s, z4.s[0]\n" + "ld1w z8.s, p7/z, [%[b_ptr0]]\n" + "fmla z19.s, z11.s, z4.s[0]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z16.s, z12.s, z4.s[1]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z17.s, z13.s, z4.s[1]\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z18.s, z14.s, z4.s[1]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z19.s, z15.s, z4.s[1]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "fmla z16.s, z8.s, z4.s[2]\n" + "fmla z17.s, z9.s, z4.s[2]\n" + "fmla z18.s, z10.s, z4.s[2]\n" + "fmla z19.s, z11.s, z4.s[2]\n" + "fmla z16.s, z12.s, z4.s[3]\n" + "fmla z17.s, z13.s, z4.s[3]\n" + "fmla z18.s, z14.s, z4.s[3]\n" + "fmla z19.s, z15.s, z4.s[3]\n" + "cbz %[blocks], 6f\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + "fmla z16.s, z8.s, z0.s[0]\n" + "fmla z17.s, z9.s, z0.s[0]\n" + "fmla z18.s, z10.s, z0.s[0]\n" + "fmla z19.s, z11.s, z0.s[0]\n" + 
"b.eq 6f\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" + "fmla z16.s, z12.s, z0.s[1]\n" + "fmla z17.s, z13.s, z0.s[1]\n" + "fmla z18.s, z14.s, z0.s[1]\n" + "fmla z19.s, z15.s, z0.s[1]\n" + "b.eq 6f\n" + "ld1w z8.s, p7/z, [%[b_ptr0]]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z8.s, z0.s[2]\n" + "fmla z17.s, z9.s, z0.s[2]\n" + "fmla z18.s, z10.s, z0.s[2]\n" + "fmla z19.s, z11.s, z0.s[2]\n" + "b 6f\n" + "5:\n" + "fmla z16.s, z8.s, z0.s[0]\n" + "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n" + "fmla z17.s, z9.s, z0.s[0]\n" + "ld1w z8.s, p7/z, [%[b_ptr0]]\n" + "fmla z18.s, z10.s, z0.s[0]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z19.s, z11.s, z0.s[0]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z12.s, z0.s[1]\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z17.s, z13.s, z0.s[1]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z18.s, z14.s, z0.s[1]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmla z19.s, z15.s, z0.s[1]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "fmla z16.s, z8.s, z0.s[2]\n" + "fmla z17.s, z9.s, z0.s[2]\n" + "fmla z18.s, z10.s, z0.s[2]\n" + "fmla z19.s, z11.s, z0.s[2]\n" + "fmla z16.s, z12.s, z0.s[3]\n" + "fmla z17.s, z13.s, z0.s[3]\n" + "fmla z18.s, z14.s, z0.s[3]\n" + "fmla z19.s, z15.s, z0.s[3]\n" + "cbz %[blocks], 6f\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + "fmla z16.s, z8.s, z4.s[0]\n" + "fmla z17.s, z9.s, 
z4.s[0]\n" + "fmla z18.s, z10.s, z4.s[0]\n" + "fmla z19.s, z11.s, z4.s[0]\n" + "b.eq 6f\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" + "fmla z16.s, z12.s, z4.s[1]\n" + "fmla z17.s, z13.s, z4.s[1]\n" + "fmla z18.s, z14.s, z4.s[1]\n" + "fmla z19.s, z15.s, z4.s[1]\n" + "b.eq 6f\n" + "ld1w z8.s, p7/z, [%[b_ptr0]]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z8.s, z4.s[2]\n" + "fmla z17.s, z9.s, z4.s[2]\n" + "fmla z18.s, z10.s, z4.s[2]\n" + "fmla z19.s, z11.s, z4.s[2]\n" + "6:\n" + "st1w z16.s, p0, [%[c_ptr0]]\n" + "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n" + "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n" + "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n" + "addvl %[c_ptr0], %[c_ptr0], #4\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb) + : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 2: + __asm __volatile ( + "a_ptr1 .req X0\n" + "c_ptr1 .req X1\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[leftovers]\n" + "whilelt p0.s, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "ptrue p7.s\n" + "whilelt p1.s, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "whilelt p2.s, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "whilelt p3.s, %[temp], %[width]\n" + "cbz 
%[beta0], 1f\n" + "mov z16.s, #0\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "mov z17.s, #0\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "mov z18.s, #0\n" + "ld1w z8.s, p7/z, [%[b_ptr0]]\n" + "mov z19.s, #0\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "mov z20.s, #0\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "mov z21.s, #0\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "mov z22.s, #0\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "mov z23.s, #0\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "add a_ptr1, a_ptr1, #0x10\n" + "b 2f\n" + "1:\n" + "ld1rw z15.s, p7/z, [%[betaptr]]\n" + "ld1w z16.s, p0/z, [%[c_ptr0]]\n" + "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" + "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n" + "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n" + "ld1w z20.s, p0/z, [c_ptr1]\n" + "fmul z16.s, p7/m, z16.s, z15.s\n" + "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n" + "fmul z17.s, p7/m, z17.s, z15.s\n" + "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n" + "fmul z18.s, p7/m, z18.s, z15.s\n" + "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n" + "fmul z19.s, p7/m, z19.s, z15.s\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "fmul z20.s, p7/m, z20.s, z15.s\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "fmul z21.s, p7/m, z21.s, z15.s\n" + "ld1w z8.s, p7/z, [%[b_ptr0]]\n" + "fmul z22.s, p7/m, z22.s, z15.s\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmul z23.s, p7/m, z23.s, z15.s\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "add a_ptr1, a_ptr1, #0x10\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "2:\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "cbz %[loops], 3f\n" + "4:\n" + "fmla z16.s, z8.s, 
z0.s[0]\n" + "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" + "fmla z20.s, z8.s, z1.s[0]\n" + "ld1rqw z5.s, p7/z, [a_ptr1]\n" + "fmla z17.s, z9.s, z0.s[0]\n" + "ld1w z8.s, p7/z, [%[b_ptr0]]\n" + "fmla z21.s, z9.s, z1.s[0]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z18.s, z10.s, z0.s[0]\n" + "add %[a_ptr0], %[a_ptr0], #0x20\n" + "fmla z22.s, z10.s, z1.s[0]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.s, z11.s, z0.s[0]\n" + "add a_ptr1, a_ptr1, #0x20\n" + "fmla z23.s, z11.s, z1.s[0]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z12.s, z0.s[1]\n" + "subs %[loops], %[loops], #0x1\n" + "fmla z20.s, z12.s, z1.s[1]\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z17.s, z13.s, z0.s[1]\n" + "fmla z21.s, z13.s, z1.s[1]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z18.s, z14.s, z0.s[1]\n" + "fmla z22.s, z14.s, z1.s[1]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmla z19.s, z15.s, z0.s[1]\n" + "fmla z23.s, z15.s, z1.s[1]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "fmla z16.s, z8.s, z0.s[2]\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "fmla z20.s, z8.s, z1.s[2]\n" + "fmla z17.s, z9.s, z0.s[2]\n" + "fmla z21.s, z9.s, z1.s[2]\n" + "fmla z18.s, z10.s, z0.s[2]\n" + "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + "fmla z22.s, z10.s, z1.s[2]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + "fmla z19.s, z11.s, z0.s[2]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + "fmla z23.s, z11.s, z1.s[2]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + "fmla z16.s, z12.s, z0.s[3]\n" + "fmla z20.s, z12.s, z1.s[3]\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" + "fmla z17.s, z13.s, z0.s[3]\n" + "fmla z21.s, z13.s, z1.s[3]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" + "fmla z18.s, z14.s, z0.s[3]\n" + "fmla z22.s, z14.s, z1.s[3]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" + "fmla z19.s, z15.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n" + "fmla z23.s, z15.s, z1.s[3]\n" + 
"ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" + "fmla z16.s, z8.s, z4.s[0]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n" + "fmla z20.s, z8.s, z5.s[0]\n" + "ld1w z8.s, p7/z, [%[b_ptr0]]\n" + "fmla z17.s, z9.s, z4.s[0]\n" + "fmla z21.s, z9.s, z5.s[0]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z18.s, z10.s, z4.s[0]\n" + "fmla z22.s, z10.s, z5.s[0]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.s, z11.s, z4.s[0]\n" + "fmla z23.s, z11.s, z5.s[0]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z12.s, z4.s[1]\n" + "fmla z20.s, z12.s, z5.s[1]\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z17.s, z13.s, z4.s[1]\n" + "fmla z21.s, z13.s, z5.s[1]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z18.s, z14.s, z4.s[1]\n" + "fmla z22.s, z14.s, z5.s[1]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmla z19.s, z15.s, z4.s[1]\n" + "fmla z23.s, z15.s, z5.s[1]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "fmla z16.s, z8.s, z4.s[2]\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "fmla z20.s, z8.s, z5.s[2]\n" + "fmla z17.s, z9.s, z4.s[2]\n" + "fmla z21.s, z9.s, z5.s[2]\n" + "fmla z18.s, z10.s, z4.s[2]\n" + "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + "fmla z22.s, z10.s, z5.s[2]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + "fmla z19.s, z11.s, z4.s[2]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + "fmla z23.s, z11.s, z5.s[2]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + "fmla z16.s, z12.s, z4.s[3]\n" + "fmla z20.s, z12.s, z5.s[3]\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" + "fmla z17.s, z13.s, z4.s[3]\n" + "fmla z21.s, z13.s, z5.s[3]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" + "fmla z18.s, z14.s, z4.s[3]\n" + "fmla z22.s, z14.s, z5.s[3]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" + "fmla z19.s, z15.s, z4.s[3]\n" + "fmla z23.s, z15.s, z5.s[3]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" + "b.ne 4b\n" + "3:\n" + "cbz %[regs], 5f\n" + "fmla z16.s, 
z8.s, z0.s[0]\n" + "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" + "fmla z20.s, z8.s, z1.s[0]\n" + "ld1rqw z5.s, p7/z, [a_ptr1]\n" + "fmla z17.s, z9.s, z0.s[0]\n" + "ld1w z8.s, p7/z, [%[b_ptr0]]\n" + "fmla z21.s, z9.s, z1.s[0]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z18.s, z10.s, z0.s[0]\n" + "fmla z22.s, z10.s, z1.s[0]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.s, z11.s, z0.s[0]\n" + "fmla z23.s, z11.s, z1.s[0]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z12.s, z0.s[1]\n" + "fmla z20.s, z12.s, z1.s[1]\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z17.s, z13.s, z0.s[1]\n" + "fmla z21.s, z13.s, z1.s[1]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z18.s, z14.s, z0.s[1]\n" + "fmla z22.s, z14.s, z1.s[1]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmla z19.s, z15.s, z0.s[1]\n" + "fmla z23.s, z15.s, z1.s[1]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "fmla z16.s, z8.s, z0.s[2]\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "fmla z20.s, z8.s, z1.s[2]\n" + "fmla z17.s, z9.s, z0.s[2]\n" + "fmla z21.s, z9.s, z1.s[2]\n" + "fmla z18.s, z10.s, z0.s[2]\n" + "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + "fmla z22.s, z10.s, z1.s[2]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + "fmla z19.s, z11.s, z0.s[2]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + "fmla z23.s, z11.s, z1.s[2]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + "fmla z16.s, z12.s, z0.s[3]\n" + "fmla z20.s, z12.s, z1.s[3]\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" + "fmla z17.s, z13.s, z0.s[3]\n" + "fmla z21.s, z13.s, z1.s[3]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" + "fmla z18.s, z14.s, z0.s[3]\n" + "fmla z22.s, z14.s, z1.s[3]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" + "fmla z19.s, z15.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" + "fmla z23.s, z15.s, z1.s[3]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" + "fmla z16.s, z8.s, z4.s[0]\n" + "ld1rqw z1.s, p6/z, 
[a_ptr1, #0x10]\n" + "fmla z20.s, z8.s, z5.s[0]\n" + "ld1w z8.s, p7/z, [%[b_ptr0]]\n" + "fmla z17.s, z9.s, z4.s[0]\n" + "fmla z21.s, z9.s, z5.s[0]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z18.s, z10.s, z4.s[0]\n" + "fmla z22.s, z10.s, z5.s[0]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.s, z11.s, z4.s[0]\n" + "fmla z23.s, z11.s, z5.s[0]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z12.s, z4.s[1]\n" + "fmla z20.s, z12.s, z5.s[1]\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z17.s, z13.s, z4.s[1]\n" + "fmla z21.s, z13.s, z5.s[1]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z18.s, z14.s, z4.s[1]\n" + "fmla z22.s, z14.s, z5.s[1]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmla z19.s, z15.s, z4.s[1]\n" + "fmla z23.s, z15.s, z5.s[1]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "fmla z16.s, z8.s, z4.s[2]\n" + "fmla z20.s, z8.s, z5.s[2]\n" + "fmla z17.s, z9.s, z4.s[2]\n" + "fmla z21.s, z9.s, z5.s[2]\n" + "fmla z18.s, z10.s, z4.s[2]\n" + "fmla z22.s, z10.s, z5.s[2]\n" + "fmla z19.s, z11.s, z4.s[2]\n" + "fmla z23.s, z11.s, z5.s[2]\n" + "fmla z16.s, z12.s, z4.s[3]\n" + "fmla z20.s, z12.s, z5.s[3]\n" + "fmla z17.s, z13.s, z4.s[3]\n" + "fmla z21.s, z13.s, z5.s[3]\n" + "fmla z18.s, z14.s, z4.s[3]\n" + "fmla z22.s, z14.s, z5.s[3]\n" + "fmla z19.s, z15.s, z4.s[3]\n" + "fmla z23.s, z15.s, z5.s[3]\n" + "cbz %[blocks], 6f\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + "fmla z16.s, z8.s, z0.s[0]\n" + "fmla z20.s, z8.s, z1.s[0]\n" + "fmla z17.s, z9.s, z0.s[0]\n" + "fmla z21.s, z9.s, z1.s[0]\n" + "fmla z18.s, z10.s, z0.s[0]\n" + "fmla z22.s, z10.s, z1.s[0]\n" + "fmla z19.s, z11.s, z0.s[0]\n" + "fmla z23.s, z11.s, z1.s[0]\n" + "b.eq 6f\n" + "ld1w z12.s, p7/z, 
[%[b_ptr0], #-4, MUL VL]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" + "fmla z16.s, z12.s, z0.s[1]\n" + "fmla z20.s, z12.s, z1.s[1]\n" + "fmla z17.s, z13.s, z0.s[1]\n" + "fmla z21.s, z13.s, z1.s[1]\n" + "fmla z18.s, z14.s, z0.s[1]\n" + "fmla z22.s, z14.s, z1.s[1]\n" + "fmla z19.s, z15.s, z0.s[1]\n" + "fmla z23.s, z15.s, z1.s[1]\n" + "b.eq 6f\n" + "ld1w z8.s, p7/z, [%[b_ptr0]]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z8.s, z0.s[2]\n" + "fmla z20.s, z8.s, z1.s[2]\n" + "fmla z17.s, z9.s, z0.s[2]\n" + "fmla z21.s, z9.s, z1.s[2]\n" + "fmla z18.s, z10.s, z0.s[2]\n" + "fmla z22.s, z10.s, z1.s[2]\n" + "fmla z19.s, z11.s, z0.s[2]\n" + "fmla z23.s, z11.s, z1.s[2]\n" + "b 6f\n" + "5:\n" + "fmla z16.s, z8.s, z0.s[0]\n" + "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n" + "fmla z20.s, z8.s, z1.s[0]\n" + "ld1rqw z5.s, p6/z, [a_ptr1]\n" + "fmla z17.s, z9.s, z0.s[0]\n" + "ld1w z8.s, p7/z, [%[b_ptr0]]\n" + "fmla z21.s, z9.s, z1.s[0]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z18.s, z10.s, z0.s[0]\n" + "fmla z22.s, z10.s, z1.s[0]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.s, z11.s, z0.s[0]\n" + "fmla z23.s, z11.s, z1.s[0]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z12.s, z0.s[1]\n" + "fmla z20.s, z12.s, z1.s[1]\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z17.s, z13.s, z0.s[1]\n" + "fmla z21.s, z13.s, z1.s[1]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z18.s, z14.s, z0.s[1]\n" + "fmla z22.s, z14.s, z1.s[1]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmla z19.s, z15.s, z0.s[1]\n" + "fmla z23.s, z15.s, z1.s[1]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "fmla z16.s, z8.s, z0.s[2]\n" + "fmla z20.s, z8.s, z1.s[2]\n" + "fmla z17.s, z9.s, 
z0.s[2]\n" + "fmla z21.s, z9.s, z1.s[2]\n" + "fmla z18.s, z10.s, z0.s[2]\n" + "fmla z22.s, z10.s, z1.s[2]\n" + "fmla z19.s, z11.s, z0.s[2]\n" + "fmla z23.s, z11.s, z1.s[2]\n" + "fmla z16.s, z12.s, z0.s[3]\n" + "fmla z20.s, z12.s, z1.s[3]\n" + "fmla z17.s, z13.s, z0.s[3]\n" + "fmla z21.s, z13.s, z1.s[3]\n" + "fmla z18.s, z14.s, z0.s[3]\n" + "fmla z22.s, z14.s, z1.s[3]\n" + "fmla z19.s, z15.s, z0.s[3]\n" + "fmla z23.s, z15.s, z1.s[3]\n" + "cbz %[blocks], 6f\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + "fmla z16.s, z8.s, z4.s[0]\n" + "fmla z20.s, z8.s, z5.s[0]\n" + "fmla z17.s, z9.s, z4.s[0]\n" + "fmla z21.s, z9.s, z5.s[0]\n" + "fmla z18.s, z10.s, z4.s[0]\n" + "fmla z22.s, z10.s, z5.s[0]\n" + "fmla z19.s, z11.s, z4.s[0]\n" + "fmla z23.s, z11.s, z5.s[0]\n" + "b.eq 6f\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" + "fmla z16.s, z12.s, z4.s[1]\n" + "fmla z20.s, z12.s, z5.s[1]\n" + "fmla z17.s, z13.s, z4.s[1]\n" + "fmla z21.s, z13.s, z5.s[1]\n" + "fmla z18.s, z14.s, z4.s[1]\n" + "fmla z22.s, z14.s, z5.s[1]\n" + "fmla z19.s, z15.s, z4.s[1]\n" + "fmla z23.s, z15.s, z5.s[1]\n" + "b.eq 6f\n" + "ld1w z8.s, p7/z, [%[b_ptr0]]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z8.s, z4.s[2]\n" + "fmla z20.s, z8.s, z5.s[2]\n" + "fmla z17.s, z9.s, z4.s[2]\n" + "fmla z21.s, z9.s, z5.s[2]\n" + "fmla z18.s, z10.s, z4.s[2]\n" + "fmla z22.s, z10.s, z5.s[2]\n" + "fmla z19.s, z11.s, z4.s[2]\n" + "fmla z23.s, z11.s, z5.s[2]\n" + "6:\n" + "st1w z16.s, p0, [%[c_ptr0]]\n" 
+ "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n" + "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n" + "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n" + "addvl %[c_ptr0], %[c_ptr0], #4\n" + "st1w z20.s, p0, [c_ptr1]\n" + "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n" + "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n" + "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n" + ".unreq a_ptr1\n" + ".unreq c_ptr1\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb) + : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory" + ); + break; + case 3: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "c_ptr1 .req X2\n" + "c_ptr2 .req X3\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[leftovers]\n" + "whilelt p0.s, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ptrue p7.s\n" + "whilelt p1.s, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "whilelt p2.s, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "whilelt p3.s, %[temp], %[width]\n" + "cbz %[beta0], 1f\n" + "mov z16.s, #0\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "mov z17.s, #0\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "mov z18.s, #0\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "mov z19.s, #0\n" + "ld1w z8.s, p7/z, [%[b_ptr0]]\n" + "mov z20.s, #0\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "mov z21.s, #0\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "mov z22.s, #0\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "mov z23.s, #0\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL 
VL]\n" + "mov z24.s, #0\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "mov z25.s, #0\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "mov z26.s, #0\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "mov z27.s, #0\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "add a_ptr1, a_ptr1, #0x10\n" + "add a_ptr2, a_ptr2, #0x10\n" + "b 2f\n" + "1:\n" + "ld1rw z15.s, p7/z, [%[betaptr]]\n" + "ld1w z16.s, p0/z, [%[c_ptr0]]\n" + "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" + "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n" + "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n" + "ld1w z20.s, p0/z, [c_ptr1]\n" + "fmul z16.s, p7/m, z16.s, z15.s\n" + "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n" + "fmul z17.s, p7/m, z17.s, z15.s\n" + "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n" + "fmul z18.s, p7/m, z18.s, z15.s\n" + "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n" + "fmul z19.s, p7/m, z19.s, z15.s\n" + "ld1w z24.s, p0/z, [c_ptr2]\n" + "fmul z20.s, p7/m, z20.s, z15.s\n" + "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n" + "fmul z21.s, p7/m, z21.s, z15.s\n" + "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n" + "fmul z22.s, p7/m, z22.s, z15.s\n" + "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n" + "fmul z23.s, p7/m, z23.s, z15.s\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "fmul z24.s, p7/m, z24.s, z15.s\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "fmul z25.s, p7/m, z25.s, z15.s\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "fmul z26.s, p7/m, z26.s, z15.s\n" + "ld1w z8.s, p7/z, [%[b_ptr0]]\n" + "fmul z27.s, p7/m, z27.s, z15.s\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "add a_ptr1, a_ptr1, #0x10\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "add a_ptr2, a_ptr2, #0x10\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "2:\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "cbz %[loops], 3f\n" + 
"4:\n" + "fmla z16.s, z8.s, z0.s[0]\n" + "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" + "fmla z20.s, z8.s, z1.s[0]\n" + "ld1rqw z5.s, p7/z, [a_ptr1]\n" + "fmla z24.s, z8.s, z2.s[0]\n" + "ld1rqw z6.s, p7/z, [a_ptr2]\n" + "fmla z17.s, z9.s, z0.s[0]\n" + "ld1w z8.s, p7/z, [%[b_ptr0]]\n" + "fmla z21.s, z9.s, z1.s[0]\n" + "add %[a_ptr0], %[a_ptr0], #0x20\n" + "fmla z25.s, z9.s, z2.s[0]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z18.s, z10.s, z0.s[0]\n" + "add a_ptr1, a_ptr1, #0x20\n" + "fmla z22.s, z10.s, z1.s[0]\n" + "add a_ptr2, a_ptr2, #0x20\n" + "fmla z26.s, z10.s, z2.s[0]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.s, z11.s, z0.s[0]\n" + "subs %[loops], %[loops], #0x1\n" + "fmla z23.s, z11.s, z1.s[0]\n" + "fmla z27.s, z11.s, z2.s[0]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z12.s, z0.s[1]\n" + "fmla z20.s, z12.s, z1.s[1]\n" + "fmla z24.s, z12.s, z2.s[1]\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z17.s, z13.s, z0.s[1]\n" + "fmla z21.s, z13.s, z1.s[1]\n" + "fmla z25.s, z13.s, z2.s[1]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z18.s, z14.s, z0.s[1]\n" + "fmla z22.s, z14.s, z1.s[1]\n" + "fmla z26.s, z14.s, z2.s[1]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmla z19.s, z15.s, z0.s[1]\n" + "fmla z23.s, z15.s, z1.s[1]\n" + "fmla z27.s, z15.s, z2.s[1]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "fmla z16.s, z8.s, z0.s[2]\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "fmla z20.s, z8.s, z1.s[2]\n" + "fmla z24.s, z8.s, z2.s[2]\n" + "fmla z17.s, z9.s, z0.s[2]\n" + "fmla z21.s, z9.s, z1.s[2]\n" + "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + "fmla z25.s, z9.s, z2.s[2]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + "fmla z18.s, z10.s, z0.s[2]\n" + "fmla z22.s, z10.s, z1.s[2]\n" + "fmla z26.s, z10.s, z2.s[2]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + "fmla z19.s, z11.s, z0.s[2]\n" + "fmla z23.s, z11.s, z1.s[2]\n" + "fmla z27.s, z11.s, z2.s[2]\n" + "ld1w z11.s, p7/z, 
[%[b_ptr0], #-5, MUL VL]\n" + "fmla z16.s, z12.s, z0.s[3]\n" + "fmla z20.s, z12.s, z1.s[3]\n" + "fmla z24.s, z12.s, z2.s[3]\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" + "fmla z17.s, z13.s, z0.s[3]\n" + "fmla z21.s, z13.s, z1.s[3]\n" + "fmla z25.s, z13.s, z2.s[3]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" + "fmla z18.s, z14.s, z0.s[3]\n" + "fmla z22.s, z14.s, z1.s[3]\n" + "fmla z26.s, z14.s, z2.s[3]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" + "fmla z19.s, z15.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n" + "fmla z23.s, z15.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n" + "fmla z27.s, z15.s, z2.s[3]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" + "fmla z16.s, z8.s, z4.s[0]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #-0x10]\n" + "fmla z20.s, z8.s, z5.s[0]\n" + "fmla z24.s, z8.s, z6.s[0]\n" + "ld1w z8.s, p7/z, [%[b_ptr0]]\n" + "fmla z17.s, z9.s, z4.s[0]\n" + "fmla z21.s, z9.s, z5.s[0]\n" + "fmla z25.s, z9.s, z6.s[0]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z18.s, z10.s, z4.s[0]\n" + "fmla z22.s, z10.s, z5.s[0]\n" + "fmla z26.s, z10.s, z6.s[0]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.s, z11.s, z4.s[0]\n" + "fmla z23.s, z11.s, z5.s[0]\n" + "fmla z27.s, z11.s, z6.s[0]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z12.s, z4.s[1]\n" + "fmla z20.s, z12.s, z5.s[1]\n" + "fmla z24.s, z12.s, z6.s[1]\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z17.s, z13.s, z4.s[1]\n" + "fmla z21.s, z13.s, z5.s[1]\n" + "fmla z25.s, z13.s, z6.s[1]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z18.s, z14.s, z4.s[1]\n" + "fmla z22.s, z14.s, z5.s[1]\n" + "fmla z26.s, z14.s, z6.s[1]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmla z19.s, z15.s, z4.s[1]\n" + "fmla z23.s, z15.s, z5.s[1]\n" + "fmla z27.s, z15.s, z6.s[1]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "fmla z16.s, z8.s, z4.s[2]\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "fmla z20.s, z8.s, 
z5.s[2]\n" + "fmla z24.s, z8.s, z6.s[2]\n" + "fmla z17.s, z9.s, z4.s[2]\n" + "fmla z21.s, z9.s, z5.s[2]\n" + "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + "fmla z25.s, z9.s, z6.s[2]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + "fmla z18.s, z10.s, z4.s[2]\n" + "fmla z22.s, z10.s, z5.s[2]\n" + "fmla z26.s, z10.s, z6.s[2]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + "fmla z19.s, z11.s, z4.s[2]\n" + "fmla z23.s, z11.s, z5.s[2]\n" + "fmla z27.s, z11.s, z6.s[2]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + "fmla z16.s, z12.s, z4.s[3]\n" + "fmla z20.s, z12.s, z5.s[3]\n" + "fmla z24.s, z12.s, z6.s[3]\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" + "fmla z17.s, z13.s, z4.s[3]\n" + "fmla z21.s, z13.s, z5.s[3]\n" + "fmla z25.s, z13.s, z6.s[3]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" + "fmla z18.s, z14.s, z4.s[3]\n" + "fmla z22.s, z14.s, z5.s[3]\n" + "fmla z26.s, z14.s, z6.s[3]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" + "fmla z19.s, z15.s, z4.s[3]\n" + "fmla z23.s, z15.s, z5.s[3]\n" + "fmla z27.s, z15.s, z6.s[3]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" + "b.ne 4b\n" + "3:\n" + "cbz %[regs], 5f\n" + "fmla z16.s, z8.s, z0.s[0]\n" + "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" + "fmla z20.s, z8.s, z1.s[0]\n" + "ld1rqw z5.s, p7/z, [a_ptr1]\n" + "fmla z24.s, z8.s, z2.s[0]\n" + "ld1rqw z6.s, p7/z, [a_ptr2]\n" + "fmla z17.s, z9.s, z0.s[0]\n" + "ld1w z8.s, p7/z, [%[b_ptr0]]\n" + "fmla z21.s, z9.s, z1.s[0]\n" + "fmla z25.s, z9.s, z2.s[0]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z18.s, z10.s, z0.s[0]\n" + "fmla z22.s, z10.s, z1.s[0]\n" + "fmla z26.s, z10.s, z2.s[0]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.s, z11.s, z0.s[0]\n" + "fmla z23.s, z11.s, z1.s[0]\n" + "fmla z27.s, z11.s, z2.s[0]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z12.s, z0.s[1]\n" + "fmla z20.s, z12.s, z1.s[1]\n" + "fmla z24.s, z12.s, z2.s[1]\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla 
z17.s, z13.s, z0.s[1]\n" + "fmla z21.s, z13.s, z1.s[1]\n" + "fmla z25.s, z13.s, z2.s[1]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z18.s, z14.s, z0.s[1]\n" + "fmla z22.s, z14.s, z1.s[1]\n" + "fmla z26.s, z14.s, z2.s[1]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmla z19.s, z15.s, z0.s[1]\n" + "fmla z23.s, z15.s, z1.s[1]\n" + "fmla z27.s, z15.s, z2.s[1]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "fmla z16.s, z8.s, z0.s[2]\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "fmla z20.s, z8.s, z1.s[2]\n" + "fmla z24.s, z8.s, z2.s[2]\n" + "fmla z17.s, z9.s, z0.s[2]\n" + "fmla z21.s, z9.s, z1.s[2]\n" + "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + "fmla z25.s, z9.s, z2.s[2]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + "fmla z18.s, z10.s, z0.s[2]\n" + "fmla z22.s, z10.s, z1.s[2]\n" + "fmla z26.s, z10.s, z2.s[2]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + "fmla z19.s, z11.s, z0.s[2]\n" + "fmla z23.s, z11.s, z1.s[2]\n" + "fmla z27.s, z11.s, z2.s[2]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + "fmla z16.s, z12.s, z0.s[3]\n" + "fmla z20.s, z12.s, z1.s[3]\n" + "fmla z24.s, z12.s, z2.s[3]\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" + "fmla z17.s, z13.s, z0.s[3]\n" + "fmla z21.s, z13.s, z1.s[3]\n" + "fmla z25.s, z13.s, z2.s[3]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" + "fmla z18.s, z14.s, z0.s[3]\n" + "fmla z22.s, z14.s, z1.s[3]\n" + "fmla z26.s, z14.s, z2.s[3]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" + "fmla z19.s, z15.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" + "fmla z23.s, z15.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n" + "fmla z27.s, z15.s, z2.s[3]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" + "fmla z16.s, z8.s, z4.s[0]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n" + "fmla z20.s, z8.s, z5.s[0]\n" + "fmla z24.s, z8.s, z6.s[0]\n" + "ld1w z8.s, p7/z, [%[b_ptr0]]\n" + "fmla z17.s, z9.s, z4.s[0]\n" + "fmla z21.s, z9.s, z5.s[0]\n" + "fmla z25.s, z9.s, z6.s[0]\n" 
+ "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z18.s, z10.s, z4.s[0]\n" + "fmla z22.s, z10.s, z5.s[0]\n" + "fmla z26.s, z10.s, z6.s[0]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.s, z11.s, z4.s[0]\n" + "fmla z23.s, z11.s, z5.s[0]\n" + "fmla z27.s, z11.s, z6.s[0]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z12.s, z4.s[1]\n" + "fmla z20.s, z12.s, z5.s[1]\n" + "fmla z24.s, z12.s, z6.s[1]\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z17.s, z13.s, z4.s[1]\n" + "fmla z21.s, z13.s, z5.s[1]\n" + "fmla z25.s, z13.s, z6.s[1]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z18.s, z14.s, z4.s[1]\n" + "fmla z22.s, z14.s, z5.s[1]\n" + "fmla z26.s, z14.s, z6.s[1]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmla z19.s, z15.s, z4.s[1]\n" + "fmla z23.s, z15.s, z5.s[1]\n" + "fmla z27.s, z15.s, z6.s[1]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "fmla z16.s, z8.s, z4.s[2]\n" + "fmla z20.s, z8.s, z5.s[2]\n" + "fmla z24.s, z8.s, z6.s[2]\n" + "fmla z17.s, z9.s, z4.s[2]\n" + "fmla z21.s, z9.s, z5.s[2]\n" + "fmla z25.s, z9.s, z6.s[2]\n" + "fmla z18.s, z10.s, z4.s[2]\n" + "fmla z22.s, z10.s, z5.s[2]\n" + "fmla z26.s, z10.s, z6.s[2]\n" + "fmla z19.s, z11.s, z4.s[2]\n" + "fmla z23.s, z11.s, z5.s[2]\n" + "fmla z27.s, z11.s, z6.s[2]\n" + "fmla z16.s, z12.s, z4.s[3]\n" + "fmla z20.s, z12.s, z5.s[3]\n" + "fmla z24.s, z12.s, z6.s[3]\n" + "fmla z17.s, z13.s, z4.s[3]\n" + "fmla z21.s, z13.s, z5.s[3]\n" + "fmla z25.s, z13.s, z6.s[3]\n" + "fmla z18.s, z14.s, z4.s[3]\n" + "fmla z22.s, z14.s, z5.s[3]\n" + "fmla z26.s, z14.s, z6.s[3]\n" + "fmla z19.s, z15.s, z4.s[3]\n" + "fmla z23.s, z15.s, z5.s[3]\n" + "fmla z27.s, z15.s, z6.s[3]\n" + "cbz %[blocks], 6f\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #-5, 
MUL VL]\n" + "fmla z16.s, z8.s, z0.s[0]\n" + "fmla z20.s, z8.s, z1.s[0]\n" + "fmla z24.s, z8.s, z2.s[0]\n" + "fmla z17.s, z9.s, z0.s[0]\n" + "fmla z21.s, z9.s, z1.s[0]\n" + "fmla z25.s, z9.s, z2.s[0]\n" + "fmla z18.s, z10.s, z0.s[0]\n" + "fmla z22.s, z10.s, z1.s[0]\n" + "fmla z26.s, z10.s, z2.s[0]\n" + "fmla z19.s, z11.s, z0.s[0]\n" + "fmla z23.s, z11.s, z1.s[0]\n" + "fmla z27.s, z11.s, z2.s[0]\n" + "b.eq 6f\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" + "fmla z16.s, z12.s, z0.s[1]\n" + "fmla z20.s, z12.s, z1.s[1]\n" + "fmla z24.s, z12.s, z2.s[1]\n" + "fmla z17.s, z13.s, z0.s[1]\n" + "fmla z21.s, z13.s, z1.s[1]\n" + "fmla z25.s, z13.s, z2.s[1]\n" + "fmla z18.s, z14.s, z0.s[1]\n" + "fmla z22.s, z14.s, z1.s[1]\n" + "fmla z26.s, z14.s, z2.s[1]\n" + "fmla z19.s, z15.s, z0.s[1]\n" + "fmla z23.s, z15.s, z1.s[1]\n" + "fmla z27.s, z15.s, z2.s[1]\n" + "b.eq 6f\n" + "ld1w z8.s, p7/z, [%[b_ptr0]]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z8.s, z0.s[2]\n" + "fmla z20.s, z8.s, z1.s[2]\n" + "fmla z24.s, z8.s, z2.s[2]\n" + "fmla z17.s, z9.s, z0.s[2]\n" + "fmla z21.s, z9.s, z1.s[2]\n" + "fmla z25.s, z9.s, z2.s[2]\n" + "fmla z18.s, z10.s, z0.s[2]\n" + "fmla z22.s, z10.s, z1.s[2]\n" + "fmla z26.s, z10.s, z2.s[2]\n" + "fmla z19.s, z11.s, z0.s[2]\n" + "fmla z23.s, z11.s, z1.s[2]\n" + "fmla z27.s, z11.s, z2.s[2]\n" + "b 6f\n" + "5:\n" + "fmla z16.s, z8.s, z0.s[0]\n" + "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n" + "fmla z20.s, z8.s, z1.s[0]\n" + "ld1rqw z5.s, p6/z, [a_ptr1]\n" + "fmla z24.s, z8.s, z2.s[0]\n" + "ld1rqw z6.s, p6/z, [a_ptr2]\n" + "fmla z17.s, z9.s, z0.s[0]\n" + "ld1w z8.s, p7/z, [%[b_ptr0]]\n" + "fmla z21.s, z9.s, z1.s[0]\n" + "fmla z25.s, z9.s, z2.s[0]\n" + "ld1w z9.s, p7/z, 
[%[b_ptr0], #1, MUL VL]\n" + "fmla z18.s, z10.s, z0.s[0]\n" + "fmla z22.s, z10.s, z1.s[0]\n" + "fmla z26.s, z10.s, z2.s[0]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.s, z11.s, z0.s[0]\n" + "fmla z23.s, z11.s, z1.s[0]\n" + "fmla z27.s, z11.s, z2.s[0]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z12.s, z0.s[1]\n" + "fmla z20.s, z12.s, z1.s[1]\n" + "fmla z24.s, z12.s, z2.s[1]\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z17.s, z13.s, z0.s[1]\n" + "fmla z21.s, z13.s, z1.s[1]\n" + "fmla z25.s, z13.s, z2.s[1]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z18.s, z14.s, z0.s[1]\n" + "fmla z22.s, z14.s, z1.s[1]\n" + "fmla z26.s, z14.s, z2.s[1]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmla z19.s, z15.s, z0.s[1]\n" + "fmla z23.s, z15.s, z1.s[1]\n" + "fmla z27.s, z15.s, z2.s[1]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "fmla z16.s, z8.s, z0.s[2]\n" + "fmla z20.s, z8.s, z1.s[2]\n" + "fmla z24.s, z8.s, z2.s[2]\n" + "fmla z17.s, z9.s, z0.s[2]\n" + "fmla z21.s, z9.s, z1.s[2]\n" + "fmla z25.s, z9.s, z2.s[2]\n" + "fmla z18.s, z10.s, z0.s[2]\n" + "fmla z22.s, z10.s, z1.s[2]\n" + "fmla z26.s, z10.s, z2.s[2]\n" + "fmla z19.s, z11.s, z0.s[2]\n" + "fmla z23.s, z11.s, z1.s[2]\n" + "fmla z27.s, z11.s, z2.s[2]\n" + "fmla z16.s, z12.s, z0.s[3]\n" + "fmla z20.s, z12.s, z1.s[3]\n" + "fmla z24.s, z12.s, z2.s[3]\n" + "fmla z17.s, z13.s, z0.s[3]\n" + "fmla z21.s, z13.s, z1.s[3]\n" + "fmla z25.s, z13.s, z2.s[3]\n" + "fmla z18.s, z14.s, z0.s[3]\n" + "fmla z22.s, z14.s, z1.s[3]\n" + "fmla z26.s, z14.s, z2.s[3]\n" + "fmla z19.s, z15.s, z0.s[3]\n" + "fmla z23.s, z15.s, z1.s[3]\n" + "fmla z27.s, z15.s, z2.s[3]\n" + "cbz %[blocks], 6f\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + "fmla 
z16.s, z8.s, z4.s[0]\n" + "fmla z20.s, z8.s, z5.s[0]\n" + "fmla z24.s, z8.s, z6.s[0]\n" + "fmla z17.s, z9.s, z4.s[0]\n" + "fmla z21.s, z9.s, z5.s[0]\n" + "fmla z25.s, z9.s, z6.s[0]\n" + "fmla z18.s, z10.s, z4.s[0]\n" + "fmla z22.s, z10.s, z5.s[0]\n" + "fmla z26.s, z10.s, z6.s[0]\n" + "fmla z19.s, z11.s, z4.s[0]\n" + "fmla z23.s, z11.s, z5.s[0]\n" + "fmla z27.s, z11.s, z6.s[0]\n" + "b.eq 6f\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" + "fmla z16.s, z12.s, z4.s[1]\n" + "fmla z20.s, z12.s, z5.s[1]\n" + "fmla z24.s, z12.s, z6.s[1]\n" + "fmla z17.s, z13.s, z4.s[1]\n" + "fmla z21.s, z13.s, z5.s[1]\n" + "fmla z25.s, z13.s, z6.s[1]\n" + "fmla z18.s, z14.s, z4.s[1]\n" + "fmla z22.s, z14.s, z5.s[1]\n" + "fmla z26.s, z14.s, z6.s[1]\n" + "fmla z19.s, z15.s, z4.s[1]\n" + "fmla z23.s, z15.s, z5.s[1]\n" + "fmla z27.s, z15.s, z6.s[1]\n" + "b.eq 6f\n" + "ld1w z8.s, p7/z, [%[b_ptr0]]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z8.s, z4.s[2]\n" + "fmla z20.s, z8.s, z5.s[2]\n" + "fmla z24.s, z8.s, z6.s[2]\n" + "fmla z17.s, z9.s, z4.s[2]\n" + "fmla z21.s, z9.s, z5.s[2]\n" + "fmla z25.s, z9.s, z6.s[2]\n" + "fmla z18.s, z10.s, z4.s[2]\n" + "fmla z22.s, z10.s, z5.s[2]\n" + "fmla z26.s, z10.s, z6.s[2]\n" + "fmla z19.s, z11.s, z4.s[2]\n" + "fmla z23.s, z11.s, z5.s[2]\n" + "fmla z27.s, z11.s, z6.s[2]\n" + "6:\n" + "st1w z16.s, p0, [%[c_ptr0]]\n" + "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n" + "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n" + "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n" + "addvl %[c_ptr0], %[c_ptr0], #4\n" + "st1w z20.s, p0, [c_ptr1]\n" + "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n" + "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n" + "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n" + "st1w 
z24.s, p0, [c_ptr2]\n" + "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n" + "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n" + "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb) + : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory" + ); + break; + default: + case 4: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[leftovers]\n" + "whilelt p0.s, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ptrue p7.s\n" + "whilelt p1.s, %[temp], %[width]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "incw %[temp], all, mul #1\n" + "whilelt p2.s, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "whilelt p3.s, %[temp], %[width]\n" + "cbz %[beta0], 1f\n" + "mov z16.s, #0\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "mov z17.s, #0\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "mov z18.s, #0\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "mov z19.s, #0\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "mov z20.s, #0\n" + "ld1w z8.s, p7/z, [%[b_ptr0]]\n" + "mov z21.s, #0\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "mov z22.s, #0\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "mov z23.s, #0\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "mov z24.s, #0\n" + 
"ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "mov z25.s, #0\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "mov z26.s, #0\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "mov z27.s, #0\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "mov z28.s, #0\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "mov z29.s, #0\n" + "add a_ptr1, a_ptr1, #0x10\n" + "mov z30.s, #0\n" + "add a_ptr2, a_ptr2, #0x10\n" + "mov z31.s, #0\n" + "add a_ptr3, a_ptr3, #0x10\n" + "b 2f\n" + "1:\n" + "ld1rw z15.s, p7/z, [%[betaptr]]\n" + "ld1w z16.s, p0/z, [%[c_ptr0]]\n" + "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" + "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n" + "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n" + "ld1w z20.s, p0/z, [c_ptr1]\n" + "fmul z16.s, p7/m, z16.s, z15.s\n" + "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n" + "fmul z17.s, p7/m, z17.s, z15.s\n" + "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n" + "fmul z18.s, p7/m, z18.s, z15.s\n" + "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n" + "fmul z19.s, p7/m, z19.s, z15.s\n" + "ld1w z24.s, p0/z, [c_ptr2]\n" + "fmul z20.s, p7/m, z20.s, z15.s\n" + "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n" + "fmul z21.s, p7/m, z21.s, z15.s\n" + "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n" + "fmul z22.s, p7/m, z22.s, z15.s\n" + "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n" + "fmul z23.s, p7/m, z23.s, z15.s\n" + "ld1w z28.s, p0/z, [c_ptr3]\n" + "fmul z24.s, p7/m, z24.s, z15.s\n" + "ld1w z29.s, p1/z, [c_ptr3, #1, MUL VL]\n" + "fmul z25.s, p7/m, z25.s, z15.s\n" + "ld1w z30.s, p2/z, [c_ptr3, #2, MUL VL]\n" + "fmul z26.s, p7/m, z26.s, z15.s\n" + "ld1w z31.s, p3/z, [c_ptr3, #3, MUL VL]\n" + "fmul z27.s, p7/m, z27.s, z15.s\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "fmul z28.s, p7/m, z28.s, z15.s\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "fmul z29.s, p7/m, z29.s, z15.s\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "fmul z30.s, p7/m, z30.s, z15.s\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmul z31.s, p7/m, z31.s, z15.s\n" + "ld1w z8.s, p7/z, [%[b_ptr0]]\n" + "ld1w z9.s, p7/z, 
[%[b_ptr0], #1, MUL VL]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "add a_ptr1, a_ptr1, #0x10\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "add a_ptr2, a_ptr2, #0x10\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "add a_ptr3, a_ptr3, #0x10\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "2:\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "cbz %[loops], 3f\n" + "4:\n" + "fmla z16.s, z8.s, z0.s[0]\n" + "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" + "fmla z20.s, z8.s, z1.s[0]\n" + "ld1rqw z5.s, p7/z, [a_ptr1]\n" + "fmla z24.s, z8.s, z2.s[0]\n" + "ld1rqw z6.s, p7/z, [a_ptr2]\n" + "fmla z28.s, z8.s, z3.s[0]\n" + "ld1rqw z7.s, p7/z, [a_ptr3]\n" + "fmla z17.s, z9.s, z0.s[0]\n" + "ld1w z8.s, p7/z, [%[b_ptr0]]\n" + "fmla z21.s, z9.s, z1.s[0]\n" + "add %[a_ptr0], %[a_ptr0], #0x20\n" + "fmla z25.s, z9.s, z2.s[0]\n" + "add a_ptr1, a_ptr1, #0x20\n" + "fmla z29.s, z9.s, z3.s[0]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z18.s, z10.s, z0.s[0]\n" + "add a_ptr2, a_ptr2, #0x20\n" + "fmla z22.s, z10.s, z1.s[0]\n" + "add a_ptr3, a_ptr3, #0x20\n" + "fmla z26.s, z10.s, z2.s[0]\n" + "subs %[loops], %[loops], #0x1\n" + "fmla z30.s, z10.s, z3.s[0]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.s, z11.s, z0.s[0]\n" + "fmla z23.s, z11.s, z1.s[0]\n" + "fmla z27.s, z11.s, z2.s[0]\n" + "fmla z31.s, z11.s, z3.s[0]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z12.s, z0.s[1]\n" + "fmla z20.s, z12.s, z1.s[1]\n" + "fmla z24.s, z12.s, z2.s[1]\n" + "fmla z28.s, z12.s, z3.s[1]\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z17.s, z13.s, z0.s[1]\n" + "fmla z21.s, z13.s, z1.s[1]\n" + "fmla z25.s, z13.s, z2.s[1]\n" + "fmla z29.s, z13.s, z3.s[1]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z18.s, z14.s, z0.s[1]\n" + "fmla z22.s, z14.s, z1.s[1]\n" + "fmla z26.s, z14.s, z2.s[1]\n" + 
"fmla z30.s, z14.s, z3.s[1]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmla z19.s, z15.s, z0.s[1]\n" + "fmla z23.s, z15.s, z1.s[1]\n" + "fmla z27.s, z15.s, z2.s[1]\n" + "fmla z31.s, z15.s, z3.s[1]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "fmla z16.s, z8.s, z0.s[2]\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "fmla z20.s, z8.s, z1.s[2]\n" + "fmla z24.s, z8.s, z2.s[2]\n" + "fmla z28.s, z8.s, z3.s[2]\n" + "fmla z17.s, z9.s, z0.s[2]\n" + "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + "fmla z21.s, z9.s, z1.s[2]\n" + "fmla z25.s, z9.s, z2.s[2]\n" + "fmla z29.s, z9.s, z3.s[2]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + "fmla z18.s, z10.s, z0.s[2]\n" + "fmla z22.s, z10.s, z1.s[2]\n" + "fmla z26.s, z10.s, z2.s[2]\n" + "fmla z30.s, z10.s, z3.s[2]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + "fmla z19.s, z11.s, z0.s[2]\n" + "fmla z23.s, z11.s, z1.s[2]\n" + "fmla z27.s, z11.s, z2.s[2]\n" + "fmla z31.s, z11.s, z3.s[2]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + "fmla z16.s, z12.s, z0.s[3]\n" + "fmla z20.s, z12.s, z1.s[3]\n" + "fmla z24.s, z12.s, z2.s[3]\n" + "fmla z28.s, z12.s, z3.s[3]\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" + "fmla z17.s, z13.s, z0.s[3]\n" + "fmla z21.s, z13.s, z1.s[3]\n" + "fmla z25.s, z13.s, z2.s[3]\n" + "fmla z29.s, z13.s, z3.s[3]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" + "fmla z18.s, z14.s, z0.s[3]\n" + "fmla z22.s, z14.s, z1.s[3]\n" + "fmla z26.s, z14.s, z2.s[3]\n" + "fmla z30.s, z14.s, z3.s[3]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" + "fmla z19.s, z15.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n" + "fmla z23.s, z15.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n" + "fmla z27.s, z15.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #-0x10]\n" + "fmla z31.s, z15.s, z3.s[3]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" + "fmla z16.s, z8.s, z4.s[0]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #-0x10]\n" + "fmla z20.s, z8.s, z5.s[0]\n" + "fmla z24.s, z8.s, 
z6.s[0]\n" + "fmla z28.s, z8.s, z7.s[0]\n" + "ld1w z8.s, p7/z, [%[b_ptr0]]\n" + "fmla z17.s, z9.s, z4.s[0]\n" + "fmla z21.s, z9.s, z5.s[0]\n" + "fmla z25.s, z9.s, z6.s[0]\n" + "fmla z29.s, z9.s, z7.s[0]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z18.s, z10.s, z4.s[0]\n" + "fmla z22.s, z10.s, z5.s[0]\n" + "fmla z26.s, z10.s, z6.s[0]\n" + "fmla z30.s, z10.s, z7.s[0]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.s, z11.s, z4.s[0]\n" + "fmla z23.s, z11.s, z5.s[0]\n" + "fmla z27.s, z11.s, z6.s[0]\n" + "fmla z31.s, z11.s, z7.s[0]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z12.s, z4.s[1]\n" + "fmla z20.s, z12.s, z5.s[1]\n" + "fmla z24.s, z12.s, z6.s[1]\n" + "fmla z28.s, z12.s, z7.s[1]\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z17.s, z13.s, z4.s[1]\n" + "fmla z21.s, z13.s, z5.s[1]\n" + "fmla z25.s, z13.s, z6.s[1]\n" + "fmla z29.s, z13.s, z7.s[1]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z18.s, z14.s, z4.s[1]\n" + "fmla z22.s, z14.s, z5.s[1]\n" + "fmla z26.s, z14.s, z6.s[1]\n" + "fmla z30.s, z14.s, z7.s[1]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmla z19.s, z15.s, z4.s[1]\n" + "fmla z23.s, z15.s, z5.s[1]\n" + "fmla z27.s, z15.s, z6.s[1]\n" + "fmla z31.s, z15.s, z7.s[1]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "fmla z16.s, z8.s, z4.s[2]\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "fmla z20.s, z8.s, z5.s[2]\n" + "fmla z24.s, z8.s, z6.s[2]\n" + "fmla z28.s, z8.s, z7.s[2]\n" + "fmla z17.s, z9.s, z4.s[2]\n" + "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + "fmla z21.s, z9.s, z5.s[2]\n" + "fmla z25.s, z9.s, z6.s[2]\n" + "fmla z29.s, z9.s, z7.s[2]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + "fmla z18.s, z10.s, z4.s[2]\n" + "fmla z22.s, z10.s, z5.s[2]\n" + "fmla z26.s, z10.s, z6.s[2]\n" + "fmla z30.s, z10.s, z7.s[2]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + "fmla z19.s, z11.s, z4.s[2]\n" + "fmla z23.s, z11.s, z5.s[2]\n" + "fmla z27.s, z11.s, 
z6.s[2]\n" + "fmla z31.s, z11.s, z7.s[2]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + "fmla z16.s, z12.s, z4.s[3]\n" + "fmla z20.s, z12.s, z5.s[3]\n" + "fmla z24.s, z12.s, z6.s[3]\n" + "fmla z28.s, z12.s, z7.s[3]\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" + "fmla z17.s, z13.s, z4.s[3]\n" + "fmla z21.s, z13.s, z5.s[3]\n" + "fmla z25.s, z13.s, z6.s[3]\n" + "fmla z29.s, z13.s, z7.s[3]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" + "fmla z18.s, z14.s, z4.s[3]\n" + "fmla z22.s, z14.s, z5.s[3]\n" + "fmla z26.s, z14.s, z6.s[3]\n" + "fmla z30.s, z14.s, z7.s[3]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" + "fmla z19.s, z15.s, z4.s[3]\n" + "fmla z23.s, z15.s, z5.s[3]\n" + "fmla z27.s, z15.s, z6.s[3]\n" + "fmla z31.s, z15.s, z7.s[3]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" + "b.ne 4b\n" + "3:\n" + "cbz %[regs], 5f\n" + "fmla z16.s, z8.s, z0.s[0]\n" + "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" + "fmla z20.s, z8.s, z1.s[0]\n" + "ld1rqw z5.s, p7/z, [a_ptr1]\n" + "fmla z24.s, z8.s, z2.s[0]\n" + "ld1rqw z6.s, p7/z, [a_ptr2]\n" + "fmla z28.s, z8.s, z3.s[0]\n" + "ld1rqw z7.s, p7/z, [a_ptr3]\n" + "fmla z17.s, z9.s, z0.s[0]\n" + "ld1w z8.s, p7/z, [%[b_ptr0]]\n" + "fmla z21.s, z9.s, z1.s[0]\n" + "fmla z25.s, z9.s, z2.s[0]\n" + "fmla z29.s, z9.s, z3.s[0]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z18.s, z10.s, z0.s[0]\n" + "fmla z22.s, z10.s, z1.s[0]\n" + "fmla z26.s, z10.s, z2.s[0]\n" + "fmla z30.s, z10.s, z3.s[0]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.s, z11.s, z0.s[0]\n" + "fmla z23.s, z11.s, z1.s[0]\n" + "fmla z27.s, z11.s, z2.s[0]\n" + "fmla z31.s, z11.s, z3.s[0]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z12.s, z0.s[1]\n" + "fmla z20.s, z12.s, z1.s[1]\n" + "fmla z24.s, z12.s, z2.s[1]\n" + "fmla z28.s, z12.s, z3.s[1]\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z17.s, z13.s, z0.s[1]\n" + "fmla z21.s, z13.s, z1.s[1]\n" + "fmla z25.s, z13.s, z2.s[1]\n" + "fmla z29.s, z13.s, 
z3.s[1]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z18.s, z14.s, z0.s[1]\n" + "fmla z22.s, z14.s, z1.s[1]\n" + "fmla z26.s, z14.s, z2.s[1]\n" + "fmla z30.s, z14.s, z3.s[1]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmla z19.s, z15.s, z0.s[1]\n" + "fmla z23.s, z15.s, z1.s[1]\n" + "fmla z27.s, z15.s, z2.s[1]\n" + "fmla z31.s, z15.s, z3.s[1]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "fmla z16.s, z8.s, z0.s[2]\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "fmla z20.s, z8.s, z1.s[2]\n" + "fmla z24.s, z8.s, z2.s[2]\n" + "fmla z28.s, z8.s, z3.s[2]\n" + "fmla z17.s, z9.s, z0.s[2]\n" + "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + "fmla z21.s, z9.s, z1.s[2]\n" + "fmla z25.s, z9.s, z2.s[2]\n" + "fmla z29.s, z9.s, z3.s[2]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + "fmla z18.s, z10.s, z0.s[2]\n" + "fmla z22.s, z10.s, z1.s[2]\n" + "fmla z26.s, z10.s, z2.s[2]\n" + "fmla z30.s, z10.s, z3.s[2]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + "fmla z19.s, z11.s, z0.s[2]\n" + "fmla z23.s, z11.s, z1.s[2]\n" + "fmla z27.s, z11.s, z2.s[2]\n" + "fmla z31.s, z11.s, z3.s[2]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + "fmla z16.s, z12.s, z0.s[3]\n" + "fmla z20.s, z12.s, z1.s[3]\n" + "fmla z24.s, z12.s, z2.s[3]\n" + "fmla z28.s, z12.s, z3.s[3]\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" + "fmla z17.s, z13.s, z0.s[3]\n" + "fmla z21.s, z13.s, z1.s[3]\n" + "fmla z25.s, z13.s, z2.s[3]\n" + "fmla z29.s, z13.s, z3.s[3]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" + "fmla z18.s, z14.s, z0.s[3]\n" + "fmla z22.s, z14.s, z1.s[3]\n" + "fmla z26.s, z14.s, z2.s[3]\n" + "fmla z30.s, z14.s, z3.s[3]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" + "fmla z19.s, z15.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" + "fmla z23.s, z15.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n" + "fmla z27.s, z15.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n" + "fmla z31.s, z15.s, z3.s[3]\n" + "ld1w z15.s, p7/z, 
[%[b_ptr0], #-1, MUL VL]\n" + "fmla z16.s, z8.s, z4.s[0]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n" + "fmla z20.s, z8.s, z5.s[0]\n" + "fmla z24.s, z8.s, z6.s[0]\n" + "fmla z28.s, z8.s, z7.s[0]\n" + "ld1w z8.s, p7/z, [%[b_ptr0]]\n" + "fmla z17.s, z9.s, z4.s[0]\n" + "fmla z21.s, z9.s, z5.s[0]\n" + "fmla z25.s, z9.s, z6.s[0]\n" + "fmla z29.s, z9.s, z7.s[0]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z18.s, z10.s, z4.s[0]\n" + "fmla z22.s, z10.s, z5.s[0]\n" + "fmla z26.s, z10.s, z6.s[0]\n" + "fmla z30.s, z10.s, z7.s[0]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.s, z11.s, z4.s[0]\n" + "fmla z23.s, z11.s, z5.s[0]\n" + "fmla z27.s, z11.s, z6.s[0]\n" + "fmla z31.s, z11.s, z7.s[0]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z12.s, z4.s[1]\n" + "fmla z20.s, z12.s, z5.s[1]\n" + "fmla z24.s, z12.s, z6.s[1]\n" + "fmla z28.s, z12.s, z7.s[1]\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z17.s, z13.s, z4.s[1]\n" + "fmla z21.s, z13.s, z5.s[1]\n" + "fmla z25.s, z13.s, z6.s[1]\n" + "fmla z29.s, z13.s, z7.s[1]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z18.s, z14.s, z4.s[1]\n" + "fmla z22.s, z14.s, z5.s[1]\n" + "fmla z26.s, z14.s, z6.s[1]\n" + "fmla z30.s, z14.s, z7.s[1]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmla z19.s, z15.s, z4.s[1]\n" + "fmla z23.s, z15.s, z5.s[1]\n" + "fmla z27.s, z15.s, z6.s[1]\n" + "fmla z31.s, z15.s, z7.s[1]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "fmla z16.s, z8.s, z4.s[2]\n" + "fmla z20.s, z8.s, z5.s[2]\n" + "fmla z24.s, z8.s, z6.s[2]\n" + "fmla z28.s, z8.s, z7.s[2]\n" + "fmla z17.s, z9.s, z4.s[2]\n" + "fmla z21.s, z9.s, z5.s[2]\n" + "fmla z25.s, z9.s, z6.s[2]\n" + "fmla z29.s, z9.s, z7.s[2]\n" + "fmla z18.s, z10.s, z4.s[2]\n" + "fmla z22.s, z10.s, z5.s[2]\n" + "fmla z26.s, z10.s, z6.s[2]\n" + "fmla z30.s, z10.s, z7.s[2]\n" + "fmla z19.s, z11.s, z4.s[2]\n" + "fmla z23.s, z11.s, z5.s[2]\n" + "fmla z27.s, z11.s, z6.s[2]\n" + "fmla z31.s, z11.s, 
z7.s[2]\n" + "fmla z16.s, z12.s, z4.s[3]\n" + "fmla z20.s, z12.s, z5.s[3]\n" + "fmla z24.s, z12.s, z6.s[3]\n" + "fmla z28.s, z12.s, z7.s[3]\n" + "fmla z17.s, z13.s, z4.s[3]\n" + "fmla z21.s, z13.s, z5.s[3]\n" + "fmla z25.s, z13.s, z6.s[3]\n" + "fmla z29.s, z13.s, z7.s[3]\n" + "fmla z18.s, z14.s, z4.s[3]\n" + "fmla z22.s, z14.s, z5.s[3]\n" + "fmla z26.s, z14.s, z6.s[3]\n" + "fmla z30.s, z14.s, z7.s[3]\n" + "fmla z19.s, z15.s, z4.s[3]\n" + "fmla z23.s, z15.s, z5.s[3]\n" + "fmla z27.s, z15.s, z6.s[3]\n" + "fmla z31.s, z15.s, z7.s[3]\n" + "cbz %[blocks], 6f\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + "fmla z16.s, z8.s, z0.s[0]\n" + "fmla z20.s, z8.s, z1.s[0]\n" + "fmla z24.s, z8.s, z2.s[0]\n" + "fmla z28.s, z8.s, z3.s[0]\n" + "fmla z17.s, z9.s, z0.s[0]\n" + "fmla z21.s, z9.s, z1.s[0]\n" + "fmla z25.s, z9.s, z2.s[0]\n" + "fmla z29.s, z9.s, z3.s[0]\n" + "fmla z18.s, z10.s, z0.s[0]\n" + "fmla z22.s, z10.s, z1.s[0]\n" + "fmla z26.s, z10.s, z2.s[0]\n" + "fmla z30.s, z10.s, z3.s[0]\n" + "fmla z19.s, z11.s, z0.s[0]\n" + "fmla z23.s, z11.s, z1.s[0]\n" + "fmla z27.s, z11.s, z2.s[0]\n" + "fmla z31.s, z11.s, z3.s[0]\n" + "b.eq 6f\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" + "fmla z16.s, z12.s, z0.s[1]\n" + "fmla z20.s, z12.s, z1.s[1]\n" + "fmla z24.s, z12.s, z2.s[1]\n" + "fmla z28.s, z12.s, z3.s[1]\n" + "fmla z17.s, z13.s, z0.s[1]\n" + "fmla z21.s, z13.s, z1.s[1]\n" + "fmla z25.s, z13.s, z2.s[1]\n" + "fmla z29.s, z13.s, z3.s[1]\n" + "fmla z18.s, z14.s, z0.s[1]\n" + "fmla z22.s, z14.s, z1.s[1]\n" + "fmla z26.s, z14.s, z2.s[1]\n" + "fmla z30.s, z14.s, z3.s[1]\n" + 
"fmla z19.s, z15.s, z0.s[1]\n" + "fmla z23.s, z15.s, z1.s[1]\n" + "fmla z27.s, z15.s, z2.s[1]\n" + "fmla z31.s, z15.s, z3.s[1]\n" + "b.eq 6f\n" + "ld1w z8.s, p7/z, [%[b_ptr0]]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z8.s, z0.s[2]\n" + "fmla z20.s, z8.s, z1.s[2]\n" + "fmla z24.s, z8.s, z2.s[2]\n" + "fmla z28.s, z8.s, z3.s[2]\n" + "fmla z17.s, z9.s, z0.s[2]\n" + "fmla z21.s, z9.s, z1.s[2]\n" + "fmla z25.s, z9.s, z2.s[2]\n" + "fmla z29.s, z9.s, z3.s[2]\n" + "fmla z18.s, z10.s, z0.s[2]\n" + "fmla z22.s, z10.s, z1.s[2]\n" + "fmla z26.s, z10.s, z2.s[2]\n" + "fmla z30.s, z10.s, z3.s[2]\n" + "fmla z19.s, z11.s, z0.s[2]\n" + "fmla z23.s, z11.s, z1.s[2]\n" + "fmla z27.s, z11.s, z2.s[2]\n" + "fmla z31.s, z11.s, z3.s[2]\n" + "b 6f\n" + "5:\n" + "fmla z16.s, z8.s, z0.s[0]\n" + "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n" + "fmla z20.s, z8.s, z1.s[0]\n" + "ld1rqw z5.s, p6/z, [a_ptr1]\n" + "fmla z24.s, z8.s, z2.s[0]\n" + "ld1rqw z6.s, p6/z, [a_ptr2]\n" + "fmla z28.s, z8.s, z3.s[0]\n" + "ld1rqw z7.s, p6/z, [a_ptr3]\n" + "fmla z17.s, z9.s, z0.s[0]\n" + "ld1w z8.s, p7/z, [%[b_ptr0]]\n" + "fmla z21.s, z9.s, z1.s[0]\n" + "fmla z25.s, z9.s, z2.s[0]\n" + "fmla z29.s, z9.s, z3.s[0]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z18.s, z10.s, z0.s[0]\n" + "fmla z22.s, z10.s, z1.s[0]\n" + "fmla z26.s, z10.s, z2.s[0]\n" + "fmla z30.s, z10.s, z3.s[0]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.s, z11.s, z0.s[0]\n" + "fmla z23.s, z11.s, z1.s[0]\n" + "fmla z27.s, z11.s, z2.s[0]\n" + "fmla z31.s, z11.s, z3.s[0]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z12.s, z0.s[1]\n" + "fmla z20.s, z12.s, z1.s[1]\n" + "fmla z24.s, z12.s, z2.s[1]\n" + "fmla z28.s, z12.s, z3.s[1]\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "fmla z17.s, z13.s, z0.s[1]\n" + "fmla z21.s, z13.s, z1.s[1]\n" + "fmla z25.s, z13.s, z2.s[1]\n" + "fmla z29.s, z13.s, 
z3.s[1]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "fmla z18.s, z14.s, z0.s[1]\n" + "fmla z22.s, z14.s, z1.s[1]\n" + "fmla z26.s, z14.s, z2.s[1]\n" + "fmla z30.s, z14.s, z3.s[1]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "fmla z19.s, z15.s, z0.s[1]\n" + "fmla z23.s, z15.s, z1.s[1]\n" + "fmla z27.s, z15.s, z2.s[1]\n" + "fmla z31.s, z15.s, z3.s[1]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "fmla z16.s, z8.s, z0.s[2]\n" + "fmla z20.s, z8.s, z1.s[2]\n" + "fmla z24.s, z8.s, z2.s[2]\n" + "fmla z28.s, z8.s, z3.s[2]\n" + "fmla z17.s, z9.s, z0.s[2]\n" + "fmla z21.s, z9.s, z1.s[2]\n" + "fmla z25.s, z9.s, z2.s[2]\n" + "fmla z29.s, z9.s, z3.s[2]\n" + "fmla z18.s, z10.s, z0.s[2]\n" + "fmla z22.s, z10.s, z1.s[2]\n" + "fmla z26.s, z10.s, z2.s[2]\n" + "fmla z30.s, z10.s, z3.s[2]\n" + "fmla z19.s, z11.s, z0.s[2]\n" + "fmla z23.s, z11.s, z1.s[2]\n" + "fmla z27.s, z11.s, z2.s[2]\n" + "fmla z31.s, z11.s, z3.s[2]\n" + "fmla z16.s, z12.s, z0.s[3]\n" + "fmla z20.s, z12.s, z1.s[3]\n" + "fmla z24.s, z12.s, z2.s[3]\n" + "fmla z28.s, z12.s, z3.s[3]\n" + "fmla z17.s, z13.s, z0.s[3]\n" + "fmla z21.s, z13.s, z1.s[3]\n" + "fmla z25.s, z13.s, z2.s[3]\n" + "fmla z29.s, z13.s, z3.s[3]\n" + "fmla z18.s, z14.s, z0.s[3]\n" + "fmla z22.s, z14.s, z1.s[3]\n" + "fmla z26.s, z14.s, z2.s[3]\n" + "fmla z30.s, z14.s, z3.s[3]\n" + "fmla z19.s, z15.s, z0.s[3]\n" + "fmla z23.s, z15.s, z1.s[3]\n" + "fmla z27.s, z15.s, z2.s[3]\n" + "fmla z31.s, z15.s, z3.s[3]\n" + "cbz %[blocks], 6f\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + "fmla z16.s, z8.s, z4.s[0]\n" + "fmla z20.s, z8.s, z5.s[0]\n" + "fmla z24.s, z8.s, z6.s[0]\n" + "fmla z28.s, z8.s, z7.s[0]\n" + "fmla z17.s, z9.s, z4.s[0]\n" + "fmla z21.s, z9.s, z5.s[0]\n" + "fmla z25.s, z9.s, z6.s[0]\n" + "fmla z29.s, 
z9.s, z7.s[0]\n" + "fmla z18.s, z10.s, z4.s[0]\n" + "fmla z22.s, z10.s, z5.s[0]\n" + "fmla z26.s, z10.s, z6.s[0]\n" + "fmla z30.s, z10.s, z7.s[0]\n" + "fmla z19.s, z11.s, z4.s[0]\n" + "fmla z23.s, z11.s, z5.s[0]\n" + "fmla z27.s, z11.s, z6.s[0]\n" + "fmla z31.s, z11.s, z7.s[0]\n" + "b.eq 6f\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" + "fmla z16.s, z12.s, z4.s[1]\n" + "fmla z20.s, z12.s, z5.s[1]\n" + "fmla z24.s, z12.s, z6.s[1]\n" + "fmla z28.s, z12.s, z7.s[1]\n" + "fmla z17.s, z13.s, z4.s[1]\n" + "fmla z21.s, z13.s, z5.s[1]\n" + "fmla z25.s, z13.s, z6.s[1]\n" + "fmla z29.s, z13.s, z7.s[1]\n" + "fmla z18.s, z14.s, z4.s[1]\n" + "fmla z22.s, z14.s, z5.s[1]\n" + "fmla z26.s, z14.s, z6.s[1]\n" + "fmla z30.s, z14.s, z7.s[1]\n" + "fmla z19.s, z15.s, z4.s[1]\n" + "fmla z23.s, z15.s, z5.s[1]\n" + "fmla z27.s, z15.s, z6.s[1]\n" + "fmla z31.s, z15.s, z7.s[1]\n" + "b.eq 6f\n" + "ld1w z8.s, p7/z, [%[b_ptr0]]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z8.s, z4.s[2]\n" + "fmla z20.s, z8.s, z5.s[2]\n" + "fmla z24.s, z8.s, z6.s[2]\n" + "fmla z28.s, z8.s, z7.s[2]\n" + "fmla z17.s, z9.s, z4.s[2]\n" + "fmla z21.s, z9.s, z5.s[2]\n" + "fmla z25.s, z9.s, z6.s[2]\n" + "fmla z29.s, z9.s, z7.s[2]\n" + "fmla z18.s, z10.s, z4.s[2]\n" + "fmla z22.s, z10.s, z5.s[2]\n" + "fmla z26.s, z10.s, z6.s[2]\n" + "fmla z30.s, z10.s, z7.s[2]\n" + "fmla z19.s, z11.s, z4.s[2]\n" + "fmla z23.s, z11.s, z5.s[2]\n" + "fmla z27.s, z11.s, z6.s[2]\n" + "fmla z31.s, z11.s, z7.s[2]\n" + "6:\n" + "st1w z16.s, p0, [%[c_ptr0]]\n" + "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n" + "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n" + "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n" + "addvl %[c_ptr0], %[c_ptr0], #4\n" + "st1w 
z20.s, p0, [c_ptr1]\n" + "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n" + "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n" + "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n" + "st1w z24.s, p0, [c_ptr2]\n" + "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n" + "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n" + "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n" + "st1w z28.s, p0, [c_ptr3]\n" + "st1w z29.s, p1, [c_ptr3, #1, MUL VL]\n" + "st1w z30.s, p2, [c_ptr3, #2, MUL VL]\n" + "st1w z31.s, p3, [c_ptr3, #3, MUL VL]\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb) + : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory" + ); + break; + } + } + } +} + +} // namespace arm_gemm + +#endif // __ARM_FEATURE_SVE \ No newline at end of file diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8.hpp index 3fd738e673..9d88b60cee 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 Arm Limited. + * Copyright (c) 2018-2019 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -43,7 +43,7 @@ public: /* Kernel blocking parameters */ static int out_width() { - return svcnth() * 3; + return get_vector_length<__fp16>() * 3; } static int out_height() diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8/generic.cpp index 92ec888244..517895ca7f 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 Arm Limited. + * Copyright (c) 2018-2019 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -48,24 +48,24 @@ void sve_interleaved_fp16_mla_3VLx8(const __fp16 *Apanel, const __fp16 *Bpanel, "mov z8.h, #0\n" "ptrue p0.h\n" "mov z9.h, #0\n" - "ld1rqh z0.h, p0/z, [%[a_ptr]]\n" "mov z10.h, #0\n" - "ld1h z2.h, p0/z, [%[b_ptr]]\n" "mov z11.h, #0\n" - "ld1h z3.h, p0/z, [%[b_ptr], #1, MUL VL]\n" "mov z12.h, #0\n" - "ld1h z4.h, p0/z, [%[b_ptr], #2, MUL VL]\n" + "ld1rqh z0.h, p0/z, [%[a_ptr]]\n" "mov z13.h, #0\n" - "ld1h z5.h, p0/z, [%[b_ptr], #3, MUL VL]\n" + "ld1h z2.h, p0/z, [%[b_ptr]]\n" "mov z14.h, #0\n" - "ld1h z6.h, p0/z, [%[b_ptr], #4, MUL VL]\n" + "ld1h z3.h, p0/z, [%[b_ptr], #1, MUL VL]\n" "mov z15.h, #0\n" - "add %[a_ptr], %[a_ptr], #0x20\n" + "ld1h z4.h, p0/z, [%[b_ptr], #2, MUL VL]\n" "mov z16.h, #0\n" - "addvl %[b_ptr], %[b_ptr], #6\n" + "ld1h z5.h, p0/z, [%[b_ptr], #3, MUL VL]\n" "mov z17.h, #0\n" + "ld1h z6.h, p0/z, [%[b_ptr], #4, MUL VL]\n" "mov z18.h, #0\n" + "add %[a_ptr], %[a_ptr], #0x20\n" "mov z19.h, #0\n" + "addvl %[b_ptr], %[b_ptr], #6\n" "mov z20.h, #0\n" "mov z21.h, #0\n" "mov z22.h, #0\n" @@ -199,37 +199,31 @@ void sve_interleaved_fp16_mla_3VLx8(const __fp16 *Apanel, const __fp16 *Bpanel, "fmla z30.h, z7.h, z1.h[6]\n" "fmla z31.h, z7.h, z1.h[7]\n" "fmla z8.h, z2.h, z0.h[0]\n" - "st1h z8.h, p0, [%[c_ptr]]\n" "fmla z9.h, 
z2.h, z0.h[1]\n" "fmla z10.h, z2.h, z0.h[2]\n" "fmla z11.h, z2.h, z0.h[3]\n" "fmla z12.h, z2.h, z0.h[4]\n" + "st1h z8.h, p0, [%[c_ptr]]\n" "fmla z13.h, z2.h, z0.h[5]\n" "fmla z14.h, z2.h, z0.h[6]\n" "fmla z15.h, z2.h, z0.h[7]\n" "fmla z16.h, z3.h, z0.h[0]\n" - "st1h z16.h, p0, [%[c_ptr], #1, MUL VL]\n" "fmla z17.h, z3.h, z0.h[1]\n" "fmla z18.h, z3.h, z0.h[2]\n" "fmla z19.h, z3.h, z0.h[3]\n" "fmla z20.h, z3.h, z0.h[4]\n" + "st1h z16.h, p0, [%[c_ptr], #1, MUL VL]\n" "fmla z21.h, z3.h, z0.h[5]\n" "fmla z22.h, z3.h, z0.h[6]\n" "fmla z23.h, z3.h, z0.h[7]\n" "fmla z24.h, z4.h, z0.h[0]\n" - "st1h z24.h, p0, [%[c_ptr], #2, MUL VL]\n" "fmla z25.h, z4.h, z0.h[1]\n" - "st1h z9.h, p0, [%[c_ptr], #3, MUL VL]\n" "fmla z26.h, z4.h, z0.h[2]\n" - "st1h z17.h, p0, [%[c_ptr], #4, MUL VL]\n" "fmla z27.h, z4.h, z0.h[3]\n" - "st1h z25.h, p0, [%[c_ptr], #5, MUL VL]\n" "fmla z28.h, z4.h, z0.h[4]\n" - "st1h z10.h, p0, [%[c_ptr], #6, MUL VL]\n" + "st1h z24.h, p0, [%[c_ptr], #2, MUL VL]\n" "fmla z29.h, z4.h, z0.h[5]\n" - "st1h z18.h, p0, [%[c_ptr], #7, MUL VL]\n" "fmla z30.h, z4.h, z0.h[6]\n" - "addvl %[c_ptr], %[c_ptr], #16\n" "fmla z31.h, z4.h, z0.h[7]\n" "b 4f\n" "3:\n" @@ -260,39 +254,39 @@ void sve_interleaved_fp16_mla_3VLx8(const __fp16 *Apanel, const __fp16 *Bpanel, "fmla z30.h, z4.h, z0.h[6]\n" "fmla z31.h, z4.h, z0.h[7]\n" "fmla z8.h, z5.h, z1.h[0]\n" - "st1h z8.h, p0, [%[c_ptr]]\n" "fmla z9.h, z5.h, z1.h[1]\n" "fmla z10.h, z5.h, z1.h[2]\n" "fmla z11.h, z5.h, z1.h[3]\n" "fmla z12.h, z5.h, z1.h[4]\n" + "st1h z8.h, p0, [%[c_ptr]]\n" "fmla z13.h, z5.h, z1.h[5]\n" "fmla z14.h, z5.h, z1.h[6]\n" "fmla z15.h, z5.h, z1.h[7]\n" "fmla z16.h, z6.h, z1.h[0]\n" - "st1h z16.h, p0, [%[c_ptr], #1, MUL VL]\n" "fmla z17.h, z6.h, z1.h[1]\n" "fmla z18.h, z6.h, z1.h[2]\n" "fmla z19.h, z6.h, z1.h[3]\n" "fmla z20.h, z6.h, z1.h[4]\n" + "st1h z16.h, p0, [%[c_ptr], #1, MUL VL]\n" "fmla z21.h, z6.h, z1.h[5]\n" "fmla z22.h, z6.h, z1.h[6]\n" "fmla z23.h, z6.h, z1.h[7]\n" "fmla z24.h, z7.h, z1.h[0]\n" - "st1h 
z24.h, p0, [%[c_ptr], #2, MUL VL]\n" "fmla z25.h, z7.h, z1.h[1]\n" - "st1h z9.h, p0, [%[c_ptr], #3, MUL VL]\n" "fmla z26.h, z7.h, z1.h[2]\n" - "st1h z17.h, p0, [%[c_ptr], #4, MUL VL]\n" "fmla z27.h, z7.h, z1.h[3]\n" - "st1h z25.h, p0, [%[c_ptr], #5, MUL VL]\n" "fmla z28.h, z7.h, z1.h[4]\n" - "st1h z10.h, p0, [%[c_ptr], #6, MUL VL]\n" + "st1h z24.h, p0, [%[c_ptr], #2, MUL VL]\n" "fmla z29.h, z7.h, z1.h[5]\n" - "st1h z18.h, p0, [%[c_ptr], #7, MUL VL]\n" "fmla z30.h, z7.h, z1.h[6]\n" - "addvl %[c_ptr], %[c_ptr], #16\n" "fmla z31.h, z7.h, z1.h[7]\n" "4:\n" + "st1h z9.h, p0, [%[c_ptr], #3, MUL VL]\n" + "st1h z17.h, p0, [%[c_ptr], #4, MUL VL]\n" + "st1h z25.h, p0, [%[c_ptr], #5, MUL VL]\n" + "st1h z10.h, p0, [%[c_ptr], #6, MUL VL]\n" + "st1h z18.h, p0, [%[c_ptr], #7, MUL VL]\n" + "addvl %[c_ptr], %[c_ptr], #16\n" "st1h z26.h, p0, [%[c_ptr], #-8, MUL VL]\n" "st1h z11.h, p0, [%[c_ptr], #-7, MUL VL]\n" "st1h z19.h, p0, [%[c_ptr], #-6, MUL VL]\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8.hpp index b2327f3070..2e8f261fe1 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 Arm Limited. + * Copyright (c) 2018-2019 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -43,7 +43,7 @@ public: /* Kernel blocking parameters */ static int out_width() { - return svcntw() * 3; + return get_vector_length<float>() * 3; } static int out_height() diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8/generic.cpp index bb08fc7cb0..88c984018e 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 Arm Limited. + * Copyright (c) 2018-2019 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -48,22 +48,22 @@ void sve_interleaved_fp32_mla_3VLx8(const float *Apanel, const float *Bpanel, fl "mov z8.s, #0\n" "ptrue p0.s\n" "mov z9.s, #0\n" - "ld1rqw z0.s, p0/z, [%[a_ptr]]\n" "mov z10.s, #0\n" - "ld1w z4.s, p0/z, [%[b_ptr]]\n" "mov z11.s, #0\n" - "ld1rqw z1.s, p0/z, [%[a_ptr], #0x10]\n" "mov z12.s, #0\n" - "ld1w z5.s, p0/z, [%[b_ptr], #1, MUL VL]\n" + "ld1rqw z0.s, p0/z, [%[a_ptr]]\n" "mov z13.s, #0\n" - "ld1rqw z2.s, p0/z, [%[a_ptr], #0x20]\n" + "ld1w z4.s, p0/z, [%[b_ptr]]\n" "mov z14.s, #0\n" - "add %[a_ptr], %[a_ptr], #0x40\n" + "ld1rqw z1.s, p0/z, [%[a_ptr], #0x10]\n" "mov z15.s, #0\n" - "addvl %[b_ptr], %[b_ptr], #3\n" + "ld1w z5.s, p0/z, [%[b_ptr], #1, MUL VL]\n" "mov z16.s, #0\n" + "ld1rqw z2.s, p0/z, [%[a_ptr], #0x20]\n" "mov z17.s, #0\n" + "add %[a_ptr], %[a_ptr], #0x40\n" "mov z18.s, #0\n" + "addvl %[b_ptr], %[b_ptr], #3\n" "mov z19.s, #0\n" "mov z20.s, #0\n" "mov z21.s, #0\n" @@ -204,37 +204,31 @@ void sve_interleaved_fp32_mla_3VLx8(const float *Apanel, const float *Bpanel, fl "fmla z31.s, z6.s, z3.s[3]\n" "ld1w z6.s, p0/z, [%[b_ptr], #-1, MUL VL]\n" "fmla z8.s, z4.s, z0.s[0]\n" - "st1w z8.s, p0, [%[c_ptr]]\n" "fmla z9.s, z4.s, z0.s[1]\n" "fmla z10.s, z4.s, z0.s[2]\n" "fmla z11.s, z4.s, z0.s[3]\n" "fmla z20.s, z4.s, z1.s[0]\n" + 
"st1w z8.s, p0, [%[c_ptr]]\n" "fmla z21.s, z4.s, z1.s[1]\n" "fmla z22.s, z4.s, z1.s[2]\n" "fmla z23.s, z4.s, z1.s[3]\n" "fmla z12.s, z5.s, z0.s[0]\n" - "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n" "fmla z13.s, z5.s, z0.s[1]\n" "fmla z14.s, z5.s, z0.s[2]\n" "fmla z15.s, z5.s, z0.s[3]\n" "fmla z24.s, z5.s, z1.s[0]\n" + "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n" "fmla z25.s, z5.s, z1.s[1]\n" "fmla z26.s, z5.s, z1.s[2]\n" "fmla z27.s, z5.s, z1.s[3]\n" "fmla z16.s, z6.s, z0.s[0]\n" - "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n" "fmla z17.s, z6.s, z0.s[1]\n" - "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n" "fmla z18.s, z6.s, z0.s[2]\n" - "st1w z13.s, p0, [%[c_ptr], #4, MUL VL]\n" "fmla z19.s, z6.s, z0.s[3]\n" - "st1w z17.s, p0, [%[c_ptr], #5, MUL VL]\n" "fmla z28.s, z6.s, z1.s[0]\n" - "st1w z10.s, p0, [%[c_ptr], #6, MUL VL]\n" + "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n" "fmla z29.s, z6.s, z1.s[1]\n" - "st1w z14.s, p0, [%[c_ptr], #7, MUL VL]\n" "fmla z30.s, z6.s, z1.s[2]\n" - "addvl %[c_ptr], %[c_ptr], #16\n" "fmla z31.s, z6.s, z1.s[3]\n" "b 4f\n" "3:\n" @@ -269,39 +263,39 @@ void sve_interleaved_fp32_mla_3VLx8(const float *Apanel, const float *Bpanel, fl "fmla z31.s, z6.s, z1.s[3]\n" "ld1w z6.s, p0/z, [%[b_ptr], #-1, MUL VL]\n" "fmla z8.s, z4.s, z2.s[0]\n" - "st1w z8.s, p0, [%[c_ptr]]\n" "fmla z9.s, z4.s, z2.s[1]\n" "fmla z10.s, z4.s, z2.s[2]\n" "fmla z11.s, z4.s, z2.s[3]\n" "fmla z20.s, z4.s, z3.s[0]\n" + "st1w z8.s, p0, [%[c_ptr]]\n" "fmla z21.s, z4.s, z3.s[1]\n" "fmla z22.s, z4.s, z3.s[2]\n" "fmla z23.s, z4.s, z3.s[3]\n" "fmla z12.s, z5.s, z2.s[0]\n" - "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n" "fmla z13.s, z5.s, z2.s[1]\n" "fmla z14.s, z5.s, z2.s[2]\n" "fmla z15.s, z5.s, z2.s[3]\n" "fmla z24.s, z5.s, z3.s[0]\n" + "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n" "fmla z25.s, z5.s, z3.s[1]\n" "fmla z26.s, z5.s, z3.s[2]\n" "fmla z27.s, z5.s, z3.s[3]\n" "fmla z16.s, z6.s, z2.s[0]\n" - "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n" "fmla z17.s, z6.s, z2.s[1]\n" - "st1w z9.s, p0, [%[c_ptr], 
#3, MUL VL]\n" "fmla z18.s, z6.s, z2.s[2]\n" - "st1w z13.s, p0, [%[c_ptr], #4, MUL VL]\n" "fmla z19.s, z6.s, z2.s[3]\n" - "st1w z17.s, p0, [%[c_ptr], #5, MUL VL]\n" "fmla z28.s, z6.s, z3.s[0]\n" - "st1w z10.s, p0, [%[c_ptr], #6, MUL VL]\n" + "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n" "fmla z29.s, z6.s, z3.s[1]\n" - "st1w z14.s, p0, [%[c_ptr], #7, MUL VL]\n" "fmla z30.s, z6.s, z3.s[2]\n" - "addvl %[c_ptr], %[c_ptr], #16\n" "fmla z31.s, z6.s, z3.s[3]\n" "4:\n" + "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n" + "st1w z13.s, p0, [%[c_ptr], #4, MUL VL]\n" + "st1w z17.s, p0, [%[c_ptr], #5, MUL VL]\n" + "st1w z10.s, p0, [%[c_ptr], #6, MUL VL]\n" + "st1w z14.s, p0, [%[c_ptr], #7, MUL VL]\n" + "addvl %[c_ptr], %[c_ptr], #16\n" "st1w z18.s, p0, [%[c_ptr], #-8, MUL VL]\n" "st1w z11.s, p0, [%[c_ptr], #-7, MUL VL]\n" "st1w z15.s, p0, [%[c_ptr], #-6, MUL VL]\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8.hpp index 91aa567d4a..67154e6a3f 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 Arm Limited. + * Copyright (c) 2018-2019 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -43,7 +43,7 @@ public: /* Kernel blocking parameters */ static int out_width() { - return svcntw() * 3; + return get_vector_length() * 3; } static int out_height() diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8/generic.cpp index 2e994a13f3..d679c211ef 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 Arm Limited. 
+ * Copyright (c) 2018-2019 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -49,22 +49,22 @@ void sve_interleaved_s8s32_dot_3VLx8(const int8_t *Apanel, const int8_t *Bpanel, "mov z8.s, #0\n" "ptrue p0.b\n" "mov z9.s, #0\n" - "ld1rqb z0.b, p0/z, [%[a_ptr]]\n" "mov z10.s, #0\n" - "ld1b z4.b, p0/z, [%[b_ptr]]\n" "mov z11.s, #0\n" - "ld1rqb z1.b, p0/z, [%[a_ptr], #0x10]\n" "mov z12.s, #0\n" - "ld1b z5.b, p0/z, [%[b_ptr], #1, MUL VL]\n" + "ld1rqb z0.b, p0/z, [%[a_ptr]]\n" "mov z13.s, #0\n" - "ld1rqb z2.b, p0/z, [%[a_ptr], #0x20]\n" + "ld1b z4.b, p0/z, [%[b_ptr]]\n" "mov z14.s, #0\n" - "add %[a_ptr], %[a_ptr], #0x40\n" + "ld1rqb z1.b, p0/z, [%[a_ptr], #0x10]\n" "mov z15.s, #0\n" - "addvl %[b_ptr], %[b_ptr], #3\n" + "ld1b z5.b, p0/z, [%[b_ptr], #1, MUL VL]\n" "mov z16.s, #0\n" + "ld1rqb z2.b, p0/z, [%[a_ptr], #0x20]\n" "mov z17.s, #0\n" + "add %[a_ptr], %[a_ptr], #0x40\n" "mov z18.s, #0\n" + "addvl %[b_ptr], %[b_ptr], #3\n" "mov z19.s, #0\n" "mov z20.s, #0\n" "mov z21.s, #0\n" @@ -205,37 +205,31 @@ void sve_interleaved_s8s32_dot_3VLx8(const int8_t *Apanel, const int8_t *Bpanel, "sdot z31.s, z6.b, z3.b[3]\n" "ld1b z6.b, p0/z, [%[b_ptr], #-1, MUL VL]\n" "sdot z8.s, z4.b, z0.b[0]\n" - "st1w z8.s, p0, [%[c_ptr]]\n" "sdot z9.s, z4.b, z0.b[1]\n" "sdot z10.s, z4.b, z0.b[2]\n" "sdot z11.s, z4.b, z0.b[3]\n" "sdot z20.s, z4.b, z1.b[0]\n" + "st1w z8.s, p0, [%[c_ptr]]\n" "sdot z21.s, z4.b, z1.b[1]\n" "sdot z22.s, z4.b, z1.b[2]\n" "sdot z23.s, z4.b, z1.b[3]\n" "sdot z12.s, z5.b, z0.b[0]\n" - "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n" "sdot z13.s, z5.b, z0.b[1]\n" "sdot z14.s, z5.b, z0.b[2]\n" "sdot z15.s, z5.b, z0.b[3]\n" "sdot z24.s, z5.b, z1.b[0]\n" + "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n" "sdot z25.s, z5.b, z1.b[1]\n" "sdot z26.s, z5.b, z1.b[2]\n" "sdot z27.s, z5.b, z1.b[3]\n" "sdot z16.s, z6.b, z0.b[0]\n" - "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n" "sdot z17.s, z6.b, z0.b[1]\n" - "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n" "sdot z18.s, z6.b, z0.b[2]\n" - "st1w z13.s, p0, 
[%[c_ptr], #4, MUL VL]\n" "sdot z19.s, z6.b, z0.b[3]\n" - "st1w z17.s, p0, [%[c_ptr], #5, MUL VL]\n" "sdot z28.s, z6.b, z1.b[0]\n" - "st1w z10.s, p0, [%[c_ptr], #6, MUL VL]\n" + "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n" "sdot z29.s, z6.b, z1.b[1]\n" - "st1w z14.s, p0, [%[c_ptr], #7, MUL VL]\n" "sdot z30.s, z6.b, z1.b[2]\n" - "addvl %[c_ptr], %[c_ptr], #16\n" "sdot z31.s, z6.b, z1.b[3]\n" "b 4f\n" "3:\n" @@ -270,39 +264,39 @@ void sve_interleaved_s8s32_dot_3VLx8(const int8_t *Apanel, const int8_t *Bpanel, "sdot z31.s, z6.b, z1.b[3]\n" "ld1b z6.b, p0/z, [%[b_ptr], #-1, MUL VL]\n" "sdot z8.s, z4.b, z2.b[0]\n" - "st1w z8.s, p0, [%[c_ptr]]\n" "sdot z9.s, z4.b, z2.b[1]\n" "sdot z10.s, z4.b, z2.b[2]\n" "sdot z11.s, z4.b, z2.b[3]\n" "sdot z20.s, z4.b, z3.b[0]\n" + "st1w z8.s, p0, [%[c_ptr]]\n" "sdot z21.s, z4.b, z3.b[1]\n" "sdot z22.s, z4.b, z3.b[2]\n" "sdot z23.s, z4.b, z3.b[3]\n" "sdot z12.s, z5.b, z2.b[0]\n" - "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n" "sdot z13.s, z5.b, z2.b[1]\n" "sdot z14.s, z5.b, z2.b[2]\n" "sdot z15.s, z5.b, z2.b[3]\n" "sdot z24.s, z5.b, z3.b[0]\n" + "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n" "sdot z25.s, z5.b, z3.b[1]\n" "sdot z26.s, z5.b, z3.b[2]\n" "sdot z27.s, z5.b, z3.b[3]\n" "sdot z16.s, z6.b, z2.b[0]\n" - "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n" "sdot z17.s, z6.b, z2.b[1]\n" - "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n" "sdot z18.s, z6.b, z2.b[2]\n" - "st1w z13.s, p0, [%[c_ptr], #4, MUL VL]\n" "sdot z19.s, z6.b, z2.b[3]\n" - "st1w z17.s, p0, [%[c_ptr], #5, MUL VL]\n" "sdot z28.s, z6.b, z3.b[0]\n" - "st1w z10.s, p0, [%[c_ptr], #6, MUL VL]\n" + "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n" "sdot z29.s, z6.b, z3.b[1]\n" - "st1w z14.s, p0, [%[c_ptr], #7, MUL VL]\n" "sdot z30.s, z6.b, z3.b[2]\n" - "addvl %[c_ptr], %[c_ptr], #16\n" "sdot z31.s, z6.b, z3.b[3]\n" "4:\n" + "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n" + "st1w z13.s, p0, [%[c_ptr], #4, MUL VL]\n" + "st1w z17.s, p0, [%[c_ptr], #5, MUL VL]\n" + "st1w z10.s, p0, [%[c_ptr], #6, MUL VL]\n" + 
"st1w z14.s, p0, [%[c_ptr], #7, MUL VL]\n" + "addvl %[c_ptr], %[c_ptr], #16\n" "st1w z18.s, p0, [%[c_ptr], #-8, MUL VL]\n" "st1w z11.s, p0, [%[c_ptr], #-7, MUL VL]\n" "st1w z15.s, p0, [%[c_ptr], #-6, MUL VL]\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8.hpp index ef457e454f..628c5a868e 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 Arm Limited. + * Copyright (c) 2018-2019 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -43,7 +43,7 @@ public: /* Kernel blocking parameters */ static int out_width() { - return svcntw() * 3; + return get_vector_length() * 3; } static int out_height() diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4.hpp new file mode 100644 index 0000000000..fcc80d9fe5 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4.hpp @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +#ifdef __ARM_FEATURE_SVE + + + +namespace arm_gemm +{ + +// Actual kernel implementations +void sve_native_fp32_mla_4VLx4(const float *, int, const float *, int ldb, float *, int, float, int, int, int); + +class native_fp32_mla_4VLx4 +{ +public: + typedef float operand_type; + typedef float result_type; + + typedef void (*kern_type)(const float *, int, const float *, int ldb, float *, int, float, int, int, int); + + /* Kernel blocking parameters */ + static int out_height() + { + return 4; + } + + static int out_width() + { + return get_vector_length() * 4; + } + + static int k_unroll() + { + return 1; + } + + + + // Default to the generic kernel + kern_type kernel=sve_native_fp32_mla_4VLx4; + + native_fp32_mla_4VLx4(const CPUInfo *ci) + { + + } +}; + +} // namespace arm_gemm + +#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4/generic.cpp new file mode 100644 index 0000000000..6e225669fc --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4/generic.cpp @@ -0,0 +1,2066 @@ +/* + * Copyright (c) 2019 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifdef __ARM_FEATURE_SVE + +#include <algorithm> + + +#include "../../asmlib.hpp" +#include "../../utils.hpp" + +namespace arm_gemm { + +void sve_native_fp32_mla_4VLx4(const float *A, int lda, const float *B, int ldb, float *C, int ldc, float beta, int M, int N, int K) { + const long beta0 = (beta == 0.0f); + const long loops_count = ((K + 4) / 8) - 1; + K -= loops_count * 8; + const long regs_count = (K / 4) - 1; + K -= (regs_count + 1) * 4; + const long leftovers = K; + + for (int y=0; y<M; y+=4) { + const float * const a_ptr0_base = A + (y * lda); + const unsigned long ldab = lda * sizeof(float); + + float *c_ptr0 = C + (y * ldc); + const unsigned long ldcb = ldc * sizeof(float); + + for (int x0=0; x0<N; x0+=(4 * get_vector_length<float>())) { + const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<float>())); + const float *betaptr = &beta; + long loops = loops_count; + long regs = regs_count; + long temp = 0; + long blocks = leftovers; + const float *a_ptr0 = a_ptr0_base; + const float *b_ptr0 = B + x0; + long ldbb = ldb * sizeof(float); + + switch(M-y) { + case 1: + __asm __volatile ( + "whilelt p6.s, %[temp], %[leftovers]\n" + "whilelt p0.s, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "ptrue p7.s\n" + "whilelt p1.s, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "whilelt p2.s, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "whilelt p3.s, %[temp], %[width]\n" + "cbz %[beta0], 1f\n" + "mov z16.s, #0\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "mov z17.s, #0\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "mov z18.s, #0\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "mov z19.s, #0\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "b 2f\n" + "1:\n" + "ld1rw z15.s, p7/z, [%[betaptr]]\n" + "ld1w z16.s, p0/z, [%[c_ptr0]]\n" + "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" + "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n" + "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "fmul z16.s, p7/m, z16.s, z15.s\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "fmul z17.s, p7/m, z17.s, z15.s\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmul z18.s,
p7/m, z18.s, z15.s\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmul z19.s, p7/m, z19.s, z15.s\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "2:\n" + "cbz %[loops], 3f\n" + "4:\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z16.s, z8.s, z0.s[0]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z17.s, z9.s, z0.s[0]\n" + "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" + "fmla z18.s, z10.s, z0.s[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z19.s, z11.s, z0.s[0]\n" + "add %[a_ptr0], %[a_ptr0], #0x20\n" + "fmla z16.s, z12.s, z0.s[1]\n" + "subs %[loops], %[loops], #0x1\n" + "fmla z17.s, z13.s, z0.s[1]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "fmla z18.s, z14.s, z0.s[1]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z19.s, z15.s, z0.s[1]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z16.s, z8.s, z0.s[2]\n" + "fmla z17.s, z9.s, z0.s[2]\n" + "fmla z18.s, z10.s, z0.s[2]\n" + "fmla z19.s, z11.s, z0.s[2]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z16.s, z12.s, z0.s[3]\n" + "fmla z17.s, z13.s, z0.s[3]\n" + "fmla z18.s, z14.s, z0.s[3]\n" + "fmla z19.s, z15.s, z0.s[3]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z16.s, z8.s, z4.s[0]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n" + "fmla z17.s, z9.s, z4.s[0]\n" + "fmla z18.s, z10.s, z4.s[0]\n" + "fmla z19.s, z11.s, z4.s[0]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL 
VL]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z16.s, z12.s, z4.s[1]\n" + "fmla z17.s, z13.s, z4.s[1]\n" + "fmla z18.s, z14.s, z4.s[1]\n" + "fmla z19.s, z15.s, z4.s[1]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z16.s, z8.s, z4.s[2]\n" + "fmla z17.s, z9.s, z4.s[2]\n" + "fmla z18.s, z10.s, z4.s[2]\n" + "fmla z19.s, z11.s, z4.s[2]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z16.s, z12.s, z4.s[3]\n" + "fmla z17.s, z13.s, z4.s[3]\n" + "fmla z18.s, z14.s, z4.s[3]\n" + "fmla z19.s, z15.s, z4.s[3]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "b.ne 4b\n" + "3:\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "cbz %[regs], 5f\n" + "fmla z16.s, z8.s, z0.s[0]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z17.s, z9.s, z0.s[0]\n" + "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" + "fmla z18.s, z10.s, z0.s[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z19.s, z11.s, z0.s[0]\n" + "fmla z16.s, z12.s, z0.s[1]\n" + "fmla z17.s, z13.s, z0.s[1]\n" + "fmla z18.s, z14.s, z0.s[1]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z19.s, z15.s, z0.s[1]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z16.s, z8.s, 
z0.s[2]\n" + "fmla z17.s, z9.s, z0.s[2]\n" + "fmla z18.s, z10.s, z0.s[2]\n" + "fmla z19.s, z11.s, z0.s[2]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z16.s, z12.s, z0.s[3]\n" + "fmla z17.s, z13.s, z0.s[3]\n" + "fmla z18.s, z14.s, z0.s[3]\n" + "fmla z19.s, z15.s, z0.s[3]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z16.s, z8.s, z4.s[0]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" + "fmla z17.s, z9.s, z4.s[0]\n" + "fmla z18.s, z10.s, z4.s[0]\n" + "fmla z19.s, z11.s, z4.s[0]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z16.s, z12.s, z4.s[1]\n" + "fmla z17.s, z13.s, z4.s[1]\n" + "fmla z18.s, z14.s, z4.s[1]\n" + "fmla z19.s, z15.s, z4.s[1]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z16.s, z8.s, z4.s[2]\n" + "fmla z17.s, z9.s, z4.s[2]\n" + "fmla z18.s, z10.s, z4.s[2]\n" + "fmla z19.s, z11.s, z4.s[2]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z12.s, z4.s[3]\n" + "fmla z17.s, z13.s, z4.s[3]\n" + "fmla z18.s, z14.s, z4.s[3]\n" + "fmla z19.s, z15.s, z4.s[3]\n" + "cbz %[blocks], 6f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + 
"ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z8.s, z0.s[0]\n" + "fmla z17.s, z9.s, z0.s[0]\n" + "fmla z18.s, z10.s, z0.s[0]\n" + "fmla z19.s, z11.s, z0.s[0]\n" + "b.eq 6f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z12.s, z0.s[1]\n" + "fmla z17.s, z13.s, z0.s[1]\n" + "fmla z18.s, z14.s, z0.s[1]\n" + "fmla z19.s, z15.s, z0.s[1]\n" + "b.eq 6f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z8.s, z0.s[2]\n" + "fmla z17.s, z9.s, z0.s[2]\n" + "fmla z18.s, z10.s, z0.s[2]\n" + "fmla z19.s, z11.s, z0.s[2]\n" + "b 6f\n" + "5:\n" + "fmla z16.s, z8.s, z0.s[0]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z17.s, z9.s, z0.s[0]\n" + "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n" + "fmla z18.s, z10.s, z0.s[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z19.s, z11.s, z0.s[0]\n" + "fmla z16.s, z12.s, z0.s[1]\n" + "fmla z17.s, z13.s, z0.s[1]\n" + "fmla z18.s, z14.s, z0.s[1]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z19.s, z15.s, z0.s[1]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z16.s, z8.s, z0.s[2]\n" + "fmla z17.s, z9.s, z0.s[2]\n" + "fmla z18.s, z10.s, z0.s[2]\n" + "fmla z19.s, z11.s, z0.s[2]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z12.s, 
z0.s[3]\n" + "fmla z17.s, z13.s, z0.s[3]\n" + "fmla z18.s, z14.s, z0.s[3]\n" + "fmla z19.s, z15.s, z0.s[3]\n" + "cbz %[blocks], 6f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z8.s, z4.s[0]\n" + "fmla z17.s, z9.s, z4.s[0]\n" + "fmla z18.s, z10.s, z4.s[0]\n" + "fmla z19.s, z11.s, z4.s[0]\n" + "b.eq 6f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z12.s, z4.s[1]\n" + "fmla z17.s, z13.s, z4.s[1]\n" + "fmla z18.s, z14.s, z4.s[1]\n" + "fmla z19.s, z15.s, z4.s[1]\n" + "b.eq 6f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z8.s, z4.s[2]\n" + "fmla z17.s, z9.s, z4.s[2]\n" + "fmla z18.s, z10.s, z4.s[2]\n" + "fmla z19.s, z11.s, z4.s[2]\n" + "6:\n" + "st1w z16.s, p0, [%[c_ptr0]]\n" + "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n" + "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n" + "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n" + "addvl %[c_ptr0], %[c_ptr0], #4\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb) + : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", 
"z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 2: + __asm __volatile ( + "a_ptr1 .req X0\n" + "c_ptr1 .req X1\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[leftovers]\n" + "whilelt p0.s, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "ptrue p7.s\n" + "whilelt p1.s, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "whilelt p2.s, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "whilelt p3.s, %[temp], %[width]\n" + "cbz %[beta0], 1f\n" + "mov z16.s, #0\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "mov z17.s, #0\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "mov z18.s, #0\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "mov z19.s, #0\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "mov z20.s, #0\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "mov z21.s, #0\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "mov z22.s, #0\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "mov z23.s, #0\n" + "b 2f\n" + "1:\n" + "ld1rw z15.s, p7/z, [%[betaptr]]\n" + "ld1w z16.s, p0/z, [%[c_ptr0]]\n" + "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" + "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n" + "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n" + "ld1w z20.s, p0/z, [c_ptr1]\n" + "fmul z16.s, p7/m, z16.s, z15.s\n" + "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n" + "fmul z17.s, p7/m, z17.s, z15.s\n" + "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n" + "fmul z18.s, p7/m, z18.s, z15.s\n" + "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n" + "fmul z19.s, p7/m, z19.s, z15.s\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "fmul z20.s, p7/m, z20.s, z15.s\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "fmul z21.s, p7/m, z21.s, z15.s\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "fmul z22.s, p7/m, z22.s, z15.s\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmul z23.s, p7/m, z23.s, z15.s\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "2:\n" + "add a_ptr1, 
a_ptr1, #0x10\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "cbz %[loops], 3f\n" + "4:\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z16.s, z8.s, z0.s[0]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z20.s, z8.s, z1.s[0]\n" + "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" + "fmla z17.s, z9.s, z0.s[0]\n" + "ld1rqw z5.s, p7/z, [a_ptr1]\n" + "fmla z21.s, z9.s, z1.s[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z18.s, z10.s, z0.s[0]\n" + "add %[a_ptr0], %[a_ptr0], #0x20\n" + "fmla z22.s, z10.s, z1.s[0]\n" + "add a_ptr1, a_ptr1, #0x20\n" + "fmla z19.s, z11.s, z0.s[0]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "fmla z23.s, z11.s, z1.s[0]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z16.s, z12.s, z0.s[1]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z20.s, z12.s, z1.s[1]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z17.s, z13.s, z0.s[1]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z21.s, z13.s, z1.s[1]\n" + "subs %[loops], %[loops], #0x1\n" + "fmla z18.s, z14.s, z0.s[1]\n" + "fmla z22.s, z14.s, z1.s[1]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "fmla z19.s, z15.s, z0.s[1]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z23.s, z15.s, z1.s[1]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z16.s, z8.s, z0.s[2]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z20.s, z8.s, z1.s[2]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z17.s, z9.s, z0.s[2]\n" + "fmla z21.s, z9.s, z1.s[2]\n" + "fmla z18.s, z10.s, z0.s[2]\n" + "fmla z22.s, z10.s, z1.s[2]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "fmla z19.s, z11.s, z0.s[2]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z23.s, z11.s, z1.s[2]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z16.s, z12.s, z0.s[3]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z20.s, z12.s, z1.s[3]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla 
z17.s, z13.s, z0.s[3]\n" + "fmla z21.s, z13.s, z1.s[3]\n" + "fmla z18.s, z14.s, z0.s[3]\n" + "fmla z22.s, z14.s, z1.s[3]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "fmla z19.s, z15.s, z0.s[3]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z23.s, z15.s, z1.s[3]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z16.s, z8.s, z4.s[0]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z20.s, z8.s, z5.s[0]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n" + "fmla z17.s, z9.s, z4.s[0]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n" + "fmla z21.s, z9.s, z5.s[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z18.s, z10.s, z4.s[0]\n" + "fmla z22.s, z10.s, z5.s[0]\n" + "fmla z19.s, z11.s, z4.s[0]\n" + "fmla z23.s, z11.s, z5.s[0]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "fmla z16.s, z12.s, z4.s[1]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z20.s, z12.s, z5.s[1]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z17.s, z13.s, z4.s[1]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z21.s, z13.s, z5.s[1]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z18.s, z14.s, z4.s[1]\n" + "fmla z22.s, z14.s, z5.s[1]\n" + "fmla z19.s, z15.s, z4.s[1]\n" + "fmla z23.s, z15.s, z5.s[1]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "fmla z16.s, z8.s, z4.s[2]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z20.s, z8.s, z5.s[2]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z17.s, z9.s, z4.s[2]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z21.s, z9.s, z5.s[2]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z18.s, z10.s, z4.s[2]\n" + "fmla z22.s, z10.s, z5.s[2]\n" + "fmla z19.s, z11.s, z4.s[2]\n" + "fmla z23.s, z11.s, z5.s[2]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "fmla z16.s, z12.s, z4.s[3]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z20.s, z12.s, z5.s[3]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z17.s, z13.s, z4.s[3]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + 
"fmla z21.s, z13.s, z5.s[3]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z18.s, z14.s, z4.s[3]\n" + "fmla z22.s, z14.s, z5.s[3]\n" + "fmla z19.s, z15.s, z4.s[3]\n" + "fmla z23.s, z15.s, z5.s[3]\n" + "b.ne 4b\n" + "3:\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "cbz %[regs], 5f\n" + "fmla z16.s, z8.s, z0.s[0]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z20.s, z8.s, z1.s[0]\n" + "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" + "fmla z17.s, z9.s, z0.s[0]\n" + "ld1rqw z5.s, p7/z, [a_ptr1]\n" + "fmla z21.s, z9.s, z1.s[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z18.s, z10.s, z0.s[0]\n" + "fmla z22.s, z10.s, z1.s[0]\n" + "fmla z19.s, z11.s, z0.s[0]\n" + "fmla z23.s, z11.s, z1.s[0]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "fmla z16.s, z12.s, z0.s[1]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z20.s, z12.s, z1.s[1]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z17.s, z13.s, z0.s[1]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z21.s, z13.s, z1.s[1]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z18.s, z14.s, z0.s[1]\n" + "fmla z22.s, z14.s, z1.s[1]\n" + "fmla z19.s, z15.s, z0.s[1]\n" + "fmla z23.s, z15.s, z1.s[1]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "fmla z16.s, z8.s, z0.s[2]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z20.s, z8.s, z1.s[2]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z17.s, z9.s, z0.s[2]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z21.s, z9.s, z1.s[2]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z18.s, z10.s, z0.s[2]\n" + "fmla z22.s, z10.s, z1.s[2]\n" + "fmla z19.s, z11.s, z0.s[2]\n" + "fmla z23.s, z11.s, z1.s[2]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "fmla z16.s, z12.s, z0.s[3]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z20.s, z12.s, z1.s[3]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z17.s, z13.s, z0.s[3]\n" + "ld1w 
z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z21.s, z13.s, z1.s[3]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z18.s, z14.s, z0.s[3]\n" + "fmla z22.s, z14.s, z1.s[3]\n" + "fmla z19.s, z15.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" + "fmla z23.s, z15.s, z1.s[3]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "fmla z16.s, z8.s, z4.s[0]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z20.s, z8.s, z5.s[0]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z17.s, z9.s, z4.s[0]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z21.s, z9.s, z5.s[0]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n" + "fmla z18.s, z10.s, z4.s[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z22.s, z10.s, z5.s[0]\n" + "fmla z19.s, z11.s, z4.s[0]\n" + "fmla z23.s, z11.s, z5.s[0]\n" + "fmla z16.s, z12.s, z4.s[1]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "fmla z20.s, z12.s, z5.s[1]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z17.s, z13.s, z4.s[1]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z21.s, z13.s, z5.s[1]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z18.s, z14.s, z4.s[1]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z22.s, z14.s, z5.s[1]\n" + "fmla z19.s, z15.s, z4.s[1]\n" + "fmla z23.s, z15.s, z5.s[1]\n" + "fmla z16.s, z8.s, z4.s[2]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "fmla z20.s, z8.s, z5.s[2]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z17.s, z9.s, z4.s[2]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z21.s, z9.s, z5.s[2]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z18.s, z10.s, z4.s[2]\n" + "fmla z22.s, z10.s, z5.s[2]\n" + "fmla z19.s, z11.s, z4.s[2]\n" + "fmla z23.s, z11.s, z5.s[2]\n" + "fmla z16.s, z12.s, z4.s[3]\n" + "fmla z20.s, z12.s, z5.s[3]\n" + "fmla z17.s, z13.s, z4.s[3]\n" + "fmla z21.s, z13.s, z5.s[3]\n" + "fmla z18.s, z14.s, z4.s[3]\n" + "fmla z22.s, z14.s, z5.s[3]\n" + "fmla z19.s, z15.s, z4.s[3]\n" + "fmla z23.s, z15.s, z5.s[3]\n" + "cbz 
%[blocks], 6f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z8.s, z0.s[0]\n" + "fmla z20.s, z8.s, z1.s[0]\n" + "fmla z17.s, z9.s, z0.s[0]\n" + "fmla z21.s, z9.s, z1.s[0]\n" + "fmla z18.s, z10.s, z0.s[0]\n" + "fmla z22.s, z10.s, z1.s[0]\n" + "fmla z19.s, z11.s, z0.s[0]\n" + "fmla z23.s, z11.s, z1.s[0]\n" + "b.eq 6f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z12.s, z0.s[1]\n" + "fmla z20.s, z12.s, z1.s[1]\n" + "fmla z17.s, z13.s, z0.s[1]\n" + "fmla z21.s, z13.s, z1.s[1]\n" + "fmla z18.s, z14.s, z0.s[1]\n" + "fmla z22.s, z14.s, z1.s[1]\n" + "fmla z19.s, z15.s, z0.s[1]\n" + "fmla z23.s, z15.s, z1.s[1]\n" + "b.eq 6f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z8.s, z0.s[2]\n" + "fmla z20.s, z8.s, z1.s[2]\n" + "fmla z17.s, z9.s, z0.s[2]\n" + "fmla z21.s, z9.s, z1.s[2]\n" + "fmla z18.s, z10.s, z0.s[2]\n" + "fmla z22.s, z10.s, z1.s[2]\n" + "fmla z19.s, z11.s, z0.s[2]\n" + "fmla z23.s, z11.s, z1.s[2]\n" + "b 6f\n" + "5:\n" + "fmla z16.s, z8.s, z0.s[0]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z20.s, z8.s, z1.s[0]\n" + "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n" + "fmla z17.s, z9.s, z0.s[0]\n" + "ld1rqw z5.s, p6/z, [a_ptr1]\n" + "fmla z21.s, z9.s, z1.s[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z18.s, z10.s, z0.s[0]\n" + "fmla z22.s, z10.s, z1.s[0]\n" + "fmla z19.s, z11.s, z0.s[0]\n" + "fmla z23.s, z11.s, z1.s[0]\n" + 
"ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "fmla z16.s, z12.s, z0.s[1]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z20.s, z12.s, z1.s[1]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z17.s, z13.s, z0.s[1]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z21.s, z13.s, z1.s[1]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z18.s, z14.s, z0.s[1]\n" + "fmla z22.s, z14.s, z1.s[1]\n" + "fmla z19.s, z15.s, z0.s[1]\n" + "fmla z23.s, z15.s, z1.s[1]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "fmla z16.s, z8.s, z0.s[2]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z20.s, z8.s, z1.s[2]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z17.s, z9.s, z0.s[2]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z21.s, z9.s, z1.s[2]\n" + "fmla z18.s, z10.s, z0.s[2]\n" + "fmla z22.s, z10.s, z1.s[2]\n" + "fmla z19.s, z11.s, z0.s[2]\n" + "fmla z23.s, z11.s, z1.s[2]\n" + "fmla z16.s, z12.s, z0.s[3]\n" + "fmla z20.s, z12.s, z1.s[3]\n" + "fmla z17.s, z13.s, z0.s[3]\n" + "fmla z21.s, z13.s, z1.s[3]\n" + "fmla z18.s, z14.s, z0.s[3]\n" + "fmla z22.s, z14.s, z1.s[3]\n" + "fmla z19.s, z15.s, z0.s[3]\n" + "fmla z23.s, z15.s, z1.s[3]\n" + "cbz %[blocks], 6f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z8.s, z4.s[0]\n" + "fmla z20.s, z8.s, z5.s[0]\n" + "fmla z17.s, z9.s, z4.s[0]\n" + "fmla z21.s, z9.s, z5.s[0]\n" + "fmla z18.s, z10.s, z4.s[0]\n" + "fmla z22.s, z10.s, z5.s[0]\n" + "fmla z19.s, z11.s, z4.s[0]\n" + "fmla z23.s, z11.s, z5.s[0]\n" + "b.eq 6f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + 
"fmla z16.s, z12.s, z4.s[1]\n" + "fmla z20.s, z12.s, z5.s[1]\n" + "fmla z17.s, z13.s, z4.s[1]\n" + "fmla z21.s, z13.s, z5.s[1]\n" + "fmla z18.s, z14.s, z4.s[1]\n" + "fmla z22.s, z14.s, z5.s[1]\n" + "fmla z19.s, z15.s, z4.s[1]\n" + "fmla z23.s, z15.s, z5.s[1]\n" + "b.eq 6f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z8.s, z4.s[2]\n" + "fmla z20.s, z8.s, z5.s[2]\n" + "fmla z17.s, z9.s, z4.s[2]\n" + "fmla z21.s, z9.s, z5.s[2]\n" + "fmla z18.s, z10.s, z4.s[2]\n" + "fmla z22.s, z10.s, z5.s[2]\n" + "fmla z19.s, z11.s, z4.s[2]\n" + "fmla z23.s, z11.s, z5.s[2]\n" + "6:\n" + "st1w z16.s, p0, [%[c_ptr0]]\n" + "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n" + "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n" + "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n" + "addvl %[c_ptr0], %[c_ptr0], #4\n" + "st1w z20.s, p0, [c_ptr1]\n" + "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n" + "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n" + "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n" + ".unreq a_ptr1\n" + ".unreq c_ptr1\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb) + : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory" + ); + break; + case 3: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "c_ptr1 .req X2\n" + "c_ptr2 .req X3\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[leftovers]\n" + "whilelt p0.s, %[temp], 
%[width]\n" + "incw %[temp], all, mul #1\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ptrue p7.s\n" + "whilelt p1.s, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "whilelt p2.s, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "whilelt p3.s, %[temp], %[width]\n" + "cbz %[beta0], 1f\n" + "mov z16.s, #0\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "mov z17.s, #0\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "mov z18.s, #0\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "mov z19.s, #0\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "mov z20.s, #0\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "mov z21.s, #0\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "mov z22.s, #0\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "mov z23.s, #0\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "mov z24.s, #0\n" + "add a_ptr1, a_ptr1, #0x10\n" + "mov z25.s, #0\n" + "add a_ptr2, a_ptr2, #0x10\n" + "mov z26.s, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z27.s, #0\n" + "b 2f\n" + "1:\n" + "ld1rw z15.s, p7/z, [%[betaptr]]\n" + "ld1w z16.s, p0/z, [%[c_ptr0]]\n" + "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" + "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n" + "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n" + "ld1w z20.s, p0/z, [c_ptr1]\n" + "fmul z16.s, p7/m, z16.s, z15.s\n" + "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n" + "fmul z17.s, p7/m, z17.s, z15.s\n" + "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n" + "fmul z18.s, p7/m, z18.s, z15.s\n" + "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n" + "fmul z19.s, p7/m, z19.s, z15.s\n" + "ld1w z24.s, p0/z, [c_ptr2]\n" + "fmul z20.s, p7/m, z20.s, z15.s\n" + "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n" + "fmul z21.s, p7/m, z21.s, z15.s\n" + "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n" + "fmul z22.s, p7/m, z22.s, z15.s\n" + "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n" + "fmul z23.s, p7/m, z23.s, z15.s\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "fmul z24.s, p7/m, z24.s, z15.s\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "fmul z25.s, p7/m, z25.s, 
z15.s\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "fmul z26.s, p7/m, z26.s, z15.s\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "fmul z27.s, p7/m, z27.s, z15.s\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "add a_ptr1, a_ptr1, #0x10\n" + "add a_ptr2, a_ptr2, #0x10\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "2:\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "cbz %[loops], 3f\n" + "4:\n" + "fmla z16.s, z8.s, z0.s[0]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z20.s, z8.s, z1.s[0]\n" + "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" + "fmla z24.s, z8.s, z2.s[0]\n" + "ld1rqw z5.s, p7/z, [a_ptr1]\n" + "fmla z17.s, z9.s, z0.s[0]\n" + "ld1rqw z6.s, p7/z, [a_ptr2]\n" + "fmla z21.s, z9.s, z1.s[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z25.s, z9.s, z2.s[0]\n" + "add %[a_ptr0], %[a_ptr0], #0x20\n" + "fmla z18.s, z10.s, z0.s[0]\n" + "add a_ptr1, a_ptr1, #0x20\n" + "fmla z22.s, z10.s, z1.s[0]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "fmla z26.s, z10.s, z2.s[0]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z19.s, z11.s, z0.s[0]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z23.s, z11.s, z1.s[0]\n" + "add a_ptr2, a_ptr2, #0x20\n" + "fmla z27.s, z11.s, z2.s[0]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z12.s, z0.s[1]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z20.s, z12.s, z1.s[1]\n" + "subs %[loops], %[loops], #0x1\n" + "fmla z24.s, z12.s, z2.s[1]\n" + "fmla z17.s, z13.s, z0.s[1]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "fmla z21.s, z13.s, z1.s[1]\n" + "fmla z25.s, z13.s, z2.s[1]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z18.s, z14.s, z0.s[1]\n" + "fmla z22.s, z14.s, z1.s[1]\n" + "fmla z26.s, z14.s, z2.s[1]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.s, z15.s, 
z0.s[1]\n" + "fmla z23.s, z15.s, z1.s[1]\n" + "fmla z27.s, z15.s, z2.s[1]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z8.s, z0.s[2]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z20.s, z8.s, z1.s[2]\n" + "fmla z24.s, z8.s, z2.s[2]\n" + "fmla z17.s, z9.s, z0.s[2]\n" + "fmla z21.s, z9.s, z1.s[2]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "fmla z25.s, z9.s, z2.s[2]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z18.s, z10.s, z0.s[2]\n" + "fmla z22.s, z10.s, z1.s[2]\n" + "fmla z26.s, z10.s, z2.s[2]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.s, z11.s, z0.s[2]\n" + "fmla z23.s, z11.s, z1.s[2]\n" + "fmla z27.s, z11.s, z2.s[2]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z12.s, z0.s[3]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z20.s, z12.s, z1.s[3]\n" + "fmla z24.s, z12.s, z2.s[3]\n" + "fmla z17.s, z13.s, z0.s[3]\n" + "fmla z21.s, z13.s, z1.s[3]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "fmla z25.s, z13.s, z2.s[3]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z18.s, z14.s, z0.s[3]\n" + "fmla z22.s, z14.s, z1.s[3]\n" + "fmla z26.s, z14.s, z2.s[3]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.s, z15.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n" + "fmla z23.s, z15.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n" + "fmla z27.s, z15.s, z2.s[3]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z8.s, z4.s[0]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #-0x10]\n" + "fmla z20.s, z8.s, z5.s[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z24.s, z8.s, z6.s[0]\n" + "fmla z17.s, z9.s, z4.s[0]\n" + "fmla z21.s, z9.s, z5.s[0]\n" + "fmla z25.s, z9.s, z6.s[0]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "fmla z18.s, z10.s, z4.s[0]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z22.s, z10.s, z5.s[0]\n" + "fmla z26.s, z10.s, z6.s[0]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.s, z11.s, z4.s[0]\n" + "fmla z23.s, z11.s, 
z5.s[0]\n" + "fmla z27.s, z11.s, z6.s[0]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z12.s, z4.s[1]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z20.s, z12.s, z5.s[1]\n" + "fmla z24.s, z12.s, z6.s[1]\n" + "fmla z17.s, z13.s, z4.s[1]\n" + "fmla z21.s, z13.s, z5.s[1]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "fmla z25.s, z13.s, z6.s[1]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z18.s, z14.s, z4.s[1]\n" + "fmla z22.s, z14.s, z5.s[1]\n" + "fmla z26.s, z14.s, z6.s[1]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.s, z15.s, z4.s[1]\n" + "fmla z23.s, z15.s, z5.s[1]\n" + "fmla z27.s, z15.s, z6.s[1]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z8.s, z4.s[2]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z20.s, z8.s, z5.s[2]\n" + "fmla z24.s, z8.s, z6.s[2]\n" + "fmla z17.s, z9.s, z4.s[2]\n" + "fmla z21.s, z9.s, z5.s[2]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "fmla z25.s, z9.s, z6.s[2]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z18.s, z10.s, z4.s[2]\n" + "fmla z22.s, z10.s, z5.s[2]\n" + "fmla z26.s, z10.s, z6.s[2]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.s, z11.s, z4.s[2]\n" + "fmla z23.s, z11.s, z5.s[2]\n" + "fmla z27.s, z11.s, z6.s[2]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z12.s, z4.s[3]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z20.s, z12.s, z5.s[3]\n" + "fmla z24.s, z12.s, z6.s[3]\n" + "fmla z17.s, z13.s, z4.s[3]\n" + "fmla z21.s, z13.s, z5.s[3]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "fmla z25.s, z13.s, z6.s[3]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z18.s, z14.s, z4.s[3]\n" + "fmla z22.s, z14.s, z5.s[3]\n" + "fmla z26.s, z14.s, z6.s[3]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.s, z15.s, z4.s[3]\n" + "fmla z23.s, z15.s, z5.s[3]\n" + "fmla z27.s, z15.s, z6.s[3]\n" + "b.ne 4b\n" + "3:\n" + "cbz %[regs], 5f\n" + "fmla z16.s, z8.s, z0.s[0]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL 
VL]\n" + "fmla z20.s, z8.s, z1.s[0]\n" + "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" + "fmla z24.s, z8.s, z2.s[0]\n" + "ld1rqw z5.s, p7/z, [a_ptr1]\n" + "fmla z17.s, z9.s, z0.s[0]\n" + "ld1rqw z6.s, p7/z, [a_ptr2]\n" + "fmla z21.s, z9.s, z1.s[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z25.s, z9.s, z2.s[0]\n" + "fmla z18.s, z10.s, z0.s[0]\n" + "fmla z22.s, z10.s, z1.s[0]\n" + "fmla z26.s, z10.s, z2.s[0]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "fmla z19.s, z11.s, z0.s[0]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z23.s, z11.s, z1.s[0]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z27.s, z11.s, z2.s[0]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z12.s, z0.s[1]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z20.s, z12.s, z1.s[1]\n" + "fmla z24.s, z12.s, z2.s[1]\n" + "fmla z17.s, z13.s, z0.s[1]\n" + "fmla z21.s, z13.s, z1.s[1]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "fmla z25.s, z13.s, z2.s[1]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z18.s, z14.s, z0.s[1]\n" + "fmla z22.s, z14.s, z1.s[1]\n" + "fmla z26.s, z14.s, z2.s[1]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.s, z15.s, z0.s[1]\n" + "fmla z23.s, z15.s, z1.s[1]\n" + "fmla z27.s, z15.s, z2.s[1]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z8.s, z0.s[2]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z20.s, z8.s, z1.s[2]\n" + "fmla z24.s, z8.s, z2.s[2]\n" + "fmla z17.s, z9.s, z0.s[2]\n" + "fmla z21.s, z9.s, z1.s[2]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "fmla z25.s, z9.s, z2.s[2]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z18.s, z10.s, z0.s[2]\n" + "fmla z22.s, z10.s, z1.s[2]\n" + "fmla z26.s, z10.s, z2.s[2]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.s, z11.s, z0.s[2]\n" + "fmla z23.s, z11.s, z1.s[2]\n" + "fmla z27.s, z11.s, z2.s[2]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z12.s, z0.s[3]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z20.s, 
z12.s, z1.s[3]\n" + "fmla z24.s, z12.s, z2.s[3]\n" + "fmla z17.s, z13.s, z0.s[3]\n" + "fmla z21.s, z13.s, z1.s[3]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "fmla z25.s, z13.s, z2.s[3]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z18.s, z14.s, z0.s[3]\n" + "fmla z22.s, z14.s, z1.s[3]\n" + "fmla z26.s, z14.s, z2.s[3]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.s, z15.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" + "fmla z23.s, z15.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n" + "fmla z27.s, z15.s, z2.s[3]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z8.s, z4.s[0]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n" + "fmla z20.s, z8.s, z5.s[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z24.s, z8.s, z6.s[0]\n" + "fmla z17.s, z9.s, z4.s[0]\n" + "fmla z21.s, z9.s, z5.s[0]\n" + "fmla z25.s, z9.s, z6.s[0]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "fmla z18.s, z10.s, z4.s[0]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z22.s, z10.s, z5.s[0]\n" + "fmla z26.s, z10.s, z6.s[0]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.s, z11.s, z4.s[0]\n" + "fmla z23.s, z11.s, z5.s[0]\n" + "fmla z27.s, z11.s, z6.s[0]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z12.s, z4.s[1]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z20.s, z12.s, z5.s[1]\n" + "fmla z24.s, z12.s, z6.s[1]\n" + "fmla z17.s, z13.s, z4.s[1]\n" + "fmla z21.s, z13.s, z5.s[1]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "fmla z25.s, z13.s, z6.s[1]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z18.s, z14.s, z4.s[1]\n" + "fmla z22.s, z14.s, z5.s[1]\n" + "fmla z26.s, z14.s, z6.s[1]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.s, z15.s, z4.s[1]\n" + "fmla z23.s, z15.s, z5.s[1]\n" + "fmla z27.s, z15.s, z6.s[1]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z8.s, z4.s[2]\n" + "fmla z20.s, z8.s, z5.s[2]\n" + "fmla z24.s, z8.s, z6.s[2]\n" + "fmla z17.s, z9.s, 
z4.s[2]\n" + "fmla z21.s, z9.s, z5.s[2]\n" + "fmla z25.s, z9.s, z6.s[2]\n" + "fmla z18.s, z10.s, z4.s[2]\n" + "fmla z22.s, z10.s, z5.s[2]\n" + "fmla z26.s, z10.s, z6.s[2]\n" + "fmla z19.s, z11.s, z4.s[2]\n" + "fmla z23.s, z11.s, z5.s[2]\n" + "fmla z27.s, z11.s, z6.s[2]\n" + "fmla z16.s, z12.s, z4.s[3]\n" + "fmla z20.s, z12.s, z5.s[3]\n" + "fmla z24.s, z12.s, z6.s[3]\n" + "fmla z17.s, z13.s, z4.s[3]\n" + "fmla z21.s, z13.s, z5.s[3]\n" + "fmla z25.s, z13.s, z6.s[3]\n" + "fmla z18.s, z14.s, z4.s[3]\n" + "fmla z22.s, z14.s, z5.s[3]\n" + "fmla z26.s, z14.s, z6.s[3]\n" + "fmla z19.s, z15.s, z4.s[3]\n" + "fmla z23.s, z15.s, z5.s[3]\n" + "fmla z27.s, z15.s, z6.s[3]\n" + "cbz %[blocks], 6f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z8.s, z0.s[0]\n" + "fmla z20.s, z8.s, z1.s[0]\n" + "fmla z24.s, z8.s, z2.s[0]\n" + "fmla z17.s, z9.s, z0.s[0]\n" + "fmla z21.s, z9.s, z1.s[0]\n" + "fmla z25.s, z9.s, z2.s[0]\n" + "fmla z18.s, z10.s, z0.s[0]\n" + "fmla z22.s, z10.s, z1.s[0]\n" + "fmla z26.s, z10.s, z2.s[0]\n" + "fmla z19.s, z11.s, z0.s[0]\n" + "fmla z23.s, z11.s, z1.s[0]\n" + "fmla z27.s, z11.s, z2.s[0]\n" + "b.eq 6f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z12.s, z0.s[1]\n" + "fmla z20.s, z12.s, z1.s[1]\n" + "fmla z24.s, z12.s, z2.s[1]\n" + "fmla z17.s, z13.s, z0.s[1]\n" + "fmla z21.s, z13.s, z1.s[1]\n" + "fmla z25.s, z13.s, z2.s[1]\n" + "fmla z18.s, z14.s, z0.s[1]\n" + "fmla z22.s, z14.s, z1.s[1]\n" + "fmla z26.s, z14.s, z2.s[1]\n" + "fmla z19.s, z15.s, z0.s[1]\n" + "fmla z23.s, z15.s, z1.s[1]\n" + "fmla z27.s, z15.s, 
z2.s[1]\n" + "b.eq 6f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z8.s, z0.s[2]\n" + "fmla z20.s, z8.s, z1.s[2]\n" + "fmla z24.s, z8.s, z2.s[2]\n" + "fmla z17.s, z9.s, z0.s[2]\n" + "fmla z21.s, z9.s, z1.s[2]\n" + "fmla z25.s, z9.s, z2.s[2]\n" + "fmla z18.s, z10.s, z0.s[2]\n" + "fmla z22.s, z10.s, z1.s[2]\n" + "fmla z26.s, z10.s, z2.s[2]\n" + "fmla z19.s, z11.s, z0.s[2]\n" + "fmla z23.s, z11.s, z1.s[2]\n" + "fmla z27.s, z11.s, z2.s[2]\n" + "b 6f\n" + "5:\n" + "fmla z16.s, z8.s, z0.s[0]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z20.s, z8.s, z1.s[0]\n" + "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n" + "fmla z24.s, z8.s, z2.s[0]\n" + "ld1rqw z5.s, p6/z, [a_ptr1]\n" + "fmla z17.s, z9.s, z0.s[0]\n" + "ld1rqw z6.s, p6/z, [a_ptr2]\n" + "fmla z21.s, z9.s, z1.s[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z25.s, z9.s, z2.s[0]\n" + "fmla z18.s, z10.s, z0.s[0]\n" + "fmla z22.s, z10.s, z1.s[0]\n" + "fmla z26.s, z10.s, z2.s[0]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "fmla z19.s, z11.s, z0.s[0]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z23.s, z11.s, z1.s[0]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z27.s, z11.s, z2.s[0]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z12.s, z0.s[1]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z20.s, z12.s, z1.s[1]\n" + "fmla z24.s, z12.s, z2.s[1]\n" + "fmla z17.s, z13.s, z0.s[1]\n" + "fmla z21.s, z13.s, z1.s[1]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "fmla z25.s, z13.s, z2.s[1]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z18.s, z14.s, z0.s[1]\n" + "fmla z22.s, z14.s, z1.s[1]\n" + "fmla z26.s, z14.s, z2.s[1]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.s, z15.s, z0.s[1]\n" + "fmla z23.s, z15.s, z1.s[1]\n" + "fmla z27.s, z15.s, z2.s[1]\n" + "ld1w z15.s, p3/z, 
[%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z8.s, z0.s[2]\n" + "fmla z20.s, z8.s, z1.s[2]\n" + "fmla z24.s, z8.s, z2.s[2]\n" + "fmla z17.s, z9.s, z0.s[2]\n" + "fmla z21.s, z9.s, z1.s[2]\n" + "fmla z25.s, z9.s, z2.s[2]\n" + "fmla z18.s, z10.s, z0.s[2]\n" + "fmla z22.s, z10.s, z1.s[2]\n" + "fmla z26.s, z10.s, z2.s[2]\n" + "fmla z19.s, z11.s, z0.s[2]\n" + "fmla z23.s, z11.s, z1.s[2]\n" + "fmla z27.s, z11.s, z2.s[2]\n" + "fmla z16.s, z12.s, z0.s[3]\n" + "fmla z20.s, z12.s, z1.s[3]\n" + "fmla z24.s, z12.s, z2.s[3]\n" + "fmla z17.s, z13.s, z0.s[3]\n" + "fmla z21.s, z13.s, z1.s[3]\n" + "fmla z25.s, z13.s, z2.s[3]\n" + "fmla z18.s, z14.s, z0.s[3]\n" + "fmla z22.s, z14.s, z1.s[3]\n" + "fmla z26.s, z14.s, z2.s[3]\n" + "fmla z19.s, z15.s, z0.s[3]\n" + "fmla z23.s, z15.s, z1.s[3]\n" + "fmla z27.s, z15.s, z2.s[3]\n" + "cbz %[blocks], 6f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z8.s, z4.s[0]\n" + "fmla z20.s, z8.s, z5.s[0]\n" + "fmla z24.s, z8.s, z6.s[0]\n" + "fmla z17.s, z9.s, z4.s[0]\n" + "fmla z21.s, z9.s, z5.s[0]\n" + "fmla z25.s, z9.s, z6.s[0]\n" + "fmla z18.s, z10.s, z4.s[0]\n" + "fmla z22.s, z10.s, z5.s[0]\n" + "fmla z26.s, z10.s, z6.s[0]\n" + "fmla z19.s, z11.s, z4.s[0]\n" + "fmla z23.s, z11.s, z5.s[0]\n" + "fmla z27.s, z11.s, z6.s[0]\n" + "b.eq 6f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z12.s, z4.s[1]\n" + "fmla z20.s, z12.s, z5.s[1]\n" + "fmla z24.s, z12.s, z6.s[1]\n" + "fmla z17.s, z13.s, z4.s[1]\n" + "fmla z21.s, z13.s, z5.s[1]\n" + "fmla z25.s, z13.s, z6.s[1]\n" + "fmla z18.s, z14.s, z4.s[1]\n" + "fmla z22.s, 
z14.s, z5.s[1]\n" + "fmla z26.s, z14.s, z6.s[1]\n" + "fmla z19.s, z15.s, z4.s[1]\n" + "fmla z23.s, z15.s, z5.s[1]\n" + "fmla z27.s, z15.s, z6.s[1]\n" + "b.eq 6f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z8.s, z4.s[2]\n" + "fmla z20.s, z8.s, z5.s[2]\n" + "fmla z24.s, z8.s, z6.s[2]\n" + "fmla z17.s, z9.s, z4.s[2]\n" + "fmla z21.s, z9.s, z5.s[2]\n" + "fmla z25.s, z9.s, z6.s[2]\n" + "fmla z18.s, z10.s, z4.s[2]\n" + "fmla z22.s, z10.s, z5.s[2]\n" + "fmla z26.s, z10.s, z6.s[2]\n" + "fmla z19.s, z11.s, z4.s[2]\n" + "fmla z23.s, z11.s, z5.s[2]\n" + "fmla z27.s, z11.s, z6.s[2]\n" + "6:\n" + "st1w z16.s, p0, [%[c_ptr0]]\n" + "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n" + "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n" + "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n" + "addvl %[c_ptr0], %[c_ptr0], #4\n" + "st1w z20.s, p0, [c_ptr1]\n" + "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n" + "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n" + "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n" + "st1w z24.s, p0, [c_ptr2]\n" + "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n" + "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n" + "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb) + : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory" + ); + break; + default: + case 4: + 
__asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[leftovers]\n" + "whilelt p0.s, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ptrue p7.s\n" + "whilelt p1.s, %[temp], %[width]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "incw %[temp], all, mul #1\n" + "whilelt p2.s, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "whilelt p3.s, %[temp], %[width]\n" + "cbz %[beta0], 1f\n" + "mov z16.s, #0\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "mov z17.s, #0\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "mov z18.s, #0\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "mov z19.s, #0\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "mov z20.s, #0\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "mov z21.s, #0\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "mov z22.s, #0\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "mov z23.s, #0\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "mov z24.s, #0\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "mov z25.s, #0\n" + "add a_ptr1, a_ptr1, #0x10\n" + "mov z26.s, #0\n" + "add a_ptr2, a_ptr2, #0x10\n" + "mov z27.s, #0\n" + "add a_ptr3, a_ptr3, #0x10\n" + "mov z28.s, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 2f\n" + "1:\n" + "ld1rw z15.s, p7/z, [%[betaptr]]\n" + "ld1w z16.s, p0/z, [%[c_ptr0]]\n" + "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" + "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n" + "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n" + "ld1w z20.s, p0/z, [c_ptr1]\n" + "fmul z16.s, p7/m, z16.s, z15.s\n" + "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n" + "fmul z17.s, p7/m, z17.s, z15.s\n" + "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n" + "fmul z18.s, p7/m, z18.s, z15.s\n" + "ld1w z23.s, p3/z, [c_ptr1, #3, MUL 
VL]\n" + "fmul z19.s, p7/m, z19.s, z15.s\n" + "ld1w z24.s, p0/z, [c_ptr2]\n" + "fmul z20.s, p7/m, z20.s, z15.s\n" + "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n" + "fmul z21.s, p7/m, z21.s, z15.s\n" + "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n" + "fmul z22.s, p7/m, z22.s, z15.s\n" + "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n" + "fmul z23.s, p7/m, z23.s, z15.s\n" + "ld1w z28.s, p0/z, [c_ptr3]\n" + "fmul z24.s, p7/m, z24.s, z15.s\n" + "ld1w z29.s, p1/z, [c_ptr3, #1, MUL VL]\n" + "fmul z25.s, p7/m, z25.s, z15.s\n" + "ld1w z30.s, p2/z, [c_ptr3, #2, MUL VL]\n" + "fmul z26.s, p7/m, z26.s, z15.s\n" + "ld1w z31.s, p3/z, [c_ptr3, #3, MUL VL]\n" + "fmul z27.s, p7/m, z27.s, z15.s\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "fmul z28.s, p7/m, z28.s, z15.s\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "fmul z29.s, p7/m, z29.s, z15.s\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "fmul z30.s, p7/m, z30.s, z15.s\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmul z31.s, p7/m, z31.s, z15.s\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "add a_ptr1, a_ptr1, #0x10\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "add a_ptr2, a_ptr2, #0x10\n" + "add a_ptr3, a_ptr3, #0x10\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "2:\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "cbz %[loops], 3f\n" + "4:\n" + "fmla z16.s, z8.s, z0.s[0]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z20.s, z8.s, z1.s[0]\n" + "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" + "fmla z24.s, z8.s, z2.s[0]\n" + "ld1rqw z5.s, p7/z, [a_ptr1]\n" + "fmla z28.s, z8.s, z3.s[0]\n" + "ld1rqw z6.s, p7/z, [a_ptr2]\n" + "fmla z17.s, z9.s, z0.s[0]\n" + "ld1rqw z7.s, p7/z, [a_ptr3]\n" + "fmla z21.s, z9.s, z1.s[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z25.s, z9.s, z2.s[0]\n" + "add %[a_ptr0], %[a_ptr0], #0x20\n" + "fmla z29.s, z9.s, 
z3.s[0]\n" + "add a_ptr1, a_ptr1, #0x20\n" + "fmla z18.s, z10.s, z0.s[0]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "fmla z22.s, z10.s, z1.s[0]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z26.s, z10.s, z2.s[0]\n" + "add a_ptr2, a_ptr2, #0x20\n" + "fmla z30.s, z10.s, z3.s[0]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.s, z11.s, z0.s[0]\n" + "add a_ptr3, a_ptr3, #0x20\n" + "fmla z23.s, z11.s, z1.s[0]\n" + "subs %[loops], %[loops], #0x1\n" + "fmla z27.s, z11.s, z2.s[0]\n" + "fmla z31.s, z11.s, z3.s[0]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z12.s, z0.s[1]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z20.s, z12.s, z1.s[1]\n" + "fmla z24.s, z12.s, z2.s[1]\n" + "fmla z28.s, z12.s, z3.s[1]\n" + "fmla z17.s, z13.s, z0.s[1]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "fmla z21.s, z13.s, z1.s[1]\n" + "fmla z25.s, z13.s, z2.s[1]\n" + "fmla z29.s, z13.s, z3.s[1]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z18.s, z14.s, z0.s[1]\n" + "fmla z22.s, z14.s, z1.s[1]\n" + "fmla z26.s, z14.s, z2.s[1]\n" + "fmla z30.s, z14.s, z3.s[1]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.s, z15.s, z0.s[1]\n" + "fmla z23.s, z15.s, z1.s[1]\n" + "fmla z27.s, z15.s, z2.s[1]\n" + "fmla z31.s, z15.s, z3.s[1]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z8.s, z0.s[2]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z20.s, z8.s, z1.s[2]\n" + "fmla z24.s, z8.s, z2.s[2]\n" + "fmla z28.s, z8.s, z3.s[2]\n" + "fmla z17.s, z9.s, z0.s[2]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "fmla z21.s, z9.s, z1.s[2]\n" + "fmla z25.s, z9.s, z2.s[2]\n" + "fmla z29.s, z9.s, z3.s[2]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z18.s, z10.s, z0.s[2]\n" + "fmla z22.s, z10.s, z1.s[2]\n" + "fmla z26.s, z10.s, z2.s[2]\n" + "fmla z30.s, z10.s, z3.s[2]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.s, z11.s, z0.s[2]\n" + "fmla z23.s, z11.s, z1.s[2]\n" + "fmla z27.s, z11.s, z2.s[2]\n" + "fmla 
z31.s, z11.s, z3.s[2]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z12.s, z0.s[3]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z20.s, z12.s, z1.s[3]\n" + "fmla z24.s, z12.s, z2.s[3]\n" + "fmla z28.s, z12.s, z3.s[3]\n" + "fmla z17.s, z13.s, z0.s[3]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "fmla z21.s, z13.s, z1.s[3]\n" + "fmla z25.s, z13.s, z2.s[3]\n" + "fmla z29.s, z13.s, z3.s[3]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z18.s, z14.s, z0.s[3]\n" + "fmla z22.s, z14.s, z1.s[3]\n" + "fmla z26.s, z14.s, z2.s[3]\n" + "fmla z30.s, z14.s, z3.s[3]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.s, z15.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n" + "fmla z23.s, z15.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n" + "fmla z27.s, z15.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #-0x10]\n" + "fmla z31.s, z15.s, z3.s[3]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z8.s, z4.s[0]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #-0x10]\n" + "fmla z20.s, z8.s, z5.s[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z24.s, z8.s, z6.s[0]\n" + "fmla z28.s, z8.s, z7.s[0]\n" + "fmla z17.s, z9.s, z4.s[0]\n" + "fmla z21.s, z9.s, z5.s[0]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "fmla z25.s, z9.s, z6.s[0]\n" + "fmla z29.s, z9.s, z7.s[0]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z18.s, z10.s, z4.s[0]\n" + "fmla z22.s, z10.s, z5.s[0]\n" + "fmla z26.s, z10.s, z6.s[0]\n" + "fmla z30.s, z10.s, z7.s[0]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.s, z11.s, z4.s[0]\n" + "fmla z23.s, z11.s, z5.s[0]\n" + "fmla z27.s, z11.s, z6.s[0]\n" + "fmla z31.s, z11.s, z7.s[0]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z12.s, z4.s[1]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z20.s, z12.s, z5.s[1]\n" + "fmla z24.s, z12.s, z6.s[1]\n" + "fmla z28.s, z12.s, z7.s[1]\n" + "fmla z17.s, z13.s, z4.s[1]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "fmla z21.s, z13.s, 
z5.s[1]\n" + "fmla z25.s, z13.s, z6.s[1]\n" + "fmla z29.s, z13.s, z7.s[1]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z18.s, z14.s, z4.s[1]\n" + "fmla z22.s, z14.s, z5.s[1]\n" + "fmla z26.s, z14.s, z6.s[1]\n" + "fmla z30.s, z14.s, z7.s[1]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.s, z15.s, z4.s[1]\n" + "fmla z23.s, z15.s, z5.s[1]\n" + "fmla z27.s, z15.s, z6.s[1]\n" + "fmla z31.s, z15.s, z7.s[1]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z8.s, z4.s[2]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z20.s, z8.s, z5.s[2]\n" + "fmla z24.s, z8.s, z6.s[2]\n" + "fmla z28.s, z8.s, z7.s[2]\n" + "fmla z17.s, z9.s, z4.s[2]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "fmla z21.s, z9.s, z5.s[2]\n" + "fmla z25.s, z9.s, z6.s[2]\n" + "fmla z29.s, z9.s, z7.s[2]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z18.s, z10.s, z4.s[2]\n" + "fmla z22.s, z10.s, z5.s[2]\n" + "fmla z26.s, z10.s, z6.s[2]\n" + "fmla z30.s, z10.s, z7.s[2]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.s, z11.s, z4.s[2]\n" + "fmla z23.s, z11.s, z5.s[2]\n" + "fmla z27.s, z11.s, z6.s[2]\n" + "fmla z31.s, z11.s, z7.s[2]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z12.s, z4.s[3]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z20.s, z12.s, z5.s[3]\n" + "fmla z24.s, z12.s, z6.s[3]\n" + "fmla z28.s, z12.s, z7.s[3]\n" + "fmla z17.s, z13.s, z4.s[3]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "fmla z21.s, z13.s, z5.s[3]\n" + "fmla z25.s, z13.s, z6.s[3]\n" + "fmla z29.s, z13.s, z7.s[3]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z18.s, z14.s, z4.s[3]\n" + "fmla z22.s, z14.s, z5.s[3]\n" + "fmla z26.s, z14.s, z6.s[3]\n" + "fmla z30.s, z14.s, z7.s[3]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.s, z15.s, z4.s[3]\n" + "fmla z23.s, z15.s, z5.s[3]\n" + "fmla z27.s, z15.s, z6.s[3]\n" + "fmla z31.s, z15.s, z7.s[3]\n" + "b.ne 4b\n" + "3:\n" + "cbz %[regs], 5f\n" + "fmla z16.s, z8.s, z0.s[0]\n" + 
"ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z20.s, z8.s, z1.s[0]\n" + "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" + "fmla z24.s, z8.s, z2.s[0]\n" + "ld1rqw z5.s, p7/z, [a_ptr1]\n" + "fmla z28.s, z8.s, z3.s[0]\n" + "ld1rqw z6.s, p7/z, [a_ptr2]\n" + "fmla z17.s, z9.s, z0.s[0]\n" + "ld1rqw z7.s, p7/z, [a_ptr3]\n" + "fmla z21.s, z9.s, z1.s[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z25.s, z9.s, z2.s[0]\n" + "fmla z29.s, z9.s, z3.s[0]\n" + "fmla z18.s, z10.s, z0.s[0]\n" + "fmla z22.s, z10.s, z1.s[0]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "fmla z26.s, z10.s, z2.s[0]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z30.s, z10.s, z3.s[0]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.s, z11.s, z0.s[0]\n" + "fmla z23.s, z11.s, z1.s[0]\n" + "fmla z27.s, z11.s, z2.s[0]\n" + "fmla z31.s, z11.s, z3.s[0]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z12.s, z0.s[1]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z20.s, z12.s, z1.s[1]\n" + "fmla z24.s, z12.s, z2.s[1]\n" + "fmla z28.s, z12.s, z3.s[1]\n" + "fmla z17.s, z13.s, z0.s[1]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "fmla z21.s, z13.s, z1.s[1]\n" + "fmla z25.s, z13.s, z2.s[1]\n" + "fmla z29.s, z13.s, z3.s[1]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z18.s, z14.s, z0.s[1]\n" + "fmla z22.s, z14.s, z1.s[1]\n" + "fmla z26.s, z14.s, z2.s[1]\n" + "fmla z30.s, z14.s, z3.s[1]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.s, z15.s, z0.s[1]\n" + "fmla z23.s, z15.s, z1.s[1]\n" + "fmla z27.s, z15.s, z2.s[1]\n" + "fmla z31.s, z15.s, z3.s[1]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z8.s, z0.s[2]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z20.s, z8.s, z1.s[2]\n" + "fmla z24.s, z8.s, z2.s[2]\n" + "fmla z28.s, z8.s, z3.s[2]\n" + "fmla z17.s, z9.s, z0.s[2]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "fmla z21.s, z9.s, z1.s[2]\n" + "fmla z25.s, z9.s, z2.s[2]\n" + "fmla z29.s, z9.s, z3.s[2]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], 
#1, MUL VL]\n" + "fmla z18.s, z10.s, z0.s[2]\n" + "fmla z22.s, z10.s, z1.s[2]\n" + "fmla z26.s, z10.s, z2.s[2]\n" + "fmla z30.s, z10.s, z3.s[2]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.s, z11.s, z0.s[2]\n" + "fmla z23.s, z11.s, z1.s[2]\n" + "fmla z27.s, z11.s, z2.s[2]\n" + "fmla z31.s, z11.s, z3.s[2]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z12.s, z0.s[3]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z20.s, z12.s, z1.s[3]\n" + "fmla z24.s, z12.s, z2.s[3]\n" + "fmla z28.s, z12.s, z3.s[3]\n" + "fmla z17.s, z13.s, z0.s[3]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "fmla z21.s, z13.s, z1.s[3]\n" + "fmla z25.s, z13.s, z2.s[3]\n" + "fmla z29.s, z13.s, z3.s[3]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z18.s, z14.s, z0.s[3]\n" + "fmla z22.s, z14.s, z1.s[3]\n" + "fmla z26.s, z14.s, z2.s[3]\n" + "fmla z30.s, z14.s, z3.s[3]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.s, z15.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" + "fmla z23.s, z15.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n" + "fmla z27.s, z15.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n" + "fmla z31.s, z15.s, z3.s[3]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z8.s, z4.s[0]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n" + "fmla z20.s, z8.s, z5.s[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z24.s, z8.s, z6.s[0]\n" + "fmla z28.s, z8.s, z7.s[0]\n" + "fmla z17.s, z9.s, z4.s[0]\n" + "fmla z21.s, z9.s, z5.s[0]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "fmla z25.s, z9.s, z6.s[0]\n" + "fmla z29.s, z9.s, z7.s[0]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z18.s, z10.s, z4.s[0]\n" + "fmla z22.s, z10.s, z5.s[0]\n" + "fmla z26.s, z10.s, z6.s[0]\n" + "fmla z30.s, z10.s, z7.s[0]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.s, z11.s, z4.s[0]\n" + "fmla z23.s, z11.s, z5.s[0]\n" + "fmla z27.s, z11.s, z6.s[0]\n" + "fmla z31.s, z11.s, z7.s[0]\n" + "ld1w z11.s, p3/z, 
[%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z12.s, z4.s[1]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z20.s, z12.s, z5.s[1]\n" + "fmla z24.s, z12.s, z6.s[1]\n" + "fmla z28.s, z12.s, z7.s[1]\n" + "fmla z17.s, z13.s, z4.s[1]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "fmla z21.s, z13.s, z5.s[1]\n" + "fmla z25.s, z13.s, z6.s[1]\n" + "fmla z29.s, z13.s, z7.s[1]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z18.s, z14.s, z4.s[1]\n" + "fmla z22.s, z14.s, z5.s[1]\n" + "fmla z26.s, z14.s, z6.s[1]\n" + "fmla z30.s, z14.s, z7.s[1]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.s, z15.s, z4.s[1]\n" + "fmla z23.s, z15.s, z5.s[1]\n" + "fmla z27.s, z15.s, z6.s[1]\n" + "fmla z31.s, z15.s, z7.s[1]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z8.s, z4.s[2]\n" + "fmla z20.s, z8.s, z5.s[2]\n" + "fmla z24.s, z8.s, z6.s[2]\n" + "fmla z28.s, z8.s, z7.s[2]\n" + "fmla z17.s, z9.s, z4.s[2]\n" + "fmla z21.s, z9.s, z5.s[2]\n" + "fmla z25.s, z9.s, z6.s[2]\n" + "fmla z29.s, z9.s, z7.s[2]\n" + "fmla z18.s, z10.s, z4.s[2]\n" + "fmla z22.s, z10.s, z5.s[2]\n" + "fmla z26.s, z10.s, z6.s[2]\n" + "fmla z30.s, z10.s, z7.s[2]\n" + "fmla z19.s, z11.s, z4.s[2]\n" + "fmla z23.s, z11.s, z5.s[2]\n" + "fmla z27.s, z11.s, z6.s[2]\n" + "fmla z31.s, z11.s, z7.s[2]\n" + "fmla z16.s, z12.s, z4.s[3]\n" + "fmla z20.s, z12.s, z5.s[3]\n" + "fmla z24.s, z12.s, z6.s[3]\n" + "fmla z28.s, z12.s, z7.s[3]\n" + "fmla z17.s, z13.s, z4.s[3]\n" + "fmla z21.s, z13.s, z5.s[3]\n" + "fmla z25.s, z13.s, z6.s[3]\n" + "fmla z29.s, z13.s, z7.s[3]\n" + "fmla z18.s, z14.s, z4.s[3]\n" + "fmla z22.s, z14.s, z5.s[3]\n" + "fmla z26.s, z14.s, z6.s[3]\n" + "fmla z30.s, z14.s, z7.s[3]\n" + "fmla z19.s, z15.s, z4.s[3]\n" + "fmla z23.s, z15.s, z5.s[3]\n" + "fmla z27.s, z15.s, z6.s[3]\n" + "fmla z31.s, z15.s, z7.s[3]\n" + "cbz %[blocks], 6f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL 
VL]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z8.s, z0.s[0]\n" + "fmla z20.s, z8.s, z1.s[0]\n" + "fmla z24.s, z8.s, z2.s[0]\n" + "fmla z28.s, z8.s, z3.s[0]\n" + "fmla z17.s, z9.s, z0.s[0]\n" + "fmla z21.s, z9.s, z1.s[0]\n" + "fmla z25.s, z9.s, z2.s[0]\n" + "fmla z29.s, z9.s, z3.s[0]\n" + "fmla z18.s, z10.s, z0.s[0]\n" + "fmla z22.s, z10.s, z1.s[0]\n" + "fmla z26.s, z10.s, z2.s[0]\n" + "fmla z30.s, z10.s, z3.s[0]\n" + "fmla z19.s, z11.s, z0.s[0]\n" + "fmla z23.s, z11.s, z1.s[0]\n" + "fmla z27.s, z11.s, z2.s[0]\n" + "fmla z31.s, z11.s, z3.s[0]\n" + "b.eq 6f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z12.s, z0.s[1]\n" + "fmla z20.s, z12.s, z1.s[1]\n" + "fmla z24.s, z12.s, z2.s[1]\n" + "fmla z28.s, z12.s, z3.s[1]\n" + "fmla z17.s, z13.s, z0.s[1]\n" + "fmla z21.s, z13.s, z1.s[1]\n" + "fmla z25.s, z13.s, z2.s[1]\n" + "fmla z29.s, z13.s, z3.s[1]\n" + "fmla z18.s, z14.s, z0.s[1]\n" + "fmla z22.s, z14.s, z1.s[1]\n" + "fmla z26.s, z14.s, z2.s[1]\n" + "fmla z30.s, z14.s, z3.s[1]\n" + "fmla z19.s, z15.s, z0.s[1]\n" + "fmla z23.s, z15.s, z1.s[1]\n" + "fmla z27.s, z15.s, z2.s[1]\n" + "fmla z31.s, z15.s, z3.s[1]\n" + "b.eq 6f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z8.s, z0.s[2]\n" + "fmla z20.s, z8.s, z1.s[2]\n" + "fmla z24.s, z8.s, z2.s[2]\n" + "fmla z28.s, z8.s, z3.s[2]\n" + "fmla z17.s, z9.s, z0.s[2]\n" + "fmla z21.s, z9.s, z1.s[2]\n" + "fmla z25.s, z9.s, z2.s[2]\n" + "fmla z29.s, z9.s, z3.s[2]\n" + "fmla z18.s, z10.s, z0.s[2]\n" + "fmla z22.s, z10.s, z1.s[2]\n" + "fmla z26.s, z10.s, 
z2.s[2]\n" + "fmla z30.s, z10.s, z3.s[2]\n" + "fmla z19.s, z11.s, z0.s[2]\n" + "fmla z23.s, z11.s, z1.s[2]\n" + "fmla z27.s, z11.s, z2.s[2]\n" + "fmla z31.s, z11.s, z3.s[2]\n" + "b 6f\n" + "5:\n" + "fmla z16.s, z8.s, z0.s[0]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z20.s, z8.s, z1.s[0]\n" + "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n" + "fmla z24.s, z8.s, z2.s[0]\n" + "ld1rqw z5.s, p6/z, [a_ptr1]\n" + "fmla z28.s, z8.s, z3.s[0]\n" + "ld1rqw z6.s, p6/z, [a_ptr2]\n" + "fmla z17.s, z9.s, z0.s[0]\n" + "ld1rqw z7.s, p6/z, [a_ptr3]\n" + "fmla z21.s, z9.s, z1.s[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z25.s, z9.s, z2.s[0]\n" + "fmla z29.s, z9.s, z3.s[0]\n" + "fmla z18.s, z10.s, z0.s[0]\n" + "fmla z22.s, z10.s, z1.s[0]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "fmla z26.s, z10.s, z2.s[0]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z30.s, z10.s, z3.s[0]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.s, z11.s, z0.s[0]\n" + "fmla z23.s, z11.s, z1.s[0]\n" + "fmla z27.s, z11.s, z2.s[0]\n" + "fmla z31.s, z11.s, z3.s[0]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z12.s, z0.s[1]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z20.s, z12.s, z1.s[1]\n" + "fmla z24.s, z12.s, z2.s[1]\n" + "fmla z28.s, z12.s, z3.s[1]\n" + "fmla z17.s, z13.s, z0.s[1]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "fmla z21.s, z13.s, z1.s[1]\n" + "fmla z25.s, z13.s, z2.s[1]\n" + "fmla z29.s, z13.s, z3.s[1]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z18.s, z14.s, z0.s[1]\n" + "fmla z22.s, z14.s, z1.s[1]\n" + "fmla z26.s, z14.s, z2.s[1]\n" + "fmla z30.s, z14.s, z3.s[1]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.s, z15.s, z0.s[1]\n" + "fmla z23.s, z15.s, z1.s[1]\n" + "fmla z27.s, z15.s, z2.s[1]\n" + "fmla z31.s, z15.s, z3.s[1]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z8.s, z0.s[2]\n" + "fmla z20.s, z8.s, z1.s[2]\n" + "fmla z24.s, z8.s, z2.s[2]\n" + "fmla z28.s, z8.s, z3.s[2]\n" + 
"fmla z17.s, z9.s, z0.s[2]\n" + "fmla z21.s, z9.s, z1.s[2]\n" + "fmla z25.s, z9.s, z2.s[2]\n" + "fmla z29.s, z9.s, z3.s[2]\n" + "fmla z18.s, z10.s, z0.s[2]\n" + "fmla z22.s, z10.s, z1.s[2]\n" + "fmla z26.s, z10.s, z2.s[2]\n" + "fmla z30.s, z10.s, z3.s[2]\n" + "fmla z19.s, z11.s, z0.s[2]\n" + "fmla z23.s, z11.s, z1.s[2]\n" + "fmla z27.s, z11.s, z2.s[2]\n" + "fmla z31.s, z11.s, z3.s[2]\n" + "fmla z16.s, z12.s, z0.s[3]\n" + "fmla z20.s, z12.s, z1.s[3]\n" + "fmla z24.s, z12.s, z2.s[3]\n" + "fmla z28.s, z12.s, z3.s[3]\n" + "fmla z17.s, z13.s, z0.s[3]\n" + "fmla z21.s, z13.s, z1.s[3]\n" + "fmla z25.s, z13.s, z2.s[3]\n" + "fmla z29.s, z13.s, z3.s[3]\n" + "fmla z18.s, z14.s, z0.s[3]\n" + "fmla z22.s, z14.s, z1.s[3]\n" + "fmla z26.s, z14.s, z2.s[3]\n" + "fmla z30.s, z14.s, z3.s[3]\n" + "fmla z19.s, z15.s, z0.s[3]\n" + "fmla z23.s, z15.s, z1.s[3]\n" + "fmla z27.s, z15.s, z2.s[3]\n" + "fmla z31.s, z15.s, z3.s[3]\n" + "cbz %[blocks], 6f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z8.s, z4.s[0]\n" + "fmla z20.s, z8.s, z5.s[0]\n" + "fmla z24.s, z8.s, z6.s[0]\n" + "fmla z28.s, z8.s, z7.s[0]\n" + "fmla z17.s, z9.s, z4.s[0]\n" + "fmla z21.s, z9.s, z5.s[0]\n" + "fmla z25.s, z9.s, z6.s[0]\n" + "fmla z29.s, z9.s, z7.s[0]\n" + "fmla z18.s, z10.s, z4.s[0]\n" + "fmla z22.s, z10.s, z5.s[0]\n" + "fmla z26.s, z10.s, z6.s[0]\n" + "fmla z30.s, z10.s, z7.s[0]\n" + "fmla z19.s, z11.s, z4.s[0]\n" + "fmla z23.s, z11.s, z5.s[0]\n" + "fmla z27.s, z11.s, z6.s[0]\n" + "fmla z31.s, z11.s, z7.s[0]\n" + "b.eq 6f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla 
z16.s, z12.s, z4.s[1]\n" + "fmla z20.s, z12.s, z5.s[1]\n" + "fmla z24.s, z12.s, z6.s[1]\n" + "fmla z28.s, z12.s, z7.s[1]\n" + "fmla z17.s, z13.s, z4.s[1]\n" + "fmla z21.s, z13.s, z5.s[1]\n" + "fmla z25.s, z13.s, z6.s[1]\n" + "fmla z29.s, z13.s, z7.s[1]\n" + "fmla z18.s, z14.s, z4.s[1]\n" + "fmla z22.s, z14.s, z5.s[1]\n" + "fmla z26.s, z14.s, z6.s[1]\n" + "fmla z30.s, z14.s, z7.s[1]\n" + "fmla z19.s, z15.s, z4.s[1]\n" + "fmla z23.s, z15.s, z5.s[1]\n" + "fmla z27.s, z15.s, z6.s[1]\n" + "fmla z31.s, z15.s, z7.s[1]\n" + "b.eq 6f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z8.s, z4.s[2]\n" + "fmla z20.s, z8.s, z5.s[2]\n" + "fmla z24.s, z8.s, z6.s[2]\n" + "fmla z28.s, z8.s, z7.s[2]\n" + "fmla z17.s, z9.s, z4.s[2]\n" + "fmla z21.s, z9.s, z5.s[2]\n" + "fmla z25.s, z9.s, z6.s[2]\n" + "fmla z29.s, z9.s, z7.s[2]\n" + "fmla z18.s, z10.s, z4.s[2]\n" + "fmla z22.s, z10.s, z5.s[2]\n" + "fmla z26.s, z10.s, z6.s[2]\n" + "fmla z30.s, z10.s, z7.s[2]\n" + "fmla z19.s, z11.s, z4.s[2]\n" + "fmla z23.s, z11.s, z5.s[2]\n" + "fmla z27.s, z11.s, z6.s[2]\n" + "fmla z31.s, z11.s, z7.s[2]\n" + "6:\n" + "st1w z16.s, p0, [%[c_ptr0]]\n" + "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n" + "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n" + "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n" + "addvl %[c_ptr0], %[c_ptr0], #4\n" + "st1w z20.s, p0, [c_ptr1]\n" + "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n" + "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n" + "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n" + "st1w z24.s, p0, [c_ptr2]\n" + "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n" + "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n" + "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n" + "st1w z28.s, p0, [c_ptr3]\n" + "st1w z29.s, p1, [c_ptr3, #1, MUL VL]\n" + "st1w z30.s, p2, [c_ptr3, #2, MUL VL]\n" + "st1w z31.s, p3, [c_ptr3, #3, MUL VL]\n" + ".unreq a_ptr1\n" + ".unreq 
a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb) + : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory" + ); + break; + } + } + } +} + +} // namespace arm_gemm + +#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4.hpp new file mode 100644 index 0000000000..f5634e3618 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4.hpp @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +#ifdef __ARM_FEATURE_SVE + +#include + +namespace arm_gemm +{ + +// Actual kernel implementations +void sve_native_s8s32_dot_4VLx4(const int8_t *, int, const int8_t *, int ldb, int32_t *, int, int32_t, int, int, int); + +class native_s8s32_dot_4VLx4 +{ +public: + typedef int8_t operand_type; + typedef int32_t result_type; + + typedef void (*kern_type)(const int8_t *, int, const int8_t *, int ldb, int32_t *, int, int32_t, int, int, int); + + /* Kernel blocking parameters */ + static int out_height() + { + return 4; + } + + static int out_width() + { + return get_vector_length() * 4; + } + + static int k_unroll() + { + return 4; + } + + + + // Default to the generic kernel + kern_type kernel=sve_native_s8s32_dot_4VLx4; + + native_s8s32_dot_4VLx4(const CPUInfo *ci) + { + + } +}; + +} // namespace arm_gemm + +#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4/generic.cpp new file mode 100644 index 0000000000..9c02d95044 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4/generic.cpp @@ -0,0 +1,4632 @@ +/* + * Copyright (c) 2019 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifdef __ARM_FEATURE_SVE + +#include + +#include +#include "../../asmlib.hpp" +#include "../../utils.hpp" + +namespace arm_gemm { + +void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int ldb, int32_t *C, int ldc, int32_t beta, int M, int N, int K) { + const long beta0 = (beta == 0); + const long loops_count = ((K + 16) / 32) - 1; + K -= loops_count * 32; + const long regs_count = (K / 16) - 1; + K -= (regs_count + 1) * 16; + const long leftovers = K; + const long blocks_count = K / 4; + const long odds_count = K - (blocks_count * 4); + + for (int y=0; y())) { + const long width = std::min((unsigned long)N-x0, (4 * get_vector_length())); + const int32_t *betaptr = β + long loops = loops_count; + long regs = regs_count; + long temp = 0; + long blocks = blocks_count; + long odds = odds_count; + const int8_t *a_ptr0 = a_ptr0_base; + const int8_t *b_ptr0 = B + x0; + const int8_t *b_ptr1 = b_ptr0 + ldb; + const int8_t *b_ptr2 = b_ptr1 + ldb; + const int8_t *b_ptr3 = b_ptr2 + ldb; + long ldbb = ldb * sizeof(int8_t) * 4; + + switch(M-y) { + case 1: + __asm __volatile ( + "whilelt p6.b, %[temp], %[leftovers]\n" + "whilelt p0.s, %[temp], %[width]\n" + "whilelt p4.b, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "ptrue p7.b\n" + "whilelt p1.s, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "whilelt p2.s, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "whilelt p3.s, %[temp], %[width]\n" + "cbz %[beta0], 1f\n" + "mov z16.s, #0\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" + "mov z17.s, #0\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "mov z18.s, #0\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "mov z19.s, #0\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "b 2f\n" + "1:\n" + "ld1rw z15.s, p7/z, [%[betaptr]]\n" + "ld1w z16.s, p0/z, [%[c_ptr0]]\n" + "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" + "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n" + "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n" + "ld1rqb z0.b, p7/z, 
[%[a_ptr0]]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "mul z16.s, p7/m, z16.s, z15.s\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "mul z17.s, p7/m, z17.s, z15.s\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "mul z18.s, p7/m, z18.s, z15.s\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "mul z19.s, p7/m, z19.s, z15.s\n" + "2:\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip2 z11.b, z8.b, z9.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "cbz %[loops], 3f\n" + "4:\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "sdot z16.s, z8.b, z0.b[0]\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "sdot z17.s, z9.b, z0.b[0]\n" + "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" + "sdot z18.s, z10.b, z0.b[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z19.s, z11.b, z0.b[0]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip2 z8.b, z14.b, z12.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip1 z14.b, z14.b, z12.b\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip1 z12.b, z13.b, z14.b\n" + "add %[a_ptr0], %[a_ptr0], #0x20\n" + "zip2 z13.b, z13.b, z14.b\n" + "subs %[loops], %[loops], #0x1\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "sdot z16.s, z12.b, z0.b[1]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z17.s, z13.b, z0.b[1]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "sdot z18.s, z14.b, z0.b[1]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "add %[b_ptr2], 
%[b_ptr2], %[ldb]\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "sdot z19.s, z15.b, z0.b[1]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z16.s, z8.b, z0.b[2]\n" + "sdot z17.s, z9.b, z0.b[2]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "sdot z18.s, z10.b, z0.b[2]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z15.b, z12.b, z13.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "sdot z19.s, z11.b, z0.b[2]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z16.s, z12.b, z0.b[3]\n" + "sdot z17.s, z13.b, z0.b[3]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "sdot z18.s, z14.b, z0.b[3]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "sdot z19.s, z15.b, z0.b[3]\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z16.s, z8.b, z4.b[0]\n" + "sdot z17.s, z9.b, z4.b[0]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "sdot z18.s, z10.b, z4.b[0]\n" + "ld1b z10.b, p4/z, 
[%[b_ptr1]]\n" + "zip2 z15.b, z12.b, z13.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "sdot z19.s, z11.b, z4.b[0]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z16.s, z12.b, z4.b[1]\n" + "sdot z17.s, z13.b, z4.b[1]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "sdot z18.s, z14.b, z4.b[1]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "sdot z19.s, z15.b, z4.b[1]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z16.s, z8.b, z4.b[2]\n" + "sdot z17.s, z9.b, z4.b[2]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "sdot z18.s, z10.b, z4.b[2]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z15.b, z12.b, z13.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "sdot z19.s, z11.b, z4.b[2]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z16.s, z12.b, z4.b[3]\n" + "sdot z17.s, z13.b, z4.b[3]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "sdot z18.s, z14.b, z4.b[3]\n" + 
"ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "sdot z19.s, z15.b, z4.b[3]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "b.ne 4b\n" + "3:\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "cbz %[regs], 5f\n" + "sdot z16.s, z8.b, z0.b[0]\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "sdot z17.s, z9.b, z0.b[0]\n" + "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" + "sdot z18.s, z10.b, z0.b[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z19.s, z11.b, z0.b[0]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip2 z8.b, z14.b, z12.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip1 z14.b, z14.b, z12.b\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z16.s, z12.b, z0.b[1]\n" + "sdot z17.s, z13.b, z0.b[1]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "sdot z18.s, z14.b, z0.b[1]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "sdot z19.s, z15.b, z0.b[1]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z16.s, z8.b, z0.b[2]\n" + "sdot 
z17.s, z9.b, z0.b[2]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "sdot z18.s, z10.b, z0.b[2]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z15.b, z12.b, z13.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "sdot z19.s, z11.b, z0.b[2]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z16.s, z12.b, z0.b[3]\n" + "sdot z17.s, z13.b, z0.b[3]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "sdot z18.s, z14.b, z0.b[3]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "sdot z19.s, z15.b, z0.b[3]\n" + "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z16.s, z8.b, z4.b[0]\n" + "sdot z17.s, z9.b, z4.b[0]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "sdot z18.s, z10.b, z4.b[0]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z15.b, z12.b, z13.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "sdot z19.s, z11.b, z4.b[0]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "add %[b_ptr0], 
%[b_ptr0], %[ldb]\n" + "sdot z16.s, z12.b, z4.b[1]\n" + "sdot z17.s, z13.b, z4.b[1]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "sdot z18.s, z14.b, z4.b[1]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "sdot z19.s, z15.b, z4.b[1]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "sdot z16.s, z8.b, z4.b[2]\n" + "sdot z17.s, z9.b, z4.b[2]\n" + "sdot z18.s, z10.b, z4.b[2]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "sdot z19.s, z11.b, z4.b[2]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "sdot z16.s, z12.b, z4.b[3]\n" + "sdot z17.s, z13.b, z4.b[3]\n" + "sdot z18.s, z14.b, z4.b[3]\n" + "sdot z19.s, z15.b, z4.b[3]\n" + "cbz %[blocks], 6f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "sdot z16.s, z8.b, z0.b[0]\n" + "sdot z17.s, z9.b, z0.b[0]\n" + "sdot z18.s, z10.b, z0.b[0]\n" + "sdot z19.s, z11.b, z0.b[0]\n" + "b.eq 7f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], 
%[b_ptr3], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "sdot z16.s, z12.b, z0.b[1]\n" + "sdot z17.s, z13.b, z0.b[1]\n" + "sdot z18.s, z14.b, z0.b[1]\n" + "sdot z19.s, z15.b, z0.b[1]\n" + "b.eq 8f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "sdot z16.s, z8.b, z0.b[2]\n" + "sdot z17.s, z9.b, z0.b[2]\n" + "sdot z18.s, z10.b, z0.b[2]\n" + "sdot z19.s, z11.b, z0.b[2]\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 10f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 11f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 12f\n" + "11:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 12f\n" + "10:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z14.b, #0\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "12:\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, 
z13.b\n" + "mov z12.b, #0\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "sdot z16.s, z12.b, z0.b[3]\n" + "sdot z17.s, z13.b, z0.b[3]\n" + "sdot z18.s, z14.b, z0.b[3]\n" + "sdot z19.s, z15.b, z0.b[3]\n" + "b 9f\n" + "8:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 13f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 14f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 15f\n" + "14:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 15f\n" + "13:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z10.b, #0\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "15:\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "mov z8.b, #0\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "sdot z16.s, z8.b, z0.b[2]\n" + "sdot z17.s, z9.b, z0.b[2]\n" + "sdot z18.s, z10.b, z0.b[2]\n" + "sdot z19.s, z11.b, z0.b[2]\n" + "b 9f\n" + "7:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 16f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 17f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 18f\n" + "17:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 18f\n" + 
"16:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z14.b, #0\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "18:\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "mov z12.b, #0\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "sdot z16.s, z12.b, z0.b[1]\n" + "sdot z17.s, z13.b, z0.b[1]\n" + "sdot z18.s, z14.b, z0.b[1]\n" + "sdot z19.s, z15.b, z0.b[1]\n" + "b 9f\n" + "6:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 19f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 20f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 21f\n" + "20:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 21f\n" + "19:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z10.b, #0\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "21:\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "mov z8.b, #0\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "sdot z16.s, z8.b, z0.b[0]\n" + "sdot z17.s, z9.b, z0.b[0]\n" + "sdot z18.s, z10.b, z0.b[0]\n" + "sdot z19.s, z11.b, z0.b[0]\n" + "b 9f\n" + "5:\n" + "sdot z16.s, z8.b, z0.b[0]\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "sdot z17.s, z9.b, z0.b[0]\n" + "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n" + "sdot z18.s, z10.b, z0.b[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z19.s, z11.b, z0.b[0]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip2 z8.b, z14.b, z12.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip1 z14.b, 
z14.b, z12.b\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z16.s, z12.b, z0.b[1]\n" + "sdot z17.s, z13.b, z0.b[1]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "sdot z18.s, z14.b, z0.b[1]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "sdot z19.s, z15.b, z0.b[1]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "sdot z16.s, z8.b, z0.b[2]\n" + "sdot z17.s, z9.b, z0.b[2]\n" + "sdot z18.s, z10.b, z0.b[2]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "sdot z19.s, z11.b, z0.b[2]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "sdot z16.s, z12.b, z0.b[3]\n" + "sdot z17.s, z13.b, z0.b[3]\n" + "sdot z18.s, z14.b, z0.b[3]\n" + "sdot z19.s, z15.b, z0.b[3]\n" + "cbz %[blocks], 22f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" 
+ "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "sdot z16.s, z8.b, z4.b[0]\n" + "sdot z17.s, z9.b, z4.b[0]\n" + "sdot z18.s, z10.b, z4.b[0]\n" + "sdot z19.s, z11.b, z4.b[0]\n" + "b.eq 23f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "sdot z16.s, z12.b, z4.b[1]\n" + "sdot z17.s, z13.b, z4.b[1]\n" + "sdot z18.s, z14.b, z4.b[1]\n" + "sdot z19.s, z15.b, z4.b[1]\n" + "b.eq 24f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "sdot z16.s, z8.b, z4.b[2]\n" + "sdot z17.s, z9.b, z4.b[2]\n" + "sdot z18.s, z10.b, z4.b[2]\n" + "sdot z19.s, z11.b, z4.b[2]\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 25f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 26f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 27f\n" + "26:\n" + 
"mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 27f\n" + "25:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z14.b, #0\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "27:\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "mov z12.b, #0\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "sdot z16.s, z12.b, z4.b[3]\n" + "sdot z17.s, z13.b, z4.b[3]\n" + "sdot z18.s, z14.b, z4.b[3]\n" + "sdot z19.s, z15.b, z4.b[3]\n" + "b 9f\n" + "24:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 28f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 29f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 30f\n" + "29:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 30f\n" + "28:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z10.b, #0\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "30:\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "mov z8.b, #0\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "sdot z16.s, z8.b, z4.b[2]\n" + "sdot z17.s, z9.b, z4.b[2]\n" + "sdot z18.s, z10.b, z4.b[2]\n" + "sdot z19.s, z11.b, z4.b[2]\n" + "b 9f\n" + "23:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 31f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 32f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], 
%[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 33f\n" + "32:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 33f\n" + "31:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z14.b, #0\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "33:\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "mov z12.b, #0\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "sdot z16.s, z12.b, z4.b[1]\n" + "sdot z17.s, z13.b, z4.b[1]\n" + "sdot z18.s, z14.b, z4.b[1]\n" + "sdot z19.s, z15.b, z4.b[1]\n" + "b 9f\n" + "22:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 34f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 35f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 36f\n" + "35:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 36f\n" + "34:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z10.b, #0\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "36:\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "mov z8.b, #0\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "sdot z16.s, z8.b, z4.b[0]\n" + "sdot z17.s, z9.b, z4.b[0]\n" + "sdot z18.s, z10.b, z4.b[0]\n" + "sdot z19.s, z11.b, z4.b[0]\n" + "9:\n" + 
"st1w z16.s, p0, [%[c_ptr0]]\n" + "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n" + "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n" + "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n" + "addvl %[c_ptr0], %[c_ptr0], #4\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [b_ptr1] "+r" (b_ptr1), [b_ptr2] "+r" (b_ptr2), [b_ptr3] "+r" (b_ptr3), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb) + : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 2: + __asm __volatile ( + "a_ptr1 .req X0\n" + "c_ptr1 .req X1\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.b, %[temp], %[leftovers]\n" + "whilelt p0.s, %[temp], %[width]\n" + "whilelt p4.b, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "ptrue p7.b\n" + "whilelt p1.s, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "whilelt p2.s, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "whilelt p3.s, %[temp], %[width]\n" + "cbz %[beta0], 1f\n" + "mov z16.s, #0\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" + "mov z17.s, #0\n" + "ld1rqb z1.b, p7/z, [a_ptr1]\n" + "mov z18.s, #0\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "mov z19.s, #0\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "mov z20.s, #0\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "mov z21.s, #0\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "mov z22.s, #0\n" + "add a_ptr1, a_ptr1, #0x10\n" + "zip2 z11.b, z8.b, z9.b\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "mov z23.s, #0\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add 
%[b_ptr3], %[b_ptr3], %[ldb]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 2f\n" + "1:\n" + "ld1rw z15.s, p7/z, [%[betaptr]]\n" + "ld1w z16.s, p0/z, [%[c_ptr0]]\n" + "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" + "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n" + "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n" + "ld1w z20.s, p0/z, [c_ptr1]\n" + "mul z16.s, p7/m, z16.s, z15.s\n" + "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n" + "mul z17.s, p7/m, z17.s, z15.s\n" + "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n" + "mul z18.s, p7/m, z18.s, z15.s\n" + "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n" + "mul z19.s, p7/m, z19.s, z15.s\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" + "mul z20.s, p7/m, z20.s, z15.s\n" + "ld1rqb z1.b, p7/z, [a_ptr1]\n" + "mul z21.s, p7/m, z21.s, z15.s\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "mul z22.s, p7/m, z22.s, z15.s\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "mul z23.s, p7/m, z23.s, z15.s\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "add a_ptr1, a_ptr1, #0x10\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "zip2 z11.b, z8.b, z9.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip1 z10.b, z10.b, z8.b\n" + "2:\n" + "cbz %[loops], 3f\n" + "4:\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "sdot z16.s, z8.b, z0.b[0]\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "sdot z20.s, z8.b, z1.b[0]\n" + "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" + "sdot z17.s, z9.b, z0.b[0]\n" + "ld1rqb z5.b, p7/z, [a_ptr1]\n" + "sdot z21.s, z9.b, z1.b[0]\n" + "add %[b_ptr0], 
%[b_ptr0], %[ldb]\n" + "zip2 z8.b, z14.b, z12.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z14.b, z14.b, z12.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "sdot z18.s, z10.b, z0.b[0]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "sdot z22.s, z10.b, z1.b[0]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "zip1 z12.b, z13.b, z14.b\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z13.b, z13.b, z14.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z14.b, z15.b, z8.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "sdot z19.s, z11.b, z0.b[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z23.s, z11.b, z1.b[0]\n" + "add %[a_ptr0], %[a_ptr0], #0x20\n" + "zip2 z11.b, z8.b, z9.b\n" + "add a_ptr1, a_ptr1, #0x20\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "sdot z16.s, z12.b, z0.b[1]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "sdot z20.s, z12.b, z1.b[1]\n" + "subs %[loops], %[loops], #0x1\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "sdot z17.s, z13.b, z0.b[1]\n" + "sdot z21.s, z13.b, z1.b[1]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "sdot z18.s, z14.b, z0.b[1]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "sdot z22.s, z14.b, z1.b[1]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "sdot z19.s, z15.b, z0.b[1]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z23.s, z15.b, z1.b[1]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "sdot z16.s, z8.b, z0.b[2]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "sdot z20.s, z8.b, z1.b[2]\n" + "sdot z17.s, z9.b, z0.b[2]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "sdot z21.s, z9.b, z1.b[2]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "sdot z18.s, z10.b, z0.b[2]\n" + "add %[b_ptr2], 
%[b_ptr2], %[ldb]\n" + "sdot z22.s, z10.b, z1.b[2]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip1 z12.b, z13.b, z14.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "sdot z19.s, z11.b, z0.b[2]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z23.s, z11.b, z1.b[2]\n" + "sdot z16.s, z12.b, z0.b[3]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "sdot z20.s, z12.b, z1.b[3]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "sdot z17.s, z13.b, z0.b[3]\n" + "sdot z21.s, z13.b, z1.b[3]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z10.b, z10.b, z8.b\n" + "sdot z18.s, z14.b, z0.b[3]\n" + "sdot z22.s, z14.b, z1.b[3]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "sdot z19.s, z15.b, z0.b[3]\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n" + "sdot z23.s, z15.b, z1.b[3]\n" + "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n" + "zip1 z8.b, z9.b, z10.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "sdot z16.s, z8.b, z4.b[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z20.s, z8.b, z5.b[0]\n" + "sdot z17.s, z9.b, z4.b[0]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "sdot z21.s, z9.b, z5.b[0]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "sdot z18.s, z10.b, z4.b[0]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "sdot z22.s, z10.b, z5.b[0]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z8.b, z14.b, z12.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z14.b, z14.b, z12.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "sdot z19.s, z11.b, z4.b[0]\n" + "sdot z23.s, z11.b, z5.b[0]\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, 
z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z16.s, z12.b, z4.b[1]\n" + "sdot z20.s, z12.b, z5.b[1]\n" + "sdot z17.s, z13.b, z4.b[1]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "sdot z21.s, z13.b, z5.b[1]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "sdot z18.s, z14.b, z4.b[1]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "sdot z22.s, z14.b, z5.b[1]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z10.b, z10.b, z8.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "sdot z19.s, z15.b, z4.b[1]\n" + "sdot z23.s, z15.b, z5.b[1]\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z16.s, z8.b, z4.b[2]\n" + "sdot z20.s, z8.b, z5.b[2]\n" + "sdot z17.s, z9.b, z4.b[2]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "sdot z21.s, z9.b, z5.b[2]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "sdot z18.s, z10.b, z4.b[2]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "sdot z22.s, z10.b, z5.b[2]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z8.b, z14.b, z12.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z14.b, z14.b, z12.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "sdot z19.s, z11.b, z4.b[2]\n" + "sdot z23.s, z11.b, z5.b[2]\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z16.s, z12.b, z4.b[3]\n" + "sdot z20.s, z12.b, z5.b[3]\n" + "sdot z17.s, z13.b, z4.b[3]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "sdot z21.s, z13.b, z5.b[3]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "sdot z18.s, z14.b, z4.b[3]\n" + "add %[b_ptr3], 
%[b_ptr3], %[ldb]\n" + "sdot z22.s, z14.b, z5.b[3]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "sdot z19.s, z15.b, z4.b[3]\n" + "sdot z23.s, z15.b, z5.b[3]\n" + "b.ne 4b\n" + "3:\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "cbz %[regs], 5f\n" + "sdot z16.s, z8.b, z0.b[0]\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "sdot z20.s, z8.b, z1.b[0]\n" + "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" + "sdot z17.s, z9.b, z0.b[0]\n" + "ld1rqb z5.b, p7/z, [a_ptr1]\n" + "sdot z21.s, z9.b, z1.b[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "zip2 z8.b, z14.b, z12.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z14.b, z14.b, z12.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "sdot z18.s, z10.b, z0.b[0]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "sdot z22.s, z10.b, z1.b[0]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "zip1 z12.b, z13.b, z14.b\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z13.b, z13.b, z14.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z14.b, z15.b, z8.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "sdot z19.s, z11.b, z0.b[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z23.s, z11.b, z1.b[0]\n" + "sdot z16.s, z12.b, z0.b[1]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "sdot z20.s, z12.b, z1.b[1]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "sdot z17.s, z13.b, z0.b[1]\n" + "sdot z21.s, z13.b, z1.b[1]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z10.b, z10.b, z8.b\n" + "sdot z18.s, z14.b, z0.b[1]\n" + "sdot z22.s, z14.b, z1.b[1]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "sdot z19.s, z15.b, z0.b[1]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip1 z8.b, z9.b, 
z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "sdot z23.s, z15.b, z1.b[1]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z16.s, z8.b, z0.b[2]\n" + "sdot z20.s, z8.b, z1.b[2]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "sdot z17.s, z9.b, z0.b[2]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "sdot z21.s, z9.b, z1.b[2]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "sdot z18.s, z10.b, z0.b[2]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "sdot z22.s, z10.b, z1.b[2]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "sdot z19.s, z11.b, z0.b[2]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "sdot z23.s, z11.b, z1.b[2]\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z16.s, z12.b, z0.b[3]\n" + "sdot z20.s, z12.b, z1.b[3]\n" + "sdot z17.s, z13.b, z0.b[3]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "sdot z21.s, z13.b, z1.b[3]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "sdot z18.s, z14.b, z0.b[3]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "sdot z22.s, z14.b, z1.b[3]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z10.b, z10.b, z8.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "sdot z19.s, z15.b, z0.b[3]\n" + "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n" + "sdot z23.s, z15.b, z1.b[3]\n" + "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z16.s, z8.b, z4.b[0]\n" + "sdot z20.s, z8.b, z5.b[0]\n" + "sdot z17.s, z9.b, z4.b[0]\n" + "zip2 
z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "sdot z21.s, z9.b, z5.b[0]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "sdot z18.s, z10.b, z4.b[0]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "sdot z22.s, z10.b, z5.b[0]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z8.b, z14.b, z12.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z14.b, z14.b, z12.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "sdot z19.s, z11.b, z4.b[0]\n" + "sdot z23.s, z11.b, z5.b[0]\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z16.s, z12.b, z4.b[1]\n" + "sdot z20.s, z12.b, z5.b[1]\n" + "sdot z17.s, z13.b, z4.b[1]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "sdot z21.s, z13.b, z5.b[1]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "sdot z18.s, z14.b, z4.b[1]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "sdot z22.s, z14.b, z5.b[1]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "sdot z19.s, z15.b, z4.b[1]\n" + "sdot z23.s, z15.b, z5.b[1]\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "sdot z16.s, z8.b, z4.b[2]\n" + "sdot z20.s, z8.b, z5.b[2]\n" + "sdot z17.s, z9.b, z4.b[2]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "sdot z21.s, z9.b, z5.b[2]\n" + "sdot z18.s, z10.b, z4.b[2]\n" + "sdot z22.s, z10.b, z5.b[2]\n" + "sdot z19.s, z11.b, z4.b[2]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "sdot z23.s, z11.b, z5.b[2]\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "sdot z16.s, z12.b, z4.b[3]\n" + "sdot z20.s, z12.b, z5.b[3]\n" + "sdot 
z17.s, z13.b, z4.b[3]\n" + "sdot z21.s, z13.b, z5.b[3]\n" + "sdot z18.s, z14.b, z4.b[3]\n" + "sdot z22.s, z14.b, z5.b[3]\n" + "sdot z19.s, z15.b, z4.b[3]\n" + "sdot z23.s, z15.b, z5.b[3]\n" + "cbz %[blocks], 6f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "sdot z16.s, z8.b, z0.b[0]\n" + "sdot z20.s, z8.b, z1.b[0]\n" + "sdot z17.s, z9.b, z0.b[0]\n" + "sdot z21.s, z9.b, z1.b[0]\n" + "sdot z18.s, z10.b, z0.b[0]\n" + "sdot z22.s, z10.b, z1.b[0]\n" + "sdot z19.s, z11.b, z0.b[0]\n" + "sdot z23.s, z11.b, z1.b[0]\n" + "b.eq 7f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "sdot z16.s, z12.b, z0.b[1]\n" + "sdot z20.s, z12.b, z1.b[1]\n" + "sdot z17.s, z13.b, z0.b[1]\n" + "sdot z21.s, z13.b, z1.b[1]\n" + "sdot z18.s, z14.b, z0.b[1]\n" + "sdot z22.s, z14.b, z1.b[1]\n" + "sdot z19.s, z15.b, z0.b[1]\n" + "sdot z23.s, z15.b, z1.b[1]\n" + "b.eq 8f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" 
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "sdot z16.s, z8.b, z0.b[2]\n" + "sdot z20.s, z8.b, z1.b[2]\n" + "sdot z17.s, z9.b, z0.b[2]\n" + "sdot z21.s, z9.b, z1.b[2]\n" + "sdot z18.s, z10.b, z0.b[2]\n" + "sdot z22.s, z10.b, z1.b[2]\n" + "sdot z19.s, z11.b, z0.b[2]\n" + "sdot z23.s, z11.b, z1.b[2]\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 10f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 11f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 12f\n" + "11:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 12f\n" + "10:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z14.b, #0\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "12:\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "mov z12.b, #0\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "sdot z16.s, z12.b, z0.b[3]\n" + "sdot z20.s, z12.b, z1.b[3]\n" + "sdot z17.s, z13.b, z0.b[3]\n" + "sdot z21.s, z13.b, z1.b[3]\n" + "sdot z18.s, z14.b, z0.b[3]\n" + "sdot z22.s, z14.b, z1.b[3]\n" + "sdot z19.s, z15.b, z0.b[3]\n" + "sdot z23.s, z15.b, z1.b[3]\n" + "b 9f\n" + "8:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], 
#0x1\n" + "b.eq 13f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 14f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 15f\n" + "14:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 15f\n" + "13:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z10.b, #0\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "15:\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "mov z8.b, #0\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "sdot z16.s, z8.b, z0.b[2]\n" + "sdot z20.s, z8.b, z1.b[2]\n" + "sdot z17.s, z9.b, z0.b[2]\n" + "sdot z21.s, z9.b, z1.b[2]\n" + "sdot z18.s, z10.b, z0.b[2]\n" + "sdot z22.s, z10.b, z1.b[2]\n" + "sdot z19.s, z11.b, z0.b[2]\n" + "sdot z23.s, z11.b, z1.b[2]\n" + "b 9f\n" + "7:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 16f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 17f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 18f\n" + "17:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 18f\n" + "16:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z14.b, #0\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "18:\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "mov z12.b, #0\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + 
"zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "sdot z16.s, z12.b, z0.b[1]\n" + "sdot z20.s, z12.b, z1.b[1]\n" + "sdot z17.s, z13.b, z0.b[1]\n" + "sdot z21.s, z13.b, z1.b[1]\n" + "sdot z18.s, z14.b, z0.b[1]\n" + "sdot z22.s, z14.b, z1.b[1]\n" + "sdot z19.s, z15.b, z0.b[1]\n" + "sdot z23.s, z15.b, z1.b[1]\n" + "b 9f\n" + "6:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 19f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 20f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 21f\n" + "20:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 21f\n" + "19:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z10.b, #0\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "21:\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "mov z8.b, #0\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "sdot z16.s, z8.b, z0.b[0]\n" + "sdot z20.s, z8.b, z1.b[0]\n" + "sdot z17.s, z9.b, z0.b[0]\n" + "sdot z21.s, z9.b, z1.b[0]\n" + "sdot z18.s, z10.b, z0.b[0]\n" + "sdot z22.s, z10.b, z1.b[0]\n" + "sdot z19.s, z11.b, z0.b[0]\n" + "sdot z23.s, z11.b, z1.b[0]\n" + "b 9f\n" + "5:\n" + "sdot z16.s, z8.b, z0.b[0]\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "sdot z20.s, z8.b, z1.b[0]\n" + "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n" + "sdot z17.s, z9.b, z0.b[0]\n" + "ld1rqb z5.b, p6/z, [a_ptr1]\n" + "sdot z21.s, z9.b, z1.b[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "zip2 z8.b, z14.b, z12.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z14.b, z14.b, z12.b\n" + "add 
%[b_ptr1], %[b_ptr1], %[ldb]\n" + "sdot z18.s, z10.b, z0.b[0]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "sdot z22.s, z10.b, z1.b[0]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "zip1 z12.b, z13.b, z14.b\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z13.b, z13.b, z14.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z14.b, z15.b, z8.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "sdot z19.s, z11.b, z0.b[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z23.s, z11.b, z1.b[0]\n" + "sdot z16.s, z12.b, z0.b[1]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "sdot z20.s, z12.b, z1.b[1]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "sdot z17.s, z13.b, z0.b[1]\n" + "sdot z21.s, z13.b, z1.b[1]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "sdot z18.s, z14.b, z0.b[1]\n" + "sdot z22.s, z14.b, z1.b[1]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "sdot z19.s, z15.b, z0.b[1]\n" + "sdot z23.s, z15.b, z1.b[1]\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "sdot z16.s, z8.b, z0.b[2]\n" + "sdot z20.s, z8.b, z1.b[2]\n" + "sdot z17.s, z9.b, z0.b[2]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "sdot z21.s, z9.b, z1.b[2]\n" + "sdot z18.s, z10.b, z0.b[2]\n" + "sdot z22.s, z10.b, z1.b[2]\n" + "sdot z19.s, z11.b, z0.b[2]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "sdot z23.s, z11.b, z1.b[2]\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "sdot z16.s, z12.b, z0.b[3]\n" + "sdot z20.s, z12.b, z1.b[3]\n" + "sdot z17.s, z13.b, z0.b[3]\n" + "sdot z21.s, z13.b, z1.b[3]\n" + "sdot z18.s, z14.b, z0.b[3]\n" + "sdot z22.s, z14.b, z1.b[3]\n" + "sdot z19.s, z15.b, 
z0.b[3]\n" + "sdot z23.s, z15.b, z1.b[3]\n" + "cbz %[blocks], 22f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "sdot z16.s, z8.b, z4.b[0]\n" + "sdot z20.s, z8.b, z5.b[0]\n" + "sdot z17.s, z9.b, z4.b[0]\n" + "sdot z21.s, z9.b, z5.b[0]\n" + "sdot z18.s, z10.b, z4.b[0]\n" + "sdot z22.s, z10.b, z5.b[0]\n" + "sdot z19.s, z11.b, z4.b[0]\n" + "sdot z23.s, z11.b, z5.b[0]\n" + "b.eq 23f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "sdot z16.s, z12.b, z4.b[1]\n" + "sdot z20.s, z12.b, z5.b[1]\n" + "sdot z17.s, z13.b, z4.b[1]\n" + "sdot z21.s, z13.b, z5.b[1]\n" + "sdot z18.s, z14.b, z4.b[1]\n" + "sdot z22.s, z14.b, z5.b[1]\n" + "sdot z19.s, z15.b, z4.b[1]\n" + "sdot z23.s, z15.b, z5.b[1]\n" + "b.eq 24f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, 
[%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "sdot z16.s, z8.b, z4.b[2]\n" + "sdot z20.s, z8.b, z5.b[2]\n" + "sdot z17.s, z9.b, z4.b[2]\n" + "sdot z21.s, z9.b, z5.b[2]\n" + "sdot z18.s, z10.b, z4.b[2]\n" + "sdot z22.s, z10.b, z5.b[2]\n" + "sdot z19.s, z11.b, z4.b[2]\n" + "sdot z23.s, z11.b, z5.b[2]\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 25f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 26f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 27f\n" + "26:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 27f\n" + "25:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z14.b, #0\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "27:\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "mov z12.b, #0\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "sdot z16.s, z12.b, z4.b[3]\n" + "sdot z20.s, z12.b, z5.b[3]\n" + "sdot z17.s, z13.b, z4.b[3]\n" + "sdot z21.s, z13.b, z5.b[3]\n" + "sdot z18.s, z14.b, z4.b[3]\n" + "sdot z22.s, z14.b, z5.b[3]\n" + "sdot z19.s, z15.b, z4.b[3]\n" + "sdot z23.s, z15.b, z5.b[3]\n" + "b 9f\n" + "24:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 28f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 29f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], 
%[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 30f\n" + "29:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 30f\n" + "28:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z10.b, #0\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "30:\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "mov z8.b, #0\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "sdot z16.s, z8.b, z4.b[2]\n" + "sdot z20.s, z8.b, z5.b[2]\n" + "sdot z17.s, z9.b, z4.b[2]\n" + "sdot z21.s, z9.b, z5.b[2]\n" + "sdot z18.s, z10.b, z4.b[2]\n" + "sdot z22.s, z10.b, z5.b[2]\n" + "sdot z19.s, z11.b, z4.b[2]\n" + "sdot z23.s, z11.b, z5.b[2]\n" + "b 9f\n" + "23:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 31f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 32f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 33f\n" + "32:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 33f\n" + "31:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z14.b, #0\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "33:\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "mov z12.b, #0\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "sdot 
z16.s, z12.b, z4.b[1]\n" + "sdot z20.s, z12.b, z5.b[1]\n" + "sdot z17.s, z13.b, z4.b[1]\n" + "sdot z21.s, z13.b, z5.b[1]\n" + "sdot z18.s, z14.b, z4.b[1]\n" + "sdot z22.s, z14.b, z5.b[1]\n" + "sdot z19.s, z15.b, z4.b[1]\n" + "sdot z23.s, z15.b, z5.b[1]\n" + "b 9f\n" + "22:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 34f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 35f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 36f\n" + "35:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 36f\n" + "34:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z10.b, #0\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "36:\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "mov z8.b, #0\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "sdot z16.s, z8.b, z4.b[0]\n" + "sdot z20.s, z8.b, z5.b[0]\n" + "sdot z17.s, z9.b, z4.b[0]\n" + "sdot z21.s, z9.b, z5.b[0]\n" + "sdot z18.s, z10.b, z4.b[0]\n" + "sdot z22.s, z10.b, z5.b[0]\n" + "sdot z19.s, z11.b, z4.b[0]\n" + "sdot z23.s, z11.b, z5.b[0]\n" + "9:\n" + "st1w z16.s, p0, [%[c_ptr0]]\n" + "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n" + "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n" + "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n" + "addvl %[c_ptr0], %[c_ptr0], #4\n" + "st1w z20.s, p0, [c_ptr1]\n" + "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n" + "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n" + "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n" + ".unreq a_ptr1\n" + ".unreq c_ptr1\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [b_ptr1] "+r" (b_ptr1), [b_ptr2] "+r" (b_ptr2), [b_ptr3] 
"+r" (b_ptr3), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb) + : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory" + ); + break; + case 3: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "c_ptr1 .req X2\n" + "c_ptr2 .req X3\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.b, %[temp], %[leftovers]\n" + "whilelt p0.s, %[temp], %[width]\n" + "whilelt p4.b, %[temp], %[width]\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "incw %[temp], all, mul #1\n" + "ptrue p7.b\n" + "whilelt p1.s, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "whilelt p2.s, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "whilelt p3.s, %[temp], %[width]\n" + "cbz %[beta0], 1f\n" + "mov z16.s, #0\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" + "mov z17.s, #0\n" + "ld1rqb z1.b, p7/z, [a_ptr1]\n" + "mov z18.s, #0\n" + "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "mov z19.s, #0\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "mov z20.s, #0\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "mov z21.s, #0\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "mov z22.s, #0\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "mov z23.s, #0\n" + "add a_ptr1, a_ptr1, #0x10\n" + "zip2 z11.b, z8.b, z9.b\n" + "add a_ptr2, a_ptr2, #0x10\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "mov z24.s, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z25.s, #0\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip2 z12.b, z10.b, z8.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip1 z10.b, z10.b, z8.b\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + 
"mov z26.s, #0\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "mov z27.s, #0\n" + "b 2f\n" + "1:\n" + "ld1rw z15.s, p7/z, [%[betaptr]]\n" + "ld1w z16.s, p0/z, [%[c_ptr0]]\n" + "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" + "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n" + "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n" + "ld1w z20.s, p0/z, [c_ptr1]\n" + "mul z16.s, p7/m, z16.s, z15.s\n" + "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n" + "mul z17.s, p7/m, z17.s, z15.s\n" + "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n" + "mul z18.s, p7/m, z18.s, z15.s\n" + "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n" + "mul z19.s, p7/m, z19.s, z15.s\n" + "ld1w z24.s, p0/z, [c_ptr2]\n" + "mul z20.s, p7/m, z20.s, z15.s\n" + "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n" + "mul z21.s, p7/m, z21.s, z15.s\n" + "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n" + "mul z22.s, p7/m, z22.s, z15.s\n" + "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n" + "mul z23.s, p7/m, z23.s, z15.s\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" + "mul z24.s, p7/m, z24.s, z15.s\n" + "ld1rqb z1.b, p7/z, [a_ptr1]\n" + "mul z25.s, p7/m, z25.s, z15.s\n" + "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "mul z26.s, p7/m, z26.s, z15.s\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "mul z27.s, p7/m, z27.s, z15.s\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "add a_ptr1, a_ptr1, #0x10\n" + "add a_ptr2, a_ptr2, #0x10\n" + "zip2 z11.b, z8.b, z9.b\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "zip2 z12.b, z10.b, z8.b\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "zip1 z10.b, z10.b, z8.b\n" + "2:\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "cbz %[loops], 3f\n" + "4:\n" + "zip2 z15.b, z12.b, 
z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "sdot z16.s, z8.b, z0.b[0]\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "sdot z20.s, z8.b, z1.b[0]\n" + "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" + "sdot z24.s, z8.b, z2.b[0]\n" + "ld1rqb z5.b, p7/z, [a_ptr1]\n" + "sdot z17.s, z9.b, z0.b[0]\n" + "ld1rqb z6.b, p7/z, [a_ptr2]\n" + "sdot z21.s, z9.b, z1.b[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "zip2 z8.b, z14.b, z12.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z14.b, z14.b, z12.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "sdot z25.s, z9.b, z2.b[0]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "sdot z18.s, z10.b, z0.b[0]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "zip1 z12.b, z13.b, z14.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip2 z13.b, z13.b, z14.b\n" + "add %[a_ptr0], %[a_ptr0], #0x20\n" + "zip1 z14.b, z15.b, z8.b\n" + "add a_ptr1, a_ptr1, #0x20\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "sdot z22.s, z10.b, z1.b[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z26.s, z10.b, z2.b[0]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "sdot z19.s, z11.b, z0.b[0]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "sdot z23.s, z11.b, z1.b[0]\n" + "add a_ptr2, a_ptr2, #0x20\n" + "sdot z27.s, z11.b, z2.b[0]\n" + "subs %[loops], %[loops], #0x1\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "sdot z16.s, z12.b, z0.b[1]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "sdot z20.s, z12.b, z1.b[1]\n" + "sdot z24.s, z12.b, z2.b[1]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "sdot z17.s, z13.b, z0.b[1]\n" + "sdot z21.s, z13.b, z1.b[1]\n" + "sdot z25.s, z13.b, z2.b[1]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "sdot z18.s, z14.b, z0.b[1]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "sdot z22.s, z14.b, z1.b[1]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + 
"sdot z26.s, z14.b, z2.b[1]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "sdot z19.s, z15.b, z0.b[1]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "sdot z23.s, z15.b, z1.b[1]\n" + "sdot z27.s, z15.b, z2.b[1]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "sdot z16.s, z8.b, z0.b[2]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "sdot z20.s, z8.b, z1.b[2]\n" + "sdot z24.s, z8.b, z2.b[2]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "sdot z17.s, z9.b, z0.b[2]\n" + "sdot z21.s, z9.b, z1.b[2]\n" + "sdot z25.s, z9.b, z2.b[2]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "sdot z18.s, z10.b, z0.b[2]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "sdot z22.s, z10.b, z1.b[2]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z26.s, z10.b, z2.b[2]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "sdot z19.s, z11.b, z0.b[2]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "sdot z23.s, z11.b, z1.b[2]\n" + "sdot z27.s, z11.b, z2.b[2]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "sdot z16.s, z12.b, z0.b[3]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "sdot z20.s, z12.b, z1.b[3]\n" + "sdot z24.s, z12.b, z2.b[3]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "sdot z17.s, z13.b, z0.b[3]\n" + "sdot z21.s, z13.b, z1.b[3]\n" + "sdot z25.s, z13.b, z2.b[3]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "sdot z18.s, z14.b, z0.b[3]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "sdot z22.s, z14.b, z1.b[3]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z26.s, z14.b, z2.b[3]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "sdot z19.s, z15.b, z0.b[3]\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0], 
#-0x10]\n" + "sdot z23.s, z15.b, z1.b[3]\n" + "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n" + "sdot z27.s, z15.b, z2.b[3]\n" + "ld1rqb z2.b, p7/z, [a_ptr2, #-0x10]\n" + "zip2 z15.b, z12.b, z13.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "sdot z16.s, z8.b, z4.b[0]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "sdot z20.s, z8.b, z5.b[0]\n" + "sdot z24.s, z8.b, z6.b[0]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "sdot z17.s, z9.b, z4.b[0]\n" + "sdot z21.s, z9.b, z5.b[0]\n" + "sdot z25.s, z9.b, z6.b[0]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "sdot z18.s, z10.b, z4.b[0]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "sdot z22.s, z10.b, z5.b[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z26.s, z10.b, z6.b[0]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "sdot z19.s, z11.b, z4.b[0]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "sdot z23.s, z11.b, z5.b[0]\n" + "sdot z27.s, z11.b, z6.b[0]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "sdot z16.s, z12.b, z4.b[1]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "sdot z20.s, z12.b, z5.b[1]\n" + "sdot z24.s, z12.b, z6.b[1]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "sdot z17.s, z13.b, z4.b[1]\n" + "sdot z21.s, z13.b, z5.b[1]\n" + "sdot z25.s, z13.b, z6.b[1]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "sdot z18.s, z14.b, z4.b[1]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "sdot z22.s, z14.b, z5.b[1]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z26.s, z14.b, z6.b[1]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "sdot z19.s, z15.b, z4.b[1]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" 
+ "sdot z23.s, z15.b, z5.b[1]\n" + "sdot z27.s, z15.b, z6.b[1]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "sdot z16.s, z8.b, z4.b[2]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "sdot z20.s, z8.b, z5.b[2]\n" + "sdot z24.s, z8.b, z6.b[2]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "sdot z17.s, z9.b, z4.b[2]\n" + "sdot z21.s, z9.b, z5.b[2]\n" + "sdot z25.s, z9.b, z6.b[2]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "sdot z18.s, z10.b, z4.b[2]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "sdot z22.s, z10.b, z5.b[2]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z26.s, z10.b, z6.b[2]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "sdot z19.s, z11.b, z4.b[2]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "sdot z23.s, z11.b, z5.b[2]\n" + "sdot z27.s, z11.b, z6.b[2]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "sdot z16.s, z12.b, z4.b[3]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "sdot z20.s, z12.b, z5.b[3]\n" + "sdot z24.s, z12.b, z6.b[3]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "sdot z17.s, z13.b, z4.b[3]\n" + "sdot z21.s, z13.b, z5.b[3]\n" + "sdot z25.s, z13.b, z6.b[3]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "sdot z18.s, z14.b, z4.b[3]\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "sdot z22.s, z14.b, z5.b[3]\n" + "sdot z26.s, z14.b, z6.b[3]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "sdot z19.s, z15.b, z4.b[3]\n" + "sdot z23.s, z15.b, z5.b[3]\n" + "sdot z27.s, z15.b, z6.b[3]\n" + "b.ne 4b\n" + "3:\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "cbz %[regs], 5f\n" + "sdot z16.s, z8.b, z0.b[0]\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "sdot 
z20.s, z8.b, z1.b[0]\n" + "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" + "sdot z24.s, z8.b, z2.b[0]\n" + "ld1rqb z5.b, p7/z, [a_ptr1]\n" + "sdot z17.s, z9.b, z0.b[0]\n" + "ld1rqb z6.b, p7/z, [a_ptr2]\n" + "sdot z21.s, z9.b, z1.b[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "zip2 z8.b, z14.b, z12.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z14.b, z14.b, z12.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "sdot z25.s, z9.b, z2.b[0]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "sdot z18.s, z10.b, z0.b[0]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "zip1 z12.b, z13.b, z14.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "sdot z22.s, z10.b, z1.b[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z26.s, z10.b, z2.b[0]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "sdot z19.s, z11.b, z0.b[0]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "sdot z23.s, z11.b, z1.b[0]\n" + "sdot z27.s, z11.b, z2.b[0]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "sdot z16.s, z12.b, z0.b[1]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "sdot z20.s, z12.b, z1.b[1]\n" + "sdot z24.s, z12.b, z2.b[1]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "sdot z17.s, z13.b, z0.b[1]\n" + "sdot z21.s, z13.b, z1.b[1]\n" + "sdot z25.s, z13.b, z2.b[1]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "sdot z18.s, z14.b, z0.b[1]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "sdot z22.s, z14.b, z1.b[1]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z26.s, z14.b, z2.b[1]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "sdot z19.s, z15.b, z0.b[1]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "sdot z23.s, z15.b, z1.b[1]\n" + "sdot z27.s, z15.b, z2.b[1]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, 
z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "sdot z16.s, z8.b, z0.b[2]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "sdot z20.s, z8.b, z1.b[2]\n" + "sdot z24.s, z8.b, z2.b[2]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "sdot z17.s, z9.b, z0.b[2]\n" + "sdot z21.s, z9.b, z1.b[2]\n" + "sdot z25.s, z9.b, z2.b[2]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "sdot z18.s, z10.b, z0.b[2]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "sdot z22.s, z10.b, z1.b[2]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z26.s, z10.b, z2.b[2]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "sdot z19.s, z11.b, z0.b[2]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "sdot z23.s, z11.b, z1.b[2]\n" + "sdot z27.s, z11.b, z2.b[2]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "sdot z16.s, z12.b, z0.b[3]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "sdot z20.s, z12.b, z1.b[3]\n" + "sdot z24.s, z12.b, z2.b[3]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "sdot z17.s, z13.b, z0.b[3]\n" + "sdot z21.s, z13.b, z1.b[3]\n" + "sdot z25.s, z13.b, z2.b[3]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "sdot z18.s, z14.b, z0.b[3]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "sdot z22.s, z14.b, z1.b[3]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z26.s, z14.b, z2.b[3]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "sdot z19.s, z15.b, z0.b[3]\n" + "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n" + "sdot z23.s, z15.b, z1.b[3]\n" + "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n" + "sdot z27.s, z15.b, z2.b[3]\n" + "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n" + "zip2 z15.b, z12.b, z13.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip1 z13.b, z12.b, 
z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "sdot z16.s, z8.b, z4.b[0]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "sdot z20.s, z8.b, z5.b[0]\n" + "sdot z24.s, z8.b, z6.b[0]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "sdot z17.s, z9.b, z4.b[0]\n" + "sdot z21.s, z9.b, z5.b[0]\n" + "sdot z25.s, z9.b, z6.b[0]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "sdot z18.s, z10.b, z4.b[0]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "sdot z22.s, z10.b, z5.b[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z26.s, z10.b, z6.b[0]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "sdot z19.s, z11.b, z4.b[0]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "sdot z23.s, z11.b, z5.b[0]\n" + "sdot z27.s, z11.b, z6.b[0]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "sdot z16.s, z12.b, z4.b[1]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "sdot z20.s, z12.b, z5.b[1]\n" + "sdot z24.s, z12.b, z6.b[1]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "sdot z17.s, z13.b, z4.b[1]\n" + "sdot z21.s, z13.b, z5.b[1]\n" + "sdot z25.s, z13.b, z6.b[1]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "sdot z18.s, z14.b, z4.b[1]\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "sdot z22.s, z14.b, z5.b[1]\n" + "sdot z26.s, z14.b, z6.b[1]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "sdot z19.s, z15.b, z4.b[1]\n" + "sdot z23.s, z15.b, z5.b[1]\n" + "sdot z27.s, z15.b, z6.b[1]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "sdot z16.s, z8.b, z4.b[2]\n" + "sdot z20.s, z8.b, z5.b[2]\n" + "sdot z24.s, z8.b, z6.b[2]\n" + "sdot z17.s, z9.b, z4.b[2]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "sdot z21.s, 
z9.b, z5.b[2]\n" + "sdot z25.s, z9.b, z6.b[2]\n" + "sdot z18.s, z10.b, z4.b[2]\n" + "sdot z22.s, z10.b, z5.b[2]\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "sdot z26.s, z10.b, z6.b[2]\n" + "sdot z19.s, z11.b, z4.b[2]\n" + "sdot z23.s, z11.b, z5.b[2]\n" + "sdot z27.s, z11.b, z6.b[2]\n" + "sdot z16.s, z12.b, z4.b[3]\n" + "sdot z20.s, z12.b, z5.b[3]\n" + "sdot z24.s, z12.b, z6.b[3]\n" + "sdot z17.s, z13.b, z4.b[3]\n" + "sdot z21.s, z13.b, z5.b[3]\n" + "sdot z25.s, z13.b, z6.b[3]\n" + "sdot z18.s, z14.b, z4.b[3]\n" + "sdot z22.s, z14.b, z5.b[3]\n" + "sdot z26.s, z14.b, z6.b[3]\n" + "sdot z19.s, z15.b, z4.b[3]\n" + "sdot z23.s, z15.b, z5.b[3]\n" + "sdot z27.s, z15.b, z6.b[3]\n" + "cbz %[blocks], 6f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "sdot z16.s, z8.b, z0.b[0]\n" + "sdot z20.s, z8.b, z1.b[0]\n" + "sdot z24.s, z8.b, z2.b[0]\n" + "sdot z17.s, z9.b, z0.b[0]\n" + "sdot z21.s, z9.b, z1.b[0]\n" + "sdot z25.s, z9.b, z2.b[0]\n" + "sdot z18.s, z10.b, z0.b[0]\n" + "sdot z22.s, z10.b, z1.b[0]\n" + "sdot z26.s, z10.b, z2.b[0]\n" + "sdot z19.s, z11.b, z0.b[0]\n" + "sdot z23.s, z11.b, z1.b[0]\n" + "sdot z27.s, z11.b, z2.b[0]\n" + "b.eq 7f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + 
"ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "sdot z16.s, z12.b, z0.b[1]\n" + "sdot z20.s, z12.b, z1.b[1]\n" + "sdot z24.s, z12.b, z2.b[1]\n" + "sdot z17.s, z13.b, z0.b[1]\n" + "sdot z21.s, z13.b, z1.b[1]\n" + "sdot z25.s, z13.b, z2.b[1]\n" + "sdot z18.s, z14.b, z0.b[1]\n" + "sdot z22.s, z14.b, z1.b[1]\n" + "sdot z26.s, z14.b, z2.b[1]\n" + "sdot z19.s, z15.b, z0.b[1]\n" + "sdot z23.s, z15.b, z1.b[1]\n" + "sdot z27.s, z15.b, z2.b[1]\n" + "b.eq 8f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "sdot z16.s, z8.b, z0.b[2]\n" + "sdot z20.s, z8.b, z1.b[2]\n" + "sdot z24.s, z8.b, z2.b[2]\n" + "sdot z17.s, z9.b, z0.b[2]\n" + "sdot z21.s, z9.b, z1.b[2]\n" + "sdot z25.s, z9.b, z2.b[2]\n" + "sdot z18.s, z10.b, z0.b[2]\n" + "sdot z22.s, z10.b, z1.b[2]\n" + "sdot z26.s, z10.b, z2.b[2]\n" + "sdot z19.s, z11.b, z0.b[2]\n" + "sdot z23.s, z11.b, z1.b[2]\n" + "sdot z27.s, z11.b, z2.b[2]\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 10f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 11f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + 
"ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 12f\n" + "11:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 12f\n" + "10:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z14.b, #0\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "12:\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "mov z12.b, #0\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "sdot z16.s, z12.b, z0.b[3]\n" + "sdot z20.s, z12.b, z1.b[3]\n" + "sdot z24.s, z12.b, z2.b[3]\n" + "sdot z17.s, z13.b, z0.b[3]\n" + "sdot z21.s, z13.b, z1.b[3]\n" + "sdot z25.s, z13.b, z2.b[3]\n" + "sdot z18.s, z14.b, z0.b[3]\n" + "sdot z22.s, z14.b, z1.b[3]\n" + "sdot z26.s, z14.b, z2.b[3]\n" + "sdot z19.s, z15.b, z0.b[3]\n" + "sdot z23.s, z15.b, z1.b[3]\n" + "sdot z27.s, z15.b, z2.b[3]\n" + "b 9f\n" + "8:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 13f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 14f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 15f\n" + "14:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 15f\n" + "13:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z10.b, #0\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "15:\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "mov z8.b, #0\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "sdot 
z16.s, z8.b, z0.b[2]\n" + "sdot z20.s, z8.b, z1.b[2]\n" + "sdot z24.s, z8.b, z2.b[2]\n" + "sdot z17.s, z9.b, z0.b[2]\n" + "sdot z21.s, z9.b, z1.b[2]\n" + "sdot z25.s, z9.b, z2.b[2]\n" + "sdot z18.s, z10.b, z0.b[2]\n" + "sdot z22.s, z10.b, z1.b[2]\n" + "sdot z26.s, z10.b, z2.b[2]\n" + "sdot z19.s, z11.b, z0.b[2]\n" + "sdot z23.s, z11.b, z1.b[2]\n" + "sdot z27.s, z11.b, z2.b[2]\n" + "b 9f\n" + "7:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 16f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 17f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 18f\n" + "17:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 18f\n" + "16:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z14.b, #0\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "18:\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "mov z12.b, #0\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "sdot z16.s, z12.b, z0.b[1]\n" + "sdot z20.s, z12.b, z1.b[1]\n" + "sdot z24.s, z12.b, z2.b[1]\n" + "sdot z17.s, z13.b, z0.b[1]\n" + "sdot z21.s, z13.b, z1.b[1]\n" + "sdot z25.s, z13.b, z2.b[1]\n" + "sdot z18.s, z14.b, z0.b[1]\n" + "sdot z22.s, z14.b, z1.b[1]\n" + "sdot z26.s, z14.b, z2.b[1]\n" + "sdot z19.s, z15.b, z0.b[1]\n" + "sdot z23.s, z15.b, z1.b[1]\n" + "sdot z27.s, z15.b, z2.b[1]\n" + "b 9f\n" + "6:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 19f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 20f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], 
%[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 21f\n" + "20:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 21f\n" + "19:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z10.b, #0\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "21:\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "mov z8.b, #0\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "sdot z16.s, z8.b, z0.b[0]\n" + "sdot z20.s, z8.b, z1.b[0]\n" + "sdot z24.s, z8.b, z2.b[0]\n" + "sdot z17.s, z9.b, z0.b[0]\n" + "sdot z21.s, z9.b, z1.b[0]\n" + "sdot z25.s, z9.b, z2.b[0]\n" + "sdot z18.s, z10.b, z0.b[0]\n" + "sdot z22.s, z10.b, z1.b[0]\n" + "sdot z26.s, z10.b, z2.b[0]\n" + "sdot z19.s, z11.b, z0.b[0]\n" + "sdot z23.s, z11.b, z1.b[0]\n" + "sdot z27.s, z11.b, z2.b[0]\n" + "b 9f\n" + "5:\n" + "sdot z16.s, z8.b, z0.b[0]\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "sdot z20.s, z8.b, z1.b[0]\n" + "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n" + "sdot z24.s, z8.b, z2.b[0]\n" + "ld1rqb z5.b, p6/z, [a_ptr1]\n" + "sdot z17.s, z9.b, z0.b[0]\n" + "ld1rqb z6.b, p6/z, [a_ptr2]\n" + "sdot z21.s, z9.b, z1.b[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "zip2 z8.b, z14.b, z12.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z14.b, z14.b, z12.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "sdot z25.s, z9.b, z2.b[0]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "sdot z18.s, z10.b, z0.b[0]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "zip1 z12.b, z13.b, z14.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "sdot z22.s, z10.b, z1.b[0]\n" + "add %[b_ptr0], %[b_ptr0], 
%[ldb]\n" + "sdot z26.s, z10.b, z2.b[0]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "sdot z19.s, z11.b, z0.b[0]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "sdot z23.s, z11.b, z1.b[0]\n" + "sdot z27.s, z11.b, z2.b[0]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "sdot z16.s, z12.b, z0.b[1]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "sdot z20.s, z12.b, z1.b[1]\n" + "sdot z24.s, z12.b, z2.b[1]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "sdot z17.s, z13.b, z0.b[1]\n" + "sdot z21.s, z13.b, z1.b[1]\n" + "sdot z25.s, z13.b, z2.b[1]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "sdot z18.s, z14.b, z0.b[1]\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "sdot z22.s, z14.b, z1.b[1]\n" + "sdot z26.s, z14.b, z2.b[1]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "sdot z19.s, z15.b, z0.b[1]\n" + "sdot z23.s, z15.b, z1.b[1]\n" + "sdot z27.s, z15.b, z2.b[1]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "sdot z16.s, z8.b, z0.b[2]\n" + "sdot z20.s, z8.b, z1.b[2]\n" + "sdot z24.s, z8.b, z2.b[2]\n" + "sdot z17.s, z9.b, z0.b[2]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "sdot z21.s, z9.b, z1.b[2]\n" + "sdot z25.s, z9.b, z2.b[2]\n" + "sdot z18.s, z10.b, z0.b[2]\n" + "sdot z22.s, z10.b, z1.b[2]\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "sdot z26.s, z10.b, z2.b[2]\n" + "sdot z19.s, z11.b, z0.b[2]\n" + "sdot z23.s, z11.b, z1.b[2]\n" + "sdot z27.s, z11.b, z2.b[2]\n" + "sdot z16.s, z12.b, z0.b[3]\n" + "sdot z20.s, z12.b, z1.b[3]\n" + "sdot z24.s, z12.b, z2.b[3]\n" + "sdot z17.s, z13.b, z0.b[3]\n" + "sdot z21.s, z13.b, z1.b[3]\n" + "sdot z25.s, z13.b, z2.b[3]\n" + "sdot z18.s, z14.b, z0.b[3]\n" + "sdot z22.s, z14.b, z1.b[3]\n" + "sdot z26.s, z14.b, z2.b[3]\n" 
+ "sdot z19.s, z15.b, z0.b[3]\n" + "sdot z23.s, z15.b, z1.b[3]\n" + "sdot z27.s, z15.b, z2.b[3]\n" + "cbz %[blocks], 22f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "sdot z16.s, z8.b, z4.b[0]\n" + "sdot z20.s, z8.b, z5.b[0]\n" + "sdot z24.s, z8.b, z6.b[0]\n" + "sdot z17.s, z9.b, z4.b[0]\n" + "sdot z21.s, z9.b, z5.b[0]\n" + "sdot z25.s, z9.b, z6.b[0]\n" + "sdot z18.s, z10.b, z4.b[0]\n" + "sdot z22.s, z10.b, z5.b[0]\n" + "sdot z26.s, z10.b, z6.b[0]\n" + "sdot z19.s, z11.b, z4.b[0]\n" + "sdot z23.s, z11.b, z5.b[0]\n" + "sdot z27.s, z11.b, z6.b[0]\n" + "b.eq 23f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "sdot z16.s, z12.b, z4.b[1]\n" + "sdot z20.s, z12.b, z5.b[1]\n" + "sdot z24.s, z12.b, z6.b[1]\n" + "sdot z17.s, z13.b, z4.b[1]\n" + "sdot z21.s, z13.b, z5.b[1]\n" + "sdot z25.s, z13.b, z6.b[1]\n" + "sdot z18.s, z14.b, z4.b[1]\n" + "sdot z22.s, z14.b, z5.b[1]\n" + "sdot z26.s, z14.b, z6.b[1]\n" + "sdot z19.s, 
z15.b, z4.b[1]\n" + "sdot z23.s, z15.b, z5.b[1]\n" + "sdot z27.s, z15.b, z6.b[1]\n" + "b.eq 24f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "sdot z16.s, z8.b, z4.b[2]\n" + "sdot z20.s, z8.b, z5.b[2]\n" + "sdot z24.s, z8.b, z6.b[2]\n" + "sdot z17.s, z9.b, z4.b[2]\n" + "sdot z21.s, z9.b, z5.b[2]\n" + "sdot z25.s, z9.b, z6.b[2]\n" + "sdot z18.s, z10.b, z4.b[2]\n" + "sdot z22.s, z10.b, z5.b[2]\n" + "sdot z26.s, z10.b, z6.b[2]\n" + "sdot z19.s, z11.b, z4.b[2]\n" + "sdot z23.s, z11.b, z5.b[2]\n" + "sdot z27.s, z11.b, z6.b[2]\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 25f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 26f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 27f\n" + "26:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 27f\n" + "25:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z14.b, #0\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "27:\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "mov z12.b, #0\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "sdot z16.s, z12.b, 
z4.b[3]\n" + "sdot z20.s, z12.b, z5.b[3]\n" + "sdot z24.s, z12.b, z6.b[3]\n" + "sdot z17.s, z13.b, z4.b[3]\n" + "sdot z21.s, z13.b, z5.b[3]\n" + "sdot z25.s, z13.b, z6.b[3]\n" + "sdot z18.s, z14.b, z4.b[3]\n" + "sdot z22.s, z14.b, z5.b[3]\n" + "sdot z26.s, z14.b, z6.b[3]\n" + "sdot z19.s, z15.b, z4.b[3]\n" + "sdot z23.s, z15.b, z5.b[3]\n" + "sdot z27.s, z15.b, z6.b[3]\n" + "b 9f\n" + "24:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 28f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 29f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 30f\n" + "29:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 30f\n" + "28:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z10.b, #0\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "30:\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "mov z8.b, #0\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "sdot z16.s, z8.b, z4.b[2]\n" + "sdot z20.s, z8.b, z5.b[2]\n" + "sdot z24.s, z8.b, z6.b[2]\n" + "sdot z17.s, z9.b, z4.b[2]\n" + "sdot z21.s, z9.b, z5.b[2]\n" + "sdot z25.s, z9.b, z6.b[2]\n" + "sdot z18.s, z10.b, z4.b[2]\n" + "sdot z22.s, z10.b, z5.b[2]\n" + "sdot z26.s, z10.b, z6.b[2]\n" + "sdot z19.s, z11.b, z4.b[2]\n" + "sdot z23.s, z11.b, z5.b[2]\n" + "sdot z27.s, z11.b, z6.b[2]\n" + "b 9f\n" + "23:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 31f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 32f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, 
[%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 33f\n" + "32:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 33f\n" + "31:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z14.b, #0\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "33:\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "mov z12.b, #0\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "sdot z16.s, z12.b, z4.b[1]\n" + "sdot z20.s, z12.b, z5.b[1]\n" + "sdot z24.s, z12.b, z6.b[1]\n" + "sdot z17.s, z13.b, z4.b[1]\n" + "sdot z21.s, z13.b, z5.b[1]\n" + "sdot z25.s, z13.b, z6.b[1]\n" + "sdot z18.s, z14.b, z4.b[1]\n" + "sdot z22.s, z14.b, z5.b[1]\n" + "sdot z26.s, z14.b, z6.b[1]\n" + "sdot z19.s, z15.b, z4.b[1]\n" + "sdot z23.s, z15.b, z5.b[1]\n" + "sdot z27.s, z15.b, z6.b[1]\n" + "b 9f\n" + "22:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 34f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 35f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 36f\n" + "35:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 36f\n" + "34:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z10.b, #0\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "36:\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "mov z8.b, #0\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, 
z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "sdot z16.s, z8.b, z4.b[0]\n" + "sdot z20.s, z8.b, z5.b[0]\n" + "sdot z24.s, z8.b, z6.b[0]\n" + "sdot z17.s, z9.b, z4.b[0]\n" + "sdot z21.s, z9.b, z5.b[0]\n" + "sdot z25.s, z9.b, z6.b[0]\n" + "sdot z18.s, z10.b, z4.b[0]\n" + "sdot z22.s, z10.b, z5.b[0]\n" + "sdot z26.s, z10.b, z6.b[0]\n" + "sdot z19.s, z11.b, z4.b[0]\n" + "sdot z23.s, z11.b, z5.b[0]\n" + "sdot z27.s, z11.b, z6.b[0]\n" + "9:\n" + "st1w z16.s, p0, [%[c_ptr0]]\n" + "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n" + "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n" + "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n" + "addvl %[c_ptr0], %[c_ptr0], #4\n" + "st1w z20.s, p0, [c_ptr1]\n" + "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n" + "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n" + "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n" + "st1w z24.s, p0, [c_ptr2]\n" + "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n" + "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n" + "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [b_ptr1] "+r" (b_ptr1), [b_ptr2] "+r" (b_ptr2), [b_ptr3] "+r" (b_ptr3), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb) + : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory" + ); + break; + default: + case 4: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.b, %[temp], 
%[leftovers]\n" + "whilelt p0.s, %[temp], %[width]\n" + "whilelt p4.b, %[temp], %[width]\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "incw %[temp], all, mul #1\n" + "ptrue p7.b\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "whilelt p1.s, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "whilelt p2.s, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "whilelt p3.s, %[temp], %[width]\n" + "cbz %[beta0], 1f\n" + "mov z16.s, #0\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" + "mov z17.s, #0\n" + "ld1rqb z1.b, p7/z, [a_ptr1]\n" + "mov z18.s, #0\n" + "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "mov z19.s, #0\n" + "ld1rqb z3.b, p7/z, [a_ptr3]\n" + "mov z20.s, #0\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "mov z21.s, #0\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "mov z22.s, #0\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "mov z23.s, #0\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "mov z24.s, #0\n" + "add a_ptr1, a_ptr1, #0x10\n" + "zip2 z11.b, z8.b, z9.b\n" + "add a_ptr2, a_ptr2, #0x10\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "mov z25.s, #0\n" + "add a_ptr3, a_ptr3, #0x10\n" + "mov z26.s, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "zip2 z12.b, z10.b, z8.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z10.b, z10.b, z8.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "mov z27.s, #0\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "mov z28.s, #0\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "zip1 z8.b, z9.b, z10.b\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 2f\n" + "1:\n" + "ld1rw z15.s, p7/z, [%[betaptr]]\n" + "ld1w z16.s, p0/z, [%[c_ptr0]]\n" + "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" + "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n" + "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n" + "ld1w z20.s, p0/z, [c_ptr1]\n" + "mul z16.s, p7/m, 
z16.s, z15.s\n" + "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n" + "mul z17.s, p7/m, z17.s, z15.s\n" + "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n" + "mul z18.s, p7/m, z18.s, z15.s\n" + "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n" + "mul z19.s, p7/m, z19.s, z15.s\n" + "ld1w z24.s, p0/z, [c_ptr2]\n" + "mul z20.s, p7/m, z20.s, z15.s\n" + "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n" + "mul z21.s, p7/m, z21.s, z15.s\n" + "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n" + "mul z22.s, p7/m, z22.s, z15.s\n" + "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n" + "mul z23.s, p7/m, z23.s, z15.s\n" + "ld1w z28.s, p0/z, [c_ptr3]\n" + "mul z24.s, p7/m, z24.s, z15.s\n" + "ld1w z29.s, p1/z, [c_ptr3, #1, MUL VL]\n" + "mul z25.s, p7/m, z25.s, z15.s\n" + "ld1w z30.s, p2/z, [c_ptr3, #2, MUL VL]\n" + "mul z26.s, p7/m, z26.s, z15.s\n" + "ld1w z31.s, p3/z, [c_ptr3, #3, MUL VL]\n" + "mul z27.s, p7/m, z27.s, z15.s\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" + "mul z28.s, p7/m, z28.s, z15.s\n" + "ld1rqb z1.b, p7/z, [a_ptr1]\n" + "mul z29.s, p7/m, z29.s, z15.s\n" + "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "mul z30.s, p7/m, z30.s, z15.s\n" + "ld1rqb z3.b, p7/z, [a_ptr3]\n" + "mul z31.s, p7/m, z31.s, z15.s\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "add a_ptr1, a_ptr1, #0x10\n" + "add a_ptr2, a_ptr2, #0x10\n" + "zip2 z11.b, z8.b, z9.b\n" + "add a_ptr3, a_ptr3, #0x10\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "2:\n" + "cbz %[loops], 3f\n" + "4:\n" + "zip2 z15.b, 
z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "sdot z16.s, z8.b, z0.b[0]\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "sdot z20.s, z8.b, z1.b[0]\n" + "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" + "sdot z24.s, z8.b, z2.b[0]\n" + "ld1rqb z5.b, p7/z, [a_ptr1]\n" + "sdot z28.s, z8.b, z3.b[0]\n" + "ld1rqb z6.b, p7/z, [a_ptr2]\n" + "sdot z17.s, z9.b, z0.b[0]\n" + "ld1rqb z7.b, p7/z, [a_ptr3]\n" + "zip2 z8.b, z14.b, z12.b\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "zip1 z14.b, z14.b, z12.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "sdot z21.s, z9.b, z1.b[0]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "sdot z25.s, z9.b, z2.b[0]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "zip1 z12.b, z13.b, z14.b\n" + "add %[a_ptr0], %[a_ptr0], #0x20\n" + "zip2 z13.b, z13.b, z14.b\n" + "add a_ptr1, a_ptr1, #0x20\n" + "zip1 z14.b, z15.b, z8.b\n" + "add a_ptr2, a_ptr2, #0x20\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "sdot z29.s, z9.b, z3.b[0]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "sdot z18.s, z10.b, z0.b[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z22.s, z10.b, z1.b[0]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "sdot z26.s, z10.b, z2.b[0]\n" + "add a_ptr3, a_ptr3, #0x20\n" + "sdot z30.s, z10.b, z3.b[0]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "sdot z19.s, z11.b, z0.b[0]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "sdot z23.s, z11.b, z1.b[0]\n" + "subs %[loops], %[loops], #0x1\n" + "sdot z27.s, z11.b, z2.b[0]\n" + "sdot z31.s, z11.b, z3.b[0]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "sdot z16.s, z12.b, z0.b[1]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "sdot z20.s, z12.b, z1.b[1]\n" + "sdot z24.s, z12.b, z2.b[1]\n" + "sdot z28.s, z12.b, z3.b[1]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "sdot z17.s, z13.b, z0.b[1]\n" + "sdot z21.s, z13.b, z1.b[1]\n" + "sdot z25.s, z13.b, z2.b[1]\n" + "sdot z29.s, z13.b, z3.b[1]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "zip1 z8.b, z9.b, z10.b\n" + 
"add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "sdot z18.s, z14.b, z0.b[1]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z22.s, z14.b, z1.b[1]\n" + "sdot z26.s, z14.b, z2.b[1]\n" + "sdot z30.s, z14.b, z3.b[1]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "sdot z19.s, z15.b, z0.b[1]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "sdot z23.s, z15.b, z1.b[1]\n" + "sdot z27.s, z15.b, z2.b[1]\n" + "sdot z31.s, z15.b, z3.b[1]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "sdot z16.s, z8.b, z0.b[2]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "sdot z20.s, z8.b, z1.b[2]\n" + "sdot z24.s, z8.b, z2.b[2]\n" + "sdot z28.s, z8.b, z3.b[2]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "sdot z17.s, z9.b, z0.b[2]\n" + "sdot z21.s, z9.b, z1.b[2]\n" + "sdot z25.s, z9.b, z2.b[2]\n" + "sdot z29.s, z9.b, z3.b[2]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "zip1 z12.b, z13.b, z14.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "sdot z18.s, z10.b, z0.b[2]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z22.s, z10.b, z1.b[2]\n" + "sdot z26.s, z10.b, z2.b[2]\n" + "sdot z30.s, z10.b, z3.b[2]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "sdot z19.s, z11.b, z0.b[2]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "sdot z23.s, z11.b, z1.b[2]\n" + "sdot z27.s, z11.b, z2.b[2]\n" + "sdot z31.s, z11.b, z3.b[2]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "sdot z16.s, z12.b, z0.b[3]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "sdot z20.s, z12.b, z1.b[3]\n" + "sdot z24.s, z12.b, z2.b[3]\n" + "sdot z28.s, z12.b, z3.b[3]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "sdot z17.s, z13.b, z0.b[3]\n" + "sdot z21.s, z13.b, z1.b[3]\n" + 
"sdot z25.s, z13.b, z2.b[3]\n" + "sdot z29.s, z13.b, z3.b[3]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "zip1 z8.b, z9.b, z10.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "sdot z18.s, z14.b, z0.b[3]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z22.s, z14.b, z1.b[3]\n" + "sdot z26.s, z14.b, z2.b[3]\n" + "sdot z30.s, z14.b, z3.b[3]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "sdot z19.s, z15.b, z0.b[3]\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n" + "sdot z23.s, z15.b, z1.b[3]\n" + "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n" + "sdot z27.s, z15.b, z2.b[3]\n" + "ld1rqb z2.b, p7/z, [a_ptr2, #-0x10]\n" + "sdot z31.s, z15.b, z3.b[3]\n" + "ld1rqb z3.b, p7/z, [a_ptr3, #-0x10]\n" + "zip2 z15.b, z12.b, z13.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "sdot z16.s, z8.b, z4.b[0]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "sdot z20.s, z8.b, z5.b[0]\n" + "sdot z24.s, z8.b, z6.b[0]\n" + "sdot z28.s, z8.b, z7.b[0]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "sdot z17.s, z9.b, z4.b[0]\n" + "sdot z21.s, z9.b, z5.b[0]\n" + "sdot z25.s, z9.b, z6.b[0]\n" + "sdot z29.s, z9.b, z7.b[0]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "zip1 z12.b, z13.b, z14.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "sdot z18.s, z10.b, z4.b[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z22.s, z10.b, z5.b[0]\n" + "sdot z26.s, z10.b, z6.b[0]\n" + "sdot z30.s, z10.b, z7.b[0]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "sdot z19.s, z11.b, z4.b[0]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "sdot z23.s, z11.b, z5.b[0]\n" + "sdot z27.s, z11.b, z6.b[0]\n" + "sdot z31.s, z11.b, z7.b[0]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" 
+ "sdot z16.s, z12.b, z4.b[1]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "sdot z20.s, z12.b, z5.b[1]\n" + "sdot z24.s, z12.b, z6.b[1]\n" + "sdot z28.s, z12.b, z7.b[1]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "sdot z17.s, z13.b, z4.b[1]\n" + "sdot z21.s, z13.b, z5.b[1]\n" + "sdot z25.s, z13.b, z6.b[1]\n" + "sdot z29.s, z13.b, z7.b[1]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "zip1 z8.b, z9.b, z10.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "sdot z18.s, z14.b, z4.b[1]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z22.s, z14.b, z5.b[1]\n" + "sdot z26.s, z14.b, z6.b[1]\n" + "sdot z30.s, z14.b, z7.b[1]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "sdot z19.s, z15.b, z4.b[1]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "sdot z23.s, z15.b, z5.b[1]\n" + "sdot z27.s, z15.b, z6.b[1]\n" + "sdot z31.s, z15.b, z7.b[1]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "sdot z16.s, z8.b, z4.b[2]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "sdot z20.s, z8.b, z5.b[2]\n" + "sdot z24.s, z8.b, z6.b[2]\n" + "sdot z28.s, z8.b, z7.b[2]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "sdot z17.s, z9.b, z4.b[2]\n" + "sdot z21.s, z9.b, z5.b[2]\n" + "sdot z25.s, z9.b, z6.b[2]\n" + "sdot z29.s, z9.b, z7.b[2]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "zip1 z12.b, z13.b, z14.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "sdot z18.s, z10.b, z4.b[2]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z22.s, z10.b, z5.b[2]\n" + "sdot z26.s, z10.b, z6.b[2]\n" + "sdot z30.s, z10.b, z7.b[2]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "sdot z19.s, z11.b, z4.b[2]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "sdot z23.s, z11.b, z5.b[2]\n" + "sdot z27.s, z11.b, 
z6.b[2]\n" + "sdot z31.s, z11.b, z7.b[2]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "sdot z16.s, z12.b, z4.b[3]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "sdot z20.s, z12.b, z5.b[3]\n" + "sdot z24.s, z12.b, z6.b[3]\n" + "sdot z28.s, z12.b, z7.b[3]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "sdot z17.s, z13.b, z4.b[3]\n" + "sdot z21.s, z13.b, z5.b[3]\n" + "sdot z25.s, z13.b, z6.b[3]\n" + "sdot z29.s, z13.b, z7.b[3]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "sdot z18.s, z14.b, z4.b[3]\n" + "sdot z22.s, z14.b, z5.b[3]\n" + "sdot z26.s, z14.b, z6.b[3]\n" + "sdot z30.s, z14.b, z7.b[3]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "sdot z19.s, z15.b, z4.b[3]\n" + "sdot z23.s, z15.b, z5.b[3]\n" + "sdot z27.s, z15.b, z6.b[3]\n" + "sdot z31.s, z15.b, z7.b[3]\n" + "b.ne 4b\n" + "3:\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "cbz %[regs], 5f\n" + "sdot z16.s, z8.b, z0.b[0]\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "sdot z20.s, z8.b, z1.b[0]\n" + "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" + "sdot z24.s, z8.b, z2.b[0]\n" + "ld1rqb z5.b, p7/z, [a_ptr1]\n" + "sdot z28.s, z8.b, z3.b[0]\n" + "ld1rqb z6.b, p7/z, [a_ptr2]\n" + "sdot z17.s, z9.b, z0.b[0]\n" + "ld1rqb z7.b, p7/z, [a_ptr3]\n" + "zip2 z8.b, z14.b, z12.b\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "zip1 z14.b, z14.b, z12.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "sdot z21.s, z9.b, z1.b[0]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "sdot z25.s, z9.b, z2.b[0]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "sdot z29.s, z9.b, z3.b[0]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "sdot z18.s, z10.b, z0.b[0]\n" + "add %[b_ptr0], %[b_ptr0], 
%[ldb]\n" + "sdot z22.s, z10.b, z1.b[0]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "sdot z26.s, z10.b, z2.b[0]\n" + "sdot z30.s, z10.b, z3.b[0]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "sdot z19.s, z11.b, z0.b[0]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "sdot z23.s, z11.b, z1.b[0]\n" + "sdot z27.s, z11.b, z2.b[0]\n" + "sdot z31.s, z11.b, z3.b[0]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "sdot z16.s, z12.b, z0.b[1]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "sdot z20.s, z12.b, z1.b[1]\n" + "sdot z24.s, z12.b, z2.b[1]\n" + "sdot z28.s, z12.b, z3.b[1]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "sdot z17.s, z13.b, z0.b[1]\n" + "sdot z21.s, z13.b, z1.b[1]\n" + "sdot z25.s, z13.b, z2.b[1]\n" + "sdot z29.s, z13.b, z3.b[1]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "zip1 z8.b, z9.b, z10.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "sdot z18.s, z14.b, z0.b[1]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z22.s, z14.b, z1.b[1]\n" + "sdot z26.s, z14.b, z2.b[1]\n" + "sdot z30.s, z14.b, z3.b[1]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "sdot z19.s, z15.b, z0.b[1]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "sdot z23.s, z15.b, z1.b[1]\n" + "sdot z27.s, z15.b, z2.b[1]\n" + "sdot z31.s, z15.b, z3.b[1]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "sdot z16.s, z8.b, z0.b[2]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "sdot z20.s, z8.b, z1.b[2]\n" + "sdot z24.s, z8.b, z2.b[2]\n" + "sdot z28.s, z8.b, z3.b[2]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "sdot z17.s, z9.b, z0.b[2]\n" + "sdot z21.s, z9.b, z1.b[2]\n" + "sdot z25.s, z9.b, z2.b[2]\n" + "sdot z29.s, z9.b, z3.b[2]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "zip1 z12.b, z13.b, z14.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip2 z13.b, z13.b, 
z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "sdot z18.s, z10.b, z0.b[2]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z22.s, z10.b, z1.b[2]\n" + "sdot z26.s, z10.b, z2.b[2]\n" + "sdot z30.s, z10.b, z3.b[2]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "sdot z19.s, z11.b, z0.b[2]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "sdot z23.s, z11.b, z1.b[2]\n" + "sdot z27.s, z11.b, z2.b[2]\n" + "sdot z31.s, z11.b, z3.b[2]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "sdot z16.s, z12.b, z0.b[3]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "sdot z20.s, z12.b, z1.b[3]\n" + "sdot z24.s, z12.b, z2.b[3]\n" + "sdot z28.s, z12.b, z3.b[3]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "sdot z17.s, z13.b, z0.b[3]\n" + "sdot z21.s, z13.b, z1.b[3]\n" + "sdot z25.s, z13.b, z2.b[3]\n" + "sdot z29.s, z13.b, z3.b[3]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "zip1 z8.b, z9.b, z10.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "sdot z18.s, z14.b, z0.b[3]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z22.s, z14.b, z1.b[3]\n" + "sdot z26.s, z14.b, z2.b[3]\n" + "sdot z30.s, z14.b, z3.b[3]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "sdot z19.s, z15.b, z0.b[3]\n" + "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n" + "sdot z23.s, z15.b, z1.b[3]\n" + "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n" + "sdot z27.s, z15.b, z2.b[3]\n" + "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n" + "sdot z31.s, z15.b, z3.b[3]\n" + "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n" + "zip2 z15.b, z12.b, z13.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "sdot z16.s, z8.b, z4.b[0]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "sdot z20.s, z8.b, z5.b[0]\n" + "sdot z24.s, z8.b, z6.b[0]\n" + "sdot z28.s, z8.b, z7.b[0]\n" + "zip2 z8.b, z14.b, 
z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "sdot z17.s, z9.b, z4.b[0]\n" + "sdot z21.s, z9.b, z5.b[0]\n" + "sdot z25.s, z9.b, z6.b[0]\n" + "sdot z29.s, z9.b, z7.b[0]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "zip1 z12.b, z13.b, z14.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "sdot z18.s, z10.b, z4.b[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z22.s, z10.b, z5.b[0]\n" + "sdot z26.s, z10.b, z6.b[0]\n" + "sdot z30.s, z10.b, z7.b[0]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "sdot z19.s, z11.b, z4.b[0]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "sdot z23.s, z11.b, z5.b[0]\n" + "sdot z27.s, z11.b, z6.b[0]\n" + "sdot z31.s, z11.b, z7.b[0]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "sdot z16.s, z12.b, z4.b[1]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "sdot z20.s, z12.b, z5.b[1]\n" + "sdot z24.s, z12.b, z6.b[1]\n" + "sdot z28.s, z12.b, z7.b[1]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "sdot z17.s, z13.b, z4.b[1]\n" + "sdot z21.s, z13.b, z5.b[1]\n" + "sdot z25.s, z13.b, z6.b[1]\n" + "sdot z29.s, z13.b, z7.b[1]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "sdot z18.s, z14.b, z4.b[1]\n" + "sdot z22.s, z14.b, z5.b[1]\n" + "sdot z26.s, z14.b, z6.b[1]\n" + "sdot z30.s, z14.b, z7.b[1]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "sdot z19.s, z15.b, z4.b[1]\n" + "sdot z23.s, z15.b, z5.b[1]\n" + "sdot z27.s, z15.b, z6.b[1]\n" + "sdot z31.s, z15.b, z7.b[1]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "sdot z16.s, z8.b, z4.b[2]\n" + "sdot z20.s, z8.b, z5.b[2]\n" + "sdot z24.s, z8.b, z6.b[2]\n" + "sdot z28.s, z8.b, z7.b[2]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, 
z14.b, z12.b\n" + "sdot z17.s, z9.b, z4.b[2]\n" + "sdot z21.s, z9.b, z5.b[2]\n" + "sdot z25.s, z9.b, z6.b[2]\n" + "sdot z29.s, z9.b, z7.b[2]\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "sdot z18.s, z10.b, z4.b[2]\n" + "sdot z22.s, z10.b, z5.b[2]\n" + "sdot z26.s, z10.b, z6.b[2]\n" + "sdot z30.s, z10.b, z7.b[2]\n" + "sdot z19.s, z11.b, z4.b[2]\n" + "sdot z23.s, z11.b, z5.b[2]\n" + "sdot z27.s, z11.b, z6.b[2]\n" + "sdot z31.s, z11.b, z7.b[2]\n" + "sdot z16.s, z12.b, z4.b[3]\n" + "sdot z20.s, z12.b, z5.b[3]\n" + "sdot z24.s, z12.b, z6.b[3]\n" + "sdot z28.s, z12.b, z7.b[3]\n" + "sdot z17.s, z13.b, z4.b[3]\n" + "sdot z21.s, z13.b, z5.b[3]\n" + "sdot z25.s, z13.b, z6.b[3]\n" + "sdot z29.s, z13.b, z7.b[3]\n" + "sdot z18.s, z14.b, z4.b[3]\n" + "sdot z22.s, z14.b, z5.b[3]\n" + "sdot z26.s, z14.b, z6.b[3]\n" + "sdot z30.s, z14.b, z7.b[3]\n" + "sdot z19.s, z15.b, z4.b[3]\n" + "sdot z23.s, z15.b, z5.b[3]\n" + "sdot z27.s, z15.b, z6.b[3]\n" + "sdot z31.s, z15.b, z7.b[3]\n" + "cbz %[blocks], 6f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "sdot z16.s, z8.b, z0.b[0]\n" + "sdot z20.s, z8.b, z1.b[0]\n" + "sdot z24.s, z8.b, z2.b[0]\n" + "sdot z28.s, z8.b, z3.b[0]\n" + "sdot z17.s, z9.b, z0.b[0]\n" + "sdot z21.s, z9.b, z1.b[0]\n" + "sdot z25.s, z9.b, z2.b[0]\n" + "sdot z29.s, z9.b, z3.b[0]\n" + "sdot z18.s, z10.b, z0.b[0]\n" + "sdot z22.s, z10.b, z1.b[0]\n" + "sdot z26.s, z10.b, 
z2.b[0]\n" + "sdot z30.s, z10.b, z3.b[0]\n" + "sdot z19.s, z11.b, z0.b[0]\n" + "sdot z23.s, z11.b, z1.b[0]\n" + "sdot z27.s, z11.b, z2.b[0]\n" + "sdot z31.s, z11.b, z3.b[0]\n" + "b.eq 7f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "sdot z16.s, z12.b, z0.b[1]\n" + "sdot z20.s, z12.b, z1.b[1]\n" + "sdot z24.s, z12.b, z2.b[1]\n" + "sdot z28.s, z12.b, z3.b[1]\n" + "sdot z17.s, z13.b, z0.b[1]\n" + "sdot z21.s, z13.b, z1.b[1]\n" + "sdot z25.s, z13.b, z2.b[1]\n" + "sdot z29.s, z13.b, z3.b[1]\n" + "sdot z18.s, z14.b, z0.b[1]\n" + "sdot z22.s, z14.b, z1.b[1]\n" + "sdot z26.s, z14.b, z2.b[1]\n" + "sdot z30.s, z14.b, z3.b[1]\n" + "sdot z19.s, z15.b, z0.b[1]\n" + "sdot z23.s, z15.b, z1.b[1]\n" + "sdot z27.s, z15.b, z2.b[1]\n" + "sdot z31.s, z15.b, z3.b[1]\n" + "b.eq 8f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "sdot z16.s, z8.b, z0.b[2]\n" + "sdot z20.s, z8.b, z1.b[2]\n" + "sdot z24.s, z8.b, z2.b[2]\n" + "sdot z28.s, z8.b, z3.b[2]\n" + "sdot z17.s, z9.b, 
z0.b[2]\n" + "sdot z21.s, z9.b, z1.b[2]\n" + "sdot z25.s, z9.b, z2.b[2]\n" + "sdot z29.s, z9.b, z3.b[2]\n" + "sdot z18.s, z10.b, z0.b[2]\n" + "sdot z22.s, z10.b, z1.b[2]\n" + "sdot z26.s, z10.b, z2.b[2]\n" + "sdot z30.s, z10.b, z3.b[2]\n" + "sdot z19.s, z11.b, z0.b[2]\n" + "sdot z23.s, z11.b, z1.b[2]\n" + "sdot z27.s, z11.b, z2.b[2]\n" + "sdot z31.s, z11.b, z3.b[2]\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 10f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 11f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 12f\n" + "11:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 12f\n" + "10:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z14.b, #0\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "12:\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "mov z12.b, #0\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "sdot z16.s, z12.b, z0.b[3]\n" + "sdot z20.s, z12.b, z1.b[3]\n" + "sdot z24.s, z12.b, z2.b[3]\n" + "sdot z28.s, z12.b, z3.b[3]\n" + "sdot z17.s, z13.b, z0.b[3]\n" + "sdot z21.s, z13.b, z1.b[3]\n" + "sdot z25.s, z13.b, z2.b[3]\n" + "sdot z29.s, z13.b, z3.b[3]\n" + "sdot z18.s, z14.b, z0.b[3]\n" + "sdot z22.s, z14.b, z1.b[3]\n" + "sdot z26.s, z14.b, z2.b[3]\n" + "sdot z30.s, z14.b, z3.b[3]\n" + "sdot z19.s, z15.b, z0.b[3]\n" + "sdot z23.s, z15.b, z1.b[3]\n" + "sdot z27.s, z15.b, z2.b[3]\n" + "sdot z31.s, z15.b, z3.b[3]\n" + "b 9f\n" + "8:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 13f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 14f\n" + "add 
%[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 15f\n" + "14:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 15f\n" + "13:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z10.b, #0\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "15:\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "mov z8.b, #0\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "sdot z16.s, z8.b, z0.b[2]\n" + "sdot z20.s, z8.b, z1.b[2]\n" + "sdot z24.s, z8.b, z2.b[2]\n" + "sdot z28.s, z8.b, z3.b[2]\n" + "sdot z17.s, z9.b, z0.b[2]\n" + "sdot z21.s, z9.b, z1.b[2]\n" + "sdot z25.s, z9.b, z2.b[2]\n" + "sdot z29.s, z9.b, z3.b[2]\n" + "sdot z18.s, z10.b, z0.b[2]\n" + "sdot z22.s, z10.b, z1.b[2]\n" + "sdot z26.s, z10.b, z2.b[2]\n" + "sdot z30.s, z10.b, z3.b[2]\n" + "sdot z19.s, z11.b, z0.b[2]\n" + "sdot z23.s, z11.b, z1.b[2]\n" + "sdot z27.s, z11.b, z2.b[2]\n" + "sdot z31.s, z11.b, z3.b[2]\n" + "b 9f\n" + "7:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 16f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 17f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 18f\n" + "17:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 18f\n" + "16:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z14.b, #0\n" + "ld1b z12.b, 
p4/z, [%[b_ptr0]]\n" + "18:\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "mov z12.b, #0\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "sdot z16.s, z12.b, z0.b[1]\n" + "sdot z20.s, z12.b, z1.b[1]\n" + "sdot z24.s, z12.b, z2.b[1]\n" + "sdot z28.s, z12.b, z3.b[1]\n" + "sdot z17.s, z13.b, z0.b[1]\n" + "sdot z21.s, z13.b, z1.b[1]\n" + "sdot z25.s, z13.b, z2.b[1]\n" + "sdot z29.s, z13.b, z3.b[1]\n" + "sdot z18.s, z14.b, z0.b[1]\n" + "sdot z22.s, z14.b, z1.b[1]\n" + "sdot z26.s, z14.b, z2.b[1]\n" + "sdot z30.s, z14.b, z3.b[1]\n" + "sdot z19.s, z15.b, z0.b[1]\n" + "sdot z23.s, z15.b, z1.b[1]\n" + "sdot z27.s, z15.b, z2.b[1]\n" + "sdot z31.s, z15.b, z3.b[1]\n" + "b 9f\n" + "6:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 19f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 20f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 21f\n" + "20:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 21f\n" + "19:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z10.b, #0\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "21:\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "mov z8.b, #0\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "sdot z16.s, z8.b, z0.b[0]\n" + "sdot z20.s, z8.b, z1.b[0]\n" + "sdot z24.s, z8.b, z2.b[0]\n" + "sdot z28.s, z8.b, z3.b[0]\n" + "sdot z17.s, z9.b, z0.b[0]\n" + "sdot z21.s, z9.b, z1.b[0]\n" + "sdot z25.s, z9.b, 
z2.b[0]\n" + "sdot z29.s, z9.b, z3.b[0]\n" + "sdot z18.s, z10.b, z0.b[0]\n" + "sdot z22.s, z10.b, z1.b[0]\n" + "sdot z26.s, z10.b, z2.b[0]\n" + "sdot z30.s, z10.b, z3.b[0]\n" + "sdot z19.s, z11.b, z0.b[0]\n" + "sdot z23.s, z11.b, z1.b[0]\n" + "sdot z27.s, z11.b, z2.b[0]\n" + "sdot z31.s, z11.b, z3.b[0]\n" + "b 9f\n" + "5:\n" + "sdot z16.s, z8.b, z0.b[0]\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "sdot z20.s, z8.b, z1.b[0]\n" + "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n" + "sdot z24.s, z8.b, z2.b[0]\n" + "ld1rqb z5.b, p6/z, [a_ptr1]\n" + "sdot z28.s, z8.b, z3.b[0]\n" + "ld1rqb z6.b, p6/z, [a_ptr2]\n" + "sdot z17.s, z9.b, z0.b[0]\n" + "ld1rqb z7.b, p6/z, [a_ptr3]\n" + "zip2 z8.b, z14.b, z12.b\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "zip1 z14.b, z14.b, z12.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "sdot z21.s, z9.b, z1.b[0]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "sdot z25.s, z9.b, z2.b[0]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "sdot z29.s, z9.b, z3.b[0]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "sdot z18.s, z10.b, z0.b[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z22.s, z10.b, z1.b[0]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "sdot z26.s, z10.b, z2.b[0]\n" + "sdot z30.s, z10.b, z3.b[0]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "sdot z19.s, z11.b, z0.b[0]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "sdot z23.s, z11.b, z1.b[0]\n" + "sdot z27.s, z11.b, z2.b[0]\n" + "sdot z31.s, z11.b, z3.b[0]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "sdot z16.s, z12.b, z0.b[1]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "sdot z20.s, z12.b, z1.b[1]\n" + "sdot z24.s, z12.b, z2.b[1]\n" + "sdot z28.s, z12.b, z3.b[1]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "sdot z17.s, z13.b, z0.b[1]\n" + "sdot z21.s, z13.b, z1.b[1]\n" + "sdot z25.s, z13.b, 
z2.b[1]\n" + "sdot z29.s, z13.b, z3.b[1]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "sdot z18.s, z14.b, z0.b[1]\n" + "sdot z22.s, z14.b, z1.b[1]\n" + "sdot z26.s, z14.b, z2.b[1]\n" + "sdot z30.s, z14.b, z3.b[1]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "sdot z19.s, z15.b, z0.b[1]\n" + "sdot z23.s, z15.b, z1.b[1]\n" + "sdot z27.s, z15.b, z2.b[1]\n" + "sdot z31.s, z15.b, z3.b[1]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "sdot z16.s, z8.b, z0.b[2]\n" + "sdot z20.s, z8.b, z1.b[2]\n" + "sdot z24.s, z8.b, z2.b[2]\n" + "sdot z28.s, z8.b, z3.b[2]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "sdot z17.s, z9.b, z0.b[2]\n" + "sdot z21.s, z9.b, z1.b[2]\n" + "sdot z25.s, z9.b, z2.b[2]\n" + "sdot z29.s, z9.b, z3.b[2]\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "sdot z18.s, z10.b, z0.b[2]\n" + "sdot z22.s, z10.b, z1.b[2]\n" + "sdot z26.s, z10.b, z2.b[2]\n" + "sdot z30.s, z10.b, z3.b[2]\n" + "sdot z19.s, z11.b, z0.b[2]\n" + "sdot z23.s, z11.b, z1.b[2]\n" + "sdot z27.s, z11.b, z2.b[2]\n" + "sdot z31.s, z11.b, z3.b[2]\n" + "sdot z16.s, z12.b, z0.b[3]\n" + "sdot z20.s, z12.b, z1.b[3]\n" + "sdot z24.s, z12.b, z2.b[3]\n" + "sdot z28.s, z12.b, z3.b[3]\n" + "sdot z17.s, z13.b, z0.b[3]\n" + "sdot z21.s, z13.b, z1.b[3]\n" + "sdot z25.s, z13.b, z2.b[3]\n" + "sdot z29.s, z13.b, z3.b[3]\n" + "sdot z18.s, z14.b, z0.b[3]\n" + "sdot z22.s, z14.b, z1.b[3]\n" + "sdot z26.s, z14.b, z2.b[3]\n" + "sdot z30.s, z14.b, z3.b[3]\n" + "sdot z19.s, z15.b, z0.b[3]\n" + "sdot z23.s, z15.b, z1.b[3]\n" + "sdot z27.s, z15.b, z2.b[3]\n" + "sdot z31.s, z15.b, z3.b[3]\n" + "cbz %[blocks], 22f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], 
%[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "sdot z16.s, z8.b, z4.b[0]\n" + "sdot z20.s, z8.b, z5.b[0]\n" + "sdot z24.s, z8.b, z6.b[0]\n" + "sdot z28.s, z8.b, z7.b[0]\n" + "sdot z17.s, z9.b, z4.b[0]\n" + "sdot z21.s, z9.b, z5.b[0]\n" + "sdot z25.s, z9.b, z6.b[0]\n" + "sdot z29.s, z9.b, z7.b[0]\n" + "sdot z18.s, z10.b, z4.b[0]\n" + "sdot z22.s, z10.b, z5.b[0]\n" + "sdot z26.s, z10.b, z6.b[0]\n" + "sdot z30.s, z10.b, z7.b[0]\n" + "sdot z19.s, z11.b, z4.b[0]\n" + "sdot z23.s, z11.b, z5.b[0]\n" + "sdot z27.s, z11.b, z6.b[0]\n" + "sdot z31.s, z11.b, z7.b[0]\n" + "b.eq 23f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "sdot z16.s, z12.b, z4.b[1]\n" + "sdot z20.s, z12.b, z5.b[1]\n" + "sdot z24.s, z12.b, z6.b[1]\n" + "sdot z28.s, z12.b, z7.b[1]\n" + "sdot z17.s, z13.b, z4.b[1]\n" + "sdot z21.s, z13.b, z5.b[1]\n" + "sdot z25.s, z13.b, z6.b[1]\n" + "sdot z29.s, z13.b, z7.b[1]\n" + "sdot z18.s, z14.b, z4.b[1]\n" + "sdot z22.s, z14.b, z5.b[1]\n" + "sdot z26.s, z14.b, z6.b[1]\n" + "sdot z30.s, z14.b, z7.b[1]\n" + "sdot z19.s, 
z15.b, z4.b[1]\n" + "sdot z23.s, z15.b, z5.b[1]\n" + "sdot z27.s, z15.b, z6.b[1]\n" + "sdot z31.s, z15.b, z7.b[1]\n" + "b.eq 24f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "sdot z16.s, z8.b, z4.b[2]\n" + "sdot z20.s, z8.b, z5.b[2]\n" + "sdot z24.s, z8.b, z6.b[2]\n" + "sdot z28.s, z8.b, z7.b[2]\n" + "sdot z17.s, z9.b, z4.b[2]\n" + "sdot z21.s, z9.b, z5.b[2]\n" + "sdot z25.s, z9.b, z6.b[2]\n" + "sdot z29.s, z9.b, z7.b[2]\n" + "sdot z18.s, z10.b, z4.b[2]\n" + "sdot z22.s, z10.b, z5.b[2]\n" + "sdot z26.s, z10.b, z6.b[2]\n" + "sdot z30.s, z10.b, z7.b[2]\n" + "sdot z19.s, z11.b, z4.b[2]\n" + "sdot z23.s, z11.b, z5.b[2]\n" + "sdot z27.s, z11.b, z6.b[2]\n" + "sdot z31.s, z11.b, z7.b[2]\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 25f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 26f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 27f\n" + "26:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 27f\n" + "25:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z14.b, #0\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "27:\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "mov z12.b, #0\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, 
z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "sdot z16.s, z12.b, z4.b[3]\n" + "sdot z20.s, z12.b, z5.b[3]\n" + "sdot z24.s, z12.b, z6.b[3]\n" + "sdot z28.s, z12.b, z7.b[3]\n" + "sdot z17.s, z13.b, z4.b[3]\n" + "sdot z21.s, z13.b, z5.b[3]\n" + "sdot z25.s, z13.b, z6.b[3]\n" + "sdot z29.s, z13.b, z7.b[3]\n" + "sdot z18.s, z14.b, z4.b[3]\n" + "sdot z22.s, z14.b, z5.b[3]\n" + "sdot z26.s, z14.b, z6.b[3]\n" + "sdot z30.s, z14.b, z7.b[3]\n" + "sdot z19.s, z15.b, z4.b[3]\n" + "sdot z23.s, z15.b, z5.b[3]\n" + "sdot z27.s, z15.b, z6.b[3]\n" + "sdot z31.s, z15.b, z7.b[3]\n" + "b 9f\n" + "24:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 28f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 29f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 30f\n" + "29:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 30f\n" + "28:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z10.b, #0\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "30:\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "mov z8.b, #0\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "sdot z16.s, z8.b, z4.b[2]\n" + "sdot z20.s, z8.b, z5.b[2]\n" + "sdot z24.s, z8.b, z6.b[2]\n" + "sdot z28.s, z8.b, z7.b[2]\n" + "sdot z17.s, z9.b, z4.b[2]\n" + "sdot z21.s, z9.b, z5.b[2]\n" + "sdot z25.s, z9.b, z6.b[2]\n" + "sdot z29.s, z9.b, z7.b[2]\n" + "sdot z18.s, z10.b, z4.b[2]\n" + "sdot z22.s, z10.b, z5.b[2]\n" + "sdot z26.s, z10.b, z6.b[2]\n" + "sdot z30.s, z10.b, 
z7.b[2]\n" + "sdot z19.s, z11.b, z4.b[2]\n" + "sdot z23.s, z11.b, z5.b[2]\n" + "sdot z27.s, z11.b, z6.b[2]\n" + "sdot z31.s, z11.b, z7.b[2]\n" + "b 9f\n" + "23:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 31f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 32f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 33f\n" + "32:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 33f\n" + "31:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z14.b, #0\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "33:\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "mov z12.b, #0\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "sdot z16.s, z12.b, z4.b[1]\n" + "sdot z20.s, z12.b, z5.b[1]\n" + "sdot z24.s, z12.b, z6.b[1]\n" + "sdot z28.s, z12.b, z7.b[1]\n" + "sdot z17.s, z13.b, z4.b[1]\n" + "sdot z21.s, z13.b, z5.b[1]\n" + "sdot z25.s, z13.b, z6.b[1]\n" + "sdot z29.s, z13.b, z7.b[1]\n" + "sdot z18.s, z14.b, z4.b[1]\n" + "sdot z22.s, z14.b, z5.b[1]\n" + "sdot z26.s, z14.b, z6.b[1]\n" + "sdot z30.s, z14.b, z7.b[1]\n" + "sdot z19.s, z15.b, z4.b[1]\n" + "sdot z23.s, z15.b, z5.b[1]\n" + "sdot z27.s, z15.b, z6.b[1]\n" + "sdot z31.s, z15.b, z7.b[1]\n" + "b 9f\n" + "22:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 34f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 35f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, 
[%[b_ptr1]]\n" + "b 36f\n" + "35:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 36f\n" + "34:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z10.b, #0\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "36:\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "mov z8.b, #0\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "sdot z16.s, z8.b, z4.b[0]\n" + "sdot z20.s, z8.b, z5.b[0]\n" + "sdot z24.s, z8.b, z6.b[0]\n" + "sdot z28.s, z8.b, z7.b[0]\n" + "sdot z17.s, z9.b, z4.b[0]\n" + "sdot z21.s, z9.b, z5.b[0]\n" + "sdot z25.s, z9.b, z6.b[0]\n" + "sdot z29.s, z9.b, z7.b[0]\n" + "sdot z18.s, z10.b, z4.b[0]\n" + "sdot z22.s, z10.b, z5.b[0]\n" + "sdot z26.s, z10.b, z6.b[0]\n" + "sdot z30.s, z10.b, z7.b[0]\n" + "sdot z19.s, z11.b, z4.b[0]\n" + "sdot z23.s, z11.b, z5.b[0]\n" + "sdot z27.s, z11.b, z6.b[0]\n" + "sdot z31.s, z11.b, z7.b[0]\n" + "9:\n" + "st1w z16.s, p0, [%[c_ptr0]]\n" + "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n" + "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n" + "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n" + "addvl %[c_ptr0], %[c_ptr0], #4\n" + "st1w z20.s, p0, [c_ptr1]\n" + "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n" + "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n" + "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n" + "st1w z24.s, p0, [c_ptr2]\n" + "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n" + "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n" + "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n" + "st1w z28.s, p0, [c_ptr3]\n" + "st1w z29.s, p1, [c_ptr3, #1, MUL VL]\n" + "st1w z30.s, p2, [c_ptr3, #2, MUL VL]\n" + "st1w z31.s, p3, [c_ptr3, #3, MUL VL]\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), 
[b_ptr1] "+r" (b_ptr1), [b_ptr2] "+r" (b_ptr2), [b_ptr3] "+r" (b_ptr3), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb) + : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory" + ); + break; + } + } + } +} + +} // namespace arm_gemm + +#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4.hpp new file mode 100644 index 0000000000..f5ebad8565 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4.hpp @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +#ifdef __ARM_FEATURE_SVE + +#include +/* NOTE(review): the header name after the include directive is missing in this copy of the patch; it looks like an angle-bracket token was stripped during extraction. Confirm against upstream. */ + +namespace arm_gemm +{ + +// Actual kernel implementations +void sve_native_u8u32_dot_4VLx4(const uint8_t *, int, const uint8_t *, int ldb, uint32_t *, int, uint32_t, int, int, int); +/* Descriptor for the sve_native_u8u32_dot_4VLx4 kernel: it publishes the operand and result element types, the kernel function-pointer type, the blocking parameters, and a kernel pointer that defaults to the generic implementation declared above. */ +class native_u8u32_dot_4VLx4 +{ +public: + typedef uint8_t operand_type; + typedef uint32_t result_type; +/* Function-pointer type with the same parameter list as the sve_native_u8u32_dot_4VLx4 declaration above. */ + typedef void (*kern_type)(const uint8_t *, int, const uint8_t *, int ldb, uint32_t *, int, uint32_t, int, int, int); + + /* Kernel blocking parameters */ + static int out_height() + { + return 4; + } +/* Returns four times the value of get_vector_length(). NOTE(review): the call appears here without a template argument; one may have been stripped during extraction. Confirm against upstream. */ + static int out_width() + { + return get_vector_length() * 4; + } + + static int k_unroll() + { + return 4; + } + + + + // Default to the generic kernel + kern_type kernel=sve_native_u8u32_dot_4VLx4; +/* The CPUInfo pointer is accepted but not used: the constructor body is empty, so the kernel member keeps its default value. */ + native_u8u32_dot_4VLx4(const CPUInfo *ci) + { + + } +}; + +} // namespace arm_gemm + +#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4/generic.cpp new file mode 100644 index 0000000000..7d89948dc1 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4/generic.cpp @@ -0,0 +1,4632 @@ +/* + * Copyright (c) 2019 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifdef __ARM_FEATURE_SVE + +#include + +#include +#include "../../asmlib.hpp" +#include "../../utils.hpp" + +namespace arm_gemm { + +void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int ldb, uint32_t *C, int ldc, uint32_t beta, int M, int N, int K) { + const long beta0 = (beta == 0u); + const long loops_count = ((K + 16) / 32) - 1; + K -= loops_count * 32; + const long regs_count = (K / 16) - 1; + K -= (regs_count + 1) * 16; + const long leftovers = K; + const long blocks_count = K / 4; + const long odds_count = K - (blocks_count * 4); + + for (int y=0; y())) { + const long width = std::min((unsigned long)N-x0, (4 * get_vector_length())); + const uint32_t *betaptr = β + long loops = loops_count; + long regs = regs_count; + long temp = 0; + long blocks = blocks_count; + long odds = odds_count; + const uint8_t *a_ptr0 = a_ptr0_base; + const uint8_t *b_ptr0 = B + x0; + const uint8_t *b_ptr1 = b_ptr0 + ldb; + const uint8_t *b_ptr2 = b_ptr1 + ldb; + const uint8_t *b_ptr3 = b_ptr2 + ldb; + long ldbb = ldb * sizeof(uint8_t) * 4; + + switch(M-y) { + case 1: + __asm __volatile ( + "whilelt p6.b, %[temp], %[leftovers]\n" + "whilelt p0.s, %[temp], %[width]\n" + "whilelt p4.b, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "ptrue p7.b\n" + "whilelt p1.s, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "whilelt p2.s, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "whilelt p3.s, %[temp], %[width]\n" + "cbz %[beta0], 1f\n" + "mov z16.s, #0\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" + "mov z17.s, #0\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "mov z18.s, #0\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "mov z19.s, #0\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "b 2f\n" + "1:\n" + "ld1rw z15.s, p7/z, [%[betaptr]]\n" + "ld1w z16.s, p0/z, [%[c_ptr0]]\n" + "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" + "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n" + "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n" + "ld1rqb 
z0.b, p7/z, [%[a_ptr0]]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "mul z16.s, p7/m, z16.s, z15.s\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "mul z17.s, p7/m, z17.s, z15.s\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "mul z18.s, p7/m, z18.s, z15.s\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "mul z19.s, p7/m, z19.s, z15.s\n" + "2:\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip2 z11.b, z8.b, z9.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "cbz %[loops], 3f\n" + "4:\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "udot z16.s, z8.b, z0.b[0]\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "udot z17.s, z9.b, z0.b[0]\n" + "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" + "udot z18.s, z10.b, z0.b[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z19.s, z11.b, z0.b[0]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip2 z8.b, z14.b, z12.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip1 z14.b, z14.b, z12.b\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip1 z12.b, z13.b, z14.b\n" + "add %[a_ptr0], %[a_ptr0], #0x20\n" + "zip2 z13.b, z13.b, z14.b\n" + "subs %[loops], %[loops], #0x1\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "udot z16.s, z12.b, z0.b[1]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z17.s, z13.b, z0.b[1]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "udot z18.s, z14.b, z0.b[1]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "add 
%[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "udot z19.s, z15.b, z0.b[1]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z16.s, z8.b, z0.b[2]\n" + "udot z17.s, z9.b, z0.b[2]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "udot z18.s, z10.b, z0.b[2]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z15.b, z12.b, z13.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "udot z19.s, z11.b, z0.b[2]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z16.s, z12.b, z0.b[3]\n" + "udot z17.s, z13.b, z0.b[3]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "udot z18.s, z14.b, z0.b[3]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "udot z19.s, z15.b, z0.b[3]\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z16.s, z8.b, z4.b[0]\n" + "udot z17.s, z9.b, z4.b[0]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "udot z18.s, z10.b, z4.b[0]\n" + "ld1b z10.b, p4/z, 
[%[b_ptr1]]\n" + "zip2 z15.b, z12.b, z13.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "udot z19.s, z11.b, z4.b[0]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z16.s, z12.b, z4.b[1]\n" + "udot z17.s, z13.b, z4.b[1]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "udot z18.s, z14.b, z4.b[1]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "udot z19.s, z15.b, z4.b[1]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z16.s, z8.b, z4.b[2]\n" + "udot z17.s, z9.b, z4.b[2]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "udot z18.s, z10.b, z4.b[2]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z15.b, z12.b, z13.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "udot z19.s, z11.b, z4.b[2]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z16.s, z12.b, z4.b[3]\n" + "udot z17.s, z13.b, z4.b[3]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "udot z18.s, z14.b, z4.b[3]\n" + 
"ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "udot z19.s, z15.b, z4.b[3]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "b.ne 4b\n" + "3:\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "cbz %[regs], 5f\n" + "udot z16.s, z8.b, z0.b[0]\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "udot z17.s, z9.b, z0.b[0]\n" + "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" + "udot z18.s, z10.b, z0.b[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z19.s, z11.b, z0.b[0]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip2 z8.b, z14.b, z12.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip1 z14.b, z14.b, z12.b\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z16.s, z12.b, z0.b[1]\n" + "udot z17.s, z13.b, z0.b[1]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "udot z18.s, z14.b, z0.b[1]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "udot z19.s, z15.b, z0.b[1]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z16.s, z8.b, z0.b[2]\n" + "udot 
z17.s, z9.b, z0.b[2]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "udot z18.s, z10.b, z0.b[2]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z15.b, z12.b, z13.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "udot z19.s, z11.b, z0.b[2]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z16.s, z12.b, z0.b[3]\n" + "udot z17.s, z13.b, z0.b[3]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "udot z18.s, z14.b, z0.b[3]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "udot z19.s, z15.b, z0.b[3]\n" + "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z16.s, z8.b, z4.b[0]\n" + "udot z17.s, z9.b, z4.b[0]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "udot z18.s, z10.b, z4.b[0]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z15.b, z12.b, z13.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "udot z19.s, z11.b, z4.b[0]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "add %[b_ptr0], 
%[b_ptr0], %[ldb]\n" + "udot z16.s, z12.b, z4.b[1]\n" + "udot z17.s, z13.b, z4.b[1]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "udot z18.s, z14.b, z4.b[1]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "udot z19.s, z15.b, z4.b[1]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "udot z16.s, z8.b, z4.b[2]\n" + "udot z17.s, z9.b, z4.b[2]\n" + "udot z18.s, z10.b, z4.b[2]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "udot z19.s, z11.b, z4.b[2]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "udot z16.s, z12.b, z4.b[3]\n" + "udot z17.s, z13.b, z4.b[3]\n" + "udot z18.s, z14.b, z4.b[3]\n" + "udot z19.s, z15.b, z4.b[3]\n" + "cbz %[blocks], 6f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "udot z16.s, z8.b, z0.b[0]\n" + "udot z17.s, z9.b, z0.b[0]\n" + "udot z18.s, z10.b, z0.b[0]\n" + "udot z19.s, z11.b, z0.b[0]\n" + "b.eq 7f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], 
%[b_ptr3], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "udot z16.s, z12.b, z0.b[1]\n" + "udot z17.s, z13.b, z0.b[1]\n" + "udot z18.s, z14.b, z0.b[1]\n" + "udot z19.s, z15.b, z0.b[1]\n" + "b.eq 8f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "udot z16.s, z8.b, z0.b[2]\n" + "udot z17.s, z9.b, z0.b[2]\n" + "udot z18.s, z10.b, z0.b[2]\n" + "udot z19.s, z11.b, z0.b[2]\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 10f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 11f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 12f\n" + "11:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 12f\n" + "10:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z14.b, #0\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "12:\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, 
z13.b\n" + "mov z12.b, #0\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "udot z16.s, z12.b, z0.b[3]\n" + "udot z17.s, z13.b, z0.b[3]\n" + "udot z18.s, z14.b, z0.b[3]\n" + "udot z19.s, z15.b, z0.b[3]\n" + "b 9f\n" + "8:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 13f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 14f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 15f\n" + "14:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 15f\n" + "13:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z10.b, #0\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "15:\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "mov z8.b, #0\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "udot z16.s, z8.b, z0.b[2]\n" + "udot z17.s, z9.b, z0.b[2]\n" + "udot z18.s, z10.b, z0.b[2]\n" + "udot z19.s, z11.b, z0.b[2]\n" + "b 9f\n" + "7:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 16f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 17f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 18f\n" + "17:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 18f\n" + 
"16:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z14.b, #0\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "18:\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "mov z12.b, #0\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "udot z16.s, z12.b, z0.b[1]\n" + "udot z17.s, z13.b, z0.b[1]\n" + "udot z18.s, z14.b, z0.b[1]\n" + "udot z19.s, z15.b, z0.b[1]\n" + "b 9f\n" + "6:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 19f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 20f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 21f\n" + "20:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 21f\n" + "19:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z10.b, #0\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "21:\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "mov z8.b, #0\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "udot z16.s, z8.b, z0.b[0]\n" + "udot z17.s, z9.b, z0.b[0]\n" + "udot z18.s, z10.b, z0.b[0]\n" + "udot z19.s, z11.b, z0.b[0]\n" + "b 9f\n" + "5:\n" + "udot z16.s, z8.b, z0.b[0]\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "udot z17.s, z9.b, z0.b[0]\n" + "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n" + "udot z18.s, z10.b, z0.b[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z19.s, z11.b, z0.b[0]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip2 z8.b, z14.b, z12.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip1 z14.b, 
z14.b, z12.b\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z16.s, z12.b, z0.b[1]\n" + "udot z17.s, z13.b, z0.b[1]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "udot z18.s, z14.b, z0.b[1]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "udot z19.s, z15.b, z0.b[1]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "udot z16.s, z8.b, z0.b[2]\n" + "udot z17.s, z9.b, z0.b[2]\n" + "udot z18.s, z10.b, z0.b[2]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "udot z19.s, z11.b, z0.b[2]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "udot z16.s, z12.b, z0.b[3]\n" + "udot z17.s, z13.b, z0.b[3]\n" + "udot z18.s, z14.b, z0.b[3]\n" + "udot z19.s, z15.b, z0.b[3]\n" + "cbz %[blocks], 22f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" 
+ "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "udot z16.s, z8.b, z4.b[0]\n" + "udot z17.s, z9.b, z4.b[0]\n" + "udot z18.s, z10.b, z4.b[0]\n" + "udot z19.s, z11.b, z4.b[0]\n" + "b.eq 23f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "udot z16.s, z12.b, z4.b[1]\n" + "udot z17.s, z13.b, z4.b[1]\n" + "udot z18.s, z14.b, z4.b[1]\n" + "udot z19.s, z15.b, z4.b[1]\n" + "b.eq 24f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "udot z16.s, z8.b, z4.b[2]\n" + "udot z17.s, z9.b, z4.b[2]\n" + "udot z18.s, z10.b, z4.b[2]\n" + "udot z19.s, z11.b, z4.b[2]\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 25f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 26f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 27f\n" + "26:\n" + 
"mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 27f\n" + "25:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z14.b, #0\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "27:\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "mov z12.b, #0\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "udot z16.s, z12.b, z4.b[3]\n" + "udot z17.s, z13.b, z4.b[3]\n" + "udot z18.s, z14.b, z4.b[3]\n" + "udot z19.s, z15.b, z4.b[3]\n" + "b 9f\n" + "24:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 28f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 29f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 30f\n" + "29:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 30f\n" + "28:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z10.b, #0\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "30:\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "mov z8.b, #0\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "udot z16.s, z8.b, z4.b[2]\n" + "udot z17.s, z9.b, z4.b[2]\n" + "udot z18.s, z10.b, z4.b[2]\n" + "udot z19.s, z11.b, z4.b[2]\n" + "b 9f\n" + "23:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 31f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 32f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], 
%[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 33f\n" + "32:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 33f\n" + "31:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z14.b, #0\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "33:\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "mov z12.b, #0\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "udot z16.s, z12.b, z4.b[1]\n" + "udot z17.s, z13.b, z4.b[1]\n" + "udot z18.s, z14.b, z4.b[1]\n" + "udot z19.s, z15.b, z4.b[1]\n" + "b 9f\n" + "22:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 34f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 35f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 36f\n" + "35:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 36f\n" + "34:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z10.b, #0\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "36:\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "mov z8.b, #0\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "udot z16.s, z8.b, z4.b[0]\n" + "udot z17.s, z9.b, z4.b[0]\n" + "udot z18.s, z10.b, z4.b[0]\n" + "udot z19.s, z11.b, z4.b[0]\n" + "9:\n" + 
"st1w z16.s, p0, [%[c_ptr0]]\n" + "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n" + "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n" + "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n" + "addvl %[c_ptr0], %[c_ptr0], #4\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [b_ptr1] "+r" (b_ptr1), [b_ptr2] "+r" (b_ptr2), [b_ptr3] "+r" (b_ptr3), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb) + : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 2: + __asm __volatile ( + "a_ptr1 .req X0\n" + "c_ptr1 .req X1\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.b, %[temp], %[leftovers]\n" + "whilelt p0.s, %[temp], %[width]\n" + "whilelt p4.b, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "ptrue p7.b\n" + "whilelt p1.s, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "whilelt p2.s, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "whilelt p3.s, %[temp], %[width]\n" + "cbz %[beta0], 1f\n" + "mov z16.s, #0\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" + "mov z17.s, #0\n" + "ld1rqb z1.b, p7/z, [a_ptr1]\n" + "mov z18.s, #0\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "mov z19.s, #0\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "mov z20.s, #0\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "mov z21.s, #0\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "mov z22.s, #0\n" + "add a_ptr1, a_ptr1, #0x10\n" + "zip2 z11.b, z8.b, z9.b\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "mov z23.s, #0\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add 
%[b_ptr3], %[b_ptr3], %[ldb]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 2f\n" + "1:\n" + "ld1rw z15.s, p7/z, [%[betaptr]]\n" + "ld1w z16.s, p0/z, [%[c_ptr0]]\n" + "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" + "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n" + "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n" + "ld1w z20.s, p0/z, [c_ptr1]\n" + "mul z16.s, p7/m, z16.s, z15.s\n" + "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n" + "mul z17.s, p7/m, z17.s, z15.s\n" + "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n" + "mul z18.s, p7/m, z18.s, z15.s\n" + "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n" + "mul z19.s, p7/m, z19.s, z15.s\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" + "mul z20.s, p7/m, z20.s, z15.s\n" + "ld1rqb z1.b, p7/z, [a_ptr1]\n" + "mul z21.s, p7/m, z21.s, z15.s\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "mul z22.s, p7/m, z22.s, z15.s\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "mul z23.s, p7/m, z23.s, z15.s\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "add a_ptr1, a_ptr1, #0x10\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "zip2 z11.b, z8.b, z9.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip1 z10.b, z10.b, z8.b\n" + "2:\n" + "cbz %[loops], 3f\n" + "4:\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "udot z16.s, z8.b, z0.b[0]\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "udot z20.s, z8.b, z1.b[0]\n" + "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" + "udot z17.s, z9.b, z0.b[0]\n" + "ld1rqb z5.b, p7/z, [a_ptr1]\n" + "udot z21.s, z9.b, z1.b[0]\n" + "add %[b_ptr0], 
%[b_ptr0], %[ldb]\n" + "zip2 z8.b, z14.b, z12.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z14.b, z14.b, z12.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "udot z18.s, z10.b, z0.b[0]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "udot z22.s, z10.b, z1.b[0]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "zip1 z12.b, z13.b, z14.b\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z13.b, z13.b, z14.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z14.b, z15.b, z8.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "udot z19.s, z11.b, z0.b[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z23.s, z11.b, z1.b[0]\n" + "add %[a_ptr0], %[a_ptr0], #0x20\n" + "zip2 z11.b, z8.b, z9.b\n" + "add a_ptr1, a_ptr1, #0x20\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "udot z16.s, z12.b, z0.b[1]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "udot z20.s, z12.b, z1.b[1]\n" + "subs %[loops], %[loops], #0x1\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "udot z17.s, z13.b, z0.b[1]\n" + "udot z21.s, z13.b, z1.b[1]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "udot z18.s, z14.b, z0.b[1]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "udot z22.s, z14.b, z1.b[1]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "udot z19.s, z15.b, z0.b[1]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z23.s, z15.b, z1.b[1]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "udot z16.s, z8.b, z0.b[2]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "udot z20.s, z8.b, z1.b[2]\n" + "udot z17.s, z9.b, z0.b[2]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "udot z21.s, z9.b, z1.b[2]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "udot z18.s, z10.b, z0.b[2]\n" + "add %[b_ptr2], 
%[b_ptr2], %[ldb]\n" + "udot z22.s, z10.b, z1.b[2]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip1 z12.b, z13.b, z14.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "udot z19.s, z11.b, z0.b[2]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z23.s, z11.b, z1.b[2]\n" + "udot z16.s, z12.b, z0.b[3]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "udot z20.s, z12.b, z1.b[3]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "udot z17.s, z13.b, z0.b[3]\n" + "udot z21.s, z13.b, z1.b[3]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z10.b, z10.b, z8.b\n" + "udot z18.s, z14.b, z0.b[3]\n" + "udot z22.s, z14.b, z1.b[3]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "udot z19.s, z15.b, z0.b[3]\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n" + "udot z23.s, z15.b, z1.b[3]\n" + "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n" + "zip1 z8.b, z9.b, z10.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "udot z16.s, z8.b, z4.b[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z20.s, z8.b, z5.b[0]\n" + "udot z17.s, z9.b, z4.b[0]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "udot z21.s, z9.b, z5.b[0]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "udot z18.s, z10.b, z4.b[0]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "udot z22.s, z10.b, z5.b[0]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z8.b, z14.b, z12.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z14.b, z14.b, z12.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "udot z19.s, z11.b, z4.b[0]\n" + "udot z23.s, z11.b, z5.b[0]\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, 
z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z16.s, z12.b, z4.b[1]\n" + "udot z20.s, z12.b, z5.b[1]\n" + "udot z17.s, z13.b, z4.b[1]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "udot z21.s, z13.b, z5.b[1]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "udot z18.s, z14.b, z4.b[1]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "udot z22.s, z14.b, z5.b[1]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z10.b, z10.b, z8.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "udot z19.s, z15.b, z4.b[1]\n" + "udot z23.s, z15.b, z5.b[1]\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z16.s, z8.b, z4.b[2]\n" + "udot z20.s, z8.b, z5.b[2]\n" + "udot z17.s, z9.b, z4.b[2]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "udot z21.s, z9.b, z5.b[2]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "udot z18.s, z10.b, z4.b[2]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "udot z22.s, z10.b, z5.b[2]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z8.b, z14.b, z12.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z14.b, z14.b, z12.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "udot z19.s, z11.b, z4.b[2]\n" + "udot z23.s, z11.b, z5.b[2]\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z16.s, z12.b, z4.b[3]\n" + "udot z20.s, z12.b, z5.b[3]\n" + "udot z17.s, z13.b, z4.b[3]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "udot z21.s, z13.b, z5.b[3]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "udot z18.s, z14.b, z4.b[3]\n" + "add %[b_ptr3], 
%[b_ptr3], %[ldb]\n" + "udot z22.s, z14.b, z5.b[3]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "udot z19.s, z15.b, z4.b[3]\n" + "udot z23.s, z15.b, z5.b[3]\n" + "b.ne 4b\n" + "3:\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "cbz %[regs], 5f\n" + "udot z16.s, z8.b, z0.b[0]\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "udot z20.s, z8.b, z1.b[0]\n" + "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" + "udot z17.s, z9.b, z0.b[0]\n" + "ld1rqb z5.b, p7/z, [a_ptr1]\n" + "udot z21.s, z9.b, z1.b[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "zip2 z8.b, z14.b, z12.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z14.b, z14.b, z12.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "udot z18.s, z10.b, z0.b[0]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "udot z22.s, z10.b, z1.b[0]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "zip1 z12.b, z13.b, z14.b\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z13.b, z13.b, z14.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z14.b, z15.b, z8.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "udot z19.s, z11.b, z0.b[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z23.s, z11.b, z1.b[0]\n" + "udot z16.s, z12.b, z0.b[1]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "udot z20.s, z12.b, z1.b[1]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "udot z17.s, z13.b, z0.b[1]\n" + "udot z21.s, z13.b, z1.b[1]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z10.b, z10.b, z8.b\n" + "udot z18.s, z14.b, z0.b[1]\n" + "udot z22.s, z14.b, z1.b[1]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "udot z19.s, z15.b, z0.b[1]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip1 z8.b, z9.b, 
z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "udot z23.s, z15.b, z1.b[1]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z16.s, z8.b, z0.b[2]\n" + "udot z20.s, z8.b, z1.b[2]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "udot z17.s, z9.b, z0.b[2]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "udot z21.s, z9.b, z1.b[2]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "udot z18.s, z10.b, z0.b[2]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "udot z22.s, z10.b, z1.b[2]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "udot z19.s, z11.b, z0.b[2]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "udot z23.s, z11.b, z1.b[2]\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z16.s, z12.b, z0.b[3]\n" + "udot z20.s, z12.b, z1.b[3]\n" + "udot z17.s, z13.b, z0.b[3]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "udot z21.s, z13.b, z1.b[3]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "udot z18.s, z14.b, z0.b[3]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "udot z22.s, z14.b, z1.b[3]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z10.b, z10.b, z8.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "udot z19.s, z15.b, z0.b[3]\n" + "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n" + "udot z23.s, z15.b, z1.b[3]\n" + "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z16.s, z8.b, z4.b[0]\n" + "udot z20.s, z8.b, z5.b[0]\n" + "udot z17.s, z9.b, z4.b[0]\n" + "zip2 
z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "udot z21.s, z9.b, z5.b[0]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "udot z18.s, z10.b, z4.b[0]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "udot z22.s, z10.b, z5.b[0]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z8.b, z14.b, z12.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z14.b, z14.b, z12.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "udot z19.s, z11.b, z4.b[0]\n" + "udot z23.s, z11.b, z5.b[0]\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z16.s, z12.b, z4.b[1]\n" + "udot z20.s, z12.b, z5.b[1]\n" + "udot z17.s, z13.b, z4.b[1]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "udot z21.s, z13.b, z5.b[1]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "udot z18.s, z14.b, z4.b[1]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "udot z22.s, z14.b, z5.b[1]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "udot z19.s, z15.b, z4.b[1]\n" + "udot z23.s, z15.b, z5.b[1]\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "udot z16.s, z8.b, z4.b[2]\n" + "udot z20.s, z8.b, z5.b[2]\n" + "udot z17.s, z9.b, z4.b[2]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "udot z21.s, z9.b, z5.b[2]\n" + "udot z18.s, z10.b, z4.b[2]\n" + "udot z22.s, z10.b, z5.b[2]\n" + "udot z19.s, z11.b, z4.b[2]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "udot z23.s, z11.b, z5.b[2]\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "udot z16.s, z12.b, z4.b[3]\n" + "udot z20.s, z12.b, z5.b[3]\n" + "udot 
z17.s, z13.b, z4.b[3]\n" + "udot z21.s, z13.b, z5.b[3]\n" + "udot z18.s, z14.b, z4.b[3]\n" + "udot z22.s, z14.b, z5.b[3]\n" + "udot z19.s, z15.b, z4.b[3]\n" + "udot z23.s, z15.b, z5.b[3]\n" + "cbz %[blocks], 6f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "udot z16.s, z8.b, z0.b[0]\n" + "udot z20.s, z8.b, z1.b[0]\n" + "udot z17.s, z9.b, z0.b[0]\n" + "udot z21.s, z9.b, z1.b[0]\n" + "udot z18.s, z10.b, z0.b[0]\n" + "udot z22.s, z10.b, z1.b[0]\n" + "udot z19.s, z11.b, z0.b[0]\n" + "udot z23.s, z11.b, z1.b[0]\n" + "b.eq 7f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "udot z16.s, z12.b, z0.b[1]\n" + "udot z20.s, z12.b, z1.b[1]\n" + "udot z17.s, z13.b, z0.b[1]\n" + "udot z21.s, z13.b, z1.b[1]\n" + "udot z18.s, z14.b, z0.b[1]\n" + "udot z22.s, z14.b, z1.b[1]\n" + "udot z19.s, z15.b, z0.b[1]\n" + "udot z23.s, z15.b, z1.b[1]\n" + "b.eq 8f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" 
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "udot z16.s, z8.b, z0.b[2]\n" + "udot z20.s, z8.b, z1.b[2]\n" + "udot z17.s, z9.b, z0.b[2]\n" + "udot z21.s, z9.b, z1.b[2]\n" + "udot z18.s, z10.b, z0.b[2]\n" + "udot z22.s, z10.b, z1.b[2]\n" + "udot z19.s, z11.b, z0.b[2]\n" + "udot z23.s, z11.b, z1.b[2]\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 10f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 11f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 12f\n" + "11:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 12f\n" + "10:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z14.b, #0\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "12:\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "mov z12.b, #0\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "udot z16.s, z12.b, z0.b[3]\n" + "udot z20.s, z12.b, z1.b[3]\n" + "udot z17.s, z13.b, z0.b[3]\n" + "udot z21.s, z13.b, z1.b[3]\n" + "udot z18.s, z14.b, z0.b[3]\n" + "udot z22.s, z14.b, z1.b[3]\n" + "udot z19.s, z15.b, z0.b[3]\n" + "udot z23.s, z15.b, z1.b[3]\n" + "b 9f\n" + "8:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], 
#0x1\n" + "b.eq 13f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 14f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 15f\n" + "14:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 15f\n" + "13:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z10.b, #0\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "15:\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "mov z8.b, #0\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "udot z16.s, z8.b, z0.b[2]\n" + "udot z20.s, z8.b, z1.b[2]\n" + "udot z17.s, z9.b, z0.b[2]\n" + "udot z21.s, z9.b, z1.b[2]\n" + "udot z18.s, z10.b, z0.b[2]\n" + "udot z22.s, z10.b, z1.b[2]\n" + "udot z19.s, z11.b, z0.b[2]\n" + "udot z23.s, z11.b, z1.b[2]\n" + "b 9f\n" + "7:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 16f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 17f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 18f\n" + "17:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 18f\n" + "16:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z14.b, #0\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "18:\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "mov z12.b, #0\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + 
"zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "udot z16.s, z12.b, z0.b[1]\n" + "udot z20.s, z12.b, z1.b[1]\n" + "udot z17.s, z13.b, z0.b[1]\n" + "udot z21.s, z13.b, z1.b[1]\n" + "udot z18.s, z14.b, z0.b[1]\n" + "udot z22.s, z14.b, z1.b[1]\n" + "udot z19.s, z15.b, z0.b[1]\n" + "udot z23.s, z15.b, z1.b[1]\n" + "b 9f\n" + "6:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 19f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 20f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 21f\n" + "20:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 21f\n" + "19:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z10.b, #0\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "21:\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "mov z8.b, #0\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "udot z16.s, z8.b, z0.b[0]\n" + "udot z20.s, z8.b, z1.b[0]\n" + "udot z17.s, z9.b, z0.b[0]\n" + "udot z21.s, z9.b, z1.b[0]\n" + "udot z18.s, z10.b, z0.b[0]\n" + "udot z22.s, z10.b, z1.b[0]\n" + "udot z19.s, z11.b, z0.b[0]\n" + "udot z23.s, z11.b, z1.b[0]\n" + "b 9f\n" + "5:\n" + "udot z16.s, z8.b, z0.b[0]\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "udot z20.s, z8.b, z1.b[0]\n" + "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n" + "udot z17.s, z9.b, z0.b[0]\n" + "ld1rqb z5.b, p6/z, [a_ptr1]\n" + "udot z21.s, z9.b, z1.b[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "zip2 z8.b, z14.b, z12.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z14.b, z14.b, z12.b\n" + "add 
%[b_ptr1], %[b_ptr1], %[ldb]\n" + "udot z18.s, z10.b, z0.b[0]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "udot z22.s, z10.b, z1.b[0]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "zip1 z12.b, z13.b, z14.b\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z13.b, z13.b, z14.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z14.b, z15.b, z8.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "udot z19.s, z11.b, z0.b[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z23.s, z11.b, z1.b[0]\n" + "udot z16.s, z12.b, z0.b[1]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "udot z20.s, z12.b, z1.b[1]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "udot z17.s, z13.b, z0.b[1]\n" + "udot z21.s, z13.b, z1.b[1]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "udot z18.s, z14.b, z0.b[1]\n" + "udot z22.s, z14.b, z1.b[1]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "udot z19.s, z15.b, z0.b[1]\n" + "udot z23.s, z15.b, z1.b[1]\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "udot z16.s, z8.b, z0.b[2]\n" + "udot z20.s, z8.b, z1.b[2]\n" + "udot z17.s, z9.b, z0.b[2]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "udot z21.s, z9.b, z1.b[2]\n" + "udot z18.s, z10.b, z0.b[2]\n" + "udot z22.s, z10.b, z1.b[2]\n" + "udot z19.s, z11.b, z0.b[2]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "udot z23.s, z11.b, z1.b[2]\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "udot z16.s, z12.b, z0.b[3]\n" + "udot z20.s, z12.b, z1.b[3]\n" + "udot z17.s, z13.b, z0.b[3]\n" + "udot z21.s, z13.b, z1.b[3]\n" + "udot z18.s, z14.b, z0.b[3]\n" + "udot z22.s, z14.b, z1.b[3]\n" + "udot z19.s, z15.b, 
z0.b[3]\n" + "udot z23.s, z15.b, z1.b[3]\n" + "cbz %[blocks], 22f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "udot z16.s, z8.b, z4.b[0]\n" + "udot z20.s, z8.b, z5.b[0]\n" + "udot z17.s, z9.b, z4.b[0]\n" + "udot z21.s, z9.b, z5.b[0]\n" + "udot z18.s, z10.b, z4.b[0]\n" + "udot z22.s, z10.b, z5.b[0]\n" + "udot z19.s, z11.b, z4.b[0]\n" + "udot z23.s, z11.b, z5.b[0]\n" + "b.eq 23f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "udot z16.s, z12.b, z4.b[1]\n" + "udot z20.s, z12.b, z5.b[1]\n" + "udot z17.s, z13.b, z4.b[1]\n" + "udot z21.s, z13.b, z5.b[1]\n" + "udot z18.s, z14.b, z4.b[1]\n" + "udot z22.s, z14.b, z5.b[1]\n" + "udot z19.s, z15.b, z4.b[1]\n" + "udot z23.s, z15.b, z5.b[1]\n" + "b.eq 24f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, 
[%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "udot z16.s, z8.b, z4.b[2]\n" + "udot z20.s, z8.b, z5.b[2]\n" + "udot z17.s, z9.b, z4.b[2]\n" + "udot z21.s, z9.b, z5.b[2]\n" + "udot z18.s, z10.b, z4.b[2]\n" + "udot z22.s, z10.b, z5.b[2]\n" + "udot z19.s, z11.b, z4.b[2]\n" + "udot z23.s, z11.b, z5.b[2]\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 25f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 26f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 27f\n" + "26:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 27f\n" + "25:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z14.b, #0\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "27:\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "mov z12.b, #0\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "udot z16.s, z12.b, z4.b[3]\n" + "udot z20.s, z12.b, z5.b[3]\n" + "udot z17.s, z13.b, z4.b[3]\n" + "udot z21.s, z13.b, z5.b[3]\n" + "udot z18.s, z14.b, z4.b[3]\n" + "udot z22.s, z14.b, z5.b[3]\n" + "udot z19.s, z15.b, z4.b[3]\n" + "udot z23.s, z15.b, z5.b[3]\n" + "b 9f\n" + "24:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 28f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 29f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], 
%[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 30f\n" + "29:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 30f\n" + "28:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z10.b, #0\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "30:\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "mov z8.b, #0\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "udot z16.s, z8.b, z4.b[2]\n" + "udot z20.s, z8.b, z5.b[2]\n" + "udot z17.s, z9.b, z4.b[2]\n" + "udot z21.s, z9.b, z5.b[2]\n" + "udot z18.s, z10.b, z4.b[2]\n" + "udot z22.s, z10.b, z5.b[2]\n" + "udot z19.s, z11.b, z4.b[2]\n" + "udot z23.s, z11.b, z5.b[2]\n" + "b 9f\n" + "23:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 31f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 32f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 33f\n" + "32:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 33f\n" + "31:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z14.b, #0\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "33:\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "mov z12.b, #0\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "udot 
z16.s, z12.b, z4.b[1]\n" + "udot z20.s, z12.b, z5.b[1]\n" + "udot z17.s, z13.b, z4.b[1]\n" + "udot z21.s, z13.b, z5.b[1]\n" + "udot z18.s, z14.b, z4.b[1]\n" + "udot z22.s, z14.b, z5.b[1]\n" + "udot z19.s, z15.b, z4.b[1]\n" + "udot z23.s, z15.b, z5.b[1]\n" + "b 9f\n" + "22:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 34f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 35f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 36f\n" + "35:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 36f\n" + "34:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z10.b, #0\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "36:\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "mov z8.b, #0\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "udot z16.s, z8.b, z4.b[0]\n" + "udot z20.s, z8.b, z5.b[0]\n" + "udot z17.s, z9.b, z4.b[0]\n" + "udot z21.s, z9.b, z5.b[0]\n" + "udot z18.s, z10.b, z4.b[0]\n" + "udot z22.s, z10.b, z5.b[0]\n" + "udot z19.s, z11.b, z4.b[0]\n" + "udot z23.s, z11.b, z5.b[0]\n" + "9:\n" + "st1w z16.s, p0, [%[c_ptr0]]\n" + "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n" + "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n" + "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n" + "addvl %[c_ptr0], %[c_ptr0], #4\n" + "st1w z20.s, p0, [c_ptr1]\n" + "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n" + "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n" + "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n" + ".unreq a_ptr1\n" + ".unreq c_ptr1\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [b_ptr1] "+r" (b_ptr1), [b_ptr2] "+r" (b_ptr2), [b_ptr3] 
"+r" (b_ptr3), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb) + : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory" + ); + break; + case 3: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "c_ptr1 .req X2\n" + "c_ptr2 .req X3\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.b, %[temp], %[leftovers]\n" + "whilelt p0.s, %[temp], %[width]\n" + "whilelt p4.b, %[temp], %[width]\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "incw %[temp], all, mul #1\n" + "ptrue p7.b\n" + "whilelt p1.s, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "whilelt p2.s, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "whilelt p3.s, %[temp], %[width]\n" + "cbz %[beta0], 1f\n" + "mov z16.s, #0\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" + "mov z17.s, #0\n" + "ld1rqb z1.b, p7/z, [a_ptr1]\n" + "mov z18.s, #0\n" + "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "mov z19.s, #0\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "mov z20.s, #0\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "mov z21.s, #0\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "mov z22.s, #0\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "mov z23.s, #0\n" + "add a_ptr1, a_ptr1, #0x10\n" + "zip2 z11.b, z8.b, z9.b\n" + "add a_ptr2, a_ptr2, #0x10\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "mov z24.s, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z25.s, #0\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip2 z12.b, z10.b, z8.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip1 z10.b, z10.b, z8.b\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + 
"mov z26.s, #0\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "mov z27.s, #0\n" + "b 2f\n" + "1:\n" + "ld1rw z15.s, p7/z, [%[betaptr]]\n" + "ld1w z16.s, p0/z, [%[c_ptr0]]\n" + "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" + "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n" + "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n" + "ld1w z20.s, p0/z, [c_ptr1]\n" + "mul z16.s, p7/m, z16.s, z15.s\n" + "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n" + "mul z17.s, p7/m, z17.s, z15.s\n" + "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n" + "mul z18.s, p7/m, z18.s, z15.s\n" + "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n" + "mul z19.s, p7/m, z19.s, z15.s\n" + "ld1w z24.s, p0/z, [c_ptr2]\n" + "mul z20.s, p7/m, z20.s, z15.s\n" + "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n" + "mul z21.s, p7/m, z21.s, z15.s\n" + "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n" + "mul z22.s, p7/m, z22.s, z15.s\n" + "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n" + "mul z23.s, p7/m, z23.s, z15.s\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" + "mul z24.s, p7/m, z24.s, z15.s\n" + "ld1rqb z1.b, p7/z, [a_ptr1]\n" + "mul z25.s, p7/m, z25.s, z15.s\n" + "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "mul z26.s, p7/m, z26.s, z15.s\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "mul z27.s, p7/m, z27.s, z15.s\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "add a_ptr1, a_ptr1, #0x10\n" + "add a_ptr2, a_ptr2, #0x10\n" + "zip2 z11.b, z8.b, z9.b\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "zip2 z12.b, z10.b, z8.b\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "zip1 z10.b, z10.b, z8.b\n" + "2:\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "cbz %[loops], 3f\n" + "4:\n" + "zip2 z15.b, z12.b, 
z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "udot z16.s, z8.b, z0.b[0]\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "udot z20.s, z8.b, z1.b[0]\n" + "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" + "udot z24.s, z8.b, z2.b[0]\n" + "ld1rqb z5.b, p7/z, [a_ptr1]\n" + "udot z17.s, z9.b, z0.b[0]\n" + "ld1rqb z6.b, p7/z, [a_ptr2]\n" + "udot z21.s, z9.b, z1.b[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "zip2 z8.b, z14.b, z12.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z14.b, z14.b, z12.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "udot z25.s, z9.b, z2.b[0]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "udot z18.s, z10.b, z0.b[0]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "zip1 z12.b, z13.b, z14.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip2 z13.b, z13.b, z14.b\n" + "add %[a_ptr0], %[a_ptr0], #0x20\n" + "zip1 z14.b, z15.b, z8.b\n" + "add a_ptr1, a_ptr1, #0x20\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "udot z22.s, z10.b, z1.b[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z26.s, z10.b, z2.b[0]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "udot z19.s, z11.b, z0.b[0]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "udot z23.s, z11.b, z1.b[0]\n" + "add a_ptr2, a_ptr2, #0x20\n" + "udot z27.s, z11.b, z2.b[0]\n" + "subs %[loops], %[loops], #0x1\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "udot z16.s, z12.b, z0.b[1]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "udot z20.s, z12.b, z1.b[1]\n" + "udot z24.s, z12.b, z2.b[1]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "udot z17.s, z13.b, z0.b[1]\n" + "udot z21.s, z13.b, z1.b[1]\n" + "udot z25.s, z13.b, z2.b[1]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "udot z18.s, z14.b, z0.b[1]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "udot z22.s, z14.b, z1.b[1]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + 
"udot z26.s, z14.b, z2.b[1]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "udot z19.s, z15.b, z0.b[1]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "udot z23.s, z15.b, z1.b[1]\n" + "udot z27.s, z15.b, z2.b[1]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "udot z16.s, z8.b, z0.b[2]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "udot z20.s, z8.b, z1.b[2]\n" + "udot z24.s, z8.b, z2.b[2]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "udot z17.s, z9.b, z0.b[2]\n" + "udot z21.s, z9.b, z1.b[2]\n" + "udot z25.s, z9.b, z2.b[2]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "udot z18.s, z10.b, z0.b[2]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "udot z22.s, z10.b, z1.b[2]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z26.s, z10.b, z2.b[2]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "udot z19.s, z11.b, z0.b[2]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "udot z23.s, z11.b, z1.b[2]\n" + "udot z27.s, z11.b, z2.b[2]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "udot z16.s, z12.b, z0.b[3]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "udot z20.s, z12.b, z1.b[3]\n" + "udot z24.s, z12.b, z2.b[3]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "udot z17.s, z13.b, z0.b[3]\n" + "udot z21.s, z13.b, z1.b[3]\n" + "udot z25.s, z13.b, z2.b[3]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "udot z18.s, z14.b, z0.b[3]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "udot z22.s, z14.b, z1.b[3]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z26.s, z14.b, z2.b[3]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "udot z19.s, z15.b, z0.b[3]\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0], 
#-0x10]\n" + "udot z23.s, z15.b, z1.b[3]\n" + "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n" + "udot z27.s, z15.b, z2.b[3]\n" + "ld1rqb z2.b, p7/z, [a_ptr2, #-0x10]\n" + "zip2 z15.b, z12.b, z13.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "udot z16.s, z8.b, z4.b[0]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "udot z20.s, z8.b, z5.b[0]\n" + "udot z24.s, z8.b, z6.b[0]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "udot z17.s, z9.b, z4.b[0]\n" + "udot z21.s, z9.b, z5.b[0]\n" + "udot z25.s, z9.b, z6.b[0]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "udot z18.s, z10.b, z4.b[0]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "udot z22.s, z10.b, z5.b[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z26.s, z10.b, z6.b[0]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "udot z19.s, z11.b, z4.b[0]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "udot z23.s, z11.b, z5.b[0]\n" + "udot z27.s, z11.b, z6.b[0]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "udot z16.s, z12.b, z4.b[1]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "udot z20.s, z12.b, z5.b[1]\n" + "udot z24.s, z12.b, z6.b[1]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "udot z17.s, z13.b, z4.b[1]\n" + "udot z21.s, z13.b, z5.b[1]\n" + "udot z25.s, z13.b, z6.b[1]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "udot z18.s, z14.b, z4.b[1]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "udot z22.s, z14.b, z5.b[1]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z26.s, z14.b, z6.b[1]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "udot z19.s, z15.b, z4.b[1]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" 
+ "udot z23.s, z15.b, z5.b[1]\n" + "udot z27.s, z15.b, z6.b[1]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "udot z16.s, z8.b, z4.b[2]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "udot z20.s, z8.b, z5.b[2]\n" + "udot z24.s, z8.b, z6.b[2]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "udot z17.s, z9.b, z4.b[2]\n" + "udot z21.s, z9.b, z5.b[2]\n" + "udot z25.s, z9.b, z6.b[2]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "udot z18.s, z10.b, z4.b[2]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "udot z22.s, z10.b, z5.b[2]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z26.s, z10.b, z6.b[2]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "udot z19.s, z11.b, z4.b[2]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "udot z23.s, z11.b, z5.b[2]\n" + "udot z27.s, z11.b, z6.b[2]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "udot z16.s, z12.b, z4.b[3]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "udot z20.s, z12.b, z5.b[3]\n" + "udot z24.s, z12.b, z6.b[3]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "udot z17.s, z13.b, z4.b[3]\n" + "udot z21.s, z13.b, z5.b[3]\n" + "udot z25.s, z13.b, z6.b[3]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "udot z18.s, z14.b, z4.b[3]\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "udot z22.s, z14.b, z5.b[3]\n" + "udot z26.s, z14.b, z6.b[3]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "udot z19.s, z15.b, z4.b[3]\n" + "udot z23.s, z15.b, z5.b[3]\n" + "udot z27.s, z15.b, z6.b[3]\n" + "b.ne 4b\n" + "3:\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "cbz %[regs], 5f\n" + "udot z16.s, z8.b, z0.b[0]\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "udot 
z20.s, z8.b, z1.b[0]\n" + "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" + "udot z24.s, z8.b, z2.b[0]\n" + "ld1rqb z5.b, p7/z, [a_ptr1]\n" + "udot z17.s, z9.b, z0.b[0]\n" + "ld1rqb z6.b, p7/z, [a_ptr2]\n" + "udot z21.s, z9.b, z1.b[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "zip2 z8.b, z14.b, z12.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z14.b, z14.b, z12.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "udot z25.s, z9.b, z2.b[0]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "udot z18.s, z10.b, z0.b[0]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "zip1 z12.b, z13.b, z14.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "udot z22.s, z10.b, z1.b[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z26.s, z10.b, z2.b[0]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "udot z19.s, z11.b, z0.b[0]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "udot z23.s, z11.b, z1.b[0]\n" + "udot z27.s, z11.b, z2.b[0]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "udot z16.s, z12.b, z0.b[1]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "udot z20.s, z12.b, z1.b[1]\n" + "udot z24.s, z12.b, z2.b[1]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "udot z17.s, z13.b, z0.b[1]\n" + "udot z21.s, z13.b, z1.b[1]\n" + "udot z25.s, z13.b, z2.b[1]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "udot z18.s, z14.b, z0.b[1]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "udot z22.s, z14.b, z1.b[1]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z26.s, z14.b, z2.b[1]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "udot z19.s, z15.b, z0.b[1]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "udot z23.s, z15.b, z1.b[1]\n" + "udot z27.s, z15.b, z2.b[1]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, 
z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "udot z16.s, z8.b, z0.b[2]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "udot z20.s, z8.b, z1.b[2]\n" + "udot z24.s, z8.b, z2.b[2]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "udot z17.s, z9.b, z0.b[2]\n" + "udot z21.s, z9.b, z1.b[2]\n" + "udot z25.s, z9.b, z2.b[2]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "udot z18.s, z10.b, z0.b[2]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "udot z22.s, z10.b, z1.b[2]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z26.s, z10.b, z2.b[2]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "udot z19.s, z11.b, z0.b[2]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "udot z23.s, z11.b, z1.b[2]\n" + "udot z27.s, z11.b, z2.b[2]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "udot z16.s, z12.b, z0.b[3]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "udot z20.s, z12.b, z1.b[3]\n" + "udot z24.s, z12.b, z2.b[3]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "udot z17.s, z13.b, z0.b[3]\n" + "udot z21.s, z13.b, z1.b[3]\n" + "udot z25.s, z13.b, z2.b[3]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "udot z18.s, z14.b, z0.b[3]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "udot z22.s, z14.b, z1.b[3]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z26.s, z14.b, z2.b[3]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "udot z19.s, z15.b, z0.b[3]\n" + "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n" + "udot z23.s, z15.b, z1.b[3]\n" + "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n" + "udot z27.s, z15.b, z2.b[3]\n" + "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n" + "zip2 z15.b, z12.b, z13.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip1 z13.b, z12.b, 
z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "udot z16.s, z8.b, z4.b[0]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "udot z20.s, z8.b, z5.b[0]\n" + "udot z24.s, z8.b, z6.b[0]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "udot z17.s, z9.b, z4.b[0]\n" + "udot z21.s, z9.b, z5.b[0]\n" + "udot z25.s, z9.b, z6.b[0]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "udot z18.s, z10.b, z4.b[0]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "udot z22.s, z10.b, z5.b[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z26.s, z10.b, z6.b[0]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "udot z19.s, z11.b, z4.b[0]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "udot z23.s, z11.b, z5.b[0]\n" + "udot z27.s, z11.b, z6.b[0]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "udot z16.s, z12.b, z4.b[1]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "udot z20.s, z12.b, z5.b[1]\n" + "udot z24.s, z12.b, z6.b[1]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "udot z17.s, z13.b, z4.b[1]\n" + "udot z21.s, z13.b, z5.b[1]\n" + "udot z25.s, z13.b, z6.b[1]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "udot z18.s, z14.b, z4.b[1]\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "udot z22.s, z14.b, z5.b[1]\n" + "udot z26.s, z14.b, z6.b[1]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "udot z19.s, z15.b, z4.b[1]\n" + "udot z23.s, z15.b, z5.b[1]\n" + "udot z27.s, z15.b, z6.b[1]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "udot z16.s, z8.b, z4.b[2]\n" + "udot z20.s, z8.b, z5.b[2]\n" + "udot z24.s, z8.b, z6.b[2]\n" + "udot z17.s, z9.b, z4.b[2]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "udot z21.s, 
z9.b, z5.b[2]\n" + "udot z25.s, z9.b, z6.b[2]\n" + "udot z18.s, z10.b, z4.b[2]\n" + "udot z22.s, z10.b, z5.b[2]\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "udot z26.s, z10.b, z6.b[2]\n" + "udot z19.s, z11.b, z4.b[2]\n" + "udot z23.s, z11.b, z5.b[2]\n" + "udot z27.s, z11.b, z6.b[2]\n" + "udot z16.s, z12.b, z4.b[3]\n" + "udot z20.s, z12.b, z5.b[3]\n" + "udot z24.s, z12.b, z6.b[3]\n" + "udot z17.s, z13.b, z4.b[3]\n" + "udot z21.s, z13.b, z5.b[3]\n" + "udot z25.s, z13.b, z6.b[3]\n" + "udot z18.s, z14.b, z4.b[3]\n" + "udot z22.s, z14.b, z5.b[3]\n" + "udot z26.s, z14.b, z6.b[3]\n" + "udot z19.s, z15.b, z4.b[3]\n" + "udot z23.s, z15.b, z5.b[3]\n" + "udot z27.s, z15.b, z6.b[3]\n" + "cbz %[blocks], 6f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "udot z16.s, z8.b, z0.b[0]\n" + "udot z20.s, z8.b, z1.b[0]\n" + "udot z24.s, z8.b, z2.b[0]\n" + "udot z17.s, z9.b, z0.b[0]\n" + "udot z21.s, z9.b, z1.b[0]\n" + "udot z25.s, z9.b, z2.b[0]\n" + "udot z18.s, z10.b, z0.b[0]\n" + "udot z22.s, z10.b, z1.b[0]\n" + "udot z26.s, z10.b, z2.b[0]\n" + "udot z19.s, z11.b, z0.b[0]\n" + "udot z23.s, z11.b, z1.b[0]\n" + "udot z27.s, z11.b, z2.b[0]\n" + "b.eq 7f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + 
"ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "udot z16.s, z12.b, z0.b[1]\n" + "udot z20.s, z12.b, z1.b[1]\n" + "udot z24.s, z12.b, z2.b[1]\n" + "udot z17.s, z13.b, z0.b[1]\n" + "udot z21.s, z13.b, z1.b[1]\n" + "udot z25.s, z13.b, z2.b[1]\n" + "udot z18.s, z14.b, z0.b[1]\n" + "udot z22.s, z14.b, z1.b[1]\n" + "udot z26.s, z14.b, z2.b[1]\n" + "udot z19.s, z15.b, z0.b[1]\n" + "udot z23.s, z15.b, z1.b[1]\n" + "udot z27.s, z15.b, z2.b[1]\n" + "b.eq 8f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "udot z16.s, z8.b, z0.b[2]\n" + "udot z20.s, z8.b, z1.b[2]\n" + "udot z24.s, z8.b, z2.b[2]\n" + "udot z17.s, z9.b, z0.b[2]\n" + "udot z21.s, z9.b, z1.b[2]\n" + "udot z25.s, z9.b, z2.b[2]\n" + "udot z18.s, z10.b, z0.b[2]\n" + "udot z22.s, z10.b, z1.b[2]\n" + "udot z26.s, z10.b, z2.b[2]\n" + "udot z19.s, z11.b, z0.b[2]\n" + "udot z23.s, z11.b, z1.b[2]\n" + "udot z27.s, z11.b, z2.b[2]\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 10f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 11f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + 
"ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 12f\n" + "11:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 12f\n" + "10:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z14.b, #0\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "12:\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "mov z12.b, #0\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "udot z16.s, z12.b, z0.b[3]\n" + "udot z20.s, z12.b, z1.b[3]\n" + "udot z24.s, z12.b, z2.b[3]\n" + "udot z17.s, z13.b, z0.b[3]\n" + "udot z21.s, z13.b, z1.b[3]\n" + "udot z25.s, z13.b, z2.b[3]\n" + "udot z18.s, z14.b, z0.b[3]\n" + "udot z22.s, z14.b, z1.b[3]\n" + "udot z26.s, z14.b, z2.b[3]\n" + "udot z19.s, z15.b, z0.b[3]\n" + "udot z23.s, z15.b, z1.b[3]\n" + "udot z27.s, z15.b, z2.b[3]\n" + "b 9f\n" + "8:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 13f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 14f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 15f\n" + "14:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 15f\n" + "13:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z10.b, #0\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "15:\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "mov z8.b, #0\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "udot 
z16.s, z8.b, z0.b[2]\n" + "udot z20.s, z8.b, z1.b[2]\n" + "udot z24.s, z8.b, z2.b[2]\n" + "udot z17.s, z9.b, z0.b[2]\n" + "udot z21.s, z9.b, z1.b[2]\n" + "udot z25.s, z9.b, z2.b[2]\n" + "udot z18.s, z10.b, z0.b[2]\n" + "udot z22.s, z10.b, z1.b[2]\n" + "udot z26.s, z10.b, z2.b[2]\n" + "udot z19.s, z11.b, z0.b[2]\n" + "udot z23.s, z11.b, z1.b[2]\n" + "udot z27.s, z11.b, z2.b[2]\n" + "b 9f\n" + "7:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 16f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 17f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 18f\n" + "17:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 18f\n" + "16:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z14.b, #0\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "18:\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "mov z12.b, #0\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "udot z16.s, z12.b, z0.b[1]\n" + "udot z20.s, z12.b, z1.b[1]\n" + "udot z24.s, z12.b, z2.b[1]\n" + "udot z17.s, z13.b, z0.b[1]\n" + "udot z21.s, z13.b, z1.b[1]\n" + "udot z25.s, z13.b, z2.b[1]\n" + "udot z18.s, z14.b, z0.b[1]\n" + "udot z22.s, z14.b, z1.b[1]\n" + "udot z26.s, z14.b, z2.b[1]\n" + "udot z19.s, z15.b, z0.b[1]\n" + "udot z23.s, z15.b, z1.b[1]\n" + "udot z27.s, z15.b, z2.b[1]\n" + "b 9f\n" + "6:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 19f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 20f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], 
%[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 21f\n" + "20:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 21f\n" + "19:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z10.b, #0\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "21:\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "mov z8.b, #0\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "udot z16.s, z8.b, z0.b[0]\n" + "udot z20.s, z8.b, z1.b[0]\n" + "udot z24.s, z8.b, z2.b[0]\n" + "udot z17.s, z9.b, z0.b[0]\n" + "udot z21.s, z9.b, z1.b[0]\n" + "udot z25.s, z9.b, z2.b[0]\n" + "udot z18.s, z10.b, z0.b[0]\n" + "udot z22.s, z10.b, z1.b[0]\n" + "udot z26.s, z10.b, z2.b[0]\n" + "udot z19.s, z11.b, z0.b[0]\n" + "udot z23.s, z11.b, z1.b[0]\n" + "udot z27.s, z11.b, z2.b[0]\n" + "b 9f\n" + "5:\n" + "udot z16.s, z8.b, z0.b[0]\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "udot z20.s, z8.b, z1.b[0]\n" + "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n" + "udot z24.s, z8.b, z2.b[0]\n" + "ld1rqb z5.b, p6/z, [a_ptr1]\n" + "udot z17.s, z9.b, z0.b[0]\n" + "ld1rqb z6.b, p6/z, [a_ptr2]\n" + "udot z21.s, z9.b, z1.b[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "zip2 z8.b, z14.b, z12.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z14.b, z14.b, z12.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "udot z25.s, z9.b, z2.b[0]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "udot z18.s, z10.b, z0.b[0]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "zip1 z12.b, z13.b, z14.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "udot z22.s, z10.b, z1.b[0]\n" + "add %[b_ptr0], %[b_ptr0], 
%[ldb]\n" + "udot z26.s, z10.b, z2.b[0]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "udot z19.s, z11.b, z0.b[0]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "udot z23.s, z11.b, z1.b[0]\n" + "udot z27.s, z11.b, z2.b[0]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "udot z16.s, z12.b, z0.b[1]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "udot z20.s, z12.b, z1.b[1]\n" + "udot z24.s, z12.b, z2.b[1]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "udot z17.s, z13.b, z0.b[1]\n" + "udot z21.s, z13.b, z1.b[1]\n" + "udot z25.s, z13.b, z2.b[1]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "udot z18.s, z14.b, z0.b[1]\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "udot z22.s, z14.b, z1.b[1]\n" + "udot z26.s, z14.b, z2.b[1]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "udot z19.s, z15.b, z0.b[1]\n" + "udot z23.s, z15.b, z1.b[1]\n" + "udot z27.s, z15.b, z2.b[1]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "udot z16.s, z8.b, z0.b[2]\n" + "udot z20.s, z8.b, z1.b[2]\n" + "udot z24.s, z8.b, z2.b[2]\n" + "udot z17.s, z9.b, z0.b[2]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "udot z21.s, z9.b, z1.b[2]\n" + "udot z25.s, z9.b, z2.b[2]\n" + "udot z18.s, z10.b, z0.b[2]\n" + "udot z22.s, z10.b, z1.b[2]\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "udot z26.s, z10.b, z2.b[2]\n" + "udot z19.s, z11.b, z0.b[2]\n" + "udot z23.s, z11.b, z1.b[2]\n" + "udot z27.s, z11.b, z2.b[2]\n" + "udot z16.s, z12.b, z0.b[3]\n" + "udot z20.s, z12.b, z1.b[3]\n" + "udot z24.s, z12.b, z2.b[3]\n" + "udot z17.s, z13.b, z0.b[3]\n" + "udot z21.s, z13.b, z1.b[3]\n" + "udot z25.s, z13.b, z2.b[3]\n" + "udot z18.s, z14.b, z0.b[3]\n" + "udot z22.s, z14.b, z1.b[3]\n" + "udot z26.s, z14.b, z2.b[3]\n" 
+ "udot z19.s, z15.b, z0.b[3]\n" + "udot z23.s, z15.b, z1.b[3]\n" + "udot z27.s, z15.b, z2.b[3]\n" + "cbz %[blocks], 22f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "udot z16.s, z8.b, z4.b[0]\n" + "udot z20.s, z8.b, z5.b[0]\n" + "udot z24.s, z8.b, z6.b[0]\n" + "udot z17.s, z9.b, z4.b[0]\n" + "udot z21.s, z9.b, z5.b[0]\n" + "udot z25.s, z9.b, z6.b[0]\n" + "udot z18.s, z10.b, z4.b[0]\n" + "udot z22.s, z10.b, z5.b[0]\n" + "udot z26.s, z10.b, z6.b[0]\n" + "udot z19.s, z11.b, z4.b[0]\n" + "udot z23.s, z11.b, z5.b[0]\n" + "udot z27.s, z11.b, z6.b[0]\n" + "b.eq 23f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "udot z16.s, z12.b, z4.b[1]\n" + "udot z20.s, z12.b, z5.b[1]\n" + "udot z24.s, z12.b, z6.b[1]\n" + "udot z17.s, z13.b, z4.b[1]\n" + "udot z21.s, z13.b, z5.b[1]\n" + "udot z25.s, z13.b, z6.b[1]\n" + "udot z18.s, z14.b, z4.b[1]\n" + "udot z22.s, z14.b, z5.b[1]\n" + "udot z26.s, z14.b, z6.b[1]\n" + "udot z19.s, 
z15.b, z4.b[1]\n" + "udot z23.s, z15.b, z5.b[1]\n" + "udot z27.s, z15.b, z6.b[1]\n" + "b.eq 24f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "udot z16.s, z8.b, z4.b[2]\n" + "udot z20.s, z8.b, z5.b[2]\n" + "udot z24.s, z8.b, z6.b[2]\n" + "udot z17.s, z9.b, z4.b[2]\n" + "udot z21.s, z9.b, z5.b[2]\n" + "udot z25.s, z9.b, z6.b[2]\n" + "udot z18.s, z10.b, z4.b[2]\n" + "udot z22.s, z10.b, z5.b[2]\n" + "udot z26.s, z10.b, z6.b[2]\n" + "udot z19.s, z11.b, z4.b[2]\n" + "udot z23.s, z11.b, z5.b[2]\n" + "udot z27.s, z11.b, z6.b[2]\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 25f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 26f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 27f\n" + "26:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 27f\n" + "25:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z14.b, #0\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "27:\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "mov z12.b, #0\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "udot z16.s, z12.b, 
z4.b[3]\n" + "udot z20.s, z12.b, z5.b[3]\n" + "udot z24.s, z12.b, z6.b[3]\n" + "udot z17.s, z13.b, z4.b[3]\n" + "udot z21.s, z13.b, z5.b[3]\n" + "udot z25.s, z13.b, z6.b[3]\n" + "udot z18.s, z14.b, z4.b[3]\n" + "udot z22.s, z14.b, z5.b[3]\n" + "udot z26.s, z14.b, z6.b[3]\n" + "udot z19.s, z15.b, z4.b[3]\n" + "udot z23.s, z15.b, z5.b[3]\n" + "udot z27.s, z15.b, z6.b[3]\n" + "b 9f\n" + "24:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 28f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 29f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 30f\n" + "29:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 30f\n" + "28:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z10.b, #0\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "30:\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "mov z8.b, #0\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "udot z16.s, z8.b, z4.b[2]\n" + "udot z20.s, z8.b, z5.b[2]\n" + "udot z24.s, z8.b, z6.b[2]\n" + "udot z17.s, z9.b, z4.b[2]\n" + "udot z21.s, z9.b, z5.b[2]\n" + "udot z25.s, z9.b, z6.b[2]\n" + "udot z18.s, z10.b, z4.b[2]\n" + "udot z22.s, z10.b, z5.b[2]\n" + "udot z26.s, z10.b, z6.b[2]\n" + "udot z19.s, z11.b, z4.b[2]\n" + "udot z23.s, z11.b, z5.b[2]\n" + "udot z27.s, z11.b, z6.b[2]\n" + "b 9f\n" + "23:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 31f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 32f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, 
[%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 33f\n" + "32:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 33f\n" + "31:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z14.b, #0\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "33:\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "mov z12.b, #0\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "udot z16.s, z12.b, z4.b[1]\n" + "udot z20.s, z12.b, z5.b[1]\n" + "udot z24.s, z12.b, z6.b[1]\n" + "udot z17.s, z13.b, z4.b[1]\n" + "udot z21.s, z13.b, z5.b[1]\n" + "udot z25.s, z13.b, z6.b[1]\n" + "udot z18.s, z14.b, z4.b[1]\n" + "udot z22.s, z14.b, z5.b[1]\n" + "udot z26.s, z14.b, z6.b[1]\n" + "udot z19.s, z15.b, z4.b[1]\n" + "udot z23.s, z15.b, z5.b[1]\n" + "udot z27.s, z15.b, z6.b[1]\n" + "b 9f\n" + "22:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 34f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 35f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 36f\n" + "35:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 36f\n" + "34:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z10.b, #0\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "36:\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "mov z8.b, #0\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, 
z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "udot z16.s, z8.b, z4.b[0]\n" + "udot z20.s, z8.b, z5.b[0]\n" + "udot z24.s, z8.b, z6.b[0]\n" + "udot z17.s, z9.b, z4.b[0]\n" + "udot z21.s, z9.b, z5.b[0]\n" + "udot z25.s, z9.b, z6.b[0]\n" + "udot z18.s, z10.b, z4.b[0]\n" + "udot z22.s, z10.b, z5.b[0]\n" + "udot z26.s, z10.b, z6.b[0]\n" + "udot z19.s, z11.b, z4.b[0]\n" + "udot z23.s, z11.b, z5.b[0]\n" + "udot z27.s, z11.b, z6.b[0]\n" + "9:\n" + "st1w z16.s, p0, [%[c_ptr0]]\n" + "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n" + "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n" + "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n" + "addvl %[c_ptr0], %[c_ptr0], #4\n" + "st1w z20.s, p0, [c_ptr1]\n" + "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n" + "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n" + "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n" + "st1w z24.s, p0, [c_ptr2]\n" + "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n" + "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n" + "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [b_ptr1] "+r" (b_ptr1), [b_ptr2] "+r" (b_ptr2), [b_ptr3] "+r" (b_ptr3), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb) + : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory" + ); + break; + default: + case 4: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.b, %[temp], 
%[leftovers]\n" + "whilelt p0.s, %[temp], %[width]\n" + "whilelt p4.b, %[temp], %[width]\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "incw %[temp], all, mul #1\n" + "ptrue p7.b\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "whilelt p1.s, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "whilelt p2.s, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "whilelt p3.s, %[temp], %[width]\n" + "cbz %[beta0], 1f\n" + "mov z16.s, #0\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" + "mov z17.s, #0\n" + "ld1rqb z1.b, p7/z, [a_ptr1]\n" + "mov z18.s, #0\n" + "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "mov z19.s, #0\n" + "ld1rqb z3.b, p7/z, [a_ptr3]\n" + "mov z20.s, #0\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "mov z21.s, #0\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "mov z22.s, #0\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "mov z23.s, #0\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "mov z24.s, #0\n" + "add a_ptr1, a_ptr1, #0x10\n" + "zip2 z11.b, z8.b, z9.b\n" + "add a_ptr2, a_ptr2, #0x10\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "mov z25.s, #0\n" + "add a_ptr3, a_ptr3, #0x10\n" + "mov z26.s, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "zip2 z12.b, z10.b, z8.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z10.b, z10.b, z8.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "mov z27.s, #0\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "mov z28.s, #0\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "zip1 z8.b, z9.b, z10.b\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 2f\n" + "1:\n" + "ld1rw z15.s, p7/z, [%[betaptr]]\n" + "ld1w z16.s, p0/z, [%[c_ptr0]]\n" + "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" + "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n" + "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n" + "ld1w z20.s, p0/z, [c_ptr1]\n" + "mul z16.s, p7/m, 
z16.s, z15.s\n" + "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n" + "mul z17.s, p7/m, z17.s, z15.s\n" + "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n" + "mul z18.s, p7/m, z18.s, z15.s\n" + "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n" + "mul z19.s, p7/m, z19.s, z15.s\n" + "ld1w z24.s, p0/z, [c_ptr2]\n" + "mul z20.s, p7/m, z20.s, z15.s\n" + "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n" + "mul z21.s, p7/m, z21.s, z15.s\n" + "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n" + "mul z22.s, p7/m, z22.s, z15.s\n" + "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n" + "mul z23.s, p7/m, z23.s, z15.s\n" + "ld1w z28.s, p0/z, [c_ptr3]\n" + "mul z24.s, p7/m, z24.s, z15.s\n" + "ld1w z29.s, p1/z, [c_ptr3, #1, MUL VL]\n" + "mul z25.s, p7/m, z25.s, z15.s\n" + "ld1w z30.s, p2/z, [c_ptr3, #2, MUL VL]\n" + "mul z26.s, p7/m, z26.s, z15.s\n" + "ld1w z31.s, p3/z, [c_ptr3, #3, MUL VL]\n" + "mul z27.s, p7/m, z27.s, z15.s\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" + "mul z28.s, p7/m, z28.s, z15.s\n" + "ld1rqb z1.b, p7/z, [a_ptr1]\n" + "mul z29.s, p7/m, z29.s, z15.s\n" + "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "mul z30.s, p7/m, z30.s, z15.s\n" + "ld1rqb z3.b, p7/z, [a_ptr3]\n" + "mul z31.s, p7/m, z31.s, z15.s\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "add a_ptr1, a_ptr1, #0x10\n" + "add a_ptr2, a_ptr2, #0x10\n" + "zip2 z11.b, z8.b, z9.b\n" + "add a_ptr3, a_ptr3, #0x10\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "2:\n" + "cbz %[loops], 3f\n" + "4:\n" + "zip2 z15.b, 
z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "udot z16.s, z8.b, z0.b[0]\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "udot z20.s, z8.b, z1.b[0]\n" + "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" + "udot z24.s, z8.b, z2.b[0]\n" + "ld1rqb z5.b, p7/z, [a_ptr1]\n" + "udot z28.s, z8.b, z3.b[0]\n" + "ld1rqb z6.b, p7/z, [a_ptr2]\n" + "udot z17.s, z9.b, z0.b[0]\n" + "ld1rqb z7.b, p7/z, [a_ptr3]\n" + "zip2 z8.b, z14.b, z12.b\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "zip1 z14.b, z14.b, z12.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "udot z21.s, z9.b, z1.b[0]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "udot z25.s, z9.b, z2.b[0]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "zip1 z12.b, z13.b, z14.b\n" + "add %[a_ptr0], %[a_ptr0], #0x20\n" + "zip2 z13.b, z13.b, z14.b\n" + "add a_ptr1, a_ptr1, #0x20\n" + "zip1 z14.b, z15.b, z8.b\n" + "add a_ptr2, a_ptr2, #0x20\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "udot z29.s, z9.b, z3.b[0]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "udot z18.s, z10.b, z0.b[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z22.s, z10.b, z1.b[0]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "udot z26.s, z10.b, z2.b[0]\n" + "add a_ptr3, a_ptr3, #0x20\n" + "udot z30.s, z10.b, z3.b[0]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "udot z19.s, z11.b, z0.b[0]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "udot z23.s, z11.b, z1.b[0]\n" + "subs %[loops], %[loops], #0x1\n" + "udot z27.s, z11.b, z2.b[0]\n" + "udot z31.s, z11.b, z3.b[0]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "udot z16.s, z12.b, z0.b[1]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "udot z20.s, z12.b, z1.b[1]\n" + "udot z24.s, z12.b, z2.b[1]\n" + "udot z28.s, z12.b, z3.b[1]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "udot z17.s, z13.b, z0.b[1]\n" + "udot z21.s, z13.b, z1.b[1]\n" + "udot z25.s, z13.b, z2.b[1]\n" + "udot z29.s, z13.b, z3.b[1]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "zip1 z8.b, z9.b, z10.b\n" + 
"add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "udot z18.s, z14.b, z0.b[1]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z22.s, z14.b, z1.b[1]\n" + "udot z26.s, z14.b, z2.b[1]\n" + "udot z30.s, z14.b, z3.b[1]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "udot z19.s, z15.b, z0.b[1]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "udot z23.s, z15.b, z1.b[1]\n" + "udot z27.s, z15.b, z2.b[1]\n" + "udot z31.s, z15.b, z3.b[1]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "udot z16.s, z8.b, z0.b[2]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "udot z20.s, z8.b, z1.b[2]\n" + "udot z24.s, z8.b, z2.b[2]\n" + "udot z28.s, z8.b, z3.b[2]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "udot z17.s, z9.b, z0.b[2]\n" + "udot z21.s, z9.b, z1.b[2]\n" + "udot z25.s, z9.b, z2.b[2]\n" + "udot z29.s, z9.b, z3.b[2]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "zip1 z12.b, z13.b, z14.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "udot z18.s, z10.b, z0.b[2]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z22.s, z10.b, z1.b[2]\n" + "udot z26.s, z10.b, z2.b[2]\n" + "udot z30.s, z10.b, z3.b[2]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "udot z19.s, z11.b, z0.b[2]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "udot z23.s, z11.b, z1.b[2]\n" + "udot z27.s, z11.b, z2.b[2]\n" + "udot z31.s, z11.b, z3.b[2]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "udot z16.s, z12.b, z0.b[3]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "udot z20.s, z12.b, z1.b[3]\n" + "udot z24.s, z12.b, z2.b[3]\n" + "udot z28.s, z12.b, z3.b[3]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "udot z17.s, z13.b, z0.b[3]\n" + "udot z21.s, z13.b, z1.b[3]\n" + 
"udot z25.s, z13.b, z2.b[3]\n" + "udot z29.s, z13.b, z3.b[3]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "zip1 z8.b, z9.b, z10.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "udot z18.s, z14.b, z0.b[3]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z22.s, z14.b, z1.b[3]\n" + "udot z26.s, z14.b, z2.b[3]\n" + "udot z30.s, z14.b, z3.b[3]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "udot z19.s, z15.b, z0.b[3]\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n" + "udot z23.s, z15.b, z1.b[3]\n" + "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n" + "udot z27.s, z15.b, z2.b[3]\n" + "ld1rqb z2.b, p7/z, [a_ptr2, #-0x10]\n" + "udot z31.s, z15.b, z3.b[3]\n" + "ld1rqb z3.b, p7/z, [a_ptr3, #-0x10]\n" + "zip2 z15.b, z12.b, z13.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "udot z16.s, z8.b, z4.b[0]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "udot z20.s, z8.b, z5.b[0]\n" + "udot z24.s, z8.b, z6.b[0]\n" + "udot z28.s, z8.b, z7.b[0]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "udot z17.s, z9.b, z4.b[0]\n" + "udot z21.s, z9.b, z5.b[0]\n" + "udot z25.s, z9.b, z6.b[0]\n" + "udot z29.s, z9.b, z7.b[0]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "zip1 z12.b, z13.b, z14.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "udot z18.s, z10.b, z4.b[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z22.s, z10.b, z5.b[0]\n" + "udot z26.s, z10.b, z6.b[0]\n" + "udot z30.s, z10.b, z7.b[0]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "udot z19.s, z11.b, z4.b[0]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "udot z23.s, z11.b, z5.b[0]\n" + "udot z27.s, z11.b, z6.b[0]\n" + "udot z31.s, z11.b, z7.b[0]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" 
+ "udot z16.s, z12.b, z4.b[1]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "udot z20.s, z12.b, z5.b[1]\n" + "udot z24.s, z12.b, z6.b[1]\n" + "udot z28.s, z12.b, z7.b[1]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "udot z17.s, z13.b, z4.b[1]\n" + "udot z21.s, z13.b, z5.b[1]\n" + "udot z25.s, z13.b, z6.b[1]\n" + "udot z29.s, z13.b, z7.b[1]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "zip1 z8.b, z9.b, z10.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "udot z18.s, z14.b, z4.b[1]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z22.s, z14.b, z5.b[1]\n" + "udot z26.s, z14.b, z6.b[1]\n" + "udot z30.s, z14.b, z7.b[1]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "udot z19.s, z15.b, z4.b[1]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "udot z23.s, z15.b, z5.b[1]\n" + "udot z27.s, z15.b, z6.b[1]\n" + "udot z31.s, z15.b, z7.b[1]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "udot z16.s, z8.b, z4.b[2]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "udot z20.s, z8.b, z5.b[2]\n" + "udot z24.s, z8.b, z6.b[2]\n" + "udot z28.s, z8.b, z7.b[2]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "udot z17.s, z9.b, z4.b[2]\n" + "udot z21.s, z9.b, z5.b[2]\n" + "udot z25.s, z9.b, z6.b[2]\n" + "udot z29.s, z9.b, z7.b[2]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "zip1 z12.b, z13.b, z14.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "udot z18.s, z10.b, z4.b[2]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z22.s, z10.b, z5.b[2]\n" + "udot z26.s, z10.b, z6.b[2]\n" + "udot z30.s, z10.b, z7.b[2]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "udot z19.s, z11.b, z4.b[2]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "udot z23.s, z11.b, z5.b[2]\n" + "udot z27.s, z11.b, 
z6.b[2]\n" + "udot z31.s, z11.b, z7.b[2]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "udot z16.s, z12.b, z4.b[3]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "udot z20.s, z12.b, z5.b[3]\n" + "udot z24.s, z12.b, z6.b[3]\n" + "udot z28.s, z12.b, z7.b[3]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "udot z17.s, z13.b, z4.b[3]\n" + "udot z21.s, z13.b, z5.b[3]\n" + "udot z25.s, z13.b, z6.b[3]\n" + "udot z29.s, z13.b, z7.b[3]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "udot z18.s, z14.b, z4.b[3]\n" + "udot z22.s, z14.b, z5.b[3]\n" + "udot z26.s, z14.b, z6.b[3]\n" + "udot z30.s, z14.b, z7.b[3]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "udot z19.s, z15.b, z4.b[3]\n" + "udot z23.s, z15.b, z5.b[3]\n" + "udot z27.s, z15.b, z6.b[3]\n" + "udot z31.s, z15.b, z7.b[3]\n" + "b.ne 4b\n" + "3:\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "cbz %[regs], 5f\n" + "udot z16.s, z8.b, z0.b[0]\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "udot z20.s, z8.b, z1.b[0]\n" + "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" + "udot z24.s, z8.b, z2.b[0]\n" + "ld1rqb z5.b, p7/z, [a_ptr1]\n" + "udot z28.s, z8.b, z3.b[0]\n" + "ld1rqb z6.b, p7/z, [a_ptr2]\n" + "udot z17.s, z9.b, z0.b[0]\n" + "ld1rqb z7.b, p7/z, [a_ptr3]\n" + "zip2 z8.b, z14.b, z12.b\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "zip1 z14.b, z14.b, z12.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "udot z21.s, z9.b, z1.b[0]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "udot z25.s, z9.b, z2.b[0]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "udot z29.s, z9.b, z3.b[0]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "udot z18.s, z10.b, z0.b[0]\n" + "add %[b_ptr0], %[b_ptr0], 
%[ldb]\n" + "udot z22.s, z10.b, z1.b[0]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "udot z26.s, z10.b, z2.b[0]\n" + "udot z30.s, z10.b, z3.b[0]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "udot z19.s, z11.b, z0.b[0]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "udot z23.s, z11.b, z1.b[0]\n" + "udot z27.s, z11.b, z2.b[0]\n" + "udot z31.s, z11.b, z3.b[0]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "udot z16.s, z12.b, z0.b[1]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "udot z20.s, z12.b, z1.b[1]\n" + "udot z24.s, z12.b, z2.b[1]\n" + "udot z28.s, z12.b, z3.b[1]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "udot z17.s, z13.b, z0.b[1]\n" + "udot z21.s, z13.b, z1.b[1]\n" + "udot z25.s, z13.b, z2.b[1]\n" + "udot z29.s, z13.b, z3.b[1]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "zip1 z8.b, z9.b, z10.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "udot z18.s, z14.b, z0.b[1]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z22.s, z14.b, z1.b[1]\n" + "udot z26.s, z14.b, z2.b[1]\n" + "udot z30.s, z14.b, z3.b[1]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "udot z19.s, z15.b, z0.b[1]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "udot z23.s, z15.b, z1.b[1]\n" + "udot z27.s, z15.b, z2.b[1]\n" + "udot z31.s, z15.b, z3.b[1]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "udot z16.s, z8.b, z0.b[2]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "udot z20.s, z8.b, z1.b[2]\n" + "udot z24.s, z8.b, z2.b[2]\n" + "udot z28.s, z8.b, z3.b[2]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "udot z17.s, z9.b, z0.b[2]\n" + "udot z21.s, z9.b, z1.b[2]\n" + "udot z25.s, z9.b, z2.b[2]\n" + "udot z29.s, z9.b, z3.b[2]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "zip1 z12.b, z13.b, z14.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip2 z13.b, z13.b, 
z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "udot z18.s, z10.b, z0.b[2]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z22.s, z10.b, z1.b[2]\n" + "udot z26.s, z10.b, z2.b[2]\n" + "udot z30.s, z10.b, z3.b[2]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "udot z19.s, z11.b, z0.b[2]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "udot z23.s, z11.b, z1.b[2]\n" + "udot z27.s, z11.b, z2.b[2]\n" + "udot z31.s, z11.b, z3.b[2]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "udot z16.s, z12.b, z0.b[3]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "udot z20.s, z12.b, z1.b[3]\n" + "udot z24.s, z12.b, z2.b[3]\n" + "udot z28.s, z12.b, z3.b[3]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "udot z17.s, z13.b, z0.b[3]\n" + "udot z21.s, z13.b, z1.b[3]\n" + "udot z25.s, z13.b, z2.b[3]\n" + "udot z29.s, z13.b, z3.b[3]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "zip1 z8.b, z9.b, z10.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "udot z18.s, z14.b, z0.b[3]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z22.s, z14.b, z1.b[3]\n" + "udot z26.s, z14.b, z2.b[3]\n" + "udot z30.s, z14.b, z3.b[3]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "udot z19.s, z15.b, z0.b[3]\n" + "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n" + "udot z23.s, z15.b, z1.b[3]\n" + "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n" + "udot z27.s, z15.b, z2.b[3]\n" + "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n" + "udot z31.s, z15.b, z3.b[3]\n" + "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n" + "zip2 z15.b, z12.b, z13.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "udot z16.s, z8.b, z4.b[0]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "udot z20.s, z8.b, z5.b[0]\n" + "udot z24.s, z8.b, z6.b[0]\n" + "udot z28.s, z8.b, z7.b[0]\n" + "zip2 z8.b, z14.b, 
z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "udot z17.s, z9.b, z4.b[0]\n" + "udot z21.s, z9.b, z5.b[0]\n" + "udot z25.s, z9.b, z6.b[0]\n" + "udot z29.s, z9.b, z7.b[0]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "zip1 z12.b, z13.b, z14.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "udot z18.s, z10.b, z4.b[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z22.s, z10.b, z5.b[0]\n" + "udot z26.s, z10.b, z6.b[0]\n" + "udot z30.s, z10.b, z7.b[0]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "udot z19.s, z11.b, z4.b[0]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "udot z23.s, z11.b, z5.b[0]\n" + "udot z27.s, z11.b, z6.b[0]\n" + "udot z31.s, z11.b, z7.b[0]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "udot z16.s, z12.b, z4.b[1]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "udot z20.s, z12.b, z5.b[1]\n" + "udot z24.s, z12.b, z6.b[1]\n" + "udot z28.s, z12.b, z7.b[1]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "udot z17.s, z13.b, z4.b[1]\n" + "udot z21.s, z13.b, z5.b[1]\n" + "udot z25.s, z13.b, z6.b[1]\n" + "udot z29.s, z13.b, z7.b[1]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "udot z18.s, z14.b, z4.b[1]\n" + "udot z22.s, z14.b, z5.b[1]\n" + "udot z26.s, z14.b, z6.b[1]\n" + "udot z30.s, z14.b, z7.b[1]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "udot z19.s, z15.b, z4.b[1]\n" + "udot z23.s, z15.b, z5.b[1]\n" + "udot z27.s, z15.b, z6.b[1]\n" + "udot z31.s, z15.b, z7.b[1]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "udot z16.s, z8.b, z4.b[2]\n" + "udot z20.s, z8.b, z5.b[2]\n" + "udot z24.s, z8.b, z6.b[2]\n" + "udot z28.s, z8.b, z7.b[2]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, 
z14.b, z12.b\n" + "udot z17.s, z9.b, z4.b[2]\n" + "udot z21.s, z9.b, z5.b[2]\n" + "udot z25.s, z9.b, z6.b[2]\n" + "udot z29.s, z9.b, z7.b[2]\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "udot z18.s, z10.b, z4.b[2]\n" + "udot z22.s, z10.b, z5.b[2]\n" + "udot z26.s, z10.b, z6.b[2]\n" + "udot z30.s, z10.b, z7.b[2]\n" + "udot z19.s, z11.b, z4.b[2]\n" + "udot z23.s, z11.b, z5.b[2]\n" + "udot z27.s, z11.b, z6.b[2]\n" + "udot z31.s, z11.b, z7.b[2]\n" + "udot z16.s, z12.b, z4.b[3]\n" + "udot z20.s, z12.b, z5.b[3]\n" + "udot z24.s, z12.b, z6.b[3]\n" + "udot z28.s, z12.b, z7.b[3]\n" + "udot z17.s, z13.b, z4.b[3]\n" + "udot z21.s, z13.b, z5.b[3]\n" + "udot z25.s, z13.b, z6.b[3]\n" + "udot z29.s, z13.b, z7.b[3]\n" + "udot z18.s, z14.b, z4.b[3]\n" + "udot z22.s, z14.b, z5.b[3]\n" + "udot z26.s, z14.b, z6.b[3]\n" + "udot z30.s, z14.b, z7.b[3]\n" + "udot z19.s, z15.b, z4.b[3]\n" + "udot z23.s, z15.b, z5.b[3]\n" + "udot z27.s, z15.b, z6.b[3]\n" + "udot z31.s, z15.b, z7.b[3]\n" + "cbz %[blocks], 6f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "udot z16.s, z8.b, z0.b[0]\n" + "udot z20.s, z8.b, z1.b[0]\n" + "udot z24.s, z8.b, z2.b[0]\n" + "udot z28.s, z8.b, z3.b[0]\n" + "udot z17.s, z9.b, z0.b[0]\n" + "udot z21.s, z9.b, z1.b[0]\n" + "udot z25.s, z9.b, z2.b[0]\n" + "udot z29.s, z9.b, z3.b[0]\n" + "udot z18.s, z10.b, z0.b[0]\n" + "udot z22.s, z10.b, z1.b[0]\n" + "udot z26.s, z10.b, 
z2.b[0]\n" + "udot z30.s, z10.b, z3.b[0]\n" + "udot z19.s, z11.b, z0.b[0]\n" + "udot z23.s, z11.b, z1.b[0]\n" + "udot z27.s, z11.b, z2.b[0]\n" + "udot z31.s, z11.b, z3.b[0]\n" + "b.eq 7f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "udot z16.s, z12.b, z0.b[1]\n" + "udot z20.s, z12.b, z1.b[1]\n" + "udot z24.s, z12.b, z2.b[1]\n" + "udot z28.s, z12.b, z3.b[1]\n" + "udot z17.s, z13.b, z0.b[1]\n" + "udot z21.s, z13.b, z1.b[1]\n" + "udot z25.s, z13.b, z2.b[1]\n" + "udot z29.s, z13.b, z3.b[1]\n" + "udot z18.s, z14.b, z0.b[1]\n" + "udot z22.s, z14.b, z1.b[1]\n" + "udot z26.s, z14.b, z2.b[1]\n" + "udot z30.s, z14.b, z3.b[1]\n" + "udot z19.s, z15.b, z0.b[1]\n" + "udot z23.s, z15.b, z1.b[1]\n" + "udot z27.s, z15.b, z2.b[1]\n" + "udot z31.s, z15.b, z3.b[1]\n" + "b.eq 8f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "udot z16.s, z8.b, z0.b[2]\n" + "udot z20.s, z8.b, z1.b[2]\n" + "udot z24.s, z8.b, z2.b[2]\n" + "udot z28.s, z8.b, z3.b[2]\n" + "udot z17.s, z9.b, 
z0.b[2]\n" + "udot z21.s, z9.b, z1.b[2]\n" + "udot z25.s, z9.b, z2.b[2]\n" + "udot z29.s, z9.b, z3.b[2]\n" + "udot z18.s, z10.b, z0.b[2]\n" + "udot z22.s, z10.b, z1.b[2]\n" + "udot z26.s, z10.b, z2.b[2]\n" + "udot z30.s, z10.b, z3.b[2]\n" + "udot z19.s, z11.b, z0.b[2]\n" + "udot z23.s, z11.b, z1.b[2]\n" + "udot z27.s, z11.b, z2.b[2]\n" + "udot z31.s, z11.b, z3.b[2]\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 10f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 11f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 12f\n" + "11:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 12f\n" + "10:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z14.b, #0\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "12:\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "mov z12.b, #0\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "udot z16.s, z12.b, z0.b[3]\n" + "udot z20.s, z12.b, z1.b[3]\n" + "udot z24.s, z12.b, z2.b[3]\n" + "udot z28.s, z12.b, z3.b[3]\n" + "udot z17.s, z13.b, z0.b[3]\n" + "udot z21.s, z13.b, z1.b[3]\n" + "udot z25.s, z13.b, z2.b[3]\n" + "udot z29.s, z13.b, z3.b[3]\n" + "udot z18.s, z14.b, z0.b[3]\n" + "udot z22.s, z14.b, z1.b[3]\n" + "udot z26.s, z14.b, z2.b[3]\n" + "udot z30.s, z14.b, z3.b[3]\n" + "udot z19.s, z15.b, z0.b[3]\n" + "udot z23.s, z15.b, z1.b[3]\n" + "udot z27.s, z15.b, z2.b[3]\n" + "udot z31.s, z15.b, z3.b[3]\n" + "b 9f\n" + "8:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 13f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 14f\n" + "add 
%[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 15f\n" + "14:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 15f\n" + "13:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z10.b, #0\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "15:\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "mov z8.b, #0\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "udot z16.s, z8.b, z0.b[2]\n" + "udot z20.s, z8.b, z1.b[2]\n" + "udot z24.s, z8.b, z2.b[2]\n" + "udot z28.s, z8.b, z3.b[2]\n" + "udot z17.s, z9.b, z0.b[2]\n" + "udot z21.s, z9.b, z1.b[2]\n" + "udot z25.s, z9.b, z2.b[2]\n" + "udot z29.s, z9.b, z3.b[2]\n" + "udot z18.s, z10.b, z0.b[2]\n" + "udot z22.s, z10.b, z1.b[2]\n" + "udot z26.s, z10.b, z2.b[2]\n" + "udot z30.s, z10.b, z3.b[2]\n" + "udot z19.s, z11.b, z0.b[2]\n" + "udot z23.s, z11.b, z1.b[2]\n" + "udot z27.s, z11.b, z2.b[2]\n" + "udot z31.s, z11.b, z3.b[2]\n" + "b 9f\n" + "7:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 16f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 17f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 18f\n" + "17:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 18f\n" + "16:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z14.b, #0\n" + "ld1b z12.b, 
p4/z, [%[b_ptr0]]\n" + "18:\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "mov z12.b, #0\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "udot z16.s, z12.b, z0.b[1]\n" + "udot z20.s, z12.b, z1.b[1]\n" + "udot z24.s, z12.b, z2.b[1]\n" + "udot z28.s, z12.b, z3.b[1]\n" + "udot z17.s, z13.b, z0.b[1]\n" + "udot z21.s, z13.b, z1.b[1]\n" + "udot z25.s, z13.b, z2.b[1]\n" + "udot z29.s, z13.b, z3.b[1]\n" + "udot z18.s, z14.b, z0.b[1]\n" + "udot z22.s, z14.b, z1.b[1]\n" + "udot z26.s, z14.b, z2.b[1]\n" + "udot z30.s, z14.b, z3.b[1]\n" + "udot z19.s, z15.b, z0.b[1]\n" + "udot z23.s, z15.b, z1.b[1]\n" + "udot z27.s, z15.b, z2.b[1]\n" + "udot z31.s, z15.b, z3.b[1]\n" + "b 9f\n" + "6:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 19f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 20f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 21f\n" + "20:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 21f\n" + "19:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z10.b, #0\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "21:\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "mov z8.b, #0\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "udot z16.s, z8.b, z0.b[0]\n" + "udot z20.s, z8.b, z1.b[0]\n" + "udot z24.s, z8.b, z2.b[0]\n" + "udot z28.s, z8.b, z3.b[0]\n" + "udot z17.s, z9.b, z0.b[0]\n" + "udot z21.s, z9.b, z1.b[0]\n" + "udot z25.s, z9.b, 
z2.b[0]\n" + "udot z29.s, z9.b, z3.b[0]\n" + "udot z18.s, z10.b, z0.b[0]\n" + "udot z22.s, z10.b, z1.b[0]\n" + "udot z26.s, z10.b, z2.b[0]\n" + "udot z30.s, z10.b, z3.b[0]\n" + "udot z19.s, z11.b, z0.b[0]\n" + "udot z23.s, z11.b, z1.b[0]\n" + "udot z27.s, z11.b, z2.b[0]\n" + "udot z31.s, z11.b, z3.b[0]\n" + "b 9f\n" + "5:\n" + "udot z16.s, z8.b, z0.b[0]\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "udot z20.s, z8.b, z1.b[0]\n" + "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n" + "udot z24.s, z8.b, z2.b[0]\n" + "ld1rqb z5.b, p6/z, [a_ptr1]\n" + "udot z28.s, z8.b, z3.b[0]\n" + "ld1rqb z6.b, p6/z, [a_ptr2]\n" + "udot z17.s, z9.b, z0.b[0]\n" + "ld1rqb z7.b, p6/z, [a_ptr3]\n" + "zip2 z8.b, z14.b, z12.b\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "zip1 z14.b, z14.b, z12.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "udot z21.s, z9.b, z1.b[0]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "udot z25.s, z9.b, z2.b[0]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "udot z29.s, z9.b, z3.b[0]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "udot z18.s, z10.b, z0.b[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z22.s, z10.b, z1.b[0]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "udot z26.s, z10.b, z2.b[0]\n" + "udot z30.s, z10.b, z3.b[0]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "udot z19.s, z11.b, z0.b[0]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "udot z23.s, z11.b, z1.b[0]\n" + "udot z27.s, z11.b, z2.b[0]\n" + "udot z31.s, z11.b, z3.b[0]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "udot z16.s, z12.b, z0.b[1]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "udot z20.s, z12.b, z1.b[1]\n" + "udot z24.s, z12.b, z2.b[1]\n" + "udot z28.s, z12.b, z3.b[1]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "udot z17.s, z13.b, z0.b[1]\n" + "udot z21.s, z13.b, z1.b[1]\n" + "udot z25.s, z13.b, 
z2.b[1]\n" + "udot z29.s, z13.b, z3.b[1]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "udot z18.s, z14.b, z0.b[1]\n" + "udot z22.s, z14.b, z1.b[1]\n" + "udot z26.s, z14.b, z2.b[1]\n" + "udot z30.s, z14.b, z3.b[1]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "udot z19.s, z15.b, z0.b[1]\n" + "udot z23.s, z15.b, z1.b[1]\n" + "udot z27.s, z15.b, z2.b[1]\n" + "udot z31.s, z15.b, z3.b[1]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "udot z16.s, z8.b, z0.b[2]\n" + "udot z20.s, z8.b, z1.b[2]\n" + "udot z24.s, z8.b, z2.b[2]\n" + "udot z28.s, z8.b, z3.b[2]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "udot z17.s, z9.b, z0.b[2]\n" + "udot z21.s, z9.b, z1.b[2]\n" + "udot z25.s, z9.b, z2.b[2]\n" + "udot z29.s, z9.b, z3.b[2]\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "udot z18.s, z10.b, z0.b[2]\n" + "udot z22.s, z10.b, z1.b[2]\n" + "udot z26.s, z10.b, z2.b[2]\n" + "udot z30.s, z10.b, z3.b[2]\n" + "udot z19.s, z11.b, z0.b[2]\n" + "udot z23.s, z11.b, z1.b[2]\n" + "udot z27.s, z11.b, z2.b[2]\n" + "udot z31.s, z11.b, z3.b[2]\n" + "udot z16.s, z12.b, z0.b[3]\n" + "udot z20.s, z12.b, z1.b[3]\n" + "udot z24.s, z12.b, z2.b[3]\n" + "udot z28.s, z12.b, z3.b[3]\n" + "udot z17.s, z13.b, z0.b[3]\n" + "udot z21.s, z13.b, z1.b[3]\n" + "udot z25.s, z13.b, z2.b[3]\n" + "udot z29.s, z13.b, z3.b[3]\n" + "udot z18.s, z14.b, z0.b[3]\n" + "udot z22.s, z14.b, z1.b[3]\n" + "udot z26.s, z14.b, z2.b[3]\n" + "udot z30.s, z14.b, z3.b[3]\n" + "udot z19.s, z15.b, z0.b[3]\n" + "udot z23.s, z15.b, z1.b[3]\n" + "udot z27.s, z15.b, z2.b[3]\n" + "udot z31.s, z15.b, z3.b[3]\n" + "cbz %[blocks], 22f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], 
%[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "udot z16.s, z8.b, z4.b[0]\n" + "udot z20.s, z8.b, z5.b[0]\n" + "udot z24.s, z8.b, z6.b[0]\n" + "udot z28.s, z8.b, z7.b[0]\n" + "udot z17.s, z9.b, z4.b[0]\n" + "udot z21.s, z9.b, z5.b[0]\n" + "udot z25.s, z9.b, z6.b[0]\n" + "udot z29.s, z9.b, z7.b[0]\n" + "udot z18.s, z10.b, z4.b[0]\n" + "udot z22.s, z10.b, z5.b[0]\n" + "udot z26.s, z10.b, z6.b[0]\n" + "udot z30.s, z10.b, z7.b[0]\n" + "udot z19.s, z11.b, z4.b[0]\n" + "udot z23.s, z11.b, z5.b[0]\n" + "udot z27.s, z11.b, z6.b[0]\n" + "udot z31.s, z11.b, z7.b[0]\n" + "b.eq 23f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "udot z16.s, z12.b, z4.b[1]\n" + "udot z20.s, z12.b, z5.b[1]\n" + "udot z24.s, z12.b, z6.b[1]\n" + "udot z28.s, z12.b, z7.b[1]\n" + "udot z17.s, z13.b, z4.b[1]\n" + "udot z21.s, z13.b, z5.b[1]\n" + "udot z25.s, z13.b, z6.b[1]\n" + "udot z29.s, z13.b, z7.b[1]\n" + "udot z18.s, z14.b, z4.b[1]\n" + "udot z22.s, z14.b, z5.b[1]\n" + "udot z26.s, z14.b, z6.b[1]\n" + "udot z30.s, z14.b, z7.b[1]\n" + "udot z19.s, 
z15.b, z4.b[1]\n" + "udot z23.s, z15.b, z5.b[1]\n" + "udot z27.s, z15.b, z6.b[1]\n" + "udot z31.s, z15.b, z7.b[1]\n" + "b.eq 24f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "ld1b z8.b, p4/z, [%[b_ptr3]]\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "udot z16.s, z8.b, z4.b[2]\n" + "udot z20.s, z8.b, z5.b[2]\n" + "udot z24.s, z8.b, z6.b[2]\n" + "udot z28.s, z8.b, z7.b[2]\n" + "udot z17.s, z9.b, z4.b[2]\n" + "udot z21.s, z9.b, z5.b[2]\n" + "udot z25.s, z9.b, z6.b[2]\n" + "udot z29.s, z9.b, z7.b[2]\n" + "udot z18.s, z10.b, z4.b[2]\n" + "udot z22.s, z10.b, z5.b[2]\n" + "udot z26.s, z10.b, z6.b[2]\n" + "udot z30.s, z10.b, z7.b[2]\n" + "udot z19.s, z11.b, z4.b[2]\n" + "udot z23.s, z11.b, z5.b[2]\n" + "udot z27.s, z11.b, z6.b[2]\n" + "udot z31.s, z11.b, z7.b[2]\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 25f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 26f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 27f\n" + "26:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 27f\n" + "25:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z14.b, #0\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "27:\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "mov z12.b, #0\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, 
z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "udot z16.s, z12.b, z4.b[3]\n" + "udot z20.s, z12.b, z5.b[3]\n" + "udot z24.s, z12.b, z6.b[3]\n" + "udot z28.s, z12.b, z7.b[3]\n" + "udot z17.s, z13.b, z4.b[3]\n" + "udot z21.s, z13.b, z5.b[3]\n" + "udot z25.s, z13.b, z6.b[3]\n" + "udot z29.s, z13.b, z7.b[3]\n" + "udot z18.s, z14.b, z4.b[3]\n" + "udot z22.s, z14.b, z5.b[3]\n" + "udot z26.s, z14.b, z6.b[3]\n" + "udot z30.s, z14.b, z7.b[3]\n" + "udot z19.s, z15.b, z4.b[3]\n" + "udot z23.s, z15.b, z5.b[3]\n" + "udot z27.s, z15.b, z6.b[3]\n" + "udot z31.s, z15.b, z7.b[3]\n" + "b 9f\n" + "24:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 28f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 29f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 30f\n" + "29:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 30f\n" + "28:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z10.b, #0\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "30:\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "mov z8.b, #0\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "udot z16.s, z8.b, z4.b[2]\n" + "udot z20.s, z8.b, z5.b[2]\n" + "udot z24.s, z8.b, z6.b[2]\n" + "udot z28.s, z8.b, z7.b[2]\n" + "udot z17.s, z9.b, z4.b[2]\n" + "udot z21.s, z9.b, z5.b[2]\n" + "udot z25.s, z9.b, z6.b[2]\n" + "udot z29.s, z9.b, z7.b[2]\n" + "udot z18.s, z10.b, z4.b[2]\n" + "udot z22.s, z10.b, z5.b[2]\n" + "udot z26.s, z10.b, z6.b[2]\n" + "udot z30.s, z10.b, 
z7.b[2]\n" + "udot z19.s, z11.b, z4.b[2]\n" + "udot z23.s, z11.b, z5.b[2]\n" + "udot z27.s, z11.b, z6.b[2]\n" + "udot z31.s, z11.b, z7.b[2]\n" + "b 9f\n" + "23:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 31f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 32f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 33f\n" + "32:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "ld1b z14.b, p4/z, [%[b_ptr1]]\n" + "b 33f\n" + "31:\n" + "mov z13.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z14.b, #0\n" + "ld1b z12.b, p4/z, [%[b_ptr0]]\n" + "33:\n" + "zip2 z15.b, z12.b, z13.b\n" + "zip1 z13.b, z12.b, z13.b\n" + "mov z12.b, #0\n" + "zip2 z8.b, z14.b, z12.b\n" + "zip1 z14.b, z14.b, z12.b\n" + "zip1 z12.b, z13.b, z14.b\n" + "zip2 z13.b, z13.b, z14.b\n" + "zip1 z14.b, z15.b, z8.b\n" + "zip2 z15.b, z15.b, z8.b\n" + "udot z16.s, z12.b, z4.b[1]\n" + "udot z20.s, z12.b, z5.b[1]\n" + "udot z24.s, z12.b, z6.b[1]\n" + "udot z28.s, z12.b, z7.b[1]\n" + "udot z17.s, z13.b, z4.b[1]\n" + "udot z21.s, z13.b, z5.b[1]\n" + "udot z25.s, z13.b, z6.b[1]\n" + "udot z29.s, z13.b, z7.b[1]\n" + "udot z18.s, z14.b, z4.b[1]\n" + "udot z22.s, z14.b, z5.b[1]\n" + "udot z26.s, z14.b, z6.b[1]\n" + "udot z30.s, z14.b, z7.b[1]\n" + "udot z19.s, z15.b, z4.b[1]\n" + "udot z23.s, z15.b, z5.b[1]\n" + "udot z27.s, z15.b, z6.b[1]\n" + "udot z31.s, z15.b, z7.b[1]\n" + "b 9f\n" + "22:\n" + "cbz %[odds], 9f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 34f\n" + "subs %[odds], %[odds], #0x1\n" + "b.eq 35f\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "ld1b z10.b, p4/z, 
[%[b_ptr1]]\n" + "b 36f\n" + "35:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" + "b 36f\n" + "34:\n" + "mov z9.b, #0\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "mov z10.b, #0\n" + "ld1b z8.b, p4/z, [%[b_ptr0]]\n" + "36:\n" + "zip2 z11.b, z8.b, z9.b\n" + "zip1 z9.b, z8.b, z9.b\n" + "mov z8.b, #0\n" + "zip2 z12.b, z10.b, z8.b\n" + "zip1 z10.b, z10.b, z8.b\n" + "zip1 z8.b, z9.b, z10.b\n" + "zip2 z9.b, z9.b, z10.b\n" + "zip1 z10.b, z11.b, z12.b\n" + "zip2 z11.b, z11.b, z12.b\n" + "udot z16.s, z8.b, z4.b[0]\n" + "udot z20.s, z8.b, z5.b[0]\n" + "udot z24.s, z8.b, z6.b[0]\n" + "udot z28.s, z8.b, z7.b[0]\n" + "udot z17.s, z9.b, z4.b[0]\n" + "udot z21.s, z9.b, z5.b[0]\n" + "udot z25.s, z9.b, z6.b[0]\n" + "udot z29.s, z9.b, z7.b[0]\n" + "udot z18.s, z10.b, z4.b[0]\n" + "udot z22.s, z10.b, z5.b[0]\n" + "udot z26.s, z10.b, z6.b[0]\n" + "udot z30.s, z10.b, z7.b[0]\n" + "udot z19.s, z11.b, z4.b[0]\n" + "udot z23.s, z11.b, z5.b[0]\n" + "udot z27.s, z11.b, z6.b[0]\n" + "udot z31.s, z11.b, z7.b[0]\n" + "9:\n" + "st1w z16.s, p0, [%[c_ptr0]]\n" + "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n" + "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n" + "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n" + "addvl %[c_ptr0], %[c_ptr0], #4\n" + "st1w z20.s, p0, [c_ptr1]\n" + "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n" + "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n" + "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n" + "st1w z24.s, p0, [c_ptr2]\n" + "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n" + "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n" + "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n" + "st1w z28.s, p0, [c_ptr3]\n" + "st1w z29.s, p1, [c_ptr3, #1, MUL VL]\n" + "st1w z30.s, p2, [c_ptr3, #2, MUL VL]\n" + "st1w z31.s, p3, [c_ptr3, #3, MUL VL]\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), 
[b_ptr1] "+r" (b_ptr1), [b_ptr2] "+r" (b_ptr2), [b_ptr3] "+r" (b_ptr3), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb) + : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory" + ); + break; + } + } + } +} + +} // namespace arm_gemm + +#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_fp32_mla_1VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_fp32_mla_1VLx4.hpp new file mode 100644 index 0000000000..80b216ca14 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_fp32_mla_1VLx4.hpp @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __ARM_FEATURE_SVE
+
+
+
+namespace arm_gemm
+{
+
+// Actual kernel implementations
+//
+// Arguments, in the order named by the definition in
+// sve_smallK_fp32_mla_1VLx4/generic.cpp: (A, lda, B, ldb, C, ldc, beta, M, N, K).
+// The definition multiplies lda/ldb/ldc by sizeof(float) before use, so the
+// strides are passed in elements, not bytes.
+void sve_smallK_fp32_mla_1VLx4(const float *, int, const float *, int ldb, float *, int, float, int, int, int);
+
+// Descriptor for the SVE fp32 "smallK" MLA kernel: publishes the operand and
+// result types, the kernel function-pointer type, the blocking parameters and
+// a `kernel` member that defaults to the generic implementation.
+class smallK_fp32_mla_1VLx4
+{
+public:
+    typedef float operand_type;
+    typedef float result_type;
+
+    // Same signature as sve_smallK_fp32_mla_1VLx4 above.
+    typedef void (*kern_type)(const float *, int, const float *, int ldb, float *, int, float, int, int, int);
+
+    /* Kernel blocking parameters */
+
+    // Output rows per block; the generic kernel walks M in groups of 4.
+    static int out_height()
+    {
+        return 4;
+    }
+
+    // Output columns per block: one vector's worth of floats.
+    // The explicit <float> is required: the call has no arguments from which
+    // the element type could be deduced.
+    // NOTE(review): get_vector_length<T>() is declared outside this header;
+    // presumably it returns the number of T lanes in an SVE vector - confirm.
+    static int out_width()
+    {
+        return get_vector_length<float>() * 1;
+    }
+
+    // Unroll factor along K.
+    static int k_unroll()
+    {
+        return 1;
+    }
+
+
+
+    // Default to the generic kernel
+    kern_type kernel=sve_smallK_fp32_mla_1VLx4;
+
+    // `ci` is not consulted: only the generic kernel exists, so there is
+    // nothing to select per CPU.
+    smallK_fp32_mla_1VLx4(const CPUInfo *ci)
+    {
+
+    }
+};
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_fp32_mla_1VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_fp32_mla_1VLx4/generic.cpp
new file mode 100644
index 0000000000..e2cc1d14e2
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_fp32_mla_1VLx4/generic.cpp
@@ -0,0 +1,4264 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifdef __ARM_FEATURE_SVE + +#include + + +#include "../../asmlib.hpp" +#include "../../utils.hpp" + +namespace arm_gemm { + +void sve_smallK_fp32_mla_1VLx4(const float *A, int lda, const float *B, int ldb, float *C, int ldc, float beta, int M, int N, int K) { + const long beta0 = (beta == 0.0f); + + const long loops_count = M / 4; + const long oddrow_count = M % 4; + const long ldab = lda * sizeof(float); + const long ldcb = ldc * sizeof(float); + const long odd_depth = K % 4; + const float *betaptr = β + long ldbb = ldb * sizeof(float); + + for (int x0=0; x0() * 1)) { + const long width = std::min((unsigned long)N-x0, (get_vector_length() * 1)); + long loops = loops_count; + long oddrows = oddrow_count; + long temp = 0; + const float *b_ptr0 = B + x0; + + const float *a_ptr0 = A; + + float *c_ptr0 = C + x0; + + switch(K) { + case 1: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + "ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p0/z, [%[b_ptr0]]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0]]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "ld1rqw z1.s, p6/z, [a_ptr1]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "ld1rqw z2.s, p6/z, [a_ptr2]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "ld1rqw z3.s, p6/z, [a_ptr3]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], 
LSL #2\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "subs %[loops], %[loops], #0x1\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, %[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0]]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "subs %[oddrows], %[oddrows], #0x1\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 2: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + "ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add a_ptr3, 
a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "ld1w z5.s, p0/z, [%[b_ptr0]]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0]]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "ld1rqw z1.s, p6/z, [a_ptr1]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "ld1rqw z2.s, p6/z, [a_ptr2]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "ld1rqw z3.s, p6/z, [a_ptr3]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], LSL #2\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "subs %[loops], %[loops], #0x1\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + "fmla z30.s, z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, %[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0]]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "subs %[oddrows], %[oddrows], #0x1\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : 
[betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 3: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + "ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "ld1w z5.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z6.s, p0/z, [%[b_ptr0]]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0]]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "ld1rqw z1.s, p6/z, [a_ptr1]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "ld1rqw z2.s, p6/z, [a_ptr2]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "ld1rqw z3.s, p6/z, [a_ptr3]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], LSL #2\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "subs %[loops], %[loops], #0x1\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + "fmla z30.s, z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "fmla z28.s, z6.s, 
z0.s[2]\n" + "fmla z29.s, z6.s, z1.s[2]\n" + "fmla z30.s, z6.s, z2.s[2]\n" + "fmla z31.s, z6.s, z3.s[2]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, %[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0]]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "subs %[oddrows], %[oddrows], #0x1\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 4: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + "ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p0/z, 
[%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "ld1w z5.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z6.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z7.s, p0/z, [%[b_ptr0]]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], LSL #2\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "subs %[loops], %[loops], #0x1\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + "fmla z30.s, z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z29.s, z6.s, z1.s[2]\n" + "fmla z30.s, z6.s, z2.s[2]\n" + "fmla z31.s, z6.s, z3.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "fmla z29.s, z7.s, z1.s[3]\n" + "fmla z30.s, z7.s, z2.s[3]\n" + "fmla z31.s, z7.s, z3.s[3]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, %[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + 
"subs %[oddrows], %[oddrows], #0x1\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 5: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + "ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "ld1w z5.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z6.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z7.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, 
[c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + "fmla z30.s, z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z29.s, z6.s, z1.s[2]\n" + "fmla z30.s, z6.s, z2.s[2]\n" + "fmla z31.s, z6.s, z3.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "fmla z29.s, z7.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n" + "fmla z30.s, z7.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n" + "fmla z31.s, z7.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z8.s, z0.s[0]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "fmla z29.s, z8.s, z1.s[0]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z30.s, z8.s, z2.s[0]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "fmla z31.s, z8.s, z3.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], LSL #2\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, %[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[oddrows], %[oddrows], #0x1\n" + "ld1rqw z1.s, p6/z, [%[a_ptr0], #0x10]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "fmla z28.s, z8.s, z1.s[0]\n" + "st1w 
z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 6: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + "ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "ld1w z5.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z6.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z7.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z9.s, p0/z, [%[b_ptr0]]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs 
%[loops], %[loops], #0x1\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + "fmla z30.s, z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z29.s, z6.s, z1.s[2]\n" + "fmla z30.s, z6.s, z2.s[2]\n" + "fmla z31.s, z6.s, z3.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "fmla z29.s, z7.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n" + "fmla z30.s, z7.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n" + "fmla z31.s, z7.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z8.s, z0.s[0]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "fmla z29.s, z8.s, z1.s[0]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z30.s, z8.s, z2.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], LSL #2\n" + "fmla z31.s, z8.s, z3.s[0]\n" + "fmla z28.s, z9.s, z0.s[1]\n" + "fmla z29.s, z9.s, z1.s[1]\n" + "fmla z30.s, z9.s, z2.s[1]\n" + "fmla z31.s, z9.s, z3.s[1]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, %[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[oddrows], %[oddrows], #0x1\n" + "ld1rqw z1.s, p6/z, [%[a_ptr0], #0x10]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "fmla z28.s, 
z8.s, z1.s[0]\n" + "fmla z28.s, z9.s, z1.s[1]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 7: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + "ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "ld1w z5.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z6.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z7.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z9.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z10.s, p0/z, [%[b_ptr0]]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, 
[c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + "fmla z30.s, z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z29.s, z6.s, z1.s[2]\n" + "fmla z30.s, z6.s, z2.s[2]\n" + "fmla z31.s, z6.s, z3.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "fmla z29.s, z7.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n" + "fmla z30.s, z7.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n" + "fmla z31.s, z7.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z8.s, z0.s[0]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "fmla z29.s, z8.s, z1.s[0]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z30.s, z8.s, z2.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], LSL #2\n" + "fmla z31.s, z8.s, z3.s[0]\n" + "fmla z28.s, z9.s, z0.s[1]\n" + "fmla z29.s, z9.s, z1.s[1]\n" + "fmla z30.s, z9.s, z2.s[1]\n" + "fmla z31.s, z9.s, z3.s[1]\n" + "fmla z28.s, z10.s, z0.s[2]\n" + "fmla z29.s, z10.s, z1.s[2]\n" + "fmla z30.s, z10.s, z2.s[2]\n" + "fmla z31.s, z10.s, z3.s[2]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, %[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + 
"subs %[oddrows], %[oddrows], #0x1\n" + "ld1rqw z1.s, p6/z, [%[a_ptr0], #0x10]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "fmla z28.s, z8.s, z1.s[0]\n" + "fmla z28.s, z9.s, z1.s[1]\n" + "fmla z28.s, z10.s, z1.s[2]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 8: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + "ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "ld1w z5.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z6.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z7.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z9.s, p0/z, 
[%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z10.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z11.s, p0/z, [%[b_ptr0]]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + "fmla z30.s, z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z29.s, z6.s, z1.s[2]\n" + "fmla z30.s, z6.s, z2.s[2]\n" + "fmla z31.s, z6.s, z3.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "fmla z29.s, z7.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z30.s, z7.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z31.s, z7.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z8.s, z0.s[0]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "fmla z29.s, z8.s, z1.s[0]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z30.s, z8.s, z2.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], LSL #2\n" + "fmla z31.s, z8.s, z3.s[0]\n" + "fmla z28.s, z9.s, z0.s[1]\n" + "fmla z29.s, z9.s, z1.s[1]\n" + "fmla z30.s, z9.s, z2.s[1]\n" + "fmla z31.s, z9.s, z3.s[1]\n" + "fmla z28.s, z10.s, z0.s[2]\n" + "fmla z29.s, z10.s, z1.s[2]\n" + "fmla z30.s, z10.s, z2.s[2]\n" + "fmla z31.s, z10.s, z3.s[2]\n" + "fmla z28.s, z11.s, z0.s[3]\n" + "fmla z29.s, z11.s, z1.s[3]\n" + "fmla z30.s, z11.s, z2.s[3]\n" + "fmla z31.s, z11.s, 
z3.s[3]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, %[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[oddrows], %[oddrows], #0x1\n" + "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "fmla z28.s, z8.s, z1.s[0]\n" + "fmla z28.s, z9.s, z1.s[1]\n" + "fmla z28.s, z10.s, z1.s[2]\n" + "fmla z28.s, z11.s, z1.s[3]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 9: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + 
"ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "ld1w z5.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z6.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z7.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z9.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z10.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z11.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + "fmla z30.s, z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z29.s, z6.s, z1.s[2]\n" + "fmla z30.s, z6.s, z2.s[2]\n" + "fmla z31.s, z6.s, z3.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z29.s, z7.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z30.s, z7.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z31.s, z7.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z8.s, z0.s[0]\n" + "fmla z29.s, z8.s, z1.s[0]\n" + 
"fmla z30.s, z8.s, z2.s[0]\n" + "fmla z31.s, z8.s, z3.s[0]\n" + "fmla z28.s, z9.s, z0.s[1]\n" + "fmla z29.s, z9.s, z1.s[1]\n" + "fmla z30.s, z9.s, z2.s[1]\n" + "fmla z31.s, z9.s, z3.s[1]\n" + "fmla z28.s, z10.s, z0.s[2]\n" + "fmla z29.s, z10.s, z1.s[2]\n" + "fmla z30.s, z10.s, z2.s[2]\n" + "fmla z31.s, z10.s, z3.s[2]\n" + "fmla z28.s, z11.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x20]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "fmla z29.s, z11.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x20]\n" + "fmla z30.s, z11.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x20]\n" + "fmla z31.s, z11.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z12.s, z0.s[0]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "fmla z29.s, z12.s, z1.s[0]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z30.s, z12.s, z2.s[0]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "fmla z31.s, z12.s, z3.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], LSL #2\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, %[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[oddrows], %[oddrows], #0x1\n" + "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n" + "ld1rqw z2.s, p6/z, [%[a_ptr0], #0x20]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "fmla z28.s, z8.s, z1.s[0]\n" + "fmla z28.s, z9.s, z1.s[1]\n" + "fmla z28.s, z10.s, z1.s[2]\n" + "fmla z28.s, z11.s, z1.s[3]\n" + "fmla z28.s, z12.s, z2.s[0]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + 
".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 10: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + "ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "ld1w z5.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z6.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z7.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z9.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z10.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z11.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z13.s, p0/z, [%[b_ptr0]]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + 
"ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + "fmla z30.s, z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z29.s, z6.s, z1.s[2]\n" + "fmla z30.s, z6.s, z2.s[2]\n" + "fmla z31.s, z6.s, z3.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z29.s, z7.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z30.s, z7.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z31.s, z7.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z8.s, z0.s[0]\n" + "fmla z29.s, z8.s, z1.s[0]\n" + "fmla z30.s, z8.s, z2.s[0]\n" + "fmla z31.s, z8.s, z3.s[0]\n" + "fmla z28.s, z9.s, z0.s[1]\n" + "fmla z29.s, z9.s, z1.s[1]\n" + "fmla z30.s, z9.s, z2.s[1]\n" + "fmla z31.s, z9.s, z3.s[1]\n" + "fmla z28.s, z10.s, z0.s[2]\n" + "fmla z29.s, z10.s, z1.s[2]\n" + "fmla z30.s, z10.s, z2.s[2]\n" + "fmla z31.s, z10.s, z3.s[2]\n" + "fmla z28.s, z11.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x20]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "fmla z29.s, z11.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x20]\n" + "fmla z30.s, z11.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x20]\n" + "fmla z31.s, z11.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z12.s, z0.s[0]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "fmla z29.s, z12.s, z1.s[0]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z30.s, z12.s, z2.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], LSL #2\n" + "fmla z31.s, z12.s, z3.s[0]\n" + "fmla z28.s, 
z13.s, z0.s[1]\n" + "fmla z29.s, z13.s, z1.s[1]\n" + "fmla z30.s, z13.s, z2.s[1]\n" + "fmla z31.s, z13.s, z3.s[1]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, %[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[oddrows], %[oddrows], #0x1\n" + "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n" + "ld1rqw z2.s, p6/z, [%[a_ptr0], #0x20]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "fmla z28.s, z8.s, z1.s[0]\n" + "fmla z28.s, z9.s, z1.s[1]\n" + "fmla z28.s, z10.s, z1.s[2]\n" + "fmla z28.s, z11.s, z1.s[3]\n" + "fmla z28.s, z12.s, z2.s[0]\n" + "fmla z28.s, z13.s, z2.s[1]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 11: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + 
"c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + "ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "ld1w z5.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z6.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z7.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z9.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z10.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z11.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z13.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z14.s, p0/z, [%[b_ptr0]]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + "fmla z30.s, z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z29.s, z6.s, z1.s[2]\n" + "fmla z30.s, z6.s, z2.s[2]\n" + "fmla z31.s, z6.s, z3.s[2]\n" 
+ "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z29.s, z7.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z30.s, z7.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z31.s, z7.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z8.s, z0.s[0]\n" + "fmla z29.s, z8.s, z1.s[0]\n" + "fmla z30.s, z8.s, z2.s[0]\n" + "fmla z31.s, z8.s, z3.s[0]\n" + "fmla z28.s, z9.s, z0.s[1]\n" + "fmla z29.s, z9.s, z1.s[1]\n" + "fmla z30.s, z9.s, z2.s[1]\n" + "fmla z31.s, z9.s, z3.s[1]\n" + "fmla z28.s, z10.s, z0.s[2]\n" + "fmla z29.s, z10.s, z1.s[2]\n" + "fmla z30.s, z10.s, z2.s[2]\n" + "fmla z31.s, z10.s, z3.s[2]\n" + "fmla z28.s, z11.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x20]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "fmla z29.s, z11.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x20]\n" + "fmla z30.s, z11.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x20]\n" + "fmla z31.s, z11.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z12.s, z0.s[0]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "fmla z29.s, z12.s, z1.s[0]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z30.s, z12.s, z2.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], LSL #2\n" + "fmla z31.s, z12.s, z3.s[0]\n" + "fmla z28.s, z13.s, z0.s[1]\n" + "fmla z29.s, z13.s, z1.s[1]\n" + "fmla z30.s, z13.s, z2.s[1]\n" + "fmla z31.s, z13.s, z3.s[1]\n" + "fmla z28.s, z14.s, z0.s[2]\n" + "fmla z29.s, z14.s, z1.s[2]\n" + "fmla z30.s, z14.s, z2.s[2]\n" + "fmla z31.s, z14.s, z3.s[2]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, %[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, 
[%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[oddrows], %[oddrows], #0x1\n" + "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n" + "ld1rqw z2.s, p6/z, [%[a_ptr0], #0x20]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "fmla z28.s, z8.s, z1.s[0]\n" + "fmla z28.s, z9.s, z1.s[1]\n" + "fmla z28.s, z10.s, z1.s[2]\n" + "fmla z28.s, z11.s, z1.s[3]\n" + "fmla z28.s, z12.s, z2.s[0]\n" + "fmla z28.s, z13.s, z2.s[1]\n" + "fmla z28.s, z14.s, z2.s[2]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 12: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + "ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "ld1w z5.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w 
z6.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z7.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z9.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z10.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z11.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z13.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z14.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z15.s, p0/z, [%[b_ptr0]]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + "fmla z30.s, z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z29.s, z6.s, z1.s[2]\n" + "fmla z30.s, z6.s, z2.s[2]\n" + "fmla z31.s, z6.s, z3.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z29.s, z7.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z30.s, z7.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z31.s, z7.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z8.s, z0.s[0]\n" + "fmla z29.s, z8.s, z1.s[0]\n" + "fmla z30.s, z8.s, z2.s[0]\n" + "fmla z31.s, z8.s, z3.s[0]\n" + "fmla z28.s, 
z9.s, z0.s[1]\n" + "fmla z29.s, z9.s, z1.s[1]\n" + "fmla z30.s, z9.s, z2.s[1]\n" + "fmla z31.s, z9.s, z3.s[1]\n" + "fmla z28.s, z10.s, z0.s[2]\n" + "fmla z29.s, z10.s, z1.s[2]\n" + "fmla z30.s, z10.s, z2.s[2]\n" + "fmla z31.s, z10.s, z3.s[2]\n" + "fmla z28.s, z11.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "fmla z29.s, z11.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z30.s, z11.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z31.s, z11.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z12.s, z0.s[0]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "fmla z29.s, z12.s, z1.s[0]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z30.s, z12.s, z2.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], LSL #2\n" + "fmla z31.s, z12.s, z3.s[0]\n" + "fmla z28.s, z13.s, z0.s[1]\n" + "fmla z29.s, z13.s, z1.s[1]\n" + "fmla z30.s, z13.s, z2.s[1]\n" + "fmla z31.s, z13.s, z3.s[1]\n" + "fmla z28.s, z14.s, z0.s[2]\n" + "fmla z29.s, z14.s, z1.s[2]\n" + "fmla z30.s, z14.s, z2.s[2]\n" + "fmla z31.s, z14.s, z3.s[2]\n" + "fmla z28.s, z15.s, z0.s[3]\n" + "fmla z29.s, z15.s, z1.s[3]\n" + "fmla z30.s, z15.s, z2.s[3]\n" + "fmla z31.s, z15.s, z3.s[3]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, %[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[oddrows], %[oddrows], #0x1\n" + "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n" + "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z28.s, z6.s, 
z0.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "fmla z28.s, z8.s, z1.s[0]\n" + "fmla z28.s, z9.s, z1.s[1]\n" + "fmla z28.s, z10.s, z1.s[2]\n" + "fmla z28.s, z11.s, z1.s[3]\n" + "fmla z28.s, z12.s, z2.s[0]\n" + "fmla z28.s, z13.s, z2.s[1]\n" + "fmla z28.s, z14.s, z2.s[2]\n" + "fmla z28.s, z15.s, z2.s[3]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 13: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + "ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "ld1w z5.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z6.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z7.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z9.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], 
%[ldb]\n" + "ld1w z10.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z11.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z13.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z14.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z15.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z16.s, p0/z, [%[b_ptr0]]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + "fmla z30.s, z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z29.s, z6.s, z1.s[2]\n" + "fmla z30.s, z6.s, z2.s[2]\n" + "fmla z31.s, z6.s, z3.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z29.s, z7.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z30.s, z7.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z31.s, z7.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z8.s, z0.s[0]\n" + "fmla z29.s, z8.s, z1.s[0]\n" + "fmla z30.s, z8.s, z2.s[0]\n" + "fmla z31.s, z8.s, z3.s[0]\n" + "fmla z28.s, z9.s, z0.s[1]\n" + "fmla z29.s, z9.s, z1.s[1]\n" + "fmla z30.s, z9.s, z2.s[1]\n" + "fmla z31.s, z9.s, z3.s[1]\n" + "fmla z28.s, z10.s, z0.s[2]\n" + "fmla z29.s, z10.s, z1.s[2]\n" + "fmla z30.s, z10.s, 
z2.s[2]\n" + "fmla z31.s, z10.s, z3.s[2]\n" + "fmla z28.s, z11.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z29.s, z11.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z30.s, z11.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z31.s, z11.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z12.s, z0.s[0]\n" + "fmla z29.s, z12.s, z1.s[0]\n" + "fmla z30.s, z12.s, z2.s[0]\n" + "fmla z31.s, z12.s, z3.s[0]\n" + "fmla z28.s, z13.s, z0.s[1]\n" + "fmla z29.s, z13.s, z1.s[1]\n" + "fmla z30.s, z13.s, z2.s[1]\n" + "fmla z31.s, z13.s, z3.s[1]\n" + "fmla z28.s, z14.s, z0.s[2]\n" + "fmla z29.s, z14.s, z1.s[2]\n" + "fmla z30.s, z14.s, z2.s[2]\n" + "fmla z31.s, z14.s, z3.s[2]\n" + "fmla z28.s, z15.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x30]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "fmla z29.s, z15.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x30]\n" + "fmla z30.s, z15.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x30]\n" + "fmla z31.s, z15.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z16.s, z0.s[0]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "fmla z29.s, z16.s, z1.s[0]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z30.s, z16.s, z2.s[0]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "fmla z31.s, z16.s, z3.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], LSL #2\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, %[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[oddrows], %[oddrows], #0x1\n" + "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n" + "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n" + "ld1rqw z3.s, p6/z, 
[%[a_ptr0], #0x30]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "fmla z28.s, z8.s, z1.s[0]\n" + "fmla z28.s, z9.s, z1.s[1]\n" + "fmla z28.s, z10.s, z1.s[2]\n" + "fmla z28.s, z11.s, z1.s[3]\n" + "fmla z28.s, z12.s, z2.s[0]\n" + "fmla z28.s, z13.s, z2.s[1]\n" + "fmla z28.s, z14.s, z2.s[2]\n" + "fmla z28.s, z15.s, z2.s[3]\n" + "fmla z28.s, z16.s, z3.s[0]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 14: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + "ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "ld1w z5.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z6.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z7.s, p0/z, [%[b_ptr0]]\n" + 
"add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z9.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z10.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z11.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z13.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z14.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z15.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z16.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z17.s, p0/z, [%[b_ptr0]]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + "fmla z30.s, z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z29.s, z6.s, z1.s[2]\n" + "fmla z30.s, z6.s, z2.s[2]\n" + "fmla z31.s, z6.s, z3.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z29.s, z7.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z30.s, z7.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z31.s, z7.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z8.s, z0.s[0]\n" + "fmla z29.s, z8.s, z1.s[0]\n" + "fmla z30.s, z8.s, z2.s[0]\n" + 
"fmla z31.s, z8.s, z3.s[0]\n" + "fmla z28.s, z9.s, z0.s[1]\n" + "fmla z29.s, z9.s, z1.s[1]\n" + "fmla z30.s, z9.s, z2.s[1]\n" + "fmla z31.s, z9.s, z3.s[1]\n" + "fmla z28.s, z10.s, z0.s[2]\n" + "fmla z29.s, z10.s, z1.s[2]\n" + "fmla z30.s, z10.s, z2.s[2]\n" + "fmla z31.s, z10.s, z3.s[2]\n" + "fmla z28.s, z11.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z29.s, z11.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z30.s, z11.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z31.s, z11.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z12.s, z0.s[0]\n" + "fmla z29.s, z12.s, z1.s[0]\n" + "fmla z30.s, z12.s, z2.s[0]\n" + "fmla z31.s, z12.s, z3.s[0]\n" + "fmla z28.s, z13.s, z0.s[1]\n" + "fmla z29.s, z13.s, z1.s[1]\n" + "fmla z30.s, z13.s, z2.s[1]\n" + "fmla z31.s, z13.s, z3.s[1]\n" + "fmla z28.s, z14.s, z0.s[2]\n" + "fmla z29.s, z14.s, z1.s[2]\n" + "fmla z30.s, z14.s, z2.s[2]\n" + "fmla z31.s, z14.s, z3.s[2]\n" + "fmla z28.s, z15.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x30]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "fmla z29.s, z15.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x30]\n" + "fmla z30.s, z15.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x30]\n" + "fmla z31.s, z15.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z16.s, z0.s[0]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "fmla z29.s, z16.s, z1.s[0]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z30.s, z16.s, z2.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], LSL #2\n" + "fmla z31.s, z16.s, z3.s[0]\n" + "fmla z28.s, z17.s, z0.s[1]\n" + "fmla z29.s, z17.s, z1.s[1]\n" + "fmla z30.s, z17.s, z2.s[1]\n" + "fmla z31.s, z17.s, z3.s[1]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, 
%[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[oddrows], %[oddrows], #0x1\n" + "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n" + "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n" + "ld1rqw z3.s, p6/z, [%[a_ptr0], #0x30]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "fmla z28.s, z8.s, z1.s[0]\n" + "fmla z28.s, z9.s, z1.s[1]\n" + "fmla z28.s, z10.s, z1.s[2]\n" + "fmla z28.s, z11.s, z1.s[3]\n" + "fmla z28.s, z12.s, z2.s[0]\n" + "fmla z28.s, z13.s, z2.s[1]\n" + "fmla z28.s, z14.s, z2.s[2]\n" + "fmla z28.s, z15.s, z2.s[3]\n" + "fmla z28.s, z16.s, z3.s[0]\n" + "fmla z28.s, z17.s, z3.s[1]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 15: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + 
"ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "ld1w z5.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z6.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z7.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z9.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z10.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z11.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z13.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z14.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z15.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z16.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z17.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z18.s, p0/z, [%[b_ptr0]]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + "fmla z30.s, z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla 
z29.s, z6.s, z1.s[2]\n" + "fmla z30.s, z6.s, z2.s[2]\n" + "fmla z31.s, z6.s, z3.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z29.s, z7.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z30.s, z7.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z31.s, z7.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z8.s, z0.s[0]\n" + "fmla z29.s, z8.s, z1.s[0]\n" + "fmla z30.s, z8.s, z2.s[0]\n" + "fmla z31.s, z8.s, z3.s[0]\n" + "fmla z28.s, z9.s, z0.s[1]\n" + "fmla z29.s, z9.s, z1.s[1]\n" + "fmla z30.s, z9.s, z2.s[1]\n" + "fmla z31.s, z9.s, z3.s[1]\n" + "fmla z28.s, z10.s, z0.s[2]\n" + "fmla z29.s, z10.s, z1.s[2]\n" + "fmla z30.s, z10.s, z2.s[2]\n" + "fmla z31.s, z10.s, z3.s[2]\n" + "fmla z28.s, z11.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z29.s, z11.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z30.s, z11.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z31.s, z11.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z12.s, z0.s[0]\n" + "fmla z29.s, z12.s, z1.s[0]\n" + "fmla z30.s, z12.s, z2.s[0]\n" + "fmla z31.s, z12.s, z3.s[0]\n" + "fmla z28.s, z13.s, z0.s[1]\n" + "fmla z29.s, z13.s, z1.s[1]\n" + "fmla z30.s, z13.s, z2.s[1]\n" + "fmla z31.s, z13.s, z3.s[1]\n" + "fmla z28.s, z14.s, z0.s[2]\n" + "fmla z29.s, z14.s, z1.s[2]\n" + "fmla z30.s, z14.s, z2.s[2]\n" + "fmla z31.s, z14.s, z3.s[2]\n" + "fmla z28.s, z15.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x30]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "fmla z29.s, z15.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x30]\n" + "fmla z30.s, z15.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x30]\n" + "fmla z31.s, z15.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z16.s, z0.s[0]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "fmla z29.s, z16.s, z1.s[0]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z30.s, z16.s, z2.s[0]\n" + "add 
a_ptr3, a_ptr3, %[lda], LSL #2\n" + "fmla z31.s, z16.s, z3.s[0]\n" + "fmla z28.s, z17.s, z0.s[1]\n" + "fmla z29.s, z17.s, z1.s[1]\n" + "fmla z30.s, z17.s, z2.s[1]\n" + "fmla z31.s, z17.s, z3.s[1]\n" + "fmla z28.s, z18.s, z0.s[2]\n" + "fmla z29.s, z18.s, z1.s[2]\n" + "fmla z30.s, z18.s, z2.s[2]\n" + "fmla z31.s, z18.s, z3.s[2]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, %[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[oddrows], %[oddrows], #0x1\n" + "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n" + "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n" + "ld1rqw z3.s, p6/z, [%[a_ptr0], #0x30]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "fmla z28.s, z8.s, z1.s[0]\n" + "fmla z28.s, z9.s, z1.s[1]\n" + "fmla z28.s, z10.s, z1.s[2]\n" + "fmla z28.s, z11.s, z1.s[3]\n" + "fmla z28.s, z12.s, z2.s[0]\n" + "fmla z28.s, z13.s, z2.s[1]\n" + "fmla z28.s, z14.s, z2.s[2]\n" + "fmla z28.s, z15.s, z2.s[3]\n" + "fmla z28.s, z16.s, z3.s[0]\n" + "fmla z28.s, z17.s, z3.s[1]\n" + "fmla z28.s, z18.s, z3.s[2]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" 
(ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 16: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + "ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "ld1w z5.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z6.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z7.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z9.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z10.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z11.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z13.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z14.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z15.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z16.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z17.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z18.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z19.s, p0/z, [%[b_ptr0]]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + 
"mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + "fmla z30.s, z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z29.s, z6.s, z1.s[2]\n" + "fmla z30.s, z6.s, z2.s[2]\n" + "fmla z31.s, z6.s, z3.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z29.s, z7.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z30.s, z7.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z31.s, z7.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z8.s, z0.s[0]\n" + "fmla z29.s, z8.s, z1.s[0]\n" + "fmla z30.s, z8.s, z2.s[0]\n" + "fmla z31.s, z8.s, z3.s[0]\n" + "fmla z28.s, z9.s, z0.s[1]\n" + "fmla z29.s, z9.s, z1.s[1]\n" + "fmla z30.s, z9.s, z2.s[1]\n" + "fmla z31.s, z9.s, z3.s[1]\n" + "fmla z28.s, z10.s, z0.s[2]\n" + "fmla z29.s, z10.s, z1.s[2]\n" + "fmla z30.s, z10.s, z2.s[2]\n" + "fmla z31.s, z10.s, z3.s[2]\n" + "fmla z28.s, z11.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z29.s, z11.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z30.s, z11.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z31.s, z11.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z12.s, z0.s[0]\n" + "fmla z29.s, z12.s, z1.s[0]\n" + "fmla z30.s, z12.s, z2.s[0]\n" + "fmla z31.s, z12.s, z3.s[0]\n" + "fmla z28.s, z13.s, z0.s[1]\n" + "fmla z29.s, z13.s, z1.s[1]\n" + "fmla z30.s, 
z13.s, z2.s[1]\n" + "fmla z31.s, z13.s, z3.s[1]\n" + "fmla z28.s, z14.s, z0.s[2]\n" + "fmla z29.s, z14.s, z1.s[2]\n" + "fmla z30.s, z14.s, z2.s[2]\n" + "fmla z31.s, z14.s, z3.s[2]\n" + "fmla z28.s, z15.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "fmla z29.s, z15.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" + "fmla z30.s, z15.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" + "fmla z31.s, z15.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z16.s, z0.s[0]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "fmla z29.s, z16.s, z1.s[0]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z30.s, z16.s, z2.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], LSL #2\n" + "fmla z31.s, z16.s, z3.s[0]\n" + "fmla z28.s, z17.s, z0.s[1]\n" + "fmla z29.s, z17.s, z1.s[1]\n" + "fmla z30.s, z17.s, z2.s[1]\n" + "fmla z31.s, z17.s, z3.s[1]\n" + "fmla z28.s, z18.s, z0.s[2]\n" + "fmla z29.s, z18.s, z1.s[2]\n" + "fmla z30.s, z18.s, z2.s[2]\n" + "fmla z31.s, z18.s, z3.s[2]\n" + "fmla z28.s, z19.s, z0.s[3]\n" + "fmla z29.s, z19.s, z1.s[3]\n" + "fmla z30.s, z19.s, z2.s[3]\n" + "fmla z31.s, z19.s, z3.s[3]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, %[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[oddrows], %[oddrows], #0x1\n" + "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n" + "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n" + "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla 
z28.s, z7.s, z0.s[3]\n" + "fmla z28.s, z8.s, z1.s[0]\n" + "fmla z28.s, z9.s, z1.s[1]\n" + "fmla z28.s, z10.s, z1.s[2]\n" + "fmla z28.s, z11.s, z1.s[3]\n" + "fmla z28.s, z12.s, z2.s[0]\n" + "fmla z28.s, z13.s, z2.s[1]\n" + "fmla z28.s, z14.s, z2.s[2]\n" + "fmla z28.s, z15.s, z2.s[3]\n" + "fmla z28.s, z16.s, z3.s[0]\n" + "fmla z28.s, z17.s, z3.s[1]\n" + "fmla z28.s, z18.s, z3.s[2]\n" + "fmla z28.s, z19.s, z3.s[3]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 17: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + "ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "ld1w z5.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z6.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z7.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z8.s, p0/z, 
[%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z9.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z10.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z11.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z13.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z14.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z15.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z16.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z17.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z18.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z19.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z20.s, p0/z, [%[b_ptr0]]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + "fmla z30.s, z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z29.s, z6.s, z1.s[2]\n" + "fmla z30.s, z6.s, z2.s[2]\n" + "fmla z31.s, z6.s, z3.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z29.s, z7.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z30.s, z7.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + 
"fmla z31.s, z7.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z8.s, z0.s[0]\n" + "fmla z29.s, z8.s, z1.s[0]\n" + "fmla z30.s, z8.s, z2.s[0]\n" + "fmla z31.s, z8.s, z3.s[0]\n" + "fmla z28.s, z9.s, z0.s[1]\n" + "fmla z29.s, z9.s, z1.s[1]\n" + "fmla z30.s, z9.s, z2.s[1]\n" + "fmla z31.s, z9.s, z3.s[1]\n" + "fmla z28.s, z10.s, z0.s[2]\n" + "fmla z29.s, z10.s, z1.s[2]\n" + "fmla z30.s, z10.s, z2.s[2]\n" + "fmla z31.s, z10.s, z3.s[2]\n" + "fmla z28.s, z11.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z29.s, z11.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z30.s, z11.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z31.s, z11.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z12.s, z0.s[0]\n" + "fmla z29.s, z12.s, z1.s[0]\n" + "fmla z30.s, z12.s, z2.s[0]\n" + "fmla z31.s, z12.s, z3.s[0]\n" + "fmla z28.s, z13.s, z0.s[1]\n" + "fmla z29.s, z13.s, z1.s[1]\n" + "fmla z30.s, z13.s, z2.s[1]\n" + "fmla z31.s, z13.s, z3.s[1]\n" + "fmla z28.s, z14.s, z0.s[2]\n" + "fmla z29.s, z14.s, z1.s[2]\n" + "fmla z30.s, z14.s, z2.s[2]\n" + "fmla z31.s, z14.s, z3.s[2]\n" + "fmla z28.s, z15.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z29.s, z15.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" + "fmla z30.s, z15.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" + "fmla z31.s, z15.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z16.s, z0.s[0]\n" + "fmla z29.s, z16.s, z1.s[0]\n" + "fmla z30.s, z16.s, z2.s[0]\n" + "fmla z31.s, z16.s, z3.s[0]\n" + "fmla z28.s, z17.s, z0.s[1]\n" + "fmla z29.s, z17.s, z1.s[1]\n" + "fmla z30.s, z17.s, z2.s[1]\n" + "fmla z31.s, z17.s, z3.s[1]\n" + "fmla z28.s, z18.s, z0.s[2]\n" + "fmla z29.s, z18.s, z1.s[2]\n" + "fmla z30.s, z18.s, z2.s[2]\n" + "fmla z31.s, z18.s, z3.s[2]\n" + "fmla z28.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "fmla z29.s, z19.s, 
z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x40]\n" + "fmla z30.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x40]\n" + "fmla z31.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x40]\n" + "fmla z28.s, z20.s, z0.s[0]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "fmla z29.s, z20.s, z1.s[0]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z30.s, z20.s, z2.s[0]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "fmla z31.s, z20.s, z3.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], LSL #2\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, %[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[oddrows], %[oddrows], #0x1\n" + "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n" + "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n" + "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "fmla z28.s, z8.s, z1.s[0]\n" + "fmla z28.s, z9.s, z1.s[1]\n" + "fmla z28.s, z10.s, z1.s[2]\n" + "fmla z28.s, z11.s, z1.s[3]\n" + "fmla z28.s, z12.s, z2.s[0]\n" + "fmla z28.s, z13.s, z2.s[1]\n" + "fmla z28.s, z14.s, z2.s[2]\n" + "fmla z28.s, z15.s, z2.s[3]\n" + "fmla z28.s, z16.s, z3.s[0]\n" + "fmla z28.s, z17.s, z3.s[1]\n" + "fmla z28.s, z18.s, z3.s[2]\n" + "fmla z28.s, z19.s, z3.s[3]\n" + "fmla z28.s, z20.s, z0.s[0]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" 
(a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 18: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + "ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "ld1w z5.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z6.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z7.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z9.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z10.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z11.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z13.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z14.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z15.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z16.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w 
z17.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z18.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z19.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z20.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z21.s, p0/z, [%[b_ptr0]]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + "fmla z30.s, z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z29.s, z6.s, z1.s[2]\n" + "fmla z30.s, z6.s, z2.s[2]\n" + "fmla z31.s, z6.s, z3.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z29.s, z7.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z30.s, z7.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z31.s, z7.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z8.s, z0.s[0]\n" + "fmla z29.s, z8.s, z1.s[0]\n" + "fmla z30.s, z8.s, z2.s[0]\n" + "fmla z31.s, z8.s, z3.s[0]\n" + "fmla z28.s, z9.s, z0.s[1]\n" + "fmla z29.s, z9.s, z1.s[1]\n" + "fmla z30.s, z9.s, z2.s[1]\n" + "fmla z31.s, z9.s, z3.s[1]\n" + "fmla z28.s, z10.s, z0.s[2]\n" + "fmla z29.s, z10.s, z1.s[2]\n" + "fmla z30.s, z10.s, z2.s[2]\n" + "fmla z31.s, z10.s, z3.s[2]\n" + "fmla z28.s, z11.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z29.s, z11.s, z1.s[3]\n" + "ld1rqw z1.s, 
p7/z, [a_ptr1, #0x20]\n" + "fmla z30.s, z11.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z31.s, z11.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z12.s, z0.s[0]\n" + "fmla z29.s, z12.s, z1.s[0]\n" + "fmla z30.s, z12.s, z2.s[0]\n" + "fmla z31.s, z12.s, z3.s[0]\n" + "fmla z28.s, z13.s, z0.s[1]\n" + "fmla z29.s, z13.s, z1.s[1]\n" + "fmla z30.s, z13.s, z2.s[1]\n" + "fmla z31.s, z13.s, z3.s[1]\n" + "fmla z28.s, z14.s, z0.s[2]\n" + "fmla z29.s, z14.s, z1.s[2]\n" + "fmla z30.s, z14.s, z2.s[2]\n" + "fmla z31.s, z14.s, z3.s[2]\n" + "fmla z28.s, z15.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z29.s, z15.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" + "fmla z30.s, z15.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" + "fmla z31.s, z15.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z16.s, z0.s[0]\n" + "fmla z29.s, z16.s, z1.s[0]\n" + "fmla z30.s, z16.s, z2.s[0]\n" + "fmla z31.s, z16.s, z3.s[0]\n" + "fmla z28.s, z17.s, z0.s[1]\n" + "fmla z29.s, z17.s, z1.s[1]\n" + "fmla z30.s, z17.s, z2.s[1]\n" + "fmla z31.s, z17.s, z3.s[1]\n" + "fmla z28.s, z18.s, z0.s[2]\n" + "fmla z29.s, z18.s, z1.s[2]\n" + "fmla z30.s, z18.s, z2.s[2]\n" + "fmla z31.s, z18.s, z3.s[2]\n" + "fmla z28.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "fmla z29.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x40]\n" + "fmla z30.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x40]\n" + "fmla z31.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x40]\n" + "fmla z28.s, z20.s, z0.s[0]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "fmla z29.s, z20.s, z1.s[0]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z30.s, z20.s, z2.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], LSL #2\n" + "fmla z31.s, z20.s, z3.s[0]\n" + "fmla z28.s, z21.s, z0.s[1]\n" + "fmla z29.s, z21.s, z1.s[1]\n" + "fmla z30.s, z21.s, z2.s[1]\n" + "fmla z31.s, z21.s, z3.s[1]\n" + 
"st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, %[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[oddrows], %[oddrows], #0x1\n" + "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n" + "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n" + "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "fmla z28.s, z8.s, z1.s[0]\n" + "fmla z28.s, z9.s, z1.s[1]\n" + "fmla z28.s, z10.s, z1.s[2]\n" + "fmla z28.s, z11.s, z1.s[3]\n" + "fmla z28.s, z12.s, z2.s[0]\n" + "fmla z28.s, z13.s, z2.s[1]\n" + "fmla z28.s, z14.s, z2.s[2]\n" + "fmla z28.s, z15.s, z2.s[3]\n" + "fmla z28.s, z16.s, z3.s[0]\n" + "fmla z28.s, z17.s, z3.s[1]\n" + "fmla z28.s, z18.s, z3.s[2]\n" + "fmla z28.s, z19.s, z3.s[3]\n" + "fmla z28.s, z20.s, z0.s[0]\n" + "fmla z28.s, z21.s, z0.s[1]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", 
"z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 19: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + "ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "ld1w z5.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z6.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z7.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z9.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z10.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z11.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z13.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z14.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z15.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z16.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z17.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z18.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z19.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z20.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z21.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z22.s, p0/z, [%[b_ptr0]]\n" + "cbz %[loops], 
1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + "fmla z30.s, z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z29.s, z6.s, z1.s[2]\n" + "fmla z30.s, z6.s, z2.s[2]\n" + "fmla z31.s, z6.s, z3.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z29.s, z7.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z30.s, z7.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z31.s, z7.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z8.s, z0.s[0]\n" + "fmla z29.s, z8.s, z1.s[0]\n" + "fmla z30.s, z8.s, z2.s[0]\n" + "fmla z31.s, z8.s, z3.s[0]\n" + "fmla z28.s, z9.s, z0.s[1]\n" + "fmla z29.s, z9.s, z1.s[1]\n" + "fmla z30.s, z9.s, z2.s[1]\n" + "fmla z31.s, z9.s, z3.s[1]\n" + "fmla z28.s, z10.s, z0.s[2]\n" + "fmla z29.s, z10.s, z1.s[2]\n" + "fmla z30.s, z10.s, z2.s[2]\n" + "fmla z31.s, z10.s, z3.s[2]\n" + "fmla z28.s, z11.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z29.s, z11.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z30.s, z11.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z31.s, z11.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z12.s, z0.s[0]\n" + "fmla z29.s, z12.s, z1.s[0]\n" + "fmla z30.s, z12.s, z2.s[0]\n" + "fmla z31.s, z12.s, z3.s[0]\n" + "fmla z28.s, z13.s, z0.s[1]\n" + "fmla 
z29.s, z13.s, z1.s[1]\n" + "fmla z30.s, z13.s, z2.s[1]\n" + "fmla z31.s, z13.s, z3.s[1]\n" + "fmla z28.s, z14.s, z0.s[2]\n" + "fmla z29.s, z14.s, z1.s[2]\n" + "fmla z30.s, z14.s, z2.s[2]\n" + "fmla z31.s, z14.s, z3.s[2]\n" + "fmla z28.s, z15.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z29.s, z15.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" + "fmla z30.s, z15.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" + "fmla z31.s, z15.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z16.s, z0.s[0]\n" + "fmla z29.s, z16.s, z1.s[0]\n" + "fmla z30.s, z16.s, z2.s[0]\n" + "fmla z31.s, z16.s, z3.s[0]\n" + "fmla z28.s, z17.s, z0.s[1]\n" + "fmla z29.s, z17.s, z1.s[1]\n" + "fmla z30.s, z17.s, z2.s[1]\n" + "fmla z31.s, z17.s, z3.s[1]\n" + "fmla z28.s, z18.s, z0.s[2]\n" + "fmla z29.s, z18.s, z1.s[2]\n" + "fmla z30.s, z18.s, z2.s[2]\n" + "fmla z31.s, z18.s, z3.s[2]\n" + "fmla z28.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "fmla z29.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x40]\n" + "fmla z30.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x40]\n" + "fmla z31.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x40]\n" + "fmla z28.s, z20.s, z0.s[0]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "fmla z29.s, z20.s, z1.s[0]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z30.s, z20.s, z2.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], LSL #2\n" + "fmla z31.s, z20.s, z3.s[0]\n" + "fmla z28.s, z21.s, z0.s[1]\n" + "fmla z29.s, z21.s, z1.s[1]\n" + "fmla z30.s, z21.s, z2.s[1]\n" + "fmla z31.s, z21.s, z3.s[1]\n" + "fmla z28.s, z22.s, z0.s[2]\n" + "fmla z29.s, z22.s, z1.s[2]\n" + "fmla z30.s, z22.s, z2.s[2]\n" + "fmla z31.s, z22.s, z3.s[2]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], 
LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, %[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[oddrows], %[oddrows], #0x1\n" + "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n" + "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n" + "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "fmla z28.s, z8.s, z1.s[0]\n" + "fmla z28.s, z9.s, z1.s[1]\n" + "fmla z28.s, z10.s, z1.s[2]\n" + "fmla z28.s, z11.s, z1.s[3]\n" + "fmla z28.s, z12.s, z2.s[0]\n" + "fmla z28.s, z13.s, z2.s[1]\n" + "fmla z28.s, z14.s, z2.s[2]\n" + "fmla z28.s, z15.s, z2.s[3]\n" + "fmla z28.s, z16.s, z3.s[0]\n" + "fmla z28.s, z17.s, z3.s[1]\n" + "fmla z28.s, z18.s, z3.s[2]\n" + "fmla z28.s, z19.s, z3.s[3]\n" + "fmla z28.s, z20.s, z0.s[0]\n" + "fmla z28.s, z21.s, z0.s[1]\n" + "fmla z28.s, z22.s, z0.s[2]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 20: + __asm __volatile ( + "a_ptr1 
.req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + "ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "ld1w z5.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z6.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z7.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z9.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z10.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z11.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z13.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z14.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z15.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z16.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z17.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z18.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z19.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z20.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z21.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z22.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z23.s, p0/z, [%[b_ptr0]]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov 
z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + "fmla z30.s, z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z29.s, z6.s, z1.s[2]\n" + "fmla z30.s, z6.s, z2.s[2]\n" + "fmla z31.s, z6.s, z3.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z29.s, z7.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z30.s, z7.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z31.s, z7.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z8.s, z0.s[0]\n" + "fmla z29.s, z8.s, z1.s[0]\n" + "fmla z30.s, z8.s, z2.s[0]\n" + "fmla z31.s, z8.s, z3.s[0]\n" + "fmla z28.s, z9.s, z0.s[1]\n" + "fmla z29.s, z9.s, z1.s[1]\n" + "fmla z30.s, z9.s, z2.s[1]\n" + "fmla z31.s, z9.s, z3.s[1]\n" + "fmla z28.s, z10.s, z0.s[2]\n" + "fmla z29.s, z10.s, z1.s[2]\n" + "fmla z30.s, z10.s, z2.s[2]\n" + "fmla z31.s, z10.s, z3.s[2]\n" + "fmla z28.s, z11.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z29.s, z11.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z30.s, z11.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z31.s, z11.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z12.s, z0.s[0]\n" + "fmla z29.s, z12.s, z1.s[0]\n" + "fmla z30.s, z12.s, z2.s[0]\n" + "fmla z31.s, z12.s, z3.s[0]\n" + "fmla z28.s, z13.s, z0.s[1]\n" + "fmla z29.s, z13.s, z1.s[1]\n" + "fmla z30.s, z13.s, z2.s[1]\n" + "fmla z31.s, z13.s, z3.s[1]\n" + "fmla z28.s, 
z14.s, z0.s[2]\n" + "fmla z29.s, z14.s, z1.s[2]\n" + "fmla z30.s, z14.s, z2.s[2]\n" + "fmla z31.s, z14.s, z3.s[2]\n" + "fmla z28.s, z15.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z29.s, z15.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" + "fmla z30.s, z15.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" + "fmla z31.s, z15.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z16.s, z0.s[0]\n" + "fmla z29.s, z16.s, z1.s[0]\n" + "fmla z30.s, z16.s, z2.s[0]\n" + "fmla z31.s, z16.s, z3.s[0]\n" + "fmla z28.s, z17.s, z0.s[1]\n" + "fmla z29.s, z17.s, z1.s[1]\n" + "fmla z30.s, z17.s, z2.s[1]\n" + "fmla z31.s, z17.s, z3.s[1]\n" + "fmla z28.s, z18.s, z0.s[2]\n" + "fmla z29.s, z18.s, z1.s[2]\n" + "fmla z30.s, z18.s, z2.s[2]\n" + "fmla z31.s, z18.s, z3.s[2]\n" + "fmla z28.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "fmla z29.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n" + "fmla z30.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n" + "fmla z31.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n" + "fmla z28.s, z20.s, z0.s[0]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "fmla z29.s, z20.s, z1.s[0]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z30.s, z20.s, z2.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], LSL #2\n" + "fmla z31.s, z20.s, z3.s[0]\n" + "fmla z28.s, z21.s, z0.s[1]\n" + "fmla z29.s, z21.s, z1.s[1]\n" + "fmla z30.s, z21.s, z2.s[1]\n" + "fmla z31.s, z21.s, z3.s[1]\n" + "fmla z28.s, z22.s, z0.s[2]\n" + "fmla z29.s, z22.s, z1.s[2]\n" + "fmla z30.s, z22.s, z2.s[2]\n" + "fmla z31.s, z22.s, z3.s[2]\n" + "fmla z28.s, z23.s, z0.s[3]\n" + "fmla z29.s, z23.s, z1.s[3]\n" + "fmla z30.s, z23.s, z2.s[3]\n" + "fmla z31.s, z23.s, z3.s[3]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add 
c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, %[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[oddrows], %[oddrows], #0x1\n" + "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n" + "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n" + "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "fmla z28.s, z8.s, z1.s[0]\n" + "fmla z28.s, z9.s, z1.s[1]\n" + "fmla z28.s, z10.s, z1.s[2]\n" + "fmla z28.s, z11.s, z1.s[3]\n" + "fmla z28.s, z12.s, z2.s[0]\n" + "fmla z28.s, z13.s, z2.s[1]\n" + "fmla z28.s, z14.s, z2.s[2]\n" + "fmla z28.s, z15.s, z2.s[3]\n" + "fmla z28.s, z16.s, z3.s[0]\n" + "fmla z28.s, z17.s, z3.s[1]\n" + "fmla z28.s, z18.s, z3.s[2]\n" + "fmla z28.s, z19.s, z3.s[3]\n" + "fmla z28.s, z20.s, z0.s[0]\n" + "fmla z28.s, z21.s, z0.s[1]\n" + "fmla z28.s, z22.s, z0.s[2]\n" + "fmla z28.s, z23.s, z0.s[3]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + 
); + break; + case 21: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + "ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "ld1w z5.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z6.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z7.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z9.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z10.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z11.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z13.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z14.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z15.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z16.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z17.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z18.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z19.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z20.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z21.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z22.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z23.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z24.s, p0/z, 
[%[b_ptr0]]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + "fmla z30.s, z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z29.s, z6.s, z1.s[2]\n" + "fmla z30.s, z6.s, z2.s[2]\n" + "fmla z31.s, z6.s, z3.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z29.s, z7.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z30.s, z7.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z31.s, z7.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z8.s, z0.s[0]\n" + "fmla z29.s, z8.s, z1.s[0]\n" + "fmla z30.s, z8.s, z2.s[0]\n" + "fmla z31.s, z8.s, z3.s[0]\n" + "fmla z28.s, z9.s, z0.s[1]\n" + "fmla z29.s, z9.s, z1.s[1]\n" + "fmla z30.s, z9.s, z2.s[1]\n" + "fmla z31.s, z9.s, z3.s[1]\n" + "fmla z28.s, z10.s, z0.s[2]\n" + "fmla z29.s, z10.s, z1.s[2]\n" + "fmla z30.s, z10.s, z2.s[2]\n" + "fmla z31.s, z10.s, z3.s[2]\n" + "fmla z28.s, z11.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z29.s, z11.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z30.s, z11.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z31.s, z11.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z12.s, z0.s[0]\n" + "fmla z29.s, z12.s, z1.s[0]\n" + "fmla z30.s, z12.s, z2.s[0]\n" + "fmla z31.s, z12.s, z3.s[0]\n" + "fmla 
z28.s, z13.s, z0.s[1]\n" + "fmla z29.s, z13.s, z1.s[1]\n" + "fmla z30.s, z13.s, z2.s[1]\n" + "fmla z31.s, z13.s, z3.s[1]\n" + "fmla z28.s, z14.s, z0.s[2]\n" + "fmla z29.s, z14.s, z1.s[2]\n" + "fmla z30.s, z14.s, z2.s[2]\n" + "fmla z31.s, z14.s, z3.s[2]\n" + "fmla z28.s, z15.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z29.s, z15.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" + "fmla z30.s, z15.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" + "fmla z31.s, z15.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z16.s, z0.s[0]\n" + "fmla z29.s, z16.s, z1.s[0]\n" + "fmla z30.s, z16.s, z2.s[0]\n" + "fmla z31.s, z16.s, z3.s[0]\n" + "fmla z28.s, z17.s, z0.s[1]\n" + "fmla z29.s, z17.s, z1.s[1]\n" + "fmla z30.s, z17.s, z2.s[1]\n" + "fmla z31.s, z17.s, z3.s[1]\n" + "fmla z28.s, z18.s, z0.s[2]\n" + "fmla z29.s, z18.s, z1.s[2]\n" + "fmla z30.s, z18.s, z2.s[2]\n" + "fmla z31.s, z18.s, z3.s[2]\n" + "fmla z28.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n" + "fmla z29.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n" + "fmla z30.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n" + "fmla z31.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n" + "fmla z28.s, z20.s, z0.s[0]\n" + "fmla z29.s, z20.s, z1.s[0]\n" + "fmla z30.s, z20.s, z2.s[0]\n" + "fmla z31.s, z20.s, z3.s[0]\n" + "fmla z28.s, z21.s, z0.s[1]\n" + "fmla z29.s, z21.s, z1.s[1]\n" + "fmla z30.s, z21.s, z2.s[1]\n" + "fmla z31.s, z21.s, z3.s[1]\n" + "fmla z28.s, z22.s, z0.s[2]\n" + "fmla z29.s, z22.s, z1.s[2]\n" + "fmla z30.s, z22.s, z2.s[2]\n" + "fmla z31.s, z22.s, z3.s[2]\n" + "fmla z28.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x50]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "fmla z29.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x50]\n" + "fmla z30.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x50]\n" + "fmla z31.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x50]\n" + "fmla 
z28.s, z24.s, z0.s[0]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "fmla z29.s, z24.s, z1.s[0]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z30.s, z24.s, z2.s[0]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "fmla z31.s, z24.s, z3.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], LSL #2\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, %[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[oddrows], %[oddrows], #0x1\n" + "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n" + "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n" + "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n" + "fmla z28.s, z8.s, z1.s[0]\n" + "fmla z28.s, z9.s, z1.s[1]\n" + "fmla z28.s, z10.s, z1.s[2]\n" + "fmla z28.s, z11.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [%[a_ptr0], #0x50]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "fmla z28.s, z12.s, z2.s[0]\n" + "fmla z28.s, z13.s, z2.s[1]\n" + "fmla z28.s, z14.s, z2.s[2]\n" + "fmla z28.s, z15.s, z2.s[3]\n" + "fmla z28.s, z16.s, z3.s[0]\n" + "fmla z28.s, z17.s, z3.s[1]\n" + "fmla z28.s, z18.s, z3.s[2]\n" + "fmla z28.s, z19.s, z3.s[3]\n" + "fmla z28.s, z20.s, z0.s[0]\n" + "fmla z28.s, z21.s, z0.s[1]\n" + "fmla z28.s, z22.s, z0.s[2]\n" + "fmla z28.s, z23.s, z0.s[3]\n" + "fmla z28.s, z24.s, z1.s[0]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), 
[c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 22: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + "ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "ld1w z5.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z6.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z7.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z9.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z10.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z11.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z13.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z14.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z15.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z16.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z17.s, p0/z, [%[b_ptr0]]\n" + "add 
%[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z18.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z19.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z20.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z21.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z22.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z23.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z24.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z25.s, p0/z, [%[b_ptr0]]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + "fmla z30.s, z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z29.s, z6.s, z1.s[2]\n" + "fmla z30.s, z6.s, z2.s[2]\n" + "fmla z31.s, z6.s, z3.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z29.s, z7.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z30.s, z7.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z31.s, z7.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z8.s, z0.s[0]\n" + "fmla z29.s, z8.s, z1.s[0]\n" + "fmla z30.s, z8.s, z2.s[0]\n" + "fmla z31.s, z8.s, z3.s[0]\n" + "fmla z28.s, z9.s, z0.s[1]\n" + "fmla z29.s, z9.s, z1.s[1]\n" + "fmla z30.s, z9.s, z2.s[1]\n" + "fmla z31.s, z9.s, 
z3.s[1]\n" + "fmla z28.s, z10.s, z0.s[2]\n" + "fmla z29.s, z10.s, z1.s[2]\n" + "fmla z30.s, z10.s, z2.s[2]\n" + "fmla z31.s, z10.s, z3.s[2]\n" + "fmla z28.s, z11.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z29.s, z11.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z30.s, z11.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z31.s, z11.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z12.s, z0.s[0]\n" + "fmla z29.s, z12.s, z1.s[0]\n" + "fmla z30.s, z12.s, z2.s[0]\n" + "fmla z31.s, z12.s, z3.s[0]\n" + "fmla z28.s, z13.s, z0.s[1]\n" + "fmla z29.s, z13.s, z1.s[1]\n" + "fmla z30.s, z13.s, z2.s[1]\n" + "fmla z31.s, z13.s, z3.s[1]\n" + "fmla z28.s, z14.s, z0.s[2]\n" + "fmla z29.s, z14.s, z1.s[2]\n" + "fmla z30.s, z14.s, z2.s[2]\n" + "fmla z31.s, z14.s, z3.s[2]\n" + "fmla z28.s, z15.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z29.s, z15.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" + "fmla z30.s, z15.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" + "fmla z31.s, z15.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z16.s, z0.s[0]\n" + "fmla z29.s, z16.s, z1.s[0]\n" + "fmla z30.s, z16.s, z2.s[0]\n" + "fmla z31.s, z16.s, z3.s[0]\n" + "fmla z28.s, z17.s, z0.s[1]\n" + "fmla z29.s, z17.s, z1.s[1]\n" + "fmla z30.s, z17.s, z2.s[1]\n" + "fmla z31.s, z17.s, z3.s[1]\n" + "fmla z28.s, z18.s, z0.s[2]\n" + "fmla z29.s, z18.s, z1.s[2]\n" + "fmla z30.s, z18.s, z2.s[2]\n" + "fmla z31.s, z18.s, z3.s[2]\n" + "fmla z28.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n" + "fmla z29.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n" + "fmla z30.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n" + "fmla z31.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n" + "fmla z28.s, z20.s, z0.s[0]\n" + "fmla z29.s, z20.s, z1.s[0]\n" + "fmla z30.s, z20.s, z2.s[0]\n" + "fmla z31.s, z20.s, z3.s[0]\n" + "fmla z28.s, z21.s, z0.s[1]\n" + 
"fmla z29.s, z21.s, z1.s[1]\n" + "fmla z30.s, z21.s, z2.s[1]\n" + "fmla z31.s, z21.s, z3.s[1]\n" + "fmla z28.s, z22.s, z0.s[2]\n" + "fmla z29.s, z22.s, z1.s[2]\n" + "fmla z30.s, z22.s, z2.s[2]\n" + "fmla z31.s, z22.s, z3.s[2]\n" + "fmla z28.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x50]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "fmla z29.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x50]\n" + "fmla z30.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x50]\n" + "fmla z31.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x50]\n" + "fmla z28.s, z24.s, z0.s[0]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "fmla z29.s, z24.s, z1.s[0]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z30.s, z24.s, z2.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], LSL #2\n" + "fmla z31.s, z24.s, z3.s[0]\n" + "fmla z28.s, z25.s, z0.s[1]\n" + "fmla z29.s, z25.s, z1.s[1]\n" + "fmla z30.s, z25.s, z2.s[1]\n" + "fmla z31.s, z25.s, z3.s[1]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, %[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[oddrows], %[oddrows], #0x1\n" + "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n" + "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n" + "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n" + "fmla z28.s, z8.s, z1.s[0]\n" + "fmla z28.s, z9.s, z1.s[1]\n" + "fmla z28.s, z10.s, z1.s[2]\n" + "fmla z28.s, z11.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [%[a_ptr0], #0x50]\n" + "add %[a_ptr0], 
%[a_ptr0], %[lda]\n" + "fmla z28.s, z12.s, z2.s[0]\n" + "fmla z28.s, z13.s, z2.s[1]\n" + "fmla z28.s, z14.s, z2.s[2]\n" + "fmla z28.s, z15.s, z2.s[3]\n" + "fmla z28.s, z16.s, z3.s[0]\n" + "fmla z28.s, z17.s, z3.s[1]\n" + "fmla z28.s, z18.s, z3.s[2]\n" + "fmla z28.s, z19.s, z3.s[3]\n" + "fmla z28.s, z20.s, z0.s[0]\n" + "fmla z28.s, z21.s, z0.s[1]\n" + "fmla z28.s, z22.s, z0.s[2]\n" + "fmla z28.s, z23.s, z0.s[3]\n" + "fmla z28.s, z24.s, z1.s[0]\n" + "fmla z28.s, z25.s, z1.s[1]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 23: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + "ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "ld1w z5.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z6.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z7.s, p0/z, [%[b_ptr0]]\n" + 
"add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z9.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z10.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z11.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z13.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z14.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z15.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z16.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z17.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z18.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z19.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z20.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z21.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z22.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z23.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z24.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z25.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z26.s, p0/z, [%[b_ptr0]]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "fmla 
z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + "fmla z30.s, z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z29.s, z6.s, z1.s[2]\n" + "fmla z30.s, z6.s, z2.s[2]\n" + "fmla z31.s, z6.s, z3.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z29.s, z7.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z30.s, z7.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z31.s, z7.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z8.s, z0.s[0]\n" + "fmla z29.s, z8.s, z1.s[0]\n" + "fmla z30.s, z8.s, z2.s[0]\n" + "fmla z31.s, z8.s, z3.s[0]\n" + "fmla z28.s, z9.s, z0.s[1]\n" + "fmla z29.s, z9.s, z1.s[1]\n" + "fmla z30.s, z9.s, z2.s[1]\n" + "fmla z31.s, z9.s, z3.s[1]\n" + "fmla z28.s, z10.s, z0.s[2]\n" + "fmla z29.s, z10.s, z1.s[2]\n" + "fmla z30.s, z10.s, z2.s[2]\n" + "fmla z31.s, z10.s, z3.s[2]\n" + "fmla z28.s, z11.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z29.s, z11.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z30.s, z11.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z31.s, z11.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z12.s, z0.s[0]\n" + "fmla z29.s, z12.s, z1.s[0]\n" + "fmla z30.s, z12.s, z2.s[0]\n" + "fmla z31.s, z12.s, z3.s[0]\n" + "fmla z28.s, z13.s, z0.s[1]\n" + "fmla z29.s, z13.s, z1.s[1]\n" + "fmla z30.s, z13.s, z2.s[1]\n" + "fmla z31.s, z13.s, z3.s[1]\n" + "fmla z28.s, z14.s, z0.s[2]\n" + "fmla z29.s, z14.s, z1.s[2]\n" + "fmla z30.s, z14.s, z2.s[2]\n" + "fmla z31.s, z14.s, z3.s[2]\n" + "fmla z28.s, z15.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z29.s, z15.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" + "fmla z30.s, z15.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" + "fmla z31.s, z15.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z16.s, z0.s[0]\n" + "fmla 
z29.s, z16.s, z1.s[0]\n" + "fmla z30.s, z16.s, z2.s[0]\n" + "fmla z31.s, z16.s, z3.s[0]\n" + "fmla z28.s, z17.s, z0.s[1]\n" + "fmla z29.s, z17.s, z1.s[1]\n" + "fmla z30.s, z17.s, z2.s[1]\n" + "fmla z31.s, z17.s, z3.s[1]\n" + "fmla z28.s, z18.s, z0.s[2]\n" + "fmla z29.s, z18.s, z1.s[2]\n" + "fmla z30.s, z18.s, z2.s[2]\n" + "fmla z31.s, z18.s, z3.s[2]\n" + "fmla z28.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n" + "fmla z29.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n" + "fmla z30.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n" + "fmla z31.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n" + "fmla z28.s, z20.s, z0.s[0]\n" + "fmla z29.s, z20.s, z1.s[0]\n" + "fmla z30.s, z20.s, z2.s[0]\n" + "fmla z31.s, z20.s, z3.s[0]\n" + "fmla z28.s, z21.s, z0.s[1]\n" + "fmla z29.s, z21.s, z1.s[1]\n" + "fmla z30.s, z21.s, z2.s[1]\n" + "fmla z31.s, z21.s, z3.s[1]\n" + "fmla z28.s, z22.s, z0.s[2]\n" + "fmla z29.s, z22.s, z1.s[2]\n" + "fmla z30.s, z22.s, z2.s[2]\n" + "fmla z31.s, z22.s, z3.s[2]\n" + "fmla z28.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x50]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "fmla z29.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x50]\n" + "fmla z30.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x50]\n" + "fmla z31.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x50]\n" + "fmla z28.s, z24.s, z0.s[0]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "fmla z29.s, z24.s, z1.s[0]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z30.s, z24.s, z2.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], LSL #2\n" + "fmla z31.s, z24.s, z3.s[0]\n" + "fmla z28.s, z25.s, z0.s[1]\n" + "fmla z29.s, z25.s, z1.s[1]\n" + "fmla z30.s, z25.s, z2.s[1]\n" + "fmla z31.s, z25.s, z3.s[1]\n" + "fmla z28.s, z26.s, z0.s[2]\n" + "fmla z29.s, z26.s, z1.s[2]\n" + "fmla z30.s, z26.s, z2.s[2]\n" + "fmla z31.s, z26.s, z3.s[2]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + 
"st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, %[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[oddrows], %[oddrows], #0x1\n" + "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n" + "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n" + "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n" + "fmla z28.s, z8.s, z1.s[0]\n" + "fmla z28.s, z9.s, z1.s[1]\n" + "fmla z28.s, z10.s, z1.s[2]\n" + "fmla z28.s, z11.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [%[a_ptr0], #0x50]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "fmla z28.s, z12.s, z2.s[0]\n" + "fmla z28.s, z13.s, z2.s[1]\n" + "fmla z28.s, z14.s, z2.s[2]\n" + "fmla z28.s, z15.s, z2.s[3]\n" + "fmla z28.s, z16.s, z3.s[0]\n" + "fmla z28.s, z17.s, z3.s[1]\n" + "fmla z28.s, z18.s, z3.s[2]\n" + "fmla z28.s, z19.s, z3.s[3]\n" + "fmla z28.s, z20.s, z0.s[0]\n" + "fmla z28.s, z21.s, z0.s[1]\n" + "fmla z28.s, z22.s, z0.s[2]\n" + "fmla z28.s, z23.s, z0.s[3]\n" + "fmla z28.s, z24.s, z1.s[0]\n" + "fmla z28.s, z25.s, z1.s[1]\n" + "fmla z28.s, z26.s, z1.s[2]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb) + : "x0", "x1", 
"x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + default: + case 24: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + "ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "ld1w z5.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z6.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z7.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z9.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z10.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z11.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z13.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z14.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z15.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z16.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z17.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z18.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z19.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z20.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], 
%[ldb]\n" + "ld1w z21.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z22.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z23.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z24.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z25.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z26.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z27.s, p0/z, [%[b_ptr0]]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + "fmla z30.s, z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z29.s, z6.s, z1.s[2]\n" + "fmla z30.s, z6.s, z2.s[2]\n" + "fmla z31.s, z6.s, z3.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z29.s, z7.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z30.s, z7.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z31.s, z7.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z8.s, z0.s[0]\n" + "fmla z29.s, z8.s, z1.s[0]\n" + "fmla z30.s, z8.s, z2.s[0]\n" + "fmla z31.s, z8.s, z3.s[0]\n" + "fmla z28.s, z9.s, z0.s[1]\n" + "fmla z29.s, z9.s, z1.s[1]\n" + "fmla z30.s, z9.s, z2.s[1]\n" + "fmla z31.s, z9.s, z3.s[1]\n" + "fmla z28.s, z10.s, z0.s[2]\n" + "fmla z29.s, z10.s, z1.s[2]\n" + "fmla z30.s, z10.s, 
z2.s[2]\n" + "fmla z31.s, z10.s, z3.s[2]\n" + "fmla z28.s, z11.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z29.s, z11.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z30.s, z11.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z31.s, z11.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z12.s, z0.s[0]\n" + "fmla z29.s, z12.s, z1.s[0]\n" + "fmla z30.s, z12.s, z2.s[0]\n" + "fmla z31.s, z12.s, z3.s[0]\n" + "fmla z28.s, z13.s, z0.s[1]\n" + "fmla z29.s, z13.s, z1.s[1]\n" + "fmla z30.s, z13.s, z2.s[1]\n" + "fmla z31.s, z13.s, z3.s[1]\n" + "fmla z28.s, z14.s, z0.s[2]\n" + "fmla z29.s, z14.s, z1.s[2]\n" + "fmla z30.s, z14.s, z2.s[2]\n" + "fmla z31.s, z14.s, z3.s[2]\n" + "fmla z28.s, z15.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z29.s, z15.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" + "fmla z30.s, z15.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" + "fmla z31.s, z15.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z16.s, z0.s[0]\n" + "fmla z29.s, z16.s, z1.s[0]\n" + "fmla z30.s, z16.s, z2.s[0]\n" + "fmla z31.s, z16.s, z3.s[0]\n" + "fmla z28.s, z17.s, z0.s[1]\n" + "fmla z29.s, z17.s, z1.s[1]\n" + "fmla z30.s, z17.s, z2.s[1]\n" + "fmla z31.s, z17.s, z3.s[1]\n" + "fmla z28.s, z18.s, z0.s[2]\n" + "fmla z29.s, z18.s, z1.s[2]\n" + "fmla z30.s, z18.s, z2.s[2]\n" + "fmla z31.s, z18.s, z3.s[2]\n" + "fmla z28.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n" + "fmla z29.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n" + "fmla z30.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n" + "fmla z31.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n" + "fmla z28.s, z20.s, z0.s[0]\n" + "fmla z29.s, z20.s, z1.s[0]\n" + "fmla z30.s, z20.s, z2.s[0]\n" + "fmla z31.s, z20.s, z3.s[0]\n" + "fmla z28.s, z21.s, z0.s[1]\n" + "fmla z29.s, z21.s, z1.s[1]\n" + "fmla z30.s, z21.s, z2.s[1]\n" + "fmla z31.s, z21.s, z3.s[1]\n" + 
"fmla z28.s, z22.s, z0.s[2]\n" + "fmla z29.s, z22.s, z1.s[2]\n" + "fmla z30.s, z22.s, z2.s[2]\n" + "fmla z31.s, z22.s, z3.s[2]\n" + "fmla z28.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x50]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "fmla z29.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x50]\n" + "fmla z30.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x50]\n" + "fmla z31.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x50]\n" + "fmla z28.s, z24.s, z0.s[0]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "fmla z29.s, z24.s, z1.s[0]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z30.s, z24.s, z2.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], LSL #2\n" + "fmla z31.s, z24.s, z3.s[0]\n" + "fmla z28.s, z25.s, z0.s[1]\n" + "fmla z29.s, z25.s, z1.s[1]\n" + "fmla z30.s, z25.s, z2.s[1]\n" + "fmla z31.s, z25.s, z3.s[1]\n" + "fmla z28.s, z26.s, z0.s[2]\n" + "fmla z29.s, z26.s, z1.s[2]\n" + "fmla z30.s, z26.s, z2.s[2]\n" + "fmla z31.s, z26.s, z3.s[2]\n" + "fmla z28.s, z27.s, z0.s[3]\n" + "fmla z29.s, z27.s, z1.s[3]\n" + "fmla z30.s, z27.s, z2.s[3]\n" + "fmla z31.s, z27.s, z3.s[3]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, %[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[oddrows], %[oddrows], #0x1\n" + "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n" + "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n" + "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n" + "fmla z28.s, z8.s, 
z1.s[0]\n" + "fmla z28.s, z9.s, z1.s[1]\n" + "fmla z28.s, z10.s, z1.s[2]\n" + "fmla z28.s, z11.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x50]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "fmla z28.s, z12.s, z2.s[0]\n" + "fmla z28.s, z13.s, z2.s[1]\n" + "fmla z28.s, z14.s, z2.s[2]\n" + "fmla z28.s, z15.s, z2.s[3]\n" + "fmla z28.s, z16.s, z3.s[0]\n" + "fmla z28.s, z17.s, z3.s[1]\n" + "fmla z28.s, z18.s, z3.s[2]\n" + "fmla z28.s, z19.s, z3.s[3]\n" + "fmla z28.s, z20.s, z0.s[0]\n" + "fmla z28.s, z21.s, z0.s[1]\n" + "fmla z28.s, z22.s, z0.s[2]\n" + "fmla z28.s, z23.s, z0.s[3]\n" + "fmla z28.s, z24.s, z1.s[0]\n" + "fmla z28.s, z25.s, z1.s[1]\n" + "fmla z28.s, z26.s, z1.s[2]\n" + "fmla z28.s, z27.s, z1.s[3]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + } + } +} + +} // namespace arm_gemm + +#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx4.hpp new file mode 100644 index 0000000000..aa2c522382 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx4.hpp @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2019 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+#pragma once
+
+#ifdef __ARM_FEATURE_SVE
+
+
+
+namespace arm_gemm
+{
+
+// Actual kernel implementations
+//
+// Prototype of the generic kernel defined in
+// sve_smallK_hybrid_fp32_mla_1VLx4/generic.cpp. The parameters are unnamed
+// here; the definition names them, in order:
+//   A, lda, B, C, ldc, beta, M, N, K
+void sve_smallK_hybrid_fp32_mla_1VLx4(const float *, int, const float *, float *, int, float, int, int, int);
+
+// Descriptor for the kernel declared above. It exposes the operand/result
+// element types, the kernel's function-pointer type, its blocking
+// parameters, and a pointer to the implementation to call.
+class smallK_hybrid_fp32_mla_1VLx4
+{
+public:
+    // Element types for the inputs and for the output; both are float.
+    typedef float operand_type;
+    typedef float result_type;
+
+    // Function-pointer type with the same parameter list as the prototype
+    // above.
+    typedef void (*kern_type)(const float *, int, const float *, float *, int, float, int, int, int);
+
+    /* Kernel blocking parameters */
+
+    // Output rows per block. This matches the generic kernel, whose main
+    // loop handles four rows of A/C per iteration (loops_count = M / 4).
+    static int out_height()
+    {
+        return 4;
+    }
+
+    // Output columns per block: one vector length's worth of elements.
+    // NOTE(review): get_vector_length appears here without a template
+    // argument list. If it is a function template, the argument (presumably
+    // <float>) may have been lost when this patch was rendered to text --
+    // confirm against the repository.
+    static int out_width()
+    {
+        return get_vector_length() * 1;
+    }
+
+    // Unroll factor along K reported by this descriptor; always 1.
+    // NOTE(review): how callers consume this value is not visible here.
+    static int k_unroll()
+    {
+        return 1;
+    }
+
+    // Value-initialised transforms helper; its definition is not part of
+    // this file.
+    // NOTE(review): StdTransformsSVE is written without a template argument
+    // list; the arguments may have been stripped when this patch was
+    // rendered to text -- confirm against the repository.
+    StdTransformsSVE transforms = {};
+
+    // Default to the generic kernel
+    kern_type kernel=sve_smallK_hybrid_fp32_mla_1VLx4;
+
+    // The CPUInfo pointer is accepted but not used: the body is empty, so
+    // `kernel` always keeps the generic default set above.
+    smallK_hybrid_fp32_mla_1VLx4(const CPUInfo *ci)
+    {
+
+    }
+};
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx4/generic.cpp
new file mode 100644
index 0000000000..3e7e713106
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx4/generic.cpp
@@ -0,0 +1,4004 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifdef __ARM_FEATURE_SVE + +#include + + +#include "../../asmlib.hpp" +#include "../../utils.hpp" + +namespace arm_gemm { + +void sve_smallK_hybrid_fp32_mla_1VLx4(const float *A, int lda, const float *B, float *C, int ldc, float beta, int M, int N, int K) { + const long beta0 = (beta == 0.0f); + + const long loops_count = M / 4; + const long oddrow_count = M % 4; + const long ldab = lda * sizeof(float); + const long ldcb = ldc * sizeof(float); + const int K_stride = K; + const long odd_depth = K % 4; + const float *betaptr = β + + for (int x0=0; x0() * 1)) { + const long width = std::min((unsigned long)N-x0, (get_vector_length() * 1)); + long loops = loops_count; + long oddrows = oddrow_count; + long temp = 0; + const float *b_ptr0 = B + (K_stride * x0); + + const float *a_ptr0 = A; + + float *c_ptr0 = C + x0; + + switch(K) { + case 1: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + "ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p7/z, [%[b_ptr0]]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, 
p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0]]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "ld1rqw z1.s, p6/z, [a_ptr1]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "ld1rqw z2.s, p6/z, [a_ptr2]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "ld1rqw z3.s, p6/z, [a_ptr3]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], LSL #2\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "subs %[loops], %[loops], #0x1\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, %[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0]]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "subs %[oddrows], %[oddrows], #0x1\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 2: 
+ __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + "ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p7/z, [%[b_ptr0]]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0]]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "ld1rqw z1.s, p6/z, [a_ptr1]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "ld1rqw z2.s, p6/z, [a_ptr2]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "ld1rqw z3.s, p6/z, [a_ptr3]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], LSL #2\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "subs %[loops], %[loops], #0x1\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + "fmla z30.s, z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, %[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0]]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "subs %[oddrows], %[oddrows], 
#0x1\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 3: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + "ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p7/z, [%[b_ptr0]]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0]]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "ld1rqw z1.s, p6/z, [a_ptr1]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "ld1rqw z2.s, p6/z, [a_ptr2]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z28.s, z4.s, 
z0.s[0]\n" + "ld1rqw z3.s, p6/z, [a_ptr3]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], LSL #2\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "subs %[loops], %[loops], #0x1\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + "fmla z30.s, z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z29.s, z6.s, z1.s[2]\n" + "fmla z30.s, z6.s, z2.s[2]\n" + "fmla z31.s, z6.s, z3.s[2]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, %[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0]]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "subs %[oddrows], %[oddrows], #0x1\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 4: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + 
"a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + "ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p7/z, [%[b_ptr0]]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], LSL #2\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "subs %[loops], %[loops], #0x1\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + "fmla z30.s, z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z29.s, z6.s, z1.s[2]\n" + "fmla z30.s, z6.s, z2.s[2]\n" + "fmla z31.s, z6.s, z3.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "fmla z29.s, z7.s, z1.s[3]\n" + "fmla z30.s, z7.s, z2.s[3]\n" + "fmla z31.s, z7.s, z3.s[3]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, 
%[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "subs %[oddrows], %[oddrows], #0x1\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 5: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + "ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p7/z, [%[b_ptr0]]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" 
+ "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + "fmla z30.s, z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z29.s, z6.s, z1.s[2]\n" + "fmla z30.s, z6.s, z2.s[2]\n" + "fmla z31.s, z6.s, z3.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "fmla z29.s, z7.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n" + "fmla z30.s, z7.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n" + "fmla z31.s, z7.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z8.s, z0.s[0]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "fmla z29.s, z8.s, z1.s[0]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z30.s, z8.s, z2.s[0]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "fmla z31.s, z8.s, z3.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], LSL #2\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, %[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[oddrows], %[oddrows], #0x1\n" + "ld1rqw z1.s, p6/z, [%[a_ptr0], #0x10]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla 
z28.s, z5.s, z0.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "fmla z28.s, z8.s, z1.s[0]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 6: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + "ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p7/z, [%[b_ptr0]]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + 
"ld1rqw z1.s, p7/z, [a_ptr1]\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + "fmla z30.s, z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z29.s, z6.s, z1.s[2]\n" + "fmla z30.s, z6.s, z2.s[2]\n" + "fmla z31.s, z6.s, z3.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "fmla z29.s, z7.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n" + "fmla z30.s, z7.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n" + "fmla z31.s, z7.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z8.s, z0.s[0]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "fmla z29.s, z8.s, z1.s[0]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z30.s, z8.s, z2.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], LSL #2\n" + "fmla z31.s, z8.s, z3.s[0]\n" + "fmla z28.s, z9.s, z0.s[1]\n" + "fmla z29.s, z9.s, z1.s[1]\n" + "fmla z30.s, z9.s, z2.s[1]\n" + "fmla z31.s, z9.s, z3.s[1]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, %[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[oddrows], %[oddrows], #0x1\n" + "ld1rqw z1.s, p6/z, [%[a_ptr0], #0x10]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "fmla z28.s, z8.s, z1.s[0]\n" + "fmla 
z28.s, z9.s, z1.s[1]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 7: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + "ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p7/z, [%[b_ptr0]]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "ld1rqw z2.s, 
p7/z, [a_ptr2]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + "fmla z30.s, z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z29.s, z6.s, z1.s[2]\n" + "fmla z30.s, z6.s, z2.s[2]\n" + "fmla z31.s, z6.s, z3.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "fmla z29.s, z7.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n" + "fmla z30.s, z7.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n" + "fmla z31.s, z7.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z8.s, z0.s[0]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "fmla z29.s, z8.s, z1.s[0]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z30.s, z8.s, z2.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], LSL #2\n" + "fmla z31.s, z8.s, z3.s[0]\n" + "fmla z28.s, z9.s, z0.s[1]\n" + "fmla z29.s, z9.s, z1.s[1]\n" + "fmla z30.s, z9.s, z2.s[1]\n" + "fmla z31.s, z9.s, z3.s[1]\n" + "fmla z28.s, z10.s, z0.s[2]\n" + "fmla z29.s, z10.s, z1.s[2]\n" + "fmla z30.s, z10.s, z2.s[2]\n" + "fmla z31.s, z10.s, z3.s[2]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, %[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[oddrows], %[oddrows], #0x1\n" + "ld1rqw z1.s, p6/z, [%[a_ptr0], #0x10]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z28.s, z6.s, 
z0.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "fmla z28.s, z8.s, z1.s[0]\n" + "fmla z28.s, z9.s, z1.s[1]\n" + "fmla z28.s, z10.s, z1.s[2]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 8: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + "ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p7/z, [%[b_ptr0]]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + 
"ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + "fmla z30.s, z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z29.s, z6.s, z1.s[2]\n" + "fmla z30.s, z6.s, z2.s[2]\n" + "fmla z31.s, z6.s, z3.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "fmla z29.s, z7.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z30.s, z7.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z31.s, z7.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z8.s, z0.s[0]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "fmla z29.s, z8.s, z1.s[0]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z30.s, z8.s, z2.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], LSL #2\n" + "fmla z31.s, z8.s, z3.s[0]\n" + "fmla z28.s, z9.s, z0.s[1]\n" + "fmla z29.s, z9.s, z1.s[1]\n" + "fmla z30.s, z9.s, z2.s[1]\n" + "fmla z31.s, z9.s, z3.s[1]\n" + "fmla z28.s, z10.s, z0.s[2]\n" + "fmla z29.s, z10.s, z1.s[2]\n" + "fmla z30.s, z10.s, z2.s[2]\n" + "fmla z31.s, z10.s, z3.s[2]\n" + "fmla z28.s, z11.s, z0.s[3]\n" + "fmla z29.s, z11.s, z1.s[3]\n" + "fmla z30.s, z11.s, z2.s[3]\n" + "fmla z31.s, z11.s, z3.s[3]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, %[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + 
"7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[oddrows], %[oddrows], #0x1\n" + "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "fmla z28.s, z8.s, z1.s[0]\n" + "fmla z28.s, z9.s, z1.s[1]\n" + "fmla z28.s, z10.s, z1.s[2]\n" + "fmla z28.s, z11.s, z1.s[3]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 9: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + "ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p7/z, [%[b_ptr0]]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z10.s, p7/z, 
[%[b_ptr0], #6, MUL VL]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + "fmla z30.s, z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z29.s, z6.s, z1.s[2]\n" + "fmla z30.s, z6.s, z2.s[2]\n" + "fmla z31.s, z6.s, z3.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z29.s, z7.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z30.s, z7.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z31.s, z7.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z8.s, z0.s[0]\n" + "fmla z29.s, z8.s, z1.s[0]\n" + "fmla z30.s, z8.s, z2.s[0]\n" + "fmla z31.s, z8.s, z3.s[0]\n" + "fmla z28.s, z9.s, z0.s[1]\n" + "fmla z29.s, z9.s, z1.s[1]\n" + "fmla z30.s, z9.s, z2.s[1]\n" + "fmla z31.s, z9.s, z3.s[1]\n" + "fmla z28.s, z10.s, z0.s[2]\n" + "fmla z29.s, z10.s, z1.s[2]\n" + "fmla z30.s, z10.s, z2.s[2]\n" + "fmla z31.s, z10.s, z3.s[2]\n" + "fmla z28.s, z11.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x20]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "fmla z29.s, z11.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x20]\n" + "fmla z30.s, z11.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x20]\n" + "fmla z31.s, z11.s, 
z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z12.s, z0.s[0]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "fmla z29.s, z12.s, z1.s[0]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z30.s, z12.s, z2.s[0]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "fmla z31.s, z12.s, z3.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], LSL #2\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, %[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[oddrows], %[oddrows], #0x1\n" + "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n" + "ld1rqw z2.s, p6/z, [%[a_ptr0], #0x20]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "fmla z28.s, z8.s, z1.s[0]\n" + "fmla z28.s, z9.s, z1.s[1]\n" + "fmla z28.s, z10.s, z1.s[2]\n" + "fmla z28.s, z11.s, z1.s[3]\n" + "fmla z28.s, z12.s, z2.s[0]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", 
"z30", "z31", "cc", "memory" + ); + break; + case 10: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + "ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p7/z, [%[b_ptr0]]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + "fmla z30.s, z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z29.s, z6.s, z1.s[2]\n" + "fmla z30.s, z6.s, z2.s[2]\n" + "fmla z31.s, z6.s, z3.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z29.s, z7.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, 
[a_ptr1, #0x10]\n" + "fmla z30.s, z7.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z31.s, z7.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z8.s, z0.s[0]\n" + "fmla z29.s, z8.s, z1.s[0]\n" + "fmla z30.s, z8.s, z2.s[0]\n" + "fmla z31.s, z8.s, z3.s[0]\n" + "fmla z28.s, z9.s, z0.s[1]\n" + "fmla z29.s, z9.s, z1.s[1]\n" + "fmla z30.s, z9.s, z2.s[1]\n" + "fmla z31.s, z9.s, z3.s[1]\n" + "fmla z28.s, z10.s, z0.s[2]\n" + "fmla z29.s, z10.s, z1.s[2]\n" + "fmla z30.s, z10.s, z2.s[2]\n" + "fmla z31.s, z10.s, z3.s[2]\n" + "fmla z28.s, z11.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x20]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "fmla z29.s, z11.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x20]\n" + "fmla z30.s, z11.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x20]\n" + "fmla z31.s, z11.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z12.s, z0.s[0]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "fmla z29.s, z12.s, z1.s[0]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z30.s, z12.s, z2.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], LSL #2\n" + "fmla z31.s, z12.s, z3.s[0]\n" + "fmla z28.s, z13.s, z0.s[1]\n" + "fmla z29.s, z13.s, z1.s[1]\n" + "fmla z30.s, z13.s, z2.s[1]\n" + "fmla z31.s, z13.s, z3.s[1]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, %[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[oddrows], %[oddrows], #0x1\n" + "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n" + "ld1rqw z2.s, p6/z, [%[a_ptr0], #0x20]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z28.s, 
z5.s, z0.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "fmla z28.s, z8.s, z1.s[0]\n" + "fmla z28.s, z9.s, z1.s[1]\n" + "fmla z28.s, z10.s, z1.s[2]\n" + "fmla z28.s, z11.s, z1.s[3]\n" + "fmla z28.s, z12.s, z2.s[0]\n" + "fmla z28.s, z13.s, z2.s[1]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 11: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + "ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p7/z, [%[b_ptr0]]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], 
#-7, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + "fmla z30.s, z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z29.s, z6.s, z1.s[2]\n" + "fmla z30.s, z6.s, z2.s[2]\n" + "fmla z31.s, z6.s, z3.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z29.s, z7.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z30.s, z7.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z31.s, z7.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z8.s, z0.s[0]\n" + "fmla z29.s, z8.s, z1.s[0]\n" + "fmla z30.s, z8.s, z2.s[0]\n" + "fmla z31.s, z8.s, z3.s[0]\n" + "fmla z28.s, z9.s, z0.s[1]\n" + "fmla z29.s, z9.s, z1.s[1]\n" + "fmla z30.s, z9.s, z2.s[1]\n" + "fmla z31.s, z9.s, z3.s[1]\n" + "fmla z28.s, z10.s, z0.s[2]\n" + "fmla z29.s, z10.s, z1.s[2]\n" + "fmla z30.s, z10.s, z2.s[2]\n" + "fmla z31.s, z10.s, z3.s[2]\n" + "fmla z28.s, z11.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x20]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "fmla z29.s, z11.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x20]\n" + "fmla z30.s, z11.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x20]\n" + "fmla z31.s, z11.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z12.s, z0.s[0]\n" + "add a_ptr1, 
a_ptr1, %[lda], LSL #2\n" + "fmla z29.s, z12.s, z1.s[0]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z30.s, z12.s, z2.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], LSL #2\n" + "fmla z31.s, z12.s, z3.s[0]\n" + "fmla z28.s, z13.s, z0.s[1]\n" + "fmla z29.s, z13.s, z1.s[1]\n" + "fmla z30.s, z13.s, z2.s[1]\n" + "fmla z31.s, z13.s, z3.s[1]\n" + "fmla z28.s, z14.s, z0.s[2]\n" + "fmla z29.s, z14.s, z1.s[2]\n" + "fmla z30.s, z14.s, z2.s[2]\n" + "fmla z31.s, z14.s, z3.s[2]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, %[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[oddrows], %[oddrows], #0x1\n" + "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n" + "ld1rqw z2.s, p6/z, [%[a_ptr0], #0x20]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "fmla z28.s, z8.s, z1.s[0]\n" + "fmla z28.s, z9.s, z1.s[1]\n" + "fmla z28.s, z10.s, z1.s[2]\n" + "fmla z28.s, z11.s, z1.s[3]\n" + "fmla z28.s, z12.s, z2.s[0]\n" + "fmla z28.s, z13.s, z2.s[1]\n" + "fmla z28.s, z14.s, z2.s[2]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb) + : "x0", 
"x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 12: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + "ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p7/z, [%[b_ptr0]]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + 
"fmla z30.s, z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z29.s, z6.s, z1.s[2]\n" + "fmla z30.s, z6.s, z2.s[2]\n" + "fmla z31.s, z6.s, z3.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z29.s, z7.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z30.s, z7.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z31.s, z7.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z8.s, z0.s[0]\n" + "fmla z29.s, z8.s, z1.s[0]\n" + "fmla z30.s, z8.s, z2.s[0]\n" + "fmla z31.s, z8.s, z3.s[0]\n" + "fmla z28.s, z9.s, z0.s[1]\n" + "fmla z29.s, z9.s, z1.s[1]\n" + "fmla z30.s, z9.s, z2.s[1]\n" + "fmla z31.s, z9.s, z3.s[1]\n" + "fmla z28.s, z10.s, z0.s[2]\n" + "fmla z29.s, z10.s, z1.s[2]\n" + "fmla z30.s, z10.s, z2.s[2]\n" + "fmla z31.s, z10.s, z3.s[2]\n" + "fmla z28.s, z11.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "fmla z29.s, z11.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z30.s, z11.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z31.s, z11.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z12.s, z0.s[0]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "fmla z29.s, z12.s, z1.s[0]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z30.s, z12.s, z2.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], LSL #2\n" + "fmla z31.s, z12.s, z3.s[0]\n" + "fmla z28.s, z13.s, z0.s[1]\n" + "fmla z29.s, z13.s, z1.s[1]\n" + "fmla z30.s, z13.s, z2.s[1]\n" + "fmla z31.s, z13.s, z3.s[1]\n" + "fmla z28.s, z14.s, z0.s[2]\n" + "fmla z29.s, z14.s, z1.s[2]\n" + "fmla z30.s, z14.s, z2.s[2]\n" + "fmla z31.s, z14.s, z3.s[2]\n" + "fmla z28.s, z15.s, z0.s[3]\n" + "fmla z29.s, z15.s, z1.s[3]\n" + "fmla z30.s, z15.s, z2.s[3]\n" + "fmla z31.s, z15.s, z3.s[3]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, 
c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, %[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[oddrows], %[oddrows], #0x1\n" + "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n" + "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "fmla z28.s, z8.s, z1.s[0]\n" + "fmla z28.s, z9.s, z1.s[1]\n" + "fmla z28.s, z10.s, z1.s[2]\n" + "fmla z28.s, z11.s, z1.s[3]\n" + "fmla z28.s, z12.s, z2.s[0]\n" + "fmla z28.s, z13.s, z2.s[1]\n" + "fmla z28.s, z14.s, z2.s[2]\n" + "fmla z28.s, z15.s, z2.s[3]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 13: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, 
%[temp], %[width]\n" + "ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p7/z, [%[b_ptr0]]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + "ld1w z16.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + "fmla z30.s, z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z29.s, z6.s, z1.s[2]\n" + "fmla z30.s, z6.s, z2.s[2]\n" + "fmla z31.s, z6.s, z3.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z29.s, z7.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z30.s, z7.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z31.s, z7.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z8.s, 
z0.s[0]\n" + "fmla z29.s, z8.s, z1.s[0]\n" + "fmla z30.s, z8.s, z2.s[0]\n" + "fmla z31.s, z8.s, z3.s[0]\n" + "fmla z28.s, z9.s, z0.s[1]\n" + "fmla z29.s, z9.s, z1.s[1]\n" + "fmla z30.s, z9.s, z2.s[1]\n" + "fmla z31.s, z9.s, z3.s[1]\n" + "fmla z28.s, z10.s, z0.s[2]\n" + "fmla z29.s, z10.s, z1.s[2]\n" + "fmla z30.s, z10.s, z2.s[2]\n" + "fmla z31.s, z10.s, z3.s[2]\n" + "fmla z28.s, z11.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z29.s, z11.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z30.s, z11.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z31.s, z11.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z12.s, z0.s[0]\n" + "fmla z29.s, z12.s, z1.s[0]\n" + "fmla z30.s, z12.s, z2.s[0]\n" + "fmla z31.s, z12.s, z3.s[0]\n" + "fmla z28.s, z13.s, z0.s[1]\n" + "fmla z29.s, z13.s, z1.s[1]\n" + "fmla z30.s, z13.s, z2.s[1]\n" + "fmla z31.s, z13.s, z3.s[1]\n" + "fmla z28.s, z14.s, z0.s[2]\n" + "fmla z29.s, z14.s, z1.s[2]\n" + "fmla z30.s, z14.s, z2.s[2]\n" + "fmla z31.s, z14.s, z3.s[2]\n" + "fmla z28.s, z15.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x30]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "fmla z29.s, z15.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x30]\n" + "fmla z30.s, z15.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x30]\n" + "fmla z31.s, z15.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z16.s, z0.s[0]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "fmla z29.s, z16.s, z1.s[0]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z30.s, z16.s, z2.s[0]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "fmla z31.s, z16.s, z3.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], LSL #2\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, %[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz 
%[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[oddrows], %[oddrows], #0x1\n" + "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n" + "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n" + "ld1rqw z3.s, p6/z, [%[a_ptr0], #0x30]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "fmla z28.s, z8.s, z1.s[0]\n" + "fmla z28.s, z9.s, z1.s[1]\n" + "fmla z28.s, z10.s, z1.s[2]\n" + "fmla z28.s, z11.s, z1.s[3]\n" + "fmla z28.s, z12.s, z2.s[0]\n" + "fmla z28.s, z13.s, z2.s[1]\n" + "fmla z28.s, z14.s, z2.s[2]\n" + "fmla z28.s, z15.s, z2.s[3]\n" + "fmla z28.s, z16.s, z3.s[0]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 14: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + "ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p7/z, 
[%[b_ptr0]]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + "ld1w z16.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + "fmla z30.s, z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z29.s, z6.s, z1.s[2]\n" + "fmla z30.s, z6.s, z2.s[2]\n" + "fmla z31.s, z6.s, z3.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z29.s, z7.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z30.s, z7.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z31.s, z7.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z8.s, z0.s[0]\n" + "fmla z29.s, z8.s, z1.s[0]\n" + "fmla z30.s, z8.s, z2.s[0]\n" + 
"fmla z31.s, z8.s, z3.s[0]\n" + "fmla z28.s, z9.s, z0.s[1]\n" + "fmla z29.s, z9.s, z1.s[1]\n" + "fmla z30.s, z9.s, z2.s[1]\n" + "fmla z31.s, z9.s, z3.s[1]\n" + "fmla z28.s, z10.s, z0.s[2]\n" + "fmla z29.s, z10.s, z1.s[2]\n" + "fmla z30.s, z10.s, z2.s[2]\n" + "fmla z31.s, z10.s, z3.s[2]\n" + "fmla z28.s, z11.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z29.s, z11.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z30.s, z11.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z31.s, z11.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z12.s, z0.s[0]\n" + "fmla z29.s, z12.s, z1.s[0]\n" + "fmla z30.s, z12.s, z2.s[0]\n" + "fmla z31.s, z12.s, z3.s[0]\n" + "fmla z28.s, z13.s, z0.s[1]\n" + "fmla z29.s, z13.s, z1.s[1]\n" + "fmla z30.s, z13.s, z2.s[1]\n" + "fmla z31.s, z13.s, z3.s[1]\n" + "fmla z28.s, z14.s, z0.s[2]\n" + "fmla z29.s, z14.s, z1.s[2]\n" + "fmla z30.s, z14.s, z2.s[2]\n" + "fmla z31.s, z14.s, z3.s[2]\n" + "fmla z28.s, z15.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x30]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "fmla z29.s, z15.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x30]\n" + "fmla z30.s, z15.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x30]\n" + "fmla z31.s, z15.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z16.s, z0.s[0]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "fmla z29.s, z16.s, z1.s[0]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z30.s, z16.s, z2.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], LSL #2\n" + "fmla z31.s, z16.s, z3.s[0]\n" + "fmla z28.s, z17.s, z0.s[1]\n" + "fmla z29.s, z17.s, z1.s[1]\n" + "fmla z30.s, z17.s, z2.s[1]\n" + "fmla z31.s, z17.s, z3.s[1]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, 
%[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[oddrows], %[oddrows], #0x1\n" + "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n" + "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n" + "ld1rqw z3.s, p6/z, [%[a_ptr0], #0x30]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "fmla z28.s, z8.s, z1.s[0]\n" + "fmla z28.s, z9.s, z1.s[1]\n" + "fmla z28.s, z10.s, z1.s[2]\n" + "fmla z28.s, z11.s, z1.s[3]\n" + "fmla z28.s, z12.s, z2.s[0]\n" + "fmla z28.s, z13.s, z2.s[1]\n" + "fmla z28.s, z14.s, z2.s[2]\n" + "fmla z28.s, z15.s, z2.s[3]\n" + "fmla z28.s, z16.s, z3.s[0]\n" + "fmla z28.s, z17.s, z3.s[1]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 15: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + "ptrue p7.s\n" + "add 
a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p7/z, [%[b_ptr0]]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + "ld1w z16.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + "fmla z30.s, z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z29.s, z6.s, z1.s[2]\n" + "fmla z30.s, z6.s, z2.s[2]\n" + "fmla z31.s, z6.s, z3.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z29.s, z7.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z30.s, z7.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z31.s, z7.s, z3.s[3]\n" + "ld1rqw z3.s, 
p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z8.s, z0.s[0]\n" + "fmla z29.s, z8.s, z1.s[0]\n" + "fmla z30.s, z8.s, z2.s[0]\n" + "fmla z31.s, z8.s, z3.s[0]\n" + "fmla z28.s, z9.s, z0.s[1]\n" + "fmla z29.s, z9.s, z1.s[1]\n" + "fmla z30.s, z9.s, z2.s[1]\n" + "fmla z31.s, z9.s, z3.s[1]\n" + "fmla z28.s, z10.s, z0.s[2]\n" + "fmla z29.s, z10.s, z1.s[2]\n" + "fmla z30.s, z10.s, z2.s[2]\n" + "fmla z31.s, z10.s, z3.s[2]\n" + "fmla z28.s, z11.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z29.s, z11.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z30.s, z11.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z31.s, z11.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z12.s, z0.s[0]\n" + "fmla z29.s, z12.s, z1.s[0]\n" + "fmla z30.s, z12.s, z2.s[0]\n" + "fmla z31.s, z12.s, z3.s[0]\n" + "fmla z28.s, z13.s, z0.s[1]\n" + "fmla z29.s, z13.s, z1.s[1]\n" + "fmla z30.s, z13.s, z2.s[1]\n" + "fmla z31.s, z13.s, z3.s[1]\n" + "fmla z28.s, z14.s, z0.s[2]\n" + "fmla z29.s, z14.s, z1.s[2]\n" + "fmla z30.s, z14.s, z2.s[2]\n" + "fmla z31.s, z14.s, z3.s[2]\n" + "fmla z28.s, z15.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x30]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "fmla z29.s, z15.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x30]\n" + "fmla z30.s, z15.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x30]\n" + "fmla z31.s, z15.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z16.s, z0.s[0]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "fmla z29.s, z16.s, z1.s[0]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z30.s, z16.s, z2.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], LSL #2\n" + "fmla z31.s, z16.s, z3.s[0]\n" + "fmla z28.s, z17.s, z0.s[1]\n" + "fmla z29.s, z17.s, z1.s[1]\n" + "fmla z30.s, z17.s, z2.s[1]\n" + "fmla z31.s, z17.s, z3.s[1]\n" + "fmla z28.s, z18.s, z0.s[2]\n" + "fmla z29.s, z18.s, z1.s[2]\n" + "fmla z30.s, z18.s, z2.s[2]\n" + "fmla z31.s, z18.s, z3.s[2]\n" + "st1w z28.s, p0, 
[%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, %[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[oddrows], %[oddrows], #0x1\n" + "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n" + "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n" + "ld1rqw z3.s, p6/z, [%[a_ptr0], #0x30]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "fmla z28.s, z8.s, z1.s[0]\n" + "fmla z28.s, z9.s, z1.s[1]\n" + "fmla z28.s, z10.s, z1.s[2]\n" + "fmla z28.s, z11.s, z1.s[3]\n" + "fmla z28.s, z12.s, z2.s[0]\n" + "fmla z28.s, z13.s, z2.s[1]\n" + "fmla z28.s, z14.s, z2.s[2]\n" + "fmla z28.s, z15.s, z2.s[3]\n" + "fmla z28.s, z16.s, z3.s[0]\n" + "fmla z28.s, z17.s, z3.s[1]\n" + "fmla z28.s, z18.s, z3.s[2]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 16: + __asm __volatile ( + "a_ptr1 
.req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + "ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p7/z, [%[b_ptr0]]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + "ld1w z16.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + "fmla z30.s, z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z29.s, z6.s, 
z1.s[2]\n" + "fmla z30.s, z6.s, z2.s[2]\n" + "fmla z31.s, z6.s, z3.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z29.s, z7.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z30.s, z7.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z31.s, z7.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z8.s, z0.s[0]\n" + "fmla z29.s, z8.s, z1.s[0]\n" + "fmla z30.s, z8.s, z2.s[0]\n" + "fmla z31.s, z8.s, z3.s[0]\n" + "fmla z28.s, z9.s, z0.s[1]\n" + "fmla z29.s, z9.s, z1.s[1]\n" + "fmla z30.s, z9.s, z2.s[1]\n" + "fmla z31.s, z9.s, z3.s[1]\n" + "fmla z28.s, z10.s, z0.s[2]\n" + "fmla z29.s, z10.s, z1.s[2]\n" + "fmla z30.s, z10.s, z2.s[2]\n" + "fmla z31.s, z10.s, z3.s[2]\n" + "fmla z28.s, z11.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z29.s, z11.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z30.s, z11.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z31.s, z11.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z12.s, z0.s[0]\n" + "fmla z29.s, z12.s, z1.s[0]\n" + "fmla z30.s, z12.s, z2.s[0]\n" + "fmla z31.s, z12.s, z3.s[0]\n" + "fmla z28.s, z13.s, z0.s[1]\n" + "fmla z29.s, z13.s, z1.s[1]\n" + "fmla z30.s, z13.s, z2.s[1]\n" + "fmla z31.s, z13.s, z3.s[1]\n" + "fmla z28.s, z14.s, z0.s[2]\n" + "fmla z29.s, z14.s, z1.s[2]\n" + "fmla z30.s, z14.s, z2.s[2]\n" + "fmla z31.s, z14.s, z3.s[2]\n" + "fmla z28.s, z15.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "fmla z29.s, z15.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" + "fmla z30.s, z15.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" + "fmla z31.s, z15.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z16.s, z0.s[0]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "fmla z29.s, z16.s, z1.s[0]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z30.s, z16.s, z2.s[0]\n" + "add a_ptr3, 
a_ptr3, %[lda], LSL #2\n" + "fmla z31.s, z16.s, z3.s[0]\n" + "fmla z28.s, z17.s, z0.s[1]\n" + "fmla z29.s, z17.s, z1.s[1]\n" + "fmla z30.s, z17.s, z2.s[1]\n" + "fmla z31.s, z17.s, z3.s[1]\n" + "fmla z28.s, z18.s, z0.s[2]\n" + "fmla z29.s, z18.s, z1.s[2]\n" + "fmla z30.s, z18.s, z2.s[2]\n" + "fmla z31.s, z18.s, z3.s[2]\n" + "fmla z28.s, z19.s, z0.s[3]\n" + "fmla z29.s, z19.s, z1.s[3]\n" + "fmla z30.s, z19.s, z2.s[3]\n" + "fmla z31.s, z19.s, z3.s[3]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, %[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[oddrows], %[oddrows], #0x1\n" + "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n" + "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n" + "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "fmla z28.s, z8.s, z1.s[0]\n" + "fmla z28.s, z9.s, z1.s[1]\n" + "fmla z28.s, z10.s, z1.s[2]\n" + "fmla z28.s, z11.s, z1.s[3]\n" + "fmla z28.s, z12.s, z2.s[0]\n" + "fmla z28.s, z13.s, z2.s[1]\n" + "fmla z28.s, z14.s, z2.s[2]\n" + "fmla z28.s, z15.s, z2.s[3]\n" + "fmla z28.s, z16.s, z3.s[0]\n" + "fmla z28.s, z17.s, z3.s[1]\n" + "fmla z28.s, z18.s, z3.s[2]\n" + "fmla z28.s, z19.s, z3.s[3]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" 
(loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 17: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + "ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p7/z, [%[b_ptr0]]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + "ld1w z16.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" + "ld1w z20.s, p7/z, [%[b_ptr0]]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w 
z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + "fmla z30.s, z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z29.s, z6.s, z1.s[2]\n" + "fmla z30.s, z6.s, z2.s[2]\n" + "fmla z31.s, z6.s, z3.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z29.s, z7.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z30.s, z7.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z31.s, z7.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z8.s, z0.s[0]\n" + "fmla z29.s, z8.s, z1.s[0]\n" + "fmla z30.s, z8.s, z2.s[0]\n" + "fmla z31.s, z8.s, z3.s[0]\n" + "fmla z28.s, z9.s, z0.s[1]\n" + "fmla z29.s, z9.s, z1.s[1]\n" + "fmla z30.s, z9.s, z2.s[1]\n" + "fmla z31.s, z9.s, z3.s[1]\n" + "fmla z28.s, z10.s, z0.s[2]\n" + "fmla z29.s, z10.s, z1.s[2]\n" + "fmla z30.s, z10.s, z2.s[2]\n" + "fmla z31.s, z10.s, z3.s[2]\n" + "fmla z28.s, z11.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z29.s, z11.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z30.s, z11.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z31.s, z11.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z12.s, z0.s[0]\n" + "fmla z29.s, z12.s, z1.s[0]\n" + "fmla z30.s, z12.s, z2.s[0]\n" + "fmla z31.s, z12.s, z3.s[0]\n" + "fmla z28.s, z13.s, z0.s[1]\n" + "fmla z29.s, z13.s, z1.s[1]\n" + "fmla z30.s, z13.s, z2.s[1]\n" + "fmla z31.s, z13.s, z3.s[1]\n" + "fmla z28.s, z14.s, z0.s[2]\n" + "fmla z29.s, z14.s, z1.s[2]\n" + "fmla z30.s, z14.s, z2.s[2]\n" + "fmla z31.s, z14.s, 
z3.s[2]\n" + "fmla z28.s, z15.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z29.s, z15.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" + "fmla z30.s, z15.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" + "fmla z31.s, z15.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z16.s, z0.s[0]\n" + "fmla z29.s, z16.s, z1.s[0]\n" + "fmla z30.s, z16.s, z2.s[0]\n" + "fmla z31.s, z16.s, z3.s[0]\n" + "fmla z28.s, z17.s, z0.s[1]\n" + "fmla z29.s, z17.s, z1.s[1]\n" + "fmla z30.s, z17.s, z2.s[1]\n" + "fmla z31.s, z17.s, z3.s[1]\n" + "fmla z28.s, z18.s, z0.s[2]\n" + "fmla z29.s, z18.s, z1.s[2]\n" + "fmla z30.s, z18.s, z2.s[2]\n" + "fmla z31.s, z18.s, z3.s[2]\n" + "fmla z28.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "fmla z29.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x40]\n" + "fmla z30.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x40]\n" + "fmla z31.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x40]\n" + "fmla z28.s, z20.s, z0.s[0]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "fmla z29.s, z20.s, z1.s[0]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z30.s, z20.s, z2.s[0]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "fmla z31.s, z20.s, z3.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], LSL #2\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, %[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[oddrows], %[oddrows], #0x1\n" + "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n" + "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n" + "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z28.s, z4.s, 
z0.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "fmla z28.s, z8.s, z1.s[0]\n" + "fmla z28.s, z9.s, z1.s[1]\n" + "fmla z28.s, z10.s, z1.s[2]\n" + "fmla z28.s, z11.s, z1.s[3]\n" + "fmla z28.s, z12.s, z2.s[0]\n" + "fmla z28.s, z13.s, z2.s[1]\n" + "fmla z28.s, z14.s, z2.s[2]\n" + "fmla z28.s, z15.s, z2.s[3]\n" + "fmla z28.s, z16.s, z3.s[0]\n" + "fmla z28.s, z17.s, z3.s[1]\n" + "fmla z28.s, z18.s, z3.s[2]\n" + "fmla z28.s, z19.s, z3.s[3]\n" + "fmla z28.s, z20.s, z0.s[0]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory", "p0", "p6", "p7" /* p0/p6/p7 are written by the whilelt/ptrue setup at the top of this asm statement, so they must be declared clobbered */ + ); + break; + case 18: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + "ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p7/z, [%[b_ptr0]]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z7.s, 
p7/z, [%[b_ptr0], #3, MUL VL]\n" + "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + "ld1w z16.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" + "ld1w z20.s, p7/z, [%[b_ptr0]]\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + "fmla z30.s, z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z29.s, z6.s, z1.s[2]\n" + "fmla z30.s, z6.s, z2.s[2]\n" + "fmla z31.s, z6.s, z3.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z29.s, z7.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z30.s, z7.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z31.s, z7.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z8.s, z0.s[0]\n" + "fmla z29.s, z8.s, z1.s[0]\n" + "fmla z30.s, z8.s, z2.s[0]\n" + "fmla 
z31.s, z8.s, z3.s[0]\n" + "fmla z28.s, z9.s, z0.s[1]\n" + "fmla z29.s, z9.s, z1.s[1]\n" + "fmla z30.s, z9.s, z2.s[1]\n" + "fmla z31.s, z9.s, z3.s[1]\n" + "fmla z28.s, z10.s, z0.s[2]\n" + "fmla z29.s, z10.s, z1.s[2]\n" + "fmla z30.s, z10.s, z2.s[2]\n" + "fmla z31.s, z10.s, z3.s[2]\n" + "fmla z28.s, z11.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z29.s, z11.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z30.s, z11.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z31.s, z11.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z12.s, z0.s[0]\n" + "fmla z29.s, z12.s, z1.s[0]\n" + "fmla z30.s, z12.s, z2.s[0]\n" + "fmla z31.s, z12.s, z3.s[0]\n" + "fmla z28.s, z13.s, z0.s[1]\n" + "fmla z29.s, z13.s, z1.s[1]\n" + "fmla z30.s, z13.s, z2.s[1]\n" + "fmla z31.s, z13.s, z3.s[1]\n" + "fmla z28.s, z14.s, z0.s[2]\n" + "fmla z29.s, z14.s, z1.s[2]\n" + "fmla z30.s, z14.s, z2.s[2]\n" + "fmla z31.s, z14.s, z3.s[2]\n" + "fmla z28.s, z15.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z29.s, z15.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" + "fmla z30.s, z15.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" + "fmla z31.s, z15.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z16.s, z0.s[0]\n" + "fmla z29.s, z16.s, z1.s[0]\n" + "fmla z30.s, z16.s, z2.s[0]\n" + "fmla z31.s, z16.s, z3.s[0]\n" + "fmla z28.s, z17.s, z0.s[1]\n" + "fmla z29.s, z17.s, z1.s[1]\n" + "fmla z30.s, z17.s, z2.s[1]\n" + "fmla z31.s, z17.s, z3.s[1]\n" + "fmla z28.s, z18.s, z0.s[2]\n" + "fmla z29.s, z18.s, z1.s[2]\n" + "fmla z30.s, z18.s, z2.s[2]\n" + "fmla z31.s, z18.s, z3.s[2]\n" + "fmla z28.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "fmla z29.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x40]\n" + "fmla z30.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x40]\n" + "fmla z31.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, 
p6/z, [a_ptr3, #0x40]\n" + "fmla z28.s, z20.s, z0.s[0]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "fmla z29.s, z20.s, z1.s[0]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z30.s, z20.s, z2.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], LSL #2\n" + "fmla z31.s, z20.s, z3.s[0]\n" + "fmla z28.s, z21.s, z0.s[1]\n" + "fmla z29.s, z21.s, z1.s[1]\n" + "fmla z30.s, z21.s, z2.s[1]\n" + "fmla z31.s, z21.s, z3.s[1]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, %[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[oddrows], %[oddrows], #0x1\n" + "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n" + "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n" + "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "fmla z28.s, z8.s, z1.s[0]\n" + "fmla z28.s, z9.s, z1.s[1]\n" + "fmla z28.s, z10.s, z1.s[2]\n" + "fmla z28.s, z11.s, z1.s[3]\n" + "fmla z28.s, z12.s, z2.s[0]\n" + "fmla z28.s, z13.s, z2.s[1]\n" + "fmla z28.s, z14.s, z2.s[2]\n" + "fmla z28.s, z15.s, z2.s[3]\n" + "fmla z28.s, z16.s, z3.s[0]\n" + "fmla z28.s, z17.s, z3.s[1]\n" + "fmla z28.s, z18.s, z3.s[2]\n" + "fmla z28.s, z19.s, z3.s[3]\n" + "fmla z28.s, z20.s, z0.s[0]\n" + "fmla z28.s, z21.s, z0.s[1]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), 
[b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory", "p0", "p6", "p7" /* p0/p6/p7 are written by the whilelt/ptrue setup at the top of this asm statement, so they must be declared clobbered */ + ); + break; + case 19: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + "ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p7/z, [%[b_ptr0]]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + "ld1w z16.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" + "ld1w z20.s, p7/z, [%[b_ptr0]]\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + 
"mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + "fmla z30.s, z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z29.s, z6.s, z1.s[2]\n" + "fmla z30.s, z6.s, z2.s[2]\n" + "fmla z31.s, z6.s, z3.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z29.s, z7.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z30.s, z7.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z31.s, z7.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z8.s, z0.s[0]\n" + "fmla z29.s, z8.s, z1.s[0]\n" + "fmla z30.s, z8.s, z2.s[0]\n" + "fmla z31.s, z8.s, z3.s[0]\n" + "fmla z28.s, z9.s, z0.s[1]\n" + "fmla z29.s, z9.s, z1.s[1]\n" + "fmla z30.s, z9.s, z2.s[1]\n" + "fmla z31.s, z9.s, z3.s[1]\n" + "fmla z28.s, z10.s, z0.s[2]\n" + "fmla z29.s, z10.s, z1.s[2]\n" + "fmla z30.s, z10.s, z2.s[2]\n" + "fmla z31.s, z10.s, z3.s[2]\n" + "fmla z28.s, z11.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z29.s, z11.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z30.s, z11.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z31.s, z11.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z12.s, z0.s[0]\n" + "fmla z29.s, z12.s, z1.s[0]\n" + "fmla z30.s, z12.s, z2.s[0]\n" + "fmla z31.s, z12.s, z3.s[0]\n" + "fmla z28.s, z13.s, z0.s[1]\n" + "fmla z29.s, z13.s, z1.s[1]\n" + "fmla z30.s, z13.s, z2.s[1]\n" + 
"fmla z31.s, z13.s, z3.s[1]\n" + "fmla z28.s, z14.s, z0.s[2]\n" + "fmla z29.s, z14.s, z1.s[2]\n" + "fmla z30.s, z14.s, z2.s[2]\n" + "fmla z31.s, z14.s, z3.s[2]\n" + "fmla z28.s, z15.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z29.s, z15.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" + "fmla z30.s, z15.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" + "fmla z31.s, z15.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z16.s, z0.s[0]\n" + "fmla z29.s, z16.s, z1.s[0]\n" + "fmla z30.s, z16.s, z2.s[0]\n" + "fmla z31.s, z16.s, z3.s[0]\n" + "fmla z28.s, z17.s, z0.s[1]\n" + "fmla z29.s, z17.s, z1.s[1]\n" + "fmla z30.s, z17.s, z2.s[1]\n" + "fmla z31.s, z17.s, z3.s[1]\n" + "fmla z28.s, z18.s, z0.s[2]\n" + "fmla z29.s, z18.s, z1.s[2]\n" + "fmla z30.s, z18.s, z2.s[2]\n" + "fmla z31.s, z18.s, z3.s[2]\n" + "fmla z28.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "fmla z29.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x40]\n" + "fmla z30.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x40]\n" + "fmla z31.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x40]\n" + "fmla z28.s, z20.s, z0.s[0]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "fmla z29.s, z20.s, z1.s[0]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z30.s, z20.s, z2.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], LSL #2\n" + "fmla z31.s, z20.s, z3.s[0]\n" + "fmla z28.s, z21.s, z0.s[1]\n" + "fmla z29.s, z21.s, z1.s[1]\n" + "fmla z30.s, z21.s, z2.s[1]\n" + "fmla z31.s, z21.s, z3.s[1]\n" + "fmla z28.s, z22.s, z0.s[2]\n" + "fmla z29.s, z22.s, z1.s[2]\n" + "fmla z30.s, z22.s, z2.s[2]\n" + "fmla z31.s, z22.s, z3.s[2]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, 
c_ptr3, %[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[oddrows], %[oddrows], #0x1\n" + "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n" + "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n" + "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x40]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "fmla z28.s, z8.s, z1.s[0]\n" + "fmla z28.s, z9.s, z1.s[1]\n" + "fmla z28.s, z10.s, z1.s[2]\n" + "fmla z28.s, z11.s, z1.s[3]\n" + "fmla z28.s, z12.s, z2.s[0]\n" + "fmla z28.s, z13.s, z2.s[1]\n" + "fmla z28.s, z14.s, z2.s[2]\n" + "fmla z28.s, z15.s, z2.s[3]\n" + "fmla z28.s, z16.s, z3.s[0]\n" + "fmla z28.s, z17.s, z3.s[1]\n" + "fmla z28.s, z18.s, z3.s[2]\n" + "fmla z28.s, z19.s, z3.s[3]\n" + "fmla z28.s, z20.s, z0.s[0]\n" + "fmla z28.s, z21.s, z0.s[1]\n" + "fmla z28.s, z22.s, z0.s[2]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory", "p0", "p6", "p7" /* p0/p6/p7 are written by the whilelt/ptrue setup at the top of this asm statement, so they must be declared clobbered */ + ); + break; + case 20: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + 
"c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + "ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p7/z, [%[b_ptr0]]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + "ld1w z16.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" + "ld1w z20.s, p7/z, [%[b_ptr0]]\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + "fmla z30.s, 
z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z29.s, z6.s, z1.s[2]\n" + "fmla z30.s, z6.s, z2.s[2]\n" + "fmla z31.s, z6.s, z3.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z29.s, z7.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z30.s, z7.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z31.s, z7.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z8.s, z0.s[0]\n" + "fmla z29.s, z8.s, z1.s[0]\n" + "fmla z30.s, z8.s, z2.s[0]\n" + "fmla z31.s, z8.s, z3.s[0]\n" + "fmla z28.s, z9.s, z0.s[1]\n" + "fmla z29.s, z9.s, z1.s[1]\n" + "fmla z30.s, z9.s, z2.s[1]\n" + "fmla z31.s, z9.s, z3.s[1]\n" + "fmla z28.s, z10.s, z0.s[2]\n" + "fmla z29.s, z10.s, z1.s[2]\n" + "fmla z30.s, z10.s, z2.s[2]\n" + "fmla z31.s, z10.s, z3.s[2]\n" + "fmla z28.s, z11.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z29.s, z11.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z30.s, z11.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z31.s, z11.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z12.s, z0.s[0]\n" + "fmla z29.s, z12.s, z1.s[0]\n" + "fmla z30.s, z12.s, z2.s[0]\n" + "fmla z31.s, z12.s, z3.s[0]\n" + "fmla z28.s, z13.s, z0.s[1]\n" + "fmla z29.s, z13.s, z1.s[1]\n" + "fmla z30.s, z13.s, z2.s[1]\n" + "fmla z31.s, z13.s, z3.s[1]\n" + "fmla z28.s, z14.s, z0.s[2]\n" + "fmla z29.s, z14.s, z1.s[2]\n" + "fmla z30.s, z14.s, z2.s[2]\n" + "fmla z31.s, z14.s, z3.s[2]\n" + "fmla z28.s, z15.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z29.s, z15.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" + "fmla z30.s, z15.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" + "fmla z31.s, z15.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z16.s, z0.s[0]\n" + "fmla z29.s, z16.s, z1.s[0]\n" + "fmla z30.s, z16.s, z2.s[0]\n" + "fmla z31.s, z16.s, z3.s[0]\n" + "fmla 
z28.s, z17.s, z0.s[1]\n" + "fmla z29.s, z17.s, z1.s[1]\n" + "fmla z30.s, z17.s, z2.s[1]\n" + "fmla z31.s, z17.s, z3.s[1]\n" + "fmla z28.s, z18.s, z0.s[2]\n" + "fmla z29.s, z18.s, z1.s[2]\n" + "fmla z30.s, z18.s, z2.s[2]\n" + "fmla z31.s, z18.s, z3.s[2]\n" + "fmla z28.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "fmla z29.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n" + "fmla z30.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n" + "fmla z31.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n" + "fmla z28.s, z20.s, z0.s[0]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "fmla z29.s, z20.s, z1.s[0]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z30.s, z20.s, z2.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], LSL #2\n" + "fmla z31.s, z20.s, z3.s[0]\n" + "fmla z28.s, z21.s, z0.s[1]\n" + "fmla z29.s, z21.s, z1.s[1]\n" + "fmla z30.s, z21.s, z2.s[1]\n" + "fmla z31.s, z21.s, z3.s[1]\n" + "fmla z28.s, z22.s, z0.s[2]\n" + "fmla z29.s, z22.s, z1.s[2]\n" + "fmla z30.s, z22.s, z2.s[2]\n" + "fmla z31.s, z22.s, z3.s[2]\n" + "fmla z28.s, z23.s, z0.s[3]\n" + "fmla z29.s, z23.s, z1.s[3]\n" + "fmla z30.s, z23.s, z2.s[3]\n" + "fmla z31.s, z23.s, z3.s[3]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, %[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[oddrows], %[oddrows], #0x1\n" + "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n" + "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n" + "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla 
z28.s, z6.s, z0.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "fmla z28.s, z8.s, z1.s[0]\n" + "fmla z28.s, z9.s, z1.s[1]\n" + "fmla z28.s, z10.s, z1.s[2]\n" + "fmla z28.s, z11.s, z1.s[3]\n" + "fmla z28.s, z12.s, z2.s[0]\n" + "fmla z28.s, z13.s, z2.s[1]\n" + "fmla z28.s, z14.s, z2.s[2]\n" + "fmla z28.s, z15.s, z2.s[3]\n" + "fmla z28.s, z16.s, z3.s[0]\n" + "fmla z28.s, z17.s, z3.s[1]\n" + "fmla z28.s, z18.s, z3.s[2]\n" + "fmla z28.s, z19.s, z3.s[3]\n" + "fmla z28.s, z20.s, z0.s[0]\n" + "fmla z28.s, z21.s, z0.s[1]\n" + "fmla z28.s, z22.s, z0.s[2]\n" + "fmla z28.s, z23.s, z0.s[3]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory", "p0", "p6", "p7" /* p0/p6/p7 are written by the whilelt/ptrue setup at the top of this asm statement, so they must be declared clobbered */ + ); + break; + case 21: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + "ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p7/z, [%[b_ptr0]]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "ld1w z6.s, 
p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + "ld1w z16.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" + "ld1w z20.s, p7/z, [%[b_ptr0]]\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "ld1w z24.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + "fmla z30.s, z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z29.s, z6.s, z1.s[2]\n" + "fmla z30.s, z6.s, z2.s[2]\n" + "fmla z31.s, z6.s, z3.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z29.s, z7.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z30.s, z7.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, 
#0x10]\n" + "fmla z31.s, z7.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z8.s, z0.s[0]\n" + "fmla z29.s, z8.s, z1.s[0]\n" + "fmla z30.s, z8.s, z2.s[0]\n" + "fmla z31.s, z8.s, z3.s[0]\n" + "fmla z28.s, z9.s, z0.s[1]\n" + "fmla z29.s, z9.s, z1.s[1]\n" + "fmla z30.s, z9.s, z2.s[1]\n" + "fmla z31.s, z9.s, z3.s[1]\n" + "fmla z28.s, z10.s, z0.s[2]\n" + "fmla z29.s, z10.s, z1.s[2]\n" + "fmla z30.s, z10.s, z2.s[2]\n" + "fmla z31.s, z10.s, z3.s[2]\n" + "fmla z28.s, z11.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z29.s, z11.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z30.s, z11.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z31.s, z11.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z12.s, z0.s[0]\n" + "fmla z29.s, z12.s, z1.s[0]\n" + "fmla z30.s, z12.s, z2.s[0]\n" + "fmla z31.s, z12.s, z3.s[0]\n" + "fmla z28.s, z13.s, z0.s[1]\n" + "fmla z29.s, z13.s, z1.s[1]\n" + "fmla z30.s, z13.s, z2.s[1]\n" + "fmla z31.s, z13.s, z3.s[1]\n" + "fmla z28.s, z14.s, z0.s[2]\n" + "fmla z29.s, z14.s, z1.s[2]\n" + "fmla z30.s, z14.s, z2.s[2]\n" + "fmla z31.s, z14.s, z3.s[2]\n" + "fmla z28.s, z15.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z29.s, z15.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" + "fmla z30.s, z15.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" + "fmla z31.s, z15.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z16.s, z0.s[0]\n" + "fmla z29.s, z16.s, z1.s[0]\n" + "fmla z30.s, z16.s, z2.s[0]\n" + "fmla z31.s, z16.s, z3.s[0]\n" + "fmla z28.s, z17.s, z0.s[1]\n" + "fmla z29.s, z17.s, z1.s[1]\n" + "fmla z30.s, z17.s, z2.s[1]\n" + "fmla z31.s, z17.s, z3.s[1]\n" + "fmla z28.s, z18.s, z0.s[2]\n" + "fmla z29.s, z18.s, z1.s[2]\n" + "fmla z30.s, z18.s, z2.s[2]\n" + "fmla z31.s, z18.s, z3.s[2]\n" + "fmla z28.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n" + "fmla z29.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, 
[a_ptr1, #0x40]\n" + "fmla z30.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n" + "fmla z31.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n" + "fmla z28.s, z20.s, z0.s[0]\n" + "fmla z29.s, z20.s, z1.s[0]\n" + "fmla z30.s, z20.s, z2.s[0]\n" + "fmla z31.s, z20.s, z3.s[0]\n" + "fmla z28.s, z21.s, z0.s[1]\n" + "fmla z29.s, z21.s, z1.s[1]\n" + "fmla z30.s, z21.s, z2.s[1]\n" + "fmla z31.s, z21.s, z3.s[1]\n" + "fmla z28.s, z22.s, z0.s[2]\n" + "fmla z29.s, z22.s, z1.s[2]\n" + "fmla z30.s, z22.s, z2.s[2]\n" + "fmla z31.s, z22.s, z3.s[2]\n" + "fmla z28.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x50]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "fmla z29.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x50]\n" + "fmla z30.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x50]\n" + "fmla z31.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x50]\n" + "fmla z28.s, z24.s, z0.s[0]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "fmla z29.s, z24.s, z1.s[0]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z30.s, z24.s, z2.s[0]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "fmla z31.s, z24.s, z3.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], LSL #2\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, %[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[oddrows], %[oddrows], #0x1\n" + "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n" + "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n" + "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], 
#0x40]\n" + "fmla z28.s, z8.s, z1.s[0]\n" + "fmla z28.s, z9.s, z1.s[1]\n" + "fmla z28.s, z10.s, z1.s[2]\n" + "fmla z28.s, z11.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [%[a_ptr0], #0x50]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "fmla z28.s, z12.s, z2.s[0]\n" + "fmla z28.s, z13.s, z2.s[1]\n" + "fmla z28.s, z14.s, z2.s[2]\n" + "fmla z28.s, z15.s, z2.s[3]\n" + "fmla z28.s, z16.s, z3.s[0]\n" + "fmla z28.s, z17.s, z3.s[1]\n" + "fmla z28.s, z18.s, z3.s[2]\n" + "fmla z28.s, z19.s, z3.s[3]\n" + "fmla z28.s, z20.s, z0.s[0]\n" + "fmla z28.s, z21.s, z0.s[1]\n" + "fmla z28.s, z22.s, z0.s[2]\n" + "fmla z28.s, z23.s, z0.s[3]\n" + "fmla z28.s, z24.s, z1.s[0]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 22: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + "ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p7/z, [%[b_ptr0]]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "ld1w z6.s, p7/z, 
[%[b_ptr0], #2, MUL VL]\n" + "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + "ld1w z16.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" + "ld1w z20.s, p7/z, [%[b_ptr0]]\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "ld1w z24.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z25.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + "fmla z30.s, z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z29.s, z6.s, z1.s[2]\n" + "fmla z30.s, z6.s, z2.s[2]\n" + "fmla z31.s, z6.s, z3.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z29.s, z7.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z30.s, z7.s, 
z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z31.s, z7.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z8.s, z0.s[0]\n" + "fmla z29.s, z8.s, z1.s[0]\n" + "fmla z30.s, z8.s, z2.s[0]\n" + "fmla z31.s, z8.s, z3.s[0]\n" + "fmla z28.s, z9.s, z0.s[1]\n" + "fmla z29.s, z9.s, z1.s[1]\n" + "fmla z30.s, z9.s, z2.s[1]\n" + "fmla z31.s, z9.s, z3.s[1]\n" + "fmla z28.s, z10.s, z0.s[2]\n" + "fmla z29.s, z10.s, z1.s[2]\n" + "fmla z30.s, z10.s, z2.s[2]\n" + "fmla z31.s, z10.s, z3.s[2]\n" + "fmla z28.s, z11.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z29.s, z11.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z30.s, z11.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z31.s, z11.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z12.s, z0.s[0]\n" + "fmla z29.s, z12.s, z1.s[0]\n" + "fmla z30.s, z12.s, z2.s[0]\n" + "fmla z31.s, z12.s, z3.s[0]\n" + "fmla z28.s, z13.s, z0.s[1]\n" + "fmla z29.s, z13.s, z1.s[1]\n" + "fmla z30.s, z13.s, z2.s[1]\n" + "fmla z31.s, z13.s, z3.s[1]\n" + "fmla z28.s, z14.s, z0.s[2]\n" + "fmla z29.s, z14.s, z1.s[2]\n" + "fmla z30.s, z14.s, z2.s[2]\n" + "fmla z31.s, z14.s, z3.s[2]\n" + "fmla z28.s, z15.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z29.s, z15.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" + "fmla z30.s, z15.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" + "fmla z31.s, z15.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z16.s, z0.s[0]\n" + "fmla z29.s, z16.s, z1.s[0]\n" + "fmla z30.s, z16.s, z2.s[0]\n" + "fmla z31.s, z16.s, z3.s[0]\n" + "fmla z28.s, z17.s, z0.s[1]\n" + "fmla z29.s, z17.s, z1.s[1]\n" + "fmla z30.s, z17.s, z2.s[1]\n" + "fmla z31.s, z17.s, z3.s[1]\n" + "fmla z28.s, z18.s, z0.s[2]\n" + "fmla z29.s, z18.s, z1.s[2]\n" + "fmla z30.s, z18.s, z2.s[2]\n" + "fmla z31.s, z18.s, z3.s[2]\n" + "fmla z28.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n" + "fmla z29.s, 
z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n" + "fmla z30.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n" + "fmla z31.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n" + "fmla z28.s, z20.s, z0.s[0]\n" + "fmla z29.s, z20.s, z1.s[0]\n" + "fmla z30.s, z20.s, z2.s[0]\n" + "fmla z31.s, z20.s, z3.s[0]\n" + "fmla z28.s, z21.s, z0.s[1]\n" + "fmla z29.s, z21.s, z1.s[1]\n" + "fmla z30.s, z21.s, z2.s[1]\n" + "fmla z31.s, z21.s, z3.s[1]\n" + "fmla z28.s, z22.s, z0.s[2]\n" + "fmla z29.s, z22.s, z1.s[2]\n" + "fmla z30.s, z22.s, z2.s[2]\n" + "fmla z31.s, z22.s, z3.s[2]\n" + "fmla z28.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x50]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "fmla z29.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x50]\n" + "fmla z30.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x50]\n" + "fmla z31.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x50]\n" + "fmla z28.s, z24.s, z0.s[0]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "fmla z29.s, z24.s, z1.s[0]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z30.s, z24.s, z2.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], LSL #2\n" + "fmla z31.s, z24.s, z3.s[0]\n" + "fmla z28.s, z25.s, z0.s[1]\n" + "fmla z29.s, z25.s, z1.s[1]\n" + "fmla z30.s, z25.s, z2.s[1]\n" + "fmla z31.s, z25.s, z3.s[1]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, %[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[oddrows], %[oddrows], #0x1\n" + "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n" + "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n" + "ld1rqw z3.s, p7/z, [%[a_ptr0], 
#0x30]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n" + "fmla z28.s, z8.s, z1.s[0]\n" + "fmla z28.s, z9.s, z1.s[1]\n" + "fmla z28.s, z10.s, z1.s[2]\n" + "fmla z28.s, z11.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [%[a_ptr0], #0x50]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "fmla z28.s, z12.s, z2.s[0]\n" + "fmla z28.s, z13.s, z2.s[1]\n" + "fmla z28.s, z14.s, z2.s[2]\n" + "fmla z28.s, z15.s, z2.s[3]\n" + "fmla z28.s, z16.s, z3.s[0]\n" + "fmla z28.s, z17.s, z3.s[1]\n" + "fmla z28.s, z18.s, z3.s[2]\n" + "fmla z28.s, z19.s, z3.s[3]\n" + "fmla z28.s, z20.s, z0.s[0]\n" + "fmla z28.s, z21.s, z0.s[1]\n" + "fmla z28.s, z22.s, z0.s[2]\n" + "fmla z28.s, z23.s, z0.s[3]\n" + "fmla z28.s, z24.s, z1.s[0]\n" + "fmla z28.s, z25.s, z1.s[1]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 23: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + "ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + 
"add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p7/z, [%[b_ptr0]]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + "ld1w z16.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" + "ld1w z20.s, p7/z, [%[b_ptr0]]\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "ld1w z24.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z25.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z26.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + "fmla z30.s, z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z29.s, z6.s, 
z1.s[2]\n" + "fmla z30.s, z6.s, z2.s[2]\n" + "fmla z31.s, z6.s, z3.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z29.s, z7.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z30.s, z7.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z31.s, z7.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z8.s, z0.s[0]\n" + "fmla z29.s, z8.s, z1.s[0]\n" + "fmla z30.s, z8.s, z2.s[0]\n" + "fmla z31.s, z8.s, z3.s[0]\n" + "fmla z28.s, z9.s, z0.s[1]\n" + "fmla z29.s, z9.s, z1.s[1]\n" + "fmla z30.s, z9.s, z2.s[1]\n" + "fmla z31.s, z9.s, z3.s[1]\n" + "fmla z28.s, z10.s, z0.s[2]\n" + "fmla z29.s, z10.s, z1.s[2]\n" + "fmla z30.s, z10.s, z2.s[2]\n" + "fmla z31.s, z10.s, z3.s[2]\n" + "fmla z28.s, z11.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z29.s, z11.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z30.s, z11.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z31.s, z11.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z12.s, z0.s[0]\n" + "fmla z29.s, z12.s, z1.s[0]\n" + "fmla z30.s, z12.s, z2.s[0]\n" + "fmla z31.s, z12.s, z3.s[0]\n" + "fmla z28.s, z13.s, z0.s[1]\n" + "fmla z29.s, z13.s, z1.s[1]\n" + "fmla z30.s, z13.s, z2.s[1]\n" + "fmla z31.s, z13.s, z3.s[1]\n" + "fmla z28.s, z14.s, z0.s[2]\n" + "fmla z29.s, z14.s, z1.s[2]\n" + "fmla z30.s, z14.s, z2.s[2]\n" + "fmla z31.s, z14.s, z3.s[2]\n" + "fmla z28.s, z15.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z29.s, z15.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" + "fmla z30.s, z15.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" + "fmla z31.s, z15.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z16.s, z0.s[0]\n" + "fmla z29.s, z16.s, z1.s[0]\n" + "fmla z30.s, z16.s, z2.s[0]\n" + "fmla z31.s, z16.s, z3.s[0]\n" + "fmla z28.s, z17.s, z0.s[1]\n" + "fmla z29.s, z17.s, z1.s[1]\n" + "fmla z30.s, z17.s, z2.s[1]\n" + "fmla z31.s, 
z17.s, z3.s[1]\n" + "fmla z28.s, z18.s, z0.s[2]\n" + "fmla z29.s, z18.s, z1.s[2]\n" + "fmla z30.s, z18.s, z2.s[2]\n" + "fmla z31.s, z18.s, z3.s[2]\n" + "fmla z28.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n" + "fmla z29.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n" + "fmla z30.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n" + "fmla z31.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n" + "fmla z28.s, z20.s, z0.s[0]\n" + "fmla z29.s, z20.s, z1.s[0]\n" + "fmla z30.s, z20.s, z2.s[0]\n" + "fmla z31.s, z20.s, z3.s[0]\n" + "fmla z28.s, z21.s, z0.s[1]\n" + "fmla z29.s, z21.s, z1.s[1]\n" + "fmla z30.s, z21.s, z2.s[1]\n" + "fmla z31.s, z21.s, z3.s[1]\n" + "fmla z28.s, z22.s, z0.s[2]\n" + "fmla z29.s, z22.s, z1.s[2]\n" + "fmla z30.s, z22.s, z2.s[2]\n" + "fmla z31.s, z22.s, z3.s[2]\n" + "fmla z28.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x50]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "fmla z29.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x50]\n" + "fmla z30.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x50]\n" + "fmla z31.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x50]\n" + "fmla z28.s, z24.s, z0.s[0]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "fmla z29.s, z24.s, z1.s[0]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z30.s, z24.s, z2.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], LSL #2\n" + "fmla z31.s, z24.s, z3.s[0]\n" + "fmla z28.s, z25.s, z0.s[1]\n" + "fmla z29.s, z25.s, z1.s[1]\n" + "fmla z30.s, z25.s, z2.s[1]\n" + "fmla z31.s, z25.s, z3.s[1]\n" + "fmla z28.s, z26.s, z0.s[2]\n" + "fmla z29.s, z26.s, z1.s[2]\n" + "fmla z30.s, z26.s, z2.s[2]\n" + "fmla z31.s, z26.s, z3.s[2]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, %[ldc], 
LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[oddrows], %[oddrows], #0x1\n" + "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n" + "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n" + "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n" + "fmla z28.s, z8.s, z1.s[0]\n" + "fmla z28.s, z9.s, z1.s[1]\n" + "fmla z28.s, z10.s, z1.s[2]\n" + "fmla z28.s, z11.s, z1.s[3]\n" + "ld1rqw z1.s, p6/z, [%[a_ptr0], #0x50]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "fmla z28.s, z12.s, z2.s[0]\n" + "fmla z28.s, z13.s, z2.s[1]\n" + "fmla z28.s, z14.s, z2.s[2]\n" + "fmla z28.s, z15.s, z2.s[3]\n" + "fmla z28.s, z16.s, z3.s[0]\n" + "fmla z28.s, z17.s, z3.s[1]\n" + "fmla z28.s, z18.s, z3.s[2]\n" + "fmla z28.s, z19.s, z3.s[3]\n" + "fmla z28.s, z20.s, z0.s[0]\n" + "fmla z28.s, z21.s, z0.s[1]\n" + "fmla z28.s, z22.s, z0.s[2]\n" + "fmla z28.s, z23.s, z0.s[3]\n" + "fmla z28.s, z24.s, z1.s[0]\n" + "fmla z28.s, z25.s, z1.s[1]\n" + "fmla z28.s, z26.s, z1.s[2]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", 
"z30", "z31", "cc", "memory" + ); + break; + default: + case 24: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[width]\n" + "ptrue p7.s\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "ld1w z4.s, p7/z, [%[b_ptr0]]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "ld1w z5.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "ld1w z6.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z7.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "ld1w z8.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z9.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z10.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z11.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + "ld1w z16.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" + "ld1w z17.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" + "ld1w z18.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" + "ld1w z19.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" + "ld1w z20.s, p7/z, [%[b_ptr0]]\n" + "ld1w z21.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z22.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z23.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "ld1w z24.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z25.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z26.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z27.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "cbz %[loops], 1f\n" + "2:\n" + "cbz %[beta0], 3f\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "b 4f\n" + "3:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "ld1w z29.s, p0/z, [c_ptr1]\n" + "ld1w z30.s, p0/z, [c_ptr2]\n" + "ld1w z31.s, p0/z, [c_ptr3]\n" + "4:\n" + "ld1rqw z0.s, 
p7/z, [%[a_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z29.s, z4.s, z1.s[0]\n" + "fmla z30.s, z4.s, z2.s[0]\n" + "fmla z31.s, z4.s, z3.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z29.s, z5.s, z1.s[1]\n" + "fmla z30.s, z5.s, z2.s[1]\n" + "fmla z31.s, z5.s, z3.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z29.s, z6.s, z1.s[2]\n" + "fmla z30.s, z6.s, z2.s[2]\n" + "fmla z31.s, z6.s, z3.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x10]\n" + "fmla z29.s, z7.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x10]\n" + "fmla z30.s, z7.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x10]\n" + "fmla z31.s, z7.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x10]\n" + "fmla z28.s, z8.s, z0.s[0]\n" + "fmla z29.s, z8.s, z1.s[0]\n" + "fmla z30.s, z8.s, z2.s[0]\n" + "fmla z31.s, z8.s, z3.s[0]\n" + "fmla z28.s, z9.s, z0.s[1]\n" + "fmla z29.s, z9.s, z1.s[1]\n" + "fmla z30.s, z9.s, z2.s[1]\n" + "fmla z31.s, z9.s, z3.s[1]\n" + "fmla z28.s, z10.s, z0.s[2]\n" + "fmla z29.s, z10.s, z1.s[2]\n" + "fmla z30.s, z10.s, z2.s[2]\n" + "fmla z31.s, z10.s, z3.s[2]\n" + "fmla z28.s, z11.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x20]\n" + "fmla z29.s, z11.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x20]\n" + "fmla z30.s, z11.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x20]\n" + "fmla z31.s, z11.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x20]\n" + "fmla z28.s, z12.s, z0.s[0]\n" + "fmla z29.s, z12.s, z1.s[0]\n" + "fmla z30.s, z12.s, z2.s[0]\n" + "fmla z31.s, z12.s, z3.s[0]\n" + "fmla z28.s, z13.s, z0.s[1]\n" + "fmla z29.s, z13.s, z1.s[1]\n" + "fmla z30.s, z13.s, z2.s[1]\n" + "fmla z31.s, z13.s, z3.s[1]\n" + "fmla z28.s, z14.s, z0.s[2]\n" + "fmla z29.s, z14.s, z1.s[2]\n" + "fmla z30.s, z14.s, z2.s[2]\n" + "fmla z31.s, z14.s, z3.s[2]\n" + "fmla z28.s, z15.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x30]\n" + 
"fmla z29.s, z15.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x30]\n" + "fmla z30.s, z15.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x30]\n" + "fmla z31.s, z15.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x30]\n" + "fmla z28.s, z16.s, z0.s[0]\n" + "fmla z29.s, z16.s, z1.s[0]\n" + "fmla z30.s, z16.s, z2.s[0]\n" + "fmla z31.s, z16.s, z3.s[0]\n" + "fmla z28.s, z17.s, z0.s[1]\n" + "fmla z29.s, z17.s, z1.s[1]\n" + "fmla z30.s, z17.s, z2.s[1]\n" + "fmla z31.s, z17.s, z3.s[1]\n" + "fmla z28.s, z18.s, z0.s[2]\n" + "fmla z29.s, z18.s, z1.s[2]\n" + "fmla z30.s, z18.s, z2.s[2]\n" + "fmla z31.s, z18.s, z3.s[2]\n" + "fmla z28.s, z19.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n" + "fmla z29.s, z19.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x40]\n" + "fmla z30.s, z19.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x40]\n" + "fmla z31.s, z19.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x40]\n" + "fmla z28.s, z20.s, z0.s[0]\n" + "fmla z29.s, z20.s, z1.s[0]\n" + "fmla z30.s, z20.s, z2.s[0]\n" + "fmla z31.s, z20.s, z3.s[0]\n" + "fmla z28.s, z21.s, z0.s[1]\n" + "fmla z29.s, z21.s, z1.s[1]\n" + "fmla z30.s, z21.s, z2.s[1]\n" + "fmla z31.s, z21.s, z3.s[1]\n" + "fmla z28.s, z22.s, z0.s[2]\n" + "fmla z29.s, z22.s, z1.s[2]\n" + "fmla z30.s, z22.s, z2.s[2]\n" + "fmla z31.s, z22.s, z3.s[2]\n" + "fmla z28.s, z23.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x50]\n" + "add %[a_ptr0], %[a_ptr0], %[lda], LSL #2\n" + "fmla z29.s, z23.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #0x50]\n" + "fmla z30.s, z23.s, z2.s[3]\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #0x50]\n" + "fmla z31.s, z23.s, z3.s[3]\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #0x50]\n" + "fmla z28.s, z24.s, z0.s[0]\n" + "add a_ptr1, a_ptr1, %[lda], LSL #2\n" + "fmla z29.s, z24.s, z1.s[0]\n" + "add a_ptr2, a_ptr2, %[lda], LSL #2\n" + "fmla z30.s, z24.s, z2.s[0]\n" + "add a_ptr3, a_ptr3, %[lda], LSL #2\n" + "fmla z31.s, z24.s, z3.s[0]\n" + "fmla z28.s, z25.s, z0.s[1]\n" + "fmla z29.s, z25.s, z1.s[1]\n" + "fmla z30.s, z25.s, 
z2.s[1]\n" + "fmla z31.s, z25.s, z3.s[1]\n" + "fmla z28.s, z26.s, z0.s[2]\n" + "fmla z29.s, z26.s, z1.s[2]\n" + "fmla z30.s, z26.s, z2.s[2]\n" + "fmla z31.s, z26.s, z3.s[2]\n" + "fmla z28.s, z27.s, z0.s[3]\n" + "fmla z29.s, z27.s, z1.s[3]\n" + "fmla z30.s, z27.s, z2.s[3]\n" + "fmla z31.s, z27.s, z3.s[3]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc], LSL #2\n" + "st1w z29.s, p0, [c_ptr1]\n" + "add c_ptr1, c_ptr1, %[ldc], LSL #2\n" + "st1w z30.s, p0, [c_ptr2]\n" + "add c_ptr2, c_ptr2, %[ldc], LSL #2\n" + "st1w z31.s, p0, [c_ptr3]\n" + "add c_ptr3, c_ptr3, %[ldc], LSL #2\n" + "b.ne 2b\n" + "1:\n" + "cbz %[oddrows], 5f\n" + "6:\n" + "cbz %[beta0], 7f\n" + "mov z28.s, #0\n" + "b 8f\n" + "7:\n" + "ld1w z28.s, p0/z, [%[c_ptr0]]\n" + "8:\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "subs %[oddrows], %[oddrows], #0x1\n" + "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x10]\n" + "ld1rqw z2.s, p7/z, [%[a_ptr0], #0x20]\n" + "ld1rqw z3.s, p7/z, [%[a_ptr0], #0x30]\n" + "fmla z28.s, z4.s, z0.s[0]\n" + "fmla z28.s, z5.s, z0.s[1]\n" + "fmla z28.s, z6.s, z0.s[2]\n" + "fmla z28.s, z7.s, z0.s[3]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #0x40]\n" + "fmla z28.s, z8.s, z1.s[0]\n" + "fmla z28.s, z9.s, z1.s[1]\n" + "fmla z28.s, z10.s, z1.s[2]\n" + "fmla z28.s, z11.s, z1.s[3]\n" + "ld1rqw z1.s, p7/z, [%[a_ptr0], #0x50]\n" + "add %[a_ptr0], %[a_ptr0], %[lda]\n" + "fmla z28.s, z12.s, z2.s[0]\n" + "fmla z28.s, z13.s, z2.s[1]\n" + "fmla z28.s, z14.s, z2.s[2]\n" + "fmla z28.s, z15.s, z2.s[3]\n" + "fmla z28.s, z16.s, z3.s[0]\n" + "fmla z28.s, z17.s, z3.s[1]\n" + "fmla z28.s, z18.s, z3.s[2]\n" + "fmla z28.s, z19.s, z3.s[3]\n" + "fmla z28.s, z20.s, z0.s[0]\n" + "fmla z28.s, z21.s, z0.s[1]\n" + "fmla z28.s, z22.s, z0.s[2]\n" + "fmla z28.s, z23.s, z0.s[3]\n" + "fmla z28.s, z24.s, z1.s[0]\n" + "fmla z28.s, z25.s, z1.s[1]\n" + "fmla z28.s, z26.s, z1.s[2]\n" + "fmla z28.s, z27.s, z1.s[3]\n" + "st1w z28.s, p0, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], %[ldc]\n" + "b.ne 6b\n" + "5:\n" + ".unreq 
a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [temp] "+r" (temp), [oddrows] "+r" (oddrows) + : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [odd_depth] "r" (odd_depth), [lda] "r" (ldab), [ldc] "r" (ldcb) + : "x0", "x1", "x2", "x3", "x4", "x5", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + } + } +} + +} // namespace arm_gemm + +#endif // __ARM_FEATURE_SVE -- cgit v1.2.1