Diffstat (limited to 'src/core/NEON/kernels/arm_gemm/kernels')
87 files changed, 8768 insertions, 835 deletions
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8.hpp index 8700c42f5d..0f0e5a7ed4 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -65,7 +65,7 @@ public: kern_type kernel = a64_gemm_s16_asimd_12x8; - gemm_s16_12x8(const CPUInfo *ci) { UNUSED(ci); } + gemm_s16_12x8(const CPUInfo *) { } }; } // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8.hpp index cc6c583b33..e5b295b640 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8.hpp @@ -34,6 +34,7 @@ namespace arm_gemm { // Load the actual kernel void a64_gemm_s8_12x8(const int8_t *, const int8_t *, int32_t *, int, int, int); void a64_gemm_s8_12x8_a55r1(const int8_t *, const int8_t *, int32_t *, int, int, int); +void a64_gemm_s8_12x8_x1(const int8_t *, const int8_t *, int32_t *, int, int, int); class gemm_s8_12x8 { public: @@ -65,6 +66,8 @@ public: if (mod == CPUModel::A55r1) { kernel = a64_gemm_s8_12x8_a55r1; + } else if (mod == CPUModel::X1) { + kernel = a64_gemm_s8_12x8_x1; } } }; diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/x1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/x1.cpp new file mode 100644 index 0000000000..446fcf8707 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/x1.cpp @@ -0,0 +1,347 @@ +/* + * Copyright (c) 2017-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifdef __aarch64__ + +#include <arm_neon.h> + +#include "../../asmlib.hpp" + +namespace arm_gemm { + +void a64_gemm_s8_12x8_x1(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) { + const int8_t *a_ptr = Apanel; + int32_t *c_ptr = Cpanel; + // We divide K by 4 because the sdot instruction processes 4 elements at a time. + const int W = K/4; + // Fix up for odd lengths - set a flag if K is odd, but make + // sure we round up the iteration count. 
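/* [Editor's note - illustrative only, not part of this patch.] A worked
 * example of the counters that follow, given that the main loop below is
 * unrolled to process two sdot steps per iteration and the "detached final
 * iteration" always covers the last one or two steps:
 *   K = 32 -> W = 8, oddk = 0, init_value_k = 3:
 *            3 loop iterations (6 steps) + even-K tail (2 steps) = 8 steps.
 *   K = 20 -> W = 5, oddk = 1, init_value_k = 2:
 *            2 loop iterations (4 steps) + odd-K tail (1 step) = 5 steps.
 */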
+ const int oddk = (W & 1); + const int init_value_k = ((W+1)/2) - 1; + for (int yb=0; yb<ablocks; yb++) { + const int8_t *a_ptr0 = a_ptr; + const int8_t *b_ptr = Bpanel; + for (int xb=0; xb<bblocks; xb++) { + a_ptr = a_ptr0; + int k = init_value_k; + register uint8x16_t a0 asm("v0"); + register uint8x16_t a1 asm("v1"); + register uint8x16_t b0 asm("v2"); + register uint8x16_t b1 asm("v3"); + register uint8x16_t b2 asm("v4"); + + __asm __volatile ( + // Initialize result registers, load initial operands, prime prefetches. + "movi v8.4s, #0x0\n" + "ldr %q[a0], [%[a_ptr]]\n" + "movi v9.4s, #0x0\n" + "ldr %q[b0], [%[b_ptr]]\n" + "movi v10.4s, #0x0\n" + "ldr %q[a1], [%[a_ptr], #16]\n" + "movi v11.4s, #0x0\n" + "ldr %q[b1], [%[b_ptr], #16]\n" + "movi v12.4s, #0x0\n" + ASM_PREFETCH("[%[b_ptr], #64]") + "movi v13.4s, #0x0\n" + ASM_PREFETCH("[%[a_ptr], #64]") + "movi v14.4s, #0x0\n" + ASM_PREFETCH("[%[b_ptr], #128]") + "movi v15.4s, #0x0\n" + ASM_PREFETCH("[%[a_ptr], #128]") + "movi v16.4s, #0x0\n" + ASM_PREFETCH("[%[b_ptr], #192]") + "movi v17.4s, #0x0\n" + ASM_PREFETCH("[%[b_ptr], #256]") + "movi v18.4s, #0x0\n" + ASM_PREFETCH("[%[a_ptr], #192]") + "movi v19.4s, #0x0\n" + ASM_PREFETCH("[%[b_ptr], #320]") + "movi v20.4s, #0x0\n" + ASM_PREFETCH("[%[a_ptr], #256]") + "movi v21.4s, #0x0\n" + ASM_PREFETCH("[%[b_ptr], #384]") + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "movi v24.4s, #0x0\n" + "movi v25.4s, #0x0\n" + "movi v26.4s, #0x0\n" + "movi v27.4s, #0x0\n" + "movi v28.4s, #0x0\n" + "movi v29.4s, #0x0\n" + "movi v30.4s, #0x0\n" + "movi v31.4s, #0x0\n" + + // Skip loop if we are doing zero iterations of it. + "cbz %w[k], 4f\n" + + // Loop proper + "1:\n" + ".word 0x4f80e048 // sdot v8.4s , %[b0].16b, %[a0].4b[0]\n" + ".word 0x4fa0e049 // sdot v9.4s , %[b0].16b, %[a0].4b[1]\n" + "ldr %q[b2], [%[b_ptr], #32]\n" + ".word 0x4f80e84a // sdot v10.4s, %[b0].16b, %[a0].4b[2]\n" + ".word 0x4fa0e84b // sdot v11.4s, %[b0].16b, %[a0].4b[3]\n" + ".word 0x4f81e04c // sdot v12.4s, %[b0].16b, %[a1].4b[0]\n" + ".word 0x4fa1e04d // sdot v13.4s, %[b0].16b, %[a1].4b[1]\n" + ".word 0x4f81e84e // sdot v14.4s, %[b0].16b, %[a1].4b[2]\n" + ".word 0x4fa1e84f // sdot v15.4s, %[b0].16b, %[a1].4b[3]\n" + "ldr %q[b0], [%[b_ptr], #48]\n" + + ".word 0x4f80e070 // sdot v16.4s, %[b1].16b, %[a0].4b[0]\n" + ".word 0x4fa0e071 // sdot v17.4s, %[b1].16b, %[a0].4b[1]\n" + ASM_PREFETCH("[%[a_ptr], #320]") + ".word 0x4f80e872 // sdot v18.4s, %[b1].16b, %[a0].4b[2]\n" + ".word 0x4fa0e873 // sdot v19.4s, %[b1].16b, %[a0].4b[3]\n" + ".word 0x4f81e074 // sdot v20.4s, %[b1].16b, %[a1].4b[0]\n" + ".word 0x4fa1e075 // sdot v21.4s, %[b1].16b, %[a1].4b[1]\n" + ".word 0x4f81e876 // sdot v22.4s, %[b1].16b, %[a1].4b[2]\n" + ".word 0x4fa1e877 // sdot v23.4s, %[b1].16b, %[a1].4b[3]\n" + "ldr %q[b1], [%[b_ptr], #64]\n" + + ".word 0x4f80e098 // sdot v24.4s, %[b2].16b, %[a0].4b[0]\n" + ".word 0x4fa0e099 // sdot v25.4s, %[b2].16b, %[a0].4b[1]\n" + ASM_PREFETCH("[%[b_ptr], #448]") + ".word 0x4f80e89a // sdot v26.4s, %[b2].16b, %[a0].4b[2]\n" + ".word 0x4fa0e89b // sdot v27.4s, %[b2].16b, %[a0].4b[3]\n" + "ldr %q[a0], [%[a_ptr], #32]\n" + ".word 0x4f81e09c // sdot v28.4s, %[b2].16b, %[a1].4b[0]\n" + ".word 0x4fa1e09d // sdot v29.4s, %[b2].16b, %[a1].4b[1]\n" + ".word 0x4f81e89e // sdot v30.4s, %[b2].16b, %[a1].4b[2]\n" + ".word 0x4fa1e89f // sdot v31.4s, %[b2].16b, %[a1].4b[3]\n" + "ldr %q[a1], [%[a_ptr], #48]\n" + + ".word 0x4f80e048 // sdot v8.4s , %[b0].16b, %[a0].4b[0]\n" + ".word 0x4fa0e049 // sdot v9.4s , %[b0].16b, %[a0].4b[1]\n" + "ldr %q[b2], 
[%[b_ptr], #80]\n" + ".word 0x4f80e84a // sdot v10.4s, %[b0].16b, %[a0].4b[2]\n" + ".word 0x4fa0e84b // sdot v11.4s, %[b0].16b, %[a0].4b[3]\n" + ".word 0x4f81e04c // sdot v12.4s, %[b0].16b, %[a1].4b[0]\n" + ".word 0x4fa1e04d // sdot v13.4s, %[b0].16b, %[a1].4b[1]\n" + ".word 0x4f81e84e // sdot v14.4s, %[b0].16b, %[a1].4b[2]\n" + ".word 0x4fa1e84f // sdot v15.4s, %[b0].16b, %[a1].4b[3]\n" + "ldr %q[b0], [%[b_ptr], #96]\n" + + ".word 0x4f80e070 // sdot v16.4s, %[b1].16b, %[a0].4b[0]\n" + ".word 0x4fa0e071 // sdot v17.4s, %[b1].16b, %[a0].4b[1]\n" + ASM_PREFETCH("[%[b_ptr], #512]") + ".word 0x4f80e872 // sdot v18.4s, %[b1].16b, %[a0].4b[2]\n" + ".word 0x4fa0e873 // sdot v19.4s, %[b1].16b, %[a0].4b[3]\n" + "subs %w[k], %w[k], #1\n" + ".word 0x4f81e074 // sdot v20.4s, %[b1].16b, %[a1].4b[0]\n" + ".word 0x4fa1e075 // sdot v21.4s, %[b1].16b, %[a1].4b[1]\n" + ".word 0x4f81e876 // sdot v22.4s, %[b1].16b, %[a1].4b[2]\n" + ".word 0x4fa1e877 // sdot v23.4s, %[b1].16b, %[a1].4b[3]\n" + "ldr %q[b1], [%[b_ptr], #112]\n" + + ".word 0x4f80e098 // sdot v24.4s, %[b2].16b, %[a0].4b[0]\n" + ".word 0x4fa0e099 // sdot v25.4s, %[b2].16b, %[a0].4b[1]\n" + "add %[a_ptr], %[a_ptr], #64\n" + ".word 0x4f80e89a // sdot v26.4s, %[b2].16b, %[a0].4b[2]\n" + ".word 0x4fa0e89b // sdot v27.4s, %[b2].16b, %[a0].4b[3]\n" + "ldr %q[a0], [%[a_ptr]]\n" + ".word 0x4f81e09c // sdot v28.4s, %[b2].16b, %[a1].4b[0]\n" + "add %[b_ptr], %[b_ptr], #96\n" + ".word 0x4fa1e09d // sdot v29.4s, %[b2].16b, %[a1].4b[1]\n" + ".word 0x4f81e89e // sdot v30.4s, %[b2].16b, %[a1].4b[2]\n" + ".word 0x4fa1e89f // sdot v31.4s, %[b2].16b, %[a1].4b[3]\n" + "ldr %q[a1], [%[a_ptr], #16]\n" + "bne 1b\n" + + // Target to use when K is 1 or 2 (i.e. zero iterations of main loop) + "4:\n" + + // Branch to alternative tail for odd K + "cbnz %w[oddk], 2f\n" + + // Detached final iteration (even K) + ".word 0x4f80e048 // sdot v8.4s , %[b0].16b, %[a0].4b[0]\n" + ".word 0x4fa0e049 // sdot v9.4s , %[b0].16b, %[a0].4b[1]\n" + "ldr %q[b2], [%[b_ptr], #32]\n" + ".word 0x4f80e84a // sdot v10.4s, %[b0].16b, %[a0].4b[2]\n" + ".word 0x4fa0e84b // sdot v11.4s, %[b0].16b, %[a0].4b[3]\n" + ".word 0x4f81e04c // sdot v12.4s, %[b0].16b, %[a1].4b[0]\n" + ".word 0x4fa1e04d // sdot v13.4s, %[b0].16b, %[a1].4b[1]\n" + ".word 0x4f81e84e // sdot v14.4s, %[b0].16b, %[a1].4b[2]\n" + ".word 0x4fa1e84f // sdot v15.4s, %[b0].16b, %[a1].4b[3]\n" + "ldr %q[b0], [%[b_ptr], #48]\n" + + ".word 0x4f80e070 // sdot v16.4s, %[b1].16b, %[a0].4b[0]\n" + ".word 0x4fa0e071 // sdot v17.4s, %[b1].16b, %[a0].4b[1]\n" + ".word 0x4f80e872 // sdot v18.4s, %[b1].16b, %[a0].4b[2]\n" + ".word 0x4fa0e873 // sdot v19.4s, %[b1].16b, %[a0].4b[3]\n" + ".word 0x4f81e074 // sdot v20.4s, %[b1].16b, %[a1].4b[0]\n" + ".word 0x4fa1e075 // sdot v21.4s, %[b1].16b, %[a1].4b[1]\n" + ".word 0x4f81e876 // sdot v22.4s, %[b1].16b, %[a1].4b[2]\n" + ".word 0x4fa1e877 // sdot v23.4s, %[b1].16b, %[a1].4b[3]\n" + "ldr %q[b1], [%[b_ptr], #64]\n" + + ".word 0x4f80e098 // sdot v24.4s, %[b2].16b, %[a0].4b[0]\n" + ".word 0x4fa0e099 // sdot v25.4s, %[b2].16b, %[a0].4b[1]\n" + "add %[a_ptr], %[a_ptr], #64\n" + ".word 0x4f80e89a // sdot v26.4s, %[b2].16b, %[a0].4b[2]\n" + ".word 0x4fa0e89b // sdot v27.4s, %[b2].16b, %[a0].4b[3]\n" + "ldr %q[a0], [%[a_ptr], #-32]\n" + ".word 0x4f81e09c // sdot v28.4s, %[b2].16b, %[a1].4b[0]\n" + ".word 0x4fa1e09d // sdot v29.4s, %[b2].16b, %[a1].4b[1]\n" + ".word 0x4f81e89e // sdot v30.4s, %[b2].16b, %[a1].4b[2]\n" + ".word 0x4fa1e89f // sdot v31.4s, %[b2].16b, %[a1].4b[3]\n" + "ldr %q[a1], [%[a_ptr], #-16]\n" + 
+ ".word 0x4f80e048 // sdot v8.4s , %[b0].16b, %[a0].4b[0]\n" + "ldr %q[b2], [%[b_ptr], #80]\n" + + ".word 0x4f80e070 // sdot v16.4s, %[b1].16b, %[a0].4b[0]\n" + "add %[b_ptr], %[b_ptr], #96\n" + ".word 0x4fa0e049 // sdot v9.4s , %[b0].16b, %[a0].4b[1]\n" + "str q8, [%[c_ptr], #0]\n" + ".word 0x4fa0e071 // sdot v17.4s, %[b1].16b, %[a0].4b[1]\n" + "str q16, [%[c_ptr], #16]\n" + ".word 0x4f80e098 // sdot v24.4s, %[b2].16b, %[a0].4b[0]\n" + "str q24, [%[c_ptr], #32]\n" + + ".word 0x4fa0e099 // sdot v25.4s, %[b2].16b, %[a0].4b[1]\n" + "str q9, [%[c_ptr], #48]\n" + ".word 0x4f80e84a // sdot v10.4s, %[b0].16b, %[a0].4b[2]\n" + "str q17, [%[c_ptr], #64]\n" + ".word 0x4f80e872 // sdot v18.4s, %[b1].16b, %[a0].4b[2]\n" + "str q25, [%[c_ptr], #80]\n" + ".word 0x4f80e89a // sdot v26.4s, %[b2].16b, %[a0].4b[2]\n" + "str q10, [%[c_ptr], #96]\n" + + ".word 0x4fa0e84b // sdot v11.4s, %[b0].16b, %[a0].4b[3]\n" + "str q18, [%[c_ptr], #112]\n" + ".word 0x4fa0e873 // sdot v19.4s, %[b1].16b, %[a0].4b[3]\n" + "str q26, [%[c_ptr], #128]\n" + ".word 0x4fa0e89b // sdot v27.4s, %[b2].16b, %[a0].4b[3]\n" + "str q11, [%[c_ptr], #144]\n" + + ".word 0x4f81e04c // sdot v12.4s, %[b0].16b, %[a1].4b[0]\n" + "str q19, [%[c_ptr], #160]\n" + ".word 0x4f81e074 // sdot v20.4s, %[b1].16b, %[a1].4b[0]\n" + "str q27, [%[c_ptr], #176]\n" + ".word 0x4f81e09c // sdot v28.4s, %[b2].16b, %[a1].4b[0]\n" + "str q12, [%[c_ptr], #192]\n" + + ".word 0x4fa1e04d // sdot v13.4s, %[b0].16b, %[a1].4b[1]\n" + "str q20, [%[c_ptr], #208]\n" + ".word 0x4fa1e075 // sdot v21.4s, %[b1].16b, %[a1].4b[1]\n" + "str q28, [%[c_ptr], #224]\n" + ".word 0x4fa1e09d // sdot v29.4s, %[b2].16b, %[a1].4b[1]\n" + "str q13, [%[c_ptr], #240]\n" + + ".word 0x4f81e84e // sdot v14.4s, %[b0].16b, %[a1].4b[2]\n" + "str q21, [%[c_ptr], #256]\n" + ".word 0x4f81e876 // sdot v22.4s, %[b1].16b, %[a1].4b[2]\n" + "str q29, [%[c_ptr], #272]\n" + ".word 0x4f81e89e // sdot v30.4s, %[b2].16b, %[a1].4b[2]\n" + "str q14, [%[c_ptr], #288]\n" + + ".word 0x4fa1e84f // sdot v15.4s, %[b0].16b, %[a1].4b[3]\n" + "str q22, [%[c_ptr], #304]\n" + ".word 0x4fa1e877 // sdot v23.4s, %[b1].16b, %[a1].4b[3]\n" + "str q30, [%[c_ptr], #320]\n" + ".word 0x4fa1e89f // sdot v31.4s, %[b2].16b, %[a1].4b[3]\n" + "str q15, [%[c_ptr], #336]\n" + + "b 3f\n" + + // Detached final iteration (odd K) + "2:\n" + ".word 0x4f80e048 // sdot v8.4s , %[b0].16b, %[a0].4b[0]\n" + "ldr %q[b2], [%[b_ptr], #32]\n" + ".word 0x4f80e070 // sdot v16.4s, %[b1].16b, %[a0].4b[0]\n" + ".word 0x4fa0e049 // sdot v9.4s , %[b0].16b, %[a0].4b[1]\n" + "str q8, [%[c_ptr], #0]\n" + ".word 0x4fa0e071 // sdot v17.4s, %[b1].16b, %[a0].4b[1]\n" + "str q16, [%[c_ptr], #16]\n" + ".word 0x4f80e098 // sdot v24.4s, %[b2].16b, %[a0].4b[0]\n" + "add %[b_ptr], %[b_ptr], #48\n" + "add %[a_ptr], %[a_ptr], #32\n" + "str q24, [%[c_ptr], #32]\n" + ".word 0x4fa0e099 // sdot v25.4s, %[b2].16b, %[a0].4b[1]\n" + "str q9, [%[c_ptr], #48]\n" + + ".word 0x4f80e84a // sdot v10.4s, %[b0].16b, %[a0].4b[2]\n" + "str q17, [%[c_ptr], #64]\n" + ".word 0x4f80e872 // sdot v18.4s, %[b1].16b, %[a0].4b[2]\n" + "str q25, [%[c_ptr], #80]\n" + ".word 0x4f80e89a // sdot v26.4s, %[b2].16b, %[a0].4b[2]\n" + "str q10, [%[c_ptr], #96]\n" + + ".word 0x4fa0e84b // sdot v11.4s, %[b0].16b, %[a0].4b[3]\n" + "str q18, [%[c_ptr], #112]\n" + ".word 0x4fa0e873 // sdot v19.4s, %[b1].16b, %[a0].4b[3]\n" + "str q26, [%[c_ptr], #128]\n" + ".word 0x4fa0e89b // sdot v27.4s, %[b2].16b, %[a0].4b[3]\n" + "str q11, [%[c_ptr], #144]\n" + + ".word 0x4f81e04c // sdot v12.4s, %[b0].16b, %[a1].4b[0]\n" + 
"str q19, [%[c_ptr], #160]\n" + ".word 0x4f81e074 // sdot v20.4s, %[b1].16b, %[a1].4b[0]\n" + "str q27, [%[c_ptr], #176]\n" + ".word 0x4f81e09c // sdot v28.4s, %[b2].16b, %[a1].4b[0]\n" + "str q12, [%[c_ptr], #192]\n" + + ".word 0x4fa1e04d // sdot v13.4s, %[b0].16b, %[a1].4b[1]\n" + "str q20, [%[c_ptr], #208]\n" + ".word 0x4fa1e075 // sdot v21.4s, %[b1].16b, %[a1].4b[1]\n" + "str q28, [%[c_ptr], #224]\n" + ".word 0x4fa1e09d // sdot v29.4s, %[b2].16b, %[a1].4b[1]\n" + "str q13, [%[c_ptr], #240]\n" + + ".word 0x4f81e84e // sdot v14.4s, %[b0].16b, %[a1].4b[2]\n" + "str q21, [%[c_ptr], #256]\n" + ".word 0x4f81e876 // sdot v22.4s, %[b1].16b, %[a1].4b[2]\n" + "str q29, [%[c_ptr], #272]\n" + ".word 0x4f81e89e // sdot v30.4s, %[b2].16b, %[a1].4b[2]\n" + "str q14, [%[c_ptr], #288]\n" + + ".word 0x4fa1e84f // sdot v15.4s, %[b0].16b, %[a1].4b[3]\n" + "str q22, [%[c_ptr], #304]\n" + ".word 0x4fa1e877 // sdot v23.4s, %[b1].16b, %[a1].4b[3]\n" + "str q30, [%[c_ptr], #320]\n" + ".word 0x4fa1e89f // sdot v31.4s, %[b2].16b, %[a1].4b[3]\n" + "str q15, [%[c_ptr], #336]\n" + + + // Common tail + "3:\n" + "str q23, [%[c_ptr], #352]\n" + "str q31, [%[c_ptr], #368]\n" + "add %[c_ptr], %[c_ptr], #384\n" + : + [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr), + [a0] "+w" (a0), [a1] "+w" (a1), + [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [k] "+r" (k) + : [oddk] "r" (oddk) + : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", + "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc" + ); + + } + } +} + +} // namespace arm_gemm + +#endif diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp index fb21bfc863..256acc4c65 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -59,7 +59,7 @@ public: kern_type kernel=a64_gemm_s8_4x4; - gemm_s8_4x4(const CPUInfo *ci) { UNUSED(ci); } + gemm_s8_4x4(const CPUInfo *) { } }; } // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8.hpp index 971b027c44..b86204043c 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -65,7 +65,7 @@ public: kern_type kernel = a64_gemm_u16_asimd_12x8; - gemm_u16_12x8(const CPUInfo *ci) { UNUSED(ci); } + gemm_u16_12x8(const CPUInfo *) { } }; } // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8.hpp index a67e2d6c84..52ce5d26d9 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8.hpp @@ -32,6 +32,7 @@ namespace arm_gemm { // Load the actual kernel void a64_gemm_u8_12x8(const uint8_t *, const uint8_t *, uint32_t *, int, int, int); void a64_gemm_u8_12x8_a55r1(const uint8_t *, const uint8_t *, uint32_t *, int, int, int); +void a64_gemm_u8_12x8_x1(const uint8_t *, const uint8_t *, uint32_t *, int, int, int); class gemm_u8_12x8 { public: @@ -73,6 +74,8 @@ public: if (mod == CPUModel::A55r1) { kernel = a64_gemm_u8_12x8_a55r1; + } else if (mod == CPUModel::X1) { + kernel = a64_gemm_u8_12x8_x1; } } }; diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/x1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/x1.cpp new file mode 100644 index 0000000000..7fac67354f --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/x1.cpp @@ -0,0 +1,348 @@ +/* + * Copyright (c) 2017-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifdef __aarch64__ + +#include <arm_neon.h> + +#include "../../asmlib.hpp" + +namespace arm_gemm { + +void a64_gemm_u8_12x8_x1(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) { + const uint8_t *a_ptr = Apanel; + uint32_t *c_ptr = Cpanel; + // We divide K by 4 because the udot instruction processes 4 elements at a time. + const int W = K/4; + // Fix up for odd lengths - set a flag if K is odd, but make + // sure we round up the iteration count. + const int oddk = (W & 1); + const int init_value_k = ((W+1)/2) - 1; + for (int yb=0; yb<ablocks; yb++) { + const uint8_t *a_ptr0 = a_ptr; + const uint8_t *b_ptr = Bpanel; + for (int xb=0; xb<bblocks; xb++) { + a_ptr = a_ptr0; + int k = init_value_k; + register uint8x16_t a0 asm("v0"); + register uint8x16_t a1 asm("v1"); + register uint8x16_t b0 asm("v2"); + register uint8x16_t b1 asm("v3"); + register uint8x16_t b2 asm("v4"); + + __asm __volatile ( + // Initialize result registers, load initial operands, prime prefetches. 
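/* [Editor's note - illustrative only, not part of this patch.] A sketch of
 * the accumulator layout the code below assumes: each udot accumulator
 * holds four int32 columns of the 12x8 output tile, so v8..v15 cover
 * columns 0-3 for rows 0-7 (b0 against a0/a1), v16..v23 cover columns 4-7
 * (b1), and v24..v31 cover columns 8-11 (b2).
 */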
+ "movi v8.4s, #0x0\n" + "ldr %q[a0], [%[a_ptr]]\n" + "movi v9.4s, #0x0\n" + "ldr %q[b0], [%[b_ptr]]\n" + "movi v10.4s, #0x0\n" + "ldr %q[a1], [%[a_ptr], #16]\n" + "movi v11.4s, #0x0\n" + "ldr %q[b1], [%[b_ptr], #16]\n" + "movi v12.4s, #0x0\n" + ASM_PREFETCH("[%[b_ptr], #64]") + "movi v13.4s, #0x0\n" + ASM_PREFETCH("[%[a_ptr], #64]") + "movi v14.4s, #0x0\n" + ASM_PREFETCH("[%[b_ptr], #128]") + "movi v15.4s, #0x0\n" + ASM_PREFETCH("[%[a_ptr], #128]") + "movi v16.4s, #0x0\n" + ASM_PREFETCH("[%[b_ptr], #192]") + "movi v17.4s, #0x0\n" + ASM_PREFETCH("[%[b_ptr], #256]") + "movi v18.4s, #0x0\n" + ASM_PREFETCH("[%[a_ptr], #192]") + "movi v19.4s, #0x0\n" + ASM_PREFETCH("[%[b_ptr], #320]") + "movi v20.4s, #0x0\n" + ASM_PREFETCH("[%[a_ptr], #256]") + "movi v21.4s, #0x0\n" + ASM_PREFETCH("[%[b_ptr], #384]") + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "movi v24.4s, #0x0\n" + "movi v25.4s, #0x0\n" + "movi v26.4s, #0x0\n" + "movi v27.4s, #0x0\n" + "movi v28.4s, #0x0\n" + "movi v29.4s, #0x0\n" + "movi v30.4s, #0x0\n" + "movi v31.4s, #0x0\n" + + // Skip loop if we are doing zero iterations of it. + "cbz %w[k], 4f\n" + + // Loop proper + "1:\n" + ".word 0x6f80e048 // udot v8.4s , %[b0].16b, %[a0].4b[0]\n" + ".word 0x6fa0e049 // udot v9.4s , %[b0].16b, %[a0].4b[1]\n" + "ldr %q[b2], [%[b_ptr], #32]\n" + ".word 0x6f80e84a // udot v10.4s, %[b0].16b, %[a0].4b[2]\n" + ".word 0x6fa0e84b // udot v11.4s, %[b0].16b, %[a0].4b[3]\n" + ".word 0x6f81e04c // udot v12.4s, %[b0].16b, %[a1].4b[0]\n" + ".word 0x6fa1e04d // udot v13.4s, %[b0].16b, %[a1].4b[1]\n" + ".word 0x6f81e84e // udot v14.4s, %[b0].16b, %[a1].4b[2]\n" + ".word 0x6fa1e84f // udot v15.4s, %[b0].16b, %[a1].4b[3]\n" + "ldr %q[b0], [%[b_ptr], #48]\n" + + ".word 0x6f80e070 // udot v16.4s, %[b1].16b, %[a0].4b[0]\n" + ".word 0x6fa0e071 // udot v17.4s, %[b1].16b, %[a0].4b[1]\n" + ASM_PREFETCH("[%[a_ptr], #320]") + ".word 0x6f80e872 // udot v18.4s, %[b1].16b, %[a0].4b[2]\n" + ".word 0x6fa0e873 // udot v19.4s, %[b1].16b, %[a0].4b[3]\n" + ".word 0x6f81e074 // udot v20.4s, %[b1].16b, %[a1].4b[0]\n" + ".word 0x6fa1e075 // udot v21.4s, %[b1].16b, %[a1].4b[1]\n" + ".word 0x6f81e876 // udot v22.4s, %[b1].16b, %[a1].4b[2]\n" + ".word 0x6fa1e877 // udot v23.4s, %[b1].16b, %[a1].4b[3]\n" + "ldr %q[b1], [%[b_ptr], #64]\n" + + ".word 0x6f80e098 // udot v24.4s, %[b2].16b, %[a0].4b[0]\n" + ".word 0x6fa0e099 // udot v25.4s, %[b2].16b, %[a0].4b[1]\n" + ASM_PREFETCH("[%[b_ptr], #448]") + ".word 0x6f80e89a // udot v26.4s, %[b2].16b, %[a0].4b[2]\n" + ".word 0x6fa0e89b // udot v27.4s, %[b2].16b, %[a0].4b[3]\n" + "ldr %q[a0], [%[a_ptr], #32]\n" + ".word 0x6f81e09c // udot v28.4s, %[b2].16b, %[a1].4b[0]\n" + ".word 0x6fa1e09d // udot v29.4s, %[b2].16b, %[a1].4b[1]\n" + ".word 0x6f81e89e // udot v30.4s, %[b2].16b, %[a1].4b[2]\n" + ".word 0x6fa1e89f // udot v31.4s, %[b2].16b, %[a1].4b[3]\n" + "ldr %q[a1], [%[a_ptr], #48]\n" + + ".word 0x6f80e048 // udot v8.4s , %[b0].16b, %[a0].4b[0]\n" + ".word 0x6fa0e049 // udot v9.4s , %[b0].16b, %[a0].4b[1]\n" + "ldr %q[b2], [%[b_ptr], #80]\n" + ".word 0x6f80e84a // udot v10.4s, %[b0].16b, %[a0].4b[2]\n" + ".word 0x6fa0e84b // udot v11.4s, %[b0].16b, %[a0].4b[3]\n" + ".word 0x6f81e04c // udot v12.4s, %[b0].16b, %[a1].4b[0]\n" + ".word 0x6fa1e04d // udot v13.4s, %[b0].16b, %[a1].4b[1]\n" + ".word 0x6f81e84e // udot v14.4s, %[b0].16b, %[a1].4b[2]\n" + ".word 0x6fa1e84f // udot v15.4s, %[b0].16b, %[a1].4b[3]\n" + "ldr %q[b0], [%[b_ptr], #96]\n" + + ".word 0x6f80e070 // udot v16.4s, %[b1].16b, %[a0].4b[0]\n" + ".word 0x6fa0e071 // udot v17.4s, 
%[b1].16b, %[a0].4b[1]\n" + ASM_PREFETCH("[%[b_ptr], #512]") + ".word 0x6f80e872 // udot v18.4s, %[b1].16b, %[a0].4b[2]\n" + ".word 0x6fa0e873 // udot v19.4s, %[b1].16b, %[a0].4b[3]\n" + "subs %w[k], %w[k], #1\n" + ".word 0x6f81e074 // udot v20.4s, %[b1].16b, %[a1].4b[0]\n" + ".word 0x6fa1e075 // udot v21.4s, %[b1].16b, %[a1].4b[1]\n" + ".word 0x6f81e876 // udot v22.4s, %[b1].16b, %[a1].4b[2]\n" + ".word 0x6fa1e877 // udot v23.4s, %[b1].16b, %[a1].4b[3]\n" + "ldr %q[b1], [%[b_ptr], #112]\n" + + ".word 0x6f80e098 // udot v24.4s, %[b2].16b, %[a0].4b[0]\n" + ".word 0x6fa0e099 // udot v25.4s, %[b2].16b, %[a0].4b[1]\n" + "add %[a_ptr], %[a_ptr], #64\n" + ".word 0x6f80e89a // udot v26.4s, %[b2].16b, %[a0].4b[2]\n" + ".word 0x6fa0e89b // udot v27.4s, %[b2].16b, %[a0].4b[3]\n" + "ldr %q[a0], [%[a_ptr]]\n" + ".word 0x6f81e09c // udot v28.4s, %[b2].16b, %[a1].4b[0]\n" + "add %[b_ptr], %[b_ptr], #96\n" + ".word 0x6fa1e09d // udot v29.4s, %[b2].16b, %[a1].4b[1]\n" + ".word 0x6f81e89e // udot v30.4s, %[b2].16b, %[a1].4b[2]\n" + ".word 0x6fa1e89f // udot v31.4s, %[b2].16b, %[a1].4b[3]\n" + "ldr %q[a1], [%[a_ptr], #16]\n" + "bne 1b\n" + + // Target to use when K is 1 or 2 (i.e. zero iterations of main loop) + "4:\n" + + // Branch to alternative tail for odd K + "cbnz %w[oddk], 2f\n" + + // Detached final iteration (even K) + ".word 0x6f80e048 // udot v8.4s , %[b0].16b, %[a0].4b[0]\n" + ".word 0x6fa0e049 // udot v9.4s , %[b0].16b, %[a0].4b[1]\n" + "ldr %q[b2], [%[b_ptr], #32]\n" + ".word 0x6f80e84a // udot v10.4s, %[b0].16b, %[a0].4b[2]\n" + ".word 0x6fa0e84b // udot v11.4s, %[b0].16b, %[a0].4b[3]\n" + ".word 0x6f81e04c // udot v12.4s, %[b0].16b, %[a1].4b[0]\n" + ".word 0x6fa1e04d // udot v13.4s, %[b0].16b, %[a1].4b[1]\n" + ".word 0x6f81e84e // udot v14.4s, %[b0].16b, %[a1].4b[2]\n" + ".word 0x6fa1e84f // udot v15.4s, %[b0].16b, %[a1].4b[3]\n" + "ldr %q[b0], [%[b_ptr], #48]\n" + + ".word 0x6f80e070 // udot v16.4s, %[b1].16b, %[a0].4b[0]\n" + ".word 0x6fa0e071 // udot v17.4s, %[b1].16b, %[a0].4b[1]\n" + ".word 0x6f80e872 // udot v18.4s, %[b1].16b, %[a0].4b[2]\n" + ".word 0x6fa0e873 // udot v19.4s, %[b1].16b, %[a0].4b[3]\n" + ".word 0x6f81e074 // udot v20.4s, %[b1].16b, %[a1].4b[0]\n" + ".word 0x6fa1e075 // udot v21.4s, %[b1].16b, %[a1].4b[1]\n" + ".word 0x6f81e876 // udot v22.4s, %[b1].16b, %[a1].4b[2]\n" + ".word 0x6fa1e877 // udot v23.4s, %[b1].16b, %[a1].4b[3]\n" + "ldr %q[b1], [%[b_ptr], #64]\n" + + ".word 0x6f80e098 // udot v24.4s, %[b2].16b, %[a0].4b[0]\n" + ".word 0x6fa0e099 // udot v25.4s, %[b2].16b, %[a0].4b[1]\n" + "add %[a_ptr], %[a_ptr], #64\n" + ".word 0x6f80e89a // udot v26.4s, %[b2].16b, %[a0].4b[2]\n" + ".word 0x6fa0e89b // udot v27.4s, %[b2].16b, %[a0].4b[3]\n" + "ldr %q[a0], [%[a_ptr], #-32]\n" + ".word 0x6f81e09c // udot v28.4s, %[b2].16b, %[a1].4b[0]\n" + ".word 0x6fa1e09d // udot v29.4s, %[b2].16b, %[a1].4b[1]\n" + ".word 0x6f81e89e // udot v30.4s, %[b2].16b, %[a1].4b[2]\n" + ".word 0x6fa1e89f // udot v31.4s, %[b2].16b, %[a1].4b[3]\n" + "ldr %q[a1], [%[a_ptr], #-16]\n" + + ".word 0x6f80e048 // udot v8.4s , %[b0].16b, %[a0].4b[0]\n" + "ldr %q[b2], [%[b_ptr], #80]\n" + + ".word 0x6f80e070 // udot v16.4s, %[b1].16b, %[a0].4b[0]\n" + "add %[b_ptr], %[b_ptr], #96\n" + ".word 0x6fa0e049 // udot v9.4s , %[b0].16b, %[a0].4b[1]\n" + "str q8, [%[c_ptr], #0]\n" + ".word 0x6fa0e071 // udot v17.4s, %[b1].16b, %[a0].4b[1]\n" + "str q16, [%[c_ptr], #16]\n" + ".word 0x6f80e098 // udot v24.4s, %[b2].16b, %[a0].4b[0]\n" + "str q24, [%[c_ptr], #32]\n" + + ".word 0x6fa0e099 // udot v25.4s, %[b2].16b, 
%[a0].4b[1]\n" + "str q9, [%[c_ptr], #48]\n" + ".word 0x6f80e84a // udot v10.4s, %[b0].16b, %[a0].4b[2]\n" + "str q17, [%[c_ptr], #64]\n" + ".word 0x6f80e872 // udot v18.4s, %[b1].16b, %[a0].4b[2]\n" + "str q25, [%[c_ptr], #80]\n" + ".word 0x6f80e89a // udot v26.4s, %[b2].16b, %[a0].4b[2]\n" + "str q10, [%[c_ptr], #96]\n" + + ".word 0x6fa0e84b // udot v11.4s, %[b0].16b, %[a0].4b[3]\n" + "str q18, [%[c_ptr], #112]\n" + ".word 0x6fa0e873 // udot v19.4s, %[b1].16b, %[a0].4b[3]\n" + "str q26, [%[c_ptr], #128]\n" + ".word 0x6fa0e89b // udot v27.4s, %[b2].16b, %[a0].4b[3]\n" + "str q11, [%[c_ptr], #144]\n" + + ".word 0x6f81e04c // udot v12.4s, %[b0].16b, %[a1].4b[0]\n" + "str q19, [%[c_ptr], #160]\n" + ".word 0x6f81e074 // udot v20.4s, %[b1].16b, %[a1].4b[0]\n" + "str q27, [%[c_ptr], #176]\n" + ".word 0x6f81e09c // udot v28.4s, %[b2].16b, %[a1].4b[0]\n" + "str q12, [%[c_ptr], #192]\n" + + ".word 0x6fa1e04d // udot v13.4s, %[b0].16b, %[a1].4b[1]\n" + "str q20, [%[c_ptr], #208]\n" + ".word 0x6fa1e075 // udot v21.4s, %[b1].16b, %[a1].4b[1]\n" + "str q28, [%[c_ptr], #224]\n" + ".word 0x6fa1e09d // udot v29.4s, %[b2].16b, %[a1].4b[1]\n" + "str q13, [%[c_ptr], #240]\n" + + ".word 0x6f81e84e // udot v14.4s, %[b0].16b, %[a1].4b[2]\n" + "str q21, [%[c_ptr], #256]\n" + ".word 0x6f81e876 // udot v22.4s, %[b1].16b, %[a1].4b[2]\n" + "str q29, [%[c_ptr], #272]\n" + ".word 0x6f81e89e // udot v30.4s, %[b2].16b, %[a1].4b[2]\n" + "str q14, [%[c_ptr], #288]\n" + + ".word 0x6fa1e84f // udot v15.4s, %[b0].16b, %[a1].4b[3]\n" + "str q22, [%[c_ptr], #304]\n" + ".word 0x6fa1e877 // udot v23.4s, %[b1].16b, %[a1].4b[3]\n" + "str q30, [%[c_ptr], #320]\n" + ".word 0x6fa1e89f // udot v31.4s, %[b2].16b, %[a1].4b[3]\n" + "str q15, [%[c_ptr], #336]\n" + + "b 3f\n" + + // Detached final iteration (odd K) + "2:\n" + ".word 0x6f80e048 // udot v8.4s , %[b0].16b, %[a0].4b[0]\n" + "ldr %q[b2], [%[b_ptr], #32]\n" + ".word 0x6f80e070 // udot v16.4s, %[b1].16b, %[a0].4b[0]\n" + ".word 0x6fa0e049 // udot v9.4s , %[b0].16b, %[a0].4b[1]\n" + "str q8, [%[c_ptr], #0]\n" + ".word 0x6fa0e071 // udot v17.4s, %[b1].16b, %[a0].4b[1]\n" + "str q16, [%[c_ptr], #16]\n" + ".word 0x6f80e098 // udot v24.4s, %[b2].16b, %[a0].4b[0]\n" + "add %[b_ptr], %[b_ptr], #48\n" + "add %[a_ptr], %[a_ptr], #32\n" + "str q24, [%[c_ptr], #32]\n" + ".word 0x6fa0e099 // udot v25.4s, %[b2].16b, %[a0].4b[1]\n" + "str q9, [%[c_ptr], #48]\n" + + ".word 0x6f80e84a // udot v10.4s, %[b0].16b, %[a0].4b[2]\n" + "str q17, [%[c_ptr], #64]\n" + ".word 0x6f80e872 // udot v18.4s, %[b1].16b, %[a0].4b[2]\n" + "str q25, [%[c_ptr], #80]\n" + ".word 0x6f80e89a // udot v26.4s, %[b2].16b, %[a0].4b[2]\n" + "str q10, [%[c_ptr], #96]\n" + + ".word 0x6fa0e84b // udot v11.4s, %[b0].16b, %[a0].4b[3]\n" + "str q18, [%[c_ptr], #112]\n" + ".word 0x6fa0e873 // udot v19.4s, %[b1].16b, %[a0].4b[3]\n" + "str q26, [%[c_ptr], #128]\n" + ".word 0x6fa0e89b // udot v27.4s, %[b2].16b, %[a0].4b[3]\n" + "str q11, [%[c_ptr], #144]\n" + + ".word 0x6f81e04c // udot v12.4s, %[b0].16b, %[a1].4b[0]\n" + "str q19, [%[c_ptr], #160]\n" + ".word 0x6f81e074 // udot v20.4s, %[b1].16b, %[a1].4b[0]\n" + "str q27, [%[c_ptr], #176]\n" + ".word 0x6f81e09c // udot v28.4s, %[b2].16b, %[a1].4b[0]\n" + "str q12, [%[c_ptr], #192]\n" + + ".word 0x6fa1e04d // udot v13.4s, %[b0].16b, %[a1].4b[1]\n" + "str q20, [%[c_ptr], #208]\n" + ".word 0x6fa1e075 // udot v21.4s, %[b1].16b, %[a1].4b[1]\n" + "str q28, [%[c_ptr], #224]\n" + ".word 0x6fa1e09d // udot v29.4s, %[b2].16b, %[a1].4b[1]\n" + "str q13, [%[c_ptr], #240]\n" + + ".word 
0x6f81e84e // udot v14.4s, %[b0].16b, %[a1].4b[2]\n" + "str q21, [%[c_ptr], #256]\n" + ".word 0x6f81e876 // udot v22.4s, %[b1].16b, %[a1].4b[2]\n" + "str q29, [%[c_ptr], #272]\n" + ".word 0x6f81e89e // udot v30.4s, %[b2].16b, %[a1].4b[2]\n" + "str q14, [%[c_ptr], #288]\n" + + ".word 0x6fa1e84f // udot v15.4s, %[b0].16b, %[a1].4b[3]\n" + "str q22, [%[c_ptr], #304]\n" + ".word 0x6fa1e877 // udot v23.4s, %[b1].16b, %[a1].4b[3]\n" + "str q30, [%[c_ptr], #320]\n" + ".word 0x6fa1e89f // udot v31.4s, %[b2].16b, %[a1].4b[3]\n" + "str q15, [%[c_ptr], #336]\n" + + + // Common tail + "3:\n" + "str q23, [%[c_ptr], #352]\n" + "str q31, [%[c_ptr], #368]\n" + "add %[c_ptr], %[c_ptr], #384\n" + + : + [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr), + [a0] "+w" (a0), [a1] "+w" (a1), + [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [k] "+r" (k) + : [oddk] "r" (oddk) + : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", + "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc" + ); + + } + } +} + +} // namespace arm_gemm + +#endif diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp index 8bde3a6943..134007b74c 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -67,9 +67,7 @@ public: kern_type kernel = a64_gemm_u8_4x4; - gemm_u8_4x4(const CPUInfo *ci) { - UNUSED(ci); - } + gemm_u8_4x4(const CPUInfo *) { } }; } // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8.hpp index 8e17aa6663..79cae6002a 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -32,6 +32,7 @@ namespace arm_gemm { // Actual kernel implementations void a64_hgemm_asimd_24x8(const __fp16 *, const __fp16 *, __fp16 *, int, int, int); void a64_hgemm_asimd_24x8_a55r1(const __fp16 *, const __fp16 *, __fp16 *, int, int, int); +void a64_hgemm_asimd_24x8_x1(const __fp16 *, const __fp16 *, __fp16 *, int, int, int); // 24x8 HGEMM "strategy" class. Describes the kernel properties. // @@ -68,6 +69,8 @@ public: if (model == CPUModel::A55r1) { kernel = a64_hgemm_asimd_24x8_a55r1; + } else if (model == CPUModel::X1) { + kernel = a64_hgemm_asimd_24x8_x1; } } }; diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/x1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/x1.cpp new file mode 100644 index 0000000000..3bb8334126 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/x1.cpp @@ -0,0 +1,350 @@ +/* + * Copyright (c) 2017-2020 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +// Build on AArch64 where either FP16_KERNELS is set or FP16 is explicitly supported. +#if defined(__aarch64__) && (defined(FP16_KERNELS) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)) + +#include <arm_neon.h> + +#include "../../asmlib.hpp" + +// Kernel implementation. +// +// Assume that "Apanel" points to a chunk of A blocks (each size 8xK) in read-order. +// Assume that "Bpanel" points to a chunk of B blocks (each size 24xK) in read-order. +// Assume that "Cpanel" points to a chunk of C output blocks (each size +// 24x8), the chunks being arranged in a row major fashion. +// +// Note that the intent of this is that either ablocks or bblocks will be 1 +// - this construction allows the output loop to proceed in either order. + +namespace arm_gemm { + +void a64_hgemm_asimd_24x8_x1(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cpanel, int ablocks, int bblocks, int K) { + const __fp16 *a_ptr = Apanel; + __fp16 *c_ptr = Cpanel; + + for (int yb=0; yb<ablocks; yb++) { + const __fp16 *a_ptr0 = a_ptr; + const __fp16 *b_ptr = Bpanel; + + for (int xb=0; xb<bblocks; xb++) { + a_ptr = a_ptr0; + // Fix up for odd lengths - set a flag if K is odd, but make + // sure we round up the iteration count. + int oddk = (K & 1); + int k = ((K+1)/2) - 1; + + register float16x8_t a0 asm("v0"); + register float16x8_t a0a asm("v1"); + register float16x8_t b0 asm("v2"); + register float16x8_t b1 asm("v3"); + register float16x8_t b2 asm("v4"); + + __asm __volatile ( + // Enable FP16 instruction support (but only if it's not already on). +#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + ".arch armv8.2-a+fp16\n" +#endif + // Initialize result registers, load initial operands, prime prefetches. 
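/* [Editor's note - illustrative only, not part of this patch.] A sketch of
 * the accumulator layout the code below assumes: the 24x8 fp16 output tile
 * lives in v8..v31, three 8-wide column chunks per row of A, e.g. row 0
 * (a0.h[0]) accumulates into v8 (columns 0-7), v16 (columns 8-15) and v24
 * (columns 16-23).
 */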
+ "movi v8.8h, #0x0\n" + "ldr %q[a0], [%[a_ptr]]\n" + "movi v9.8h, #0x0\n" + "ldr %q[b0], [%[b_ptr]]\n" + "movi v10.8h, #0x0\n" + "ldr %q[b1], [%[b_ptr], #16]\n" + "movi v11.8h, #0x0\n" + "movi v12.8h, #0x0\n" + "movi v13.8h, #0x0\n" + "movi v14.8h, #0x0\n" + ASM_PREFETCH("[%[b_ptr], #64]") + "movi v15.8h, #0x0\n" + ASM_PREFETCH("[%[b_ptr], #128]") + "movi v16.8h, #0x0\n" + ASM_PREFETCH("[%[a_ptr], #64]") + "movi v17.8h, #0x0\n" + ASM_PREFETCH("[%[b_ptr], #192]") + "movi v18.8h, #0x0\n" + ASM_PREFETCH("[%[b_ptr], #256]") + "movi v19.8h, #0x0\n" + ASM_PREFETCH("[%[b_ptr], #320]") + "movi v20.8h, #0x0\n" + "movi v21.8h, #0x0\n" + "movi v22.8h, #0x0\n" + "movi v23.8h, #0x0\n" + "movi v24.8h, #0x0\n" + "movi v25.8h, #0x0\n" + "movi v26.8h, #0x0\n" + "movi v27.8h, #0x0\n" + "movi v28.8h, #0x0\n" + "movi v29.8h, #0x0\n" + "movi v30.8h, #0x0\n" + "movi v31.8h, #0x0\n" + + // Skip loop if we are doing zero iterations of it. + "cbz %w[k], 4f\n" + + "1:\n" + "fmla v8.8h , %[b0].8h, %[a0].h[0]\n" + "fmla v9.8h , %[b0].8h, %[a0].h[1]\n" + "fmla v10.8h, %[b0].8h, %[a0].h[2]\n" + "fmla v11.8h, %[b0].8h, %[a0].h[3]\n" + "ldr %q[b2], [%[b_ptr], #32]\n" + "fmla v12.8h, %[b0].8h, %[a0].h[4]\n" + "fmla v13.8h, %[b0].8h, %[a0].h[5]\n" + "fmla v14.8h, %[b0].8h, %[a0].h[6]\n" + "fmla v15.8h, %[b0].8h, %[a0].h[7]\n" + "ldr %q[b0], [%[b_ptr], #48]\n" + + "fmla v16.8h, %[b1].8h, %[a0].h[0]\n" + "fmla v17.8h, %[b1].8h, %[a0].h[1]\n" + ASM_PREFETCH("[%[a_ptr], #128]") + "fmla v18.8h, %[b1].8h, %[a0].h[2]\n" + "fmla v19.8h, %[b1].8h, %[a0].h[3]\n" + "add %[b_ptr], %[b_ptr], #96\n" + "fmla v20.8h, %[b1].8h, %[a0].h[4]\n" + "fmla v21.8h, %[b1].8h, %[a0].h[5]\n" + "fmla v22.8h, %[b1].8h, %[a0].h[6]\n" + "fmla v23.8h, %[b1].8h, %[a0].h[7]\n" + "ldr %q[b1], [%[b_ptr], #-32]\n" + + "fmla v24.8h, %[b2].8h, %[a0].h[0]\n" + "fmla v25.8h, %[b2].8h, %[a0].h[1]\n" + ASM_PREFETCH("[%[b_ptr], #288]") + "fmla v26.8h, %[b2].8h, %[a0].h[2]\n" + "fmla v27.8h, %[b2].8h, %[a0].h[3]\n" + "ldr %q[a0a], [%[a_ptr], #16]\n" + "fmla v28.8h, %[b2].8h, %[a0].h[4]\n" + "fmla v29.8h, %[b2].8h, %[a0].h[5]\n" + "fmla v30.8h, %[b2].8h, %[a0].h[6]\n" + "fmla v31.8h, %[b2].8h, %[a0].h[7]\n" + "ldr %q[b2], [%[b_ptr], #-16]\n" + + "fmla v8.8h , %[b0].8h, %[a0a].h[0]\n" + "fmla v9.8h , %[b0].8h, %[a0a].h[1]\n" + "fmla v10.8h, %[b0].8h, %[a0a].h[2]\n" + "fmla v11.8h, %[b0].8h, %[a0a].h[3]\n" + "fmla v12.8h, %[b0].8h, %[a0a].h[4]\n" + "fmla v13.8h, %[b0].8h, %[a0a].h[5]\n" + "fmla v14.8h, %[b0].8h, %[a0a].h[6]\n" + "fmla v15.8h, %[b0].8h, %[a0a].h[7]\n" + "ldr %q[b0], [%[b_ptr]]\n" + + "fmla v16.8h, %[b1].8h, %[a0a].h[0]\n" + "fmla v17.8h, %[b1].8h, %[a0a].h[1]\n" + ASM_PREFETCH("[%[b_ptr], #352]") + "fmla v18.8h, %[b1].8h, %[a0a].h[2]\n" + "fmla v19.8h, %[b1].8h, %[a0a].h[3]\n" + "fmla v20.8h, %[b1].8h, %[a0a].h[4]\n" + "fmla v21.8h, %[b1].8h, %[a0a].h[5]\n" + "fmla v22.8h, %[b1].8h, %[a0a].h[6]\n" + "fmla v23.8h, %[b1].8h, %[a0a].h[7]\n" + "ldr %q[b1], [%[b_ptr], #16]\n" + + "fmla v24.8h, %[b2].8h, %[a0a].h[0]\n" + "fmla v25.8h, %[b2].8h, %[a0a].h[1]\n" + "add %[a_ptr], %[a_ptr], #32\n" + "fmla v26.8h, %[b2].8h, %[a0a].h[2]\n" + "fmla v27.8h, %[b2].8h, %[a0a].h[3]\n" + "ldr %q[a0], [%[a_ptr]]\n" + "fmla v28.8h, %[b2].8h, %[a0a].h[4]\n" + "fmla v29.8h, %[b2].8h, %[a0a].h[5]\n" + "subs %w[k], %w[k], #1\n" + "fmla v30.8h, %[b2].8h, %[a0a].h[6]\n" + "fmla v31.8h, %[b2].8h, %[a0a].h[7]\n" + + "bne 1b\n" + "4:\n" + + // Jump to odd tail if necessary. + "cbnz %w[oddk], 2f\n" + + // Even tail. 
+ "fmla v8.8h , %[b0].8h, %[a0].h[0]\n" + "fmla v9.8h , %[b0].8h, %[a0].h[1]\n" + "fmla v10.8h, %[b0].8h, %[a0].h[2]\n" + "fmla v11.8h, %[b0].8h, %[a0].h[3]\n" + "ldr %q[b2], [%[b_ptr], #32]\n" + "fmla v12.8h, %[b0].8h, %[a0].h[4]\n" + "fmla v13.8h, %[b0].8h, %[a0].h[5]\n" + "fmla v14.8h, %[b0].8h, %[a0].h[6]\n" + "fmla v15.8h, %[b0].8h, %[a0].h[7]\n" + "ldr %q[b0], [%[b_ptr], #48]\n" + + "fmla v16.8h, %[b1].8h, %[a0].h[0]\n" + "fmla v17.8h, %[b1].8h, %[a0].h[1]\n" + "add %[b_ptr], %[b_ptr], #96\n" + "fmla v18.8h, %[b1].8h, %[a0].h[2]\n" + "fmla v19.8h, %[b1].8h, %[a0].h[3]\n" + "fmla v20.8h, %[b1].8h, %[a0].h[4]\n" + "fmla v21.8h, %[b1].8h, %[a0].h[5]\n" + "add %[a_ptr], %[a_ptr], #32\n" + "fmla v22.8h, %[b1].8h, %[a0].h[6]\n" + "fmla v23.8h, %[b1].8h, %[a0].h[7]\n" + "ldr %q[b1], [%[b_ptr], #-32]\n" + + "fmla v24.8h, %[b2].8h, %[a0].h[0]\n" + "fmla v25.8h, %[b2].8h, %[a0].h[1]\n" + "fmla v26.8h, %[b2].8h, %[a0].h[2]\n" + "fmla v27.8h, %[b2].8h, %[a0].h[3]\n" + "ldr %q[a0a], [%[a_ptr], #-16]\n" + "fmla v28.8h, %[b2].8h, %[a0].h[4]\n" + "fmla v29.8h, %[b2].8h, %[a0].h[5]\n" + "fmla v30.8h, %[b2].8h, %[a0].h[6]\n" + "fmla v31.8h, %[b2].8h, %[a0].h[7]\n" + "ldr %q[b2], [%[b_ptr], #-16]\n" + + "fmla v8.8h , %[b0].8h, %[a0a].h[0]\n" + "fmla v16.8h, %[b1].8h, %[a0a].h[0]\n" + "str q8, [%[c_ptr]]\n" + "fmla v24.8h, %[b2].8h, %[a0a].h[0]\n" + "str q16, [%[c_ptr], #16]\n" + + "fmla v9.8h , %[b0].8h, %[a0a].h[1]\n" + "str q24, [%[c_ptr], #32]\n" + "fmla v17.8h, %[b1].8h, %[a0a].h[1]\n" + "str q9, [%[c_ptr], #48]\n" + "fmla v25.8h, %[b2].8h, %[a0a].h[1]\n" + "str q17, [%[c_ptr], #64]\n" + + "fmla v10.8h, %[b0].8h, %[a0a].h[2]\n" + "str q25, [%[c_ptr], #80]\n" + "fmla v18.8h, %[b1].8h, %[a0a].h[2]\n" + "str q10, [%[c_ptr], #96]\n" + "fmla v26.8h, %[b2].8h, %[a0a].h[2]\n" + "str q18, [%[c_ptr], #112]\n" + + "fmla v11.8h, %[b0].8h, %[a0a].h[3]\n" + "str q26, [%[c_ptr], #128]\n" + "fmla v19.8h, %[b1].8h, %[a0a].h[3]\n" + "str q11, [%[c_ptr], #144]\n" + "fmla v27.8h, %[b2].8h, %[a0a].h[3]\n" + "str q19, [%[c_ptr], #160]\n" + + "fmla v12.8h, %[b0].8h, %[a0a].h[4]\n" + "str q27, [%[c_ptr], #176]\n" + "fmla v20.8h, %[b1].8h, %[a0a].h[4]\n" + "str q12, [%[c_ptr], #192]\n" + "fmla v28.8h, %[b2].8h, %[a0a].h[4]\n" + "str q20, [%[c_ptr], #208]\n" + + "fmla v13.8h, %[b0].8h, %[a0a].h[5]\n" + "str q28, [%[c_ptr], #224]\n" + "fmla v21.8h, %[b1].8h, %[a0a].h[5]\n" + "str q13, [%[c_ptr], #240]\n" + "fmla v29.8h, %[b2].8h, %[a0a].h[5]\n" + "str q21, [%[c_ptr], #256]\n" + + "fmla v14.8h, %[b0].8h, %[a0a].h[6]\n" + "str q29, [%[c_ptr], #272]\n" + "fmla v22.8h, %[b1].8h, %[a0a].h[6]\n" + "str q14, [%[c_ptr], #288]\n" + "fmla v30.8h, %[b2].8h, %[a0a].h[6]\n" + "str q22, [%[c_ptr], #304]\n" + + "fmla v15.8h, %[b0].8h, %[a0a].h[7]\n" + "str q30, [%[c_ptr], #320]\n" + "fmla v23.8h, %[b1].8h, %[a0a].h[7]\n" + "str q15, [%[c_ptr], #336]\n" + "fmla v31.8h, %[b2].8h, %[a0a].h[7]\n" + "b 3f\n" + + // Odd tail + "2:\n" + "ldr %q[b2], [%[b_ptr], #32]\n" + "fmla v8.8h , %[b0].8h, %[a0].h[0]\n" + "add %[b_ptr], %[b_ptr], #48\n" + "fmla v16.8h, %[b1].8h, %[a0].h[0]\n" + "add %[a_ptr], %[a_ptr], #16\n" + "str q8, [%[c_ptr]]\n" + "fmla v24.8h, %[b2].8h, %[a0].h[0]\n" + "str q16, [%[c_ptr], #16]\n" + + "fmla v9.8h , %[b0].8h, %[a0].h[1]\n" + "str q24, [%[c_ptr], #32]\n" + "fmla v17.8h, %[b1].8h, %[a0].h[1]\n" + "str q9, [%[c_ptr], #48]\n" + "fmla v25.8h, %[b2].8h, %[a0].h[1]\n" + "str q17, [%[c_ptr], #64]\n" + + "fmla v10.8h, %[b0].8h, %[a0].h[2]\n" + "str q25, [%[c_ptr], #80]\n" + "fmla v18.8h, %[b1].8h, %[a0].h[2]\n" + "str q10, 
[%[c_ptr], #96]\n" + "fmla v26.8h, %[b2].8h, %[a0].h[2]\n" + "str q18, [%[c_ptr], #112]\n" + + "fmla v11.8h, %[b0].8h, %[a0].h[3]\n" + "str q26, [%[c_ptr], #128]\n" + "fmla v19.8h, %[b1].8h, %[a0].h[3]\n" + "str q11, [%[c_ptr], #144]\n" + "fmla v27.8h, %[b2].8h, %[a0].h[3]\n" + "str q19, [%[c_ptr], #160]\n" + + "fmla v12.8h, %[b0].8h, %[a0].h[4]\n" + "str q27, [%[c_ptr], #176]\n" + "fmla v20.8h, %[b1].8h, %[a0].h[4]\n" + "str q12, [%[c_ptr], #192]\n" + "fmla v28.8h, %[b2].8h, %[a0].h[4]\n" + "str q20, [%[c_ptr], #208]\n" + + "fmla v13.8h, %[b0].8h, %[a0].h[5]\n" + "str q28, [%[c_ptr], #224]\n" + "fmla v21.8h, %[b1].8h, %[a0].h[5]\n" + "str q13, [%[c_ptr], #240]\n" + "fmla v29.8h, %[b2].8h, %[a0].h[5]\n" + "str q21, [%[c_ptr], #256]\n" + + "fmla v14.8h, %[b0].8h, %[a0].h[6]\n" + "str q29, [%[c_ptr], #272]\n" + "fmla v22.8h, %[b1].8h, %[a0].h[6]\n" + "str q14, [%[c_ptr], #288]\n" + "fmla v30.8h, %[b2].8h, %[a0].h[6]\n" + "str q22, [%[c_ptr], #304]\n" + + "fmla v15.8h, %[b0].8h, %[a0].h[7]\n" + "str q30, [%[c_ptr], #320]\n" + "fmla v23.8h, %[b1].8h, %[a0].h[7]\n" + "str q15, [%[c_ptr], #336]\n" + "fmla v31.8h, %[b2].8h, %[a0].h[7]\n" + + "3:\n" + "str q23, [%[c_ptr], #352]\n" + "str q31, [%[c_ptr], #368]\n" + "add %[c_ptr], %[c_ptr], #384\n" + : + [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr), + [a0] "+w" (a0), [a0a] "+w" (a0a), + [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [k] "+r" (k) + : [oddk] "r" (oddk) + : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", + "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc" + ); + } + } +} + +} // namespace arm_gemm + +#endif // __aarch64__ && (FP16_KERNELS || __ARM_FEATURE_FP16_VECTOR_ARITHMETIC) diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4.hpp index 1ce934d413..5e5b6bd4c8 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -34,6 +34,7 @@ namespace arm_gemm // Actual kernel implementations void a64_hybrid_fp32_mla_16x4(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool); void a64_hybrid_fp32_mla_16x4_a55(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool); +void a64_hybrid_fp32_mla_16x4_x1(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool); class hybrid_fp32_mla_16x4 { @@ -83,6 +84,8 @@ public: { if (ci->get_cpu_model() == CPUModel::A55r1) { kernel = a64_hybrid_fp32_mla_16x4_a55; + } else if (ci->get_cpu_model() == CPUModel::X1) { + kernel = a64_hybrid_fp32_mla_16x4_x1; } } }; diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/a55.cpp index 5bce632bc4..1b828ee503 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/a55.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/a55.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -61,12 +61,23 @@ void a64_hybrid_fp32_mla_16x4_a55(const float *A, int lda, const float *B, float break; } - for (int y=0; y<M; y+=4) { + int rows_to_compute; + + for (int y=0; y<M; y+=rows_to_compute) { const float * const a_ptr0_base = A + (y * lda); const unsigned long ldab = lda * sizeof(float); float *c_ptr0 = C + (y * ldc); + rows_to_compute = M-y; + if (rows_to_compute > 4) { + if (rows_to_compute % 4) { + rows_to_compute = 4 - 1; + } else { + rows_to_compute = 4; + } + } + for (int x0=0; x0<N; x0+=16ul) { const long width = std::min((unsigned long)N-x0, 16ul); long loops = loops_count; @@ -90,7 +101,7 @@ void a64_hybrid_fp32_mla_16x4_a55(const float *A, int lda, const float *B, float } const float *biasptr = bias ? bias+x0 : nullbias; - switch(M-y) { + switch(rows_to_compute) { case 1: __asm __volatile ( "temploadreg0 .req X0\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/generic.cpp index 03f65889ea..43ff3a98dc 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -61,12 +61,23 @@ void a64_hybrid_fp32_mla_16x4(const float *A, int lda, const float *B, float *C, break; } - for (int y=0; y<M; y+=4) { + int rows_to_compute; + + for (int y=0; y<M; y+=rows_to_compute) { const float * const a_ptr0_base = A + (y * lda); const unsigned long ldab = lda * sizeof(float); float *c_ptr0 = C + (y * ldc); + rows_to_compute = M-y; + if (rows_to_compute > 4) { + if (rows_to_compute % 4) { + rows_to_compute = 4 - 1; + } else { + rows_to_compute = 4; + } + } + for (int x0=0; x0<N; x0+=16ul) { const long width = std::min((unsigned long)N-x0, 16ul); long loops = loops_count; @@ -90,7 +101,7 @@ void a64_hybrid_fp32_mla_16x4(const float *A, int lda, const float *B, float *C, } const float *biasptr = bias ? bias+x0 : nullbias; - switch(M-y) { + switch(rows_to_compute) { case 1: __asm __volatile ( "cbnz %[append], 1f\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/x1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/x1.cpp new file mode 100644 index 0000000000..f4fba227d6 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/x1.cpp @@ -0,0 +1,1810 @@ +/* + * Copyright (c) 2018-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifdef __aarch64__ + +#include <algorithm> + +#include "arm_gemm.hpp" + +#include "../../asmlib.hpp" +#include "../../utils.hpp" + +namespace arm_gemm { + +void a64_hybrid_fp32_mla_16x4_x1(const float *A, int lda, const float *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool append) { + const int K_stride = K; + const long loops_count = ((K + 4) / 8) - 1; + K -= loops_count * 8; + const long regs_count = (K / 4) - 1; + K -= (regs_count + 1) * 4; + const long blocks_count = K / 1; + float nullbias[16]; + if (!append && !bias) { + memset(nullbias, 0, (16 * sizeof(float))); + } + float minval = - static_cast<float>(std::numeric_limits<float>::infinity()); + float maxval = static_cast<float>(std::numeric_limits<float>::infinity()); + const float * const minptr = &minval; + const float * const maxptr = &maxval; + + switch(act.type) + { + default: + case Activation::Type::None: + break; + case Activation::Type::BoundedReLU: + maxval = static_cast<float>(act.param1); + /* fall through */ + case Activation::Type::ReLU: + minval = 0.0f; + break; + } + + int rows_to_compute; + + for (int y=0; y<M; y+=rows_to_compute) { + const float * const a_ptr0_base = A + (y * lda); + const unsigned long ldab = lda * sizeof(float); + + float *c_ptr0 = C + (y * ldc); + + rows_to_compute = M-y; + if (rows_to_compute > 4) { + if (rows_to_compute % 4) { + rows_to_compute = 4 - 1; + } else { + rows_to_compute = 4; + } + } + + for (int x0=0; x0<N; x0+=16ul) { + const long width = std::min((unsigned long)N-x0, 16ul); + long loops = loops_count; + long regs = regs_count; + long blocks = blocks_count; + const float *a_ptr0 = a_ptr0_base; + const float *b_ptr0 = B + (K_stride * x0); + const bool use_result_buffer = (width < 16); + float result_buffer[64]; + const unsigned long ldcb = (use_result_buffer ? 16 : ldc) * sizeof(float); + float *c_ptr_real = c_ptr0; + if (use_result_buffer && append) { + for(int cy=0; cy<std::min(M-y, 4); cy++) { + for(unsigned int cx=0; cx<width; cx++) { + result_buffer[cy * 16 + cx] = c_ptr_real[cy * ldc + cx]; + } + } + } + if (use_result_buffer) { + c_ptr0 = result_buffer; + } + const float *biasptr = bias ? 
bias+x0 : nullbias; + + switch(rows_to_compute) { + case 1: + __asm __volatile ( + "cbnz %[append], 1f\n" + "ldr q16, [%[biasptr]]\n" + "ldr q17, [%[biasptr], #0x10]\n" + "ldr q18, [%[biasptr], #0x20]\n" + "ldr q19, [%[biasptr], #0x30]\n" + "ldr q0, [%[a_ptr0]]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "ldr q8, [%[b_ptr0]]\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + "add %[b_ptr0], %[b_ptr0], #0x40\n" + "cbz %[loops], 2f\n" + "b 3f\n" + "1:\n" + "ldr q16, [%[c_ptr0]]\n" + "ldr q17, [%[c_ptr0], #0x10]\n" + "ldr q18, [%[c_ptr0], #0x20]\n" + "ldr q19, [%[c_ptr0], #0x30]\n" + "ldr q0, [%[a_ptr0]]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "ldr q8, [%[b_ptr0]]\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + "add %[b_ptr0], %[b_ptr0], #0x40\n" + "cbz %[loops], 2f\n" + "3:\n" + "fmla v16.4s, v8.4s, v0.s[0]\n" + "ldr q11, [%[b_ptr0], #-0x10]\n" + "fmla v17.4s, v9.4s, v0.s[0]\n" + "ldr q4, [%[a_ptr0]]\n" + "fmla v18.4s, v10.4s, v0.s[0]\n" + "ldr q8, [%[b_ptr0]]\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + "subs %[loops], %[loops], #0x1\n" + "fmla v19.4s, v11.4s, v0.s[0]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + "fmla v16.4s, v8.4s, v0.s[1]\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + "fmla v17.4s, v9.4s, v0.s[1]\n" + "ldr q8, [%[b_ptr0], #0x40]\n" + "ldr q9, [%[b_ptr0], #0x50]\n" + "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n" + "fmla v18.4s, v10.4s, v0.s[1]\n" + "ldr q10, [%[b_ptr0], #0x60]\n" + "fmla v19.4s, v11.4s, v0.s[1]\n" + "ldr q11, [%[b_ptr0], #0x70]\n" + "fmla v16.4s, v8.4s, v0.s[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x100\n" + "fmla v17.4s, v9.4s, v0.s[2]\n" + "ldr q8, [%[b_ptr0], #-0x80]\n" + "fmla v18.4s, v10.4s, v0.s[2]\n" + "ldr q9, [%[b_ptr0], #-0x70]\n" + "fmla v19.4s, v11.4s, v0.s[2]\n" + "ldr q10, [%[b_ptr0], #-0x60]\n" + "ldr q11, [%[b_ptr0], #-0x50]\n" + "add %[a_ptr0], %[a_ptr0], #0x20\n" + "fmla v16.4s, v8.4s, v0.s[3]\n" + "ldr q8, [%[b_ptr0], #-0x40]\n" + "fmla v17.4s, v9.4s, v0.s[3]\n" + "ldr q9, [%[b_ptr0], #-0x30]\n" + "fmla v18.4s, v10.4s, v0.s[3]\n" + "ldr q10, [%[b_ptr0], #-0x20]\n" + "fmla v19.4s, v11.4s, v0.s[3]\n" + "ldr q11, [%[b_ptr0], #-0x10]\n" + "fmla v16.4s, v8.4s, v4.s[0]\n" + "ldr q0, [%[a_ptr0], #-0x10]\n" + "fmla v17.4s, v9.4s, v4.s[0]\n" + "ldr q8, [%[b_ptr0]]\n" + "fmla v18.4s, v10.4s, v4.s[0]\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + "fmla v19.4s, v11.4s, v4.s[0]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + "fmla v16.4s, v8.4s, v4.s[1]\n" + "ldr q8, [%[b_ptr0], #0x40]\n" + "fmla v17.4s, v9.4s, v4.s[1]\n" + "ldr q9, [%[b_ptr0], #0x50]\n" + "fmla v18.4s, v10.4s, v4.s[1]\n" + "ldr q10, [%[b_ptr0], #0x60]\n" + "fmla v19.4s, v11.4s, v4.s[1]\n" + "ldr q11, [%[b_ptr0], #0x70]\n" + "fmla v16.4s, v8.4s, v4.s[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x100\n" + "fmla v17.4s, v9.4s, v4.s[2]\n" + "ldr q8, [%[b_ptr0], #-0x80]\n" + "fmla v18.4s, v10.4s, v4.s[2]\n" + "ldr q9, [%[b_ptr0], #-0x70]\n" + "fmla v19.4s, v11.4s, v4.s[2]\n" + "ldr q10, [%[b_ptr0], #-0x60]\n" + "ldr q11, [%[b_ptr0], #-0x50]\n" + "fmla v16.4s, v8.4s, v4.s[3]\n" + "ldr q8, [%[b_ptr0], #-0x40]\n" + "fmla v17.4s, v9.4s, v4.s[3]\n" + "ldr q9, [%[b_ptr0], #-0x30]\n" + "fmla v18.4s, v10.4s, v4.s[3]\n" + "ldr q10, [%[b_ptr0], #-0x20]\n" + "fmla v19.4s, v11.4s, v4.s[3]\n" + "b.ne 3b\n" + "2:\n" + "ldr q11, [%[b_ptr0], #-0x10]\n" + "prfm PSTL1KEEP, [%[c_ptr0]]\n" + "cbz %[regs], 4f\n" + "fmla v16.4s, v8.4s, v0.s[0]\n" + "ldr q4, [%[a_ptr0]]\n" + "fmla v17.4s, v9.4s, v0.s[0]\n" + "ldr q8, [%[b_ptr0]]\n" + "fmla v18.4s, v10.4s, v0.s[0]\n" + "ldr q9, 
[%[b_ptr0], #0x10]\n" + "fmla v19.4s, v11.4s, v0.s[0]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "fmla v16.4s, v8.4s, v0.s[1]\n" + "ldr q8, [%[b_ptr0], #0x40]\n" + "fmla v17.4s, v9.4s, v0.s[1]\n" + "ldr q9, [%[b_ptr0], #0x50]\n" + "fmla v18.4s, v10.4s, v0.s[1]\n" + "ldr q10, [%[b_ptr0], #0x60]\n" + "fmla v19.4s, v11.4s, v0.s[1]\n" + "ldr q11, [%[b_ptr0], #0x70]\n" + "fmla v16.4s, v8.4s, v0.s[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x100\n" + "fmla v17.4s, v9.4s, v0.s[2]\n" + "ldr q8, [%[b_ptr0], #-0x80]\n" + "fmla v18.4s, v10.4s, v0.s[2]\n" + "ldr q9, [%[b_ptr0], #-0x70]\n" + "fmla v19.4s, v11.4s, v0.s[2]\n" + "ldr q10, [%[b_ptr0], #-0x60]\n" + "ldr q11, [%[b_ptr0], #-0x50]\n" + "fmla v16.4s, v8.4s, v0.s[3]\n" + "ldr q8, [%[b_ptr0], #-0x40]\n" + "fmla v17.4s, v9.4s, v0.s[3]\n" + "ldr q9, [%[b_ptr0], #-0x30]\n" + "fmla v18.4s, v10.4s, v0.s[3]\n" + "ldr q10, [%[b_ptr0], #-0x20]\n" + "fmla v19.4s, v11.4s, v0.s[3]\n" + "ldr q11, [%[b_ptr0], #-0x10]\n" + "fmla v16.4s, v8.4s, v4.s[0]\n" + "ldr q8, [%[b_ptr0]]\n" + "fmla v17.4s, v9.4s, v4.s[0]\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + "fmla v18.4s, v10.4s, v4.s[0]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + "fmla v19.4s, v11.4s, v4.s[0]\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + "fmla v16.4s, v8.4s, v4.s[1]\n" + "ldr q8, [%[b_ptr0], #0x40]\n" + "fmla v17.4s, v9.4s, v4.s[1]\n" + "ldr q9, [%[b_ptr0], #0x50]\n" + "fmla v18.4s, v10.4s, v4.s[1]\n" + "ldr q10, [%[b_ptr0], #0x60]\n" + "fmla v19.4s, v11.4s, v4.s[1]\n" + "ldr q11, [%[b_ptr0], #0x70]\n" + "fmla v16.4s, v8.4s, v4.s[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x100\n" + "fmla v17.4s, v9.4s, v4.s[2]\n" + "ldr q8, [%[b_ptr0], #-0x80]\n" + "fmla v18.4s, v10.4s, v4.s[2]\n" + "ldr q9, [%[b_ptr0], #-0x70]\n" + "fmla v19.4s, v11.4s, v4.s[2]\n" + "ldr q10, [%[b_ptr0], #-0x60]\n" + "ldr q11, [%[b_ptr0], #-0x50]\n" + "add %[b_ptr0], %[b_ptr0], #-0x40\n" + "fmla v16.4s, v8.4s, v4.s[3]\n" + "fmla v17.4s, v9.4s, v4.s[3]\n" + "fmla v18.4s, v10.4s, v4.s[3]\n" + "fmla v19.4s, v11.4s, v4.s[3]\n" + "b 5f\n" + "4:\n" + "fmla v16.4s, v8.4s, v0.s[0]\n" + "ldr q8, [%[b_ptr0]]\n" + "fmla v17.4s, v9.4s, v0.s[0]\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + "fmla v18.4s, v10.4s, v0.s[0]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + "fmla v19.4s, v11.4s, v0.s[0]\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + "fmla v16.4s, v8.4s, v0.s[1]\n" + "ldr q8, [%[b_ptr0], #0x40]\n" + "fmla v17.4s, v9.4s, v0.s[1]\n" + "ldr q9, [%[b_ptr0], #0x50]\n" + "fmla v18.4s, v10.4s, v0.s[1]\n" + "ldr q10, [%[b_ptr0], #0x60]\n" + "fmla v19.4s, v11.4s, v0.s[1]\n" + "ldr q11, [%[b_ptr0], #0x70]\n" + "fmla v16.4s, v8.4s, v0.s[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x100\n" + "fmla v17.4s, v9.4s, v0.s[2]\n" + "ldr q8, [%[b_ptr0], #-0x80]\n" + "fmla v18.4s, v10.4s, v0.s[2]\n" + "ldr q9, [%[b_ptr0], #-0x70]\n" + "fmla v19.4s, v11.4s, v0.s[2]\n" + "ldr q10, [%[b_ptr0], #-0x60]\n" + "ldr q11, [%[b_ptr0], #-0x50]\n" + "add %[b_ptr0], %[b_ptr0], #-0x40\n" + "fmla v16.4s, v8.4s, v0.s[3]\n" + "fmla v17.4s, v9.4s, v0.s[3]\n" + "fmla v18.4s, v10.4s, v0.s[3]\n" + "fmla v19.4s, v11.4s, v0.s[3]\n" + "5:\n" + "cbz %[blocks], 6f\n" + "7:\n" + "ldr q8, [%[b_ptr0]]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + "ldr s0, [%[a_ptr0]]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + "add %[a_ptr0], %[a_ptr0], #0x4\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + "add %[b_ptr0], %[b_ptr0], #0x40\n" + "fmla v16.4s, v8.4s, v0.s[0]\n" + "fmla v17.4s, v9.4s, v0.s[0]\n" + "fmla v18.4s, v10.4s, v0.s[0]\n" + "fmla v19.4s, v11.4s, v0.s[0]\n" + 
"b.ne 7b\n" + "6:\n" + "ld1r {v14.4s}, [%[minptr]]\n" + "ld1r {v15.4s}, [%[maxptr]]\n" + "fmax v16.4s, v16.4s, v14.4s\n" + "fmax v17.4s, v17.4s, v14.4s\n" + "fmax v18.4s, v18.4s, v14.4s\n" + "fmax v19.4s, v19.4s, v14.4s\n" + "fmin v16.4s, v16.4s, v15.4s\n" + "fmin v17.4s, v17.4s, v15.4s\n" + "fmin v18.4s, v18.4s, v15.4s\n" + "fmin v19.4s, v19.4s, v15.4s\n" + "str q16, [%[c_ptr0]]\n" + "str q17, [%[c_ptr0], #0x10]\n" + "str q18, [%[c_ptr0], #0x20]\n" + "str q19, [%[c_ptr0], #0x30]\n" + "add %[c_ptr0], %[c_ptr0], #0x40\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks) + : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory" + ); + break; + case 2: + __asm __volatile ( + "a_ptr1 .req X0\n" + "c_ptr1 .req X1\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "cbnz %[append], 1f\n" + "ldr q16, [%[biasptr]]\n" + "ldr q17, [%[biasptr], #0x10]\n" + "ldr q18, [%[biasptr], #0x20]\n" + "ldr q19, [%[biasptr], #0x30]\n" + "mov v20.16b, v16.16b\n" + "ldr q0, [%[a_ptr0]]\n" + "mov v21.16b, v17.16b\n" + "ldr q1, [a_ptr1]\n" + "mov v22.16b, v18.16b\n" + "ldr q8, [%[b_ptr0]]\n" + "mov v23.16b, v19.16b\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "add a_ptr1, a_ptr1, #0x10\n" + "add %[b_ptr0], %[b_ptr0], #0x40\n" + "cbz %[loops], 2f\n" + "b 3f\n" + "1:\n" + "ldr q16, [%[c_ptr0]]\n" + "ldr q17, [%[c_ptr0], #0x10]\n" + "ldr q18, [%[c_ptr0], #0x20]\n" + "ldr q19, [%[c_ptr0], #0x30]\n" + "ldr q20, [c_ptr1]\n" + "ldr q21, [c_ptr1, #0x10]\n" + "ldr q22, [c_ptr1, #0x20]\n" + "ldr q23, [c_ptr1, #0x30]\n" + "ldr q0, [%[a_ptr0]]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "ldr q1, [a_ptr1]\n" + "add a_ptr1, a_ptr1, #0x10\n" + "ldr q8, [%[b_ptr0]]\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + "add %[b_ptr0], %[b_ptr0], #0x40\n" + "cbz %[loops], 2f\n" + "3:\n" + "fmla v16.4s, v8.4s, v0.s[0]\n" + "ldr q11, [%[b_ptr0], #-0x10]\n" + "fmla v20.4s, v8.4s, v1.s[0]\n" + "ldr q4, [%[a_ptr0]]\n" + "fmla v17.4s, v9.4s, v0.s[0]\n" + "ldr q5, [a_ptr1]\n" + "fmla v21.4s, v9.4s, v1.s[0]\n" + "ldr q8, [%[b_ptr0]]\n" + "fmla v18.4s, v10.4s, v0.s[0]\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + "fmla v22.4s, v10.4s, v1.s[0]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + "fmla v19.4s, v11.4s, v0.s[0]\n" + "subs %[loops], %[loops], #0x1\n" + "fmla v23.4s, v11.4s, v1.s[0]\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + "fmla v16.4s, v8.4s, v0.s[1]\n" + "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n" + "fmla v20.4s, v8.4s, v1.s[1]\n" + "ldr q8, [%[b_ptr0], #0x40]\n" + "fmla v17.4s, v9.4s, v0.s[1]\n" + "add %[a_ptr0], %[a_ptr0], #0x20\n" + "fmla v21.4s, v9.4s, v1.s[1]\n" + "ldr q9, [%[b_ptr0], #0x50]\n" + "fmla v18.4s, v10.4s, v0.s[1]\n" + "add a_ptr1, a_ptr1, #0x20\n" + "fmla v22.4s, v10.4s, v1.s[1]\n" + "ldr q10, [%[b_ptr0], #0x60]\n" + "fmla v19.4s, v11.4s, v0.s[1]\n" + "prfm PLDL1KEEP, [a_ptr1, #0x40]\n" + "fmla v23.4s, v11.4s, v1.s[1]\n" + "ldr q11, [%[b_ptr0], #0x70]\n" + "fmla v16.4s, v8.4s, v0.s[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x100\n" + "fmla v20.4s, v8.4s, v1.s[2]\n" + "ldr q8, [%[b_ptr0], #-0x80]\n" + "fmla 
v17.4s, v9.4s, v0.s[2]\n" + "fmla v21.4s, v9.4s, v1.s[2]\n" + "ldr q9, [%[b_ptr0], #-0x70]\n" + "fmla v18.4s, v10.4s, v0.s[2]\n" + "fmla v22.4s, v10.4s, v1.s[2]\n" + "ldr q10, [%[b_ptr0], #-0x60]\n" + "fmla v19.4s, v11.4s, v0.s[2]\n" + "fmla v23.4s, v11.4s, v1.s[2]\n" + "ldr q11, [%[b_ptr0], #-0x50]\n" + "fmla v16.4s, v8.4s, v0.s[3]\n" + "fmla v20.4s, v8.4s, v1.s[3]\n" + "ldr q8, [%[b_ptr0], #-0x40]\n" + "fmla v17.4s, v9.4s, v0.s[3]\n" + "fmla v21.4s, v9.4s, v1.s[3]\n" + "ldr q9, [%[b_ptr0], #-0x30]\n" + "fmla v18.4s, v10.4s, v0.s[3]\n" + "fmla v22.4s, v10.4s, v1.s[3]\n" + "ldr q10, [%[b_ptr0], #-0x20]\n" + "fmla v19.4s, v11.4s, v0.s[3]\n" + "ldr q0, [%[a_ptr0], #-0x10]\n" + "fmla v23.4s, v11.4s, v1.s[3]\n" + "ldr q11, [%[b_ptr0], #-0x10]\n" + "fmla v16.4s, v8.4s, v4.s[0]\n" + "ldr q1, [a_ptr1, #-0x10]\n" + "fmla v20.4s, v8.4s, v5.s[0]\n" + "ldr q8, [%[b_ptr0]]\n" + "fmla v17.4s, v9.4s, v4.s[0]\n" + "fmla v21.4s, v9.4s, v5.s[0]\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + "fmla v18.4s, v10.4s, v4.s[0]\n" + "fmla v22.4s, v10.4s, v5.s[0]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + "fmla v19.4s, v11.4s, v4.s[0]\n" + "fmla v23.4s, v11.4s, v5.s[0]\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + "fmla v16.4s, v8.4s, v4.s[1]\n" + "fmla v20.4s, v8.4s, v5.s[1]\n" + "ldr q8, [%[b_ptr0], #0x40]\n" + "fmla v17.4s, v9.4s, v4.s[1]\n" + "fmla v21.4s, v9.4s, v5.s[1]\n" + "ldr q9, [%[b_ptr0], #0x50]\n" + "fmla v18.4s, v10.4s, v4.s[1]\n" + "fmla v22.4s, v10.4s, v5.s[1]\n" + "ldr q10, [%[b_ptr0], #0x60]\n" + "fmla v19.4s, v11.4s, v4.s[1]\n" + "fmla v23.4s, v11.4s, v5.s[1]\n" + "ldr q11, [%[b_ptr0], #0x70]\n" + "fmla v16.4s, v8.4s, v4.s[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x100\n" + "fmla v20.4s, v8.4s, v5.s[2]\n" + "ldr q8, [%[b_ptr0], #-0x80]\n" + "fmla v17.4s, v9.4s, v4.s[2]\n" + "fmla v21.4s, v9.4s, v5.s[2]\n" + "ldr q9, [%[b_ptr0], #-0x70]\n" + "fmla v18.4s, v10.4s, v4.s[2]\n" + "fmla v22.4s, v10.4s, v5.s[2]\n" + "ldr q10, [%[b_ptr0], #-0x60]\n" + "fmla v19.4s, v11.4s, v4.s[2]\n" + "fmla v23.4s, v11.4s, v5.s[2]\n" + "ldr q11, [%[b_ptr0], #-0x50]\n" + "fmla v16.4s, v8.4s, v4.s[3]\n" + "fmla v20.4s, v8.4s, v5.s[3]\n" + "ldr q8, [%[b_ptr0], #-0x40]\n" + "fmla v17.4s, v9.4s, v4.s[3]\n" + "fmla v21.4s, v9.4s, v5.s[3]\n" + "ldr q9, [%[b_ptr0], #-0x30]\n" + "fmla v18.4s, v10.4s, v4.s[3]\n" + "fmla v22.4s, v10.4s, v5.s[3]\n" + "ldr q10, [%[b_ptr0], #-0x20]\n" + "fmla v19.4s, v11.4s, v4.s[3]\n" + "fmla v23.4s, v11.4s, v5.s[3]\n" + "b.ne 3b\n" + "2:\n" + "ldr q11, [%[b_ptr0], #-0x10]\n" + "prfm PSTL1KEEP, [%[c_ptr0]]\n" + "prfm PSTL1KEEP, [c_ptr1]\n" + "cbz %[regs], 4f\n" + "fmla v16.4s, v8.4s, v0.s[0]\n" + "ldr q4, [%[a_ptr0]]\n" + "fmla v20.4s, v8.4s, v1.s[0]\n" + "ldr q5, [a_ptr1]\n" + "fmla v17.4s, v9.4s, v0.s[0]\n" + "ldr q8, [%[b_ptr0]]\n" + "fmla v21.4s, v9.4s, v1.s[0]\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + "fmla v18.4s, v10.4s, v0.s[0]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "fmla v22.4s, v10.4s, v1.s[0]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + "fmla v19.4s, v11.4s, v0.s[0]\n" + "add a_ptr1, a_ptr1, #0x10\n" + "fmla v23.4s, v11.4s, v1.s[0]\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + "fmla v16.4s, v8.4s, v0.s[1]\n" + "fmla v20.4s, v8.4s, v1.s[1]\n" + "ldr q8, [%[b_ptr0], #0x40]\n" + "fmla v17.4s, v9.4s, v0.s[1]\n" + "fmla v21.4s, v9.4s, v1.s[1]\n" + "ldr q9, [%[b_ptr0], #0x50]\n" + "fmla v18.4s, v10.4s, v0.s[1]\n" + "fmla v22.4s, v10.4s, v1.s[1]\n" + "ldr q10, [%[b_ptr0], #0x60]\n" + "fmla v19.4s, v11.4s, v0.s[1]\n" + "fmla v23.4s, v11.4s, v1.s[1]\n" + "ldr q11, [%[b_ptr0], #0x70]\n" + "fmla v16.4s, v8.4s, v0.s[2]\n" + "add 
%[b_ptr0], %[b_ptr0], #0x100\n" + "fmla v20.4s, v8.4s, v1.s[2]\n" + "ldr q8, [%[b_ptr0], #-0x80]\n" + "fmla v17.4s, v9.4s, v0.s[2]\n" + "fmla v21.4s, v9.4s, v1.s[2]\n" + "ldr q9, [%[b_ptr0], #-0x70]\n" + "fmla v18.4s, v10.4s, v0.s[2]\n" + "fmla v22.4s, v10.4s, v1.s[2]\n" + "ldr q10, [%[b_ptr0], #-0x60]\n" + "fmla v19.4s, v11.4s, v0.s[2]\n" + "fmla v23.4s, v11.4s, v1.s[2]\n" + "ldr q11, [%[b_ptr0], #-0x50]\n" + "fmla v16.4s, v8.4s, v0.s[3]\n" + "fmla v20.4s, v8.4s, v1.s[3]\n" + "ldr q8, [%[b_ptr0], #-0x40]\n" + "fmla v17.4s, v9.4s, v0.s[3]\n" + "fmla v21.4s, v9.4s, v1.s[3]\n" + "ldr q9, [%[b_ptr0], #-0x30]\n" + "fmla v18.4s, v10.4s, v0.s[3]\n" + "fmla v22.4s, v10.4s, v1.s[3]\n" + "ldr q10, [%[b_ptr0], #-0x20]\n" + "fmla v19.4s, v11.4s, v0.s[3]\n" + "fmla v23.4s, v11.4s, v1.s[3]\n" + "ldr q11, [%[b_ptr0], #-0x10]\n" + "fmla v16.4s, v8.4s, v4.s[0]\n" + "fmla v20.4s, v8.4s, v5.s[0]\n" + "ldr q8, [%[b_ptr0]]\n" + "fmla v17.4s, v9.4s, v4.s[0]\n" + "fmla v21.4s, v9.4s, v5.s[0]\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + "fmla v18.4s, v10.4s, v4.s[0]\n" + "fmla v22.4s, v10.4s, v5.s[0]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + "fmla v19.4s, v11.4s, v4.s[0]\n" + "fmla v23.4s, v11.4s, v5.s[0]\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + "fmla v16.4s, v8.4s, v4.s[1]\n" + "fmla v20.4s, v8.4s, v5.s[1]\n" + "ldr q8, [%[b_ptr0], #0x40]\n" + "fmla v17.4s, v9.4s, v4.s[1]\n" + "fmla v21.4s, v9.4s, v5.s[1]\n" + "ldr q9, [%[b_ptr0], #0x50]\n" + "fmla v18.4s, v10.4s, v4.s[1]\n" + "fmla v22.4s, v10.4s, v5.s[1]\n" + "ldr q10, [%[b_ptr0], #0x60]\n" + "fmla v19.4s, v11.4s, v4.s[1]\n" + "fmla v23.4s, v11.4s, v5.s[1]\n" + "ldr q11, [%[b_ptr0], #0x70]\n" + "fmla v16.4s, v8.4s, v4.s[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x100\n" + "fmla v20.4s, v8.4s, v5.s[2]\n" + "ldr q8, [%[b_ptr0], #-0x80]\n" + "fmla v17.4s, v9.4s, v4.s[2]\n" + "fmla v21.4s, v9.4s, v5.s[2]\n" + "ldr q9, [%[b_ptr0], #-0x70]\n" + "fmla v18.4s, v10.4s, v4.s[2]\n" + "fmla v22.4s, v10.4s, v5.s[2]\n" + "ldr q10, [%[b_ptr0], #-0x60]\n" + "fmla v19.4s, v11.4s, v4.s[2]\n" + "fmla v23.4s, v11.4s, v5.s[2]\n" + "ldr q11, [%[b_ptr0], #-0x50]\n" + "fmla v16.4s, v8.4s, v4.s[3]\n" + "add %[b_ptr0], %[b_ptr0], #-0x40\n" + "fmla v20.4s, v8.4s, v5.s[3]\n" + "fmla v17.4s, v9.4s, v4.s[3]\n" + "fmla v21.4s, v9.4s, v5.s[3]\n" + "fmla v18.4s, v10.4s, v4.s[3]\n" + "fmla v22.4s, v10.4s, v5.s[3]\n" + "fmla v19.4s, v11.4s, v4.s[3]\n" + "fmla v23.4s, v11.4s, v5.s[3]\n" + "b 5f\n" + "4:\n" + "fmla v16.4s, v8.4s, v0.s[0]\n" + "fmla v20.4s, v8.4s, v1.s[0]\n" + "ldr q8, [%[b_ptr0]]\n" + "fmla v17.4s, v9.4s, v0.s[0]\n" + "fmla v21.4s, v9.4s, v1.s[0]\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + "fmla v18.4s, v10.4s, v0.s[0]\n" + "fmla v22.4s, v10.4s, v1.s[0]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + "fmla v19.4s, v11.4s, v0.s[0]\n" + "fmla v23.4s, v11.4s, v1.s[0]\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + "fmla v16.4s, v8.4s, v0.s[1]\n" + "fmla v20.4s, v8.4s, v1.s[1]\n" + "ldr q8, [%[b_ptr0], #0x40]\n" + "fmla v17.4s, v9.4s, v0.s[1]\n" + "fmla v21.4s, v9.4s, v1.s[1]\n" + "ldr q9, [%[b_ptr0], #0x50]\n" + "fmla v18.4s, v10.4s, v0.s[1]\n" + "fmla v22.4s, v10.4s, v1.s[1]\n" + "ldr q10, [%[b_ptr0], #0x60]\n" + "fmla v19.4s, v11.4s, v0.s[1]\n" + "fmla v23.4s, v11.4s, v1.s[1]\n" + "ldr q11, [%[b_ptr0], #0x70]\n" + "fmla v16.4s, v8.4s, v0.s[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x100\n" + "fmla v20.4s, v8.4s, v1.s[2]\n" + "ldr q8, [%[b_ptr0], #-0x80]\n" + "fmla v17.4s, v9.4s, v0.s[2]\n" + "fmla v21.4s, v9.4s, v1.s[2]\n" + "ldr q9, [%[b_ptr0], #-0x70]\n" + "fmla v18.4s, v10.4s, v0.s[2]\n" + "fmla v22.4s, v10.4s, v1.s[2]\n" + 
"ldr q10, [%[b_ptr0], #-0x60]\n" + "fmla v19.4s, v11.4s, v0.s[2]\n" + "fmla v23.4s, v11.4s, v1.s[2]\n" + "ldr q11, [%[b_ptr0], #-0x50]\n" + "fmla v16.4s, v8.4s, v0.s[3]\n" + "add %[b_ptr0], %[b_ptr0], #-0x40\n" + "fmla v20.4s, v8.4s, v1.s[3]\n" + "fmla v17.4s, v9.4s, v0.s[3]\n" + "fmla v21.4s, v9.4s, v1.s[3]\n" + "fmla v18.4s, v10.4s, v0.s[3]\n" + "fmla v22.4s, v10.4s, v1.s[3]\n" + "fmla v19.4s, v11.4s, v0.s[3]\n" + "fmla v23.4s, v11.4s, v1.s[3]\n" + "5:\n" + "cbz %[blocks], 6f\n" + "7:\n" + "ldr q8, [%[b_ptr0]]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + "ldr s0, [%[a_ptr0]]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + "add %[a_ptr0], %[a_ptr0], #0x4\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + "add %[b_ptr0], %[b_ptr0], #0x40\n" + "fmla v16.4s, v8.4s, v0.s[0]\n" + "ldr s1, [a_ptr1]\n" + "fmla v17.4s, v9.4s, v0.s[0]\n" + "add a_ptr1, a_ptr1, #0x4\n" + "fmla v18.4s, v10.4s, v0.s[0]\n" + "fmla v20.4s, v8.4s, v1.s[0]\n" + "fmla v21.4s, v9.4s, v1.s[0]\n" + "fmla v22.4s, v10.4s, v1.s[0]\n" + "fmla v19.4s, v11.4s, v0.s[0]\n" + "fmla v23.4s, v11.4s, v1.s[0]\n" + "b.ne 7b\n" + "6:\n" + "ld1r {v14.4s}, [%[minptr]]\n" + "ld1r {v15.4s}, [%[maxptr]]\n" + "fmax v16.4s, v16.4s, v14.4s\n" + "fmax v17.4s, v17.4s, v14.4s\n" + "fmax v18.4s, v18.4s, v14.4s\n" + "fmax v19.4s, v19.4s, v14.4s\n" + "fmin v16.4s, v16.4s, v15.4s\n" + "fmin v17.4s, v17.4s, v15.4s\n" + "fmin v18.4s, v18.4s, v15.4s\n" + "fmin v19.4s, v19.4s, v15.4s\n" + "str q16, [%[c_ptr0]]\n" + "fmax v20.4s, v20.4s, v14.4s\n" + "fmax v21.4s, v21.4s, v14.4s\n" + "fmax v22.4s, v22.4s, v14.4s\n" + "str q17, [%[c_ptr0], #0x10]\n" + "fmax v23.4s, v23.4s, v14.4s\n" + "fmin v20.4s, v20.4s, v15.4s\n" + "fmin v21.4s, v21.4s, v15.4s\n" + "str q18, [%[c_ptr0], #0x20]\n" + "fmin v22.4s, v22.4s, v15.4s\n" + "fmin v23.4s, v23.4s, v15.4s\n" + "str q19, [%[c_ptr0], #0x30]\n" + "add %[c_ptr0], %[c_ptr0], #0x40\n" + "str q20, [c_ptr1]\n" + "str q21, [c_ptr1, #0x10]\n" + "str q22, [c_ptr1, #0x20]\n" + "str q23, [c_ptr1, #0x30]\n" + ".unreq a_ptr1\n" + ".unreq c_ptr1\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks) + : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "cc", "memory" + ); + break; + case 3: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "c_ptr1 .req X2\n" + "c_ptr2 .req X3\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "cbnz %[append], 1f\n" + "ldr q16, [%[biasptr]]\n" + "ldr q17, [%[biasptr], #0x10]\n" + "ldr q18, [%[biasptr], #0x20]\n" + "ldr q19, [%[biasptr], #0x30]\n" + "mov v20.16b, v16.16b\n" + "ldr q0, [%[a_ptr0]]\n" + "mov v21.16b, v17.16b\n" + "ldr q1, [a_ptr1]\n" + "mov v22.16b, v18.16b\n" + "ldr q2, [a_ptr2]\n" + "mov v23.16b, v19.16b\n" + "ldr q8, [%[b_ptr0]]\n" + "mov v24.16b, v16.16b\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + "mov v25.16b, v17.16b\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + "mov v26.16b, v18.16b\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "mov v27.16b, v19.16b\n" + "add a_ptr1, a_ptr1, #0x10\n" + "add a_ptr2, a_ptr2, #0x10\n" + "add %[b_ptr0], %[b_ptr0], 
#0x40\n" + "cbz %[loops], 2f\n" + "b 3f\n" + "1:\n" + "ldr q16, [%[c_ptr0]]\n" + "ldr q17, [%[c_ptr0], #0x10]\n" + "ldr q18, [%[c_ptr0], #0x20]\n" + "ldr q19, [%[c_ptr0], #0x30]\n" + "ldr q20, [c_ptr1]\n" + "ldr q21, [c_ptr1, #0x10]\n" + "ldr q22, [c_ptr1, #0x20]\n" + "ldr q23, [c_ptr1, #0x30]\n" + "ldr q24, [c_ptr2]\n" + "ldr q25, [c_ptr2, #0x10]\n" + "ldr q26, [c_ptr2, #0x20]\n" + "ldr q27, [c_ptr2, #0x30]\n" + "ldr q0, [%[a_ptr0]]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "ldr q1, [a_ptr1]\n" + "add a_ptr1, a_ptr1, #0x10\n" + "ldr q2, [a_ptr2]\n" + "add a_ptr2, a_ptr2, #0x10\n" + "ldr q8, [%[b_ptr0]]\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + "add %[b_ptr0], %[b_ptr0], #0x40\n" + "cbz %[loops], 2f\n" + "3:\n" + "fmla v16.4s, v8.4s, v0.s[0]\n" + "ldr q11, [%[b_ptr0], #-0x10]\n" + "fmla v20.4s, v8.4s, v1.s[0]\n" + "ldr q4, [%[a_ptr0]]\n" + "fmla v24.4s, v8.4s, v2.s[0]\n" + "ldr q5, [a_ptr1]\n" + "fmla v17.4s, v9.4s, v0.s[0]\n" + "ldr q6, [a_ptr2]\n" + "fmla v21.4s, v9.4s, v1.s[0]\n" + "ldr q8, [%[b_ptr0]]\n" + "fmla v25.4s, v9.4s, v2.s[0]\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + "fmla v18.4s, v10.4s, v0.s[0]\n" + "subs %[loops], %[loops], #0x1\n" + "fmla v22.4s, v10.4s, v1.s[0]\n" + "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n" + "fmla v26.4s, v10.4s, v2.s[0]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + "fmla v19.4s, v11.4s, v0.s[0]\n" + "add %[a_ptr0], %[a_ptr0], #0x20\n" + "fmla v23.4s, v11.4s, v1.s[0]\n" + "add a_ptr1, a_ptr1, #0x20\n" + "fmla v27.4s, v11.4s, v2.s[0]\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + "fmla v16.4s, v8.4s, v0.s[1]\n" + "add a_ptr2, a_ptr2, #0x20\n" + "fmla v20.4s, v8.4s, v1.s[1]\n" + "prfm PLDL1KEEP, [a_ptr1, #0x40]\n" + "fmla v24.4s, v8.4s, v2.s[1]\n" + "ldr q8, [%[b_ptr0], #0x40]\n" + "fmla v17.4s, v9.4s, v0.s[1]\n" + "prfm PLDL1KEEP, [a_ptr2, #0x40]\n" + "fmla v21.4s, v9.4s, v1.s[1]\n" + "fmla v25.4s, v9.4s, v2.s[1]\n" + "ldr q9, [%[b_ptr0], #0x50]\n" + "fmla v18.4s, v10.4s, v0.s[1]\n" + "fmla v22.4s, v10.4s, v1.s[1]\n" + "fmla v26.4s, v10.4s, v2.s[1]\n" + "ldr q10, [%[b_ptr0], #0x60]\n" + "fmla v19.4s, v11.4s, v0.s[1]\n" + "fmla v23.4s, v11.4s, v1.s[1]\n" + "fmla v27.4s, v11.4s, v2.s[1]\n" + "ldr q11, [%[b_ptr0], #0x70]\n" + "fmla v16.4s, v8.4s, v0.s[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x100\n" + "fmla v20.4s, v8.4s, v1.s[2]\n" + "fmla v24.4s, v8.4s, v2.s[2]\n" + "ldr q8, [%[b_ptr0], #-0x80]\n" + "fmla v17.4s, v9.4s, v0.s[2]\n" + "fmla v21.4s, v9.4s, v1.s[2]\n" + "fmla v25.4s, v9.4s, v2.s[2]\n" + "ldr q9, [%[b_ptr0], #-0x70]\n" + "fmla v18.4s, v10.4s, v0.s[2]\n" + "fmla v22.4s, v10.4s, v1.s[2]\n" + "fmla v26.4s, v10.4s, v2.s[2]\n" + "ldr q10, [%[b_ptr0], #-0x60]\n" + "fmla v19.4s, v11.4s, v0.s[2]\n" + "fmla v23.4s, v11.4s, v1.s[2]\n" + "fmla v27.4s, v11.4s, v2.s[2]\n" + "ldr q11, [%[b_ptr0], #-0x50]\n" + "fmla v16.4s, v8.4s, v0.s[3]\n" + "fmla v20.4s, v8.4s, v1.s[3]\n" + "fmla v24.4s, v8.4s, v2.s[3]\n" + "ldr q8, [%[b_ptr0], #-0x40]\n" + "fmla v17.4s, v9.4s, v0.s[3]\n" + "fmla v21.4s, v9.4s, v1.s[3]\n" + "fmla v25.4s, v9.4s, v2.s[3]\n" + "ldr q9, [%[b_ptr0], #-0x30]\n" + "fmla v18.4s, v10.4s, v0.s[3]\n" + "fmla v22.4s, v10.4s, v1.s[3]\n" + "fmla v26.4s, v10.4s, v2.s[3]\n" + "ldr q10, [%[b_ptr0], #-0x20]\n" + "fmla v19.4s, v11.4s, v0.s[3]\n" + "ldr q0, [%[a_ptr0], #-0x10]\n" + "fmla v23.4s, v11.4s, v1.s[3]\n" + "ldr q1, [a_ptr1, #-0x10]\n" + "fmla v27.4s, v11.4s, v2.s[3]\n" + "ldr q11, [%[b_ptr0], #-0x10]\n" + "fmla v16.4s, v8.4s, v4.s[0]\n" + "ldr q2, [a_ptr2, #-0x10]\n" + "fmla v20.4s, v8.4s, v5.s[0]\n" + "fmla v24.4s, v8.4s, v6.s[0]\n" + "ldr q8, 
[%[b_ptr0]]\n" + "fmla v17.4s, v9.4s, v4.s[0]\n" + "fmla v21.4s, v9.4s, v5.s[0]\n" + "fmla v25.4s, v9.4s, v6.s[0]\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + "fmla v18.4s, v10.4s, v4.s[0]\n" + "fmla v22.4s, v10.4s, v5.s[0]\n" + "fmla v26.4s, v10.4s, v6.s[0]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + "fmla v19.4s, v11.4s, v4.s[0]\n" + "fmla v23.4s, v11.4s, v5.s[0]\n" + "fmla v27.4s, v11.4s, v6.s[0]\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + "fmla v16.4s, v8.4s, v4.s[1]\n" + "fmla v20.4s, v8.4s, v5.s[1]\n" + "fmla v24.4s, v8.4s, v6.s[1]\n" + "ldr q8, [%[b_ptr0], #0x40]\n" + "fmla v17.4s, v9.4s, v4.s[1]\n" + "fmla v21.4s, v9.4s, v5.s[1]\n" + "fmla v25.4s, v9.4s, v6.s[1]\n" + "ldr q9, [%[b_ptr0], #0x50]\n" + "fmla v18.4s, v10.4s, v4.s[1]\n" + "fmla v22.4s, v10.4s, v5.s[1]\n" + "fmla v26.4s, v10.4s, v6.s[1]\n" + "ldr q10, [%[b_ptr0], #0x60]\n" + "fmla v19.4s, v11.4s, v4.s[1]\n" + "fmla v23.4s, v11.4s, v5.s[1]\n" + "fmla v27.4s, v11.4s, v6.s[1]\n" + "ldr q11, [%[b_ptr0], #0x70]\n" + "fmla v16.4s, v8.4s, v4.s[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x100\n" + "fmla v20.4s, v8.4s, v5.s[2]\n" + "fmla v24.4s, v8.4s, v6.s[2]\n" + "ldr q8, [%[b_ptr0], #-0x80]\n" + "fmla v17.4s, v9.4s, v4.s[2]\n" + "fmla v21.4s, v9.4s, v5.s[2]\n" + "fmla v25.4s, v9.4s, v6.s[2]\n" + "ldr q9, [%[b_ptr0], #-0x70]\n" + "fmla v18.4s, v10.4s, v4.s[2]\n" + "fmla v22.4s, v10.4s, v5.s[2]\n" + "fmla v26.4s, v10.4s, v6.s[2]\n" + "ldr q10, [%[b_ptr0], #-0x60]\n" + "fmla v19.4s, v11.4s, v4.s[2]\n" + "fmla v23.4s, v11.4s, v5.s[2]\n" + "fmla v27.4s, v11.4s, v6.s[2]\n" + "ldr q11, [%[b_ptr0], #-0x50]\n" + "fmla v16.4s, v8.4s, v4.s[3]\n" + "fmla v20.4s, v8.4s, v5.s[3]\n" + "fmla v24.4s, v8.4s, v6.s[3]\n" + "ldr q8, [%[b_ptr0], #-0x40]\n" + "fmla v17.4s, v9.4s, v4.s[3]\n" + "fmla v21.4s, v9.4s, v5.s[3]\n" + "fmla v25.4s, v9.4s, v6.s[3]\n" + "ldr q9, [%[b_ptr0], #-0x30]\n" + "fmla v18.4s, v10.4s, v4.s[3]\n" + "fmla v22.4s, v10.4s, v5.s[3]\n" + "fmla v26.4s, v10.4s, v6.s[3]\n" + "ldr q10, [%[b_ptr0], #-0x20]\n" + "fmla v19.4s, v11.4s, v4.s[3]\n" + "fmla v23.4s, v11.4s, v5.s[3]\n" + "fmla v27.4s, v11.4s, v6.s[3]\n" + "b.ne 3b\n" + "2:\n" + "ldr q11, [%[b_ptr0], #-0x10]\n" + "prfm PSTL1KEEP, [%[c_ptr0]]\n" + "prfm PSTL1KEEP, [c_ptr1]\n" + "prfm PSTL1KEEP, [c_ptr2]\n" + "cbz %[regs], 4f\n" + "fmla v16.4s, v8.4s, v0.s[0]\n" + "ldr q4, [%[a_ptr0]]\n" + "fmla v20.4s, v8.4s, v1.s[0]\n" + "ldr q5, [a_ptr1]\n" + "fmla v24.4s, v8.4s, v2.s[0]\n" + "ldr q6, [a_ptr2]\n" + "fmla v17.4s, v9.4s, v0.s[0]\n" + "ldr q8, [%[b_ptr0]]\n" + "fmla v21.4s, v9.4s, v1.s[0]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "fmla v25.4s, v9.4s, v2.s[0]\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + "fmla v18.4s, v10.4s, v0.s[0]\n" + "add a_ptr1, a_ptr1, #0x10\n" + "fmla v22.4s, v10.4s, v1.s[0]\n" + "add a_ptr2, a_ptr2, #0x10\n" + "fmla v26.4s, v10.4s, v2.s[0]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + "fmla v19.4s, v11.4s, v0.s[0]\n" + "fmla v23.4s, v11.4s, v1.s[0]\n" + "fmla v27.4s, v11.4s, v2.s[0]\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + "fmla v16.4s, v8.4s, v0.s[1]\n" + "fmla v20.4s, v8.4s, v1.s[1]\n" + "fmla v24.4s, v8.4s, v2.s[1]\n" + "ldr q8, [%[b_ptr0], #0x40]\n" + "fmla v17.4s, v9.4s, v0.s[1]\n" + "fmla v21.4s, v9.4s, v1.s[1]\n" + "fmla v25.4s, v9.4s, v2.s[1]\n" + "ldr q9, [%[b_ptr0], #0x50]\n" + "fmla v18.4s, v10.4s, v0.s[1]\n" + "fmla v22.4s, v10.4s, v1.s[1]\n" + "fmla v26.4s, v10.4s, v2.s[1]\n" + "ldr q10, [%[b_ptr0], #0x60]\n" + "fmla v19.4s, v11.4s, v0.s[1]\n" + "fmla v23.4s, v11.4s, v1.s[1]\n" + "fmla v27.4s, v11.4s, v2.s[1]\n" + "ldr q11, [%[b_ptr0], #0x70]\n" + "fmla v16.4s, v8.4s, 
v0.s[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x100\n" + "fmla v20.4s, v8.4s, v1.s[2]\n" + "fmla v24.4s, v8.4s, v2.s[2]\n" + "ldr q8, [%[b_ptr0], #-0x80]\n" + "fmla v17.4s, v9.4s, v0.s[2]\n" + "fmla v21.4s, v9.4s, v1.s[2]\n" + "fmla v25.4s, v9.4s, v2.s[2]\n" + "ldr q9, [%[b_ptr0], #-0x70]\n" + "fmla v18.4s, v10.4s, v0.s[2]\n" + "fmla v22.4s, v10.4s, v1.s[2]\n" + "fmla v26.4s, v10.4s, v2.s[2]\n" + "ldr q10, [%[b_ptr0], #-0x60]\n" + "fmla v19.4s, v11.4s, v0.s[2]\n" + "fmla v23.4s, v11.4s, v1.s[2]\n" + "fmla v27.4s, v11.4s, v2.s[2]\n" + "ldr q11, [%[b_ptr0], #-0x50]\n" + "fmla v16.4s, v8.4s, v0.s[3]\n" + "fmla v20.4s, v8.4s, v1.s[3]\n" + "fmla v24.4s, v8.4s, v2.s[3]\n" + "ldr q8, [%[b_ptr0], #-0x40]\n" + "fmla v17.4s, v9.4s, v0.s[3]\n" + "fmla v21.4s, v9.4s, v1.s[3]\n" + "fmla v25.4s, v9.4s, v2.s[3]\n" + "ldr q9, [%[b_ptr0], #-0x30]\n" + "fmla v18.4s, v10.4s, v0.s[3]\n" + "fmla v22.4s, v10.4s, v1.s[3]\n" + "fmla v26.4s, v10.4s, v2.s[3]\n" + "ldr q10, [%[b_ptr0], #-0x20]\n" + "fmla v19.4s, v11.4s, v0.s[3]\n" + "fmla v23.4s, v11.4s, v1.s[3]\n" + "fmla v27.4s, v11.4s, v2.s[3]\n" + "ldr q11, [%[b_ptr0], #-0x10]\n" + "fmla v16.4s, v8.4s, v4.s[0]\n" + "fmla v20.4s, v8.4s, v5.s[0]\n" + "fmla v24.4s, v8.4s, v6.s[0]\n" + "ldr q8, [%[b_ptr0]]\n" + "fmla v17.4s, v9.4s, v4.s[0]\n" + "fmla v21.4s, v9.4s, v5.s[0]\n" + "fmla v25.4s, v9.4s, v6.s[0]\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + "fmla v18.4s, v10.4s, v4.s[0]\n" + "fmla v22.4s, v10.4s, v5.s[0]\n" + "fmla v26.4s, v10.4s, v6.s[0]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + "fmla v19.4s, v11.4s, v4.s[0]\n" + "fmla v23.4s, v11.4s, v5.s[0]\n" + "fmla v27.4s, v11.4s, v6.s[0]\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + "fmla v16.4s, v8.4s, v4.s[1]\n" + "fmla v20.4s, v8.4s, v5.s[1]\n" + "fmla v24.4s, v8.4s, v6.s[1]\n" + "ldr q8, [%[b_ptr0], #0x40]\n" + "fmla v17.4s, v9.4s, v4.s[1]\n" + "fmla v21.4s, v9.4s, v5.s[1]\n" + "fmla v25.4s, v9.4s, v6.s[1]\n" + "ldr q9, [%[b_ptr0], #0x50]\n" + "fmla v18.4s, v10.4s, v4.s[1]\n" + "fmla v22.4s, v10.4s, v5.s[1]\n" + "fmla v26.4s, v10.4s, v6.s[1]\n" + "ldr q10, [%[b_ptr0], #0x60]\n" + "fmla v19.4s, v11.4s, v4.s[1]\n" + "fmla v23.4s, v11.4s, v5.s[1]\n" + "fmla v27.4s, v11.4s, v6.s[1]\n" + "ldr q11, [%[b_ptr0], #0x70]\n" + "fmla v16.4s, v8.4s, v4.s[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x100\n" + "fmla v20.4s, v8.4s, v5.s[2]\n" + "fmla v24.4s, v8.4s, v6.s[2]\n" + "ldr q8, [%[b_ptr0], #-0x80]\n" + "fmla v17.4s, v9.4s, v4.s[2]\n" + "fmla v21.4s, v9.4s, v5.s[2]\n" + "fmla v25.4s, v9.4s, v6.s[2]\n" + "ldr q9, [%[b_ptr0], #-0x70]\n" + "fmla v18.4s, v10.4s, v4.s[2]\n" + "fmla v22.4s, v10.4s, v5.s[2]\n" + "fmla v26.4s, v10.4s, v6.s[2]\n" + "ldr q10, [%[b_ptr0], #-0x60]\n" + "fmla v19.4s, v11.4s, v4.s[2]\n" + "fmla v23.4s, v11.4s, v5.s[2]\n" + "fmla v27.4s, v11.4s, v6.s[2]\n" + "ldr q11, [%[b_ptr0], #-0x50]\n" + "fmla v16.4s, v8.4s, v4.s[3]\n" + "add %[b_ptr0], %[b_ptr0], #-0x40\n" + "fmla v20.4s, v8.4s, v5.s[3]\n" + "fmla v24.4s, v8.4s, v6.s[3]\n" + "fmla v17.4s, v9.4s, v4.s[3]\n" + "fmla v21.4s, v9.4s, v5.s[3]\n" + "fmla v25.4s, v9.4s, v6.s[3]\n" + "fmla v18.4s, v10.4s, v4.s[3]\n" + "fmla v22.4s, v10.4s, v5.s[3]\n" + "fmla v26.4s, v10.4s, v6.s[3]\n" + "fmla v19.4s, v11.4s, v4.s[3]\n" + "fmla v23.4s, v11.4s, v5.s[3]\n" + "fmla v27.4s, v11.4s, v6.s[3]\n" + "b 5f\n" + "4:\n" + "fmla v16.4s, v8.4s, v0.s[0]\n" + "fmla v20.4s, v8.4s, v1.s[0]\n" + "fmla v24.4s, v8.4s, v2.s[0]\n" + "ldr q8, [%[b_ptr0]]\n" + "fmla v17.4s, v9.4s, v0.s[0]\n" + "fmla v21.4s, v9.4s, v1.s[0]\n" + "fmla v25.4s, v9.4s, v2.s[0]\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + "fmla v18.4s, 
v10.4s, v0.s[0]\n" + "fmla v22.4s, v10.4s, v1.s[0]\n" + "fmla v26.4s, v10.4s, v2.s[0]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + "fmla v19.4s, v11.4s, v0.s[0]\n" + "fmla v23.4s, v11.4s, v1.s[0]\n" + "fmla v27.4s, v11.4s, v2.s[0]\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + "fmla v16.4s, v8.4s, v0.s[1]\n" + "fmla v20.4s, v8.4s, v1.s[1]\n" + "fmla v24.4s, v8.4s, v2.s[1]\n" + "ldr q8, [%[b_ptr0], #0x40]\n" + "fmla v17.4s, v9.4s, v0.s[1]\n" + "fmla v21.4s, v9.4s, v1.s[1]\n" + "fmla v25.4s, v9.4s, v2.s[1]\n" + "ldr q9, [%[b_ptr0], #0x50]\n" + "fmla v18.4s, v10.4s, v0.s[1]\n" + "fmla v22.4s, v10.4s, v1.s[1]\n" + "fmla v26.4s, v10.4s, v2.s[1]\n" + "ldr q10, [%[b_ptr0], #0x60]\n" + "fmla v19.4s, v11.4s, v0.s[1]\n" + "fmla v23.4s, v11.4s, v1.s[1]\n" + "fmla v27.4s, v11.4s, v2.s[1]\n" + "ldr q11, [%[b_ptr0], #0x70]\n" + "fmla v16.4s, v8.4s, v0.s[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x100\n" + "fmla v20.4s, v8.4s, v1.s[2]\n" + "fmla v24.4s, v8.4s, v2.s[2]\n" + "ldr q8, [%[b_ptr0], #-0x80]\n" + "fmla v17.4s, v9.4s, v0.s[2]\n" + "fmla v21.4s, v9.4s, v1.s[2]\n" + "fmla v25.4s, v9.4s, v2.s[2]\n" + "ldr q9, [%[b_ptr0], #-0x70]\n" + "fmla v18.4s, v10.4s, v0.s[2]\n" + "fmla v22.4s, v10.4s, v1.s[2]\n" + "fmla v26.4s, v10.4s, v2.s[2]\n" + "ldr q10, [%[b_ptr0], #-0x60]\n" + "fmla v19.4s, v11.4s, v0.s[2]\n" + "fmla v23.4s, v11.4s, v1.s[2]\n" + "fmla v27.4s, v11.4s, v2.s[2]\n" + "ldr q11, [%[b_ptr0], #-0x50]\n" + "fmla v16.4s, v8.4s, v0.s[3]\n" + "add %[b_ptr0], %[b_ptr0], #-0x40\n" + "fmla v20.4s, v8.4s, v1.s[3]\n" + "fmla v24.4s, v8.4s, v2.s[3]\n" + "fmla v17.4s, v9.4s, v0.s[3]\n" + "fmla v21.4s, v9.4s, v1.s[3]\n" + "fmla v25.4s, v9.4s, v2.s[3]\n" + "fmla v18.4s, v10.4s, v0.s[3]\n" + "fmla v22.4s, v10.4s, v1.s[3]\n" + "fmla v26.4s, v10.4s, v2.s[3]\n" + "fmla v19.4s, v11.4s, v0.s[3]\n" + "fmla v23.4s, v11.4s, v1.s[3]\n" + "fmla v27.4s, v11.4s, v2.s[3]\n" + "5:\n" + "cbz %[blocks], 6f\n" + "7:\n" + "ldr q8, [%[b_ptr0]]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + "ldr s0, [%[a_ptr0]]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + "add %[a_ptr0], %[a_ptr0], #0x4\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + "add %[b_ptr0], %[b_ptr0], #0x40\n" + "fmla v16.4s, v8.4s, v0.s[0]\n" + "ldr s1, [a_ptr1]\n" + "fmla v17.4s, v9.4s, v0.s[0]\n" + "add a_ptr1, a_ptr1, #0x4\n" + "fmla v18.4s, v10.4s, v0.s[0]\n" + "ldr s2, [a_ptr2]\n" + "fmla v20.4s, v8.4s, v1.s[0]\n" + "add a_ptr2, a_ptr2, #0x4\n" + "fmla v21.4s, v9.4s, v1.s[0]\n" + "fmla v24.4s, v8.4s, v2.s[0]\n" + "fmla v25.4s, v9.4s, v2.s[0]\n" + "fmla v22.4s, v10.4s, v1.s[0]\n" + "fmla v26.4s, v10.4s, v2.s[0]\n" + "fmla v19.4s, v11.4s, v0.s[0]\n" + "fmla v23.4s, v11.4s, v1.s[0]\n" + "fmla v27.4s, v11.4s, v2.s[0]\n" + "b.ne 7b\n" + "6:\n" + "ld1r {v14.4s}, [%[minptr]]\n" + "ld1r {v15.4s}, [%[maxptr]]\n" + "fmax v16.4s, v16.4s, v14.4s\n" + "fmax v17.4s, v17.4s, v14.4s\n" + "fmax v18.4s, v18.4s, v14.4s\n" + "fmax v19.4s, v19.4s, v14.4s\n" + "fmin v16.4s, v16.4s, v15.4s\n" + "fmin v17.4s, v17.4s, v15.4s\n" + "fmin v18.4s, v18.4s, v15.4s\n" + "fmin v19.4s, v19.4s, v15.4s\n" + "str q16, [%[c_ptr0]]\n" + "fmax v20.4s, v20.4s, v14.4s\n" + "fmax v21.4s, v21.4s, v14.4s\n" + "fmax v22.4s, v22.4s, v14.4s\n" + "str q17, [%[c_ptr0], #0x10]\n" + "fmax v23.4s, v23.4s, v14.4s\n" + "fmin v20.4s, v20.4s, v15.4s\n" + "fmin v21.4s, v21.4s, v15.4s\n" + "str q18, [%[c_ptr0], #0x20]\n" + "fmin v22.4s, v22.4s, v15.4s\n" + "fmin v23.4s, v23.4s, v15.4s\n" + "fmax v24.4s, v24.4s, v14.4s\n" + "str q19, [%[c_ptr0], #0x30]\n" + "fmax v25.4s, v25.4s, v14.4s\n" + "add %[c_ptr0], %[c_ptr0], #0x40\n" + 
"fmax v26.4s, v26.4s, v14.4s\n" + "str q20, [c_ptr1]\n" + "fmin v24.4s, v24.4s, v15.4s\n" + "fmin v25.4s, v25.4s, v15.4s\n" + "fmax v27.4s, v27.4s, v14.4s\n" + "str q21, [c_ptr1, #0x10]\n" + "fmin v26.4s, v26.4s, v15.4s\n" + "fmin v27.4s, v27.4s, v15.4s\n" + "str q22, [c_ptr1, #0x20]\n" + "str q23, [c_ptr1, #0x30]\n" + "str q24, [c_ptr2]\n" + "str q25, [c_ptr2, #0x10]\n" + "str q26, [c_ptr2, #0x20]\n" + "str q27, [c_ptr2, #0x30]\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks) + : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory" + ); + break; + default: + case 4: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "cbnz %[append], 1f\n" + "ldr q16, [%[biasptr]]\n" + "ldr q17, [%[biasptr], #0x10]\n" + "ldr q18, [%[biasptr], #0x20]\n" + "ldr q19, [%[biasptr], #0x30]\n" + "mov v20.16b, v16.16b\n" + "ldr q0, [%[a_ptr0]]\n" + "mov v21.16b, v17.16b\n" + "ldr q1, [a_ptr1]\n" + "mov v22.16b, v18.16b\n" + "ldr q2, [a_ptr2]\n" + "mov v23.16b, v19.16b\n" + "ldr q3, [a_ptr3]\n" + "mov v24.16b, v16.16b\n" + "ldr q8, [%[b_ptr0]]\n" + "mov v25.16b, v17.16b\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + "mov v26.16b, v18.16b\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + "mov v27.16b, v19.16b\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "mov v28.16b, v16.16b\n" + "add a_ptr1, a_ptr1, #0x10\n" + "mov v29.16b, v17.16b\n" + "add a_ptr2, a_ptr2, #0x10\n" + "mov v30.16b, v18.16b\n" + "add a_ptr3, a_ptr3, #0x10\n" + "mov v31.16b, v19.16b\n" + "add %[b_ptr0], %[b_ptr0], #0x40\n" + "cbz %[loops], 2f\n" + "b 3f\n" + "1:\n" + "ldr q16, [%[c_ptr0]]\n" + "ldr q17, [%[c_ptr0], #0x10]\n" + "ldr q18, [%[c_ptr0], #0x20]\n" + "ldr q19, [%[c_ptr0], #0x30]\n" + "ldr q20, [c_ptr1]\n" + "ldr q21, [c_ptr1, #0x10]\n" + "ldr q22, [c_ptr1, #0x20]\n" + "ldr q23, [c_ptr1, #0x30]\n" + "ldr q24, [c_ptr2]\n" + "ldr q25, [c_ptr2, #0x10]\n" + "ldr q26, [c_ptr2, #0x20]\n" + "ldr q27, [c_ptr2, #0x30]\n" + "ldr q28, [c_ptr3]\n" + "ldr q29, [c_ptr3, #0x10]\n" + "ldr q30, [c_ptr3, #0x20]\n" + "ldr q31, [c_ptr3, #0x30]\n" + "ldr q0, [%[a_ptr0]]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "ldr q1, [a_ptr1]\n" + "add a_ptr1, a_ptr1, #0x10\n" + "ldr q2, [a_ptr2]\n" + "add a_ptr2, a_ptr2, #0x10\n" + "ldr q3, [a_ptr3]\n" + "add a_ptr3, a_ptr3, #0x10\n" + "ldr q8, [%[b_ptr0]]\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + "add %[b_ptr0], %[b_ptr0], #0x40\n" + "cbz %[loops], 2f\n" + "3:\n" + "fmla v16.4s, v8.4s, v0.s[0]\n" + "ldr q11, [%[b_ptr0], #-0x10]\n" + "fmla v20.4s, v8.4s, v1.s[0]\n" + "ldr q4, [%[a_ptr0]]\n" + "fmla v24.4s, v8.4s, v2.s[0]\n" + "ldr q5, [a_ptr1]\n" + "fmla v28.4s, v8.4s, v3.s[0]\n" + "ldr q6, [a_ptr2]\n" + "fmla v17.4s, v9.4s, v0.s[0]\n" + "ldr q7, [a_ptr3]\n" + "fmla v21.4s, 
v9.4s, v1.s[0]\n" + "ldr q8, [%[b_ptr0]]\n" + "fmla v25.4s, v9.4s, v2.s[0]\n" + "subs %[loops], %[loops], #0x1\n" + "fmla v29.4s, v9.4s, v3.s[0]\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + "fmla v18.4s, v10.4s, v0.s[0]\n" + "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n" + "fmla v22.4s, v10.4s, v1.s[0]\n" + "add %[a_ptr0], %[a_ptr0], #0x20\n" + "fmla v26.4s, v10.4s, v2.s[0]\n" + "add a_ptr1, a_ptr1, #0x20\n" + "fmla v30.4s, v10.4s, v3.s[0]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + "fmla v19.4s, v11.4s, v0.s[0]\n" + "add a_ptr2, a_ptr2, #0x20\n" + "fmla v23.4s, v11.4s, v1.s[0]\n" + "add a_ptr3, a_ptr3, #0x20\n" + "fmla v27.4s, v11.4s, v2.s[0]\n" + "prfm PLDL1KEEP, [a_ptr1, #0x40]\n" + "fmla v31.4s, v11.4s, v3.s[0]\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + "fmla v16.4s, v8.4s, v0.s[1]\n" + "prfm PLDL1KEEP, [a_ptr2, #0x40]\n" + "fmla v20.4s, v8.4s, v1.s[1]\n" + "prfm PLDL1KEEP, [a_ptr3, #0x40]\n" + "fmla v24.4s, v8.4s, v2.s[1]\n" + "fmla v28.4s, v8.4s, v3.s[1]\n" + "ldr q8, [%[b_ptr0], #0x40]\n" + "fmla v17.4s, v9.4s, v0.s[1]\n" + "fmla v21.4s, v9.4s, v1.s[1]\n" + "fmla v25.4s, v9.4s, v2.s[1]\n" + "fmla v29.4s, v9.4s, v3.s[1]\n" + "ldr q9, [%[b_ptr0], #0x50]\n" + "fmla v18.4s, v10.4s, v0.s[1]\n" + "fmla v22.4s, v10.4s, v1.s[1]\n" + "fmla v26.4s, v10.4s, v2.s[1]\n" + "fmla v30.4s, v10.4s, v3.s[1]\n" + "ldr q10, [%[b_ptr0], #0x60]\n" + "fmla v19.4s, v11.4s, v0.s[1]\n" + "fmla v23.4s, v11.4s, v1.s[1]\n" + "fmla v27.4s, v11.4s, v2.s[1]\n" + "fmla v31.4s, v11.4s, v3.s[1]\n" + "ldr q11, [%[b_ptr0], #0x70]\n" + "fmla v16.4s, v8.4s, v0.s[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x100\n" + "fmla v20.4s, v8.4s, v1.s[2]\n" + "fmla v24.4s, v8.4s, v2.s[2]\n" + "fmla v28.4s, v8.4s, v3.s[2]\n" + "ldr q8, [%[b_ptr0], #-0x80]\n" + "fmla v17.4s, v9.4s, v0.s[2]\n" + "fmla v21.4s, v9.4s, v1.s[2]\n" + "fmla v25.4s, v9.4s, v2.s[2]\n" + "fmla v29.4s, v9.4s, v3.s[2]\n" + "ldr q9, [%[b_ptr0], #-0x70]\n" + "fmla v18.4s, v10.4s, v0.s[2]\n" + "fmla v22.4s, v10.4s, v1.s[2]\n" + "fmla v26.4s, v10.4s, v2.s[2]\n" + "fmla v30.4s, v10.4s, v3.s[2]\n" + "ldr q10, [%[b_ptr0], #-0x60]\n" + "fmla v19.4s, v11.4s, v0.s[2]\n" + "fmla v23.4s, v11.4s, v1.s[2]\n" + "fmla v27.4s, v11.4s, v2.s[2]\n" + "fmla v31.4s, v11.4s, v3.s[2]\n" + "ldr q11, [%[b_ptr0], #-0x50]\n" + "fmla v16.4s, v8.4s, v0.s[3]\n" + "fmla v20.4s, v8.4s, v1.s[3]\n" + "fmla v24.4s, v8.4s, v2.s[3]\n" + "fmla v28.4s, v8.4s, v3.s[3]\n" + "ldr q8, [%[b_ptr0], #-0x40]\n" + "fmla v17.4s, v9.4s, v0.s[3]\n" + "fmla v21.4s, v9.4s, v1.s[3]\n" + "fmla v25.4s, v9.4s, v2.s[3]\n" + "fmla v29.4s, v9.4s, v3.s[3]\n" + "ldr q9, [%[b_ptr0], #-0x30]\n" + "fmla v18.4s, v10.4s, v0.s[3]\n" + "fmla v22.4s, v10.4s, v1.s[3]\n" + "fmla v26.4s, v10.4s, v2.s[3]\n" + "fmla v30.4s, v10.4s, v3.s[3]\n" + "ldr q10, [%[b_ptr0], #-0x20]\n" + "fmla v19.4s, v11.4s, v0.s[3]\n" + "ldr q0, [%[a_ptr0], #-0x10]\n" + "fmla v23.4s, v11.4s, v1.s[3]\n" + "ldr q1, [a_ptr1, #-0x10]\n" + "fmla v27.4s, v11.4s, v2.s[3]\n" + "ldr q2, [a_ptr2, #-0x10]\n" + "fmla v31.4s, v11.4s, v3.s[3]\n" + "ldr q11, [%[b_ptr0], #-0x10]\n" + "fmla v16.4s, v8.4s, v4.s[0]\n" + "ldr q3, [a_ptr3, #-0x10]\n" + "fmla v20.4s, v8.4s, v5.s[0]\n" + "fmla v24.4s, v8.4s, v6.s[0]\n" + "fmla v28.4s, v8.4s, v7.s[0]\n" + "ldr q8, [%[b_ptr0]]\n" + "fmla v17.4s, v9.4s, v4.s[0]\n" + "fmla v21.4s, v9.4s, v5.s[0]\n" + "fmla v25.4s, v9.4s, v6.s[0]\n" + "fmla v29.4s, v9.4s, v7.s[0]\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + "fmla v18.4s, v10.4s, v4.s[0]\n" + "fmla v22.4s, v10.4s, v5.s[0]\n" + "fmla v26.4s, v10.4s, v6.s[0]\n" + "fmla v30.4s, v10.4s, v7.s[0]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" 
+ "fmla v19.4s, v11.4s, v4.s[0]\n" + "fmla v23.4s, v11.4s, v5.s[0]\n" + "fmla v27.4s, v11.4s, v6.s[0]\n" + "fmla v31.4s, v11.4s, v7.s[0]\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + "fmla v16.4s, v8.4s, v4.s[1]\n" + "fmla v20.4s, v8.4s, v5.s[1]\n" + "fmla v24.4s, v8.4s, v6.s[1]\n" + "fmla v28.4s, v8.4s, v7.s[1]\n" + "ldr q8, [%[b_ptr0], #0x40]\n" + "fmla v17.4s, v9.4s, v4.s[1]\n" + "fmla v21.4s, v9.4s, v5.s[1]\n" + "fmla v25.4s, v9.4s, v6.s[1]\n" + "fmla v29.4s, v9.4s, v7.s[1]\n" + "ldr q9, [%[b_ptr0], #0x50]\n" + "fmla v18.4s, v10.4s, v4.s[1]\n" + "fmla v22.4s, v10.4s, v5.s[1]\n" + "fmla v26.4s, v10.4s, v6.s[1]\n" + "fmla v30.4s, v10.4s, v7.s[1]\n" + "ldr q10, [%[b_ptr0], #0x60]\n" + "fmla v19.4s, v11.4s, v4.s[1]\n" + "fmla v23.4s, v11.4s, v5.s[1]\n" + "fmla v27.4s, v11.4s, v6.s[1]\n" + "fmla v31.4s, v11.4s, v7.s[1]\n" + "ldr q11, [%[b_ptr0], #0x70]\n" + "fmla v16.4s, v8.4s, v4.s[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x100\n" + "fmla v20.4s, v8.4s, v5.s[2]\n" + "fmla v24.4s, v8.4s, v6.s[2]\n" + "fmla v28.4s, v8.4s, v7.s[2]\n" + "ldr q8, [%[b_ptr0], #-0x80]\n" + "fmla v17.4s, v9.4s, v4.s[2]\n" + "fmla v21.4s, v9.4s, v5.s[2]\n" + "fmla v25.4s, v9.4s, v6.s[2]\n" + "fmla v29.4s, v9.4s, v7.s[2]\n" + "ldr q9, [%[b_ptr0], #-0x70]\n" + "fmla v18.4s, v10.4s, v4.s[2]\n" + "fmla v22.4s, v10.4s, v5.s[2]\n" + "fmla v26.4s, v10.4s, v6.s[2]\n" + "fmla v30.4s, v10.4s, v7.s[2]\n" + "ldr q10, [%[b_ptr0], #-0x60]\n" + "fmla v19.4s, v11.4s, v4.s[2]\n" + "fmla v23.4s, v11.4s, v5.s[2]\n" + "fmla v27.4s, v11.4s, v6.s[2]\n" + "fmla v31.4s, v11.4s, v7.s[2]\n" + "ldr q11, [%[b_ptr0], #-0x50]\n" + "fmla v16.4s, v8.4s, v4.s[3]\n" + "fmla v20.4s, v8.4s, v5.s[3]\n" + "fmla v24.4s, v8.4s, v6.s[3]\n" + "fmla v28.4s, v8.4s, v7.s[3]\n" + "ldr q8, [%[b_ptr0], #-0x40]\n" + "fmla v17.4s, v9.4s, v4.s[3]\n" + "fmla v21.4s, v9.4s, v5.s[3]\n" + "fmla v25.4s, v9.4s, v6.s[3]\n" + "fmla v29.4s, v9.4s, v7.s[3]\n" + "ldr q9, [%[b_ptr0], #-0x30]\n" + "fmla v18.4s, v10.4s, v4.s[3]\n" + "fmla v22.4s, v10.4s, v5.s[3]\n" + "fmla v26.4s, v10.4s, v6.s[3]\n" + "fmla v30.4s, v10.4s, v7.s[3]\n" + "ldr q10, [%[b_ptr0], #-0x20]\n" + "fmla v19.4s, v11.4s, v4.s[3]\n" + "fmla v23.4s, v11.4s, v5.s[3]\n" + "fmla v27.4s, v11.4s, v6.s[3]\n" + "fmla v31.4s, v11.4s, v7.s[3]\n" + "b.ne 3b\n" + "2:\n" + "ldr q11, [%[b_ptr0], #-0x10]\n" + "prfm PSTL1KEEP, [%[c_ptr0]]\n" + "prfm PSTL1KEEP, [c_ptr1]\n" + "prfm PSTL1KEEP, [c_ptr2]\n" + "prfm PSTL1KEEP, [c_ptr3]\n" + "cbz %[regs], 4f\n" + "fmla v16.4s, v8.4s, v0.s[0]\n" + "ldr q4, [%[a_ptr0]]\n" + "fmla v20.4s, v8.4s, v1.s[0]\n" + "ldr q5, [a_ptr1]\n" + "fmla v24.4s, v8.4s, v2.s[0]\n" + "ldr q6, [a_ptr2]\n" + "fmla v28.4s, v8.4s, v3.s[0]\n" + "ldr q7, [a_ptr3]\n" + "fmla v17.4s, v9.4s, v0.s[0]\n" + "ldr q8, [%[b_ptr0]]\n" + "fmla v21.4s, v9.4s, v1.s[0]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "fmla v25.4s, v9.4s, v2.s[0]\n" + "add a_ptr1, a_ptr1, #0x10\n" + "fmla v29.4s, v9.4s, v3.s[0]\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + "fmla v18.4s, v10.4s, v0.s[0]\n" + "add a_ptr2, a_ptr2, #0x10\n" + "fmla v22.4s, v10.4s, v1.s[0]\n" + "add a_ptr3, a_ptr3, #0x10\n" + "fmla v26.4s, v10.4s, v2.s[0]\n" + "fmla v30.4s, v10.4s, v3.s[0]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + "fmla v19.4s, v11.4s, v0.s[0]\n" + "fmla v23.4s, v11.4s, v1.s[0]\n" + "fmla v27.4s, v11.4s, v2.s[0]\n" + "fmla v31.4s, v11.4s, v3.s[0]\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + "fmla v16.4s, v8.4s, v0.s[1]\n" + "fmla v20.4s, v8.4s, v1.s[1]\n" + "fmla v24.4s, v8.4s, v2.s[1]\n" + "fmla v28.4s, v8.4s, v3.s[1]\n" + "ldr q8, [%[b_ptr0], #0x40]\n" + "fmla v17.4s, v9.4s, 
v0.s[1]\n" + "fmla v21.4s, v9.4s, v1.s[1]\n" + "fmla v25.4s, v9.4s, v2.s[1]\n" + "fmla v29.4s, v9.4s, v3.s[1]\n" + "ldr q9, [%[b_ptr0], #0x50]\n" + "fmla v18.4s, v10.4s, v0.s[1]\n" + "fmla v22.4s, v10.4s, v1.s[1]\n" + "fmla v26.4s, v10.4s, v2.s[1]\n" + "fmla v30.4s, v10.4s, v3.s[1]\n" + "ldr q10, [%[b_ptr0], #0x60]\n" + "fmla v19.4s, v11.4s, v0.s[1]\n" + "fmla v23.4s, v11.4s, v1.s[1]\n" + "fmla v27.4s, v11.4s, v2.s[1]\n" + "fmla v31.4s, v11.4s, v3.s[1]\n" + "ldr q11, [%[b_ptr0], #0x70]\n" + "fmla v16.4s, v8.4s, v0.s[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x100\n" + "fmla v20.4s, v8.4s, v1.s[2]\n" + "fmla v24.4s, v8.4s, v2.s[2]\n" + "fmla v28.4s, v8.4s, v3.s[2]\n" + "ldr q8, [%[b_ptr0], #-0x80]\n" + "fmla v17.4s, v9.4s, v0.s[2]\n" + "fmla v21.4s, v9.4s, v1.s[2]\n" + "fmla v25.4s, v9.4s, v2.s[2]\n" + "fmla v29.4s, v9.4s, v3.s[2]\n" + "ldr q9, [%[b_ptr0], #-0x70]\n" + "fmla v18.4s, v10.4s, v0.s[2]\n" + "fmla v22.4s, v10.4s, v1.s[2]\n" + "fmla v26.4s, v10.4s, v2.s[2]\n" + "fmla v30.4s, v10.4s, v3.s[2]\n" + "ldr q10, [%[b_ptr0], #-0x60]\n" + "fmla v19.4s, v11.4s, v0.s[2]\n" + "fmla v23.4s, v11.4s, v1.s[2]\n" + "fmla v27.4s, v11.4s, v2.s[2]\n" + "fmla v31.4s, v11.4s, v3.s[2]\n" + "ldr q11, [%[b_ptr0], #-0x50]\n" + "fmla v16.4s, v8.4s, v0.s[3]\n" + "fmla v20.4s, v8.4s, v1.s[3]\n" + "fmla v24.4s, v8.4s, v2.s[3]\n" + "fmla v28.4s, v8.4s, v3.s[3]\n" + "ldr q8, [%[b_ptr0], #-0x40]\n" + "fmla v17.4s, v9.4s, v0.s[3]\n" + "fmla v21.4s, v9.4s, v1.s[3]\n" + "fmla v25.4s, v9.4s, v2.s[3]\n" + "fmla v29.4s, v9.4s, v3.s[3]\n" + "ldr q9, [%[b_ptr0], #-0x30]\n" + "fmla v18.4s, v10.4s, v0.s[3]\n" + "fmla v22.4s, v10.4s, v1.s[3]\n" + "fmla v26.4s, v10.4s, v2.s[3]\n" + "fmla v30.4s, v10.4s, v3.s[3]\n" + "ldr q10, [%[b_ptr0], #-0x20]\n" + "fmla v19.4s, v11.4s, v0.s[3]\n" + "fmla v23.4s, v11.4s, v1.s[3]\n" + "fmla v27.4s, v11.4s, v2.s[3]\n" + "fmla v31.4s, v11.4s, v3.s[3]\n" + "ldr q11, [%[b_ptr0], #-0x10]\n" + "fmla v16.4s, v8.4s, v4.s[0]\n" + "fmla v20.4s, v8.4s, v5.s[0]\n" + "fmla v24.4s, v8.4s, v6.s[0]\n" + "fmla v28.4s, v8.4s, v7.s[0]\n" + "ldr q8, [%[b_ptr0]]\n" + "fmla v17.4s, v9.4s, v4.s[0]\n" + "fmla v21.4s, v9.4s, v5.s[0]\n" + "fmla v25.4s, v9.4s, v6.s[0]\n" + "fmla v29.4s, v9.4s, v7.s[0]\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + "fmla v18.4s, v10.4s, v4.s[0]\n" + "fmla v22.4s, v10.4s, v5.s[0]\n" + "fmla v26.4s, v10.4s, v6.s[0]\n" + "fmla v30.4s, v10.4s, v7.s[0]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + "fmla v19.4s, v11.4s, v4.s[0]\n" + "fmla v23.4s, v11.4s, v5.s[0]\n" + "fmla v27.4s, v11.4s, v6.s[0]\n" + "fmla v31.4s, v11.4s, v7.s[0]\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + "fmla v16.4s, v8.4s, v4.s[1]\n" + "fmla v20.4s, v8.4s, v5.s[1]\n" + "fmla v24.4s, v8.4s, v6.s[1]\n" + "fmla v28.4s, v8.4s, v7.s[1]\n" + "ldr q8, [%[b_ptr0], #0x40]\n" + "fmla v17.4s, v9.4s, v4.s[1]\n" + "fmla v21.4s, v9.4s, v5.s[1]\n" + "fmla v25.4s, v9.4s, v6.s[1]\n" + "fmla v29.4s, v9.4s, v7.s[1]\n" + "ldr q9, [%[b_ptr0], #0x50]\n" + "fmla v18.4s, v10.4s, v4.s[1]\n" + "fmla v22.4s, v10.4s, v5.s[1]\n" + "fmla v26.4s, v10.4s, v6.s[1]\n" + "fmla v30.4s, v10.4s, v7.s[1]\n" + "ldr q10, [%[b_ptr0], #0x60]\n" + "fmla v19.4s, v11.4s, v4.s[1]\n" + "fmla v23.4s, v11.4s, v5.s[1]\n" + "fmla v27.4s, v11.4s, v6.s[1]\n" + "fmla v31.4s, v11.4s, v7.s[1]\n" + "ldr q11, [%[b_ptr0], #0x70]\n" + "fmla v16.4s, v8.4s, v4.s[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x100\n" + "fmla v20.4s, v8.4s, v5.s[2]\n" + "fmla v24.4s, v8.4s, v6.s[2]\n" + "fmla v28.4s, v8.4s, v7.s[2]\n" + "ldr q8, [%[b_ptr0], #-0x80]\n" + "fmla v17.4s, v9.4s, v4.s[2]\n" + "fmla v21.4s, v9.4s, v5.s[2]\n" 
+ "fmla v25.4s, v9.4s, v6.s[2]\n" + "fmla v29.4s, v9.4s, v7.s[2]\n" + "ldr q9, [%[b_ptr0], #-0x70]\n" + "fmla v18.4s, v10.4s, v4.s[2]\n" + "fmla v22.4s, v10.4s, v5.s[2]\n" + "fmla v26.4s, v10.4s, v6.s[2]\n" + "fmla v30.4s, v10.4s, v7.s[2]\n" + "ldr q10, [%[b_ptr0], #-0x60]\n" + "fmla v19.4s, v11.4s, v4.s[2]\n" + "fmla v23.4s, v11.4s, v5.s[2]\n" + "fmla v27.4s, v11.4s, v6.s[2]\n" + "fmla v31.4s, v11.4s, v7.s[2]\n" + "ldr q11, [%[b_ptr0], #-0x50]\n" + "fmla v16.4s, v8.4s, v4.s[3]\n" + "add %[b_ptr0], %[b_ptr0], #-0x40\n" + "fmla v20.4s, v8.4s, v5.s[3]\n" + "fmla v24.4s, v8.4s, v6.s[3]\n" + "fmla v28.4s, v8.4s, v7.s[3]\n" + "fmla v17.4s, v9.4s, v4.s[3]\n" + "fmla v21.4s, v9.4s, v5.s[3]\n" + "fmla v25.4s, v9.4s, v6.s[3]\n" + "fmla v29.4s, v9.4s, v7.s[3]\n" + "fmla v18.4s, v10.4s, v4.s[3]\n" + "fmla v22.4s, v10.4s, v5.s[3]\n" + "fmla v26.4s, v10.4s, v6.s[3]\n" + "fmla v30.4s, v10.4s, v7.s[3]\n" + "fmla v19.4s, v11.4s, v4.s[3]\n" + "fmla v23.4s, v11.4s, v5.s[3]\n" + "fmla v27.4s, v11.4s, v6.s[3]\n" + "fmla v31.4s, v11.4s, v7.s[3]\n" + "b 5f\n" + "4:\n" + "fmla v16.4s, v8.4s, v0.s[0]\n" + "fmla v20.4s, v8.4s, v1.s[0]\n" + "fmla v24.4s, v8.4s, v2.s[0]\n" + "fmla v28.4s, v8.4s, v3.s[0]\n" + "ldr q8, [%[b_ptr0]]\n" + "fmla v17.4s, v9.4s, v0.s[0]\n" + "fmla v21.4s, v9.4s, v1.s[0]\n" + "fmla v25.4s, v9.4s, v2.s[0]\n" + "fmla v29.4s, v9.4s, v3.s[0]\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + "fmla v18.4s, v10.4s, v0.s[0]\n" + "fmla v22.4s, v10.4s, v1.s[0]\n" + "fmla v26.4s, v10.4s, v2.s[0]\n" + "fmla v30.4s, v10.4s, v3.s[0]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + "fmla v19.4s, v11.4s, v0.s[0]\n" + "fmla v23.4s, v11.4s, v1.s[0]\n" + "fmla v27.4s, v11.4s, v2.s[0]\n" + "fmla v31.4s, v11.4s, v3.s[0]\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + "fmla v16.4s, v8.4s, v0.s[1]\n" + "fmla v20.4s, v8.4s, v1.s[1]\n" + "fmla v24.4s, v8.4s, v2.s[1]\n" + "fmla v28.4s, v8.4s, v3.s[1]\n" + "ldr q8, [%[b_ptr0], #0x40]\n" + "fmla v17.4s, v9.4s, v0.s[1]\n" + "fmla v21.4s, v9.4s, v1.s[1]\n" + "fmla v25.4s, v9.4s, v2.s[1]\n" + "fmla v29.4s, v9.4s, v3.s[1]\n" + "ldr q9, [%[b_ptr0], #0x50]\n" + "fmla v18.4s, v10.4s, v0.s[1]\n" + "fmla v22.4s, v10.4s, v1.s[1]\n" + "fmla v26.4s, v10.4s, v2.s[1]\n" + "fmla v30.4s, v10.4s, v3.s[1]\n" + "ldr q10, [%[b_ptr0], #0x60]\n" + "fmla v19.4s, v11.4s, v0.s[1]\n" + "fmla v23.4s, v11.4s, v1.s[1]\n" + "fmla v27.4s, v11.4s, v2.s[1]\n" + "fmla v31.4s, v11.4s, v3.s[1]\n" + "ldr q11, [%[b_ptr0], #0x70]\n" + "fmla v16.4s, v8.4s, v0.s[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x100\n" + "fmla v20.4s, v8.4s, v1.s[2]\n" + "fmla v24.4s, v8.4s, v2.s[2]\n" + "fmla v28.4s, v8.4s, v3.s[2]\n" + "ldr q8, [%[b_ptr0], #-0x80]\n" + "fmla v17.4s, v9.4s, v0.s[2]\n" + "fmla v21.4s, v9.4s, v1.s[2]\n" + "fmla v25.4s, v9.4s, v2.s[2]\n" + "fmla v29.4s, v9.4s, v3.s[2]\n" + "ldr q9, [%[b_ptr0], #-0x70]\n" + "fmla v18.4s, v10.4s, v0.s[2]\n" + "fmla v22.4s, v10.4s, v1.s[2]\n" + "fmla v26.4s, v10.4s, v2.s[2]\n" + "fmla v30.4s, v10.4s, v3.s[2]\n" + "ldr q10, [%[b_ptr0], #-0x60]\n" + "fmla v19.4s, v11.4s, v0.s[2]\n" + "fmla v23.4s, v11.4s, v1.s[2]\n" + "fmla v27.4s, v11.4s, v2.s[2]\n" + "fmla v31.4s, v11.4s, v3.s[2]\n" + "ldr q11, [%[b_ptr0], #-0x50]\n" + "fmla v16.4s, v8.4s, v0.s[3]\n" + "add %[b_ptr0], %[b_ptr0], #-0x40\n" + "fmla v20.4s, v8.4s, v1.s[3]\n" + "fmla v24.4s, v8.4s, v2.s[3]\n" + "fmla v28.4s, v8.4s, v3.s[3]\n" + "fmla v17.4s, v9.4s, v0.s[3]\n" + "fmla v21.4s, v9.4s, v1.s[3]\n" + "fmla v25.4s, v9.4s, v2.s[3]\n" + "fmla v29.4s, v9.4s, v3.s[3]\n" + "fmla v18.4s, v10.4s, v0.s[3]\n" + "fmla v22.4s, v10.4s, v1.s[3]\n" + "fmla v26.4s, 
v10.4s, v2.s[3]\n" + "fmla v30.4s, v10.4s, v3.s[3]\n" + "fmla v19.4s, v11.4s, v0.s[3]\n" + "fmla v23.4s, v11.4s, v1.s[3]\n" + "fmla v27.4s, v11.4s, v2.s[3]\n" + "fmla v31.4s, v11.4s, v3.s[3]\n" + "5:\n" + "cbz %[blocks], 6f\n" + "7:\n" + "ldr q8, [%[b_ptr0]]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ldr q9, [%[b_ptr0], #0x10]\n" + "ldr s0, [%[a_ptr0]]\n" + "ldr q10, [%[b_ptr0], #0x20]\n" + "add %[a_ptr0], %[a_ptr0], #0x4\n" + "ldr q11, [%[b_ptr0], #0x30]\n" + "add %[b_ptr0], %[b_ptr0], #0x40\n" + "fmla v16.4s, v8.4s, v0.s[0]\n" + "ldr s1, [a_ptr1]\n" + "fmla v17.4s, v9.4s, v0.s[0]\n" + "add a_ptr1, a_ptr1, #0x4\n" + "fmla v18.4s, v10.4s, v0.s[0]\n" + "ldr s2, [a_ptr2]\n" + "fmla v20.4s, v8.4s, v1.s[0]\n" + "add a_ptr2, a_ptr2, #0x4\n" + "fmla v21.4s, v9.4s, v1.s[0]\n" + "ldr s3, [a_ptr3]\n" + "fmla v24.4s, v8.4s, v2.s[0]\n" + "add a_ptr3, a_ptr3, #0x4\n" + "fmla v25.4s, v9.4s, v2.s[0]\n" + "fmla v28.4s, v8.4s, v3.s[0]\n" + "fmla v29.4s, v9.4s, v3.s[0]\n" + "fmla v22.4s, v10.4s, v1.s[0]\n" + "fmla v26.4s, v10.4s, v2.s[0]\n" + "fmla v30.4s, v10.4s, v3.s[0]\n" + "fmla v19.4s, v11.4s, v0.s[0]\n" + "fmla v23.4s, v11.4s, v1.s[0]\n" + "fmla v27.4s, v11.4s, v2.s[0]\n" + "fmla v31.4s, v11.4s, v3.s[0]\n" + "b.ne 7b\n" + "6:\n" + "ld1r {v14.4s}, [%[minptr]]\n" + "ld1r {v15.4s}, [%[maxptr]]\n" + "fmax v16.4s, v16.4s, v14.4s\n" + "fmax v17.4s, v17.4s, v14.4s\n" + "fmax v18.4s, v18.4s, v14.4s\n" + "fmax v19.4s, v19.4s, v14.4s\n" + "fmin v16.4s, v16.4s, v15.4s\n" + "fmin v17.4s, v17.4s, v15.4s\n" + "fmin v18.4s, v18.4s, v15.4s\n" + "fmin v19.4s, v19.4s, v15.4s\n" + "str q16, [%[c_ptr0]]\n" + "fmax v20.4s, v20.4s, v14.4s\n" + "fmax v21.4s, v21.4s, v14.4s\n" + "fmax v22.4s, v22.4s, v14.4s\n" + "str q17, [%[c_ptr0], #0x10]\n" + "fmax v23.4s, v23.4s, v14.4s\n" + "fmin v20.4s, v20.4s, v15.4s\n" + "fmin v21.4s, v21.4s, v15.4s\n" + "str q18, [%[c_ptr0], #0x20]\n" + "fmin v22.4s, v22.4s, v15.4s\n" + "fmin v23.4s, v23.4s, v15.4s\n" + "fmax v24.4s, v24.4s, v14.4s\n" + "str q19, [%[c_ptr0], #0x30]\n" + "fmax v25.4s, v25.4s, v14.4s\n" + "add %[c_ptr0], %[c_ptr0], #0x40\n" + "fmax v26.4s, v26.4s, v14.4s\n" + "str q20, [c_ptr1]\n" + "fmin v24.4s, v24.4s, v15.4s\n" + "fmin v25.4s, v25.4s, v15.4s\n" + "fmax v27.4s, v27.4s, v14.4s\n" + "str q21, [c_ptr1, #0x10]\n" + "fmin v26.4s, v26.4s, v15.4s\n" + "fmax v28.4s, v28.4s, v14.4s\n" + "fmax v29.4s, v29.4s, v14.4s\n" + "str q22, [c_ptr1, #0x20]\n" + "fmin v27.4s, v27.4s, v15.4s\n" + "fmax v30.4s, v30.4s, v14.4s\n" + "fmin v28.4s, v28.4s, v15.4s\n" + "str q23, [c_ptr1, #0x30]\n" + "fmin v29.4s, v29.4s, v15.4s\n" + "fmax v31.4s, v31.4s, v14.4s\n" + "fmin v30.4s, v30.4s, v15.4s\n" + "str q24, [c_ptr2]\n" + "fmin v31.4s, v31.4s, v15.4s\n" + "str q25, [c_ptr2, #0x10]\n" + "str q26, [c_ptr2, #0x20]\n" + "str q27, [c_ptr2, #0x30]\n" + "str q28, [c_ptr3]\n" + "str q29, [c_ptr3, #0x10]\n" + "str q30, [c_ptr3, #0x20]\n" + "str q31, [c_ptr3, #0x30]\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks) + : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", 
"v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory" + ); + break; + } + if (use_result_buffer) { + for(int cy=0; cy<std::min(M-y, 4); cy++) { + for(unsigned int cx=0; cx<width; cx++) { + c_ptr_real[cy * ldc + cx] = result_buffer[cy * 16 + cx]; + } + } + } + } + } +} + +} // namespace arm_gemm + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x8.hpp index da5beef48c..d11a945d27 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x8.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x8.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -78,9 +78,9 @@ public: // Default to the generic kernel kern_type kernel=a64_hybrid_fp32_mla_4x8; - hybrid_fp32_mla_4x8(const CPUInfo *ci) + hybrid_fp32_mla_4x8(const CPUInfo *) { - UNUSED(ci); + } }; diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x8/generic.cpp index db7eb83160..731230364d 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x8/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x8/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -61,12 +61,23 @@ void a64_hybrid_fp32_mla_4x8(const float *A, int lda, const float *B, float *C, break; } - for (int y=0; y<M; y+=8) { + int rows_to_compute; + + for (int y=0; y<M; y+=rows_to_compute) { const float * const a_ptr0_base = A + (y * lda); const unsigned long ldab = lda * sizeof(float); float *c_ptr0 = C + (y * ldc); + rows_to_compute = M-y; + if (rows_to_compute > 8) { + if (rows_to_compute % 8) { + rows_to_compute = 8 - 1; + } else { + rows_to_compute = 8; + } + } + for (int x0=0; x0<N; x0+=4ul) { const long width = std::min((unsigned long)N-x0, 4ul); long loops = loops_count; @@ -90,7 +101,7 @@ void a64_hybrid_fp32_mla_4x8(const float *A, int lda, const float *B, float *C, } const float *biasptr = bias ? bias+x0 : nullbias; - switch(M-y) { + switch(rows_to_compute) { case 1: __asm __volatile ( "ldr q24, [%[biasptr]]\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4.hpp index bdc62ea181..4a9f7985b7 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/a55.cpp index 7c08aa2165..6c7e89559c 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/a55.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/a55.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. 
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/a55.cpp
index 7c08aa2165..6c7e89559c 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/a55.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/a55.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -32,9 +32,7 @@
 
 namespace arm_gemm {
 
-void a64_hybrid_s8s32_dot_16x4_a55(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int M, int N, int K, const int32_t *bias, Activation act, bool append) {
-    UNUSED(bias);
-    UNUSED(act);
+void a64_hybrid_s8s32_dot_16x4_a55(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int M, int N, int K, const int32_t *, Activation , bool append) {
     const int K_stride = ((K + 3) / 4) * 4;
     const long loops_count = ((K + 16) / 32) - 1;
     K -= loops_count * 32;
@@ -43,12 +41,23 @@ void a64_hybrid_s8s32_dot_16x4_a55(const int8_t *A, int lda, const int8_t *B, in
     const long blocks_count = K / 4;
     const long odds_count = K - (blocks_count * 4);
 
-    for (int y=0; y<M; y+=4) {
+    int rows_to_compute;
+
+    for (int y=0; y<M; y+=rows_to_compute) {
         const int8_t * const a_ptr0_base = A + (y * lda);
         const unsigned long ldab = lda * sizeof(int8_t);
 
         int32_t *c_ptr0 = C + (y * ldc);
 
+        rows_to_compute = M-y;
+        if (rows_to_compute > 4) {
+            if (rows_to_compute % 4) {
+                rows_to_compute = 4 - 1;
+            } else {
+                rows_to_compute = 4;
+            }
+        }
+
         for (int x0=0; x0<N; x0+=16ul) {
             const long width = std::min((unsigned long)N-x0, 16ul);
             long loops = loops_count;
@@ -72,7 +81,7 @@ void a64_hybrid_s8s32_dot_16x4_a55(const int8_t *A, int lda, const int8_t *B, in
             c_ptr0 = result_buffer;
         }
 
-        switch(M-y) {
+        switch(rows_to_compute) {
             case 1:
                 __asm __volatile (
                     "temploadreg0 .req X0\n"
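The counts set up ahead of the switch in the hunks above split K between the unrolled main loop and the tails: loops_count counts full 32-element iterations of the dot-product loop, and whatever depth remains is consumed as whole 4-element dot blocks plus single odd elements. A worked sketch using only the arithmetic visible in these hunks; the intermediate step that derives the regs tail count sits between the two hunks and is not shown in the diff, so it is omitted here:

```cpp
#include <cstdio>

int main() {
    int K = 100;                                    // example depth
    const int K_stride = ((K + 3) / 4) * 4;         // K rounded up to a multiple of 4
    const long loops_count = ((K + 16) / 32) - 1;   // full 32-element main-loop passes
    K -= loops_count * 32;                          // K now holds the tail length
    const long blocks_count = K / 4;                // whole 4-element dot blocks in the tail
    const long odds_count = K - (blocks_count * 4); // leftover single elements
    std::printf("K_stride=%d loops=%ld tail=%d blocks=%ld odds=%ld\n",
                K_stride, loops_count, K, blocks_count, odds_count);
    return 0;   // K=100: K_stride=100, loops=2, tail=36, blocks=9, odds=0
}
```

Note that in the real kernel the tail is further reduced by the regs step before the block and odds counts are taken, so the printed numbers here only illustrate the visible arithmetic.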
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/generic.cpp
index 9f06a48ff5..797ab74498 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/generic.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -32,9 +32,7 @@
 
 namespace arm_gemm {
 
-void a64_hybrid_s8s32_dot_16x4(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int M, int N, int K, const int32_t *bias, Activation act, bool append) {
-    UNUSED(bias);
-    UNUSED(act);
+void a64_hybrid_s8s32_dot_16x4(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int M, int N, int K, const int32_t *, Activation , bool append) {
     const int K_stride = ((K + 3) / 4) * 4;
     const long loops_count = ((K + 16) / 32) - 1;
     K -= loops_count * 32;
@@ -43,12 +41,23 @@ void a64_hybrid_s8s32_dot_16x4(const int8_t *A, int lda, const int8_t *B, int32_
     const long blocks_count = K / 4;
     const long odds_count = K - (blocks_count * 4);
 
-    for (int y=0; y<M; y+=4) {
+    int rows_to_compute;
+
+    for (int y=0; y<M; y+=rows_to_compute) {
         const int8_t * const a_ptr0_base = A + (y * lda);
         const unsigned long ldab = lda * sizeof(int8_t);
 
         int32_t *c_ptr0 = C + (y * ldc);
 
+        rows_to_compute = M-y;
+        if (rows_to_compute > 4) {
+            if (rows_to_compute % 4) {
+                rows_to_compute = 4 - 1;
+            } else {
+                rows_to_compute = 4;
+            }
+        }
+
         for (int x0=0; x0<N; x0+=16ul) {
             const long width = std::min((unsigned long)N-x0, 16ul);
             long loops = loops_count;
@@ -72,7 +81,7 @@ void a64_hybrid_s8s32_dot_16x4(const int8_t *A, int lda, const int8_t *B, int32_
             c_ptr0 = result_buffer;
         }
 
-        switch(M-y) {
+        switch(rows_to_compute) {
             case 1:
                 __asm __volatile (
                     "cbnz %[append], 1f\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4.hpp
index 5295650e7b..cdeb5e8b36 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
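The "c_ptr0 = result_buffer;" context line in the hunks above is the narrow-tail output path: when fewer than a full 16 output columns remain, the kernel stores its full-width vectors into a staging buffer and only the valid columns are copied back to the real output afterwards, as in the copy-back loop of the x1.cpp kernel earlier in this diff. A sketch of that copy-back with illustrative names; the exact trigger condition is not visible in these hunks:

```cpp
#include <cstdint>

// Copy valid columns out of a 16-element-wide staging buffer into the real
// output array. Mirrors the use_result_buffer loop seen earlier in this diff;
// the function name and parameters here are illustrative.
void copy_back_tail(int32_t *c_ptr_real, unsigned long width, int rows,
                    int ldc, const int32_t *result_buffer) {
    for (int cy = 0; cy < rows; cy++) {
        for (unsigned int cx = 0; cx < width; cx++) {
            // Staging buffer rows are a fixed 16 elements wide; only the
            // first 'width' columns hold valid results.
            c_ptr_real[cy * ldc + cx] = result_buffer[cy * 16 + cx];
        }
    }
}
```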
* * SPDX-License-Identifier: MIT * @@ -32,10 +32,7 @@ namespace arm_gemm { -void a64_hybrid_u8u32_dot_16x4_a55(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *bias, Activation act, bool append) { - UNUSED(bias); - UNUSED(act); - +void a64_hybrid_u8u32_dot_16x4_a55(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *, Activation , bool append) { const int K_stride = ((K + 3) / 4) * 4; const long loops_count = ((K + 16) / 32) - 1; K -= loops_count * 32; @@ -44,12 +41,23 @@ void a64_hybrid_u8u32_dot_16x4_a55(const uint8_t *A, int lda, const uint8_t *B, const long blocks_count = K / 4; const long odds_count = K - (blocks_count * 4); - for (int y=0; y<M; y+=4) { + int rows_to_compute; + + for (int y=0; y<M; y+=rows_to_compute) { const uint8_t * const a_ptr0_base = A + (y * lda); const unsigned long ldab = lda * sizeof(uint8_t); uint32_t *c_ptr0 = C + (y * ldc); + rows_to_compute = M-y; + if (rows_to_compute > 4) { + if (rows_to_compute % 4) { + rows_to_compute = 4 - 1; + } else { + rows_to_compute = 4; + } + } + for (int x0=0; x0<N; x0+=16ul) { const long width = std::min((unsigned long)N-x0, 16ul); long loops = loops_count; @@ -73,7 +81,7 @@ void a64_hybrid_u8u32_dot_16x4_a55(const uint8_t *A, int lda, const uint8_t *B, c_ptr0 = result_buffer; } - switch(M-y) { + switch(rows_to_compute) { case 1: __asm __volatile ( "temploadreg0 .req X0\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/generic.cpp index 23d919a64c..0436547af0 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -32,10 +32,7 @@ namespace arm_gemm { -void a64_hybrid_u8u32_dot_16x4(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *bias, Activation act, bool append) { - UNUSED(bias); - UNUSED(act); - +void a64_hybrid_u8u32_dot_16x4(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *, Activation , bool append) { const int K_stride = ((K + 3) / 4) * 4; const long loops_count = ((K + 16) / 32) - 1; K -= loops_count * 32; @@ -44,12 +41,23 @@ void a64_hybrid_u8u32_dot_16x4(const uint8_t *A, int lda, const uint8_t *B, uint const long blocks_count = K / 4; const long odds_count = K - (blocks_count * 4); - for (int y=0; y<M; y+=4) { + int rows_to_compute; + + for (int y=0; y<M; y+=rows_to_compute) { const uint8_t * const a_ptr0_base = A + (y * lda); const unsigned long ldab = lda * sizeof(uint8_t); uint32_t *c_ptr0 = C + (y * ldc); + rows_to_compute = M-y; + if (rows_to_compute > 4) { + if (rows_to_compute % 4) { + rows_to_compute = 4 - 1; + } else { + rows_to_compute = 4; + } + } + for (int x0=0; x0<N; x0+=16ul) { const long width = std::min((unsigned long)N-x0, 16ul); long loops = loops_count; @@ -73,7 +81,7 @@ void a64_hybrid_u8u32_dot_16x4(const uint8_t *A, int lda, const uint8_t *B, uint c_ptr0 = result_buffer; } - switch(M-y) { + switch(rows_to_compute) { case 1: __asm __volatile ( "cbnz %[append], 1f\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8.hpp index 0f6c34500d..95fed86c2f 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -32,6 +32,7 @@ namespace arm_gemm { // Actual kernel implementations void a64_interleaved_bf16fp32_dot_12x8(const bfloat16 *, const bfloat16 *, float *, int, int, int); +void a64_interleaved_bf16fp32_dot_12x8_x1(const bfloat16 *, const bfloat16 *, float *, int, int, int); class interleaved_bf16fp32_dot_12x8 { public: @@ -61,7 +62,12 @@ public: kern_type kernel=a64_interleaved_bf16fp32_dot_12x8; - interleaved_bf16fp32_dot_12x8(const CPUInfo *ci) { UNUSED(ci); } + interleaved_bf16fp32_dot_12x8(const CPUInfo *ci) + { + if (ci->get_cpu_model() == CPUModel::X1) { + kernel = a64_interleaved_bf16fp32_dot_12x8_x1; + } + } }; } // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8/generic.cpp index 8ce6a601fd..7ffae524dc 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -57,13 +57,11 @@ void a64_interleaved_bf16fp32_dot_12x8(const bfloat16 *Apanel, const bfloat16 *B "movi v12.4s, #0\n" "ldr q2, [%[a_ptr], #0x20]\n" "movi v13.4s, #0\n" - "ldr q6, [%[b_ptr], #0x20]\n" + "add %[a_ptr], %[a_ptr], #0x40\n" "movi v14.4s, #0\n" - "ldr q3, [%[a_ptr], #0x30]\n" + "add %[b_ptr], %[b_ptr], #0x30\n" "movi v15.4s, #0\n" - "add %[a_ptr], %[a_ptr], #0x40\n" "movi v16.4s, #0\n" - "add %[b_ptr], %[b_ptr], #0x30\n" "movi v17.4s, #0\n" "movi v18.4s, #0\n" "movi v19.4s, #0\n" @@ -82,9 +80,11 @@ void a64_interleaved_bf16fp32_dot_12x8(const bfloat16 *Apanel, const bfloat16 *B "cbz %[loops], 1f\n" "2:\n" ".inst 0x4f40f088 // bfdot v8.4s, v4.8h, v0.h[0]\n" - "subs %[loops], %[loops], #0x1\n" + "ldr q6, [%[b_ptr], #-0x10]\n" ".inst 0x4f60f089 // bfdot v9.4s, v4.8h, v0.h[1]\n" + "ldr q3, [%[a_ptr], #-0x10]\n" ".inst 0x4f40f88a // bfdot v10.4s, v4.8h, v0.h[2]\n" + "subs %[loops], %[loops], #0x1\n" ".inst 0x4f60f88b // bfdot v11.4s, v4.8h, v0.h[3]\n" ".inst 0x4f41f094 // bfdot v20.4s, v4.8h, v1.h[0]\n" ".inst 0x4f61f095 // bfdot v21.4s, v4.8h, v1.h[1]\n" @@ -140,13 +140,13 @@ void a64_interleaved_bf16fp32_dot_12x8(const bfloat16 *Apanel, const bfloat16 *B ".inst 0x4f63f0dd // bfdot v29.4s, v6.8h, v3.h[1]\n" ".inst 0x4f43f8de // bfdot v30.4s, v6.8h, v3.h[2]\n" ".inst 0x4f63f8df // bfdot v31.4s, v6.8h, v3.h[3]\n" - "ldr q6, [%[b_ptr], #-0x10]\n" - "ldr q3, [%[a_ptr], #-0x10]\n" "b.ne 2b\n" "1:\n" "cbz %[tails], 3f\n" ".inst 0x4f40f088 // bfdot v8.4s, v4.8h, v0.h[0]\n" + "ldr q6, [%[b_ptr], #-0x10]\n" ".inst 0x4f60f089 // bfdot v9.4s, v4.8h, v0.h[1]\n" + "ldr q3, [%[a_ptr], #-0x10]\n" ".inst 0x4f40f88a // bfdot v10.4s, v4.8h, v0.h[2]\n" ".inst 0x4f60f88b // bfdot v11.4s, v4.8h, v0.h[3]\n" ".inst 0x4f41f094 // bfdot v20.4s, v4.8h, v1.h[0]\n" @@ -178,12 +178,13 @@ void a64_interleaved_bf16fp32_dot_12x8(const bfloat16 *Apanel, const bfloat16 *B ".inst 0x4f62f089 // bfdot v9.4s, v4.8h, v2.h[1]\n" "add %[a_ptr], %[a_ptr], #0x20\n" ".inst 0x4f42f88a // bfdot v10.4s, v4.8h, v2.h[2]\n" + "add %[b_ptr], %[b_ptr], #0x60\n" ".inst 0x4f62f88b // bfdot v11.4s, v4.8h, v2.h[3]\n" ".inst 0x4f43f094 // bfdot v20.4s, v4.8h, v3.h[0]\n" ".inst 0x4f63f095 // bfdot v21.4s, v4.8h, v3.h[1]\n" ".inst 0x4f43f896 // bfdot v22.4s, v4.8h, v3.h[2]\n" ".inst 0x4f63f897 // bfdot v23.4s, v4.8h, v3.h[3]\n" - "ldr q4, [%[b_ptr], #0x30]\n" + "ldr q4, [%[b_ptr], #-0x30]\n" ".inst 0x4f42f0ac // bfdot v12.4s, v5.8h, v2.h[0]\n" ".inst 0x4f62f0ad // bfdot v13.4s, v5.8h, v2.h[1]\n" ".inst 0x4f42f8ae // bfdot v14.4s, v5.8h, v2.h[2]\n" @@ -192,7 +193,7 @@ void a64_interleaved_bf16fp32_dot_12x8(const bfloat16 *Apanel, const bfloat16 *B ".inst 0x4f63f0b9 // bfdot v25.4s, v5.8h, v3.h[1]\n" ".inst 0x4f43f8ba // bfdot v26.4s, v5.8h, v3.h[2]\n" ".inst 0x4f63f8bb // bfdot v27.4s, v5.8h, v3.h[3]\n" - "ldr q5, [%[b_ptr], #0x40]\n" + "ldr q5, [%[b_ptr], #-0x20]\n" ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n" ".inst 0x4f62f0d1 // bfdot v17.4s, v6.8h, v2.h[1]\n" ".inst 0x4f42f8d2 // bfdot v18.4s, v6.8h, v2.h[2]\n" @@ -201,13 +202,12 @@ void a64_interleaved_bf16fp32_dot_12x8(const bfloat16 *Apanel, const bfloat16 *B ".inst 0x4f63f0dd // bfdot v29.4s, v6.8h, v3.h[1]\n" ".inst 0x4f43f8de // bfdot v30.4s, v6.8h, v3.h[2]\n" ".inst 0x4f63f8df // bfdot v31.4s, v6.8h, v3.h[3]\n" - "ldr q6, [%[b_ptr], #0x50]\n" + "ldr q6, [%[b_ptr], #-0x10]\n" ".inst 0x4f40f088 // bfdot v8.4s, v4.8h, v0.h[0]\n" - "add %[b_ptr], %[b_ptr], #0x60\n" ".inst 0x4f60f089 // bfdot v9.4s, v4.8h, v0.h[1]\n" ".inst 0x4f40f88a // 
bfdot v10.4s, v4.8h, v0.h[2]\n" - "str q8, [%[c_ptr]]\n" ".inst 0x4f60f88b // bfdot v11.4s, v4.8h, v0.h[3]\n" + "str q8, [%[c_ptr]]\n" ".inst 0x4f41f094 // bfdot v20.4s, v4.8h, v1.h[0]\n" ".inst 0x4f61f095 // bfdot v21.4s, v4.8h, v1.h[1]\n" ".inst 0x4f41f896 // bfdot v22.4s, v4.8h, v1.h[2]\n" @@ -234,14 +234,17 @@ void a64_interleaved_bf16fp32_dot_12x8(const bfloat16 *Apanel, const bfloat16 *B "b 4f\n" "3:\n" ".inst 0x4f40f088 // bfdot v8.4s, v4.8h, v0.h[0]\n" + "ldr q6, [%[b_ptr], #-0x10]\n" ".inst 0x4f60f089 // bfdot v9.4s, v4.8h, v0.h[1]\n" + "ldr q3, [%[a_ptr], #-0x10]\n" ".inst 0x4f40f88a // bfdot v10.4s, v4.8h, v0.h[2]\n" + "add %[b_ptr], %[b_ptr], #0x30\n" ".inst 0x4f60f88b // bfdot v11.4s, v4.8h, v0.h[3]\n" ".inst 0x4f41f094 // bfdot v20.4s, v4.8h, v1.h[0]\n" ".inst 0x4f61f095 // bfdot v21.4s, v4.8h, v1.h[1]\n" ".inst 0x4f41f896 // bfdot v22.4s, v4.8h, v1.h[2]\n" ".inst 0x4f61f897 // bfdot v23.4s, v4.8h, v1.h[3]\n" - "ldr q4, [%[b_ptr]]\n" + "ldr q4, [%[b_ptr], #-0x30]\n" ".inst 0x4f40f0ac // bfdot v12.4s, v5.8h, v0.h[0]\n" ".inst 0x4f60f0ad // bfdot v13.4s, v5.8h, v0.h[1]\n" ".inst 0x4f40f8ae // bfdot v14.4s, v5.8h, v0.h[2]\n" @@ -250,7 +253,7 @@ void a64_interleaved_bf16fp32_dot_12x8(const bfloat16 *Apanel, const bfloat16 *B ".inst 0x4f61f0b9 // bfdot v25.4s, v5.8h, v1.h[1]\n" ".inst 0x4f41f8ba // bfdot v26.4s, v5.8h, v1.h[2]\n" ".inst 0x4f61f8bb // bfdot v27.4s, v5.8h, v1.h[3]\n" - "ldr q5, [%[b_ptr], #0x10]\n" + "ldr q5, [%[b_ptr], #-0x20]\n" ".inst 0x4f40f0d0 // bfdot v16.4s, v6.8h, v0.h[0]\n" ".inst 0x4f60f0d1 // bfdot v17.4s, v6.8h, v0.h[1]\n" ".inst 0x4f40f8d2 // bfdot v18.4s, v6.8h, v0.h[2]\n" @@ -259,13 +262,12 @@ void a64_interleaved_bf16fp32_dot_12x8(const bfloat16 *Apanel, const bfloat16 *B ".inst 0x4f61f0dd // bfdot v29.4s, v6.8h, v1.h[1]\n" ".inst 0x4f41f8de // bfdot v30.4s, v6.8h, v1.h[2]\n" ".inst 0x4f61f8df // bfdot v31.4s, v6.8h, v1.h[3]\n" - "ldr q6, [%[b_ptr], #0x20]\n" + "ldr q6, [%[b_ptr], #-0x10]\n" ".inst 0x4f42f088 // bfdot v8.4s, v4.8h, v2.h[0]\n" - "add %[b_ptr], %[b_ptr], #0x30\n" ".inst 0x4f62f089 // bfdot v9.4s, v4.8h, v2.h[1]\n" ".inst 0x4f42f88a // bfdot v10.4s, v4.8h, v2.h[2]\n" - "str q8, [%[c_ptr]]\n" ".inst 0x4f62f88b // bfdot v11.4s, v4.8h, v2.h[3]\n" + "str q8, [%[c_ptr]]\n" ".inst 0x4f43f094 // bfdot v20.4s, v4.8h, v3.h[0]\n" ".inst 0x4f63f095 // bfdot v21.4s, v4.8h, v3.h[1]\n" ".inst 0x4f43f896 // bfdot v22.4s, v4.8h, v3.h[2]\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8/x1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8/x1.cpp new file mode 100644 index 0000000000..58a51432fd --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8/x1.cpp @@ -0,0 +1,328 @@ +/* + * Copyright (c) 2019 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifdef __aarch64__ + +#include "../../bfloat.hpp" +#include "../../asmlib.hpp" + +namespace arm_gemm { + +void a64_interleaved_bf16fp32_dot_12x8_x1(const bfloat16 *Apanel, const bfloat16 *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) { + const bfloat16 *a_ptr = Apanel; + float *c_ptr = Cpanel; + + K /= 2; + const long loops_count = (K / 2) - 1; + const long tails_count = K % 2; + + for (int yb=0; yb<ablocks; yb++) { + const bfloat16 *a_ptr0 = a_ptr; + const bfloat16 *b_ptr = Bpanel; + + for (int xb=0; xb<bblocks; xb++) { + a_ptr = a_ptr0; + long loops = loops_count; + long tails = tails_count; + + __asm __volatile ( + "movi v8.4s, #0\n" + "ldr q0, [%[a_ptr]]\n" + "movi v9.4s, #0\n" + "ldr q2, [%[b_ptr]]\n" + "movi v10.4s, #0\n" + "ldr q1, [%[a_ptr], #0x10]\n" + "movi v11.4s, #0\n" + "ldr q3, [%[b_ptr], #0x10]\n" + "movi v12.4s, #0\n" + "ldr q4, [%[b_ptr], #0x20]\n" + "movi v13.4s, #0\n" + "add %[a_ptr], %[a_ptr], #0x20\n" + "movi v14.4s, #0\n" + "add %[b_ptr], %[b_ptr], #0x30\n" + "movi v15.4s, #0\n" + "movi v16.4s, #0\n" + "movi v17.4s, #0\n" + "movi v18.4s, #0\n" + "movi v19.4s, #0\n" + "movi v20.4s, #0\n" + "movi v21.4s, #0\n" + "movi v22.4s, #0\n" + "movi v23.4s, #0\n" + "movi v24.4s, #0\n" + "movi v25.4s, #0\n" + "movi v26.4s, #0\n" + "movi v27.4s, #0\n" + "movi v28.4s, #0\n" + "movi v29.4s, #0\n" + "movi v30.4s, #0\n" + "movi v31.4s, #0\n" + "cbz %[loops], 1f\n" + "2:\n" + ".inst 0x4f40f048 // bfdot v8.4s, v2.8h, v0.h[0]\n" + "subs %[loops], %[loops], #0x1\n" + ".inst 0x4f60f049 // bfdot v9.4s, v2.8h, v0.h[1]\n" + ".inst 0x4f40f84a // bfdot v10.4s, v2.8h, v0.h[2]\n" + ".inst 0x4f60f84b // bfdot v11.4s, v2.8h, v0.h[3]\n" + ".inst 0x4f41f054 // bfdot v20.4s, v2.8h, v1.h[0]\n" + ".inst 0x4f61f055 // bfdot v21.4s, v2.8h, v1.h[1]\n" + ".inst 0x4f41f856 // bfdot v22.4s, v2.8h, v1.h[2]\n" + ".inst 0x4f61f857 // bfdot v23.4s, v2.8h, v1.h[3]\n" + "ldr q2, [%[b_ptr]]\n" + ".inst 0x4f40f06c // bfdot v12.4s, v3.8h, v0.h[0]\n" + ".inst 0x4f60f06d // bfdot v13.4s, v3.8h, v0.h[1]\n" + ".inst 0x4f40f86e // bfdot v14.4s, v3.8h, v0.h[2]\n" + ".inst 0x4f60f86f // bfdot v15.4s, v3.8h, v0.h[3]\n" + ".inst 0x4f41f078 // bfdot v24.4s, v3.8h, v1.h[0]\n" + ".inst 0x4f61f079 // bfdot v25.4s, v3.8h, v1.h[1]\n" + ".inst 0x4f41f87a // bfdot v26.4s, v3.8h, v1.h[2]\n" + ".inst 0x4f61f87b // bfdot v27.4s, v3.8h, v1.h[3]\n" + "ldr q3, [%[b_ptr], #0x10]\n" + ".inst 0x4f40f090 // bfdot v16.4s, v4.8h, v0.h[0]\n" + ".inst 0x4f60f091 // bfdot v17.4s, v4.8h, v0.h[1]\n" + ".inst 0x4f40f892 // bfdot v18.4s, v4.8h, v0.h[2]\n" + ".inst 0x4f60f893 // bfdot v19.4s, v4.8h, v0.h[3]\n" + "ldr q0, [%[a_ptr]]\n" + ".inst 0x4f41f09c // bfdot v28.4s, v4.8h, v1.h[0]\n" + ".inst 0x4f61f09d // bfdot v29.4s, v4.8h, v1.h[1]\n" + ".inst 0x4f41f89e // bfdot v30.4s, v4.8h, v1.h[2]\n" + ".inst 0x4f61f89f // bfdot v31.4s, v4.8h, v1.h[3]\n" + "ldr q4, [%[b_ptr], #0x20]\n" + ".inst 0x4f40f048 // bfdot v8.4s, v2.8h, v0.h[0]\n" + "ldr q1, [%[a_ptr], #0x10]\n" + ".inst 0x4f60f049 // bfdot v9.4s, v2.8h, v0.h[1]\n" + "add %[a_ptr], %[a_ptr], #0x40\n" + 
".inst 0x4f40f84a // bfdot v10.4s, v2.8h, v0.h[2]\n" + "add %[b_ptr], %[b_ptr], #0x60\n" + ".inst 0x4f60f84b // bfdot v11.4s, v2.8h, v0.h[3]\n" + ".inst 0x4f41f054 // bfdot v20.4s, v2.8h, v1.h[0]\n" + ".inst 0x4f61f055 // bfdot v21.4s, v2.8h, v1.h[1]\n" + ".inst 0x4f41f856 // bfdot v22.4s, v2.8h, v1.h[2]\n" + ".inst 0x4f61f857 // bfdot v23.4s, v2.8h, v1.h[3]\n" + "ldr q2, [%[b_ptr], #-0x30]\n" + ".inst 0x4f40f06c // bfdot v12.4s, v3.8h, v0.h[0]\n" + ".inst 0x4f60f06d // bfdot v13.4s, v3.8h, v0.h[1]\n" + ".inst 0x4f40f86e // bfdot v14.4s, v3.8h, v0.h[2]\n" + ".inst 0x4f60f86f // bfdot v15.4s, v3.8h, v0.h[3]\n" + ".inst 0x4f41f078 // bfdot v24.4s, v3.8h, v1.h[0]\n" + ".inst 0x4f61f079 // bfdot v25.4s, v3.8h, v1.h[1]\n" + ".inst 0x4f41f87a // bfdot v26.4s, v3.8h, v1.h[2]\n" + ".inst 0x4f61f87b // bfdot v27.4s, v3.8h, v1.h[3]\n" + "ldr q3, [%[b_ptr], #-0x20]\n" + ".inst 0x4f40f090 // bfdot v16.4s, v4.8h, v0.h[0]\n" + ".inst 0x4f60f091 // bfdot v17.4s, v4.8h, v0.h[1]\n" + ".inst 0x4f40f892 // bfdot v18.4s, v4.8h, v0.h[2]\n" + ".inst 0x4f60f893 // bfdot v19.4s, v4.8h, v0.h[3]\n" + "ldr q0, [%[a_ptr], #-0x20]\n" + ".inst 0x4f41f09c // bfdot v28.4s, v4.8h, v1.h[0]\n" + ".inst 0x4f61f09d // bfdot v29.4s, v4.8h, v1.h[1]\n" + ".inst 0x4f41f89e // bfdot v30.4s, v4.8h, v1.h[2]\n" + ".inst 0x4f61f89f // bfdot v31.4s, v4.8h, v1.h[3]\n" + "ldr q4, [%[b_ptr], #-0x10]\n" + "ldr q1, [%[a_ptr], #-0x10]\n" + "b.ne 2b\n" + "1:\n" + "cbz %[tails], 3f\n" + ".inst 0x4f40f048 // bfdot v8.4s, v2.8h, v0.h[0]\n" + ".inst 0x4f60f049 // bfdot v9.4s, v2.8h, v0.h[1]\n" + ".inst 0x4f40f84a // bfdot v10.4s, v2.8h, v0.h[2]\n" + ".inst 0x4f60f84b // bfdot v11.4s, v2.8h, v0.h[3]\n" + ".inst 0x4f41f054 // bfdot v20.4s, v2.8h, v1.h[0]\n" + ".inst 0x4f61f055 // bfdot v21.4s, v2.8h, v1.h[1]\n" + ".inst 0x4f41f856 // bfdot v22.4s, v2.8h, v1.h[2]\n" + ".inst 0x4f61f857 // bfdot v23.4s, v2.8h, v1.h[3]\n" + "ldr q2, [%[b_ptr]]\n" + ".inst 0x4f40f06c // bfdot v12.4s, v3.8h, v0.h[0]\n" + ".inst 0x4f60f06d // bfdot v13.4s, v3.8h, v0.h[1]\n" + ".inst 0x4f40f86e // bfdot v14.4s, v3.8h, v0.h[2]\n" + ".inst 0x4f60f86f // bfdot v15.4s, v3.8h, v0.h[3]\n" + ".inst 0x4f41f078 // bfdot v24.4s, v3.8h, v1.h[0]\n" + ".inst 0x4f61f079 // bfdot v25.4s, v3.8h, v1.h[1]\n" + ".inst 0x4f41f87a // bfdot v26.4s, v3.8h, v1.h[2]\n" + ".inst 0x4f61f87b // bfdot v27.4s, v3.8h, v1.h[3]\n" + "ldr q3, [%[b_ptr], #0x10]\n" + ".inst 0x4f40f090 // bfdot v16.4s, v4.8h, v0.h[0]\n" + ".inst 0x4f60f091 // bfdot v17.4s, v4.8h, v0.h[1]\n" + ".inst 0x4f40f892 // bfdot v18.4s, v4.8h, v0.h[2]\n" + ".inst 0x4f60f893 // bfdot v19.4s, v4.8h, v0.h[3]\n" + "ldr q0, [%[a_ptr]]\n" + ".inst 0x4f41f09c // bfdot v28.4s, v4.8h, v1.h[0]\n" + ".inst 0x4f61f09d // bfdot v29.4s, v4.8h, v1.h[1]\n" + ".inst 0x4f41f89e // bfdot v30.4s, v4.8h, v1.h[2]\n" + ".inst 0x4f61f89f // bfdot v31.4s, v4.8h, v1.h[3]\n" + "ldr q4, [%[b_ptr], #0x20]\n" + ".inst 0x4f40f048 // bfdot v8.4s, v2.8h, v0.h[0]\n" + "ldr q1, [%[a_ptr], #0x10]\n" + ".inst 0x4f60f049 // bfdot v9.4s, v2.8h, v0.h[1]\n" + "add %[a_ptr], %[a_ptr], #0x40\n" + ".inst 0x4f40f84a // bfdot v10.4s, v2.8h, v0.h[2]\n" + "add %[b_ptr], %[b_ptr], #0x60\n" + ".inst 0x4f60f84b // bfdot v11.4s, v2.8h, v0.h[3]\n" + ".inst 0x4f41f054 // bfdot v20.4s, v2.8h, v1.h[0]\n" + ".inst 0x4f61f055 // bfdot v21.4s, v2.8h, v1.h[1]\n" + ".inst 0x4f41f856 // bfdot v22.4s, v2.8h, v1.h[2]\n" + ".inst 0x4f61f857 // bfdot v23.4s, v2.8h, v1.h[3]\n" + "ldr q2, [%[b_ptr], #-0x30]\n" + ".inst 0x4f40f06c // bfdot v12.4s, v3.8h, v0.h[0]\n" + ".inst 0x4f60f06d // bfdot v13.4s, 
v3.8h, v0.h[1]\n" + ".inst 0x4f40f86e // bfdot v14.4s, v3.8h, v0.h[2]\n" + ".inst 0x4f60f86f // bfdot v15.4s, v3.8h, v0.h[3]\n" + ".inst 0x4f41f078 // bfdot v24.4s, v3.8h, v1.h[0]\n" + ".inst 0x4f61f079 // bfdot v25.4s, v3.8h, v1.h[1]\n" + ".inst 0x4f41f87a // bfdot v26.4s, v3.8h, v1.h[2]\n" + ".inst 0x4f61f87b // bfdot v27.4s, v3.8h, v1.h[3]\n" + "ldr q3, [%[b_ptr], #-0x20]\n" + ".inst 0x4f40f090 // bfdot v16.4s, v4.8h, v0.h[0]\n" + ".inst 0x4f60f091 // bfdot v17.4s, v4.8h, v0.h[1]\n" + ".inst 0x4f40f892 // bfdot v18.4s, v4.8h, v0.h[2]\n" + ".inst 0x4f60f893 // bfdot v19.4s, v4.8h, v0.h[3]\n" + "ldr q0, [%[a_ptr], #-0x20]\n" + ".inst 0x4f41f09c // bfdot v28.4s, v4.8h, v1.h[0]\n" + ".inst 0x4f61f09d // bfdot v29.4s, v4.8h, v1.h[1]\n" + ".inst 0x4f41f89e // bfdot v30.4s, v4.8h, v1.h[2]\n" + ".inst 0x4f61f89f // bfdot v31.4s, v4.8h, v1.h[3]\n" + "ldr q4, [%[b_ptr], #-0x10]\n" + ".inst 0x4f40f048 // bfdot v8.4s, v2.8h, v0.h[0]\n" + "ldr q1, [%[a_ptr], #-0x10]\n" + ".inst 0x4f60f049 // bfdot v9.4s, v2.8h, v0.h[1]\n" + ".inst 0x4f40f84a // bfdot v10.4s, v2.8h, v0.h[2]\n" + ".inst 0x4f60f84b // bfdot v11.4s, v2.8h, v0.h[3]\n" + "str q8, [%[c_ptr]]\n" + ".inst 0x4f41f054 // bfdot v20.4s, v2.8h, v1.h[0]\n" + ".inst 0x4f61f055 // bfdot v21.4s, v2.8h, v1.h[1]\n" + ".inst 0x4f41f856 // bfdot v22.4s, v2.8h, v1.h[2]\n" + ".inst 0x4f61f857 // bfdot v23.4s, v2.8h, v1.h[3]\n" + ".inst 0x4f40f06c // bfdot v12.4s, v3.8h, v0.h[0]\n" + ".inst 0x4f60f06d // bfdot v13.4s, v3.8h, v0.h[1]\n" + ".inst 0x4f40f86e // bfdot v14.4s, v3.8h, v0.h[2]\n" + ".inst 0x4f60f86f // bfdot v15.4s, v3.8h, v0.h[3]\n" + "str q12, [%[c_ptr], #0x10]\n" + ".inst 0x4f41f078 // bfdot v24.4s, v3.8h, v1.h[0]\n" + ".inst 0x4f61f079 // bfdot v25.4s, v3.8h, v1.h[1]\n" + ".inst 0x4f41f87a // bfdot v26.4s, v3.8h, v1.h[2]\n" + ".inst 0x4f61f87b // bfdot v27.4s, v3.8h, v1.h[3]\n" + ".inst 0x4f40f090 // bfdot v16.4s, v4.8h, v0.h[0]\n" + ".inst 0x4f60f091 // bfdot v17.4s, v4.8h, v0.h[1]\n" + ".inst 0x4f40f892 // bfdot v18.4s, v4.8h, v0.h[2]\n" + ".inst 0x4f60f893 // bfdot v19.4s, v4.8h, v0.h[3]\n" + "str q16, [%[c_ptr], #0x20]\n" + ".inst 0x4f41f09c // bfdot v28.4s, v4.8h, v1.h[0]\n" + ".inst 0x4f61f09d // bfdot v29.4s, v4.8h, v1.h[1]\n" + ".inst 0x4f41f89e // bfdot v30.4s, v4.8h, v1.h[2]\n" + "str q9, [%[c_ptr], #0x30]\n" + ".inst 0x4f61f89f // bfdot v31.4s, v4.8h, v1.h[3]\n" + "b 4f\n" + "3:\n" + ".inst 0x4f40f048 // bfdot v8.4s, v2.8h, v0.h[0]\n" + "add %[a_ptr], %[a_ptr], #0x20\n" + ".inst 0x4f60f049 // bfdot v9.4s, v2.8h, v0.h[1]\n" + "add %[b_ptr], %[b_ptr], #0x30\n" + ".inst 0x4f40f84a // bfdot v10.4s, v2.8h, v0.h[2]\n" + ".inst 0x4f60f84b // bfdot v11.4s, v2.8h, v0.h[3]\n" + ".inst 0x4f41f054 // bfdot v20.4s, v2.8h, v1.h[0]\n" + ".inst 0x4f61f055 // bfdot v21.4s, v2.8h, v1.h[1]\n" + ".inst 0x4f41f856 // bfdot v22.4s, v2.8h, v1.h[2]\n" + ".inst 0x4f61f857 // bfdot v23.4s, v2.8h, v1.h[3]\n" + "ldr q2, [%[b_ptr], #-0x30]\n" + ".inst 0x4f40f06c // bfdot v12.4s, v3.8h, v0.h[0]\n" + ".inst 0x4f60f06d // bfdot v13.4s, v3.8h, v0.h[1]\n" + ".inst 0x4f40f86e // bfdot v14.4s, v3.8h, v0.h[2]\n" + ".inst 0x4f60f86f // bfdot v15.4s, v3.8h, v0.h[3]\n" + ".inst 0x4f41f078 // bfdot v24.4s, v3.8h, v1.h[0]\n" + ".inst 0x4f61f079 // bfdot v25.4s, v3.8h, v1.h[1]\n" + ".inst 0x4f41f87a // bfdot v26.4s, v3.8h, v1.h[2]\n" + ".inst 0x4f61f87b // bfdot v27.4s, v3.8h, v1.h[3]\n" + "ldr q3, [%[b_ptr], #-0x20]\n" + ".inst 0x4f40f090 // bfdot v16.4s, v4.8h, v0.h[0]\n" + ".inst 0x4f60f091 // bfdot v17.4s, v4.8h, v0.h[1]\n" + ".inst 0x4f40f892 // bfdot v18.4s, v4.8h, 
v0.h[2]\n" + ".inst 0x4f60f893 // bfdot v19.4s, v4.8h, v0.h[3]\n" + "ldr q0, [%[a_ptr], #-0x20]\n" + ".inst 0x4f41f09c // bfdot v28.4s, v4.8h, v1.h[0]\n" + ".inst 0x4f61f09d // bfdot v29.4s, v4.8h, v1.h[1]\n" + ".inst 0x4f41f89e // bfdot v30.4s, v4.8h, v1.h[2]\n" + ".inst 0x4f61f89f // bfdot v31.4s, v4.8h, v1.h[3]\n" + "ldr q4, [%[b_ptr], #-0x10]\n" + ".inst 0x4f40f048 // bfdot v8.4s, v2.8h, v0.h[0]\n" + "ldr q1, [%[a_ptr], #-0x10]\n" + ".inst 0x4f60f049 // bfdot v9.4s, v2.8h, v0.h[1]\n" + ".inst 0x4f40f84a // bfdot v10.4s, v2.8h, v0.h[2]\n" + ".inst 0x4f60f84b // bfdot v11.4s, v2.8h, v0.h[3]\n" + "str q8, [%[c_ptr]]\n" + ".inst 0x4f41f054 // bfdot v20.4s, v2.8h, v1.h[0]\n" + ".inst 0x4f61f055 // bfdot v21.4s, v2.8h, v1.h[1]\n" + ".inst 0x4f41f856 // bfdot v22.4s, v2.8h, v1.h[2]\n" + ".inst 0x4f61f857 // bfdot v23.4s, v2.8h, v1.h[3]\n" + ".inst 0x4f40f06c // bfdot v12.4s, v3.8h, v0.h[0]\n" + ".inst 0x4f60f06d // bfdot v13.4s, v3.8h, v0.h[1]\n" + ".inst 0x4f40f86e // bfdot v14.4s, v3.8h, v0.h[2]\n" + ".inst 0x4f60f86f // bfdot v15.4s, v3.8h, v0.h[3]\n" + "str q12, [%[c_ptr], #0x10]\n" + ".inst 0x4f41f078 // bfdot v24.4s, v3.8h, v1.h[0]\n" + ".inst 0x4f61f079 // bfdot v25.4s, v3.8h, v1.h[1]\n" + ".inst 0x4f41f87a // bfdot v26.4s, v3.8h, v1.h[2]\n" + ".inst 0x4f61f87b // bfdot v27.4s, v3.8h, v1.h[3]\n" + ".inst 0x4f40f090 // bfdot v16.4s, v4.8h, v0.h[0]\n" + ".inst 0x4f60f091 // bfdot v17.4s, v4.8h, v0.h[1]\n" + ".inst 0x4f40f892 // bfdot v18.4s, v4.8h, v0.h[2]\n" + ".inst 0x4f60f893 // bfdot v19.4s, v4.8h, v0.h[3]\n" + "str q16, [%[c_ptr], #0x20]\n" + ".inst 0x4f41f09c // bfdot v28.4s, v4.8h, v1.h[0]\n" + ".inst 0x4f61f09d // bfdot v29.4s, v4.8h, v1.h[1]\n" + ".inst 0x4f41f89e // bfdot v30.4s, v4.8h, v1.h[2]\n" + "str q9, [%[c_ptr], #0x30]\n" + ".inst 0x4f61f89f // bfdot v31.4s, v4.8h, v1.h[3]\n" + "4:\n" + "str q13, [%[c_ptr], #0x40]\n" + "str q17, [%[c_ptr], #0x50]\n" + "str q10, [%[c_ptr], #0x60]\n" + "str q14, [%[c_ptr], #0x70]\n" + "str q18, [%[c_ptr], #0x80]\n" + "str q11, [%[c_ptr], #0x90]\n" + "str q15, [%[c_ptr], #0xa0]\n" + "str q19, [%[c_ptr], #0xb0]\n" + "str q20, [%[c_ptr], #0xc0]\n" + "str q24, [%[c_ptr], #0xd0]\n" + "str q28, [%[c_ptr], #0xe0]\n" + "str q21, [%[c_ptr], #0xf0]\n" + "str q25, [%[c_ptr], #0x100]\n" + "str q29, [%[c_ptr], #0x110]\n" + "str q22, [%[c_ptr], #0x120]\n" + "str q26, [%[c_ptr], #0x130]\n" + "str q30, [%[c_ptr], #0x140]\n" + "str q23, [%[c_ptr], #0x150]\n" + "str q27, [%[c_ptr], #0x160]\n" + "str q31, [%[c_ptr], #0x170]\n" + "add %[c_ptr], %[c_ptr], #0x180\n" + : [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr), + [loops] "+r" (loops), [tails] "+r" (tails) + : + : "v0", "v1", "v2", "v3", "v4", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory" + ); + } + } +} + +} // namespace arm_gemm + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_12x8.hpp index 7f928fa727..7fac59947e 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_12x8.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_12x8.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -61,7 +61,10 @@ public: kern_type kernel=a64_interleaved_bf16fp32_mmla_12x8; - interleaved_bf16fp32_mmla_12x8(const CPUInfo *ci) { UNUSED(ci); } + interleaved_bf16fp32_mmla_12x8(const CPUInfo *) + { + + } }; } // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_12x8/generic.cpp index be87f442ea..7f0eff29af 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_12x8/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_12x8/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -59,42 +59,65 @@ void a64_interleaved_bf16fp32_mmla_12x8(const bfloat16 *Apanel, const bfloat16 * "movi v13.4s, #0\n" "ldr q6, [%[b_ptr], #0x20]\n" "movi v14.4s, #0\n" - "ldr q3, [%[a_ptr], #0x30]\n" + "prfm PLDL1KEEP, [%[a_ptr], #0x40]\n" "movi v15.4s, #0\n" - "ldr q7, [%[b_ptr], #0x30]\n" + "prfm PLDL1KEEP, [%[b_ptr], #0x40]\n" "movi v16.4s, #0\n" - "add %[a_ptr], %[a_ptr], #0x40\n" + "prfm PLDL1KEEP, [%[a_ptr], #0x80]\n" "movi v17.4s, #0\n" - "add %[b_ptr], %[b_ptr], #0x40\n" + "prfm PLDL1KEEP, [%[b_ptr], #0x80]\n" "movi v18.4s, #0\n" + "prfm PLDL1KEEP, [%[a_ptr], #0xc0]\n" "movi v19.4s, #0\n" + "prfm PLDL1KEEP, [%[b_ptr], #0xc0]\n" "movi v20.4s, #0\n" + "prfm PLDL1KEEP, [%[a_ptr], #0x100]\n" "movi v21.4s, #0\n" + "prfm PLDL1KEEP, [%[b_ptr], #0x100]\n" "movi v22.4s, #0\n" + "prfm PLDL1KEEP, [%[a_ptr], #0x140]\n" "movi v23.4s, #0\n" + "prfm PLDL1KEEP, [%[b_ptr], #0x140]\n" "movi v24.4s, #0\n" + "prfm PLDL1KEEP, [%[a_ptr], #0x180]\n" "movi v25.4s, #0\n" + "prfm PLDL1KEEP, [%[b_ptr], #0x180]\n" "movi v26.4s, #0\n" + "prfm PLDL1KEEP, [%[a_ptr], #0x1c0]\n" "movi v27.4s, #0\n" + "prfm PLDL1KEEP, [%[b_ptr], #0x1c0]\n" "movi v28.4s, #0\n" + "prfm PLDL1KEEP, [%[b_ptr], #0x200]\n" "movi v29.4s, #0\n" + "prfm PLDL1KEEP, [%[b_ptr], #0x240]\n" "movi v30.4s, #0\n" + "prfm PLDL1KEEP, [%[b_ptr], #0x280]\n" "movi v31.4s, #0\n" + "prfm PLDL1KEEP, [%[b_ptr], #0x2c0]\n" + "add %[a_ptr], %[a_ptr], #0x40\n" + "add %[b_ptr], %[b_ptr], #0x40\n" "cbz %[loops], 1f\n" "2:\n" ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n" - "subs %[loops], %[loops], #0x1\n" + "ldr q7, [%[b_ptr], #-0x10]\n" ".inst 0x6e44ec2e // bfmmla v14.4s, v1.8h, v4.8h\n" + "ldr q3, [%[a_ptr], #-0x10]\n" ".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n" + "subs %[loops], %[loops], #0x1\n" + ".inst 0x6e45ec09 // bfmmla v9.4s, v0.8h, v5.8h\n" + "prfm PLDL1KEEP, [%[a_ptr], #0x1c0]\n" ".inst 0x6e44ec7a // bfmmla v26.4s, v3.8h, v4.8h\n" "ldr q4, [%[b_ptr]]\n" - ".inst 0x6e45ec09 // bfmmla v9.4s, v0.8h, v5.8h\n" ".inst 0x6e45ec2f // bfmmla v15.4s, v1.8h, v5.8h\n" + "prfm PLDL1KEEP, [%[b_ptr], #0x2c0]\n" ".inst 0x6e45ec55 // bfmmla v21.4s, v2.8h, v5.8h\n" + "prfm PLDL1KEEP, [%[a_ptr], #0x200]\n" ".inst 0x6e45ec7b // bfmmla v27.4s, v3.8h, v5.8h\n" "ldr q5, [%[b_ptr], #0x10]\n" ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n" + "prfm PLDL1KEEP, [%[b_ptr], #0x300]\n" ".inst 0x6e46ec30 // bfmmla v16.4s, v1.8h, v6.8h\n" + "prfm PLDL1KEEP, [%[b_ptr], #0x340]\n" ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n" ".inst 0x6e46ec7c // bfmmla v28.4s, v3.8h, v6.8h\n" "ldr q6, [%[b_ptr], #0x20]\n" @@ -151,18 +174,18 @@ void a64_interleaved_bf16fp32_mmla_12x8(const bfloat16 *Apanel, const bfloat16 * ".inst 0x6e47ec59 // bfmmla v25.4s, v2.8h, v7.8h\n" "ldr q2, 
[%[a_ptr], #-0x20]\n" ".inst 0x6e47ec7f // bfmmla v31.4s, v3.8h, v7.8h\n" - "ldr q7, [%[b_ptr], #-0x10]\n" - "ldr q3, [%[a_ptr], #-0x10]\n" "b.ne 2b\n" "1:\n" "cbz %[tails], 3f\n" ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n" + "ldr q7, [%[b_ptr], #-0x10]\n" ".inst 0x6e44ec2e // bfmmla v14.4s, v1.8h, v4.8h\n" + "ldr q3, [%[a_ptr], #-0x10]\n" ".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n" - ".inst 0x6e44ec7a // bfmmla v26.4s, v3.8h, v4.8h\n" - "ldr q4, [%[b_ptr]]\n" ".inst 0x6e45ec09 // bfmmla v9.4s, v0.8h, v5.8h\n" ".inst 0x6e45ec2f // bfmmla v15.4s, v1.8h, v5.8h\n" + ".inst 0x6e44ec7a // bfmmla v26.4s, v3.8h, v4.8h\n" + "ldr q4, [%[b_ptr]]\n" ".inst 0x6e45ec55 // bfmmla v21.4s, v2.8h, v5.8h\n" ".inst 0x6e45ec7b // bfmmla v27.4s, v3.8h, v5.8h\n" "ldr q5, [%[b_ptr], #0x10]\n" @@ -268,13 +291,15 @@ void a64_interleaved_bf16fp32_mmla_12x8(const bfloat16 *Apanel, const bfloat16 * "b 4f\n" "3:\n" ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n" - "add %[a_ptr], %[a_ptr], #0x40\n" + "ldr q7, [%[b_ptr], #-0x10]\n" ".inst 0x6e44ec2e // bfmmla v14.4s, v1.8h, v4.8h\n" - "add %[b_ptr], %[b_ptr], #0x80\n" + "ldr q3, [%[a_ptr], #-0x10]\n" ".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n" + "add %[a_ptr], %[a_ptr], #0x40\n" + ".inst 0x6e45ec09 // bfmmla v9.4s, v0.8h, v5.8h\n" + "add %[b_ptr], %[b_ptr], #0x80\n" ".inst 0x6e44ec7a // bfmmla v26.4s, v3.8h, v4.8h\n" "ldr q4, [%[b_ptr], #-0x80]\n" - ".inst 0x6e45ec09 // bfmmla v9.4s, v0.8h, v5.8h\n" ".inst 0x6e45ec2f // bfmmla v15.4s, v1.8h, v5.8h\n" ".inst 0x6e45ec55 // bfmmla v21.4s, v2.8h, v5.8h\n" ".inst 0x6e45ec7b // bfmmla v27.4s, v3.8h, v5.8h\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_12x8.hpp index f669b870c6..7bfb2291a9 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_12x8.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_12x8.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -61,9 +61,9 @@ public: kern_type kernel=a64_interleaved_s8s32_mmla_12x8; - interleaved_s8s32_mmla_12x8(const CPUInfo *ci) + interleaved_s8s32_mmla_12x8(const CPUInfo *) { - UNUSED(ci); + } }; diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_12x8/generic.cpp index 49dbdb866e..7953510aa7 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_12x8/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_12x8/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -59,13 +59,11 @@ void a64_interleaved_s8s32_mmla_12x8(const int8_t *Apanel, const int8_t *Bpanel, "movi v13.4s, #0\n" "ldr q6, [%[b_ptr], #0x20]\n" "movi v14.4s, #0\n" - "ldr q3, [%[a_ptr], #0x30]\n" + "add %[a_ptr], %[a_ptr], #0x40\n" "movi v15.4s, #0\n" - "ldr q7, [%[b_ptr], #0x30]\n" + "add %[b_ptr], %[b_ptr], #0x40\n" "movi v16.4s, #0\n" - "add %[a_ptr], %[a_ptr], #0x40\n" "movi v17.4s, #0\n" - "add %[b_ptr], %[b_ptr], #0x40\n" "movi v18.4s, #0\n" "movi v19.4s, #0\n" "movi v20.4s, #0\n" @@ -83,12 +81,14 @@ void a64_interleaved_s8s32_mmla_12x8(const int8_t *Apanel, const int8_t *Bpanel, "cbz %[loops], 1f\n" "2:\n" ".inst 0x4e84a408 // smmla v8.4s, v0.16b, v4.16b\n" - "subs %[loops], %[loops], #0x1\n" + "ldr q7, [%[b_ptr], #-0x10]\n" ".inst 0x4e84a42e // smmla v14.4s, v1.16b, v4.16b\n" + "ldr q3, [%[a_ptr], #-0x10]\n" ".inst 0x4e84a454 // smmla v20.4s, v2.16b, v4.16b\n" + "subs %[loops], %[loops], #0x1\n" + ".inst 0x4e85a409 // smmla v9.4s, v0.16b, v5.16b\n" ".inst 0x4e84a47a // smmla v26.4s, v3.16b, v4.16b\n" "ldr q4, [%[b_ptr]]\n" - ".inst 0x4e85a409 // smmla v9.4s, v0.16b, v5.16b\n" ".inst 0x4e85a42f // smmla v15.4s, v1.16b, v5.16b\n" ".inst 0x4e85a455 // smmla v21.4s, v2.16b, v5.16b\n" ".inst 0x4e85a47b // smmla v27.4s, v3.16b, v5.16b\n" @@ -151,18 +151,18 @@ void a64_interleaved_s8s32_mmla_12x8(const int8_t *Apanel, const int8_t *Bpanel, ".inst 0x4e87a459 // smmla v25.4s, v2.16b, v7.16b\n" "ldr q2, [%[a_ptr], #-0x20]\n" ".inst 0x4e87a47f // smmla v31.4s, v3.16b, v7.16b\n" - "ldr q7, [%[b_ptr], #-0x10]\n" - "ldr q3, [%[a_ptr], #-0x10]\n" "b.ne 2b\n" "1:\n" "cbz %[tails], 3f\n" ".inst 0x4e84a408 // smmla v8.4s, v0.16b, v4.16b\n" + "ldr q7, [%[b_ptr], #-0x10]\n" ".inst 0x4e84a42e // smmla v14.4s, v1.16b, v4.16b\n" + "ldr q3, [%[a_ptr], #-0x10]\n" ".inst 0x4e84a454 // smmla v20.4s, v2.16b, v4.16b\n" - ".inst 0x4e84a47a // smmla v26.4s, v3.16b, v4.16b\n" - "ldr q4, [%[b_ptr]]\n" ".inst 0x4e85a409 // smmla v9.4s, v0.16b, v5.16b\n" ".inst 0x4e85a42f // smmla v15.4s, v1.16b, v5.16b\n" + ".inst 0x4e84a47a // smmla v26.4s, v3.16b, v4.16b\n" + "ldr q4, [%[b_ptr]]\n" ".inst 0x4e85a455 // smmla v21.4s, v2.16b, v5.16b\n" ".inst 0x4e85a47b // smmla v27.4s, v3.16b, v5.16b\n" "ldr q5, [%[b_ptr], #0x10]\n" @@ -268,13 +268,15 @@ void a64_interleaved_s8s32_mmla_12x8(const int8_t *Apanel, const int8_t *Bpanel, "b 4f\n" "3:\n" ".inst 0x4e84a408 // smmla v8.4s, v0.16b, v4.16b\n" - "add %[a_ptr], %[a_ptr], #0x40\n" + "ldr q7, [%[b_ptr], #-0x10]\n" ".inst 0x4e84a42e // smmla v14.4s, v1.16b, v4.16b\n" - "add %[b_ptr], %[b_ptr], #0x80\n" + "ldr q3, [%[a_ptr], #-0x10]\n" ".inst 0x4e84a454 // smmla v20.4s, v2.16b, v4.16b\n" + "add %[a_ptr], %[a_ptr], #0x40\n" + ".inst 0x4e85a409 // smmla v9.4s, v0.16b, v5.16b\n" + "add %[b_ptr], %[b_ptr], #0x80\n" ".inst 0x4e84a47a // smmla v26.4s, v3.16b, v4.16b\n" "ldr q4, [%[b_ptr], #-0x80]\n" - ".inst 0x4e85a409 // smmla v9.4s, v0.16b, v5.16b\n" ".inst 0x4e85a42f // smmla v15.4s, v1.16b, v5.16b\n" ".inst 0x4e85a455 // smmla v21.4s, v2.16b, v5.16b\n" ".inst 0x4e85a47b // smmla v27.4s, v3.16b, v5.16b\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_12x8.hpp index d66edd832a..d493517cf1 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_12x8.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_12x8.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. 
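For reference, the .inst-encoded smmla above (like the ummla in the unsigned kernel that follows and the bfmmla earlier) multiplies a 2x8 tile from the first source by the transpose of the 2x8 tile in the second source, accumulating a 2x2 tile in the destination. A scalar sketch of the signed int8 form, written from the architectural definition rather than taken from this patch:

#include <cstdint>

// Scalar model of smmla vd.4s, vn.16b, vm.16b: each source register holds
// two rows of eight int8 values; the second source is used transposed, and
// the destination accumulates a 2x2 int32 tile in row-major order.
static void smmla_model(int32_t d[4], const int8_t n[16], const int8_t m[16]) {
    for (int i = 0; i < 2; ++i) {
        for (int j = 0; j < 2; ++j) {
            int32_t acc = d[2 * i + j];
            for (int k = 0; k < 8; ++k) {
                acc += int32_t(n[8 * i + k]) * int32_t(m[8 * j + k]);
            }
            d[2 * i + j] = acc;
        }
    }
}

The ummla model is identical with uint8_t sources and uint32_t accumulators, and bfmmla is the same shape with bf16 inputs and fp32 accumulation.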
+ * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -61,9 +61,9 @@ public: kern_type kernel=a64_interleaved_u8u32_mmla_12x8; - interleaved_u8u32_mmla_12x8(const CPUInfo *ci) + interleaved_u8u32_mmla_12x8(const CPUInfo *) { - UNUSED(ci); + } }; diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_12x8/generic.cpp index e182a425f4..dcd15f0345 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_12x8/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_12x8/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -59,13 +59,11 @@ void a64_interleaved_u8u32_mmla_12x8(const uint8_t *Apanel, const uint8_t *Bpane "movi v13.4s, #0\n" "ldr q6, [%[b_ptr], #0x20]\n" "movi v14.4s, #0\n" - "ldr q3, [%[a_ptr], #0x30]\n" + "add %[a_ptr], %[a_ptr], #0x40\n" "movi v15.4s, #0\n" - "ldr q7, [%[b_ptr], #0x30]\n" + "add %[b_ptr], %[b_ptr], #0x40\n" "movi v16.4s, #0\n" - "add %[a_ptr], %[a_ptr], #0x40\n" "movi v17.4s, #0\n" - "add %[b_ptr], %[b_ptr], #0x40\n" "movi v18.4s, #0\n" "movi v19.4s, #0\n" "movi v20.4s, #0\n" @@ -83,12 +81,14 @@ void a64_interleaved_u8u32_mmla_12x8(const uint8_t *Apanel, const uint8_t *Bpane "cbz %[loops], 1f\n" "2:\n" ".inst 0x6e84a408 // ummla v8.4s, v0.16b, v4.16b\n" - "subs %[loops], %[loops], #0x1\n" + "ldr q7, [%[b_ptr], #-0x10]\n" ".inst 0x6e84a42e // ummla v14.4s, v1.16b, v4.16b\n" + "ldr q3, [%[a_ptr], #-0x10]\n" ".inst 0x6e84a454 // ummla v20.4s, v2.16b, v4.16b\n" + "subs %[loops], %[loops], #0x1\n" + ".inst 0x6e85a409 // ummla v9.4s, v0.16b, v5.16b\n" ".inst 0x6e84a47a // ummla v26.4s, v3.16b, v4.16b\n" "ldr q4, [%[b_ptr]]\n" - ".inst 0x6e85a409 // ummla v9.4s, v0.16b, v5.16b\n" ".inst 0x6e85a42f // ummla v15.4s, v1.16b, v5.16b\n" ".inst 0x6e85a455 // ummla v21.4s, v2.16b, v5.16b\n" ".inst 0x6e85a47b // ummla v27.4s, v3.16b, v5.16b\n" @@ -151,18 +151,18 @@ void a64_interleaved_u8u32_mmla_12x8(const uint8_t *Apanel, const uint8_t *Bpane ".inst 0x6e87a459 // ummla v25.4s, v2.16b, v7.16b\n" "ldr q2, [%[a_ptr], #-0x20]\n" ".inst 0x6e87a47f // ummla v31.4s, v3.16b, v7.16b\n" - "ldr q7, [%[b_ptr], #-0x10]\n" - "ldr q3, [%[a_ptr], #-0x10]\n" "b.ne 2b\n" "1:\n" "cbz %[tails], 3f\n" ".inst 0x6e84a408 // ummla v8.4s, v0.16b, v4.16b\n" + "ldr q7, [%[b_ptr], #-0x10]\n" ".inst 0x6e84a42e // ummla v14.4s, v1.16b, v4.16b\n" + "ldr q3, [%[a_ptr], #-0x10]\n" ".inst 0x6e84a454 // ummla v20.4s, v2.16b, v4.16b\n" - ".inst 0x6e84a47a // ummla v26.4s, v3.16b, v4.16b\n" - "ldr q4, [%[b_ptr]]\n" ".inst 0x6e85a409 // ummla v9.4s, v0.16b, v5.16b\n" ".inst 0x6e85a42f // ummla v15.4s, v1.16b, v5.16b\n" + ".inst 0x6e84a47a // ummla v26.4s, v3.16b, v4.16b\n" + "ldr q4, [%[b_ptr]]\n" ".inst 0x6e85a455 // ummla v21.4s, v2.16b, v5.16b\n" ".inst 0x6e85a47b // ummla v27.4s, v3.16b, v5.16b\n" "ldr q5, [%[b_ptr], #0x10]\n" @@ -268,13 +268,15 @@ void a64_interleaved_u8u32_mmla_12x8(const uint8_t *Apanel, const uint8_t *Bpane "b 4f\n" "3:\n" ".inst 0x6e84a408 // ummla v8.4s, v0.16b, v4.16b\n" - "add %[a_ptr], %[a_ptr], #0x40\n" + "ldr q7, [%[b_ptr], #-0x10]\n" ".inst 0x6e84a42e // ummla v14.4s, v1.16b, v4.16b\n" - "add %[b_ptr], %[b_ptr], #0x80\n" + "ldr q3, [%[a_ptr], #-0x10]\n" ".inst 0x6e84a454 // ummla v20.4s, v2.16b, v4.16b\n" + "add %[a_ptr], %[a_ptr], #0x40\n" + ".inst 0x6e85a409 // ummla v9.4s, v0.16b, v5.16b\n" + "add 
%[b_ptr], %[b_ptr], #0x80\n" ".inst 0x6e84a47a // ummla v26.4s, v3.16b, v4.16b\n" "ldr q4, [%[b_ptr], #-0x80]\n" - ".inst 0x6e85a409 // ummla v9.4s, v0.16b, v5.16b\n" ".inst 0x6e85a42f // ummla v15.4s, v1.16b, v5.16b\n" ".inst 0x6e85a455 // ummla v21.4s, v2.16b, v5.16b\n" ".inst 0x6e85a47b // ummla v27.4s, v3.16b, v5.16b\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_native_fp32_mla_16x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_native_fp32_mla_16x4.hpp index a86e8ec068..d7bf43deca 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_native_fp32_mla_16x4.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_native_fp32_mla_16x4.hpp @@ -25,6 +25,9 @@ #ifdef __aarch64__ + + + namespace arm_gemm { @@ -75,7 +78,10 @@ public: // Default to the generic kernel kern_type kernel=a64_native_fp32_mla_16x4; - native_fp32_mla_16x4(const CPUInfo *ci) { UNUSED(ci); } + native_fp32_mla_16x4(const CPUInfo *) + { + + } }; } // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8.hpp index ddc97b47f4..3eff767d6c 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8.hpp @@ -34,6 +34,7 @@ void a64_sgemm_asimd_12x8(const float *, const float *, float *, int, int, int); void a64_sgemm_asimd_12x8_a53(const float *, const float *, float *, int, int, int); void a64_sgemm_asimd_12x8_a55(const float *, const float *, float *, int, int, int); void a64_sgemm_asimd_12x8_a55r1(const float *, const float *, float *, int, int, int); +void a64_sgemm_asimd_12x8_x1(const float *, const float *, float *, int, int, int); // 12x8 SGEMM "strategy" class. // @@ -83,6 +84,10 @@ public: kernel = a64_sgemm_asimd_12x8_a55r1; break; + case CPUModel::X1: + kernel = a64_sgemm_asimd_12x8_x1; + break; + default: /* Generic kernel is initialized by default. */ break; diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/x1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/x1.cpp new file mode 100644 index 0000000000..63fdf4df9f --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/x1.cpp @@ -0,0 +1,354 @@ +/* + * Copyright (c) 2017-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifdef __aarch64__ + +#include <arm_neon.h> + +#include "../../asmlib.hpp" + +// Kernel implementation. 
+// +// Assume that "Apanel" points to a chunk of A blocks (each size 8xK) in read-order. +// Assume that "Bpanel" points to a chunk of B blocks (each size 12xK) in read-order. +// Assume that "Cpanel" points to a chunk of C output blocks (each size +// 12x8), the chunks being arranged in a row major fashion. +// +// Note that the intent of this is that either ablocks or bblocks will be 1 +// - this construction allows the output loop to proceed in either order. + +namespace arm_gemm { + +void a64_sgemm_asimd_12x8_x1(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) { + const float *a_ptr = Apanel; + float *c_ptr = Cpanel; + + for (int yb=0; yb<ablocks; yb++) { + const float *a_ptr0 = a_ptr; + const float *b_ptr = Bpanel; + + for (int xb=0; xb<bblocks; xb++) { + a_ptr = a_ptr0; + // Fix up for odd lengths - set a flag if K is odd, but make + // sure we round up the iteration count. + int oddk = (K & 1); + int k = ((K+1)/2) - 1; + + register float32x4_t a0 asm("v0"); + register float32x4_t a1 asm("v1"); + register float32x4_t b0 asm("v2"); + register float32x4_t b1 asm("v3"); + register float32x4_t b2 asm("v4"); + + __asm __volatile ( + // Initialize result registers, load initial operands, prime prefetches. + "movi v8.4s, #0x0\n" + "ldr %q[a0], [%[a_ptr]]\n" + "movi v9.4s, #0x0\n" + "ldr %q[b0], [%[b_ptr]]\n" + "movi v10.4s, #0x0\n" + "ldr %q[a1], [%[a_ptr], #16]\n" + "movi v11.4s, #0x0\n" + "ldr %q[b1], [%[b_ptr], #16]\n" + "movi v12.4s, #0x0\n" + ASM_PREFETCH("[%[b_ptr], #64]") + "movi v13.4s, #0x0\n" + ASM_PREFETCH("[%[a_ptr], #64]") + "movi v14.4s, #0x0\n" + ASM_PREFETCH("[%[b_ptr], #128]") + "movi v15.4s, #0x0\n" + ASM_PREFETCH("[%[a_ptr], #128]") + "movi v16.4s, #0x0\n" + ASM_PREFETCH("[%[b_ptr], #192]") + "movi v17.4s, #0x0\n" + ASM_PREFETCH("[%[b_ptr], #256]") + "movi v18.4s, #0x0\n" + ASM_PREFETCH("[%[a_ptr], #192]") + "movi v19.4s, #0x0\n" + ASM_PREFETCH("[%[b_ptr], #320]") + "movi v20.4s, #0x0\n" + ASM_PREFETCH("[%[a_ptr], #256]") + "movi v21.4s, #0x0\n" + ASM_PREFETCH("[%[b_ptr], #384]") + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "movi v24.4s, #0x0\n" + "movi v25.4s, #0x0\n" + "movi v26.4s, #0x0\n" + "movi v27.4s, #0x0\n" + "movi v28.4s, #0x0\n" + "movi v29.4s, #0x0\n" + "movi v30.4s, #0x0\n" + "movi v31.4s, #0x0\n" + + // Skip loop if we are doing zero iterations of it. 
+ "cbz %w[k], 4f\n" + + // Loop proper + "1:\n" + "fmla v8.4s , %[b0].4s, %[a0].s[0]\n" + "fmla v9.4s , %[b0].4s, %[a0].s[1]\n" + "ldr %q[b2], [%[b_ptr], #32]\n" + "fmla v10.4s, %[b0].4s, %[a0].s[2]\n" + "fmla v11.4s, %[b0].4s, %[a0].s[3]\n" + "fmla v12.4s, %[b0].4s, %[a1].s[0]\n" + "fmla v13.4s, %[b0].4s, %[a1].s[1]\n" + "fmla v14.4s, %[b0].4s, %[a1].s[2]\n" + "fmla v15.4s, %[b0].4s, %[a1].s[3]\n" + "ldr %q[b0], [%[b_ptr], #48]\n" + + "fmla v16.4s, %[b1].4s, %[a0].s[0]\n" + "fmla v17.4s, %[b1].4s, %[a0].s[1]\n" + ASM_PREFETCH("[%[a_ptr], #320]") + "fmla v18.4s, %[b1].4s, %[a0].s[2]\n" + "fmla v19.4s, %[b1].4s, %[a0].s[3]\n" + "fmla v20.4s, %[b1].4s, %[a1].s[0]\n" + "fmla v21.4s, %[b1].4s, %[a1].s[1]\n" + "fmla v22.4s, %[b1].4s, %[a1].s[2]\n" + "fmla v23.4s, %[b1].4s, %[a1].s[3]\n" + "ldr %q[b1], [%[b_ptr], #64]\n" + + "fmla v24.4s, %[b2].4s, %[a0].s[0]\n" + "fmla v25.4s, %[b2].4s, %[a0].s[1]\n" + ASM_PREFETCH("[%[b_ptr], #448]") + "fmla v26.4s, %[b2].4s, %[a0].s[2]\n" + "fmla v27.4s, %[b2].4s, %[a0].s[3]\n" + "ldr %q[a0], [%[a_ptr], #32]\n" + "fmla v28.4s, %[b2].4s, %[a1].s[0]\n" + "fmla v29.4s, %[b2].4s, %[a1].s[1]\n" + "fmla v30.4s, %[b2].4s, %[a1].s[2]\n" + "fmla v31.4s, %[b2].4s, %[a1].s[3]\n" + "ldr %q[a1], [%[a_ptr], #48]\n" + + "fmla v8.4s , %[b0].4s, %[a0].s[0]\n" + "fmla v9.4s , %[b0].4s, %[a0].s[1]\n" + "ldr %q[b2], [%[b_ptr], #80]\n" + "fmla v10.4s, %[b0].4s, %[a0].s[2]\n" + "fmla v11.4s, %[b0].4s, %[a0].s[3]\n" + "fmla v12.4s, %[b0].4s, %[a1].s[0]\n" + "fmla v13.4s, %[b0].4s, %[a1].s[1]\n" + "fmla v14.4s, %[b0].4s, %[a1].s[2]\n" + "fmla v15.4s, %[b0].4s, %[a1].s[3]\n" + "ldr %q[b0], [%[b_ptr], #96]\n" + + "fmla v16.4s, %[b1].4s, %[a0].s[0]\n" + "fmla v17.4s, %[b1].4s, %[a0].s[1]\n" + "fmla v18.4s, %[b1].4s, %[a0].s[2]\n" + "fmla v19.4s, %[b1].4s, %[a0].s[3]\n" + ASM_PREFETCH("[%[b_ptr], #512]") + "fmla v20.4s, %[b1].4s, %[a1].s[0]\n" + "fmla v21.4s, %[b1].4s, %[a1].s[1]\n" + "subs %w[k], %w[k], #1\n" + "fmla v22.4s, %[b1].4s, %[a1].s[2]\n" + "fmla v23.4s, %[b1].4s, %[a1].s[3]\n" + "ldr %q[b1], [%[b_ptr], #112]\n" + + "fmla v24.4s, %[b2].4s, %[a0].s[0]\n" + "fmla v25.4s, %[b2].4s, %[a0].s[1]\n" + "add %[a_ptr], %[a_ptr], #64\n" + "fmla v26.4s, %[b2].4s, %[a0].s[2]\n" + "fmla v27.4s, %[b2].4s, %[a0].s[3]\n" + "ldr %q[a0], [%[a_ptr]]\n" + "fmla v28.4s, %[b2].4s, %[a1].s[0]\n" + "fmla v29.4s, %[b2].4s, %[a1].s[1]\n" + "add %[b_ptr], %[b_ptr], #96\n" + "fmla v30.4s, %[b2].4s, %[a1].s[2]\n" + "fmla v31.4s, %[b2].4s, %[a1].s[3]\n" + "ldr %q[a1], [%[a_ptr], #16]\n" + "bne 1b\n" + + // Target to use when K is 1 or 2 (i.e. 
zero iterations of main loop) + "4:\n" + + // Branch to alternative tail for odd K + "cbnz %w[oddk], 2f\n" + + // Detached final iteration (even K) + "fmla v8.4s , %[b0].4s, %[a0].s[0]\n" + "fmla v9.4s , %[b0].4s, %[a0].s[1]\n" + "ldr %q[b2], [%[b_ptr], #32]\n" + "fmla v10.4s, %[b0].4s, %[a0].s[2]\n" + "fmla v11.4s, %[b0].4s, %[a0].s[3]\n" + "fmla v12.4s, %[b0].4s, %[a1].s[0]\n" + "fmla v13.4s, %[b0].4s, %[a1].s[1]\n" + "fmla v14.4s, %[b0].4s, %[a1].s[2]\n" + "fmla v15.4s, %[b0].4s, %[a1].s[3]\n" + "ldr %q[b0], [%[b_ptr], #48]\n" + + "fmla v16.4s, %[b1].4s, %[a0].s[0]\n" + "fmla v17.4s, %[b1].4s, %[a0].s[1]\n" + "fmla v18.4s, %[b1].4s, %[a0].s[2]\n" + "fmla v19.4s, %[b1].4s, %[a0].s[3]\n" + "fmla v20.4s, %[b1].4s, %[a1].s[0]\n" + "fmla v21.4s, %[b1].4s, %[a1].s[1]\n" + "fmla v22.4s, %[b1].4s, %[a1].s[2]\n" + "fmla v23.4s, %[b1].4s, %[a1].s[3]\n" + "ldr %q[b1], [%[b_ptr], #64]\n" + + "fmla v24.4s, %[b2].4s, %[a0].s[0]\n" + "fmla v25.4s, %[b2].4s, %[a0].s[1]\n" + "add %[a_ptr], %[a_ptr], #64\n" + "fmla v26.4s, %[b2].4s, %[a0].s[2]\n" + "fmla v27.4s, %[b2].4s, %[a0].s[3]\n" + "ldr %q[a0], [%[a_ptr], #-32]\n" + "fmla v28.4s, %[b2].4s, %[a1].s[0]\n" + "fmla v29.4s, %[b2].4s, %[a1].s[1]\n" + "add %[b_ptr], %[b_ptr], #96\n" + "fmla v30.4s, %[b2].4s, %[a1].s[2]\n" + "fmla v31.4s, %[b2].4s, %[a1].s[3]\n" + "ldr %q[a1], [%[a_ptr], #-16]\n" + + "fmla v8.4s , %[b0].4s, %[a0].s[0]\n" + "fmla v16.4s, %[b1].4s, %[a0].s[0]\n" + "ldr %q[b2], [%[b_ptr], #-16]\n" + "fmla v9.4s , %[b0].4s, %[a0].s[1]\n" + "str q8, [%[c_ptr], #0]\n" + "fmla v17.4s, %[b1].4s, %[a0].s[1]\n" + "str q16, [%[c_ptr], #16]\n" + "fmla v24.4s, %[b2].4s, %[a0].s[0]\n" + "str q24, [%[c_ptr], #32]\n" + + "fmla v25.4s, %[b2].4s, %[a0].s[1]\n" + "str q9, [%[c_ptr], #48]\n" + "fmla v10.4s, %[b0].4s, %[a0].s[2]\n" + "str q17, [%[c_ptr], #64]\n" + "fmla v18.4s, %[b1].4s, %[a0].s[2]\n" + "str q25, [%[c_ptr], #80]\n" + "fmla v26.4s, %[b2].4s, %[a0].s[2]\n" + "str q10, [%[c_ptr], #96]\n" + + "fmla v11.4s, %[b0].4s, %[a0].s[3]\n" + "str q18, [%[c_ptr], #112]\n" + "fmla v19.4s, %[b1].4s, %[a0].s[3]\n" + "str q26, [%[c_ptr], #128]\n" + "fmla v27.4s, %[b2].4s, %[a0].s[3]\n" + "str q11, [%[c_ptr], #144]\n" + + "fmla v12.4s, %[b0].4s, %[a1].s[0]\n" + "str q19, [%[c_ptr], #160]\n" + "fmla v20.4s, %[b1].4s, %[a1].s[0]\n" + "str q27, [%[c_ptr], #176]\n" + "fmla v28.4s, %[b2].4s, %[a1].s[0]\n" + "str q12, [%[c_ptr], #192]\n" + + "fmla v13.4s, %[b0].4s, %[a1].s[1]\n" + "str q20, [%[c_ptr], #208]\n" + "fmla v21.4s, %[b1].4s, %[a1].s[1]\n" + "str q28, [%[c_ptr], #224]\n" + "fmla v29.4s, %[b2].4s, %[a1].s[1]\n" + "str q13, [%[c_ptr], #240]\n" + + "fmla v14.4s, %[b0].4s, %[a1].s[2]\n" + "str q21, [%[c_ptr], #256]\n" + "fmla v22.4s, %[b1].4s, %[a1].s[2]\n" + "str q29, [%[c_ptr], #272]\n" + "fmla v30.4s, %[b2].4s, %[a1].s[2]\n" + "str q14, [%[c_ptr], #288]\n" + + "fmla v15.4s, %[b0].4s, %[a1].s[3]\n" + "str q22, [%[c_ptr], #304]\n" + "fmla v23.4s, %[b1].4s, %[a1].s[3]\n" + "str q30, [%[c_ptr], #320]\n" + "fmla v31.4s, %[b2].4s, %[a1].s[3]\n" + "str q15, [%[c_ptr], #336]\n" + + "b 3f\n" + + // Detached final iteration (odd K) + "2:\n" + "fmla v8.4s , %[b0].4s, %[a0].s[0]\n" + "ldr %q[b2], [%[b_ptr], #32]\n" + "fmla v16.4s, %[b1].4s, %[a0].s[0]\n" + "fmla v9.4s , %[b0].4s, %[a0].s[1]\n" + "str q8, [%[c_ptr], #0]\n" + "fmla v17.4s, %[b1].4s, %[a0].s[1]\n" + "str q16, [%[c_ptr], #16]\n" + "fmla v24.4s, %[b2].4s, %[a0].s[0]\n" + "add %[b_ptr], %[b_ptr], #48\n" + "add %[a_ptr], %[a_ptr], #32\n" + "str q24, [%[c_ptr], #32]\n" + "fmla v25.4s, %[b2].4s, %[a0].s[1]\n" + 
"str q9, [%[c_ptr], #48]\n" + + "fmla v10.4s, %[b0].4s, %[a0].s[2]\n" + "str q17, [%[c_ptr], #64]\n" + "fmla v18.4s, %[b1].4s, %[a0].s[2]\n" + "str q25, [%[c_ptr], #80]\n" + "fmla v26.4s, %[b2].4s, %[a0].s[2]\n" + "str q10, [%[c_ptr], #96]\n" + + "fmla v11.4s, %[b0].4s, %[a0].s[3]\n" + "str q18, [%[c_ptr], #112]\n" + "fmla v19.4s, %[b1].4s, %[a0].s[3]\n" + "str q26, [%[c_ptr], #128]\n" + "fmla v27.4s, %[b2].4s, %[a0].s[3]\n" + "str q11, [%[c_ptr], #144]\n" + + "fmla v12.4s, %[b0].4s, %[a1].s[0]\n" + "str q19, [%[c_ptr], #160]\n" + "fmla v20.4s, %[b1].4s, %[a1].s[0]\n" + "str q27, [%[c_ptr], #176]\n" + "fmla v28.4s, %[b2].4s, %[a1].s[0]\n" + "str q12, [%[c_ptr], #192]\n" + + "fmla v13.4s, %[b0].4s, %[a1].s[1]\n" + "str q20, [%[c_ptr], #208]\n" + "fmla v21.4s, %[b1].4s, %[a1].s[1]\n" + "str q28, [%[c_ptr], #224]\n" + "fmla v29.4s, %[b2].4s, %[a1].s[1]\n" + "str q13, [%[c_ptr], #240]\n" + + "fmla v14.4s, %[b0].4s, %[a1].s[2]\n" + "str q21, [%[c_ptr], #256]\n" + "fmla v22.4s, %[b1].4s, %[a1].s[2]\n" + "str q29, [%[c_ptr], #272]\n" + "fmla v30.4s, %[b2].4s, %[a1].s[2]\n" + "str q14, [%[c_ptr], #288]\n" + + "fmla v15.4s, %[b0].4s, %[a1].s[3]\n" + "str q22, [%[c_ptr], #304]\n" + "fmla v23.4s, %[b1].4s, %[a1].s[3]\n" + "str q30, [%[c_ptr], #320]\n" + "fmla v31.4s, %[b2].4s, %[a1].s[3]\n" + "str q15, [%[c_ptr], #336]\n" + + // Common tail + "3:\n" + "str q23, [%[c_ptr], #352]\n" + "str q31, [%[c_ptr], #368]\n" + "add %[c_ptr], %[c_ptr], #384\n" + : + [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr), + [a0] "+w" (a0), [a1] "+w" (a1), + [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [k] "+r" (k) + : [oddk] "r" (oddk) + : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", + "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc" + ); + } + } +} + +} // namespace arm_gemm + +#endif diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed.hpp index a7162c9f5b..d24bf5fa10 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017,2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -69,7 +69,7 @@ public: kern_type kernel = a64_sgemv_pretransposed; - sgemv_pretransposed(const CPUInfo *ci) { UNUSED(ci); } + sgemv_pretransposed(const CPUInfo *) { } }; } // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans.hpp index 36f84d89fc..7592798b0d 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017,2020 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -49,7 +49,7 @@ public: kern_type kernel=a64_sgemv_trans; - sgemv_trans(const CPUInfo *ci) { UNUSED(ci); } + sgemv_trans(const CPUInfo *) { } }; } // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x6.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x6.hpp index 352a147282..477f3005e6 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x6.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x6.hpp @@ -75,7 +75,10 @@ public: // Default to the generic kernel kern_type kernel=a64_smallK_hybrid_fp32_mla_4x6; - smallK_hybrid_fp32_mla_4x6(const CPUInfo *ci) { UNUSED(ci); } + smallK_hybrid_fp32_mla_4x6(const CPUInfo *) + { + + } }; } // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x8.hpp index c5d39cbc87..1a0358b787 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x8.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x8.hpp @@ -25,8 +25,6 @@ #ifdef __aarch64__ - - namespace arm_gemm { @@ -77,7 +75,10 @@ public: // Default to the generic kernel kern_type kernel=a64_smallK_hybrid_fp32_mla_4x8; - smallK_hybrid_fp32_mla_4x8(const CPUInfo *ci) { UNUSED(ci); } + smallK_hybrid_fp32_mla_4x8(const CPUInfo *) + { + + } }; } // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_4VLx4.hpp index ac9a8d257c..efc109fb34 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_4VLx4.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_4VLx4.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -78,7 +78,10 @@ public: // Default to the generic kernel kern_type kernel=sve_hybrid_bf16fp32_dot_4VLx4; - hybrid_bf16fp32_dot_4VLx4(const CPUInfo *ci) { UNUSED(ci); } + hybrid_bf16fp32_dot_4VLx4(const CPUInfo *) + { + + } }; } // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_4VLx4/generic.cpp index 1ee7b1cf55..f16f452739 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_4VLx4/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_4VLx4/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -62,12 +62,23 @@ void sve_hybrid_bf16fp32_dot_4VLx4(const bfloat16 *A, int lda, const bfloat16 *B break; } - for (int y=0; y<M; y+=4) { + int rows_to_compute; + + for (int y=0; y<M; y+=rows_to_compute) { const bfloat16 * const a_ptr0_base = A + (y * lda); const unsigned long ldab = lda * sizeof(bfloat16); float *c_ptr0 = C + (y * ldc); + rows_to_compute = M-y; + if (rows_to_compute > 4) { + if (rows_to_compute % 4) { + rows_to_compute = 4 - 1; + } else { + rows_to_compute = 4; + } + } + for (int x0=0; x0<N; x0+=(4 * get_vector_length<float>())) { const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<float>())); long loops = loops_count; @@ -79,7 +90,7 @@ void sve_hybrid_bf16fp32_dot_4VLx4(const bfloat16 *A, int lda, const bfloat16 *B const unsigned long ldcb = ldc * sizeof(float); const float *biasptr = bias ? bias+x0 : nullbias; - switch(M-y) { + switch(rows_to_compute) { case 1: __asm __volatile ( "whilelt p6.h, %[temp], %[leftovers]\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_4VLx4.hpp index d889f99f8f..551c6f3a8c 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_4VLx4.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_4VLx4.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -78,7 +78,10 @@ public: // Default to the generic kernel kern_type kernel=sve_hybrid_bf16fp32_mmla_4VLx4; - hybrid_bf16fp32_mmla_4VLx4(const CPUInfo *ci) { UNUSED(ci); } + hybrid_bf16fp32_mmla_4VLx4(const CPUInfo *) + { + + } }; } // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_4VLx4/generic.cpp index e3debe508d..4b67d747e2 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_4VLx4/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_4VLx4/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -62,12 +62,23 @@ void sve_hybrid_bf16fp32_mmla_4VLx4(const bfloat16 *A, int lda, const bfloat16 * break; } - for (int y=0; y<M; y+=8) { + int rows_to_compute; + + for (int y=0; y<M; y+=rows_to_compute) { const bfloat16 * const a_ptr0_base = A + (y * lda); const unsigned long ldab = lda * sizeof(bfloat16); float *c_ptr0 = C + (y * ldc); + rows_to_compute = M-y; + if (rows_to_compute > 8) { + if (rows_to_compute % 8) { + rows_to_compute = 8 - 1; + } else { + rows_to_compute = 8; + } + } + for (int x0=0; x0<N; x0+=(2 * get_vector_length<float>())) { const long width = std::min((unsigned long)N-x0, (2 * get_vector_length<float>())); long loops = loops_count; @@ -79,7 +90,7 @@ void sve_hybrid_bf16fp32_mmla_4VLx4(const bfloat16 *A, int lda, const bfloat16 * const unsigned long ldcb = ldc * sizeof(float); const float *biasptr = bias ? 
bias+x0 : nullbias; - switch(M-y) { + switch(rows_to_compute) { case 1: __asm __volatile ( "whilelt p6.h, %[temp], %[leftovers]\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6VLx2.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6VLx2.hpp index affcafe4aa..6f26fd1404 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6VLx2.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6VLx2.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -78,7 +78,10 @@ public: // Default to the generic kernel kern_type kernel=sve_hybrid_bf16fp32_mmla_6VLx2; - hybrid_bf16fp32_mmla_6VLx2(const CPUInfo *ci) { UNUSED(ci); } + hybrid_bf16fp32_mmla_6VLx2(const CPUInfo *) + { + + } }; } // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6VLx2/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6VLx2/generic.cpp index 07ecbf35cd..fb943fe6fe 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6VLx2/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6VLx2/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -62,12 +62,23 @@ void sve_hybrid_bf16fp32_mmla_6VLx2(const bfloat16 *A, int lda, const bfloat16 * break; } - for (int y=0; y<M; y+=4) { + int rows_to_compute; + + for (int y=0; y<M; y+=rows_to_compute) { const bfloat16 * const a_ptr0_base = A + (y * lda); const unsigned long ldab = lda * sizeof(bfloat16); float *c_ptr0 = C + (y * ldc); + rows_to_compute = M-y; + if (rows_to_compute > 4) { + if (rows_to_compute % 4) { + rows_to_compute = 4 - 1; + } else { + rows_to_compute = 4; + } + } + for (int x0=0; x0<N; x0+=(3 * get_vector_length<float>())) { const long width = std::min((unsigned long)N-x0, (3 * get_vector_length<float>())); long loops = loops_count; @@ -79,7 +90,7 @@ void sve_hybrid_bf16fp32_mmla_6VLx2(const bfloat16 *A, int lda, const bfloat16 * const unsigned long ldcb = ldc * sizeof(float); const float *biasptr = bias ? bias+x0 : nullbias; - switch(M-y) { + switch(rows_to_compute) { case 1: __asm __volatile ( "whilelt p6.h, %[temp], %[leftovers]\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_8VLx2.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_8VLx2.hpp index af8babd113..0bf4492fdc 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_8VLx2.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_8VLx2.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -78,7 +78,10 @@ public: // Default to the generic kernel kern_type kernel=sve_hybrid_bf16fp32_mmla_8VLx2; - hybrid_bf16fp32_mmla_8VLx2(const CPUInfo *ci) { UNUSED(ci); } + hybrid_bf16fp32_mmla_8VLx2(const CPUInfo *) + { + + } }; } // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_8VLx2/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_8VLx2/generic.cpp index 73196164a7..3f201f0656 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_8VLx2/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_8VLx2/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -62,12 +62,23 @@ void sve_hybrid_bf16fp32_mmla_8VLx2(const bfloat16 *A, int lda, const bfloat16 * break; } - for (int y=0; y<M; y+=4) { + int rows_to_compute; + + for (int y=0; y<M; y+=rows_to_compute) { const bfloat16 * const a_ptr0_base = A + (y * lda); const unsigned long ldab = lda * sizeof(bfloat16); float *c_ptr0 = C + (y * ldc); + rows_to_compute = M-y; + if (rows_to_compute > 4) { + if (rows_to_compute % 4) { + rows_to_compute = 4 - 1; + } else { + rows_to_compute = 4; + } + } + for (int x0=0; x0<N; x0+=(4 * get_vector_length<float>())) { const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<float>())); long loops = loops_count; @@ -79,7 +90,7 @@ void sve_hybrid_bf16fp32_mmla_8VLx2(const bfloat16 *A, int lda, const bfloat16 * const unsigned long ldcb = ldc * sizeof(float); const float *biasptr = bias ? bias+x0 : nullbias; - switch(M-y) { + switch(rows_to_compute) { case 1: __asm __volatile ( "whilelt p6.h, %[temp], %[leftovers]\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4.hpp index 28ef8071c2..fb27b7e103 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -78,7 +78,10 @@ public: // Default to the generic kernel kern_type kernel=sve_hybrid_fp16_mla_4VLx4; - hybrid_fp16_mla_4VLx4(const CPUInfo *ci) { UNUSED(ci); } + hybrid_fp16_mla_4VLx4(const CPUInfo *) + { + + } }; } // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4/generic.cpp index 2998f33d87..3aef916ad2 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -61,12 +61,23 @@ void sve_hybrid_fp16_mla_4VLx4(const __fp16 *A, int lda, const __fp16 *B, __fp16 break; } - for (int y=0; y<M; y+=4) { + int rows_to_compute; + + for (int y=0; y<M; y+=rows_to_compute) { const __fp16 * const a_ptr0_base = A + (y * lda); const unsigned long ldab = lda * sizeof(__fp16); __fp16 *c_ptr0 = C + (y * ldc); + rows_to_compute = M-y; + if (rows_to_compute > 4) { + if (rows_to_compute % 4) { + rows_to_compute = 4 - 1; + } else { + rows_to_compute = 4; + } + } + for (int x0=0; x0<N; x0+=(4 * get_vector_length<__fp16>())) { const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<__fp16>())); long loops = loops_count; @@ -78,7 +89,7 @@ void sve_hybrid_fp16_mla_4VLx4(const __fp16 *A, int lda, const __fp16 *B, __fp16 const unsigned long ldcb = ldc * sizeof(__fp16); const __fp16 *biasptr = bias ? bias+x0 : nullbias; - switch(M-y) { + switch(rows_to_compute) { case 1: __asm __volatile ( "whilelt p6.h, %[temp], %[leftovers]\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4.hpp index 8e3c17917b..28e00305f7 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -78,7 +78,10 @@ public: // Default to the generic kernel kern_type kernel=sve_hybrid_fp32_mla_4VLx4; - hybrid_fp32_mla_4VLx4(const CPUInfo *ci) { UNUSED(ci); } + hybrid_fp32_mla_4VLx4(const CPUInfo *) + { + + } }; } // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4/generic.cpp index 855d27a151..6b55959e2a 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -61,12 +61,23 @@ void sve_hybrid_fp32_mla_4VLx4(const float *A, int lda, const float *B, float *C break; } - for (int y=0; y<M; y+=4) { + int rows_to_compute; + + for (int y=0; y<M; y+=rows_to_compute) { const float * const a_ptr0_base = A + (y * lda); const unsigned long ldab = lda * sizeof(float); float *c_ptr0 = C + (y * ldc); + rows_to_compute = M-y; + if (rows_to_compute > 4) { + if (rows_to_compute % 4) { + rows_to_compute = 4 - 1; + } else { + rows_to_compute = 4; + } + } + for (int x0=0; x0<N; x0+=(4 * get_vector_length<float>())) { const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<float>())); long loops = loops_count; @@ -78,7 +89,7 @@ void sve_hybrid_fp32_mla_4VLx4(const float *A, int lda, const float *B, float *C const unsigned long ldcb = ldc * sizeof(float); const float *biasptr = bias ? 
bias+x0 : nullbias; - switch(M-y) { + switch(rows_to_compute) { case 1: __asm __volatile ( "whilelt p6.s, %[temp], %[leftovers]\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mmla_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mmla_4VLx4.hpp new file mode 100644 index 0000000000..4bdf4e1d80 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mmla_4VLx4.hpp @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2018-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +#ifdef __ARM_FEATURE_SVE + + +#include "../std_transforms_sve.hpp" + +namespace arm_gemm +{ + +// Actual kernel implementations +void sve_hybrid_fp32_mmla_4VLx4(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool); + +class hybrid_fp32_mmla_4VLx4 +{ +public: + typedef float operand_type; + typedef float result_type; + + typedef void (*kern_type)(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool); + + /* Kernel blocking parameters */ + static constexpr unsigned int out_height() + { + return 8; + } + + static unsigned int out_width() + { + return get_vector_length<float>() * 2; + } + + static constexpr unsigned int k_unroll() + { + return 2; + } + + static constexpr bool supports_append() + { + return true; + } + + static constexpr bool supports_bias() + { + return true; + } + + static constexpr bool supports_activation() + { + return true; + } + + StdTransformsSVE<operand_type, result_type, 4, 4, 2> transforms = {}; + + // Default to the generic kernel + kern_type kernel=sve_hybrid_fp32_mmla_4VLx4; + + hybrid_fp32_mmla_4VLx4(const CPUInfo *) + { + + } +}; + +} // namespace arm_gemm + +#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mmla_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mmla_4VLx4/generic.cpp new file mode 100644 index 0000000000..d8ed307c4b --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mmla_4VLx4/generic.cpp @@ -0,0 +1,3459 @@ +/* + * Copyright (c) 2018-2020 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifdef __ARM_FEATURE_SVE + +#include <algorithm> + +#include "arm_gemm.hpp" + +#include "../../asmlib.hpp" +#include "../../utils.hpp" + +namespace arm_gemm { + +void sve_hybrid_fp32_mmla_4VLx4(const float *A, int lda, const float *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool append) { + const int K_stride = ((K + 1) / 2) * 2; + const long loops_count = ((K + 4) / 8) - 1; + K -= loops_count * 8; + const long regs_count = (K / 4) - 1; + K -= (regs_count + 1) * 4; + const long leftovers = K; + const long blocks_count = (K + 1) / 2; + float nullbias[128]; + if (!append && !bias) { + memset(nullbias, 0, (2 * get_vector_length<float>() * sizeof(float))); + } + float minval = - static_cast<float>(std::numeric_limits<float>::infinity()); + float maxval = static_cast<float>(std::numeric_limits<float>::infinity()); + const float * const minptr = &minval; + const float * const maxptr = &maxval; + + switch(act.type) + { + default: + case Activation::Type::None: + break; + case Activation::Type::BoundedReLU: + maxval = static_cast<float>(act.param1); + /* fall through */ + case Activation::Type::ReLU: + minval = 0.0f; + break; + } + + int rows_to_compute; + + for (int y=0; y<M; y+=rows_to_compute) { + const float * const a_ptr0_base = A + (y * lda); + const unsigned long ldab = lda * sizeof(float); + + float *c_ptr0 = C + (y * ldc); + + rows_to_compute = M-y; + if (rows_to_compute > 8) { + if (rows_to_compute % 8) { + rows_to_compute = 8 - 1; + } else { + rows_to_compute = 8; + } + } + + for (int x0=0; x0<N; x0+=(2 * get_vector_length<float>())) { + const long width = std::min((unsigned long)N-x0, (2 * get_vector_length<float>())); + long loops = loops_count; + long regs = regs_count; + long temp = 0; + long blocks = blocks_count; + const float *a_ptr0 = a_ptr0_base; + const float *b_ptr0 = B + (K_stride * x0); + const unsigned long ldcb = ldc * sizeof(float); + const float *biasptr = bias ? 
bias+x0 : nullbias; + + switch(rows_to_compute) { + case 1: + __asm __volatile ( + "whilelt p6.s, %[temp], %[leftovers]\n" + "whilelt p0.s, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "ptrue p7.s\n" + "whilelt p1.s, %[temp], %[width]\n" + "cbnz %[append], 1f\n" + "mov z1.s, #0\n" + "ld1w z15.s, p0/z, [%[biasptr]]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + "zip1 z16.s, z15.s, z15.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "zip2 z17.s, z15.s, z15.s\n" + "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n" + "trn1 z8.d, z0.d, z1.d\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "zip1 z18.s, z15.s, z15.s\n" + "zip2 z19.s, z15.s, z15.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #4\n" + "cbz %[loops], 2f\n" + "b 3f\n" + "1:\n" + "mov z14.s, #0\n" + "ld1w z13.s, p0/z, [%[c_ptr0]]\n" + "mov z1.s, #0\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "zip1 z16.s, z13.s, z14.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "zip2 z17.s, z13.s, z14.s\n" + "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" + "trn1 z8.d, z0.d, z1.d\n" + "mov z14.s, #0\n" + "zip1 z18.s, z13.s, z14.s\n" + "zip2 z19.s, z13.s, z14.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #4\n" + "cbz %[loops], 2f\n" + "3:\n" + "trn2 z0.d, z0.d, z1.d\n" + "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "mov z5.s, #0\n" + "subs %[loops], %[loops], #0x1\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "trn2 z8.d, z4.d, z5.d\n" + "add %[a_ptr0], %[a_ptr0], #0x20\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "trn1 z0.d, z4.d, z5.d\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "mov z1.s, #0\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" + "trn1 z8.d, z0.d, z1.d\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" + "b.ne 3b\n" + "2:\n" + "cbz %[regs], 4f\n" + "trn2 z0.d, z0.d, z1.d\n" + "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" + ".inst 0x64a8e590 // fmmla z16.s, 
z12.s, z8.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "mov z5.s, #0\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "trn2 z8.d, z4.d, z5.d\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "trn1 z0.d, z4.d, z5.d\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "mov z1.s, #0\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + "addvl %[b_ptr0], %[b_ptr0], #-4\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + "addvl %[a_ptr0], %[a_ptr0], #2\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + "trn1 z8.d, z0.d, z1.d\n" + "cbz %[blocks], 5f\n" + "trn2 z0.d, z0.d, z1.d\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + "b.eq 5f\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + "b 5f\n" + "4:\n" + "trn2 z0.d, z0.d, z1.d\n" + "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "mov z5.s, #0\n" + "addvl %[b_ptr0], %[b_ptr0], #4\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + "addvl %[a_ptr0], %[a_ptr0], #1\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + "trn1 z0.d, z4.d, z5.d\n" + "cbz %[blocks], 5f\n" + "trn2 z8.d, z4.d, z5.d\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + ".inst 0x64a0e590 // 
fmmla z16.s, z12.s, z0.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + "b.eq 5f\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + "5:\n" + "ld1rw z14.s, p7/z, [%[minptr]]\n" + "ld1rw z15.s, p7/z, [%[maxptr]]\n" + "fmax z16.s, p7/m, z16.s, z14.s\n" + "fmax z17.s, p7/m, z17.s, z14.s\n" + "fmax z18.s, p7/m, z18.s, z14.s\n" + "fmax z19.s, p7/m, z19.s, z14.s\n" + "fmin z16.s, p7/m, z16.s, z15.s\n" + "fmin z17.s, p7/m, z17.s, z15.s\n" + "fmin z18.s, p7/m, z18.s, z15.s\n" + "fmin z19.s, p7/m, z19.s, z15.s\n" + "uzp1 z0.s, z16.s, z17.s\n" + "uzp1 z1.s, z18.s, z19.s\n" + "st1w z0.s, p0, [%[c_ptr0]]\n" + "st1w z1.s, p1, [%[c_ptr0], #1, MUL VL]\n" + "addvl %[c_ptr0], %[c_ptr0], #2\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) + : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers) + : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 2: + __asm __volatile ( + "a_ptr1 .req X0\n" + "c_ptr1 .req X1\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "whilelt p6.s, %[temp], %[leftovers]\n" + "whilelt p0.s, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "ptrue p7.s\n" + "whilelt p1.s, %[temp], %[width]\n" + "cbnz %[append], 1f\n" + "ld1w z15.s, p0/z, [%[biasptr]]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "add a_ptr1, a_ptr1, #0x10\n" + "zip1 z16.s, z15.s, z15.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + "zip2 z17.s, z15.s, z15.s\n" + "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n" + "trn1 z8.d, z0.d, z1.d\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "zip1 z18.s, z15.s, z15.s\n" + "zip2 z19.s, z15.s, z15.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #4\n" + "cbz %[loops], 2f\n" + "b 3f\n" + "1:\n" + "ld1w z13.s, p0/z, [%[c_ptr0]]\n" + "ld1w z14.s, p0/z, [c_ptr1]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "add a_ptr1, a_ptr1, #0x10\n" + "zip1 z16.s, z13.s, z14.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + "zip2 z17.s, z13.s, z14.s\n" + "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" + "trn1 z8.d, z0.d, z1.d\n" + "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "zip1 z18.s, z13.s, z14.s\n" + "zip2 z19.s, z13.s, z14.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #4\n" + "cbz %[loops], 2f\n" + "3:\n" + "trn2 z0.d, z0.d, z1.d\n" + "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" + 
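// z8 (trn1) packs k0-k1 of rows 0 and 1 into FMMLA's paired-row operand, z0 (trn2) packs k2-k3; z4 loads row 0's next A quadword early. + 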
".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + "ld1rqw z5.s, p7/z, [a_ptr1]\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "trn1 z0.d, z4.d, z5.d\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "trn2 z8.d, z4.d, z5.d\n" + "subs %[loops], %[loops], #0x1\n" + "add %[a_ptr0], %[a_ptr0], #0x20\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + "add a_ptr1, a_ptr1, #0x20\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" + "trn1 z8.d, z0.d, z1.d\n" + "b.ne 3b\n" + "2:\n" + "cbz %[regs], 4f\n" + "trn2 z0.d, z0.d, z1.d\n" + "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + "ld1rqw z5.s, p7/z, [a_ptr1]\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "trn1 z0.d, z4.d, z5.d\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "trn2 z8.d, z4.d, z5.d\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "addvl a_ptr1, a_ptr1, #2\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #-4\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" + 
".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + "addvl %[a_ptr0], %[a_ptr0], #2\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + "trn1 z8.d, z0.d, z1.d\n" + "cbz %[blocks], 5f\n" + "trn2 z0.d, z0.d, z1.d\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + "b.eq 5f\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + "b 5f\n" + "4:\n" + "trn2 z0.d, z0.d, z1.d\n" + "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + "ld1rqw z5.s, p6/z, [a_ptr1]\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + "addvl %[b_ptr0], %[b_ptr0], #4\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + "addvl %[a_ptr0], %[a_ptr0], #1\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + "addvl a_ptr1, a_ptr1, #1\n" + "trn1 z0.d, z4.d, z5.d\n" + "cbz %[blocks], 5f\n" + "trn2 z8.d, z4.d, z5.d\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "subs %[blocks], %[blocks], #0x1\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + "b.eq 5f\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + "5:\n" + "ld1rw z14.s, p7/z, [%[minptr]]\n" + "ld1rw z15.s, p7/z, [%[maxptr]]\n" + "fmax z16.s, p7/m, z16.s, z14.s\n" + "fmax z17.s, p7/m, z17.s, z14.s\n" + "fmax z18.s, p7/m, z18.s, z14.s\n" + "fmax z19.s, p7/m, z19.s, z14.s\n" + "fmin z16.s, p7/m, z16.s, z15.s\n" + "fmin z17.s, p7/m, z17.s, z15.s\n" + "fmin z18.s, p7/m, z18.s, z15.s\n" + "fmin z19.s, p7/m, z19.s, z15.s\n" + "uzp1 z0.s, z16.s, z17.s\n" + "uzp2 z1.s, z16.s, z17.s\n" + "uzp1 z2.s, z18.s, z19.s\n" + "uzp2 z3.s, z18.s, z19.s\n" + "st1w z0.s, p0, [%[c_ptr0]]\n" + "st1w z1.s, p0, [c_ptr1]\n" + "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n" + "addvl %[c_ptr0], %[c_ptr0], #2\n" + "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n" + ".unreq a_ptr1\n" + ".unreq c_ptr1\n" + : 
[a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) + : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers) + : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory" + ); + break; + case 3: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "c_ptr1 .req X2\n" + "c_ptr2 .req X3\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "whilelt p6.s, %[temp], %[leftovers]\n" + "whilelt p0.s, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "ptrue p7.s\n" + "whilelt p1.s, %[temp], %[width]\n" + "cbnz %[append], 1f\n" + "mov z3.s, #0\n" + "ld1w z15.s, p0/z, [%[biasptr]]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "add a_ptr1, a_ptr1, #0x10\n" + "zip1 z16.s, z15.s, z15.s\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "zip2 z17.s, z15.s, z15.s\n" + "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n" + "trn1 z8.d, z0.d, z1.d\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "add a_ptr2, a_ptr2, #0x10\n" + "zip1 z18.s, z15.s, z15.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "zip2 z19.s, z15.s, z15.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "trn1 z9.d, z2.d, z3.d\n" + "addvl %[b_ptr0], %[b_ptr0], #4\n" + "mov z20.d, z16.d\n" + "mov z21.d, z17.d\n" + "mov z22.d, z18.d\n" + "mov z23.d, z19.d\n" + "cbz %[loops], 2f\n" + "b 3f\n" + "1:\n" + "mov z3.s, #0\n" + "ld1w z13.s, p0/z, [%[c_ptr0]]\n" + "ld1w z14.s, p0/z, [c_ptr1]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "add a_ptr1, a_ptr1, #0x10\n" + "zip1 z16.s, z13.s, z14.s\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "zip2 z17.s, z13.s, z14.s\n" + "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" + "trn1 z8.d, z0.d, z1.d\n" + "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + "add a_ptr2, a_ptr2, #0x10\n" + "trn1 z9.d, z2.d, z3.d\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "zip1 z18.s, z13.s, z14.s\n" + "zip2 z19.s, z13.s, z14.s\n" + "ld1w z13.s, p0/z, [c_ptr2]\n" + "mov z14.s, #0\n" + "zip1 z20.s, z13.s, z14.s\n" + "zip2 z21.s, z13.s, z14.s\n" + "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n" + "mov z14.s, #0\n" + "zip1 z22.s, z13.s, z14.s\n" + "zip2 z23.s, z13.s, z14.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #4\n" + "cbz %[loops], 2f\n" + "3:\n" + "trn2 z0.d, z0.d, z1.d\n" + "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" + "trn2 z1.d, z2.d, z3.d\n" + "ld1rqw z5.s, p7/z, [a_ptr1]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + "ld1rqw z6.s, p7/z, [a_ptr2]\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + "subs %[loops], %[loops], #0x1\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + "add %[a_ptr0], %[a_ptr0], #0x20\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + "add a_ptr1, a_ptr1, #0x20\n" + "trn2 z8.d, z4.d, z5.d\n" + "add a_ptr2, a_ptr2, #0x20\n" + ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" + 
"ld1w z12.s, p7/z, [%[b_ptr0]]\n" + ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "mov z7.s, #0\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #-0x10]\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + "trn2 z9.d, z6.d, z7.d\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + "trn1 z0.d, z4.d, z5.d\n" + ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "trn1 z1.d, z6.d, z7.d\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n" + ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + "mov z3.s, #0\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + "trn1 z8.d, z0.d, z1.d\n" + ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" + ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" + ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" + ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" + "trn1 z9.d, z2.d, z3.d\n" + "b.ne 3b\n" + "2:\n" + "cbz %[regs], 4f\n" + "trn2 z0.d, z0.d, z1.d\n" + "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" + "trn2 z1.d, z2.d, z3.d\n" + "ld1rqw z5.s, p7/z, [a_ptr1]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + "ld1rqw z6.s, p7/z, [a_ptr2]\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + "addvl a_ptr2, a_ptr2, #2\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + "trn2 z8.d, z4.d, z5.d\n" + ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "mov z7.s, #0\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + 
"trn2 z9.d, z6.d, z7.d\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + "trn1 z0.d, z4.d, z5.d\n" + ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "trn1 z1.d, z6.d, z7.d\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" + ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + "mov z3.s, #0\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + "addvl %[b_ptr0], %[b_ptr0], #-4\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + "addvl %[a_ptr0], %[a_ptr0], #2\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + "addvl a_ptr1, a_ptr1, #2\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + "trn1 z8.d, z0.d, z1.d\n" + ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" + ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" + ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" + ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" + "trn1 z9.d, z2.d, z3.d\n" + "cbz %[blocks], 5f\n" + "trn2 z0.d, z0.d, z1.d\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + "trn2 z1.d, z2.d, z3.d\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "subs %[blocks], %[blocks], #0x1\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" + ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" + ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" + "b.eq 5f\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" + ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" + ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" + ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" + "b 5f\n" + "4:\n" + "trn2 z0.d, z0.d, z1.d\n" + "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n" + "trn2 z1.d, z2.d, z3.d\n" + "ld1rqw z5.s, p6/z, [a_ptr1]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + "ld1rqw z6.s, p6/z, [a_ptr2]\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + "addvl %[a_ptr0], %[a_ptr0], #1\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + "addvl a_ptr1, 
a_ptr1, #1\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + "addvl a_ptr2, a_ptr2, #1\n" + ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "mov z7.s, #0\n" + "addvl %[b_ptr0], %[b_ptr0], #4\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + "trn1 z0.d, z4.d, z5.d\n" + ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" + ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" + ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" + ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" + "trn1 z1.d, z6.d, z7.d\n" + "cbz %[blocks], 5f\n" + "trn2 z9.d, z6.d, z7.d\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + "trn2 z8.d, z4.d, z5.d\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "subs %[blocks], %[blocks], #0x1\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" + ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" + ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" + "b.eq 5f\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" + ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" + ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" + ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" + "5:\n" + "ld1rw z14.s, p7/z, [%[minptr]]\n" + "ld1rw z15.s, p7/z, [%[maxptr]]\n" + "fmax z16.s, p7/m, z16.s, z14.s\n" + "fmax z17.s, p7/m, z17.s, z14.s\n" + "fmax z18.s, p7/m, z18.s, z14.s\n" + "fmax z19.s, p7/m, z19.s, z14.s\n" + "fmin z16.s, p7/m, z16.s, z15.s\n" + "fmin z17.s, p7/m, z17.s, z15.s\n" + "fmin z18.s, p7/m, z18.s, z15.s\n" + "fmin z19.s, p7/m, z19.s, z15.s\n" + "fmax z20.s, p7/m, z20.s, z14.s\n" + "uzp1 z0.s, z16.s, z17.s\n" + "uzp2 z1.s, z16.s, z17.s\n" + "uzp1 z2.s, z18.s, z19.s\n" + "uzp2 z3.s, z18.s, z19.s\n" + "st1w z0.s, p0, [%[c_ptr0]]\n" + "fmin z20.s, p7/m, z20.s, z15.s\n" + "fmax z21.s, p7/m, z21.s, z14.s\n" + "fmax z22.s, p7/m, z22.s, z14.s\n" + "st1w z1.s, p0, [c_ptr1]\n" + "fmax z23.s, p7/m, z23.s, z14.s\n" + "fmin z21.s, p7/m, z21.s, z15.s\n" + "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n" + "fmin z22.s, p7/m, z22.s, z15.s\n" + "addvl %[c_ptr0], %[c_ptr0], #2\n" + "fmin z23.s, p7/m, z23.s, z15.s\n" + "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n" + "uzp1 z4.s, z20.s, z21.s\n" + "uzp1 z5.s, z22.s, z23.s\n" + "st1w z4.s, p0, [c_ptr2]\n" + "st1w z5.s, p1, [c_ptr2, #1, MUL VL]\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] 
"+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) + : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers) + : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory" + ); + break; + case 4: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "c_ptr1 .req X3\n" + "c_ptr2 .req X4\n" + "c_ptr3 .req X5\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "whilelt p6.s, %[temp], %[leftovers]\n" + "whilelt p0.s, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "ptrue p7.s\n" + "whilelt p1.s, %[temp], %[width]\n" + "cbnz %[append], 1f\n" + "ld1w z15.s, p0/z, [%[biasptr]]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "add a_ptr1, a_ptr1, #0x10\n" + "zip1 z16.s, z15.s, z15.s\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "zip2 z17.s, z15.s, z15.s\n" + "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n" + "trn1 z8.d, z0.d, z1.d\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + "add a_ptr2, a_ptr2, #0x10\n" + "zip1 z18.s, z15.s, z15.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "zip2 z19.s, z15.s, z15.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "trn1 z9.d, z2.d, z3.d\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "mov z20.d, z16.d\n" + "add a_ptr3, a_ptr3, #0x10\n" + "mov z21.d, z17.d\n" + "addvl %[b_ptr0], %[b_ptr0], #4\n" + "mov z22.d, z18.d\n" + "mov z23.d, z19.d\n" + "cbz %[loops], 2f\n" + "b 3f\n" + "1:\n" + "ld1w z13.s, p0/z, [%[c_ptr0]]\n" + "ld1w z14.s, p0/z, [c_ptr1]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "add a_ptr1, a_ptr1, #0x10\n" + "zip1 z16.s, z13.s, z14.s\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "zip2 z17.s, z13.s, z14.s\n" + "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" + "trn1 z8.d, z0.d, z1.d\n" + "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "add a_ptr2, a_ptr2, #0x10\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + "add a_ptr3, a_ptr3, #0x10\n" + "zip1 z18.s, z13.s, z14.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "zip2 z19.s, z13.s, z14.s\n" + "ld1w z13.s, p0/z, [c_ptr2]\n" + "trn1 z9.d, z2.d, z3.d\n" + "ld1w z14.s, p0/z, [c_ptr3]\n" + "zip1 z20.s, z13.s, z14.s\n" + "zip2 z21.s, z13.s, z14.s\n" + "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n" + "ld1w z14.s, p1/z, [c_ptr3, #1, MUL VL]\n" + "zip1 z22.s, z13.s, z14.s\n" + "zip2 z23.s, z13.s, z14.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #4\n" + "cbz %[loops], 2f\n" + "3:\n" + "trn2 z0.d, z0.d, z1.d\n" + "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" + "trn2 z1.d, z2.d, z3.d\n" + "ld1rqw z5.s, p7/z, [a_ptr1]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + "ld1rqw z6.s, p7/z, [a_ptr2]\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + "ld1rqw z7.s, p7/z, [a_ptr3]\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + "subs %[loops], 
%[loops], #0x1\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + "add %[a_ptr0], %[a_ptr0], #0x20\n" + "trn2 z8.d, z4.d, z5.d\n" + "add a_ptr1, a_ptr1, #0x20\n" + ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "trn2 z9.d, z6.d, z7.d\n" + "add a_ptr2, a_ptr2, #0x20\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #-0x10]\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + "add a_ptr3, a_ptr3, #0x20\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #-0x10]\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + "trn1 z0.d, z4.d, z5.d\n" + ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "trn1 z1.d, z6.d, z7.d\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n" + ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + "trn1 z8.d, z0.d, z1.d\n" + ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" + ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" + ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" + ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" + "trn1 z9.d, z2.d, z3.d\n" + "b.ne 3b\n" + "2:\n" + "cbz %[regs], 4f\n" + "trn2 z0.d, z0.d, z1.d\n" + "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" + "trn2 z1.d, z2.d, z3.d\n" + "ld1rqw z5.s, p7/z, [a_ptr1]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + "ld1rqw z6.s, p7/z, [a_ptr2]\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + "ld1rqw z7.s, p7/z, [a_ptr3]\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n" + "trn2 z8.d, z4.d, z5.d\n" + "addvl a_ptr2, a_ptr2, #2\n" + ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + ".inst 
0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "trn2 z9.d, z6.d, z7.d\n" + "addvl a_ptr3, a_ptr3, #2\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + "trn1 z0.d, z4.d, z5.d\n" + ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "trn1 z1.d, z6.d, z7.d\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" + ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + "addvl %[b_ptr0], %[b_ptr0], #-4\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + "addvl %[a_ptr0], %[a_ptr0], #2\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + "addvl a_ptr1, a_ptr1, #2\n" + "trn1 z8.d, z0.d, z1.d\n" + ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" + ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" + ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" + ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" + "trn1 z9.d, z2.d, z3.d\n" + "cbz %[blocks], 5f\n" + "trn2 z0.d, z0.d, z1.d\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + "trn2 z1.d, z2.d, z3.d\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "subs %[blocks], %[blocks], #0x1\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" + ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" + ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" + "b.eq 5f\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" + ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" + ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" + ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" + "b 5f\n" + "4:\n" + "trn2 z0.d, 
z0.d, z1.d\n" + "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n" + "trn2 z1.d, z2.d, z3.d\n" + "ld1rqw z5.s, p6/z, [a_ptr1]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + "ld1rqw z6.s, p6/z, [a_ptr2]\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + "ld1rqw z7.s, p6/z, [a_ptr3]\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + "addvl %[a_ptr0], %[a_ptr0], #1\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + "addvl a_ptr1, a_ptr1, #1\n" + ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + "addvl %[b_ptr0], %[b_ptr0], #4\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + "addvl a_ptr2, a_ptr2, #1\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + "addvl a_ptr3, a_ptr3, #1\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + "trn1 z0.d, z4.d, z5.d\n" + ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" + ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" + ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" + ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" + "trn1 z1.d, z6.d, z7.d\n" + "cbz %[blocks], 5f\n" + "trn2 z9.d, z6.d, z7.d\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + "trn2 z8.d, z4.d, z5.d\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "subs %[blocks], %[blocks], #0x1\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" + ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" + ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" + "b.eq 5f\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" + ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" + ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" + ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" + "5:\n" + "ld1rw z14.s, p7/z, [%[minptr]]\n" + "ld1rw z15.s, p7/z, [%[maxptr]]\n" + "fmax z16.s, p7/m, z16.s, z14.s\n" + "fmax z17.s, p7/m, z17.s, z14.s\n" + "fmax z18.s, p7/m, z18.s, z14.s\n" + "fmax z19.s, p7/m, z19.s, z14.s\n" + "fmin z16.s, p7/m, z16.s, z15.s\n" + "fmin z17.s, p7/m, z17.s, z15.s\n" + "fmin z18.s, p7/m, z18.s, z15.s\n" + "fmin z19.s, p7/m, z19.s, z15.s\n" + "fmax z20.s, p7/m, z20.s, z14.s\n" + "uzp1 z0.s, z16.s, z17.s\n" + "uzp2 z1.s, z16.s, z17.s\n" + "uzp1 z2.s, z18.s, z19.s\n" + "uzp2 z3.s, z18.s, z19.s\n" + "st1w z0.s, p0, [%[c_ptr0]]\n" + "fmin z20.s, p7/m, z20.s, z15.s\n" + "fmax z21.s, p7/m, z21.s, z14.s\n" + "fmax z22.s, p7/m, z22.s, z14.s\n" + "st1w z1.s, p0, [c_ptr1]\n" + "fmax z23.s, p7/m, z23.s, z14.s\n" + "fmin z21.s, p7/m, z21.s, z15.s\n" + "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n" + 
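// Epilogue (label 5) for the 4-row case: the activation bounds are broadcast from
// [%[minptr]]/[%[maxptr]] with ld1rw, each accumulator is clamped with predicated
// fmax/fmin, and uzp1/uzp2 de-interleave the paired-row 2x2 FMMLA tiles back into
// per-row vectors before the p0/p1-predicated st1w stores through the four C row
// pointers.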
"fmin z22.s, p7/m, z22.s, z15.s\n" + "addvl %[c_ptr0], %[c_ptr0], #2\n" + "fmin z23.s, p7/m, z23.s, z15.s\n" + "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n" + "uzp1 z4.s, z20.s, z21.s\n" + "uzp2 z5.s, z20.s, z21.s\n" + "uzp1 z6.s, z22.s, z23.s\n" + "st1w z4.s, p0, [c_ptr2]\n" + "uzp2 z7.s, z22.s, z23.s\n" + "st1w z5.s, p0, [c_ptr3]\n" + "st1w z6.s, p1, [c_ptr2, #1, MUL VL]\n" + "st1w z7.s, p1, [c_ptr3, #1, MUL VL]\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) + : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers) + : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory" + ); + break; + case 5: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "a_ptr4 .req X3\n" + "c_ptr1 .req X4\n" + "c_ptr2 .req X5\n" + "c_ptr3 .req X6\n" + "c_ptr4 .req X7\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "add a_ptr4, a_ptr3, %[lda]\n" + "add c_ptr4, c_ptr3, %[ldc]\n" + "whilelt p6.s, %[temp], %[leftovers]\n" + "whilelt p0.s, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "ptrue p7.s\n" + "whilelt p1.s, %[temp], %[width]\n" + "cbnz %[append], 1f\n" + "mov z5.s, #0\n" + "ld1w z15.s, p0/z, [%[biasptr]]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "add a_ptr1, a_ptr1, #0x10\n" + "zip1 z16.s, z15.s, z15.s\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "zip2 z17.s, z15.s, z15.s\n" + "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n" + "trn1 z8.d, z0.d, z1.d\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "add a_ptr2, a_ptr2, #0x10\n" + "zip1 z18.s, z15.s, z15.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + "zip2 z19.s, z15.s, z15.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "trn1 z9.d, z2.d, z3.d\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "trn1 z10.d, z4.d, z5.d\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "mov z20.d, z16.d\n" + "add a_ptr3, a_ptr3, #0x10\n" + "mov z21.d, z17.d\n" + "add a_ptr4, a_ptr4, #0x10\n" + "mov z22.d, z18.d\n" + "addvl %[b_ptr0], %[b_ptr0], #4\n" + "mov z23.d, z19.d\n" + "mov z24.d, z16.d\n" + "mov z25.d, z17.d\n" + "mov z26.d, z18.d\n" + "mov z27.d, z19.d\n" + "cbz %[loops], 2f\n" + "b 3f\n" + "1:\n" + "mov z5.s, #0\n" + "ld1w z13.s, p0/z, [%[c_ptr0]]\n" + "ld1w z14.s, p0/z, [c_ptr1]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "add a_ptr1, a_ptr1, #0x10\n" + "zip1 z16.s, z13.s, z14.s\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "zip2 z17.s, z13.s, z14.s\n" + "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" + "trn1 z8.d, z0.d, z1.d\n" + "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "add a_ptr2, a_ptr2, #0x10\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "add a_ptr3, a_ptr3, #0x10\n" + "zip1 z18.s, z13.s, z14.s\n" + 
"ld1w z12.s, p7/z, [%[b_ptr0]]\n" + "zip2 z19.s, z13.s, z14.s\n" + "ld1w z13.s, p0/z, [c_ptr2]\n" + "trn1 z9.d, z2.d, z3.d\n" + "ld1w z14.s, p0/z, [c_ptr3]\n" + "trn1 z10.d, z4.d, z5.d\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "add a_ptr4, a_ptr4, #0x10\n" + "zip1 z20.s, z13.s, z14.s\n" + "zip2 z21.s, z13.s, z14.s\n" + "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n" + "ld1w z14.s, p1/z, [c_ptr3, #1, MUL VL]\n" + "zip1 z22.s, z13.s, z14.s\n" + "zip2 z23.s, z13.s, z14.s\n" + "ld1w z13.s, p0/z, [c_ptr4]\n" + "mov z14.s, #0\n" + "zip1 z24.s, z13.s, z14.s\n" + "zip2 z25.s, z13.s, z14.s\n" + "ld1w z13.s, p1/z, [c_ptr4, #1, MUL VL]\n" + "mov z14.s, #0\n" + "zip1 z26.s, z13.s, z14.s\n" + "zip2 z27.s, z13.s, z14.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #4\n" + "cbz %[loops], 2f\n" + "3:\n" + "trn2 z0.d, z0.d, z1.d\n" + "ld1rqw z6.s, p7/z, [a_ptr2]\n" + "trn2 z1.d, z2.d, z3.d\n" + "ld1rqw z7.s, p7/z, [a_ptr3]\n" + "trn2 z2.d, z4.d, z5.d\n" + "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + "ld1rqw z5.s, p7/z, [a_ptr1]\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + "subs %[loops], %[loops], #0x1\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + "add %[a_ptr0], %[a_ptr0], #0x20\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + "ld1rqw z8.s, p7/z, [a_ptr4]\n" + ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" + "add a_ptr1, a_ptr1, #0x20\n" + ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" + "add a_ptr2, a_ptr2, #0x20\n" + ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" + "add a_ptr3, a_ptr3, #0x20\n" + ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #-0x10]\n" + ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "mov z9.s, #0\n" + "add a_ptr4, a_ptr4, #0x20\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + "trn2 z10.d, z8.d, z9.d\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + "trn1 z0.d, z4.d, z5.d\n" + ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" + ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" + ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" + ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" + "trn1 z1.d, z6.d, z7.d\n" + ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "trn1 z2.d, z8.d, z9.d\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "trn2 z9.d, z6.d, z7.d\n" + "trn2 z8.d, z4.d, z5.d\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #-0x10]\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n" + ".inst 0x64a1e594 // fmmla z20.s, z12.s, 
z1.s\n" + ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" + ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" + ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n" + ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + "mov z5.s, #0\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #-0x10]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + "trn1 z8.d, z0.d, z1.d\n" + ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" + ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" + ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" + ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" + "trn1 z9.d, z2.d, z3.d\n" + ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" + ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" + ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" + ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" + "trn1 z10.d, z4.d, z5.d\n" + "b.ne 3b\n" + "2:\n" + "cbz %[regs], 4f\n" + "trn2 z0.d, z0.d, z1.d\n" + "ld1rqw z6.s, p7/z, [a_ptr2]\n" + "trn2 z1.d, z2.d, z3.d\n" + "ld1rqw z7.s, p7/z, [a_ptr3]\n" + "trn2 z2.d, z4.d, z5.d\n" + "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + "ld1rqw z5.s, p7/z, [a_ptr1]\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + "addvl a_ptr3, a_ptr3, #2\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + "ld1rqw z8.s, p7/z, [a_ptr4]\n" + ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" + ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" + ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" + ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" + ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "mov z9.s, #0\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + "trn2 z10.d, z8.d, z9.d\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + "trn1 z0.d, z4.d, z5.d\n" + ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" + ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" + ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" + ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" + "trn1 z1.d, z6.d, z7.d\n" + ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + ".inst 
0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "trn1 z2.d, z8.d, z9.d\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "trn2 z9.d, z6.d, z7.d\n" + "trn2 z8.d, z4.d, z5.d\n" + "ld1rqw z4.s, p6/z, [a_ptr4, #0x10]\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + "addvl a_ptr4, a_ptr4, #2\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" + ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" + "addvl %[a_ptr0], %[a_ptr0], #2\n" + ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" + ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" + ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n" + ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + "mov z5.s, #0\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + "addvl %[b_ptr0], %[b_ptr0], #-4\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + "addvl a_ptr1, a_ptr1, #2\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + "addvl a_ptr2, a_ptr2, #2\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + "trn1 z8.d, z0.d, z1.d\n" + ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" + ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" + ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" + ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" + "trn1 z9.d, z2.d, z3.d\n" + ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" + ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" + ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" + ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" + "trn1 z10.d, z4.d, z5.d\n" + "cbz %[blocks], 5f\n" + "trn2 z0.d, z0.d, z1.d\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + "trn2 z1.d, z2.d, z3.d\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "trn2 z2.d, z4.d, z5.d\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "subs %[blocks], %[blocks], #0x1\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" + ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" + ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" + ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" + ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" + ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" + ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" + ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" + "b.eq 5f\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" + ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" + ".inst 0x64a1e5d6 // 
fmmla z22.s, z14.s, z1.s\n" + ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" + ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" + ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" + ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" + ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" + "b 5f\n" + "4:\n" + "trn2 z0.d, z0.d, z1.d\n" + "ld1rqw z6.s, p6/z, [a_ptr2]\n" + "trn2 z1.d, z2.d, z3.d\n" + "ld1rqw z7.s, p6/z, [a_ptr3]\n" + "trn2 z2.d, z4.d, z5.d\n" + "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + "ld1rqw z5.s, p6/z, [a_ptr1]\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + "addvl %[a_ptr0], %[a_ptr0], #1\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + "addvl a_ptr1, a_ptr1, #1\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + "ld1rqw z8.s, p6/z, [a_ptr4]\n" + ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" + "addvl a_ptr2, a_ptr2, #1\n" + ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" + "addvl a_ptr3, a_ptr3, #1\n" + ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" + "addvl a_ptr4, a_ptr4, #1\n" + ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" + ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "mov z9.s, #0\n" + "addvl %[b_ptr0], %[b_ptr0], #4\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + "trn1 z0.d, z4.d, z5.d\n" + ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" + ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" + ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" + ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" + "trn1 z1.d, z6.d, z7.d\n" + ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" + ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" + ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" + ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" + "trn1 z2.d, z8.d, z9.d\n" + "cbz %[blocks], 5f\n" + "trn2 z10.d, z8.d, z9.d\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + "trn2 z9.d, z6.d, z7.d\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "trn2 z8.d, z4.d, z5.d\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "subs %[blocks], %[blocks], #0x1\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" + ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" + ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" + ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" + ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" + ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" + ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" + ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" + "b.eq 5f\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + ".inst 0x64a8e5d2 // fmmla z18.s, 
z14.s, z8.s\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" + ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" + ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" + ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" + ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" + ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" + ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" + ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" + "5:\n" + "ld1rw z14.s, p7/z, [%[minptr]]\n" + "ld1rw z15.s, p7/z, [%[maxptr]]\n" + "fmax z16.s, p7/m, z16.s, z14.s\n" + "fmax z17.s, p7/m, z17.s, z14.s\n" + "fmax z18.s, p7/m, z18.s, z14.s\n" + "fmax z19.s, p7/m, z19.s, z14.s\n" + "fmin z16.s, p7/m, z16.s, z15.s\n" + "fmin z17.s, p7/m, z17.s, z15.s\n" + "fmin z18.s, p7/m, z18.s, z15.s\n" + "fmin z19.s, p7/m, z19.s, z15.s\n" + "fmax z20.s, p7/m, z20.s, z14.s\n" + "uzp1 z0.s, z16.s, z17.s\n" + "uzp2 z1.s, z16.s, z17.s\n" + "uzp1 z2.s, z18.s, z19.s\n" + "uzp2 z3.s, z18.s, z19.s\n" + "st1w z0.s, p0, [%[c_ptr0]]\n" + "fmin z20.s, p7/m, z20.s, z15.s\n" + "fmax z21.s, p7/m, z21.s, z14.s\n" + "fmax z22.s, p7/m, z22.s, z14.s\n" + "st1w z1.s, p0, [c_ptr1]\n" + "fmax z23.s, p7/m, z23.s, z14.s\n" + "fmax z24.s, p7/m, z24.s, z14.s\n" + "fmin z21.s, p7/m, z21.s, z15.s\n" + "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n" + "fmin z22.s, p7/m, z22.s, z15.s\n" + "addvl %[c_ptr0], %[c_ptr0], #2\n" + "fmin z23.s, p7/m, z23.s, z15.s\n" + "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n" + "uzp1 z4.s, z20.s, z21.s\n" + "uzp2 z5.s, z20.s, z21.s\n" + "fmin z24.s, p7/m, z24.s, z15.s\n" + "uzp1 z6.s, z22.s, z23.s\n" + "st1w z4.s, p0, [c_ptr2]\n" + "uzp2 z7.s, z22.s, z23.s\n" + "fmax z25.s, p7/m, z25.s, z14.s\n" + "fmax z26.s, p7/m, z26.s, z14.s\n" + "st1w z5.s, p0, [c_ptr3]\n" + "fmax z27.s, p7/m, z27.s, z14.s\n" + "fmin z25.s, p7/m, z25.s, z15.s\n" + "st1w z6.s, p1, [c_ptr2, #1, MUL VL]\n" + "fmin z26.s, p7/m, z26.s, z15.s\n" + "fmin z27.s, p7/m, z27.s, z15.s\n" + "uzp1 z8.s, z24.s, z25.s\n" + "st1w z7.s, p1, [c_ptr3, #1, MUL VL]\n" + "uzp1 z9.s, z26.s, z27.s\n" + "st1w z8.s, p0, [c_ptr4]\n" + "st1w z9.s, p1, [c_ptr4, #1, MUL VL]\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq a_ptr4\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + ".unreq c_ptr4\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) + : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers) + : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "cc", "memory" + ); + break; + case 6: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "a_ptr4 .req X3\n" + "a_ptr5 .req X4\n" + "c_ptr1 .req X5\n" + "c_ptr2 .req X6\n" + "c_ptr3 .req X7\n" + "c_ptr4 .req X8\n" + "c_ptr5 .req X9\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "add a_ptr4, a_ptr3, %[lda]\n" + "add c_ptr4, c_ptr3, %[ldc]\n" + "add a_ptr5, a_ptr4, %[lda]\n" + "add c_ptr5, c_ptr4, %[ldc]\n" + 
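// Predicate setup, identical in shape across the row-count cases: p7 is all-true,
// p6 covers the K leftovers for the predicated tail loads, and p0/p1 cover the
// first and second vector's worth of output columns (%[temp] is advanced by one
// vector width with incw between the two whilelt instructions against %[width]).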
"whilelt p6.s, %[temp], %[leftovers]\n" + "whilelt p0.s, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "ptrue p7.s\n" + "whilelt p1.s, %[temp], %[width]\n" + "cbnz %[append], 1f\n" + "ld1w z15.s, p0/z, [%[biasptr]]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "add a_ptr1, a_ptr1, #0x10\n" + "zip1 z16.s, z15.s, z15.s\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "zip2 z17.s, z15.s, z15.s\n" + "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n" + "trn1 z8.d, z0.d, z1.d\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "add a_ptr2, a_ptr2, #0x10\n" + "zip1 z18.s, z15.s, z15.s\n" + "ld1rqw z5.s, p7/z, [a_ptr5]\n" + "zip2 z19.s, z15.s, z15.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + "trn1 z9.d, z2.d, z3.d\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "mov z20.d, z16.d\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "trn1 z10.d, z4.d, z5.d\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "mov z21.d, z17.d\n" + "add a_ptr3, a_ptr3, #0x10\n" + "mov z22.d, z18.d\n" + "add a_ptr4, a_ptr4, #0x10\n" + "mov z23.d, z19.d\n" + "add a_ptr5, a_ptr5, #0x10\n" + "mov z24.d, z16.d\n" + "addvl %[b_ptr0], %[b_ptr0], #4\n" + "mov z25.d, z17.d\n" + "mov z26.d, z18.d\n" + "mov z27.d, z19.d\n" + "cbz %[loops], 2f\n" + "b 3f\n" + "1:\n" + "ld1w z13.s, p0/z, [%[c_ptr0]]\n" + "ld1w z14.s, p0/z, [c_ptr1]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "add a_ptr1, a_ptr1, #0x10\n" + "zip1 z16.s, z13.s, z14.s\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "zip2 z17.s, z13.s, z14.s\n" + "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" + "trn1 z8.d, z0.d, z1.d\n" + "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "add a_ptr2, a_ptr2, #0x10\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "add a_ptr3, a_ptr3, #0x10\n" + "zip1 z18.s, z13.s, z14.s\n" + "ld1rqw z5.s, p7/z, [a_ptr5]\n" + "zip2 z19.s, z13.s, z14.s\n" + "ld1w z13.s, p0/z, [c_ptr2]\n" + "trn1 z9.d, z2.d, z3.d\n" + "ld1w z14.s, p0/z, [c_ptr3]\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + "add a_ptr4, a_ptr4, #0x10\n" + "trn1 z10.d, z4.d, z5.d\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "zip1 z20.s, z13.s, z14.s\n" + "add a_ptr5, a_ptr5, #0x10\n" + "zip2 z21.s, z13.s, z14.s\n" + "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n" + "ld1w z14.s, p1/z, [c_ptr3, #1, MUL VL]\n" + "zip1 z22.s, z13.s, z14.s\n" + "zip2 z23.s, z13.s, z14.s\n" + "ld1w z13.s, p0/z, [c_ptr4]\n" + "ld1w z14.s, p0/z, [c_ptr5]\n" + "zip1 z24.s, z13.s, z14.s\n" + "zip2 z25.s, z13.s, z14.s\n" + "ld1w z13.s, p1/z, [c_ptr4, #1, MUL VL]\n" + "ld1w z14.s, p1/z, [c_ptr5, #1, MUL VL]\n" + "zip1 z26.s, z13.s, z14.s\n" + "zip2 z27.s, z13.s, z14.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #4\n" + "cbz %[loops], 2f\n" + "3:\n" + "trn2 z0.d, z0.d, z1.d\n" + "ld1rqw z6.s, p7/z, [a_ptr2]\n" + "trn2 z1.d, z2.d, z3.d\n" + "ld1rqw z7.s, p7/z, [a_ptr3]\n" + "trn2 z2.d, z4.d, z5.d\n" + "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + "ld1rqw z5.s, p7/z, [a_ptr1]\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + "subs %[loops], %[loops], #0x1\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + "add %[a_ptr0], %[a_ptr0], #0x20\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + "ld1rqw z8.s, p7/z, [a_ptr4]\n" + ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" + "add a_ptr1, a_ptr1, #0x20\n" + ".inst 
0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" + "add a_ptr2, a_ptr2, #0x20\n" + ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" + "add a_ptr3, a_ptr3, #0x20\n" + ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" + "ld1rqw z9.s, p7/z, [a_ptr5]\n" + ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "trn2 z10.d, z8.d, z9.d\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #-0x10]\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + "add a_ptr4, a_ptr4, #0x20\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + "add a_ptr5, a_ptr5, #0x20\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + "trn1 z0.d, z4.d, z5.d\n" + ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" + ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" + ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" + ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" + "trn1 z1.d, z6.d, z7.d\n" + ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "trn1 z2.d, z8.d, z9.d\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "trn2 z9.d, z6.d, z7.d\n" + "trn2 z8.d, z4.d, z5.d\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #-0x10]\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #-0x10]\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n" + ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" + ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" + ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" + ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n" + ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #-0x10]\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + "trn1 z8.d, z0.d, z1.d\n" + ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" + ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" + ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" + ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" + "trn1 z9.d, z2.d, z3.d\n" + ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" + ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" + ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" + ".inst 0x64aae5fb 
// fmmla z27.s, z15.s, z10.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" + "trn1 z10.d, z4.d, z5.d\n" + "b.ne 3b\n" + "2:\n" + "cbz %[regs], 4f\n" + "trn2 z0.d, z0.d, z1.d\n" + "ld1rqw z6.s, p7/z, [a_ptr2]\n" + "trn2 z1.d, z2.d, z3.d\n" + "ld1rqw z7.s, p7/z, [a_ptr3]\n" + "trn2 z2.d, z4.d, z5.d\n" + "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + "ld1rqw z5.s, p7/z, [a_ptr1]\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + "addvl a_ptr3, a_ptr3, #2\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + "ld1rqw z8.s, p7/z, [a_ptr4]\n" + ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" + ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" + ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" + ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" + "ld1rqw z9.s, p7/z, [a_ptr5]\n" + ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "trn2 z10.d, z8.d, z9.d\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + "trn1 z0.d, z4.d, z5.d\n" + ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" + ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" + ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" + ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" + "trn1 z1.d, z6.d, z7.d\n" + ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "trn1 z2.d, z8.d, z9.d\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "trn2 z9.d, z6.d, z7.d\n" + "trn2 z8.d, z4.d, z5.d\n" + "ld1rqw z4.s, p6/z, [a_ptr4, #0x10]\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + "ld1rqw z5.s, p6/z, [a_ptr5, #0x10]\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + "addvl a_ptr4, a_ptr4, #2\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + "addvl a_ptr5, a_ptr5, #2\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" + ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" + "addvl %[a_ptr0], %[a_ptr0], #2\n" + ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" + ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" + ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n" + ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + "addvl 
%[b_ptr0], %[b_ptr0], #-4\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + "addvl a_ptr1, a_ptr1, #2\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + "addvl a_ptr2, a_ptr2, #2\n" + "trn1 z8.d, z0.d, z1.d\n" + ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" + ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" + ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" + ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" + "trn1 z9.d, z2.d, z3.d\n" + ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" + ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" + ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" + ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" + "trn1 z10.d, z4.d, z5.d\n" + "cbz %[blocks], 5f\n" + "trn2 z0.d, z0.d, z1.d\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + "trn2 z1.d, z2.d, z3.d\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "trn2 z2.d, z4.d, z5.d\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "subs %[blocks], %[blocks], #0x1\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" + ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" + ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" + ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" + ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" + ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" + ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" + ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" + "b.eq 5f\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" + ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" + ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" + ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" + ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" + ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" + ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" + ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" + "b 5f\n" + "4:\n" + "trn2 z0.d, z0.d, z1.d\n" + "ld1rqw z6.s, p6/z, [a_ptr2]\n" + "trn2 z1.d, z2.d, z3.d\n" + "ld1rqw z7.s, p6/z, [a_ptr3]\n" + "trn2 z2.d, z4.d, z5.d\n" + "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + "ld1rqw z5.s, p6/z, [a_ptr1]\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + "addvl %[a_ptr0], %[a_ptr0], #1\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + "addvl a_ptr1, a_ptr1, #1\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + "ld1rqw z8.s, p6/z, [a_ptr4]\n" + ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" + "addvl a_ptr2, a_ptr2, #1\n" + ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" + "addvl a_ptr3, a_ptr3, #1\n" + ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" + "addvl a_ptr4, a_ptr4, #1\n" + ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" + "ld1rqw z9.s, p6/z, [a_ptr5]\n" + ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + ".inst 0x64aae5da // fmmla z26.s, 
z14.s, z10.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + "addvl %[b_ptr0], %[b_ptr0], #4\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + "addvl a_ptr5, a_ptr5, #1\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + "trn1 z0.d, z4.d, z5.d\n" + ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" + ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" + ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" + ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" + "trn1 z1.d, z6.d, z7.d\n" + ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" + ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" + ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" + ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" + "trn1 z2.d, z8.d, z9.d\n" + "cbz %[blocks], 5f\n" + "trn2 z10.d, z8.d, z9.d\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + "trn2 z9.d, z6.d, z7.d\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "trn2 z8.d, z4.d, z5.d\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "subs %[blocks], %[blocks], #0x1\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" + ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" + ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" + ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" + ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" + ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" + ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" + ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" + "b.eq 5f\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" + ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" + ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" + ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" + ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" + ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" + ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" + ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" + "5:\n" + "ld1rw z14.s, p7/z, [%[minptr]]\n" + "ld1rw z15.s, p7/z, [%[maxptr]]\n" + "fmax z16.s, p7/m, z16.s, z14.s\n" + "fmax z17.s, p7/m, z17.s, z14.s\n" + "fmax z18.s, p7/m, z18.s, z14.s\n" + "fmax z19.s, p7/m, z19.s, z14.s\n" + "fmin z16.s, p7/m, z16.s, z15.s\n" + "fmin z17.s, p7/m, z17.s, z15.s\n" + "fmin z18.s, p7/m, z18.s, z15.s\n" + "fmin z19.s, p7/m, z19.s, z15.s\n" + "fmax z20.s, p7/m, z20.s, z14.s\n" + "uzp1 z0.s, z16.s, z17.s\n" + "uzp2 z1.s, z16.s, z17.s\n" + "uzp1 z2.s, z18.s, z19.s\n" + "uzp2 z3.s, z18.s, z19.s\n" + "st1w z0.s, p0, [%[c_ptr0]]\n" + "fmin z20.s, p7/m, z20.s, z15.s\n" + "fmax z21.s, p7/m, z21.s, z14.s\n" + "fmax z22.s, p7/m, z22.s, z14.s\n" + "st1w z1.s, p0, [c_ptr1]\n" + "fmax z23.s, p7/m, z23.s, z14.s\n" + "fmax z24.s, p7/m, z24.s, z14.s\n" + "fmin z21.s, p7/m, z21.s, z15.s\n" + "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n" 
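// Same clamp-and-store epilogue as the smaller cases, but with six real rows both
// halves of the z24/z25 and z26/z27 tile pairs are live: the uzp1 and uzp2
// results are written through c_ptr4 and c_ptr5 respectively, whereas the odd-row
// cases store only the uzp1 half and discard the zero-padded partner lane.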
+ "fmin z22.s, p7/m, z22.s, z15.s\n" + "addvl %[c_ptr0], %[c_ptr0], #2\n" + "fmin z23.s, p7/m, z23.s, z15.s\n" + "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n" + "uzp1 z4.s, z20.s, z21.s\n" + "uzp2 z5.s, z20.s, z21.s\n" + "fmin z24.s, p7/m, z24.s, z15.s\n" + "uzp1 z6.s, z22.s, z23.s\n" + "st1w z4.s, p0, [c_ptr2]\n" + "uzp2 z7.s, z22.s, z23.s\n" + "fmax z25.s, p7/m, z25.s, z14.s\n" + "fmax z26.s, p7/m, z26.s, z14.s\n" + "st1w z5.s, p0, [c_ptr3]\n" + "fmax z27.s, p7/m, z27.s, z14.s\n" + "fmin z25.s, p7/m, z25.s, z15.s\n" + "st1w z6.s, p1, [c_ptr2, #1, MUL VL]\n" + "fmin z26.s, p7/m, z26.s, z15.s\n" + "fmin z27.s, p7/m, z27.s, z15.s\n" + "uzp1 z8.s, z24.s, z25.s\n" + "st1w z7.s, p1, [c_ptr3, #1, MUL VL]\n" + "uzp2 z9.s, z24.s, z25.s\n" + "uzp1 z10.s, z26.s, z27.s\n" + "uzp2 z11.s, z26.s, z27.s\n" + "st1w z8.s, p0, [c_ptr4]\n" + "st1w z9.s, p0, [c_ptr5]\n" + "st1w z10.s, p1, [c_ptr4, #1, MUL VL]\n" + "st1w z11.s, p1, [c_ptr5, #1, MUL VL]\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq a_ptr4\n" + ".unreq a_ptr5\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + ".unreq c_ptr4\n" + ".unreq c_ptr5\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) + : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers) + : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "cc", "memory" + ); + break; + case 7: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "a_ptr4 .req X3\n" + "a_ptr5 .req X4\n" + "a_ptr6 .req X5\n" + "c_ptr1 .req X6\n" + "c_ptr2 .req X7\n" + "c_ptr3 .req X8\n" + "c_ptr4 .req X9\n" + "c_ptr5 .req X10\n" + "c_ptr6 .req X11\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "add a_ptr4, a_ptr3, %[lda]\n" + "add c_ptr4, c_ptr3, %[ldc]\n" + "add a_ptr5, a_ptr4, %[lda]\n" + "add c_ptr5, c_ptr4, %[ldc]\n" + "add a_ptr6, a_ptr5, %[lda]\n" + "add c_ptr6, c_ptr5, %[ldc]\n" + "whilelt p6.s, %[temp], %[leftovers]\n" + "whilelt p0.s, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "ptrue p7.s\n" + "whilelt p1.s, %[temp], %[width]\n" + "cbnz %[append], 1f\n" + "mov z7.s, #0\n" + "ld1w z15.s, p0/z, [%[biasptr]]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "add a_ptr1, a_ptr1, #0x10\n" + "zip1 z16.s, z15.s, z15.s\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "zip2 z17.s, z15.s, z15.s\n" + "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n" + "trn1 z8.d, z0.d, z1.d\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "add a_ptr2, a_ptr2, #0x10\n" + "zip1 z18.s, z15.s, z15.s\n" + "ld1rqw z5.s, p7/z, [a_ptr5]\n" + "zip2 z19.s, z15.s, z15.s\n" + "ld1rqw z6.s, p7/z, [a_ptr6]\n" + "trn1 z9.d, z2.d, z3.d\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + "mov z20.d, z16.d\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "trn1 z10.d, z4.d, z5.d\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "trn1 z11.d, z6.d, z7.d\n" + "ld1w z15.s, 
p7/z, [%[b_ptr0], #3, MUL VL]\n" + "mov z21.d, z17.d\n" + "add a_ptr3, a_ptr3, #0x10\n" + "mov z22.d, z18.d\n" + "add a_ptr4, a_ptr4, #0x10\n" + "mov z23.d, z19.d\n" + "add a_ptr5, a_ptr5, #0x10\n" + "mov z24.d, z16.d\n" + "add a_ptr6, a_ptr6, #0x10\n" + "mov z25.d, z17.d\n" + "addvl %[b_ptr0], %[b_ptr0], #4\n" + "mov z26.d, z18.d\n" + "mov z27.d, z19.d\n" + "mov z28.d, z16.d\n" + "mov z29.d, z17.d\n" + "mov z30.d, z18.d\n" + "mov z31.d, z19.d\n" + "cbz %[loops], 2f\n" + "b 3f\n" + "1:\n" + "mov z7.s, #0\n" + "ld1w z13.s, p0/z, [%[c_ptr0]]\n" + "ld1w z14.s, p0/z, [c_ptr1]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "add a_ptr1, a_ptr1, #0x10\n" + "zip1 z16.s, z13.s, z14.s\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "zip2 z17.s, z13.s, z14.s\n" + "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" + "trn1 z8.d, z0.d, z1.d\n" + "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "add a_ptr2, a_ptr2, #0x10\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "add a_ptr3, a_ptr3, #0x10\n" + "zip1 z18.s, z13.s, z14.s\n" + "ld1rqw z5.s, p7/z, [a_ptr5]\n" + "zip2 z19.s, z13.s, z14.s\n" + "ld1w z13.s, p0/z, [c_ptr2]\n" + "trn1 z9.d, z2.d, z3.d\n" + "ld1w z14.s, p0/z, [c_ptr3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6]\n" + "add a_ptr4, a_ptr4, #0x10\n" + "trn1 z10.d, z4.d, z5.d\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + "zip1 z20.s, z13.s, z14.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "zip2 z21.s, z13.s, z14.s\n" + "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n" + "trn1 z11.d, z6.d, z7.d\n" + "ld1w z14.s, p1/z, [c_ptr3, #1, MUL VL]\n" + "add a_ptr5, a_ptr5, #0x10\n" + "add a_ptr6, a_ptr6, #0x10\n" + "zip1 z22.s, z13.s, z14.s\n" + "zip2 z23.s, z13.s, z14.s\n" + "ld1w z13.s, p0/z, [c_ptr4]\n" + "ld1w z14.s, p0/z, [c_ptr5]\n" + "zip1 z24.s, z13.s, z14.s\n" + "zip2 z25.s, z13.s, z14.s\n" + "ld1w z13.s, p1/z, [c_ptr4, #1, MUL VL]\n" + "ld1w z14.s, p1/z, [c_ptr5, #1, MUL VL]\n" + "zip1 z26.s, z13.s, z14.s\n" + "zip2 z27.s, z13.s, z14.s\n" + "ld1w z13.s, p0/z, [c_ptr6]\n" + "mov z14.s, #0\n" + "zip1 z28.s, z13.s, z14.s\n" + "zip2 z29.s, z13.s, z14.s\n" + "ld1w z13.s, p1/z, [c_ptr6, #1, MUL VL]\n" + "mov z14.s, #0\n" + "zip1 z30.s, z13.s, z14.s\n" + "zip2 z31.s, z13.s, z14.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #4\n" + "cbz %[loops], 2f\n" + "3:\n" + "trn2 z0.d, z0.d, z1.d\n" + "subs %[loops], %[loops], #0x1\n" + "trn2 z1.d, z2.d, z3.d\n" + "trn2 z2.d, z4.d, z5.d\n" + "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" + "trn2 z3.d, z6.d, z7.d\n" + "ld1rqw z5.s, p7/z, [a_ptr1]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + "ld1rqw z6.s, p7/z, [a_ptr2]\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + "ld1rqw z7.s, p7/z, [a_ptr3]\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + "add %[a_ptr0], %[a_ptr0], #0x20\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + "ld1rqw z8.s, p7/z, [a_ptr4]\n" + ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" + "add a_ptr1, a_ptr1, #0x20\n" + ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" + "add a_ptr2, a_ptr2, #0x20\n" + ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" + "add a_ptr3, a_ptr3, #0x20\n" + ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" + "ld1rqw z9.s, p7/z, [a_ptr5]\n" + ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" + "add a_ptr4, a_ptr4, #0x20\n" + ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" + "add a_ptr5, a_ptr5, #0x20\n" + ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" + 
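// Main loop (label 3) for the 7-row case: the rows are processed as three
// trn1/trn2 pairs plus the odd seventh row, which is teamed with a zeroed
// register (z7/z11) so the same 2x2 tile shape is used throughout. Each
// iteration advances every A pointer by 0x20 bytes, i.e. eight fp32 K values,
// consumed in two FMMLA passes over the even (trn1) and odd (trn2) 64-bit lane
// pairs.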
".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" + "ld1rqw z10.s, p7/z, [a_ptr6]\n" + ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "mov z11.s, #0\n" + "add a_ptr6, a_ptr6, #0x20\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + "trn1 z0.d, z4.d, z5.d\n" + ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" + ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" + ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" + ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" + "trn1 z1.d, z6.d, z7.d\n" + ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" + ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" + ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" + ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" + "trn1 z2.d, z8.d, z9.d\n" + ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "trn1 z3.d, z10.d, z11.d\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "trn2 z11.d, z10.d, z11.d\n" + "trn2 z10.d, z8.d, z9.d\n" + "trn2 z9.d, z6.d, z7.d\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #-0x10]\n" + "trn2 z8.d, z4.d, z5.d\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #-0x10]\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #-0x10]\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n" + ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" + ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" + ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" + ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n" + ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" + ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" + ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" + ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #-0x10]\n" + ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + "mov z7.s, #0\n" + "ld1rqw z3.s, p7/z, [a_ptr3, #-0x10]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + "trn1 z8.d, z0.d, z1.d\n" + ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" + ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" + ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" + ".inst 0x64a9e5f7 // fmmla z23.s, 
z15.s, z9.s\n" + "trn1 z9.d, z2.d, z3.d\n" + ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" + ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" + ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" + ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" + "trn1 z10.d, z4.d, z5.d\n" + ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" + ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" + ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" + ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" + "trn1 z11.d, z6.d, z7.d\n" + "b.ne 3b\n" + "2:\n" + "cbz %[regs], 4f\n" + "trn2 z0.d, z0.d, z1.d\n" + "trn2 z1.d, z2.d, z3.d\n" + "trn2 z2.d, z4.d, z5.d\n" + "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" + "trn2 z3.d, z6.d, z7.d\n" + "ld1rqw z5.s, p7/z, [a_ptr1]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + "ld1rqw z6.s, p7/z, [a_ptr2]\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + "ld1rqw z7.s, p7/z, [a_ptr3]\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + "ld1rqw z8.s, p7/z, [a_ptr4]\n" + ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" + ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" + ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" + ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" + "ld1rqw z9.s, p7/z, [a_ptr5]\n" + ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" + ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" + ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" + ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" + "ld1rqw z10.s, p7/z, [a_ptr6]\n" + ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "mov z11.s, #0\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + "trn1 z0.d, z4.d, z5.d\n" + ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" + ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" + ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" + ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" + "trn1 z1.d, z6.d, z7.d\n" + ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" + ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" + ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" + ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" + "trn1 z2.d, z8.d, z9.d\n" + ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "trn1 z3.d, z10.d, z11.d\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "trn2 z11.d, z10.d, z11.d\n" + "trn2 z10.d, z8.d, z9.d\n" + "trn2 z9.d, z6.d, z7.d\n" + "ld1rqw z6.s, p6/z, [a_ptr6, #0x10]\n" + "trn2 z8.d, z4.d, z5.d\n" + "ld1rqw z4.s, p6/z, [a_ptr4, #0x10]\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + "ld1rqw 
z5.s, p6/z, [a_ptr5, #0x10]\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + "addvl a_ptr4, a_ptr4, #2\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + "addvl a_ptr5, a_ptr5, #2\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" + ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" + "addvl %[a_ptr0], %[a_ptr0], #2\n" + ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" + "addvl a_ptr6, a_ptr6, #2\n" + ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" + ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n" + ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" + "addvl a_ptr1, a_ptr1, #2\n" + ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" + ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" + ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n" + ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + "mov z7.s, #0\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + "addvl %[b_ptr0], %[b_ptr0], #-4\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + "addvl a_ptr2, a_ptr2, #2\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + "addvl a_ptr3, a_ptr3, #2\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + "trn1 z8.d, z0.d, z1.d\n" + ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" + ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" + ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" + ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" + "trn1 z9.d, z2.d, z3.d\n" + ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" + ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" + ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" + ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" + "trn1 z10.d, z4.d, z5.d\n" + ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n" + ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n" + ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n" + ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n" + "trn1 z11.d, z6.d, z7.d\n" + "cbz %[blocks], 5f\n" + "trn2 z0.d, z0.d, z1.d\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + "trn2 z1.d, z2.d, z3.d\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "trn2 z2.d, z4.d, z5.d\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "trn2 z3.d, z6.d, z7.d\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + "subs %[blocks], %[blocks], #0x1\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" + ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" + ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" + ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" + ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" + ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" + ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" + ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" + ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n" + ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n" + ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n" + ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n" 
+ "b.eq 5f\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" + ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" + ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" + ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" + ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" + ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" + ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" + ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" + ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n" + ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n" + ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n" + ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n" + "b 5f\n" + "4:\n" + "trn2 z0.d, z0.d, z1.d\n" + "trn2 z1.d, z2.d, z3.d\n" + "trn2 z2.d, z4.d, z5.d\n" + "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n" + "trn2 z3.d, z6.d, z7.d\n" + "ld1rqw z5.s, p6/z, [a_ptr1]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + "ld1rqw z6.s, p6/z, [a_ptr2]\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + "ld1rqw z7.s, p6/z, [a_ptr3]\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + "addvl %[a_ptr0], %[a_ptr0], #1\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + "ld1rqw z8.s, p6/z, [a_ptr4]\n" + ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" + "addvl a_ptr1, a_ptr1, #1\n" + ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" + "addvl a_ptr2, a_ptr2, #1\n" + ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" + "addvl a_ptr3, a_ptr3, #1\n" + ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" + "ld1rqw z9.s, p6/z, [a_ptr5]\n" + ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" + "addvl a_ptr4, a_ptr4, #1\n" + ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" + "addvl a_ptr5, a_ptr5, #1\n" + ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" + ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" + "ld1rqw z10.s, p6/z, [a_ptr6]\n" + ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "mov z11.s, #0\n" + "addvl %[b_ptr0], %[b_ptr0], #4\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + "addvl a_ptr6, a_ptr6, #1\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + "trn1 z0.d, z4.d, z5.d\n" + ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" + ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" + ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" + ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" + "trn1 z1.d, z6.d, z7.d\n" + ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" + ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" + ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" + ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" + "trn1 z2.d, z8.d, z9.d\n" + ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n" + ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n" + ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n" + ".inst 0x64a3e5ff // 
fmmla z31.s, z15.s, z3.s\n" + "trn1 z3.d, z10.d, z11.d\n" + "cbz %[blocks], 5f\n" + "trn2 z11.d, z10.d, z11.d\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + "trn2 z10.d, z8.d, z9.d\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "trn2 z9.d, z6.d, z7.d\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "trn2 z8.d, z4.d, z5.d\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + "subs %[blocks], %[blocks], #0x1\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" + ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" + ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" + ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" + ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" + ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" + ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" + ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" + ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n" + ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n" + ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n" + ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n" + "b.eq 5f\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" + ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" + ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" + ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" + ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" + ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" + ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" + ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" + ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n" + ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n" + ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n" + ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n" + "5:\n" + "ld1rw z14.s, p7/z, [%[minptr]]\n" + "ld1rw z15.s, p7/z, [%[maxptr]]\n" + "fmax z16.s, p7/m, z16.s, z14.s\n" + "fmax z17.s, p7/m, z17.s, z14.s\n" + "fmax z18.s, p7/m, z18.s, z14.s\n" + "fmax z19.s, p7/m, z19.s, z14.s\n" + "fmin z16.s, p7/m, z16.s, z15.s\n" + "fmin z17.s, p7/m, z17.s, z15.s\n" + "fmin z18.s, p7/m, z18.s, z15.s\n" + "fmin z19.s, p7/m, z19.s, z15.s\n" + "fmax z20.s, p7/m, z20.s, z14.s\n" + "uzp1 z0.s, z16.s, z17.s\n" + "uzp2 z1.s, z16.s, z17.s\n" + "uzp1 z2.s, z18.s, z19.s\n" + "uzp2 z3.s, z18.s, z19.s\n" + "st1w z0.s, p0, [%[c_ptr0]]\n" + "fmin z20.s, p7/m, z20.s, z15.s\n" + "fmax z21.s, p7/m, z21.s, z14.s\n" + "fmax z22.s, p7/m, z22.s, z14.s\n" + "st1w z1.s, p0, [c_ptr1]\n" + "fmax z23.s, p7/m, z23.s, z14.s\n" + "fmax z24.s, p7/m, z24.s, z14.s\n" + "fmin z21.s, p7/m, z21.s, z15.s\n" + "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n" + "fmin z22.s, p7/m, z22.s, z15.s\n" + "addvl %[c_ptr0], %[c_ptr0], #2\n" + "fmin z23.s, p7/m, z23.s, z15.s\n" + "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n" + "uzp1 z4.s, z20.s, z21.s\n" + "uzp2 z5.s, z20.s, z21.s\n" + "fmin z24.s, p7/m, z24.s, z15.s\n" + "uzp1 z6.s, z22.s, z23.s\n" + "st1w z4.s, p0, [c_ptr2]\n" + "uzp2 z7.s, z22.s, z23.s\n" + "fmax z25.s, p7/m, z25.s, z14.s\n" + "fmax z26.s, p7/m, z26.s, z14.s\n" + "st1w 
z5.s, p0, [c_ptr3]\n" + "fmax z27.s, p7/m, z27.s, z14.s\n" + "fmax z28.s, p7/m, z28.s, z14.s\n" + "fmin z25.s, p7/m, z25.s, z15.s\n" + "st1w z6.s, p1, [c_ptr2, #1, MUL VL]\n" + "fmin z26.s, p7/m, z26.s, z15.s\n" + "fmin z27.s, p7/m, z27.s, z15.s\n" + "fmin z28.s, p7/m, z28.s, z15.s\n" + "st1w z7.s, p1, [c_ptr3, #1, MUL VL]\n" + "uzp1 z8.s, z24.s, z25.s\n" + "uzp2 z9.s, z24.s, z25.s\n" + "uzp1 z10.s, z26.s, z27.s\n" + "uzp2 z11.s, z26.s, z27.s\n" + "st1w z8.s, p0, [c_ptr4]\n" + "fmax z29.s, p7/m, z29.s, z14.s\n" + "fmax z30.s, p7/m, z30.s, z14.s\n" + "fmax z31.s, p7/m, z31.s, z14.s\n" + "st1w z9.s, p0, [c_ptr5]\n" + "fmin z29.s, p7/m, z29.s, z15.s\n" + "fmin z30.s, p7/m, z30.s, z15.s\n" + "st1w z10.s, p1, [c_ptr4, #1, MUL VL]\n" + "fmin z31.s, p7/m, z31.s, z15.s\n" + "uzp1 z12.s, z28.s, z29.s\n" + "st1w z11.s, p1, [c_ptr5, #1, MUL VL]\n" + "uzp1 z13.s, z30.s, z31.s\n" + "st1w z12.s, p0, [c_ptr6]\n" + "st1w z13.s, p1, [c_ptr6, #1, MUL VL]\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq a_ptr4\n" + ".unreq a_ptr5\n" + ".unreq a_ptr6\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + ".unreq c_ptr4\n" + ".unreq c_ptr5\n" + ".unreq c_ptr6\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) + : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers) + : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "cc", "memory" + ); + break; + default: + case 8: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "a_ptr4 .req X3\n" + "a_ptr5 .req X4\n" + "a_ptr6 .req X5\n" + "a_ptr7 .req X6\n" + "c_ptr1 .req X7\n" + "c_ptr2 .req X8\n" + "c_ptr3 .req X9\n" + "c_ptr4 .req X10\n" + "c_ptr5 .req X11\n" + "c_ptr6 .req X12\n" + "c_ptr7 .req X13\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "add a_ptr4, a_ptr3, %[lda]\n" + "add c_ptr4, c_ptr3, %[ldc]\n" + "add a_ptr5, a_ptr4, %[lda]\n" + "add c_ptr5, c_ptr4, %[ldc]\n" + "add a_ptr6, a_ptr5, %[lda]\n" + "add c_ptr6, c_ptr5, %[ldc]\n" + "add a_ptr7, a_ptr6, %[lda]\n" + "add c_ptr7, c_ptr6, %[ldc]\n" + "whilelt p6.s, %[temp], %[leftovers]\n" + "whilelt p0.s, %[temp], %[width]\n" + "incw %[temp], all, mul #1\n" + "ptrue p7.s\n" + "whilelt p1.s, %[temp], %[width]\n" + "cbnz %[append], 1f\n" + "ld1w z15.s, p0/z, [%[biasptr]]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "add a_ptr1, a_ptr1, #0x10\n" + "zip1 z16.s, z15.s, z15.s\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "zip2 z17.s, z15.s, z15.s\n" + "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n" + "trn1 z8.d, z0.d, z1.d\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "add a_ptr2, a_ptr2, #0x10\n" + "zip1 z18.s, z15.s, z15.s\n" + "ld1rqw z5.s, p7/z, [a_ptr5]\n" + "zip2 z19.s, z15.s, z15.s\n" + "ld1rqw z6.s, p7/z, [a_ptr6]\n" + "trn1 z9.d, z2.d, z3.d\n" + "ld1rqw z7.s, p7/z, [a_ptr7]\n" + "mov z20.d, z16.d\n" + 
"ld1w z12.s, p7/z, [%[b_ptr0]]\n" + "trn1 z10.d, z4.d, z5.d\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "mov z21.d, z17.d\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "trn1 z11.d, z6.d, z7.d\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "mov z22.d, z18.d\n" + "add a_ptr3, a_ptr3, #0x10\n" + "mov z23.d, z19.d\n" + "add a_ptr4, a_ptr4, #0x10\n" + "mov z24.d, z16.d\n" + "add a_ptr5, a_ptr5, #0x10\n" + "mov z25.d, z17.d\n" + "add a_ptr6, a_ptr6, #0x10\n" + "mov z26.d, z18.d\n" + "add a_ptr7, a_ptr7, #0x10\n" + "mov z27.d, z19.d\n" + "addvl %[b_ptr0], %[b_ptr0], #4\n" + "mov z28.d, z16.d\n" + "mov z29.d, z17.d\n" + "mov z30.d, z18.d\n" + "mov z31.d, z19.d\n" + "cbz %[loops], 2f\n" + "b 3f\n" + "1:\n" + "ld1w z13.s, p0/z, [%[c_ptr0]]\n" + "ld1w z14.s, p0/z, [c_ptr1]\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" + "add %[a_ptr0], %[a_ptr0], #0x10\n" + "ld1rqw z1.s, p7/z, [a_ptr1]\n" + "add a_ptr1, a_ptr1, #0x10\n" + "zip1 z16.s, z13.s, z14.s\n" + "ld1rqw z2.s, p7/z, [a_ptr2]\n" + "zip2 z17.s, z13.s, z14.s\n" + "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" + "trn1 z8.d, z0.d, z1.d\n" + "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n" + "ld1rqw z3.s, p7/z, [a_ptr3]\n" + "add a_ptr2, a_ptr2, #0x10\n" + "ld1rqw z4.s, p7/z, [a_ptr4]\n" + "add a_ptr3, a_ptr3, #0x10\n" + "zip1 z18.s, z13.s, z14.s\n" + "ld1rqw z5.s, p7/z, [a_ptr5]\n" + "zip2 z19.s, z13.s, z14.s\n" + "ld1w z13.s, p0/z, [c_ptr2]\n" + "trn1 z9.d, z2.d, z3.d\n" + "ld1w z14.s, p0/z, [c_ptr3]\n" + "ld1rqw z6.s, p7/z, [a_ptr6]\n" + "add a_ptr4, a_ptr4, #0x10\n" + "trn1 z10.d, z4.d, z5.d\n" + "ld1rqw z7.s, p7/z, [a_ptr7]\n" + "zip1 z20.s, z13.s, z14.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + "zip2 z21.s, z13.s, z14.s\n" + "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n" + "ld1w z14.s, p1/z, [c_ptr3, #1, MUL VL]\n" + "add a_ptr5, a_ptr5, #0x10\n" + "trn1 z11.d, z6.d, z7.d\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "add a_ptr6, a_ptr6, #0x10\n" + "zip1 z22.s, z13.s, z14.s\n" + "add a_ptr7, a_ptr7, #0x10\n" + "zip2 z23.s, z13.s, z14.s\n" + "ld1w z13.s, p0/z, [c_ptr4]\n" + "ld1w z14.s, p0/z, [c_ptr5]\n" + "zip1 z24.s, z13.s, z14.s\n" + "zip2 z25.s, z13.s, z14.s\n" + "ld1w z13.s, p1/z, [c_ptr4, #1, MUL VL]\n" + "ld1w z14.s, p1/z, [c_ptr5, #1, MUL VL]\n" + "zip1 z26.s, z13.s, z14.s\n" + "zip2 z27.s, z13.s, z14.s\n" + "ld1w z13.s, p0/z, [c_ptr6]\n" + "ld1w z14.s, p0/z, [c_ptr7]\n" + "zip1 z28.s, z13.s, z14.s\n" + "zip2 z29.s, z13.s, z14.s\n" + "ld1w z13.s, p1/z, [c_ptr6, #1, MUL VL]\n" + "ld1w z14.s, p1/z, [c_ptr7, #1, MUL VL]\n" + "zip1 z30.s, z13.s, z14.s\n" + "zip2 z31.s, z13.s, z14.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #4\n" + "cbz %[loops], 2f\n" + "3:\n" + "trn2 z0.d, z0.d, z1.d\n" + "subs %[loops], %[loops], #0x1\n" + "trn2 z1.d, z2.d, z3.d\n" + "trn2 z2.d, z4.d, z5.d\n" + "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" + "trn2 z3.d, z6.d, z7.d\n" + "ld1rqw z5.s, p7/z, [a_ptr1]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + "ld1rqw z6.s, p7/z, [a_ptr2]\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + "ld1rqw z7.s, p7/z, [a_ptr3]\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + "add %[a_ptr0], %[a_ptr0], #0x20\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + "ld1rqw z8.s, p7/z, [a_ptr4]\n" + ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" + "add a_ptr1, a_ptr1, #0x20\n" + ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" + "add a_ptr2, a_ptr2, #0x20\n" + ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" + "add a_ptr3, 
a_ptr3, #0x20\n" + ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" + "ld1rqw z9.s, p7/z, [a_ptr5]\n" + ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" + "add a_ptr4, a_ptr4, #0x20\n" + ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" + "add a_ptr5, a_ptr5, #0x20\n" + ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" + ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" + "ld1rqw z10.s, p7/z, [a_ptr6]\n" + ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + "ld1rqw z11.s, p7/z, [a_ptr7]\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + "add a_ptr6, a_ptr6, #0x20\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + "add a_ptr7, a_ptr7, #0x20\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + "trn1 z0.d, z4.d, z5.d\n" + ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" + ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" + ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" + ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" + "trn1 z1.d, z6.d, z7.d\n" + ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" + ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" + ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" + ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" + "trn1 z2.d, z8.d, z9.d\n" + ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "trn1 z3.d, z10.d, z11.d\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "trn2 z11.d, z10.d, z11.d\n" + "trn2 z10.d, z8.d, z9.d\n" + "trn2 z9.d, z6.d, z7.d\n" + "ld1rqw z6.s, p7/z, [a_ptr6, #-0x10]\n" + "trn2 z8.d, z4.d, z5.d\n" + "ld1rqw z4.s, p7/z, [a_ptr4, #-0x10]\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + "ld1rqw z5.s, p7/z, [a_ptr5, #-0x10]\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + "ld1rqw z7.s, p7/z, [a_ptr7, #-0x10]\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n" + ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" + ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" + ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" + ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" + "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n" + ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" + ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" + ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" + ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" + "ld1rqw z2.s, p7/z, [a_ptr2, #-0x10]\n" + ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + "ld1rqw z3.s, p7/z, 
[a_ptr3, #-0x10]\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + "trn1 z8.d, z0.d, z1.d\n" + ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" + ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" + ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" + ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" + "trn1 z9.d, z2.d, z3.d\n" + ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" + ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" + ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" + ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" + "trn1 z10.d, z4.d, z5.d\n" + ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" + ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" + ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" + ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" + "trn1 z11.d, z6.d, z7.d\n" + "b.ne 3b\n" + "2:\n" + "cbz %[regs], 4f\n" + "trn2 z0.d, z0.d, z1.d\n" + "trn2 z1.d, z2.d, z3.d\n" + "trn2 z2.d, z4.d, z5.d\n" + "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" + "trn2 z3.d, z6.d, z7.d\n" + "ld1rqw z5.s, p7/z, [a_ptr1]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + "ld1rqw z6.s, p7/z, [a_ptr2]\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + "ld1rqw z7.s, p7/z, [a_ptr3]\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + "ld1rqw z8.s, p7/z, [a_ptr4]\n" + ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" + ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" + ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" + ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" + "ld1rqw z9.s, p7/z, [a_ptr5]\n" + ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" + ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" + ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" + ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" + "ld1rqw z10.s, p7/z, [a_ptr6]\n" + ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + "ld1rqw z11.s, p7/z, [a_ptr7]\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + "trn1 z0.d, z4.d, z5.d\n" + ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" + ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" + ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" + ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" + "trn1 z1.d, z6.d, z7.d\n" + ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" + ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" + ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" + ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" + "trn1 z2.d, z8.d, z9.d\n" + ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + ".inst 0x64a3e5ff 
// fmmla z31.s, z15.s, z3.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "trn1 z3.d, z10.d, z11.d\n" + "addvl %[b_ptr0], %[b_ptr0], #16\n" + "trn2 z11.d, z10.d, z11.d\n" + "trn2 z10.d, z8.d, z9.d\n" + "trn2 z9.d, z6.d, z7.d\n" + "ld1rqw z6.s, p6/z, [a_ptr6, #0x10]\n" + "trn2 z8.d, z4.d, z5.d\n" + "ld1rqw z4.s, p6/z, [a_ptr4, #0x10]\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + "ld1rqw z5.s, p6/z, [a_ptr5, #0x10]\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + "ld1rqw z7.s, p6/z, [a_ptr7, #0x10]\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + "addvl a_ptr4, a_ptr4, #2\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" + ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" + "addvl %[a_ptr0], %[a_ptr0], #2\n" + ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" + "addvl a_ptr5, a_ptr5, #2\n" + ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" + "addvl a_ptr6, a_ptr6, #2\n" + ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" + "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n" + ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" + "addvl a_ptr1, a_ptr1, #2\n" + ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" + "addvl a_ptr7, a_ptr7, #2\n" + ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" + ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" + "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n" + ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" + ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" + ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" + ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + "addvl %[b_ptr0], %[b_ptr0], #-4\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + "addvl a_ptr2, a_ptr2, #2\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + "addvl a_ptr3, a_ptr3, #2\n" + "trn1 z8.d, z0.d, z1.d\n" + ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" + ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" + ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" + ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" + "trn1 z9.d, z2.d, z3.d\n" + ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" + ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" + ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" + ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" + "trn1 z10.d, z4.d, z5.d\n" + ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n" + ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n" + ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n" + ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n" + "trn1 z11.d, z6.d, z7.d\n" + "cbz %[blocks], 5f\n" + "trn2 z0.d, z0.d, z1.d\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + "trn2 z1.d, z2.d, z3.d\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "trn2 z2.d, z4.d, z5.d\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "trn2 z3.d, z6.d, z7.d\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + "subs %[blocks], %[blocks], #0x1\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" + ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" + ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" + 
".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" + ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" + ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" + ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" + ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" + ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n" + ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n" + ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n" + ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n" + "b.eq 5f\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" + ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" + ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" + ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" + ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" + ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" + ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" + ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" + ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n" + ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n" + ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n" + ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n" + "b 5f\n" + "4:\n" + "trn2 z0.d, z0.d, z1.d\n" + "trn2 z1.d, z2.d, z3.d\n" + "trn2 z2.d, z4.d, z5.d\n" + "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n" + "trn2 z3.d, z6.d, z7.d\n" + "ld1rqw z5.s, p6/z, [a_ptr1]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + "ld1rqw z6.s, p6/z, [a_ptr2]\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + "ld1rqw z7.s, p6/z, [a_ptr3]\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + "addvl %[a_ptr0], %[a_ptr0], #1\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + "ld1rqw z8.s, p6/z, [a_ptr4]\n" + ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" + "addvl a_ptr1, a_ptr1, #1\n" + ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" + "addvl a_ptr2, a_ptr2, #1\n" + ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" + "addvl a_ptr3, a_ptr3, #1\n" + ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" + "ld1rqw z9.s, p6/z, [a_ptr5]\n" + ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" + "addvl a_ptr4, a_ptr4, #1\n" + ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" + "addvl a_ptr5, a_ptr5, #1\n" + ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" + ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" + "ld1rqw z10.s, p6/z, [a_ptr6]\n" + ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + "ld1rqw z11.s, p6/z, [a_ptr7]\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + "addvl %[b_ptr0], %[b_ptr0], #4\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + "addvl a_ptr6, a_ptr6, #1\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + "addvl a_ptr7, a_ptr7, #1\n" + "trn1 z0.d, z4.d, z5.d\n" + ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" + ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" + ".inst 0x64a1e5d6 // fmmla 
z22.s, z14.s, z1.s\n" + ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" + "trn1 z1.d, z6.d, z7.d\n" + ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" + ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" + ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" + ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" + "trn1 z2.d, z8.d, z9.d\n" + ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n" + ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n" + ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n" + ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n" + "trn1 z3.d, z10.d, z11.d\n" + "cbz %[blocks], 5f\n" + "trn2 z11.d, z10.d, z11.d\n" + "ld1w z12.s, p7/z, [%[b_ptr0]]\n" + "trn2 z10.d, z8.d, z9.d\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "trn2 z9.d, z6.d, z7.d\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "trn2 z8.d, z4.d, z5.d\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" + ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" + "subs %[blocks], %[blocks], #0x1\n" + ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" + ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" + ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" + ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" + ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" + ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" + ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" + ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" + ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" + ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" + ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" + ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n" + ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n" + ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n" + ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n" + "b.eq 5f\n" + "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" + ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" + ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" + ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" + ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" + ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" + ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" + ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" + ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" + ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" + ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" + ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" + ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" + ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n" + ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n" + ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n" + ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n" + "5:\n" + "ld1rw z14.s, p7/z, [%[minptr]]\n" + "ld1rw z15.s, p7/z, [%[maxptr]]\n" + "fmax z16.s, p7/m, z16.s, z14.s\n" + "fmax z17.s, p7/m, z17.s, z14.s\n" + "fmax z18.s, p7/m, z18.s, z14.s\n" + "fmax z19.s, p7/m, z19.s, z14.s\n" + "fmin z16.s, p7/m, z16.s, z15.s\n" + "fmin z17.s, p7/m, z17.s, z15.s\n" + "fmin z18.s, p7/m, z18.s, z15.s\n" + "fmin z19.s, p7/m, z19.s, z15.s\n" + "fmax z20.s, p7/m, z20.s, z14.s\n" + "uzp1 z0.s, z16.s, z17.s\n" + "uzp2 z1.s, z16.s, z17.s\n" + "uzp1 z2.s, z18.s, z19.s\n" + "uzp2 z3.s, z18.s, z19.s\n" + "st1w z0.s, p0, [%[c_ptr0]]\n" + "fmin z20.s, p7/m, z20.s, z15.s\n" + "fmax z21.s, p7/m, z21.s, z14.s\n" + "fmax z22.s, p7/m, z22.s, z14.s\n" + "st1w z1.s, p0, [c_ptr1]\n" + "fmax z23.s, p7/m, z23.s, z14.s\n" + "fmax z24.s, p7/m, z24.s, z14.s\n" 
+ "fmin z21.s, p7/m, z21.s, z15.s\n" + "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n" + "fmin z22.s, p7/m, z22.s, z15.s\n" + "addvl %[c_ptr0], %[c_ptr0], #2\n" + "fmin z23.s, p7/m, z23.s, z15.s\n" + "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n" + "uzp1 z4.s, z20.s, z21.s\n" + "uzp2 z5.s, z20.s, z21.s\n" + "fmin z24.s, p7/m, z24.s, z15.s\n" + "uzp1 z6.s, z22.s, z23.s\n" + "st1w z4.s, p0, [c_ptr2]\n" + "uzp2 z7.s, z22.s, z23.s\n" + "fmax z25.s, p7/m, z25.s, z14.s\n" + "fmax z26.s, p7/m, z26.s, z14.s\n" + "st1w z5.s, p0, [c_ptr3]\n" + "fmax z27.s, p7/m, z27.s, z14.s\n" + "fmax z28.s, p7/m, z28.s, z14.s\n" + "fmin z25.s, p7/m, z25.s, z15.s\n" + "st1w z6.s, p1, [c_ptr2, #1, MUL VL]\n" + "fmin z26.s, p7/m, z26.s, z15.s\n" + "fmin z27.s, p7/m, z27.s, z15.s\n" + "fmin z28.s, p7/m, z28.s, z15.s\n" + "st1w z7.s, p1, [c_ptr3, #1, MUL VL]\n" + "uzp1 z8.s, z24.s, z25.s\n" + "uzp2 z9.s, z24.s, z25.s\n" + "uzp1 z10.s, z26.s, z27.s\n" + "uzp2 z11.s, z26.s, z27.s\n" + "st1w z8.s, p0, [c_ptr4]\n" + "fmax z29.s, p7/m, z29.s, z14.s\n" + "fmax z30.s, p7/m, z30.s, z14.s\n" + "fmax z31.s, p7/m, z31.s, z14.s\n" + "st1w z9.s, p0, [c_ptr5]\n" + "fmin z29.s, p7/m, z29.s, z15.s\n" + "fmin z30.s, p7/m, z30.s, z15.s\n" + "st1w z10.s, p1, [c_ptr4, #1, MUL VL]\n" + "fmin z31.s, p7/m, z31.s, z15.s\n" + "uzp1 z12.s, z28.s, z29.s\n" + "st1w z11.s, p1, [c_ptr5, #1, MUL VL]\n" + "uzp2 z13.s, z28.s, z29.s\n" + "uzp1 z14.s, z30.s, z31.s\n" + "uzp2 z15.s, z30.s, z31.s\n" + "st1w z12.s, p0, [c_ptr6]\n" + "st1w z13.s, p0, [c_ptr7]\n" + "st1w z14.s, p1, [c_ptr6, #1, MUL VL]\n" + "st1w z15.s, p1, [c_ptr7, #1, MUL VL]\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq a_ptr4\n" + ".unreq a_ptr5\n" + ".unreq a_ptr6\n" + ".unreq a_ptr7\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + ".unreq c_ptr4\n" + ".unreq c_ptr5\n" + ".unreq c_ptr6\n" + ".unreq c_ptr7\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) + : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers) + : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "cc", "memory" + ); + break; + } + + } + } +} + +} // namespace arm_gemm + +#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4.hpp index d8422105cc..230a2cf19f 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. 
 *
 * SPDX-License-Identifier: MIT
 *
@@ -78,7 +78,10 @@ public:
     // Default to the generic kernel
     kern_type kernel=sve_hybrid_s8s32_dot_4VLx4;
-    hybrid_s8s32_dot_4VLx4(const CPUInfo *ci) { UNUSED(ci); }
+    hybrid_s8s32_dot_4VLx4(const CPUInfo *)
+    {
+
+    }
 };
 } // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4/generic.cpp
index aa3a764dec..46fc500476 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4/generic.cpp
@@ -32,7 +32,7 @@ namespace arm_gemm {
-void sve_hybrid_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int M, int N, int K, const int32_t *, Activation, bool append) {
+void sve_hybrid_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int M, int N, int K, const int32_t *, Activation , bool append) {
     const int K_stride = ((K + 3) / 4) * 4;
     const long loops_count = ((K + 16) / 32) - 1;
     K -= loops_count * 32;
@@ -41,12 +41,23 @@ void sve_hybrid_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int32
     const long leftovers = K;
     const long blocks_count = (K + 3) / 4;
-    for (int y=0; y<M; y+=4) {
+    int rows_to_compute;
+
+    for (int y=0; y<M; y+=rows_to_compute) {
         const int8_t * const a_ptr0_base = A + (y * lda);
         const unsigned long ldab = lda * sizeof(int8_t);
         int32_t *c_ptr0 = C + (y * ldc);
+        rows_to_compute = M-y;
+        if (rows_to_compute > 4) {
+            if (rows_to_compute % 4) {
+                rows_to_compute = 4 - 1;
+            } else {
+                rows_to_compute = 4;
+            }
+        }
+
         for (int x0=0; x0<N; x0+=(4 * get_vector_length<int32_t>())) {
             const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<int32_t>()));
             long loops = loops_count;
@@ -57,7 +68,7 @@ void sve_hybrid_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int32
             const int8_t *b_ptr0 = B + (K_stride * x0);
             const unsigned long ldcb = ldc * sizeof(int32_t);
-            switch(M-y) {
+            switch(rows_to_compute) {
                 case 1:
                     __asm __volatile (
                         "whilelt p6.b, %[temp], %[leftovers]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4.hpp
index 5dab1da135..f829fb0205 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -78,7 +78,10 @@ public:
     // Default to the generic kernel
     kern_type kernel=sve_hybrid_u8u32_dot_4VLx4;
-    hybrid_u8u32_dot_4VLx4(const CPUInfo *ci) { UNUSED(ci); }
+    hybrid_u8u32_dot_4VLx4(const CPUInfo *)
+    {
+
+    }
 };
 } // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4/generic.cpp
index 4fb7e825b5..13614700e3 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4/generic.cpp
@@ -32,7 +32,7 @@ namespace arm_gemm {
-void sve_hybrid_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *, Activation, bool append) {
+void sve_hybrid_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *, Activation , bool append) {
     const int K_stride = ((K + 3) / 4) * 4;
     const long loops_count = ((K + 16) / 32) - 1;
     K -= loops_count * 32;
@@ -41,12 +41,23 @@ void sve_hybrid_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, uin
     const long leftovers = K;
     const long blocks_count = (K + 3) / 4;
-    for (int y=0; y<M; y+=4) {
+    int rows_to_compute;
+
+    for (int y=0; y<M; y+=rows_to_compute) {
         const uint8_t * const a_ptr0_base = A + (y * lda);
         const unsigned long ldab = lda * sizeof(uint8_t);
         uint32_t *c_ptr0 = C + (y * ldc);
+        rows_to_compute = M-y;
+        if (rows_to_compute > 4) {
+            if (rows_to_compute % 4) {
+                rows_to_compute = 4 - 1;
+            } else {
+                rows_to_compute = 4;
+            }
+        }
+
         for (int x0=0; x0<N; x0+=(4 * get_vector_length<uint32_t>())) {
             const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<uint32_t>()));
             long loops = loops_count;
@@ -57,7 +68,7 @@ void sve_hybrid_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, uin
             const uint8_t *b_ptr0 = B + (K_stride * x0);
             const unsigned long ldcb = ldc * sizeof(uint32_t);
-            switch(M-y) {
+            switch(rows_to_compute) {
                 case 1:
                     __asm __volatile (
                         "whilelt p6.b, %[temp], %[leftovers]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_3VLx8.hpp
index a3434c1504..43107e45fa 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_3VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_3VLx8.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -61,7 +61,10 @@ public:
     kern_type kernel=sve_interleaved_bf16fp32_dot_3VLx8;
-    interleaved_bf16fp32_dot_3VLx8(const CPUInfo *ci) { UNUSED(ci); }
+    interleaved_bf16fp32_dot_3VLx8(const CPUInfo *)
+    {
+
+    }
 };
 } // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_3VLx8/generic.cpp
index 65841581aa..7e20ed0971 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_3VLx8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_3VLx8/generic.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
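Both hybrid dot-product kernels above receive the identical rows_to_compute change: rather than always stepping four rows and letting switch(M-y) fall into a one- or two-row tail case (which leaves most of the four-row kernel's accumulators idle), the loop now emits three-row blocks until the remaining row count is a multiple of four. A standalone sketch of that schedule, with the helper name mine rather than the library's:

    #include <cstdio>

    // Mirrors the patch's block-size choice: "4 - 1" rows while the remainder
    // is not a multiple of four, otherwise full four-row blocks.
    static int rows_to_compute(int remaining) {
        if (remaining > 4) {
            return (remaining % 4) ? 3 : 4;
        }
        return remaining; // final block: one to four rows
    }

    int main() {
        const int M = 10;
        for (int y = 0; y < M; ) {
            const int rows = rows_to_compute(M - y);
            std::printf("rows %d..%d -> %d-row case\n", y, y + rows - 1, rows);
            y += rows; // M=10 yields blocks of 3, 3, 4 instead of 4, 4 and a 2-row tail
        }
        return 0;
    }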
* * SPDX-License-Identifier: MIT * @@ -61,13 +61,11 @@ void sve_interleaved_bf16fp32_dot_3VLx8(const bfloat16 *Apanel, const bfloat16 * "mov z15.s, #0\n" "ld1rqh z2.h, p0/z, [%[a_ptr], #0x20]\n" "mov z16.s, #0\n" - "ld1h z6.h, p0/z, [%[b_ptr], #2, MUL VL]\n" + "add %[a_ptr], %[a_ptr], #0x40\n" "mov z17.s, #0\n" - "ld1rqh z3.h, p0/z, [%[a_ptr], #0x30]\n" + "addvl %[b_ptr], %[b_ptr], #3\n" "mov z18.s, #0\n" - "add %[a_ptr], %[a_ptr], #0x40\n" "mov z19.s, #0\n" - "addvl %[b_ptr], %[b_ptr], #3\n" "mov z20.s, #0\n" "mov z21.s, #0\n" "mov z22.s, #0\n" @@ -83,9 +81,11 @@ void sve_interleaved_bf16fp32_dot_3VLx8(const bfloat16 *Apanel, const bfloat16 * "cbz %[loops], 1f\n" "2:\n" ".inst 0x64604088 // bfdot z8.s, z4.h, z0.h[0]\n" - "subs %[loops], %[loops], #0x1\n" + "ld1h z6.h, p0/z, [%[b_ptr], #-1, MUL VL]\n" ".inst 0x64684089 // bfdot z9.s, z4.h, z0.h[1]\n" + "ld1rqh z3.h, p0/z, [%[a_ptr], #-0x10]\n" ".inst 0x6470408a // bfdot z10.s, z4.h, z0.h[2]\n" + "subs %[loops], %[loops], #0x1\n" ".inst 0x6478408b // bfdot z11.s, z4.h, z0.h[3]\n" ".inst 0x64614094 // bfdot z20.s, z4.h, z1.h[0]\n" ".inst 0x64694095 // bfdot z21.s, z4.h, z1.h[1]\n" @@ -141,13 +141,13 @@ void sve_interleaved_bf16fp32_dot_3VLx8(const bfloat16 *Apanel, const bfloat16 * ".inst 0x646b40dd // bfdot z29.s, z6.h, z3.h[1]\n" ".inst 0x647340de // bfdot z30.s, z6.h, z3.h[2]\n" ".inst 0x647b40df // bfdot z31.s, z6.h, z3.h[3]\n" - "ld1h z6.h, p0/z, [%[b_ptr], #-1, MUL VL]\n" - "ld1rqh z3.h, p0/z, [%[a_ptr], #-0x10]\n" "b.ne 2b\n" "1:\n" "cbz %[tails], 3f\n" ".inst 0x64604088 // bfdot z8.s, z4.h, z0.h[0]\n" + "ld1h z6.h, p0/z, [%[b_ptr], #-1, MUL VL]\n" ".inst 0x64684089 // bfdot z9.s, z4.h, z0.h[1]\n" + "ld1rqh z3.h, p0/z, [%[a_ptr], #-0x10]\n" ".inst 0x6470408a // bfdot z10.s, z4.h, z0.h[2]\n" ".inst 0x6478408b // bfdot z11.s, z4.h, z0.h[3]\n" ".inst 0x64614094 // bfdot z20.s, z4.h, z1.h[0]\n" @@ -235,9 +235,11 @@ void sve_interleaved_bf16fp32_dot_3VLx8(const bfloat16 *Apanel, const bfloat16 * "b 4f\n" "3:\n" ".inst 0x64604088 // bfdot z8.s, z4.h, z0.h[0]\n" - "addvl %[b_ptr], %[b_ptr], #3\n" + "ld1h z6.h, p0/z, [%[b_ptr], #-1, MUL VL]\n" ".inst 0x64684089 // bfdot z9.s, z4.h, z0.h[1]\n" + "ld1rqh z3.h, p0/z, [%[a_ptr], #-0x10]\n" ".inst 0x6470408a // bfdot z10.s, z4.h, z0.h[2]\n" + "addvl %[b_ptr], %[b_ptr], #3\n" ".inst 0x6478408b // bfdot z11.s, z4.h, z0.h[3]\n" ".inst 0x64614094 // bfdot z20.s, z4.h, z1.h[0]\n" ".inst 0x64694095 // bfdot z21.s, z4.h, z1.h[1]\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_3VLx8.hpp index c6ffc047fd..f1353e2086 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_3VLx8.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_3VLx8.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. 
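The bfdot hunks just above are pure instruction scheduling: the "ld1h z6" and "ld1rqh z3" loads that used to sit after the final bfdot of the loop body are hoisted up between the first bfdot group's instructions (in the main loop, the tail path and the no-tail path alike), and the "subs" loop decrement sinks below them, so the operands for the second half of the unrolled iteration start loading while the first half is still multiplying. For decoding the ".inst ... // bfdot z8.s, z4.h, z0.h[0]" encodings themselves, here is a scalar model of one 128-bit segment of the indexed BFDOT as I read the instruction, ignoring its exact accumulation and rounding behaviour; all names are illustrative:

    #include <cstdint>
    #include <cstring>

    // BF16 -> FP32 widening: a bfloat16 is the top half of an IEEE float.
    static float bf16_to_float(uint16_t bf) {
        uint32_t bits = static_cast<uint32_t>(bf) << 16;
        float f;
        std::memcpy(&f, &bits, sizeof(f));
        return f;
    }

    // One 128-bit segment of an indexed BFDOT: each .s accumulator lane gains
    // the 2-way dot product of its bf16 pair from zn with the pair of zm that
    // the index (0..3) selects within this segment.
    static void bfdot_segment(float acc[4], const uint16_t zn[8],
                              const uint16_t zm[8], int idx) {
        const float m0 = bf16_to_float(zm[2 * idx]);
        const float m1 = bf16_to_float(zm[2 * idx + 1]);
        for (int lane = 0; lane < 4; ++lane) {
            acc[lane] += bf16_to_float(zn[2 * lane]) * m0
                       + bf16_to_float(zn[2 * lane + 1]) * m1;
        }
    }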
* * SPDX-License-Identifier: MIT * @@ -61,7 +61,10 @@ public: kern_type kernel=sve_interleaved_bf16fp32_mmla_3VLx8; - interleaved_bf16fp32_mmla_3VLx8(const CPUInfo *ci) { UNUSED(ci); } + interleaved_bf16fp32_mmla_3VLx8(const CPUInfo *) + { + + } }; } // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_3VLx8/generic.cpp index 528fc72005..16cc69b2a6 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_3VLx8/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_3VLx8/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -63,13 +63,11 @@ void sve_interleaved_bf16fp32_mmla_3VLx8(const bfloat16 *Apanel, const bfloat16 "mov z16.s, #0\n" "ld1h z6.h, p0/z, [%[b_ptr], #2, MUL VL]\n" "mov z17.s, #0\n" - "ld1rqh z3.h, p0/z, [%[a_ptr], #0x30]\n" + "add %[a_ptr], %[a_ptr], #0x40\n" "mov z18.s, #0\n" - "ld1h z7.h, p0/z, [%[b_ptr], #3, MUL VL]\n" + "addvl %[b_ptr], %[b_ptr], #4\n" "mov z19.s, #0\n" - "add %[a_ptr], %[a_ptr], #0x40\n" "mov z20.s, #0\n" - "addvl %[b_ptr], %[b_ptr], #4\n" "mov z21.s, #0\n" "mov z22.s, #0\n" "mov z23.s, #0\n" @@ -84,12 +82,14 @@ void sve_interleaved_bf16fp32_mmla_3VLx8(const bfloat16 *Apanel, const bfloat16 "cbz %[loops], 1f\n" "2:\n" ".inst 0x6464e408 // bfmmla z8.s, z0.h, z4.h\n" - "subs %[loops], %[loops], #0x1\n" + "ld1h z7.h, p0/z, [%[b_ptr], #-1, MUL VL]\n" ".inst 0x6464e42e // bfmmla z14.s, z1.h, z4.h\n" + "ld1rqh z3.h, p0/z, [%[a_ptr], #-0x10]\n" ".inst 0x6464e454 // bfmmla z20.s, z2.h, z4.h\n" + "subs %[loops], %[loops], #0x1\n" + ".inst 0x6465e409 // bfmmla z9.s, z0.h, z5.h\n" ".inst 0x6464e47a // bfmmla z26.s, z3.h, z4.h\n" "ld1h z4.h, p0/z, [%[b_ptr]]\n" - ".inst 0x6465e409 // bfmmla z9.s, z0.h, z5.h\n" ".inst 0x6465e42f // bfmmla z15.s, z1.h, z5.h\n" ".inst 0x6465e455 // bfmmla z21.s, z2.h, z5.h\n" ".inst 0x6465e47b // bfmmla z27.s, z3.h, z5.h\n" @@ -152,18 +152,18 @@ void sve_interleaved_bf16fp32_mmla_3VLx8(const bfloat16 *Apanel, const bfloat16 ".inst 0x6467e459 // bfmmla z25.s, z2.h, z7.h\n" "ld1rqh z2.h, p0/z, [%[a_ptr], #-0x20]\n" ".inst 0x6467e47f // bfmmla z31.s, z3.h, z7.h\n" - "ld1h z7.h, p0/z, [%[b_ptr], #-1, MUL VL]\n" - "ld1rqh z3.h, p0/z, [%[a_ptr], #-0x10]\n" "b.ne 2b\n" "1:\n" "cbz %[tails], 3f\n" ".inst 0x6464e408 // bfmmla z8.s, z0.h, z4.h\n" + "ld1h z7.h, p0/z, [%[b_ptr], #-1, MUL VL]\n" ".inst 0x6464e42e // bfmmla z14.s, z1.h, z4.h\n" + "ld1rqh z3.h, p0/z, [%[a_ptr], #-0x10]\n" ".inst 0x6464e454 // bfmmla z20.s, z2.h, z4.h\n" - ".inst 0x6464e47a // bfmmla z26.s, z3.h, z4.h\n" - "ld1h z4.h, p0/z, [%[b_ptr]]\n" ".inst 0x6465e409 // bfmmla z9.s, z0.h, z5.h\n" ".inst 0x6465e42f // bfmmla z15.s, z1.h, z5.h\n" + ".inst 0x6464e47a // bfmmla z26.s, z3.h, z4.h\n" + "ld1h z4.h, p0/z, [%[b_ptr]]\n" ".inst 0x6465e455 // bfmmla z21.s, z2.h, z5.h\n" ".inst 0x6465e47b // bfmmla z27.s, z3.h, z5.h\n" "ld1h z5.h, p0/z, [%[b_ptr], #1, MUL VL]\n" @@ -269,15 +269,17 @@ void sve_interleaved_bf16fp32_mmla_3VLx8(const bfloat16 *Apanel, const bfloat16 "b 4f\n" "3:\n" ".inst 0x6464e408 // bfmmla z8.s, z0.h, z4.h\n" - "add %[a_ptr], %[a_ptr], #0x40\n" + "ld1h z7.h, p0/z, [%[b_ptr], #-1, MUL VL]\n" ".inst 0x6464e42e // bfmmla z14.s, z1.h, z4.h\n" - "addvl %[b_ptr], %[b_ptr], #8\n" + "ld1rqh z3.h, p0/z, [%[a_ptr], #-0x10]\n" ".inst 0x6464e454 // bfmmla z20.s, z2.h, z4.h\n" - ".inst 
0x6464e47a // bfmmla z26.s, z3.h, z4.h\n" + "add %[a_ptr], %[a_ptr], #0x40\n" ".inst 0x6465e409 // bfmmla z9.s, z0.h, z5.h\n" - "ld1h z4.h, p0/z, [%[b_ptr], #-8, MUL VL]\n" + "addvl %[b_ptr], %[b_ptr], #8\n" + ".inst 0x6464e47a // bfmmla z26.s, z3.h, z4.h\n" ".inst 0x6465e42f // bfmmla z15.s, z1.h, z5.h\n" ".inst 0x6465e455 // bfmmla z21.s, z2.h, z5.h\n" + "ld1h z4.h, p0/z, [%[b_ptr], #-8, MUL VL]\n" ".inst 0x6465e47b // bfmmla z27.s, z3.h, z5.h\n" "ld1h z5.h, p0/z, [%[b_ptr], #-7, MUL VL]\n" ".inst 0x6466e40a // bfmmla z10.s, z0.h, z6.h\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8.hpp index 10dbdd8847..816c0cd095 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -61,7 +61,10 @@ public: kern_type kernel=sve_interleaved_fp16_mla_3VLx8; - interleaved_fp16_mla_3VLx8(const CPUInfo *ci) { UNUSED(ci); } + interleaved_fp16_mla_3VLx8(const CPUInfo *) + { + + } }; } // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8/generic.cpp index b2d3a6f52e..f2050cbd56 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -50,22 +50,22 @@ void sve_interleaved_fp16_mla_3VLx8(const __fp16 *Apanel, const __fp16 *Bpanel, "mov z9.h, #0\n" "mov z10.h, #0\n" "mov z11.h, #0\n" - "mov z12.h, #0\n" "ld1rqh z0.h, p0/z, [%[a_ptr]]\n" - "mov z13.h, #0\n" + "mov z12.h, #0\n" "ld1h z2.h, p0/z, [%[b_ptr]]\n" - "mov z14.h, #0\n" + "mov z13.h, #0\n" "ld1h z3.h, p0/z, [%[b_ptr], #1, MUL VL]\n" - "mov z15.h, #0\n" + "mov z14.h, #0\n" "ld1h z4.h, p0/z, [%[b_ptr], #2, MUL VL]\n" - "mov z16.h, #0\n" + "mov z15.h, #0\n" "ld1h z5.h, p0/z, [%[b_ptr], #3, MUL VL]\n" - "mov z17.h, #0\n" + "mov z16.h, #0\n" "ld1h z6.h, p0/z, [%[b_ptr], #4, MUL VL]\n" - "mov z18.h, #0\n" + "mov z17.h, #0\n" "add %[a_ptr], %[a_ptr], #0x20\n" - "mov z19.h, #0\n" + "mov z18.h, #0\n" "addvl %[b_ptr], %[b_ptr], #6\n" + "mov z19.h, #0\n" "mov z20.h, #0\n" "mov z21.h, #0\n" "mov z22.h, #0\n" @@ -202,8 +202,8 @@ void sve_interleaved_fp16_mla_3VLx8(const __fp16 *Apanel, const __fp16 *Bpanel, "fmla z9.h, z2.h, z0.h[1]\n" "fmla z10.h, z2.h, z0.h[2]\n" "fmla z11.h, z2.h, z0.h[3]\n" - "fmla z12.h, z2.h, z0.h[4]\n" "st1h z8.h, p0, [%[c_ptr]]\n" + "fmla z12.h, z2.h, z0.h[4]\n" "fmla z13.h, z2.h, z0.h[5]\n" "fmla z14.h, z2.h, z0.h[6]\n" "fmla z15.h, z2.h, z0.h[7]\n" @@ -211,8 +211,8 @@ void sve_interleaved_fp16_mla_3VLx8(const __fp16 *Apanel, const __fp16 *Bpanel, "fmla z17.h, z3.h, z0.h[1]\n" "fmla z18.h, z3.h, z0.h[2]\n" "fmla z19.h, z3.h, z0.h[3]\n" - "fmla z20.h, z3.h, z0.h[4]\n" "st1h z16.h, p0, [%[c_ptr], #1, MUL VL]\n" + "fmla z20.h, z3.h, z0.h[4]\n" "fmla z21.h, z3.h, z0.h[5]\n" "fmla z22.h, z3.h, z0.h[6]\n" "fmla z23.h, z3.h, z0.h[7]\n" @@ -220,10 +220,11 @@ void sve_interleaved_fp16_mla_3VLx8(const __fp16 *Apanel, const __fp16 *Bpanel, "fmla z25.h, z4.h, z0.h[1]\n" "fmla z26.h, z4.h, z0.h[2]\n" "fmla z27.h, 
z4.h, z0.h[3]\n" - "fmla z28.h, z4.h, z0.h[4]\n" "st1h z24.h, p0, [%[c_ptr], #2, MUL VL]\n" + "fmla z28.h, z4.h, z0.h[4]\n" "fmla z29.h, z4.h, z0.h[5]\n" "fmla z30.h, z4.h, z0.h[6]\n" + "st1h z9.h, p0, [%[c_ptr], #3, MUL VL]\n" "fmla z31.h, z4.h, z0.h[7]\n" "b 4f\n" "3:\n" @@ -257,8 +258,8 @@ void sve_interleaved_fp16_mla_3VLx8(const __fp16 *Apanel, const __fp16 *Bpanel, "fmla z9.h, z5.h, z1.h[1]\n" "fmla z10.h, z5.h, z1.h[2]\n" "fmla z11.h, z5.h, z1.h[3]\n" - "fmla z12.h, z5.h, z1.h[4]\n" "st1h z8.h, p0, [%[c_ptr]]\n" + "fmla z12.h, z5.h, z1.h[4]\n" "fmla z13.h, z5.h, z1.h[5]\n" "fmla z14.h, z5.h, z1.h[6]\n" "fmla z15.h, z5.h, z1.h[7]\n" @@ -266,8 +267,8 @@ void sve_interleaved_fp16_mla_3VLx8(const __fp16 *Apanel, const __fp16 *Bpanel, "fmla z17.h, z6.h, z1.h[1]\n" "fmla z18.h, z6.h, z1.h[2]\n" "fmla z19.h, z6.h, z1.h[3]\n" - "fmla z20.h, z6.h, z1.h[4]\n" "st1h z16.h, p0, [%[c_ptr], #1, MUL VL]\n" + "fmla z20.h, z6.h, z1.h[4]\n" "fmla z21.h, z6.h, z1.h[5]\n" "fmla z22.h, z6.h, z1.h[6]\n" "fmla z23.h, z6.h, z1.h[7]\n" @@ -275,13 +276,13 @@ void sve_interleaved_fp16_mla_3VLx8(const __fp16 *Apanel, const __fp16 *Bpanel, "fmla z25.h, z7.h, z1.h[1]\n" "fmla z26.h, z7.h, z1.h[2]\n" "fmla z27.h, z7.h, z1.h[3]\n" - "fmla z28.h, z7.h, z1.h[4]\n" "st1h z24.h, p0, [%[c_ptr], #2, MUL VL]\n" + "fmla z28.h, z7.h, z1.h[4]\n" "fmla z29.h, z7.h, z1.h[5]\n" "fmla z30.h, z7.h, z1.h[6]\n" + "st1h z9.h, p0, [%[c_ptr], #3, MUL VL]\n" "fmla z31.h, z7.h, z1.h[7]\n" "4:\n" - "st1h z9.h, p0, [%[c_ptr], #3, MUL VL]\n" "st1h z17.h, p0, [%[c_ptr], #4, MUL VL]\n" "st1h z25.h, p0, [%[c_ptr], #5, MUL VL]\n" "st1h z10.h, p0, [%[c_ptr], #6, MUL VL]\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8.hpp index cdc9447701..cce90fb135 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -61,7 +61,10 @@ public: kern_type kernel=sve_interleaved_fp32_mla_3VLx8; - interleaved_fp32_mla_3VLx8(const CPUInfo *ci) { UNUSED(ci); } + interleaved_fp32_mla_3VLx8(const CPUInfo *) + { + + } }; } // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8/generic.cpp index d26948a0d4..cd178c478a 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. 
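The fp16 and fp32 interleaved-mla hunks above likewise only reschedule the epilogue: each st1h/st1w store is hoisted above the neighbouring fmla, and the "st1h z9.h" / "st1w z9.s" store is duplicated into both predecessor blocks instead of waiting at the shared "4:" label, so stores begin draining while the last multiplies execute. The accumulator layout those stores walk is eight rows by three vector-length column blocks; for the fp32 kernel, my reading of the fmla operand pattern gives the following map (the helper is mine, for illustration):

    // (row, column block) -> accumulator Z register in the interleaved fp32
    // mla 3VLx8 kernel: z8..z19 hold rows 0-3, z20..z31 rows 4-7, with four
    // consecutive registers per column block. Inferred from the fmla operands.
    static int acc_reg(int row, int col_block) {
        return 8 + 4 * col_block + (row % 4) + 12 * (row / 4);
    }
    // e.g. acc_reg(0,0)==8 ("st1w z8.s, p0, [%[c_ptr]]"), acc_reg(0,1)==12,
    // acc_reg(4,0)==20 and acc_reg(7,2)==31.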
* * SPDX-License-Identifier: MIT * @@ -50,20 +50,20 @@ void sve_interleaved_fp32_mla_3VLx8(const float *Apanel, const float *Bpanel, fl "mov z9.s, #0\n" "mov z10.s, #0\n" "mov z11.s, #0\n" - "mov z12.s, #0\n" "ld1rqw z0.s, p0/z, [%[a_ptr]]\n" - "mov z13.s, #0\n" + "mov z12.s, #0\n" "ld1w z4.s, p0/z, [%[b_ptr]]\n" - "mov z14.s, #0\n" + "mov z13.s, #0\n" "ld1rqw z1.s, p0/z, [%[a_ptr], #0x10]\n" - "mov z15.s, #0\n" + "mov z14.s, #0\n" "ld1w z5.s, p0/z, [%[b_ptr], #1, MUL VL]\n" - "mov z16.s, #0\n" + "mov z15.s, #0\n" "ld1rqw z2.s, p0/z, [%[a_ptr], #0x20]\n" - "mov z17.s, #0\n" + "mov z16.s, #0\n" "add %[a_ptr], %[a_ptr], #0x40\n" - "mov z18.s, #0\n" + "mov z17.s, #0\n" "addvl %[b_ptr], %[b_ptr], #3\n" + "mov z18.s, #0\n" "mov z19.s, #0\n" "mov z20.s, #0\n" "mov z21.s, #0\n" @@ -207,8 +207,8 @@ void sve_interleaved_fp32_mla_3VLx8(const float *Apanel, const float *Bpanel, fl "fmla z9.s, z4.s, z0.s[1]\n" "fmla z10.s, z4.s, z0.s[2]\n" "fmla z11.s, z4.s, z0.s[3]\n" - "fmla z20.s, z4.s, z1.s[0]\n" "st1w z8.s, p0, [%[c_ptr]]\n" + "fmla z20.s, z4.s, z1.s[0]\n" "fmla z21.s, z4.s, z1.s[1]\n" "fmla z22.s, z4.s, z1.s[2]\n" "fmla z23.s, z4.s, z1.s[3]\n" @@ -216,8 +216,8 @@ void sve_interleaved_fp32_mla_3VLx8(const float *Apanel, const float *Bpanel, fl "fmla z13.s, z5.s, z0.s[1]\n" "fmla z14.s, z5.s, z0.s[2]\n" "fmla z15.s, z5.s, z0.s[3]\n" - "fmla z24.s, z5.s, z1.s[0]\n" "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n" + "fmla z24.s, z5.s, z1.s[0]\n" "fmla z25.s, z5.s, z1.s[1]\n" "fmla z26.s, z5.s, z1.s[2]\n" "fmla z27.s, z5.s, z1.s[3]\n" @@ -225,10 +225,11 @@ void sve_interleaved_fp32_mla_3VLx8(const float *Apanel, const float *Bpanel, fl "fmla z17.s, z6.s, z0.s[1]\n" "fmla z18.s, z6.s, z0.s[2]\n" "fmla z19.s, z6.s, z0.s[3]\n" - "fmla z28.s, z6.s, z1.s[0]\n" "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n" + "fmla z28.s, z6.s, z1.s[0]\n" "fmla z29.s, z6.s, z1.s[1]\n" "fmla z30.s, z6.s, z1.s[2]\n" + "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n" "fmla z31.s, z6.s, z1.s[3]\n" "b 4f\n" "3:\n" @@ -266,8 +267,8 @@ void sve_interleaved_fp32_mla_3VLx8(const float *Apanel, const float *Bpanel, fl "fmla z9.s, z4.s, z2.s[1]\n" "fmla z10.s, z4.s, z2.s[2]\n" "fmla z11.s, z4.s, z2.s[3]\n" - "fmla z20.s, z4.s, z3.s[0]\n" "st1w z8.s, p0, [%[c_ptr]]\n" + "fmla z20.s, z4.s, z3.s[0]\n" "fmla z21.s, z4.s, z3.s[1]\n" "fmla z22.s, z4.s, z3.s[2]\n" "fmla z23.s, z4.s, z3.s[3]\n" @@ -275,8 +276,8 @@ void sve_interleaved_fp32_mla_3VLx8(const float *Apanel, const float *Bpanel, fl "fmla z13.s, z5.s, z2.s[1]\n" "fmla z14.s, z5.s, z2.s[2]\n" "fmla z15.s, z5.s, z2.s[3]\n" - "fmla z24.s, z5.s, z3.s[0]\n" "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n" + "fmla z24.s, z5.s, z3.s[0]\n" "fmla z25.s, z5.s, z3.s[1]\n" "fmla z26.s, z5.s, z3.s[2]\n" "fmla z27.s, z5.s, z3.s[3]\n" @@ -284,13 +285,13 @@ void sve_interleaved_fp32_mla_3VLx8(const float *Apanel, const float *Bpanel, fl "fmla z17.s, z6.s, z2.s[1]\n" "fmla z18.s, z6.s, z2.s[2]\n" "fmla z19.s, z6.s, z2.s[3]\n" - "fmla z28.s, z6.s, z3.s[0]\n" "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n" + "fmla z28.s, z6.s, z3.s[0]\n" "fmla z29.s, z6.s, z3.s[1]\n" "fmla z30.s, z6.s, z3.s[2]\n" + "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n" "fmla z31.s, z6.s, z3.s[3]\n" "4:\n" - "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n" "st1w z13.s, p0, [%[c_ptr], #4, MUL VL]\n" "st1w z17.s, p0, [%[c_ptr], #5, MUL VL]\n" "st1w z10.s, p0, [%[c_ptr], #6, MUL VL]\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_3VLx8.hpp new file mode 
100644 index 0000000000..4ca43cd5c9 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_3VLx8.hpp @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +#ifdef __ARM_FEATURE_SVE + + +#include "../std_transforms_sve.hpp" + +namespace arm_gemm { + +// Actual kernel implementations +void sve_interleaved_fp32_mmla_3VLx8(const float *, const float *, float *, int, int, int); + +class interleaved_fp32_mmla_3VLx8 { +public: + typedef float operand_type; + typedef float result_type; + + typedef void (*kern_type)(const float *, const float *, float *, int, int, int); + + /* Kernel blocking parameters */ + static unsigned int out_width() + { + return get_vector_length<float>() * 3; + } + + static unsigned int out_height() + { + return 8; + } + + static unsigned int k_unroll() + { + return 2; + } + + // Use the standard fixed size transforms. + StdTransformsSVE<operand_type, result_type, 8, 6, 2, 2> transforms = {}; + + kern_type kernel=sve_interleaved_fp32_mmla_3VLx8; + + interleaved_fp32_mmla_3VLx8(const CPUInfo *) + { + + } +}; + +} // namespace arm_gemm + +#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_3VLx8/generic.cpp new file mode 100644 index 0000000000..a404ae9c82 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_3VLx8/generic.cpp @@ -0,0 +1,397 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifdef __ARM_FEATURE_SVE + + +#include "../../asmlib.hpp" + +namespace arm_gemm { + +void sve_interleaved_fp32_mmla_3VLx8(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) { + const float *a_ptr = Apanel; + float *c_ptr = Cpanel; + + K /= 2; + const long loops_count = (K / 2) - 1; + const long tails_count = K % 2; + + for (int yb=0; yb<ablocks; yb++) { + const float *a_ptr0 = a_ptr; + const float *b_ptr = Bpanel; + + for (int xb=0; xb<bblocks; xb++) { + a_ptr = a_ptr0; + long loops = loops_count; + long tails = tails_count; + + __asm __volatile ( + "mov z8.s, #0\n" + "ptrue p0.s\n" + "mov z9.s, #0\n" + "mov z10.s, #0\n" + "mov z11.s, #0\n" + "ld1rqw z0.s, p0/z, [%[a_ptr]]\n" + "mov z12.s, #0\n" + "ld1w z4.s, p0/z, [%[b_ptr]]\n" + "mov z13.s, #0\n" + "ld1rqw z1.s, p0/z, [%[a_ptr], #0x10]\n" + "mov z14.s, #0\n" + "ld1w z5.s, p0/z, [%[b_ptr], #1, MUL VL]\n" + "mov z15.s, #0\n" + "ld1rqw z2.s, p0/z, [%[a_ptr], #0x20]\n" + "mov z16.s, #0\n" + "ld1w z6.s, p0/z, [%[b_ptr], #2, MUL VL]\n" + "mov z17.s, #0\n" + "add %[a_ptr], %[a_ptr], #0x40\n" + "mov z18.s, #0\n" + "addvl %[b_ptr], %[b_ptr], #4\n" + "mov z19.s, #0\n" + "mov z20.s, #0\n" + "mov z21.s, #0\n" + "mov z22.s, #0\n" + "mov z23.s, #0\n" + "mov z24.s, #0\n" + "mov z25.s, #0\n" + "mov z26.s, #0\n" + "mov z27.s, #0\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "cbz %[loops], 1f\n" + "2:\n" + ".inst 0x64a4e408 // fmmla z8.s, z0.s, z4.s\n" + "ld1w z7.s, p0/z, [%[b_ptr], #-1, MUL VL]\n" + ".inst 0x64a4e42e // fmmla z14.s, z1.s, z4.s\n" + "ld1rqw z3.s, p0/z, [%[a_ptr], #-0x10]\n" + ".inst 0x64a4e454 // fmmla z20.s, z2.s, z4.s\n" + "subs %[loops], %[loops], #0x1\n" + ".inst 0x64a5e409 // fmmla z9.s, z0.s, z5.s\n" + ".inst 0x64a4e47a // fmmla z26.s, z3.s, z4.s\n" + "ld1w z4.s, p0/z, [%[b_ptr]]\n" + ".inst 0x64a5e42f // fmmla z15.s, z1.s, z5.s\n" + ".inst 0x64a5e455 // fmmla z21.s, z2.s, z5.s\n" + ".inst 0x64a5e47b // fmmla z27.s, z3.s, z5.s\n" + "ld1w z5.s, p0/z, [%[b_ptr], #1, MUL VL]\n" + ".inst 0x64a6e40a // fmmla z10.s, z0.s, z6.s\n" + ".inst 0x64a6e430 // fmmla z16.s, z1.s, z6.s\n" + ".inst 0x64a6e456 // fmmla z22.s, z2.s, z6.s\n" + ".inst 0x64a6e47c // fmmla z28.s, z3.s, z6.s\n" + "ld1w z6.s, p0/z, [%[b_ptr], #2, MUL VL]\n" + ".inst 0x64a7e40b // fmmla z11.s, z0.s, z7.s\n" + ".inst 0x64a7e431 // fmmla z17.s, z1.s, z7.s\n" + ".inst 0x64a7e457 // fmmla z23.s, z2.s, z7.s\n" + ".inst 0x64a7e47d // fmmla z29.s, z3.s, z7.s\n" + "ld1w z7.s, p0/z, [%[b_ptr], #3, MUL VL]\n" + ".inst 0x64a4e40c // fmmla z12.s, z0.s, z4.s\n" + ".inst 0x64a4e432 // fmmla z18.s, z1.s, z4.s\n" + ".inst 0x64a4e458 // fmmla z24.s, z2.s, z4.s\n" + ".inst 0x64a4e47e // fmmla z30.s, z3.s, z4.s\n" + "ld1w z4.s, p0/z, [%[b_ptr], #4, MUL VL]\n" + ".inst 0x64a5e40d // fmmla z13.s, z0.s, z5.s\n" + "ld1rqw z0.s, p0/z, [%[a_ptr]]\n" + ".inst 0x64a5e433 // fmmla z19.s, z1.s, z5.s\n" + "ld1rqw z1.s, p0/z, [%[a_ptr], #0x10]\n" + ".inst 0x64a5e459 // fmmla z25.s, z2.s, z5.s\n" + "ld1rqw z2.s, p0/z, [%[a_ptr], #0x20]\n" + ".inst 0x64a5e47f // fmmla z31.s, z3.s, z5.s\n" + "ld1w z5.s, p0/z, [%[b_ptr], #5, MUL VL]\n" + ".inst 0x64a6e408 // fmmla z8.s, z0.s, z6.s\n" + "ld1rqw z3.s, p0/z, [%[a_ptr], #0x30]\n" + ".inst 0x64a6e42e // fmmla 
z14.s, z1.s, z6.s\n" + "add %[a_ptr], %[a_ptr], #0x80\n" + ".inst 0x64a6e454 // fmmla z20.s, z2.s, z6.s\n" + "addvl %[b_ptr], %[b_ptr], #12\n" + ".inst 0x64a6e47a // fmmla z26.s, z3.s, z6.s\n" + ".inst 0x64a7e409 // fmmla z9.s, z0.s, z7.s\n" + ".inst 0x64a7e42f // fmmla z15.s, z1.s, z7.s\n" + "ld1w z6.s, p0/z, [%[b_ptr], #-6, MUL VL]\n" + ".inst 0x64a7e455 // fmmla z21.s, z2.s, z7.s\n" + ".inst 0x64a7e47b // fmmla z27.s, z3.s, z7.s\n" + "ld1w z7.s, p0/z, [%[b_ptr], #-5, MUL VL]\n" + ".inst 0x64a4e40a // fmmla z10.s, z0.s, z4.s\n" + ".inst 0x64a4e430 // fmmla z16.s, z1.s, z4.s\n" + ".inst 0x64a4e456 // fmmla z22.s, z2.s, z4.s\n" + ".inst 0x64a4e47c // fmmla z28.s, z3.s, z4.s\n" + "ld1w z4.s, p0/z, [%[b_ptr], #-4, MUL VL]\n" + ".inst 0x64a5e40b // fmmla z11.s, z0.s, z5.s\n" + ".inst 0x64a5e431 // fmmla z17.s, z1.s, z5.s\n" + ".inst 0x64a5e457 // fmmla z23.s, z2.s, z5.s\n" + ".inst 0x64a5e47d // fmmla z29.s, z3.s, z5.s\n" + "ld1w z5.s, p0/z, [%[b_ptr], #-3, MUL VL]\n" + ".inst 0x64a6e40c // fmmla z12.s, z0.s, z6.s\n" + ".inst 0x64a6e432 // fmmla z18.s, z1.s, z6.s\n" + ".inst 0x64a6e458 // fmmla z24.s, z2.s, z6.s\n" + ".inst 0x64a6e47e // fmmla z30.s, z3.s, z6.s\n" + "ld1w z6.s, p0/z, [%[b_ptr], #-2, MUL VL]\n" + ".inst 0x64a7e40d // fmmla z13.s, z0.s, z7.s\n" + "ld1rqw z0.s, p0/z, [%[a_ptr], #-0x40]\n" + ".inst 0x64a7e433 // fmmla z19.s, z1.s, z7.s\n" + "ld1rqw z1.s, p0/z, [%[a_ptr], #-0x30]\n" + ".inst 0x64a7e459 // fmmla z25.s, z2.s, z7.s\n" + "ld1rqw z2.s, p0/z, [%[a_ptr], #-0x20]\n" + ".inst 0x64a7e47f // fmmla z31.s, z3.s, z7.s\n" + "b.ne 2b\n" + "1:\n" + "cbz %[tails], 3f\n" + ".inst 0x64a4e408 // fmmla z8.s, z0.s, z4.s\n" + "ld1w z7.s, p0/z, [%[b_ptr], #-1, MUL VL]\n" + ".inst 0x64a4e42e // fmmla z14.s, z1.s, z4.s\n" + "ld1rqw z3.s, p0/z, [%[a_ptr], #-0x10]\n" + ".inst 0x64a4e454 // fmmla z20.s, z2.s, z4.s\n" + ".inst 0x64a5e409 // fmmla z9.s, z0.s, z5.s\n" + ".inst 0x64a5e42f // fmmla z15.s, z1.s, z5.s\n" + ".inst 0x64a4e47a // fmmla z26.s, z3.s, z4.s\n" + "ld1w z4.s, p0/z, [%[b_ptr]]\n" + ".inst 0x64a5e455 // fmmla z21.s, z2.s, z5.s\n" + ".inst 0x64a5e47b // fmmla z27.s, z3.s, z5.s\n" + "ld1w z5.s, p0/z, [%[b_ptr], #1, MUL VL]\n" + ".inst 0x64a6e40a // fmmla z10.s, z0.s, z6.s\n" + ".inst 0x64a6e430 // fmmla z16.s, z1.s, z6.s\n" + ".inst 0x64a6e456 // fmmla z22.s, z2.s, z6.s\n" + ".inst 0x64a6e47c // fmmla z28.s, z3.s, z6.s\n" + "ld1w z6.s, p0/z, [%[b_ptr], #2, MUL VL]\n" + ".inst 0x64a7e40b // fmmla z11.s, z0.s, z7.s\n" + ".inst 0x64a7e431 // fmmla z17.s, z1.s, z7.s\n" + ".inst 0x64a7e457 // fmmla z23.s, z2.s, z7.s\n" + ".inst 0x64a7e47d // fmmla z29.s, z3.s, z7.s\n" + "ld1w z7.s, p0/z, [%[b_ptr], #3, MUL VL]\n" + ".inst 0x64a4e40c // fmmla z12.s, z0.s, z4.s\n" + ".inst 0x64a4e432 // fmmla z18.s, z1.s, z4.s\n" + ".inst 0x64a4e458 // fmmla z24.s, z2.s, z4.s\n" + ".inst 0x64a4e47e // fmmla z30.s, z3.s, z4.s\n" + "ld1w z4.s, p0/z, [%[b_ptr], #4, MUL VL]\n" + ".inst 0x64a5e40d // fmmla z13.s, z0.s, z5.s\n" + "ld1rqw z0.s, p0/z, [%[a_ptr]]\n" + ".inst 0x64a5e433 // fmmla z19.s, z1.s, z5.s\n" + "ld1rqw z1.s, p0/z, [%[a_ptr], #0x10]\n" + ".inst 0x64a5e459 // fmmla z25.s, z2.s, z5.s\n" + "ld1rqw z2.s, p0/z, [%[a_ptr], #0x20]\n" + ".inst 0x64a5e47f // fmmla z31.s, z3.s, z5.s\n" + "ld1w z5.s, p0/z, [%[b_ptr], #5, MUL VL]\n" + ".inst 0x64a6e408 // fmmla z8.s, z0.s, z6.s\n" + "ld1rqw z3.s, p0/z, [%[a_ptr], #0x30]\n" + ".inst 0x64a6e42e // fmmla z14.s, z1.s, z6.s\n" + "add %[a_ptr], %[a_ptr], #0x80\n" + ".inst 0x64a6e454 // fmmla z20.s, z2.s, z6.s\n" + "addvl %[b_ptr], %[b_ptr], #14\n" + ".inst 
0x64a6e47a // fmmla z26.s, z3.s, z6.s\n" + ".inst 0x64a7e409 // fmmla z9.s, z0.s, z7.s\n" + ".inst 0x64a7e42f // fmmla z15.s, z1.s, z7.s\n" + "ld1w z6.s, p0/z, [%[b_ptr], #-8, MUL VL]\n" + ".inst 0x64a7e455 // fmmla z21.s, z2.s, z7.s\n" + ".inst 0x64a7e47b // fmmla z27.s, z3.s, z7.s\n" + "ld1w z7.s, p0/z, [%[b_ptr], #-7, MUL VL]\n" + ".inst 0x64a4e40a // fmmla z10.s, z0.s, z4.s\n" + ".inst 0x64a4e430 // fmmla z16.s, z1.s, z4.s\n" + ".inst 0x64a4e456 // fmmla z22.s, z2.s, z4.s\n" + ".inst 0x64a4e47c // fmmla z28.s, z3.s, z4.s\n" + "ld1w z4.s, p0/z, [%[b_ptr], #-6, MUL VL]\n" + ".inst 0x64a5e40b // fmmla z11.s, z0.s, z5.s\n" + ".inst 0x64a5e431 // fmmla z17.s, z1.s, z5.s\n" + ".inst 0x64a5e457 // fmmla z23.s, z2.s, z5.s\n" + ".inst 0x64a5e47d // fmmla z29.s, z3.s, z5.s\n" + "ld1w z5.s, p0/z, [%[b_ptr], #-5, MUL VL]\n" + ".inst 0x64a6e40c // fmmla z12.s, z0.s, z6.s\n" + ".inst 0x64a6e432 // fmmla z18.s, z1.s, z6.s\n" + ".inst 0x64a6e458 // fmmla z24.s, z2.s, z6.s\n" + ".inst 0x64a6e47e // fmmla z30.s, z3.s, z6.s\n" + "ld1w z6.s, p0/z, [%[b_ptr], #-4, MUL VL]\n" + ".inst 0x64a7e40d // fmmla z13.s, z0.s, z7.s\n" + "ld1rqw z0.s, p0/z, [%[a_ptr], #-0x40]\n" + ".inst 0x64a7e433 // fmmla z19.s, z1.s, z7.s\n" + "ld1rqw z1.s, p0/z, [%[a_ptr], #-0x30]\n" + ".inst 0x64a7e459 // fmmla z25.s, z2.s, z7.s\n" + "ld1rqw z2.s, p0/z, [%[a_ptr], #-0x20]\n" + ".inst 0x64a7e47f // fmmla z31.s, z3.s, z7.s\n" + "ld1w z7.s, p0/z, [%[b_ptr], #-3, MUL VL]\n" + ".inst 0x64a4e408 // fmmla z8.s, z0.s, z4.s\n" + "ld1rqw z3.s, p0/z, [%[a_ptr], #-0x10]\n" + ".inst 0x64a4e42e // fmmla z14.s, z1.s, z4.s\n" + ".inst 0x64a4e454 // fmmla z20.s, z2.s, z4.s\n" + ".inst 0x64a5e409 // fmmla z9.s, z0.s, z5.s\n" + ".inst 0x64a4e47a // fmmla z26.s, z3.s, z4.s\n" + "ld1w z4.s, p0/z, [%[b_ptr], #-2, MUL VL]\n" + ".inst 0x64a5e42f // fmmla z15.s, z1.s, z5.s\n" + ".inst 0x64a5e455 // fmmla z21.s, z2.s, z5.s\n" + ".inst 0x64a5e47b // fmmla z27.s, z3.s, z5.s\n" + "ld1w z5.s, p0/z, [%[b_ptr], #-1, MUL VL]\n" + ".inst 0x64a6e40a // fmmla z10.s, z0.s, z6.s\n" + ".inst 0x64a6e430 // fmmla z16.s, z1.s, z6.s\n" + ".inst 0x64a6e456 // fmmla z22.s, z2.s, z6.s\n" + ".inst 0x64a6e47c // fmmla z28.s, z3.s, z6.s\n" + "uzp1 z6.d, z14.d, z15.d\n" + ".inst 0x64a7e40b // fmmla z11.s, z0.s, z7.s\n" + ".inst 0x64a7e431 // fmmla z17.s, z1.s, z7.s\n" + ".inst 0x64a7e457 // fmmla z23.s, z2.s, z7.s\n" + ".inst 0x64a7e47d // fmmla z29.s, z3.s, z7.s\n" + ".inst 0x64a4e40c // fmmla z12.s, z0.s, z4.s\n" + "uzp1 z7.d, z16.d, z17.d\n" + ".inst 0x64a4e432 // fmmla z18.s, z1.s, z4.s\n" + ".inst 0x64a4e458 // fmmla z24.s, z2.s, z4.s\n" + ".inst 0x64a4e47e // fmmla z30.s, z3.s, z4.s\n" + "uzp2 z4.d, z10.d, z11.d\n" + ".inst 0x64a5e40d // fmmla z13.s, z0.s, z5.s\n" + "uzp1 z0.d, z8.d, z9.d\n" + ".inst 0x64a5e433 // fmmla z19.s, z1.s, z5.s\n" + "uzp1 z1.d, z10.d, z11.d\n" + ".inst 0x64a5e459 // fmmla z25.s, z2.s, z5.s\n" + "st1w z0.s, p0, [%[c_ptr]]\n" + "uzp1 z2.d, z12.d, z13.d\n" + "uzp1 z0.d, z18.d, z19.d\n" + ".inst 0x64a5e47f // fmmla z31.s, z3.s, z5.s\n" + "st1w z1.s, p0, [%[c_ptr], #1, MUL VL]\n" + "uzp2 z3.d, z8.d, z9.d\n" + "uzp2 z5.d, z12.d, z13.d\n" + "uzp2 z1.d, z14.d, z15.d\n" + "st1w z2.s, p0, [%[c_ptr], #2, MUL VL]\n" + "b 4f\n" + "3:\n" + ".inst 0x64a4e408 // fmmla z8.s, z0.s, z4.s\n" + "ld1w z7.s, p0/z, [%[b_ptr], #-1, MUL VL]\n" + ".inst 0x64a4e42e // fmmla z14.s, z1.s, z4.s\n" + "ld1rqw z3.s, p0/z, [%[a_ptr], #-0x10]\n" + ".inst 0x64a4e454 // fmmla z20.s, z2.s, z4.s\n" + "add %[a_ptr], %[a_ptr], #0x40\n" + ".inst 0x64a5e409 // fmmla z9.s, z0.s, z5.s\n" + 
"addvl %[b_ptr], %[b_ptr], #8\n" + ".inst 0x64a4e47a // fmmla z26.s, z3.s, z4.s\n" + ".inst 0x64a5e42f // fmmla z15.s, z1.s, z5.s\n" + ".inst 0x64a5e455 // fmmla z21.s, z2.s, z5.s\n" + "ld1w z4.s, p0/z, [%[b_ptr], #-8, MUL VL]\n" + ".inst 0x64a5e47b // fmmla z27.s, z3.s, z5.s\n" + "ld1w z5.s, p0/z, [%[b_ptr], #-7, MUL VL]\n" + ".inst 0x64a6e40a // fmmla z10.s, z0.s, z6.s\n" + ".inst 0x64a6e430 // fmmla z16.s, z1.s, z6.s\n" + ".inst 0x64a6e456 // fmmla z22.s, z2.s, z6.s\n" + ".inst 0x64a6e47c // fmmla z28.s, z3.s, z6.s\n" + "ld1w z6.s, p0/z, [%[b_ptr], #-6, MUL VL]\n" + ".inst 0x64a7e40b // fmmla z11.s, z0.s, z7.s\n" + ".inst 0x64a7e431 // fmmla z17.s, z1.s, z7.s\n" + ".inst 0x64a7e457 // fmmla z23.s, z2.s, z7.s\n" + ".inst 0x64a7e47d // fmmla z29.s, z3.s, z7.s\n" + "ld1w z7.s, p0/z, [%[b_ptr], #-5, MUL VL]\n" + ".inst 0x64a4e40c // fmmla z12.s, z0.s, z4.s\n" + ".inst 0x64a4e432 // fmmla z18.s, z1.s, z4.s\n" + ".inst 0x64a4e458 // fmmla z24.s, z2.s, z4.s\n" + ".inst 0x64a4e47e // fmmla z30.s, z3.s, z4.s\n" + "ld1w z4.s, p0/z, [%[b_ptr], #-4, MUL VL]\n" + ".inst 0x64a5e40d // fmmla z13.s, z0.s, z5.s\n" + "ld1rqw z0.s, p0/z, [%[a_ptr], #-0x40]\n" + ".inst 0x64a5e433 // fmmla z19.s, z1.s, z5.s\n" + "ld1rqw z1.s, p0/z, [%[a_ptr], #-0x30]\n" + ".inst 0x64a5e459 // fmmla z25.s, z2.s, z5.s\n" + "ld1rqw z2.s, p0/z, [%[a_ptr], #-0x20]\n" + ".inst 0x64a5e47f // fmmla z31.s, z3.s, z5.s\n" + "ld1w z5.s, p0/z, [%[b_ptr], #-3, MUL VL]\n" + ".inst 0x64a6e408 // fmmla z8.s, z0.s, z6.s\n" + "ld1rqw z3.s, p0/z, [%[a_ptr], #-0x10]\n" + ".inst 0x64a6e42e // fmmla z14.s, z1.s, z6.s\n" + ".inst 0x64a6e454 // fmmla z20.s, z2.s, z6.s\n" + ".inst 0x64a7e409 // fmmla z9.s, z0.s, z7.s\n" + ".inst 0x64a6e47a // fmmla z26.s, z3.s, z6.s\n" + "ld1w z6.s, p0/z, [%[b_ptr], #-2, MUL VL]\n" + ".inst 0x64a7e42f // fmmla z15.s, z1.s, z7.s\n" + ".inst 0x64a7e455 // fmmla z21.s, z2.s, z7.s\n" + ".inst 0x64a7e47b // fmmla z27.s, z3.s, z7.s\n" + "ld1w z7.s, p0/z, [%[b_ptr], #-1, MUL VL]\n" + ".inst 0x64a4e40a // fmmla z10.s, z0.s, z4.s\n" + ".inst 0x64a4e430 // fmmla z16.s, z1.s, z4.s\n" + ".inst 0x64a4e456 // fmmla z22.s, z2.s, z4.s\n" + ".inst 0x64a4e47c // fmmla z28.s, z3.s, z4.s\n" + ".inst 0x64a5e40b // fmmla z11.s, z0.s, z5.s\n" + ".inst 0x64a5e431 // fmmla z17.s, z1.s, z5.s\n" + ".inst 0x64a5e457 // fmmla z23.s, z2.s, z5.s\n" + ".inst 0x64a5e47d // fmmla z29.s, z3.s, z5.s\n" + "uzp2 z4.d, z10.d, z11.d\n" + ".inst 0x64a6e40c // fmmla z12.s, z0.s, z6.s\n" + ".inst 0x64a6e432 // fmmla z18.s, z1.s, z6.s\n" + ".inst 0x64a6e458 // fmmla z24.s, z2.s, z6.s\n" + ".inst 0x64a6e47e // fmmla z30.s, z3.s, z6.s\n" + "uzp1 z6.d, z14.d, z15.d\n" + ".inst 0x64a7e40d // fmmla z13.s, z0.s, z7.s\n" + "uzp1 z0.d, z8.d, z9.d\n" + ".inst 0x64a7e433 // fmmla z19.s, z1.s, z7.s\n" + "uzp1 z1.d, z10.d, z11.d\n" + "uzp2 z5.d, z12.d, z13.d\n" + "st1w z0.s, p0, [%[c_ptr]]\n" + ".inst 0x64a7e459 // fmmla z25.s, z2.s, z7.s\n" + "uzp1 z2.d, z12.d, z13.d\n" + "uzp1 z0.d, z18.d, z19.d\n" + "st1w z1.s, p0, [%[c_ptr], #1, MUL VL]\n" + "uzp2 z1.d, z14.d, z15.d\n" + ".inst 0x64a7e47f // fmmla z31.s, z3.s, z7.s\n" + "uzp2 z3.d, z8.d, z9.d\n" + "st1w z2.s, p0, [%[c_ptr], #2, MUL VL]\n" + "uzp1 z7.d, z16.d, z17.d\n" + "4:\n" + "uzp2 z2.d, z16.d, z17.d\n" + "st1w z3.s, p0, [%[c_ptr], #3, MUL VL]\n" + "uzp2 z3.d, z18.d, z19.d\n" + "st1w z4.s, p0, [%[c_ptr], #4, MUL VL]\n" + "uzp1 z4.d, z20.d, z21.d\n" + "st1w z5.s, p0, [%[c_ptr], #5, MUL VL]\n" + "uzp1 z5.d, z22.d, z23.d\n" + "st1w z6.s, p0, [%[c_ptr], #6, MUL VL]\n" + "uzp1 z6.d, z24.d, z25.d\n" + "st1w z7.s, p0, 
[%[c_ptr], #7, MUL VL]\n" + "addvl %[c_ptr], %[c_ptr], #16\n" + "uzp2 z7.d, z20.d, z21.d\n" + "st1w z0.s, p0, [%[c_ptr], #-8, MUL VL]\n" + "uzp2 z0.d, z22.d, z23.d\n" + "st1w z1.s, p0, [%[c_ptr], #-7, MUL VL]\n" + "uzp2 z1.d, z24.d, z25.d\n" + "st1w z2.s, p0, [%[c_ptr], #-6, MUL VL]\n" + "uzp1 z2.d, z26.d, z27.d\n" + "st1w z3.s, p0, [%[c_ptr], #-5, MUL VL]\n" + "uzp1 z3.d, z28.d, z29.d\n" + "st1w z4.s, p0, [%[c_ptr], #-4, MUL VL]\n" + "uzp1 z4.d, z30.d, z31.d\n" + "st1w z5.s, p0, [%[c_ptr], #-3, MUL VL]\n" + "uzp2 z5.d, z26.d, z27.d\n" + "st1w z6.s, p0, [%[c_ptr], #-2, MUL VL]\n" + "uzp2 z6.d, z28.d, z29.d\n" + "st1w z7.s, p0, [%[c_ptr], #-1, MUL VL]\n" + "uzp2 z7.d, z30.d, z31.d\n" + "st1w z0.s, p0, [%[c_ptr]]\n" + "st1w z1.s, p0, [%[c_ptr], #1, MUL VL]\n" + "st1w z2.s, p0, [%[c_ptr], #2, MUL VL]\n" + "st1w z3.s, p0, [%[c_ptr], #3, MUL VL]\n" + "st1w z4.s, p0, [%[c_ptr], #4, MUL VL]\n" + "st1w z5.s, p0, [%[c_ptr], #5, MUL VL]\n" + "st1w z6.s, p0, [%[c_ptr], #6, MUL VL]\n" + "st1w z7.s, p0, [%[c_ptr], #7, MUL VL]\n" + "addvl %[c_ptr], %[c_ptr], #8\n" + : [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr), + [loops] "+r" (loops), [tails] "+r" (tails) + : + : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + } + } +} + +} // namespace arm_gemm + +#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8.hpp index effdbc63c9..e40ba215b4 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -61,7 +61,10 @@ public: kern_type kernel=sve_interleaved_s8s32_dot_3VLx8; - interleaved_s8s32_dot_3VLx8(const CPUInfo *ci) { UNUSED(ci); } + interleaved_s8s32_dot_3VLx8(const CPUInfo *) + { + + } }; } // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8/generic.cpp index 7640fcaa20..cdc70705c5 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. 
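
The new fmmla kernel above accumulates a 2x2 output tile per 128-bit vector segment, which is why each Z accumulator carries two C rows interleaved and why the epilogue de-interleaves z8..z31 with uzp1/uzp2 on .d elements before the st1w stores. A scalar model of a single FMMLA segment, written from my reading of the architecture definition rather than from anything in this patch (the smmla/ummla hunks below have the same shape, widening int8 to int32):

// One 128-bit FMMLA segment: C (2x2, row-major) += A * B^T, with A and
// B each holding a 2x2 row-major sub-tile (two rows / two columns by
// two K steps). Two C rows share one Z register, so uzp1/uzp2 on
// 64-bit granules later separate them into storable row vectors.
void fmmla_segment(float C[2][2], const float A[4], const float B[4]) {
    for (int i = 0; i < 2; i++) {
        for (int j = 0; j < 2; j++) {
            C[i][j] += A[2 * i + 0] * B[2 * j + 0]
                     + A[2 * i + 1] * B[2 * j + 1];
        }
    }
}

This is also why k_unroll() in the new interleaved_fp32_mmla_3VLx8 class returns 2: every fmmla consumes two K steps per operand register.
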
* * SPDX-License-Identifier: MIT * @@ -51,20 +51,20 @@ void sve_interleaved_s8s32_dot_3VLx8(const int8_t *Apanel, const int8_t *Bpanel, "mov z9.s, #0\n" "mov z10.s, #0\n" "mov z11.s, #0\n" - "mov z12.s, #0\n" "ld1rqb z0.b, p0/z, [%[a_ptr]]\n" - "mov z13.s, #0\n" + "mov z12.s, #0\n" "ld1b z4.b, p0/z, [%[b_ptr]]\n" - "mov z14.s, #0\n" + "mov z13.s, #0\n" "ld1rqb z1.b, p0/z, [%[a_ptr], #0x10]\n" - "mov z15.s, #0\n" + "mov z14.s, #0\n" "ld1b z5.b, p0/z, [%[b_ptr], #1, MUL VL]\n" - "mov z16.s, #0\n" + "mov z15.s, #0\n" "ld1rqb z2.b, p0/z, [%[a_ptr], #0x20]\n" - "mov z17.s, #0\n" + "mov z16.s, #0\n" "add %[a_ptr], %[a_ptr], #0x40\n" - "mov z18.s, #0\n" + "mov z17.s, #0\n" "addvl %[b_ptr], %[b_ptr], #3\n" + "mov z18.s, #0\n" "mov z19.s, #0\n" "mov z20.s, #0\n" "mov z21.s, #0\n" @@ -208,8 +208,8 @@ void sve_interleaved_s8s32_dot_3VLx8(const int8_t *Apanel, const int8_t *Bpanel, "sdot z9.s, z4.b, z0.b[1]\n" "sdot z10.s, z4.b, z0.b[2]\n" "sdot z11.s, z4.b, z0.b[3]\n" - "sdot z20.s, z4.b, z1.b[0]\n" "st1w z8.s, p0, [%[c_ptr]]\n" + "sdot z20.s, z4.b, z1.b[0]\n" "sdot z21.s, z4.b, z1.b[1]\n" "sdot z22.s, z4.b, z1.b[2]\n" "sdot z23.s, z4.b, z1.b[3]\n" @@ -217,8 +217,8 @@ void sve_interleaved_s8s32_dot_3VLx8(const int8_t *Apanel, const int8_t *Bpanel, "sdot z13.s, z5.b, z0.b[1]\n" "sdot z14.s, z5.b, z0.b[2]\n" "sdot z15.s, z5.b, z0.b[3]\n" - "sdot z24.s, z5.b, z1.b[0]\n" "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n" + "sdot z24.s, z5.b, z1.b[0]\n" "sdot z25.s, z5.b, z1.b[1]\n" "sdot z26.s, z5.b, z1.b[2]\n" "sdot z27.s, z5.b, z1.b[3]\n" @@ -226,10 +226,11 @@ void sve_interleaved_s8s32_dot_3VLx8(const int8_t *Apanel, const int8_t *Bpanel, "sdot z17.s, z6.b, z0.b[1]\n" "sdot z18.s, z6.b, z0.b[2]\n" "sdot z19.s, z6.b, z0.b[3]\n" - "sdot z28.s, z6.b, z1.b[0]\n" "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n" + "sdot z28.s, z6.b, z1.b[0]\n" "sdot z29.s, z6.b, z1.b[1]\n" "sdot z30.s, z6.b, z1.b[2]\n" + "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n" "sdot z31.s, z6.b, z1.b[3]\n" "b 4f\n" "3:\n" @@ -267,8 +268,8 @@ void sve_interleaved_s8s32_dot_3VLx8(const int8_t *Apanel, const int8_t *Bpanel, "sdot z9.s, z4.b, z2.b[1]\n" "sdot z10.s, z4.b, z2.b[2]\n" "sdot z11.s, z4.b, z2.b[3]\n" - "sdot z20.s, z4.b, z3.b[0]\n" "st1w z8.s, p0, [%[c_ptr]]\n" + "sdot z20.s, z4.b, z3.b[0]\n" "sdot z21.s, z4.b, z3.b[1]\n" "sdot z22.s, z4.b, z3.b[2]\n" "sdot z23.s, z4.b, z3.b[3]\n" @@ -276,8 +277,8 @@ void sve_interleaved_s8s32_dot_3VLx8(const int8_t *Apanel, const int8_t *Bpanel, "sdot z13.s, z5.b, z2.b[1]\n" "sdot z14.s, z5.b, z2.b[2]\n" "sdot z15.s, z5.b, z2.b[3]\n" - "sdot z24.s, z5.b, z3.b[0]\n" "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n" + "sdot z24.s, z5.b, z3.b[0]\n" "sdot z25.s, z5.b, z3.b[1]\n" "sdot z26.s, z5.b, z3.b[2]\n" "sdot z27.s, z5.b, z3.b[3]\n" @@ -285,13 +286,13 @@ void sve_interleaved_s8s32_dot_3VLx8(const int8_t *Apanel, const int8_t *Bpanel, "sdot z17.s, z6.b, z2.b[1]\n" "sdot z18.s, z6.b, z2.b[2]\n" "sdot z19.s, z6.b, z2.b[3]\n" - "sdot z28.s, z6.b, z3.b[0]\n" "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n" + "sdot z28.s, z6.b, z3.b[0]\n" "sdot z29.s, z6.b, z3.b[1]\n" "sdot z30.s, z6.b, z3.b[2]\n" + "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n" "sdot z31.s, z6.b, z3.b[3]\n" "4:\n" - "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n" "st1w z13.s, p0, [%[c_ptr], #4, MUL VL]\n" "st1w z17.s, p0, [%[c_ptr], #5, MUL VL]\n" "st1w z10.s, p0, [%[c_ptr], #6, MUL VL]\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_3VLx8.hpp index 
cd50d0ded3..361598d594 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_3VLx8.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_3VLx8.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -61,9 +61,9 @@ public: kern_type kernel=sve_interleaved_s8s32_mmla_3VLx8; - interleaved_s8s32_mmla_3VLx8(const CPUInfo *ci) + interleaved_s8s32_mmla_3VLx8(const CPUInfo *) { - UNUSED(ci); + } }; diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_3VLx8/generic.cpp index d636c9d2a4..cde9ec32e9 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_3VLx8/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_3VLx8/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -63,13 +63,11 @@ void sve_interleaved_s8s32_mmla_3VLx8(const int8_t *Apanel, const int8_t *Bpanel "mov z16.s, #0\n" "ld1b z6.b, p0/z, [%[b_ptr], #2, MUL VL]\n" "mov z17.s, #0\n" - "ld1rqb z3.b, p0/z, [%[a_ptr], #0x30]\n" + "add %[a_ptr], %[a_ptr], #0x40\n" "mov z18.s, #0\n" - "ld1b z7.b, p0/z, [%[b_ptr], #3, MUL VL]\n" + "addvl %[b_ptr], %[b_ptr], #4\n" "mov z19.s, #0\n" - "add %[a_ptr], %[a_ptr], #0x40\n" "mov z20.s, #0\n" - "addvl %[b_ptr], %[b_ptr], #4\n" "mov z21.s, #0\n" "mov z22.s, #0\n" "mov z23.s, #0\n" @@ -84,12 +82,14 @@ void sve_interleaved_s8s32_mmla_3VLx8(const int8_t *Apanel, const int8_t *Bpanel "cbz %[loops], 1f\n" "2:\n" ".inst 0x45049808 // smmla z8.s, z0.b, z4.b\n" - "subs %[loops], %[loops], #0x1\n" + "ld1b z7.b, p0/z, [%[b_ptr], #-1, MUL VL]\n" ".inst 0x4504982e // smmla z14.s, z1.b, z4.b\n" + "ld1rqb z3.b, p0/z, [%[a_ptr], #-0x10]\n" ".inst 0x45049854 // smmla z20.s, z2.b, z4.b\n" + "subs %[loops], %[loops], #0x1\n" + ".inst 0x45059809 // smmla z9.s, z0.b, z5.b\n" ".inst 0x4504987a // smmla z26.s, z3.b, z4.b\n" "ld1b z4.b, p0/z, [%[b_ptr]]\n" - ".inst 0x45059809 // smmla z9.s, z0.b, z5.b\n" ".inst 0x4505982f // smmla z15.s, z1.b, z5.b\n" ".inst 0x45059855 // smmla z21.s, z2.b, z5.b\n" ".inst 0x4505987b // smmla z27.s, z3.b, z5.b\n" @@ -152,18 +152,18 @@ void sve_interleaved_s8s32_mmla_3VLx8(const int8_t *Apanel, const int8_t *Bpanel ".inst 0x45079859 // smmla z25.s, z2.b, z7.b\n" "ld1rqb z2.b, p0/z, [%[a_ptr], #-0x20]\n" ".inst 0x4507987f // smmla z31.s, z3.b, z7.b\n" - "ld1b z7.b, p0/z, [%[b_ptr], #-1, MUL VL]\n" - "ld1rqb z3.b, p0/z, [%[a_ptr], #-0x10]\n" "b.ne 2b\n" "1:\n" "cbz %[tails], 3f\n" ".inst 0x45049808 // smmla z8.s, z0.b, z4.b\n" + "ld1b z7.b, p0/z, [%[b_ptr], #-1, MUL VL]\n" ".inst 0x4504982e // smmla z14.s, z1.b, z4.b\n" + "ld1rqb z3.b, p0/z, [%[a_ptr], #-0x10]\n" ".inst 0x45049854 // smmla z20.s, z2.b, z4.b\n" - ".inst 0x4504987a // smmla z26.s, z3.b, z4.b\n" - "ld1b z4.b, p0/z, [%[b_ptr]]\n" ".inst 0x45059809 // smmla z9.s, z0.b, z5.b\n" ".inst 0x4505982f // smmla z15.s, z1.b, z5.b\n" + ".inst 0x4504987a // smmla z26.s, z3.b, z4.b\n" + "ld1b z4.b, p0/z, [%[b_ptr]]\n" ".inst 0x45059855 // smmla z21.s, z2.b, z5.b\n" ".inst 0x4505987b // smmla z27.s, z3.b, z5.b\n" "ld1b z5.b, p0/z, [%[b_ptr], #1, MUL VL]\n" @@ -269,15 +269,17 @@ void sve_interleaved_s8s32_mmla_3VLx8(const int8_t *Apanel, const int8_t *Bpanel "b 4f\n" "3:\n" ".inst 0x45049808 // smmla z8.s, z0.b, z4.b\n" - "add %[a_ptr], %[a_ptr], #0x40\n" + "ld1b z7.b, 
p0/z, [%[b_ptr], #-1, MUL VL]\n" ".inst 0x4504982e // smmla z14.s, z1.b, z4.b\n" - "addvl %[b_ptr], %[b_ptr], #8\n" + "ld1rqb z3.b, p0/z, [%[a_ptr], #-0x10]\n" ".inst 0x45049854 // smmla z20.s, z2.b, z4.b\n" - ".inst 0x4504987a // smmla z26.s, z3.b, z4.b\n" + "add %[a_ptr], %[a_ptr], #0x40\n" ".inst 0x45059809 // smmla z9.s, z0.b, z5.b\n" - "ld1b z4.b, p0/z, [%[b_ptr], #-8, MUL VL]\n" + "addvl %[b_ptr], %[b_ptr], #8\n" + ".inst 0x4504987a // smmla z26.s, z3.b, z4.b\n" ".inst 0x4505982f // smmla z15.s, z1.b, z5.b\n" ".inst 0x45059855 // smmla z21.s, z2.b, z5.b\n" + "ld1b z4.b, p0/z, [%[b_ptr], #-8, MUL VL]\n" ".inst 0x4505987b // smmla z27.s, z3.b, z5.b\n" "ld1b z5.b, p0/z, [%[b_ptr], #-7, MUL VL]\n" ".inst 0x4506980a // smmla z10.s, z0.b, z6.b\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8.hpp index d3c8851154..252f38ec63 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -61,7 +61,10 @@ public: kern_type kernel=sve_interleaved_u8u32_dot_3VLx8; - interleaved_u8u32_dot_3VLx8(const CPUInfo *ci) { UNUSED(ci); } + interleaved_u8u32_dot_3VLx8(const CPUInfo *) + { + + } }; } // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8/generic.cpp index f4d33a9efa..6626f8463b 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. 
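
These dot and mmla sources all share the driver shape spelled out in the new fp32 mmla file above: K arrives pre-divided by k_unroll, one unrolled iteration's loads are peeled into the preamble ahead of "cbz %[loops]", and a tails flag selects between the odd-K and even-K epilogues (the paths these hunks reschedule). A hypothetical scalar skeleton, with names of my own choosing:

#include <cstdint>

// Illustrative-only skeleton of the loops/tails split.
void kernel_driver_shape(int K) {          // K already divided by k_unroll
    int64_t loops = (K / 2) - 1;           // main-loop trips, two K-steps each
    int64_t tails = K % 2;                 // is one odd K-step left over?
    // preamble: zero the accumulators, issue the first trip's loads
    while (loops-- > 0) {
        // label 2: two unrolled K-steps; the next trip's ld1b/ld1rqb are
        // hoisted ahead of the dependent dot/mmla instructions
    }
    if (tails) {
        // long epilogue: finish the in-flight trip plus the odd step
    } else {
        // label 3: finish only the in-flight trip
    }
    // label 4: de-interleave (mmla variants) and store the accumulators
}
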
* * SPDX-License-Identifier: MIT * @@ -51,20 +51,20 @@ void sve_interleaved_u8u32_dot_3VLx8(const uint8_t *Apanel, const uint8_t *Bpane "mov z9.s, #0\n" "mov z10.s, #0\n" "mov z11.s, #0\n" - "mov z12.s, #0\n" "ld1rqb z0.b, p0/z, [%[a_ptr]]\n" - "mov z13.s, #0\n" + "mov z12.s, #0\n" "ld1b z4.b, p0/z, [%[b_ptr]]\n" - "mov z14.s, #0\n" + "mov z13.s, #0\n" "ld1rqb z1.b, p0/z, [%[a_ptr], #0x10]\n" - "mov z15.s, #0\n" + "mov z14.s, #0\n" "ld1b z5.b, p0/z, [%[b_ptr], #1, MUL VL]\n" - "mov z16.s, #0\n" + "mov z15.s, #0\n" "ld1rqb z2.b, p0/z, [%[a_ptr], #0x20]\n" - "mov z17.s, #0\n" + "mov z16.s, #0\n" "add %[a_ptr], %[a_ptr], #0x40\n" - "mov z18.s, #0\n" + "mov z17.s, #0\n" "addvl %[b_ptr], %[b_ptr], #3\n" + "mov z18.s, #0\n" "mov z19.s, #0\n" "mov z20.s, #0\n" "mov z21.s, #0\n" @@ -208,8 +208,8 @@ void sve_interleaved_u8u32_dot_3VLx8(const uint8_t *Apanel, const uint8_t *Bpane "udot z9.s, z4.b, z0.b[1]\n" "udot z10.s, z4.b, z0.b[2]\n" "udot z11.s, z4.b, z0.b[3]\n" - "udot z20.s, z4.b, z1.b[0]\n" "st1w z8.s, p0, [%[c_ptr]]\n" + "udot z20.s, z4.b, z1.b[0]\n" "udot z21.s, z4.b, z1.b[1]\n" "udot z22.s, z4.b, z1.b[2]\n" "udot z23.s, z4.b, z1.b[3]\n" @@ -217,8 +217,8 @@ void sve_interleaved_u8u32_dot_3VLx8(const uint8_t *Apanel, const uint8_t *Bpane "udot z13.s, z5.b, z0.b[1]\n" "udot z14.s, z5.b, z0.b[2]\n" "udot z15.s, z5.b, z0.b[3]\n" - "udot z24.s, z5.b, z1.b[0]\n" "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n" + "udot z24.s, z5.b, z1.b[0]\n" "udot z25.s, z5.b, z1.b[1]\n" "udot z26.s, z5.b, z1.b[2]\n" "udot z27.s, z5.b, z1.b[3]\n" @@ -226,10 +226,11 @@ void sve_interleaved_u8u32_dot_3VLx8(const uint8_t *Apanel, const uint8_t *Bpane "udot z17.s, z6.b, z0.b[1]\n" "udot z18.s, z6.b, z0.b[2]\n" "udot z19.s, z6.b, z0.b[3]\n" - "udot z28.s, z6.b, z1.b[0]\n" "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n" + "udot z28.s, z6.b, z1.b[0]\n" "udot z29.s, z6.b, z1.b[1]\n" "udot z30.s, z6.b, z1.b[2]\n" + "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n" "udot z31.s, z6.b, z1.b[3]\n" "b 4f\n" "3:\n" @@ -267,8 +268,8 @@ void sve_interleaved_u8u32_dot_3VLx8(const uint8_t *Apanel, const uint8_t *Bpane "udot z9.s, z4.b, z2.b[1]\n" "udot z10.s, z4.b, z2.b[2]\n" "udot z11.s, z4.b, z2.b[3]\n" - "udot z20.s, z4.b, z3.b[0]\n" "st1w z8.s, p0, [%[c_ptr]]\n" + "udot z20.s, z4.b, z3.b[0]\n" "udot z21.s, z4.b, z3.b[1]\n" "udot z22.s, z4.b, z3.b[2]\n" "udot z23.s, z4.b, z3.b[3]\n" @@ -276,8 +277,8 @@ void sve_interleaved_u8u32_dot_3VLx8(const uint8_t *Apanel, const uint8_t *Bpane "udot z13.s, z5.b, z2.b[1]\n" "udot z14.s, z5.b, z2.b[2]\n" "udot z15.s, z5.b, z2.b[3]\n" - "udot z24.s, z5.b, z3.b[0]\n" "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n" + "udot z24.s, z5.b, z3.b[0]\n" "udot z25.s, z5.b, z3.b[1]\n" "udot z26.s, z5.b, z3.b[2]\n" "udot z27.s, z5.b, z3.b[3]\n" @@ -285,13 +286,13 @@ void sve_interleaved_u8u32_dot_3VLx8(const uint8_t *Apanel, const uint8_t *Bpane "udot z17.s, z6.b, z2.b[1]\n" "udot z18.s, z6.b, z2.b[2]\n" "udot z19.s, z6.b, z2.b[3]\n" - "udot z28.s, z6.b, z3.b[0]\n" "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n" + "udot z28.s, z6.b, z3.b[0]\n" "udot z29.s, z6.b, z3.b[1]\n" "udot z30.s, z6.b, z3.b[2]\n" + "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n" "udot z31.s, z6.b, z3.b[3]\n" "4:\n" - "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n" "st1w z13.s, p0, [%[c_ptr], #4, MUL VL]\n" "st1w z17.s, p0, [%[c_ptr], #5, MUL VL]\n" "st1w z10.s, p0, [%[c_ptr], #6, MUL VL]\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_3VLx8.hpp index 
9b5ca1049e..ed44a9d8fc 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_3VLx8.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_3VLx8.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -61,9 +61,9 @@ public: kern_type kernel=sve_interleaved_u8u32_mmla_3VLx8; - interleaved_u8u32_mmla_3VLx8(const CPUInfo *ci) + interleaved_u8u32_mmla_3VLx8(const CPUInfo *) { - UNUSED(ci); + } }; diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_3VLx8/generic.cpp index 15cc8fb897..81a1dbcf51 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_3VLx8/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_3VLx8/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -63,13 +63,11 @@ void sve_interleaved_u8u32_mmla_3VLx8(const uint8_t *Apanel, const uint8_t *Bpan "mov z16.s, #0\n" "ld1b z6.b, p0/z, [%[b_ptr], #2, MUL VL]\n" "mov z17.s, #0\n" - "ld1rqb z3.b, p0/z, [%[a_ptr], #0x30]\n" + "add %[a_ptr], %[a_ptr], #0x40\n" "mov z18.s, #0\n" - "ld1b z7.b, p0/z, [%[b_ptr], #3, MUL VL]\n" + "addvl %[b_ptr], %[b_ptr], #4\n" "mov z19.s, #0\n" - "add %[a_ptr], %[a_ptr], #0x40\n" "mov z20.s, #0\n" - "addvl %[b_ptr], %[b_ptr], #4\n" "mov z21.s, #0\n" "mov z22.s, #0\n" "mov z23.s, #0\n" @@ -84,12 +82,14 @@ void sve_interleaved_u8u32_mmla_3VLx8(const uint8_t *Apanel, const uint8_t *Bpan "cbz %[loops], 1f\n" "2:\n" ".inst 0x45c49808 // ummla z8.s, z0.b, z4.b\n" - "subs %[loops], %[loops], #0x1\n" + "ld1b z7.b, p0/z, [%[b_ptr], #-1, MUL VL]\n" ".inst 0x45c4982e // ummla z14.s, z1.b, z4.b\n" + "ld1rqb z3.b, p0/z, [%[a_ptr], #-0x10]\n" ".inst 0x45c49854 // ummla z20.s, z2.b, z4.b\n" + "subs %[loops], %[loops], #0x1\n" + ".inst 0x45c59809 // ummla z9.s, z0.b, z5.b\n" ".inst 0x45c4987a // ummla z26.s, z3.b, z4.b\n" "ld1b z4.b, p0/z, [%[b_ptr]]\n" - ".inst 0x45c59809 // ummla z9.s, z0.b, z5.b\n" ".inst 0x45c5982f // ummla z15.s, z1.b, z5.b\n" ".inst 0x45c59855 // ummla z21.s, z2.b, z5.b\n" ".inst 0x45c5987b // ummla z27.s, z3.b, z5.b\n" @@ -152,18 +152,18 @@ void sve_interleaved_u8u32_mmla_3VLx8(const uint8_t *Apanel, const uint8_t *Bpan ".inst 0x45c79859 // ummla z25.s, z2.b, z7.b\n" "ld1rqb z2.b, p0/z, [%[a_ptr], #-0x20]\n" ".inst 0x45c7987f // ummla z31.s, z3.b, z7.b\n" - "ld1b z7.b, p0/z, [%[b_ptr], #-1, MUL VL]\n" - "ld1rqb z3.b, p0/z, [%[a_ptr], #-0x10]\n" "b.ne 2b\n" "1:\n" "cbz %[tails], 3f\n" ".inst 0x45c49808 // ummla z8.s, z0.b, z4.b\n" + "ld1b z7.b, p0/z, [%[b_ptr], #-1, MUL VL]\n" ".inst 0x45c4982e // ummla z14.s, z1.b, z4.b\n" + "ld1rqb z3.b, p0/z, [%[a_ptr], #-0x10]\n" ".inst 0x45c49854 // ummla z20.s, z2.b, z4.b\n" - ".inst 0x45c4987a // ummla z26.s, z3.b, z4.b\n" - "ld1b z4.b, p0/z, [%[b_ptr]]\n" ".inst 0x45c59809 // ummla z9.s, z0.b, z5.b\n" ".inst 0x45c5982f // ummla z15.s, z1.b, z5.b\n" + ".inst 0x45c4987a // ummla z26.s, z3.b, z4.b\n" + "ld1b z4.b, p0/z, [%[b_ptr]]\n" ".inst 0x45c59855 // ummla z21.s, z2.b, z5.b\n" ".inst 0x45c5987b // ummla z27.s, z3.b, z5.b\n" "ld1b z5.b, p0/z, [%[b_ptr], #1, MUL VL]\n" @@ -269,15 +269,17 @@ void sve_interleaved_u8u32_mmla_3VLx8(const uint8_t *Apanel, const uint8_t *Bpan "b 4f\n" "3:\n" ".inst 0x45c49808 // ummla z8.s, z0.b, z4.b\n" - "add %[a_ptr], %[a_ptr], #0x40\n" + "ld1b z7.b, 
p0/z, [%[b_ptr], #-1, MUL VL]\n" ".inst 0x45c4982e // ummla z14.s, z1.b, z4.b\n" - "addvl %[b_ptr], %[b_ptr], #8\n" + "ld1rqb z3.b, p0/z, [%[a_ptr], #-0x10]\n" ".inst 0x45c49854 // ummla z20.s, z2.b, z4.b\n" - ".inst 0x45c4987a // ummla z26.s, z3.b, z4.b\n" + "add %[a_ptr], %[a_ptr], #0x40\n" ".inst 0x45c59809 // ummla z9.s, z0.b, z5.b\n" - "ld1b z4.b, p0/z, [%[b_ptr], #-8, MUL VL]\n" + "addvl %[b_ptr], %[b_ptr], #8\n" + ".inst 0x45c4987a // ummla z26.s, z3.b, z4.b\n" ".inst 0x45c5982f // ummla z15.s, z1.b, z5.b\n" ".inst 0x45c59855 // ummla z21.s, z2.b, z5.b\n" + "ld1b z4.b, p0/z, [%[b_ptr], #-8, MUL VL]\n" ".inst 0x45c5987b // ummla z27.s, z3.b, z5.b\n" "ld1b z5.b, p0/z, [%[b_ptr], #-7, MUL VL]\n" ".inst 0x45c6980a // ummla z10.s, z0.b, z6.b\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_bf16fp32_dot_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_bf16fp32_dot_4VLx4.hpp index 59103d2407..6738809934 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_bf16fp32_dot_4VLx4.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_bf16fp32_dot_4VLx4.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -78,7 +78,10 @@ public: // Default to the generic kernel kern_type kernel=sve_native_bf16fp32_dot_4VLx4; - native_bf16fp32_dot_4VLx4(const CPUInfo *ci) { UNUSED(ci); } + native_bf16fp32_dot_4VLx4(const CPUInfo *) + { + + } }; } // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_bf16fp32_dot_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_bf16fp32_dot_4VLx4/generic.cpp index ce1971b2c5..d3bd89b8c5 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_bf16fp32_dot_4VLx4/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_bf16fp32_dot_4VLx4/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -62,12 +62,23 @@ void sve_native_bf16fp32_dot_4VLx4(const bfloat16 *A, int lda, const bfloat16 *B break; } - for (int y=0; y<M; y+=4) { + int rows_to_compute; + + for (int y=0; y<M; y+=rows_to_compute) { const bfloat16 * const a_ptr0_base = A + (y * lda); const unsigned long ldab = lda * sizeof(bfloat16); float *c_ptr0 = C + (y * ldc); + rows_to_compute = M-y; + if (rows_to_compute > 4) { + if (rows_to_compute % 4) { + rows_to_compute = 4 - 1; + } else { + rows_to_compute = 4; + } + } + for (int x0=0; x0<N; x0+=(4 * get_vector_length<float>())) { const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<float>())); long loops = loops_count; @@ -82,7 +93,7 @@ void sve_native_bf16fp32_dot_4VLx4(const bfloat16 *A, int lda, const bfloat16 *B const unsigned long ldcb = ldc * sizeof(float); const float *biasptr = bias ? 
bias+x0 : nullbias; - switch(M-y) { + switch(rows_to_compute) { case 1: __asm __volatile ( "whilelt p6.h, %[temp], %[leftovers]\n" @@ -235,46 +246,46 @@ void sve_native_bf16fp32_dot_4VLx4(const bfloat16 *A, int lda, const bfloat16 *B "b.ne 2b\n" "1:\n" "zip1 z12.h, z13.h, z14.h\n" - "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n" "zip2 z13.h, z13.h, z14.h\n" "cbz %[regs], 3f\n" ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n" - "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" + "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n" ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n" - "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n" + "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n" ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n" "ld1h z9.h, p4/z, [%[b_ptr0]]\n" "zip1 z14.h, z15.h, z8.h\n" "ld1h z11.h, p5/z, [%[b_ptr0], #1, MUL VL]\n" "zip2 z15.h, z15.h, z8.h\n" "add %[b_ptr1], %[b_ptr1], %[ldb]\n" - ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n" - "ld1h z10.h, p4/z, [%[b_ptr1]]\n" ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n" - "ld1h z12.h, p5/z, [%[b_ptr1], #1, MUL VL]\n" + "ld1h z10.h, p4/z, [%[b_ptr1]]\n" ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n" + "ld1h z12.h, p5/z, [%[b_ptr1], #1, MUL VL]\n" + ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n" "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "zip1 z8.h, z9.h, z10.h\n" "ld1h z13.h, p4/z, [%[b_ptr0]]\n" "zip2 z9.h, z9.h, z10.h\n" - "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n" "zip1 z10.h, z11.h, z12.h\n" - "ld1h z14.h, p4/z, [%[b_ptr1]]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" "zip2 z11.h, z11.h, z12.h\n" - ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n" - "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1h z14.h, p4/z, [%[b_ptr1]]\n" ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n" "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n" - "zip1 z12.h, z13.h, z14.h\n" + ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n" "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "zip1 z12.h, z13.h, z14.h\n" + "ld1h z9.h, p4/z, [%[b_ptr0]]\n" "zip2 z13.h, z13.h, z14.h\n" "add %[b_ptr1], %[b_ptr1], %[ldb]\n" "zip1 z14.h, z15.h, z8.h\n" "zip2 z15.h, z15.h, z8.h\n" - ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n" - "ld1h z9.h, p4/z, [%[b_ptr0]]\n" ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n" "ld1h z10.h, p4/z, [%[b_ptr1]]\n" ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n" @@ -452,42 +463,43 @@ void sve_native_bf16fp32_dot_4VLx4(const bfloat16 *A, int lda, const bfloat16 *B "b 7f\n" "3:\n" ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n" - "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n" + "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n" ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n" - "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n" + "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n" ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n" + "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n" + ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n" "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "zip1 z14.h, z15.h, z8.h\n" "ld1h z9.h, p4/z, [%[b_ptr0]]\n" "zip2 z15.h, z15.h, z8.h\n" - "add %[b_ptr1], %[b_ptr1], %[ldb]\n" - ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n" - "ld1h z10.h, p4/z, [%[b_ptr1]]\n" - ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n" "ld1h z11.h, p5/z, [%[b_ptr0], #1, MUL VL]\n" + ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n" + 
"add %[b_ptr1], %[b_ptr1], %[ldb]\n" ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n" - "ld1h z12.h, p5/z, [%[b_ptr1], #1, MUL VL]\n" + "ld1h z10.h, p4/z, [%[b_ptr1]]\n" ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n" + "ld1h z12.h, p5/z, [%[b_ptr1], #1, MUL VL]\n" + ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n" "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "zip1 z8.h, z9.h, z10.h\n" "ld1h z13.h, p4/z, [%[b_ptr0]]\n" "zip2 z9.h, z9.h, z10.h\n" - "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n" "zip1 z10.h, z11.h, z12.h\n" - "ld1h z14.h, p4/z, [%[b_ptr1]]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" "zip2 z11.h, z11.h, z12.h\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n" - "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n" - "zip1 z12.h, z13.h, z14.h\n" - "zip2 z13.h, z13.h, z14.h\n" + "ld1h z14.h, p4/z, [%[b_ptr1]]\n" ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n" "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n" ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n" - ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n" - ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n" + "addvl %[a_ptr0], %[a_ptr0], #1\n" + "zip1 z12.h, z13.h, z14.h\n" + "zip2 z13.h, z13.h, z14.h\n" "zip1 z14.h, z15.h, z8.h\n" "zip2 z15.h, z15.h, z8.h\n" + ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n" + ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n" ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n" ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n" ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n" @@ -666,37 +678,37 @@ void sve_native_bf16fp32_dot_4VLx4(const bfloat16 *A, int lda, const bfloat16 *B "zip2 z11.h, z11.h, z12.h\n" "ld1h z13.h, p4/z, [%[b_ptr0]]\n" "ld1h z14.h, p4/z, [%[b_ptr1]]\n" - "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n" "mov z23.d, z19.d\n" "cbz %[loops], 1f\n" "2:\n" "zip1 z12.h, z13.h, z14.h\n" - "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" + "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n" "zip2 z13.h, z13.h, z14.h\n" - "ld1rqh z5.h, p7/z, [a_ptr1]\n" + "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n" - "subs %[loops], %[loops], #0x1\n" + "ld1rqh z5.h, p7/z, [a_ptr1]\n" ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n" "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n" ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "subs %[loops], %[loops], #0x1\n" ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n" - "ld1h z9.h, p4/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "zip1 z14.h, z15.h, z8.h\n" - "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "ld1h z9.h, p4/z, [%[b_ptr0]]\n" "zip2 z15.h, z15.h, z8.h\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n" - "add a_ptr1, a_ptr1, #0x20\n" + "add %[a_ptr0], %[a_ptr0], #0x20\n" ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n" "ld1h z10.h, p4/z, [%[b_ptr1]]\n" ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n" + "add a_ptr1, a_ptr1, #0x20\n" ".inst 0x64614177 // bfdot z23.s, z11.h, z1.h[0]\n" "ld1h z11.h, p5/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "zip1 z8.h, z9.h, z10.h\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "zip2 z9.h, z9.h, z10.h\n" + ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n" ".inst 0x64694194 // bfdot z20.s, z12.h, z1.h[1]\n" "ld1h z12.h, p5/z, [%[b_ptr1], #1, MUL VL]\n" ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n" @@ 
-820,26 +832,26 @@ void sve_native_bf16fp32_dot_4VLx4(const bfloat16 *A, int lda, const bfloat16 *B "ld1h z14.h, p4/z, [%[b_ptr1]]\n" ".inst 0x647c41f3 // bfdot z19.s, z15.h, z4.h[3]\n" ".inst 0x647d41f7 // bfdot z23.s, z15.h, z5.h[3]\n" - "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n" "b.ne 2b\n" "1:\n" "zip1 z12.h, z13.h, z14.h\n" "zip2 z13.h, z13.h, z14.h\n" "cbz %[regs], 3f\n" ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n" - "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" + "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n" ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n" - "ld1rqh z5.h, p7/z, [a_ptr1]\n" + "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n" - "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n" + "ld1rqh z5.h, p7/z, [a_ptr1]\n" ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n" ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n" "ld1h z9.h, p4/z, [%[b_ptr0]]\n" "zip1 z14.h, z15.h, z8.h\n" "add %[b_ptr1], %[b_ptr1], %[ldb]\n" "zip2 z15.h, z15.h, z8.h\n" - ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n" "ld1h z10.h, p4/z, [%[b_ptr1]]\n" ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n" ".inst 0x64614177 // bfdot z23.s, z11.h, z1.h[0]\n" @@ -1103,28 +1115,29 @@ void sve_native_bf16fp32_dot_4VLx4(const bfloat16 *A, int lda, const bfloat16 *B "b 7f\n" "3:\n" ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n" - "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n" + "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n" ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n" "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n" ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n" - "ld1rqh z5.h, p6/z, [a_ptr1]\n" + "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n" ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n" + "ld1rqh z5.h, p6/z, [a_ptr1]\n" + ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n" "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "zip1 z14.h, z15.h, z8.h\n" "ld1h z9.h, p4/z, [%[b_ptr0]]\n" "zip2 z15.h, z15.h, z8.h\n" "add %[b_ptr1], %[b_ptr1], %[ldb]\n" - ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n" "ld1h z10.h, p4/z, [%[b_ptr1]]\n" ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n" - "addvl a_ptr1, a_ptr1, #1\n" + "addvl %[a_ptr0], %[a_ptr0], #1\n" ".inst 0x64614177 // bfdot z23.s, z11.h, z1.h[0]\n" "ld1h z11.h, p5/z, [%[b_ptr0], #1, MUL VL]\n" "zip1 z8.h, z9.h, z10.h\n" "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "zip2 z9.h, z9.h, z10.h\n" + "addvl a_ptr1, a_ptr1, #1\n" ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n" ".inst 0x64694194 // bfdot z20.s, z12.h, z1.h[1]\n" "ld1h z12.h, p5/z, [%[b_ptr1], #1, MUL VL]\n" @@ -1386,34 +1399,34 @@ void sve_native_bf16fp32_dot_4VLx4(const bfloat16 *A, int lda, const bfloat16 *B "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "mov z27.d, z19.d\n" "ld1h z13.h, p4/z, [%[b_ptr0]]\n" - "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n" "add %[b_ptr1], %[b_ptr1], %[ldb]\n" "ld1h z14.h, p4/z, [%[b_ptr1]]\n" "cbz %[loops], 1f\n" "2:\n" ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n" - "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" + "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n" "zip1 z12.h, z13.h, z14.h\n" - "ld1rqh z5.h, p7/z, [a_ptr1]\n" + "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" "zip2 z13.h, z13.h, z14.h\n" - "ld1rqh z6.h, p7/z, [a_ptr2]\n" + "ld1rqh z5.h, p7/z, [a_ptr1]\n" ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n" - "subs 
%[loops], %[loops], #0x1\n" + "ld1rqh z6.h, p7/z, [a_ptr2]\n" ".inst 0x64624118 // bfdot z24.s, z8.h, z2.h[0]\n" "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n" ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "subs %[loops], %[loops], #0x1\n" ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n" - "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "zip1 z14.h, z15.h, z8.h\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" "zip2 z15.h, z15.h, z8.h\n" - "add a_ptr1, a_ptr1, #0x20\n" + "add %[a_ptr0], %[a_ptr0], #0x20\n" ".inst 0x64624139 // bfdot z25.s, z9.h, z2.h[0]\n" "ld1h z9.h, p4/z, [%[b_ptr0]]\n" ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n" - "add a_ptr2, a_ptr2, #0x20\n" + "add a_ptr1, a_ptr1, #0x20\n" ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n" + "add a_ptr2, a_ptr2, #0x20\n" ".inst 0x6462415a // bfdot z26.s, z10.h, z2.h[0]\n" "ld1h z10.h, p4/z, [%[b_ptr1]]\n" ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n" @@ -1576,28 +1589,28 @@ void sve_native_bf16fp32_dot_4VLx4(const bfloat16 *A, int lda, const bfloat16 *B ".inst 0x647c41f3 // bfdot z19.s, z15.h, z4.h[3]\n" ".inst 0x647d41f7 // bfdot z23.s, z15.h, z5.h[3]\n" ".inst 0x647e41fb // bfdot z27.s, z15.h, z6.h[3]\n" - "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n" "b.ne 2b\n" "1:\n" "zip1 z12.h, z13.h, z14.h\n" "zip2 z13.h, z13.h, z14.h\n" "cbz %[regs], 3f\n" ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n" - "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" + "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n" ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n" - "ld1rqh z5.h, p7/z, [a_ptr1]\n" + "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" ".inst 0x64624118 // bfdot z24.s, z8.h, z2.h[0]\n" - "ld1rqh z6.h, p7/z, [a_ptr2]\n" + "ld1rqh z5.h, p7/z, [a_ptr1]\n" ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n" - "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n" + "ld1rqh z6.h, p7/z, [a_ptr2]\n" ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n" ".inst 0x64624139 // bfdot z25.s, z9.h, z2.h[0]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n" "ld1h z9.h, p4/z, [%[b_ptr0]]\n" "zip1 z14.h, z15.h, z8.h\n" "add %[b_ptr1], %[b_ptr1], %[ldb]\n" "zip2 z15.h, z15.h, z8.h\n" - ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n" ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n" ".inst 0x6462415a // bfdot z26.s, z10.h, z2.h[0]\n" "ld1h z10.h, p4/z, [%[b_ptr1]]\n" @@ -1922,35 +1935,36 @@ void sve_native_bf16fp32_dot_4VLx4(const bfloat16 *A, int lda, const bfloat16 *B "b 7f\n" "3:\n" ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n" - "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n" + "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n" ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n" - "ld1rqh z5.h, p6/z, [a_ptr1]\n" + "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n" ".inst 0x64624118 // bfdot z24.s, z8.h, z2.h[0]\n" "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n" ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n" - "ld1rqh z6.h, p6/z, [a_ptr2]\n" + "ld1rqh z5.h, p6/z, [a_ptr1]\n" ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n" + "ld1rqh z6.h, p6/z, [a_ptr2]\n" + ".inst 0x64624139 // bfdot z25.s, z9.h, z2.h[0]\n" "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "zip1 z14.h, z15.h, z8.h\n" - "add %[b_ptr1], %[b_ptr1], %[ldb]\n" - "zip2 z15.h, z15.h, z8.h\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - ".inst 0x64624139 // bfdot z25.s, z9.h, z2.h[0]\n" "ld1h z9.h, p4/z, [%[b_ptr0]]\n" + "zip2 
z15.h, z15.h, z8.h\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n" - "addvl a_ptr1, a_ptr1, #1\n" + "addvl %[a_ptr0], %[a_ptr0], #1\n" ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n" - "addvl a_ptr2, a_ptr2, #1\n" + "addvl a_ptr1, a_ptr1, #1\n" ".inst 0x6462415a // bfdot z26.s, z10.h, z2.h[0]\n" "ld1h z10.h, p4/z, [%[b_ptr1]]\n" ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n" + "addvl a_ptr2, a_ptr2, #1\n" ".inst 0x64614177 // bfdot z23.s, z11.h, z1.h[0]\n" - ".inst 0x6462417b // bfdot z27.s, z11.h, z2.h[0]\n" - "ld1h z11.h, p5/z, [%[b_ptr0], #1, MUL VL]\n" "zip1 z8.h, z9.h, z10.h\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "zip2 z9.h, z9.h, z10.h\n" + ".inst 0x6462417b // bfdot z27.s, z11.h, z2.h[0]\n" + "ld1h z11.h, p5/z, [%[b_ptr0], #1, MUL VL]\n" ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" ".inst 0x64694194 // bfdot z20.s, z12.h, z1.h[1]\n" ".inst 0x646a4198 // bfdot z24.s, z12.h, z2.h[1]\n" "ld1h z12.h, p5/z, [%[b_ptr1], #1, MUL VL]\n" @@ -2276,7 +2290,6 @@ void sve_native_bf16fp32_dot_4VLx4(const bfloat16 *A, int lda, const bfloat16 *B "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "mov z31.d, z19.d\n" "ld1h z13.h, p4/z, [%[b_ptr0]]\n" - "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n" "add %[b_ptr1], %[b_ptr1], %[ldb]\n" "ld1h z14.h, p4/z, [%[b_ptr1]]\n" "zip1 z12.h, z13.h, z14.h\n" @@ -2284,38 +2297,39 @@ void sve_native_bf16fp32_dot_4VLx4(const bfloat16 *A, int lda, const bfloat16 *B "cbz %[loops], 1f\n" "2:\n" ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n" - "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" + "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n" ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n" - "ld1rqh z5.h, p7/z, [a_ptr1]\n" + "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" ".inst 0x64624118 // bfdot z24.s, z8.h, z2.h[0]\n" - "ld1rqh z6.h, p7/z, [a_ptr2]\n" + "ld1rqh z5.h, p7/z, [a_ptr1]\n" ".inst 0x6463411c // bfdot z28.s, z8.h, z3.h[0]\n" - "ld1rqh z7.h, p7/z, [a_ptr3]\n" + "ld1rqh z6.h, p7/z, [a_ptr2]\n" ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n" - "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n" + "ld1rqh z7.h, p7/z, [a_ptr3]\n" ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n" - "subs %[loops], %[loops], #0x1\n" + "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n" ".inst 0x64624139 // bfdot z25.s, z9.h, z2.h[0]\n" + "subs %[loops], %[loops], #0x1\n" + ".inst 0x6463413d // bfdot z29.s, z9.h, z3.h[0]\n" "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "zip1 z14.h, z15.h, z8.h\n" - "add %[b_ptr1], %[b_ptr1], %[ldb]\n" - "zip2 z15.h, z15.h, z8.h\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - ".inst 0x6463413d // bfdot z29.s, z9.h, z3.h[0]\n" "ld1h z9.h, p4/z, [%[b_ptr0]]\n" + "zip2 z15.h, z15.h, z8.h\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n" - "add a_ptr1, a_ptr1, #0x20\n" + "add %[a_ptr0], %[a_ptr0], #0x20\n" ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n" - "add a_ptr2, a_ptr2, #0x20\n" + "add a_ptr1, a_ptr1, #0x20\n" ".inst 0x6462415a // bfdot z26.s, z10.h, z2.h[0]\n" - "add a_ptr3, a_ptr3, #0x20\n" + "add a_ptr2, a_ptr2, #0x20\n" ".inst 0x6463415e // bfdot z30.s, z10.h, z3.h[0]\n" "ld1h z10.h, p4/z, [%[b_ptr1]]\n" ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n" + "add a_ptr3, a_ptr3, #0x20\n" ".inst 0x64614177 // bfdot z23.s, z11.h, z1.h[0]\n" - ".inst 0x6462417b // bfdot z27.s, z11.h, z2.h[0]\n" "zip1 z8.h, z9.h, z10.h\n" "zip2 z9.h, z9.h, z10.h\n" + ".inst 0x6462417b // bfdot z27.s, z11.h, z2.h[0]\n" ".inst 0x6463417f // bfdot z31.s, z11.h, z3.h[0]\n" "ld1h 
z11.h, p5/z, [%[b_ptr0], #1, MUL VL]\n" ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n" @@ -2503,28 +2517,28 @@ void sve_native_bf16fp32_dot_4VLx4(const bfloat16 *A, int lda, const bfloat16 *B "zip1 z12.h, z13.h, z14.h\n" "zip2 z13.h, z13.h, z14.h\n" ".inst 0x647f41ff // bfdot z31.s, z15.h, z7.h[3]\n" - "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n" "b.ne 2b\n" "1:\n" "cbz %[regs], 3f\n" ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n" - "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" + "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n" ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n" - "ld1rqh z5.h, p7/z, [a_ptr1]\n" + "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" ".inst 0x64624118 // bfdot z24.s, z8.h, z2.h[0]\n" - "ld1rqh z6.h, p7/z, [a_ptr2]\n" + "ld1rqh z5.h, p7/z, [a_ptr1]\n" ".inst 0x6463411c // bfdot z28.s, z8.h, z3.h[0]\n" - "ld1rqh z7.h, p7/z, [a_ptr3]\n" + "ld1rqh z6.h, p7/z, [a_ptr2]\n" ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n" - "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n" + "ld1rqh z7.h, p7/z, [a_ptr3]\n" ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n" ".inst 0x64624139 // bfdot z25.s, z9.h, z2.h[0]\n" - "add %[b_ptr1], %[b_ptr1], %[ldb]\n" - "zip1 z14.h, z15.h, z8.h\n" - "zip2 z15.h, z15.h, z8.h\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" ".inst 0x6463413d // bfdot z29.s, z9.h, z3.h[0]\n" "ld1h z9.h, p4/z, [%[b_ptr0]]\n" + "zip1 z14.h, z15.h, z8.h\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "zip2 z15.h, z15.h, z8.h\n" ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n" ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n" ".inst 0x6462415a // bfdot z26.s, z10.h, z2.h[0]\n" @@ -2910,30 +2924,31 @@ void sve_native_bf16fp32_dot_4VLx4(const bfloat16 *A, int lda, const bfloat16 *B "b 7f\n" "3:\n" ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n" - "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n" + "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n" ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n" - "ld1rqh z5.h, p6/z, [a_ptr1]\n" + "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n" ".inst 0x64624118 // bfdot z24.s, z8.h, z2.h[0]\n" - "ld1rqh z6.h, p6/z, [a_ptr2]\n" + "ld1rqh z5.h, p6/z, [a_ptr1]\n" ".inst 0x6463411c // bfdot z28.s, z8.h, z3.h[0]\n" "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n" ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n" - "ld1rqh z7.h, p6/z, [a_ptr3]\n" + "ld1rqh z6.h, p6/z, [a_ptr2]\n" ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n" + "ld1rqh z7.h, p6/z, [a_ptr3]\n" + ".inst 0x64624139 // bfdot z25.s, z9.h, z2.h[0]\n" "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "zip1 z14.h, z15.h, z8.h\n" "add %[b_ptr1], %[b_ptr1], %[ldb]\n" "zip2 z15.h, z15.h, z8.h\n" "addvl %[a_ptr0], %[a_ptr0], #1\n" - ".inst 0x64624139 // bfdot z25.s, z9.h, z2.h[0]\n" - "addvl a_ptr1, a_ptr1, #1\n" ".inst 0x6463413d // bfdot z29.s, z9.h, z3.h[0]\n" "ld1h z9.h, p4/z, [%[b_ptr0]]\n" ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n" - "addvl a_ptr2, a_ptr2, #1\n" + "addvl a_ptr1, a_ptr1, #1\n" ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n" - "addvl a_ptr3, a_ptr3, #1\n" + "addvl a_ptr2, a_ptr2, #1\n" ".inst 0x6462415a // bfdot z26.s, z10.h, z2.h[0]\n" + "addvl a_ptr3, a_ptr3, #1\n" ".inst 0x6463415e // bfdot z30.s, z10.h, z3.h[0]\n" "ld1h z10.h, p4/z, [%[b_ptr1]]\n" ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp16_mla_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp16_mla_4VLx4.hpp index 741f200d25..665e8656d2 100644 --- 
a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp16_mla_4VLx4.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp16_mla_4VLx4.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -25,6 +25,9 @@ #ifdef __ARM_FEATURE_SVE + + + namespace arm_gemm { @@ -75,7 +78,10 @@ public: // Default to the generic kernel kern_type kernel=sve_native_fp16_mla_4VLx4; - native_fp16_mla_4VLx4(const CPUInfo *ci) { UNUSED(ci); } + native_fp16_mla_4VLx4(const CPUInfo *) + { + + } }; } // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp16_mla_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp16_mla_4VLx4/generic.cpp index 14dd38bd25..dd33c785cf 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp16_mla_4VLx4/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp16_mla_4VLx4/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -60,12 +60,23 @@ void sve_native_fp16_mla_4VLx4(const __fp16 *A, int lda, const __fp16 *B, int ld break; } - for (int y=0; y<M; y+=4) { + int rows_to_compute; + + for (int y=0; y<M; y+=rows_to_compute) { const __fp16 * const a_ptr0_base = A + (y * lda); const unsigned long ldab = lda * sizeof(__fp16); __fp16 *c_ptr0 = C + (y * ldc); + rows_to_compute = M-y; + if (rows_to_compute > 4) { + if (rows_to_compute % 4) { + rows_to_compute = 4 - 1; + } else { + rows_to_compute = 4; + } + } + for (int x0=0; x0<N; x0+=(4 * get_vector_length<__fp16>())) { const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<__fp16>())); long loops = loops_count; @@ -78,7 +89,7 @@ void sve_native_fp16_mla_4VLx4(const __fp16 *A, int lda, const __fp16 *B, int ld const unsigned long ldcb = ldc * sizeof(__fp16); const __fp16 *biasptr = bias ? 
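// Note on the rows_to_compute hunks in this patch -- a reading of the change, not
// text from the commit itself: the row loop used to step by a fixed 4 and dispatch
// the switch below on M-y, so an awkward 1- or 2-row tail could land at the end.
// The new logic instead peels 3-row blocks off the front until the rows left are a
// multiple of 4, then runs full 4-row blocks. A minimal restatement of the hunk
// above (the ternary is a rewrite of the patch's if/else, not the patch's wording):
//
//     int rows_to_compute = M - y;
//     if (rows_to_compute > 4) {
//         // the patch spells this "4 - 1": take 3 rows now so a later
//         // iteration sees a count that divides by 4
//         rows_to_compute = (rows_to_compute % 4) ? 3 : 4;
//     }
//
// For example, M = 9 is now split 3+3+3 rather than 4+4+1, and M = 10 becomes
// 3+3+4 rather than 4+4+2, so the single-row path is never taken when M > 4.
// The same transformation is applied to each native kernel in this patch (fp16,
// fp32, s8s32, u8u32). The large assembly hunks, by contrast, appear to be pure
// reschedules: every load and pointer increment removed from one position
// reappears a few instructions later, interleaved between the multiply-accumulate
// ops (e.g. the trailing z15 load at the bottom of each main loop moves past the
// "cbz %[regs], 3f" into the successor blocks), presumably to hide load latency
// without changing what the kernel computes.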
bias+x0 : nullbias; - switch(M-y) { + switch(rows_to_compute) { case 1: __asm __volatile ( "whilelt p6.h, %[temp], %[leftovers]\n" @@ -256,88 +267,87 @@ void sve_native_fp16_mla_4VLx4(const __fp16 *A, int lda, const __fp16 *B, int ld "ld1h z12.h, p0/z, [%[b_ptr0]]\n" "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n" "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n" "cbz %[regs], 3f\n" "fmla z16.h, z8.h, z0.h[0]\n" - "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" + "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n" "fmla z17.h, z9.h, z0.h[0]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" "fmla z18.h, z10.h, z0.h[0]\n" - "ld1h z8.h, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "fmla z19.h, z11.h, z0.h[0]\n" - "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1h z8.h, p0/z, [%[b_ptr0]]\n" "fmla z16.h, z12.h, z0.h[1]\n" - "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n" "fmla z17.h, z13.h, z0.h[1]\n" - "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n" "fmla z18.h, z14.h, z0.h[1]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n" "fmla z19.h, z15.h, z0.h[1]\n" - "ld1h z12.h, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "fmla z16.h, z8.h, z0.h[2]\n" - "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1h z12.h, p0/z, [%[b_ptr0]]\n" "fmla z17.h, z9.h, z0.h[2]\n" - "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n" "fmla z18.h, z10.h, z0.h[2]\n" - "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n" "fmla z19.h, z11.h, z0.h[2]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n" "fmla z16.h, z12.h, z0.h[3]\n" - "ld1h z8.h, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "fmla z17.h, z13.h, z0.h[3]\n" - "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1h z8.h, p0/z, [%[b_ptr0]]\n" "fmla z18.h, z14.h, z0.h[3]\n" - "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n" "fmla z19.h, z15.h, z0.h[3]\n" + "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n" "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[4]\n" "add %[b_ptr0], %[b_ptr0], %[ldb]\n" - "fmla z17.h, z9.h, z0.h[4]\n" + "fmla z16.h, z8.h, z0.h[4]\n" "ld1h z12.h, p0/z, [%[b_ptr0]]\n" - "fmla z18.h, z10.h, z0.h[4]\n" + "fmla z17.h, z9.h, z0.h[4]\n" "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z19.h, z11.h, z0.h[4]\n" + "fmla z18.h, z10.h, z0.h[4]\n" "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.h, z11.h, z0.h[4]\n" "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "fmla z16.h, z12.h, z0.h[5]\n" - "ld1h z8.h, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "fmla z17.h, z13.h, z0.h[5]\n" - "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1h z8.h, p0/z, [%[b_ptr0]]\n" "fmla z18.h, z14.h, z0.h[5]\n" - "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n" "fmla z19.h, z15.h, z0.h[5]\n" + "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n" "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[6]\n" "add %[b_ptr0], %[b_ptr0], %[ldb]\n" - "fmla z17.h, z9.h, z0.h[6]\n" + "fmla z16.h, z8.h, z0.h[6]\n" "ld1h z12.h, p0/z, [%[b_ptr0]]\n" - "fmla z18.h, z10.h, z0.h[6]\n" + "fmla z17.h, z9.h, z0.h[6]\n" "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n" - 
"fmla z19.h, z11.h, z0.h[6]\n" + "fmla z18.h, z10.h, z0.h[6]\n" "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.h, z11.h, z0.h[6]\n" "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "fmla z16.h, z12.h, z0.h[7]\n" - "ld1h z8.h, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "fmla z17.h, z13.h, z0.h[7]\n" - "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1h z8.h, p0/z, [%[b_ptr0]]\n" "fmla z18.h, z14.h, z0.h[7]\n" - "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n" "fmla z19.h, z15.h, z0.h[7]\n" + "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n" "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[0]\n" - "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n" - "fmla z17.h, z9.h, z4.h[0]\n" "add %[b_ptr0], %[b_ptr0], %[ldb]\n" - "fmla z18.h, z10.h, z4.h[0]\n" + "fmla z16.h, z8.h, z4.h[0]\n" "ld1h z12.h, p0/z, [%[b_ptr0]]\n" - "fmla z19.h, z11.h, z4.h[0]\n" + "fmla z17.h, z9.h, z4.h[0]\n" "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z18.h, z10.h, z4.h[0]\n" "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - "fmla z16.h, z12.h, z4.h[1]\n" + "fmla z19.h, z11.h, z4.h[0]\n" "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.h, z12.h, z4.h[1]\n" + "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n" "fmla z17.h, z13.h, z4.h[1]\n" "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "fmla z18.h, z14.h, z4.h[1]\n" @@ -345,51 +355,52 @@ void sve_native_fp16_mla_4VLx4(const __fp16 *A, int lda, const __fp16 *B, int ld "fmla z19.h, z15.h, z4.h[1]\n" "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n" "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "addvl %[a_ptr0], %[a_ptr0], #2\n" + "fmla z16.h, z8.h, z4.h[2]\n" "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z17.h, z9.h, z4.h[2]\n" "add %[b_ptr0], %[b_ptr0], %[ldb]\n" - "fmla z16.h, z8.h, z4.h[2]\n" + "fmla z18.h, z10.h, z4.h[2]\n" "ld1h z12.h, p0/z, [%[b_ptr0]]\n" - "fmla z17.h, z9.h, z4.h[2]\n" + "fmla z19.h, z11.h, z4.h[2]\n" "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.h, z10.h, z4.h[2]\n" "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.h, z11.h, z4.h[2]\n" "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.h, z12.h, z4.h[3]\n" "add %[b_ptr0], %[b_ptr0], %[ldb]\n" - "fmla z17.h, z13.h, z4.h[3]\n" + "fmla z16.h, z12.h, z4.h[3]\n" "ld1h z8.h, p0/z, [%[b_ptr0]]\n" - "fmla z18.h, z14.h, z4.h[3]\n" + "fmla z17.h, z13.h, z4.h[3]\n" "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z19.h, z15.h, z4.h[3]\n" + "fmla z18.h, z14.h, z4.h[3]\n" "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.h, z15.h, z4.h[3]\n" "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "fmla z16.h, z8.h, z4.h[4]\n" - "ld1h z12.h, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "fmla z17.h, z9.h, z4.h[4]\n" - "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1h z12.h, p0/z, [%[b_ptr0]]\n" "fmla z18.h, z10.h, z4.h[4]\n" - "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n" "fmla z19.h, z11.h, z4.h[4]\n" + "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n" "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.h, z12.h, z4.h[5]\n" "add %[b_ptr0], %[b_ptr0], %[ldb]\n" - "fmla z17.h, z13.h, z4.h[5]\n" + "fmla z16.h, z12.h, z4.h[5]\n" "ld1h z8.h, p0/z, [%[b_ptr0]]\n" - "fmla z18.h, z14.h, z4.h[5]\n" + "fmla z17.h, z13.h, z4.h[5]\n" "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z19.h, z15.h, z4.h[5]\n" + "fmla z18.h, z14.h, 
z4.h[5]\n" "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.h, z15.h, z4.h[5]\n" "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "fmla z16.h, z8.h, z4.h[6]\n" - "ld1h z12.h, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "fmla z17.h, z9.h, z4.h[6]\n" - "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1h z12.h, p0/z, [%[b_ptr0]]\n" "fmla z18.h, z10.h, z4.h[6]\n" - "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n" "fmla z19.h, z11.h, z4.h[6]\n" + "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n" "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n" "fmla z16.h, z12.h, z4.h[7]\n" "fmla z17.h, z13.h, z4.h[7]\n" @@ -474,66 +485,67 @@ void sve_native_fp16_mla_4VLx4(const __fp16 *A, int lda, const __fp16 *B, int ld "b 4f\n" "3:\n" "fmla z16.h, z8.h, z0.h[0]\n" - "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n" + "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n" "fmla z17.h, z9.h, z0.h[0]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n" "fmla z18.h, z10.h, z0.h[0]\n" - "ld1h z8.h, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "fmla z19.h, z11.h, z0.h[0]\n" - "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1h z8.h, p0/z, [%[b_ptr0]]\n" "fmla z16.h, z12.h, z0.h[1]\n" - "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n" "fmla z17.h, z13.h, z0.h[1]\n" - "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n" "fmla z18.h, z14.h, z0.h[1]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n" "fmla z19.h, z15.h, z0.h[1]\n" - "ld1h z12.h, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "fmla z16.h, z8.h, z0.h[2]\n" - "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1h z12.h, p0/z, [%[b_ptr0]]\n" "fmla z17.h, z9.h, z0.h[2]\n" - "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n" "fmla z18.h, z10.h, z0.h[2]\n" - "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n" "fmla z19.h, z11.h, z0.h[2]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n" "fmla z16.h, z12.h, z0.h[3]\n" - "ld1h z8.h, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "fmla z17.h, z13.h, z0.h[3]\n" - "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1h z8.h, p0/z, [%[b_ptr0]]\n" "fmla z18.h, z14.h, z0.h[3]\n" - "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n" "fmla z19.h, z15.h, z0.h[3]\n" + "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n" "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[4]\n" "add %[b_ptr0], %[b_ptr0], %[ldb]\n" - "fmla z17.h, z9.h, z0.h[4]\n" + "fmla z16.h, z8.h, z0.h[4]\n" "ld1h z12.h, p0/z, [%[b_ptr0]]\n" - "fmla z18.h, z10.h, z0.h[4]\n" + "fmla z17.h, z9.h, z0.h[4]\n" "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z19.h, z11.h, z0.h[4]\n" + "fmla z18.h, z10.h, z0.h[4]\n" "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.h, z11.h, z0.h[4]\n" "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "fmla z16.h, z12.h, z0.h[5]\n" - "ld1h z8.h, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "fmla z17.h, z13.h, z0.h[5]\n" - "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1h z8.h, p0/z, [%[b_ptr0]]\n" "fmla z18.h, z14.h, z0.h[5]\n" - "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n" "fmla 
z19.h, z15.h, z0.h[5]\n" + "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n" "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[6]\n" "add %[b_ptr0], %[b_ptr0], %[ldb]\n" - "fmla z17.h, z9.h, z0.h[6]\n" + "fmla z16.h, z8.h, z0.h[6]\n" "ld1h z12.h, p0/z, [%[b_ptr0]]\n" - "fmla z18.h, z10.h, z0.h[6]\n" + "fmla z17.h, z9.h, z0.h[6]\n" "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z19.h, z11.h, z0.h[6]\n" + "fmla z18.h, z10.h, z0.h[6]\n" "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "fmla z19.h, z11.h, z0.h[6]\n" "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" "fmla z16.h, z12.h, z0.h[7]\n" + "addvl %[a_ptr0], %[a_ptr0], #1\n" "fmla z17.h, z13.h, z0.h[7]\n" "fmla z18.h, z14.h, z0.h[7]\n" "fmla z19.h, z15.h, z0.h[7]\n" @@ -888,21 +900,21 @@ void sve_native_fp16_mla_4VLx4(const __fp16 *A, int lda, const __fp16 *B, int ld "fmla z23.h, z15.h, z5.h[7]\n" "b.ne 2b\n" "1:\n" - "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n" "cbz %[regs], 3f\n" "fmla z16.h, z8.h, z0.h[0]\n" - "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" + "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n" "fmla z20.h, z8.h, z1.h[0]\n" - "ld1rqh z5.h, p7/z, [a_ptr1]\n" + "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" "fmla z17.h, z9.h, z0.h[0]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1rqh z5.h, p7/z, [a_ptr1]\n" "fmla z21.h, z9.h, z1.h[0]\n" - "ld1h z8.h, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "fmla z18.h, z10.h, z0.h[0]\n" - "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1h z8.h, p0/z, [%[b_ptr0]]\n" "fmla z22.h, z10.h, z1.h[0]\n" - "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n" "fmla z19.h, z11.h, z0.h[0]\n" + "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n" "fmla z23.h, z11.h, z1.h[0]\n" "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n" "fmla z16.h, z12.h, z0.h[1]\n" @@ -1201,19 +1213,19 @@ void sve_native_fp16_mla_4VLx4(const __fp16 *A, int lda, const __fp16 *B, int ld "b 4f\n" "3:\n" "fmla z16.h, z8.h, z0.h[0]\n" - "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n" + "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n" "fmla z20.h, z8.h, z1.h[0]\n" - "ld1rqh z5.h, p6/z, [a_ptr1]\n" + "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n" "fmla z17.h, z9.h, z0.h[0]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1rqh z5.h, p6/z, [a_ptr1]\n" "fmla z21.h, z9.h, z1.h[0]\n" - "ld1h z8.h, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "fmla z18.h, z10.h, z0.h[0]\n" - "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1h z8.h, p0/z, [%[b_ptr0]]\n" "fmla z22.h, z10.h, z1.h[0]\n" - "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n" "fmla z19.h, z11.h, z0.h[0]\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" + "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n" "fmla z23.h, z11.h, z1.h[0]\n" "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n" "fmla z16.h, z12.h, z0.h[1]\n" @@ -1221,10 +1233,11 @@ void sve_native_fp16_mla_4VLx4(const __fp16 *A, int lda, const __fp16 *B, int ld "fmla z20.h, z12.h, z1.h[1]\n" "ld1h z12.h, p0/z, [%[b_ptr0]]\n" "fmla z17.h, z13.h, z0.h[1]\n" - "addvl a_ptr1, a_ptr1, #1\n" + "addvl %[a_ptr0], %[a_ptr0], #1\n" "fmla z21.h, z13.h, z1.h[1]\n" "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n" "fmla z18.h, z14.h, z0.h[1]\n" + "addvl a_ptr1, a_ptr1, #1\n" "fmla z22.h, z14.h, z1.h[1]\n" "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n" "fmla z19.h, z15.h, z0.h[1]\n" @@ -1509,9 +1522,9 @@ void sve_native_fp16_mla_4VLx4(const __fp16 *A, int lda, const __fp16 *B, int ld "fmla z27.h, z11.h, z2.h[0]\n" "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL 
VL]\n" "fmla z16.h, z12.h, z0.h[1]\n" - "add a_ptr2, a_ptr2, #0x20\n" - "fmla z20.h, z12.h, z1.h[1]\n" "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z20.h, z12.h, z1.h[1]\n" + "add a_ptr2, a_ptr2, #0x20\n" "fmla z24.h, z12.h, z2.h[1]\n" "ld1h z12.h, p0/z, [%[b_ptr0]]\n" "fmla z17.h, z13.h, z0.h[1]\n" @@ -1768,21 +1781,21 @@ void sve_native_fp16_mla_4VLx4(const __fp16 *A, int lda, const __fp16 *B, int ld "fmla z27.h, z15.h, z6.h[7]\n" "b.ne 2b\n" "1:\n" - "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n" "cbz %[regs], 3f\n" "fmla z16.h, z8.h, z0.h[0]\n" - "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" + "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n" "fmla z20.h, z8.h, z1.h[0]\n" - "ld1rqh z5.h, p7/z, [a_ptr1]\n" + "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" "fmla z24.h, z8.h, z2.h[0]\n" - "ld1rqh z6.h, p7/z, [a_ptr2]\n" + "ld1rqh z5.h, p7/z, [a_ptr1]\n" "fmla z17.h, z9.h, z0.h[0]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1rqh z6.h, p7/z, [a_ptr2]\n" "fmla z21.h, z9.h, z1.h[0]\n" - "ld1h z8.h, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "fmla z25.h, z9.h, z2.h[0]\n" - "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1h z8.h, p0/z, [%[b_ptr0]]\n" "fmla z18.h, z10.h, z0.h[0]\n" + "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n" "fmla z22.h, z10.h, z1.h[0]\n" "fmla z26.h, z10.h, z2.h[0]\n" "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n" @@ -2176,26 +2189,27 @@ void sve_native_fp16_mla_4VLx4(const __fp16 *A, int lda, const __fp16 *B, int ld "b 4f\n" "3:\n" "fmla z16.h, z8.h, z0.h[0]\n" - "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n" + "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n" "fmla z20.h, z8.h, z1.h[0]\n" - "ld1rqh z5.h, p6/z, [a_ptr1]\n" + "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n" "fmla z24.h, z8.h, z2.h[0]\n" - "ld1rqh z6.h, p6/z, [a_ptr2]\n" + "ld1rqh z5.h, p6/z, [a_ptr1]\n" "fmla z17.h, z9.h, z0.h[0]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1rqh z6.h, p6/z, [a_ptr2]\n" "fmla z21.h, z9.h, z1.h[0]\n" - "ld1h z8.h, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "fmla z25.h, z9.h, z2.h[0]\n" - "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1h z8.h, p0/z, [%[b_ptr0]]\n" "fmla z18.h, z10.h, z0.h[0]\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" + "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n" "fmla z22.h, z10.h, z1.h[0]\n" - "addvl a_ptr1, a_ptr1, #1\n" + "addvl %[a_ptr0], %[a_ptr0], #1\n" "fmla z26.h, z10.h, z2.h[0]\n" "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n" "fmla z19.h, z11.h, z0.h[0]\n" - "addvl a_ptr2, a_ptr2, #1\n" + "addvl a_ptr1, a_ptr1, #1\n" "fmla z23.h, z11.h, z1.h[0]\n" + "addvl a_ptr2, a_ptr2, #1\n" "fmla z27.h, z11.h, z2.h[0]\n" "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n" "fmla z16.h, z12.h, z0.h[1]\n" @@ -2897,21 +2911,21 @@ void sve_native_fp16_mla_4VLx4(const __fp16 *A, int lda, const __fp16 *B, int ld "fmla z31.h, z15.h, z7.h[7]\n" "b.ne 2b\n" "1:\n" - "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n" "cbz %[regs], 3f\n" "fmla z16.h, z8.h, z0.h[0]\n" - "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" + "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n" "fmla z20.h, z8.h, z1.h[0]\n" - "ld1rqh z5.h, p7/z, [a_ptr1]\n" + "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" "fmla z24.h, z8.h, z2.h[0]\n" - "ld1rqh z6.h, p7/z, [a_ptr2]\n" + "ld1rqh z5.h, p7/z, [a_ptr1]\n" "fmla z28.h, z8.h, z3.h[0]\n" - "ld1rqh z7.h, p7/z, [a_ptr3]\n" + "ld1rqh z6.h, p7/z, [a_ptr2]\n" "fmla z17.h, z9.h, z0.h[0]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1rqh z7.h, p7/z, [a_ptr3]\n" "fmla z21.h, z9.h, z1.h[0]\n" - "ld1h z8.h, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "fmla z25.h, z9.h, z2.h[0]\n" + "ld1h z8.h, p0/z, 
[%[b_ptr0]]\n" "fmla z29.h, z9.h, z3.h[0]\n" "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n" "fmla z18.h, z10.h, z0.h[0]\n" @@ -3400,30 +3414,31 @@ void sve_native_fp16_mla_4VLx4(const __fp16 *A, int lda, const __fp16 *B, int ld "b 4f\n" "3:\n" "fmla z16.h, z8.h, z0.h[0]\n" - "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n" + "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n" "fmla z20.h, z8.h, z1.h[0]\n" - "ld1rqh z5.h, p6/z, [a_ptr1]\n" + "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n" "fmla z24.h, z8.h, z2.h[0]\n" - "ld1rqh z6.h, p6/z, [a_ptr2]\n" + "ld1rqh z5.h, p6/z, [a_ptr1]\n" "fmla z28.h, z8.h, z3.h[0]\n" - "ld1rqh z7.h, p6/z, [a_ptr3]\n" + "ld1rqh z6.h, p6/z, [a_ptr2]\n" "fmla z17.h, z9.h, z0.h[0]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1rqh z7.h, p6/z, [a_ptr3]\n" "fmla z21.h, z9.h, z1.h[0]\n" - "ld1h z8.h, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "fmla z25.h, z9.h, z2.h[0]\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" + "ld1h z8.h, p0/z, [%[b_ptr0]]\n" "fmla z29.h, z9.h, z3.h[0]\n" "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n" "fmla z18.h, z10.h, z0.h[0]\n" - "addvl a_ptr1, a_ptr1, #1\n" + "addvl %[a_ptr0], %[a_ptr0], #1\n" "fmla z22.h, z10.h, z1.h[0]\n" - "addvl a_ptr2, a_ptr2, #1\n" + "addvl a_ptr1, a_ptr1, #1\n" "fmla z26.h, z10.h, z2.h[0]\n" - "addvl a_ptr3, a_ptr3, #1\n" + "addvl a_ptr2, a_ptr2, #1\n" "fmla z30.h, z10.h, z3.h[0]\n" "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n" "fmla z19.h, z11.h, z0.h[0]\n" + "addvl a_ptr3, a_ptr3, #1\n" "fmla z23.h, z11.h, z1.h[0]\n" "fmla z27.h, z11.h, z2.h[0]\n" "fmla z31.h, z11.h, z3.h[0]\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4.hpp index 19e5fbd974..0abde56af1 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -25,6 +25,9 @@ #ifdef __ARM_FEATURE_SVE + + + namespace arm_gemm { @@ -75,7 +78,10 @@ public: // Default to the generic kernel kern_type kernel=sve_native_fp32_mla_4VLx4; - native_fp32_mla_4VLx4(const CPUInfo *ci) { UNUSED(ci); } + native_fp32_mla_4VLx4(const CPUInfo *) + { + + } }; } // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4/generic.cpp index 3fc0e5fa36..b05906e199 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -60,12 +60,23 @@ void sve_native_fp32_mla_4VLx4(const float *A, int lda, const float *B, int ldb, break; } - for (int y=0; y<M; y+=4) { + int rows_to_compute; + + for (int y=0; y<M; y+=rows_to_compute) { const float * const a_ptr0_base = A + (y * lda); const unsigned long ldab = lda * sizeof(float); float *c_ptr0 = C + (y * ldc); + rows_to_compute = M-y; + if (rows_to_compute > 4) { + if (rows_to_compute % 4) { + rows_to_compute = 4 - 1; + } else { + rows_to_compute = 4; + } + } + for (int x0=0; x0<N; x0+=(4 * get_vector_length<float>())) { const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<float>())); long loops = loops_count; @@ -78,7 +89,7 @@ void sve_native_fp32_mla_4VLx4(const float *A, int lda, const float *B, int ldb, const unsigned long ldcb = ldc * sizeof(float); const float *biasptr = bias ? bias+x0 : nullbias; - switch(M-y) { + switch(rows_to_compute) { case 1: __asm __volatile ( "whilelt p6.s, %[temp], %[leftovers]\n" @@ -184,52 +195,51 @@ void sve_native_fp32_mla_4VLx4(const float *A, int lda, const float *B, int ldb, "ld1w z12.s, p0/z, [%[b_ptr0]]\n" "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" "cbz %[regs], 3f\n" "fmla z16.s, z8.s, z0.s[0]\n" - "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" "fmla z17.s, z9.s, z0.s[0]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" "fmla z18.s, z10.s, z0.s[0]\n" - "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "fmla z19.s, z11.s, z0.s[0]\n" - "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" "fmla z16.s, z12.s, z0.s[1]\n" - "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" "fmla z17.s, z13.s, z0.s[1]\n" - "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" "fmla z18.s, z14.s, z0.s[1]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" "fmla z19.s, z15.s, z0.s[1]\n" - "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "fmla z16.s, z8.s, z0.s[2]\n" - "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" "fmla z17.s, z9.s, z0.s[2]\n" - "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" "fmla z18.s, z10.s, z0.s[2]\n" - "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" "fmla z19.s, z11.s, z0.s[2]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" "fmla z16.s, z12.s, z0.s[3]\n" - "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "fmla z17.s, z13.s, z0.s[3]\n" - "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" "fmla z18.s, z14.s, z0.s[3]\n" - "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" "fmla z19.s, z15.s, z0.s[3]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.s, z8.s, z4.s[0]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" - "fmla z17.s, z9.s, z4.s[0]\n" "add %[b_ptr0], %[b_ptr0], %[ldb]\n" - "fmla z18.s, z10.s, z4.s[0]\n" + "fmla z16.s, z8.s, z4.s[0]\n" "ld1w z12.s, p0/z, [%[b_ptr0]]\n" - "fmla z19.s, z11.s, z4.s[0]\n" + "fmla z17.s, z9.s, z4.s[0]\n" "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "fmla z18.s, z10.s, 
z4.s[0]\n" "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - "fmla z16.s, z12.s, z4.s[1]\n" + "fmla z19.s, z11.s, z4.s[0]\n" "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z16.s, z12.s, z4.s[1]\n" + "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" "fmla z17.s, z13.s, z4.s[1]\n" "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "fmla z18.s, z14.s, z4.s[1]\n" @@ -237,15 +247,16 @@ void sve_native_fp32_mla_4VLx4(const float *A, int lda, const float *B, int ldb, "fmla z19.s, z15.s, z4.s[1]\n" "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "addvl %[a_ptr0], %[a_ptr0], #2\n" + "fmla z16.s, z8.s, z4.s[2]\n" "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "fmla z17.s, z9.s, z4.s[2]\n" "add %[b_ptr0], %[b_ptr0], %[ldb]\n" - "fmla z16.s, z8.s, z4.s[2]\n" + "fmla z18.s, z10.s, z4.s[2]\n" "ld1w z12.s, p0/z, [%[b_ptr0]]\n" - "fmla z17.s, z9.s, z4.s[2]\n" + "fmla z19.s, z11.s, z4.s[2]\n" "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.s, z10.s, z4.s[2]\n" "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.s, z11.s, z4.s[2]\n" "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" "fmla z16.s, z12.s, z4.s[3]\n" "fmla z17.s, z13.s, z4.s[3]\n" @@ -286,30 +297,31 @@ void sve_native_fp32_mla_4VLx4(const float *A, int lda, const float *B, int ldb, "b 4f\n" "3:\n" "fmla z16.s, z8.s, z0.s[0]\n" - "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" "fmla z17.s, z9.s, z0.s[0]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n" "fmla z18.s, z10.s, z0.s[0]\n" - "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "fmla z19.s, z11.s, z0.s[0]\n" - "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" "fmla z16.s, z12.s, z0.s[1]\n" - "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" "fmla z17.s, z13.s, z0.s[1]\n" - "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" "fmla z18.s, z14.s, z0.s[1]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" "fmla z19.s, z15.s, z0.s[1]\n" - "ld1w z12.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "fmla z16.s, z8.s, z0.s[2]\n" - "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z12.s, p0/z, [%[b_ptr0]]\n" "fmla z17.s, z9.s, z0.s[2]\n" - "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" "fmla z18.s, z10.s, z0.s[2]\n" - "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" + "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" "fmla z19.s, z11.s, z0.s[2]\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" "fmla z16.s, z12.s, z0.s[3]\n" + "addvl %[a_ptr0], %[a_ptr0], #1\n" "fmla z17.s, z13.s, z0.s[3]\n" "fmla z18.s, z14.s, z0.s[3]\n" "fmla z19.s, z15.s, z0.s[3]\n" @@ -516,21 +528,21 @@ void sve_native_fp32_mla_4VLx4(const float *A, int lda, const float *B, int ldb, "fmla z23.s, z15.s, z5.s[3]\n" "b.ne 2b\n" "1:\n" - "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" "cbz %[regs], 3f\n" "fmla z16.s, z8.s, z0.s[0]\n" - "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" "fmla z20.s, z8.s, z1.s[0]\n" - "ld1rqw z5.s, p7/z, [a_ptr1]\n" + "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" "fmla z17.s, z9.s, z0.s[0]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1rqw z5.s, p7/z, [a_ptr1]\n" "fmla z21.s, z9.s, z1.s[0]\n" - "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "fmla 
z18.s, z10.s, z0.s[0]\n" - "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" "fmla z22.s, z10.s, z1.s[0]\n" - "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" "fmla z19.s, z11.s, z0.s[0]\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" "fmla z23.s, z11.s, z1.s[0]\n" "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" "fmla z16.s, z12.s, z0.s[1]\n" @@ -665,19 +677,19 @@ void sve_native_fp32_mla_4VLx4(const float *A, int lda, const float *B, int ldb, "b 4f\n" "3:\n" "fmla z16.s, z8.s, z0.s[0]\n" - "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" "fmla z20.s, z8.s, z1.s[0]\n" - "ld1rqw z5.s, p6/z, [a_ptr1]\n" + "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n" "fmla z17.s, z9.s, z0.s[0]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1rqw z5.s, p6/z, [a_ptr1]\n" "fmla z21.s, z9.s, z1.s[0]\n" - "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "fmla z18.s, z10.s, z0.s[0]\n" - "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" "fmla z22.s, z10.s, z1.s[0]\n" - "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" "fmla z19.s, z11.s, z0.s[0]\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" + "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" "fmla z23.s, z11.s, z1.s[0]\n" "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" "fmla z16.s, z12.s, z0.s[1]\n" @@ -685,10 +697,11 @@ void sve_native_fp32_mla_4VLx4(const float *A, int lda, const float *B, int ldb, "fmla z20.s, z12.s, z1.s[1]\n" "ld1w z12.s, p0/z, [%[b_ptr0]]\n" "fmla z17.s, z13.s, z0.s[1]\n" - "addvl a_ptr1, a_ptr1, #1\n" + "addvl %[a_ptr0], %[a_ptr0], #1\n" "fmla z21.s, z13.s, z1.s[1]\n" "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" "fmla z18.s, z14.s, z0.s[1]\n" + "addvl a_ptr1, a_ptr1, #1\n" "fmla z22.s, z14.s, z1.s[1]\n" "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" "fmla z19.s, z15.s, z0.s[1]\n" @@ -861,9 +874,9 @@ void sve_native_fp32_mla_4VLx4(const float *A, int lda, const float *B, int ldb, "fmla z27.s, z11.s, z2.s[0]\n" "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" "fmla z16.s, z12.s, z0.s[1]\n" - "add a_ptr2, a_ptr2, #0x20\n" - "fmla z20.s, z12.s, z1.s[1]\n" "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "fmla z20.s, z12.s, z1.s[1]\n" + "add a_ptr2, a_ptr2, #0x20\n" "fmla z24.s, z12.s, z2.s[1]\n" "ld1w z12.s, p0/z, [%[b_ptr0]]\n" "fmla z17.s, z13.s, z0.s[1]\n" @@ -984,21 +997,21 @@ void sve_native_fp32_mla_4VLx4(const float *A, int lda, const float *B, int ldb, "fmla z27.s, z15.s, z6.s[3]\n" "b.ne 2b\n" "1:\n" - "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" "cbz %[regs], 3f\n" "fmla z16.s, z8.s, z0.s[0]\n" - "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" "fmla z20.s, z8.s, z1.s[0]\n" - "ld1rqw z5.s, p7/z, [a_ptr1]\n" + "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" "fmla z24.s, z8.s, z2.s[0]\n" - "ld1rqw z6.s, p7/z, [a_ptr2]\n" + "ld1rqw z5.s, p7/z, [a_ptr1]\n" "fmla z17.s, z9.s, z0.s[0]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1rqw z6.s, p7/z, [a_ptr2]\n" "fmla z21.s, z9.s, z1.s[0]\n" - "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "fmla z25.s, z9.s, z2.s[0]\n" - "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" "fmla z18.s, z10.s, z0.s[0]\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" "fmla z22.s, z10.s, z1.s[0]\n" "fmla z26.s, z10.s, z2.s[0]\n" "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" @@ -1180,26 +1193,27 @@ void sve_native_fp32_mla_4VLx4(const float *A, int lda, const float *B, int 
ldb, "b 4f\n" "3:\n" "fmla z16.s, z8.s, z0.s[0]\n" - "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" "fmla z20.s, z8.s, z1.s[0]\n" - "ld1rqw z5.s, p6/z, [a_ptr1]\n" + "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n" "fmla z24.s, z8.s, z2.s[0]\n" - "ld1rqw z6.s, p6/z, [a_ptr2]\n" + "ld1rqw z5.s, p6/z, [a_ptr1]\n" "fmla z17.s, z9.s, z0.s[0]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1rqw z6.s, p6/z, [a_ptr2]\n" "fmla z21.s, z9.s, z1.s[0]\n" - "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "fmla z25.s, z9.s, z2.s[0]\n" - "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" "fmla z18.s, z10.s, z0.s[0]\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" + "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" "fmla z22.s, z10.s, z1.s[0]\n" - "addvl a_ptr1, a_ptr1, #1\n" + "addvl %[a_ptr0], %[a_ptr0], #1\n" "fmla z26.s, z10.s, z2.s[0]\n" "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" "fmla z19.s, z11.s, z0.s[0]\n" - "addvl a_ptr2, a_ptr2, #1\n" + "addvl a_ptr1, a_ptr1, #1\n" "fmla z23.s, z11.s, z1.s[0]\n" + "addvl a_ptr2, a_ptr2, #1\n" "fmla z27.s, z11.s, z2.s[0]\n" "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" "fmla z16.s, z12.s, z0.s[1]\n" @@ -1589,21 +1603,21 @@ void sve_native_fp32_mla_4VLx4(const float *A, int lda, const float *B, int ldb, "fmla z31.s, z15.s, z7.s[3]\n" "b.ne 2b\n" "1:\n" - "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" "cbz %[regs], 3f\n" "fmla z16.s, z8.s, z0.s[0]\n" - "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" "fmla z20.s, z8.s, z1.s[0]\n" - "ld1rqw z5.s, p7/z, [a_ptr1]\n" + "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" "fmla z24.s, z8.s, z2.s[0]\n" - "ld1rqw z6.s, p7/z, [a_ptr2]\n" + "ld1rqw z5.s, p7/z, [a_ptr1]\n" "fmla z28.s, z8.s, z3.s[0]\n" - "ld1rqw z7.s, p7/z, [a_ptr3]\n" + "ld1rqw z6.s, p7/z, [a_ptr2]\n" "fmla z17.s, z9.s, z0.s[0]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1rqw z7.s, p7/z, [a_ptr3]\n" "fmla z21.s, z9.s, z1.s[0]\n" - "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "fmla z25.s, z9.s, z2.s[0]\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" "fmla z29.s, z9.s, z3.s[0]\n" "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" "fmla z18.s, z10.s, z0.s[0]\n" @@ -1832,30 +1846,31 @@ void sve_native_fp32_mla_4VLx4(const float *A, int lda, const float *B, int ldb, "b 4f\n" "3:\n" "fmla z16.s, z8.s, z0.s[0]\n" - "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n" + "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n" "fmla z20.s, z8.s, z1.s[0]\n" - "ld1rqw z5.s, p6/z, [a_ptr1]\n" + "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n" "fmla z24.s, z8.s, z2.s[0]\n" - "ld1rqw z6.s, p6/z, [a_ptr2]\n" + "ld1rqw z5.s, p6/z, [a_ptr1]\n" "fmla z28.s, z8.s, z3.s[0]\n" - "ld1rqw z7.s, p6/z, [a_ptr3]\n" + "ld1rqw z6.s, p6/z, [a_ptr2]\n" "fmla z17.s, z9.s, z0.s[0]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1rqw z7.s, p6/z, [a_ptr3]\n" "fmla z21.s, z9.s, z1.s[0]\n" - "ld1w z8.s, p0/z, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "fmla z25.s, z9.s, z2.s[0]\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" + "ld1w z8.s, p0/z, [%[b_ptr0]]\n" "fmla z29.s, z9.s, z3.s[0]\n" "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n" "fmla z18.s, z10.s, z0.s[0]\n" - "addvl a_ptr1, a_ptr1, #1\n" + "addvl %[a_ptr0], %[a_ptr0], #1\n" "fmla z22.s, z10.s, z1.s[0]\n" - "addvl a_ptr2, a_ptr2, #1\n" + "addvl a_ptr1, a_ptr1, #1\n" "fmla z26.s, z10.s, z2.s[0]\n" - "addvl a_ptr3, a_ptr3, #1\n" + "addvl a_ptr2, a_ptr2, #1\n" "fmla z30.s, z10.s, z3.s[0]\n" "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n" "fmla z19.s, z11.s, z0.s[0]\n" + "addvl a_ptr3, 
a_ptr3, #1\n" "fmla z23.s, z11.s, z1.s[0]\n" "fmla z27.s, z11.s, z2.s[0]\n" "fmla z31.s, z11.s, z3.s[0]\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4.hpp index 1b9d1312b5..40a69b54ff 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -78,7 +78,10 @@ public: // Default to the generic kernel kern_type kernel=sve_native_s8s32_dot_4VLx4; - native_s8s32_dot_4VLx4(const CPUInfo *ci) { UNUSED(ci); } + native_s8s32_dot_4VLx4(const CPUInfo *) + { + + } }; } // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4/generic.cpp index 26736f597a..7c5d4dc280 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4/generic.cpp @@ -32,7 +32,7 @@ namespace arm_gemm { -void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int ldb, int32_t *C, int ldc, int M, int N, int K, const int32_t *, Activation, bool append) { +void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int ldb, int32_t *C, int ldc, int M, int N, int K, const int32_t *, Activation , bool append) { const long loops_count = ((K + 16) / 32) - 1; K -= loops_count * 32; const long regs_count = (K / 16) - 1; @@ -41,12 +41,23 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l const long blocks_count = K / 4; const long odds_count = K - (blocks_count * 4); - for (int y=0; y<M; y+=4) { + int rows_to_compute; + + for (int y=0; y<M; y+=rows_to_compute) { const int8_t * const a_ptr0_base = A + (y * lda); const unsigned long ldab = lda * sizeof(int8_t); int32_t *c_ptr0 = C + (y * ldc); + rows_to_compute = M-y; + if (rows_to_compute > 4) { + if (rows_to_compute % 4) { + rows_to_compute = 4 - 1; + } else { + rows_to_compute = 4; + } + } + for (int x0=0; x0<N; x0+=(4 * get_vector_length<int32_t>())) { const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<int32_t>())); long loops = loops_count; @@ -62,7 +73,7 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l long ldbb = ldb * sizeof(int8_t) * 4; const unsigned long ldcb = ldc * sizeof(int32_t); - switch(M-y) { + switch(rows_to_compute) { case 1: __asm __volatile ( "mov z16.s, #0\n" @@ -270,22 +281,22 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l "ld1b z12.b, p4/z, [%[b_ptr0]]\n" "zip2 z15.b, z12.b, z13.b\n" "zip1 z13.b, z12.b, z13.b\n" - "ld1b z12.b, p4/z, [%[b_ptr3]]\n" "cbz %[regs], 3f\n" "sdot z16.s, z8.b, z0.b[0]\n" - "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" "sdot z17.s, z9.b, z0.b[0]\n" + "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" + "sdot z18.s, z10.b, z0.b[0]\n" "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "zip2 z8.b, z14.b, z12.b\n" "add %[b_ptr2], %[b_ptr2], %[ldb]\n" "zip1 z14.b, z14.b, z12.b\n" "ld1b z9.b, p4/z, [%[b_ptr2]]\n" - "sdot z18.s, z10.b, z0.b[0]\n" - "add %[b_ptr1], %[b_ptr1], %[ldb]\n" "sdot z19.s, z11.b, z0.b[0]\n" - "ld1b z10.b, p4/z, [%[b_ptr1]]\n" - "zip1 z12.b, z13.b, z14.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" "add %[b_ptr3], 
%[b_ptr3], %[ldb]\n" + "zip1 z12.b, z13.b, z14.b\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" "zip2 z13.b, z13.b, z14.b\n" "add %[b_ptr2], %[b_ptr2], %[ldb]\n" "zip1 z14.b, z15.b, z8.b\n" @@ -635,33 +646,34 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l "b 7f\n" "3:\n" "sdot z16.s, z8.b, z0.b[0]\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "sdot z17.s, z9.b, z0.b[0]\n" "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n" - "zip2 z8.b, z14.b, z12.b\n" + "sdot z18.s, z10.b, z0.b[0]\n" "add %[b_ptr0], %[b_ptr0], %[ldb]\n" - "zip1 z14.b, z14.b, z12.b\n" + "zip2 z8.b, z14.b, z12.b\n" "add %[b_ptr2], %[b_ptr2], %[ldb]\n" - "sdot z17.s, z9.b, z0.b[0]\n" + "zip1 z14.b, z14.b, z12.b\n" "ld1b z9.b, p4/z, [%[b_ptr2]]\n" - "sdot z18.s, z10.b, z0.b[0]\n" + "sdot z19.s, z11.b, z0.b[0]\n" "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" "zip1 z12.b, z13.b, z14.b\n" "ld1b z10.b, p4/z, [%[b_ptr1]]\n" "zip2 z13.b, z13.b, z14.b\n" - "add %[b_ptr3], %[b_ptr3], %[ldb]\n" - "zip1 z14.b, z15.b, z8.b\n" "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z14.b, z15.b, z8.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" "zip2 z15.b, z15.b, z8.b\n" "ld1b z8.b, p4/z, [%[b_ptr0]]\n" - "sdot z19.s, z11.b, z0.b[0]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "sdot z16.s, z12.b, z0.b[1]\n" - "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "sdot z17.s, z13.b, z0.b[1]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" "zip2 z11.b, z8.b, z9.b\n" "addvl %[a_ptr0], %[a_ptr0], #1\n" "zip1 z9.b, z8.b, z9.b\n" "ld1b z8.b, p4/z, [%[b_ptr3]]\n" - "sdot z17.s, z13.b, z0.b[1]\n" - "ld1b z13.b, p4/z, [%[b_ptr2]]\n" "sdot z18.s, z14.b, z0.b[1]\n" "ld1b z14.b, p4/z, [%[b_ptr1]]\n" "sdot z19.s, z15.b, z0.b[1]\n" @@ -998,11 +1010,11 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l "sdot z21.s, z9.b, z1.b[0]\n" "ld1b z9.b, p4/z, [%[b_ptr2]]\n" "sdot z18.s, z10.b, z0.b[0]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" "sdot z22.s, z10.b, z1.b[0]\n" "ld1b z10.b, p4/z, [%[b_ptr1]]\n" "sdot z19.s, z11.b, z0.b[0]\n" - "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "sdot z23.s, z11.b, z1.b[0]\n" "add %[b_ptr1], %[b_ptr1], %[ldb]\n" "zip2 z11.b, z8.b, z9.b\n" @@ -1176,34 +1188,34 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l "1:\n" "zip2 z15.b, z12.b, z13.b\n" "zip1 z13.b, z12.b, z13.b\n" - "ld1b z12.b, p4/z, [%[b_ptr3]]\n" "cbz %[regs], 3f\n" "sdot z16.s, z8.b, z0.b[0]\n" - "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" "sdot z20.s, z8.b, z1.b[0]\n" + "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" + "sdot z17.s, z9.b, z0.b[0]\n" "ld1rqb z5.b, p7/z, [a_ptr1]\n" - "zip2 z8.b, z14.b, z12.b\n" + "sdot z21.s, z9.b, z1.b[0]\n" "add %[b_ptr0], %[b_ptr0], %[ldb]\n" - "zip1 z14.b, z14.b, z12.b\n" + "zip2 z8.b, z14.b, z12.b\n" "add %[b_ptr2], %[b_ptr2], %[ldb]\n" - "sdot z17.s, z9.b, z0.b[0]\n" + "zip1 z14.b, z14.b, z12.b\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "sdot z18.s, z10.b, z0.b[0]\n" "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "sdot z22.s, z10.b, z1.b[0]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" "zip1 z12.b, z13.b, z14.b\n" "add %[b_ptr3], %[b_ptr3], %[ldb]\n" "zip2 z13.b, z13.b, z14.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" "zip1 z14.b, z15.b, z8.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" "zip2 z15.b, z15.b, z8.b\n" "ld1b z8.b, p4/z, [%[b_ptr0]]\n" - "sdot z21.s, z9.b, z1.b[0]\n" - "ld1b z9.b, p4/z, [%[b_ptr2]]\n" - "sdot z18.s, z10.b, z0.b[0]\n" - "add %[b_ptr0], 
%[b_ptr0], %[ldb]\n" - "sdot z22.s, z10.b, z1.b[0]\n" - "ld1b z10.b, p4/z, [%[b_ptr1]]\n" "sdot z19.s, z11.b, z0.b[0]\n" - "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "sdot z23.s, z11.b, z1.b[0]\n" - "add %[b_ptr1], %[b_ptr1], %[ldb]\n" "zip2 z11.b, z8.b, z9.b\n" "zip1 z9.b, z8.b, z9.b\n" "ld1b z8.b, p4/z, [%[b_ptr3]]\n" @@ -1604,34 +1616,35 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l "b 7f\n" "3:\n" "sdot z16.s, z8.b, z0.b[0]\n" - "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" "sdot z20.s, z8.b, z1.b[0]\n" + "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n" + "sdot z17.s, z9.b, z0.b[0]\n" "ld1rqb z5.b, p6/z, [a_ptr1]\n" - "zip2 z8.b, z14.b, z12.b\n" + "sdot z21.s, z9.b, z1.b[0]\n" "add %[b_ptr0], %[b_ptr0], %[ldb]\n" - "zip1 z14.b, z14.b, z12.b\n" + "zip2 z8.b, z14.b, z12.b\n" "add %[b_ptr2], %[b_ptr2], %[ldb]\n" - "sdot z17.s, z9.b, z0.b[0]\n" + "zip1 z14.b, z14.b, z12.b\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "sdot z18.s, z10.b, z0.b[0]\n" "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "sdot z22.s, z10.b, z1.b[0]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" "zip1 z12.b, z13.b, z14.b\n" "add %[b_ptr3], %[b_ptr3], %[ldb]\n" "zip2 z13.b, z13.b, z14.b\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" "zip1 z14.b, z15.b, z8.b\n" - "addvl a_ptr1, a_ptr1, #1\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" "zip2 z15.b, z15.b, z8.b\n" "ld1b z8.b, p4/z, [%[b_ptr0]]\n" - "sdot z21.s, z9.b, z1.b[0]\n" - "ld1b z9.b, p4/z, [%[b_ptr2]]\n" - "sdot z18.s, z10.b, z0.b[0]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" - "sdot z22.s, z10.b, z1.b[0]\n" - "ld1b z10.b, p4/z, [%[b_ptr1]]\n" "sdot z19.s, z11.b, z0.b[0]\n" - "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "sdot z23.s, z11.b, z1.b[0]\n" - "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "addvl %[a_ptr0], %[a_ptr0], #1\n" "zip2 z11.b, z8.b, z9.b\n" + "addvl a_ptr1, a_ptr1, #1\n" "zip1 z9.b, z8.b, z9.b\n" "ld1b z8.b, p4/z, [%[b_ptr3]]\n" "sdot z16.s, z12.b, z0.b[1]\n" @@ -2242,19 +2255,20 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l "1:\n" "zip2 z15.b, z12.b, z13.b\n" "zip1 z13.b, z12.b, z13.b\n" - "ld1b z12.b, p4/z, [%[b_ptr3]]\n" "cbz %[regs], 3f\n" "sdot z16.s, z8.b, z0.b[0]\n" - "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" "sdot z20.s, z8.b, z1.b[0]\n" - "ld1rqb z5.b, p7/z, [a_ptr1]\n" + "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" "sdot z24.s, z8.b, z2.b[0]\n" + "ld1rqb z5.b, p7/z, [a_ptr1]\n" + "sdot z17.s, z9.b, z0.b[0]\n" "ld1rqb z6.b, p7/z, [a_ptr2]\n" "zip2 z8.b, z14.b, z12.b\n" "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "zip1 z14.b, z14.b, z12.b\n" "add %[b_ptr2], %[b_ptr2], %[ldb]\n" - "sdot z17.s, z9.b, z0.b[0]\n" + "sdot z21.s, z9.b, z1.b[0]\n" "add %[b_ptr1], %[b_ptr1], %[ldb]\n" "zip1 z12.b, z13.b, z14.b\n" "add %[b_ptr3], %[b_ptr3], %[ldb]\n" @@ -2262,13 +2276,12 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l "zip1 z14.b, z15.b, z8.b\n" "zip2 z15.b, z15.b, z8.b\n" "ld1b z8.b, p4/z, [%[b_ptr0]]\n" - "sdot z21.s, z9.b, z1.b[0]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "sdot z25.s, z9.b, z2.b[0]\n" "ld1b z9.b, p4/z, [%[b_ptr2]]\n" "sdot z18.s, z10.b, z0.b[0]\n" - "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "sdot z22.s, z10.b, z1.b[0]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" "sdot z26.s, z10.b, z2.b[0]\n" "ld1b z10.b, p4/z, [%[b_ptr1]]\n" "sdot z19.s, z11.b, z0.b[0]\n" @@ -2733,16 +2746,18 @@ void 
sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l "b 7f\n" "3:\n" "sdot z16.s, z8.b, z0.b[0]\n" - "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" "sdot z20.s, z8.b, z1.b[0]\n" - "ld1rqb z5.b, p6/z, [a_ptr1]\n" + "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n" "sdot z24.s, z8.b, z2.b[0]\n" + "ld1rqb z5.b, p6/z, [a_ptr1]\n" + "sdot z17.s, z9.b, z0.b[0]\n" "ld1rqb z6.b, p6/z, [a_ptr2]\n" "zip2 z8.b, z14.b, z12.b\n" "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "zip1 z14.b, z14.b, z12.b\n" "add %[b_ptr2], %[b_ptr2], %[ldb]\n" - "sdot z17.s, z9.b, z0.b[0]\n" + "sdot z21.s, z9.b, z1.b[0]\n" "add %[b_ptr1], %[b_ptr1], %[ldb]\n" "zip1 z12.b, z13.b, z14.b\n" "add %[b_ptr3], %[b_ptr3], %[ldb]\n" @@ -2752,19 +2767,18 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l "addvl a_ptr1, a_ptr1, #1\n" "zip2 z15.b, z15.b, z8.b\n" "ld1b z8.b, p4/z, [%[b_ptr0]]\n" - "sdot z21.s, z9.b, z1.b[0]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "sdot z25.s, z9.b, z2.b[0]\n" "ld1b z9.b, p4/z, [%[b_ptr2]]\n" "sdot z18.s, z10.b, z0.b[0]\n" - "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "sdot z22.s, z10.b, z1.b[0]\n" - "addvl a_ptr2, a_ptr2, #1\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" "sdot z26.s, z10.b, z2.b[0]\n" "ld1b z10.b, p4/z, [%[b_ptr1]]\n" "sdot z19.s, z11.b, z0.b[0]\n" "add %[b_ptr1], %[b_ptr1], %[ldb]\n" "sdot z23.s, z11.b, z1.b[0]\n" + "addvl a_ptr2, a_ptr2, #1\n" "sdot z27.s, z11.b, z2.b[0]\n" "zip2 z11.b, z8.b, z9.b\n" "zip1 z9.b, z8.b, z9.b\n" @@ -3469,25 +3483,25 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l "1:\n" "zip2 z15.b, z12.b, z13.b\n" "zip1 z13.b, z12.b, z13.b\n" - "ld1b z12.b, p4/z, [%[b_ptr3]]\n" "cbz %[regs], 3f\n" "sdot z16.s, z8.b, z0.b[0]\n" - "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" "sdot z20.s, z8.b, z1.b[0]\n" - "ld1rqb z5.b, p7/z, [a_ptr1]\n" + "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" "sdot z24.s, z8.b, z2.b[0]\n" - "ld1rqb z6.b, p7/z, [a_ptr2]\n" + "ld1rqb z5.b, p7/z, [a_ptr1]\n" "sdot z28.s, z8.b, z3.b[0]\n" - "ld1rqb z7.b, p7/z, [a_ptr3]\n" + "ld1rqb z6.b, p7/z, [a_ptr2]\n" "zip2 z8.b, z14.b, z12.b\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1rqb z7.b, p7/z, [a_ptr3]\n" "zip1 z14.b, z14.b, z12.b\n" - "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "sdot z17.s, z9.b, z0.b[0]\n" - "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" "zip1 z12.b, z13.b, z14.b\n" - "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" "zip2 z13.b, z13.b, z14.b\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" "zip1 z14.b, z15.b, z8.b\n" "zip2 z15.b, z15.b, z8.b\n" "ld1b z8.b, p4/z, [%[b_ptr0]]\n" @@ -4023,38 +4037,39 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l "b 7f\n" "3:\n" "sdot z16.s, z8.b, z0.b[0]\n" - "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" "sdot z20.s, z8.b, z1.b[0]\n" - "ld1rqb z5.b, p6/z, [a_ptr1]\n" + "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n" "sdot z24.s, z8.b, z2.b[0]\n" - "ld1rqb z6.b, p6/z, [a_ptr2]\n" + "ld1rqb z5.b, p6/z, [a_ptr1]\n" "sdot z28.s, z8.b, z3.b[0]\n" - "ld1rqb z7.b, p6/z, [a_ptr3]\n" + "ld1rqb z6.b, p6/z, [a_ptr2]\n" "zip2 z8.b, z14.b, z12.b\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1rqb z7.b, p6/z, [a_ptr3]\n" "zip1 z14.b, z14.b, z12.b\n" - "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "sdot z17.s, z9.b, z0.b[0]\n" - "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add 
%[b_ptr2], %[b_ptr2], %[ldb]\n" "zip1 z12.b, z13.b, z14.b\n" - "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" "zip2 z13.b, z13.b, z14.b\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" "zip1 z14.b, z15.b, z8.b\n" - "addvl a_ptr1, a_ptr1, #1\n" + "addvl %[a_ptr0], %[a_ptr0], #1\n" "zip2 z15.b, z15.b, z8.b\n" "ld1b z8.b, p4/z, [%[b_ptr0]]\n" "sdot z21.s, z9.b, z1.b[0]\n" "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "sdot z25.s, z9.b, z2.b[0]\n" - "addvl a_ptr2, a_ptr2, #1\n" + "addvl a_ptr1, a_ptr1, #1\n" "sdot z29.s, z9.b, z3.b[0]\n" "ld1b z9.b, p4/z, [%[b_ptr2]]\n" "sdot z18.s, z10.b, z0.b[0]\n" "add %[b_ptr2], %[b_ptr2], %[ldb]\n" "sdot z22.s, z10.b, z1.b[0]\n" - "addvl a_ptr3, a_ptr3, #1\n" + "addvl a_ptr2, a_ptr2, #1\n" "sdot z26.s, z10.b, z2.b[0]\n" + "addvl a_ptr3, a_ptr3, #1\n" "sdot z30.s, z10.b, z3.b[0]\n" "ld1b z10.b, p4/z, [%[b_ptr1]]\n" "sdot z19.s, z11.b, z0.b[0]\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4.hpp index 33e3ac6c23..043fa7484a 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -78,7 +78,10 @@ public: // Default to the generic kernel kern_type kernel=sve_native_u8u32_dot_4VLx4; - native_u8u32_dot_4VLx4(const CPUInfo *ci) { UNUSED(ci); } + native_u8u32_dot_4VLx4(const CPUInfo *) + { + + } }; } // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4/generic.cpp index 639ca5765c..bbc1092e4e 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4/generic.cpp @@ -32,7 +32,7 @@ namespace arm_gemm { -void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int ldb, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *, Activation, bool append) { +void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int ldb, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *, Activation , bool append) { const long loops_count = ((K + 16) / 32) - 1; K -= loops_count * 32; const long regs_count = (K / 16) - 1; @@ -41,12 +41,23 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int const long blocks_count = K / 4; const long odds_count = K - (blocks_count * 4); - for (int y=0; y<M; y+=4) { + int rows_to_compute; + + for (int y=0; y<M; y+=rows_to_compute) { const uint8_t * const a_ptr0_base = A + (y * lda); const unsigned long ldab = lda * sizeof(uint8_t); uint32_t *c_ptr0 = C + (y * ldc); + rows_to_compute = M-y; + if (rows_to_compute > 4) { + if (rows_to_compute % 4) { + rows_to_compute = 4 - 1; + } else { + rows_to_compute = 4; + } + } + for (int x0=0; x0<N; x0+=(4 * get_vector_length<uint32_t>())) { const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<uint32_t>())); long loops = loops_count; @@ -62,7 +73,7 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int long ldbb = ldb * sizeof(uint8_t) * 4; const unsigned long ldcb = ldc * sizeof(uint32_t); - switch(M-y) { + switch(rows_to_compute) { case 1: __asm __volatile ( "mov z16.s, 
#0\n" @@ -270,22 +281,22 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int "ld1b z12.b, p4/z, [%[b_ptr0]]\n" "zip2 z15.b, z12.b, z13.b\n" "zip1 z13.b, z12.b, z13.b\n" - "ld1b z12.b, p4/z, [%[b_ptr3]]\n" "cbz %[regs], 3f\n" "udot z16.s, z8.b, z0.b[0]\n" - "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" "udot z17.s, z9.b, z0.b[0]\n" + "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" + "udot z18.s, z10.b, z0.b[0]\n" "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "zip2 z8.b, z14.b, z12.b\n" "add %[b_ptr2], %[b_ptr2], %[ldb]\n" "zip1 z14.b, z14.b, z12.b\n" "ld1b z9.b, p4/z, [%[b_ptr2]]\n" - "udot z18.s, z10.b, z0.b[0]\n" - "add %[b_ptr1], %[b_ptr1], %[ldb]\n" "udot z19.s, z11.b, z0.b[0]\n" - "ld1b z10.b, p4/z, [%[b_ptr1]]\n" - "zip1 z12.b, z13.b, z14.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "zip1 z12.b, z13.b, z14.b\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" "zip2 z13.b, z13.b, z14.b\n" "add %[b_ptr2], %[b_ptr2], %[ldb]\n" "zip1 z14.b, z15.b, z8.b\n" @@ -635,33 +646,34 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int "b 7f\n" "3:\n" "udot z16.s, z8.b, z0.b[0]\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" + "udot z17.s, z9.b, z0.b[0]\n" "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n" - "zip2 z8.b, z14.b, z12.b\n" + "udot z18.s, z10.b, z0.b[0]\n" "add %[b_ptr0], %[b_ptr0], %[ldb]\n" - "zip1 z14.b, z14.b, z12.b\n" + "zip2 z8.b, z14.b, z12.b\n" "add %[b_ptr2], %[b_ptr2], %[ldb]\n" - "udot z17.s, z9.b, z0.b[0]\n" + "zip1 z14.b, z14.b, z12.b\n" "ld1b z9.b, p4/z, [%[b_ptr2]]\n" - "udot z18.s, z10.b, z0.b[0]\n" + "udot z19.s, z11.b, z0.b[0]\n" "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" "zip1 z12.b, z13.b, z14.b\n" "ld1b z10.b, p4/z, [%[b_ptr1]]\n" "zip2 z13.b, z13.b, z14.b\n" - "add %[b_ptr3], %[b_ptr3], %[ldb]\n" - "zip1 z14.b, z15.b, z8.b\n" "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "zip1 z14.b, z15.b, z8.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" "zip2 z15.b, z15.b, z8.b\n" "ld1b z8.b, p4/z, [%[b_ptr0]]\n" - "udot z19.s, z11.b, z0.b[0]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "udot z16.s, z12.b, z0.b[1]\n" - "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "udot z17.s, z13.b, z0.b[1]\n" + "ld1b z13.b, p4/z, [%[b_ptr2]]\n" "zip2 z11.b, z8.b, z9.b\n" "addvl %[a_ptr0], %[a_ptr0], #1\n" "zip1 z9.b, z8.b, z9.b\n" "ld1b z8.b, p4/z, [%[b_ptr3]]\n" - "udot z17.s, z13.b, z0.b[1]\n" - "ld1b z13.b, p4/z, [%[b_ptr2]]\n" "udot z18.s, z14.b, z0.b[1]\n" "ld1b z14.b, p4/z, [%[b_ptr1]]\n" "udot z19.s, z15.b, z0.b[1]\n" @@ -998,11 +1010,11 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int "udot z21.s, z9.b, z1.b[0]\n" "ld1b z9.b, p4/z, [%[b_ptr2]]\n" "udot z18.s, z10.b, z0.b[0]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" "udot z22.s, z10.b, z1.b[0]\n" "ld1b z10.b, p4/z, [%[b_ptr1]]\n" "udot z19.s, z11.b, z0.b[0]\n" - "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "udot z23.s, z11.b, z1.b[0]\n" "add %[b_ptr1], %[b_ptr1], %[ldb]\n" "zip2 z11.b, z8.b, z9.b\n" @@ -1176,34 +1188,34 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int "1:\n" "zip2 z15.b, z12.b, z13.b\n" "zip1 z13.b, z12.b, z13.b\n" - "ld1b z12.b, p4/z, [%[b_ptr3]]\n" "cbz %[regs], 3f\n" "udot z16.s, z8.b, z0.b[0]\n" - "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" "udot z20.s, z8.b, z1.b[0]\n" + "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" + "udot z17.s, z9.b, 
z0.b[0]\n" "ld1rqb z5.b, p7/z, [a_ptr1]\n" - "zip2 z8.b, z14.b, z12.b\n" + "udot z21.s, z9.b, z1.b[0]\n" "add %[b_ptr0], %[b_ptr0], %[ldb]\n" - "zip1 z14.b, z14.b, z12.b\n" + "zip2 z8.b, z14.b, z12.b\n" "add %[b_ptr2], %[b_ptr2], %[ldb]\n" - "udot z17.s, z9.b, z0.b[0]\n" + "zip1 z14.b, z14.b, z12.b\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "udot z18.s, z10.b, z0.b[0]\n" "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "udot z22.s, z10.b, z1.b[0]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" "zip1 z12.b, z13.b, z14.b\n" "add %[b_ptr3], %[b_ptr3], %[ldb]\n" "zip2 z13.b, z13.b, z14.b\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" "zip1 z14.b, z15.b, z8.b\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" "zip2 z15.b, z15.b, z8.b\n" "ld1b z8.b, p4/z, [%[b_ptr0]]\n" - "udot z21.s, z9.b, z1.b[0]\n" - "ld1b z9.b, p4/z, [%[b_ptr2]]\n" - "udot z18.s, z10.b, z0.b[0]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" - "udot z22.s, z10.b, z1.b[0]\n" - "ld1b z10.b, p4/z, [%[b_ptr1]]\n" "udot z19.s, z11.b, z0.b[0]\n" - "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "udot z23.s, z11.b, z1.b[0]\n" - "add %[b_ptr1], %[b_ptr1], %[ldb]\n" "zip2 z11.b, z8.b, z9.b\n" "zip1 z9.b, z8.b, z9.b\n" "ld1b z8.b, p4/z, [%[b_ptr3]]\n" @@ -1604,34 +1616,35 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int "b 7f\n" "3:\n" "udot z16.s, z8.b, z0.b[0]\n" - "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" "udot z20.s, z8.b, z1.b[0]\n" + "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n" + "udot z17.s, z9.b, z0.b[0]\n" "ld1rqb z5.b, p6/z, [a_ptr1]\n" - "zip2 z8.b, z14.b, z12.b\n" + "udot z21.s, z9.b, z1.b[0]\n" "add %[b_ptr0], %[b_ptr0], %[ldb]\n" - "zip1 z14.b, z14.b, z12.b\n" + "zip2 z8.b, z14.b, z12.b\n" "add %[b_ptr2], %[b_ptr2], %[ldb]\n" - "udot z17.s, z9.b, z0.b[0]\n" + "zip1 z14.b, z14.b, z12.b\n" + "ld1b z9.b, p4/z, [%[b_ptr2]]\n" + "udot z18.s, z10.b, z0.b[0]\n" "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "udot z22.s, z10.b, z1.b[0]\n" + "ld1b z10.b, p4/z, [%[b_ptr1]]\n" "zip1 z12.b, z13.b, z14.b\n" "add %[b_ptr3], %[b_ptr3], %[ldb]\n" "zip2 z13.b, z13.b, z14.b\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" "zip1 z14.b, z15.b, z8.b\n" - "addvl a_ptr1, a_ptr1, #1\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" "zip2 z15.b, z15.b, z8.b\n" "ld1b z8.b, p4/z, [%[b_ptr0]]\n" - "udot z21.s, z9.b, z1.b[0]\n" - "ld1b z9.b, p4/z, [%[b_ptr2]]\n" - "udot z18.s, z10.b, z0.b[0]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" - "udot z22.s, z10.b, z1.b[0]\n" - "ld1b z10.b, p4/z, [%[b_ptr1]]\n" "udot z19.s, z11.b, z0.b[0]\n" - "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "udot z23.s, z11.b, z1.b[0]\n" - "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "addvl %[a_ptr0], %[a_ptr0], #1\n" "zip2 z11.b, z8.b, z9.b\n" + "addvl a_ptr1, a_ptr1, #1\n" "zip1 z9.b, z8.b, z9.b\n" "ld1b z8.b, p4/z, [%[b_ptr3]]\n" "udot z16.s, z12.b, z0.b[1]\n" @@ -2242,19 +2255,20 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int "1:\n" "zip2 z15.b, z12.b, z13.b\n" "zip1 z13.b, z12.b, z13.b\n" - "ld1b z12.b, p4/z, [%[b_ptr3]]\n" "cbz %[regs], 3f\n" "udot z16.s, z8.b, z0.b[0]\n" - "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" "udot z20.s, z8.b, z1.b[0]\n" - "ld1rqb z5.b, p7/z, [a_ptr1]\n" + "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" "udot z24.s, z8.b, z2.b[0]\n" + "ld1rqb z5.b, p7/z, [a_ptr1]\n" + "udot z17.s, z9.b, z0.b[0]\n" "ld1rqb z6.b, p7/z, [a_ptr2]\n" "zip2 z8.b, z14.b, z12.b\n" "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "zip1 z14.b, z14.b, z12.b\n" 
"add %[b_ptr2], %[b_ptr2], %[ldb]\n" - "udot z17.s, z9.b, z0.b[0]\n" + "udot z21.s, z9.b, z1.b[0]\n" "add %[b_ptr1], %[b_ptr1], %[ldb]\n" "zip1 z12.b, z13.b, z14.b\n" "add %[b_ptr3], %[b_ptr3], %[ldb]\n" @@ -2262,13 +2276,12 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int "zip1 z14.b, z15.b, z8.b\n" "zip2 z15.b, z15.b, z8.b\n" "ld1b z8.b, p4/z, [%[b_ptr0]]\n" - "udot z21.s, z9.b, z1.b[0]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "udot z25.s, z9.b, z2.b[0]\n" "ld1b z9.b, p4/z, [%[b_ptr2]]\n" "udot z18.s, z10.b, z0.b[0]\n" - "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "udot z22.s, z10.b, z1.b[0]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" "udot z26.s, z10.b, z2.b[0]\n" "ld1b z10.b, p4/z, [%[b_ptr1]]\n" "udot z19.s, z11.b, z0.b[0]\n" @@ -2733,16 +2746,18 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int "b 7f\n" "3:\n" "udot z16.s, z8.b, z0.b[0]\n" - "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" "udot z20.s, z8.b, z1.b[0]\n" - "ld1rqb z5.b, p6/z, [a_ptr1]\n" + "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n" "udot z24.s, z8.b, z2.b[0]\n" + "ld1rqb z5.b, p6/z, [a_ptr1]\n" + "udot z17.s, z9.b, z0.b[0]\n" "ld1rqb z6.b, p6/z, [a_ptr2]\n" "zip2 z8.b, z14.b, z12.b\n" "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "zip1 z14.b, z14.b, z12.b\n" "add %[b_ptr2], %[b_ptr2], %[ldb]\n" - "udot z17.s, z9.b, z0.b[0]\n" + "udot z21.s, z9.b, z1.b[0]\n" "add %[b_ptr1], %[b_ptr1], %[ldb]\n" "zip1 z12.b, z13.b, z14.b\n" "add %[b_ptr3], %[b_ptr3], %[ldb]\n" @@ -2752,19 +2767,18 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int "addvl a_ptr1, a_ptr1, #1\n" "zip2 z15.b, z15.b, z8.b\n" "ld1b z8.b, p4/z, [%[b_ptr0]]\n" - "udot z21.s, z9.b, z1.b[0]\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "udot z25.s, z9.b, z2.b[0]\n" "ld1b z9.b, p4/z, [%[b_ptr2]]\n" "udot z18.s, z10.b, z0.b[0]\n" - "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "udot z22.s, z10.b, z1.b[0]\n" - "addvl a_ptr2, a_ptr2, #1\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" "udot z26.s, z10.b, z2.b[0]\n" "ld1b z10.b, p4/z, [%[b_ptr1]]\n" "udot z19.s, z11.b, z0.b[0]\n" "add %[b_ptr1], %[b_ptr1], %[ldb]\n" "udot z23.s, z11.b, z1.b[0]\n" + "addvl a_ptr2, a_ptr2, #1\n" "udot z27.s, z11.b, z2.b[0]\n" "zip2 z11.b, z8.b, z9.b\n" "zip1 z9.b, z8.b, z9.b\n" @@ -3469,25 +3483,25 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int "1:\n" "zip2 z15.b, z12.b, z13.b\n" "zip1 z13.b, z12.b, z13.b\n" - "ld1b z12.b, p4/z, [%[b_ptr3]]\n" "cbz %[regs], 3f\n" "udot z16.s, z8.b, z0.b[0]\n" - "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" "udot z20.s, z8.b, z1.b[0]\n" - "ld1rqb z5.b, p7/z, [a_ptr1]\n" + "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" "udot z24.s, z8.b, z2.b[0]\n" - "ld1rqb z6.b, p7/z, [a_ptr2]\n" + "ld1rqb z5.b, p7/z, [a_ptr1]\n" "udot z28.s, z8.b, z3.b[0]\n" - "ld1rqb z7.b, p7/z, [a_ptr3]\n" + "ld1rqb z6.b, p7/z, [a_ptr2]\n" "zip2 z8.b, z14.b, z12.b\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1rqb z7.b, p7/z, [a_ptr3]\n" "zip1 z14.b, z14.b, z12.b\n" - "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "udot z17.s, z9.b, z0.b[0]\n" - "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" "zip1 z12.b, z13.b, z14.b\n" - "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" "zip2 z13.b, z13.b, z14.b\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" "zip1 z14.b, z15.b, z8.b\n" "zip2 z15.b, z15.b, z8.b\n" "ld1b 
z8.b, p4/z, [%[b_ptr0]]\n" @@ -4023,38 +4037,39 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int "b 7f\n" "3:\n" "udot z16.s, z8.b, z0.b[0]\n" - "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n" + "ld1b z12.b, p4/z, [%[b_ptr3]]\n" "udot z20.s, z8.b, z1.b[0]\n" - "ld1rqb z5.b, p6/z, [a_ptr1]\n" + "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n" "udot z24.s, z8.b, z2.b[0]\n" - "ld1rqb z6.b, p6/z, [a_ptr2]\n" + "ld1rqb z5.b, p6/z, [a_ptr1]\n" "udot z28.s, z8.b, z3.b[0]\n" - "ld1rqb z7.b, p6/z, [a_ptr3]\n" + "ld1rqb z6.b, p6/z, [a_ptr2]\n" "zip2 z8.b, z14.b, z12.b\n" - "add %[b_ptr0], %[b_ptr0], %[ldb]\n" + "ld1rqb z7.b, p6/z, [a_ptr3]\n" "zip1 z14.b, z14.b, z12.b\n" - "add %[b_ptr2], %[b_ptr2], %[ldb]\n" + "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "udot z17.s, z9.b, z0.b[0]\n" - "add %[b_ptr1], %[b_ptr1], %[ldb]\n" + "add %[b_ptr2], %[b_ptr2], %[ldb]\n" "zip1 z12.b, z13.b, z14.b\n" - "add %[b_ptr3], %[b_ptr3], %[ldb]\n" + "add %[b_ptr1], %[b_ptr1], %[ldb]\n" "zip2 z13.b, z13.b, z14.b\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" + "add %[b_ptr3], %[b_ptr3], %[ldb]\n" "zip1 z14.b, z15.b, z8.b\n" - "addvl a_ptr1, a_ptr1, #1\n" + "addvl %[a_ptr0], %[a_ptr0], #1\n" "zip2 z15.b, z15.b, z8.b\n" "ld1b z8.b, p4/z, [%[b_ptr0]]\n" "udot z21.s, z9.b, z1.b[0]\n" "add %[b_ptr0], %[b_ptr0], %[ldb]\n" "udot z25.s, z9.b, z2.b[0]\n" - "addvl a_ptr2, a_ptr2, #1\n" + "addvl a_ptr1, a_ptr1, #1\n" "udot z29.s, z9.b, z3.b[0]\n" "ld1b z9.b, p4/z, [%[b_ptr2]]\n" "udot z18.s, z10.b, z0.b[0]\n" "add %[b_ptr2], %[b_ptr2], %[ldb]\n" "udot z22.s, z10.b, z1.b[0]\n" - "addvl a_ptr3, a_ptr3, #1\n" + "addvl a_ptr2, a_ptr2, #1\n" "udot z26.s, z10.b, z2.b[0]\n" + "addvl a_ptr3, a_ptr3, #1\n" "udot z30.s, z10.b, z3.b[0]\n" "ld1b z10.b, p4/z, [%[b_ptr1]]\n" "udot z19.s, z11.b, z0.b[0]\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx8.hpp index 9bee502236..6b070d6d71 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx8.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx8.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 ARM Limited. + * Copyright (c) 2019-2020 Arm Limited. 
 * * SPDX-License-Identifier: MIT * @@ -77,7 +77,10 @@ public: // Default to the generic kernel kern_type kernel=sve_smallK_hybrid_fp32_mla_1VLx8; - smallK_hybrid_fp32_mla_1VLx8(const CPUInfo *ci) { UNUSED(ci); } + smallK_hybrid_fp32_mla_1VLx8(const CPUInfo *) + { + + } }; } // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_1VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_1VLx8.hpp index fc18cbdbbf..9bc0969bf2 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_1VLx8.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_1VLx8.hpp @@ -77,7 +77,10 @@ public: // Default to the generic kernel kern_type kernel=sve_smallK_hybrid_s8s32_dot_1VLx8; - smallK_hybrid_s8s32_dot_1VLx8(const CPUInfo *ci) { UNUSED(ci); } + smallK_hybrid_s8s32_dot_1VLx8(const CPUInfo *) + { + + } }; } // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_1VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_1VLx8.hpp index 51d3e736ed..cc27c13533 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_1VLx8.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_1VLx8.hpp @@ -77,7 +77,10 @@ public: // Default to the generic kernel kern_type kernel=sve_smallK_hybrid_u8u32_dot_1VLx8; - smallK_hybrid_u8u32_dot_1VLx8(const CPUInfo *ci) { UNUSED(ci); } + smallK_hybrid_u8u32_dot_1VLx8(const CPUInfo *) + { + + } }; } // namespace arm_gemm
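
Across the sdot/udot hunks above the change follows one pattern: each ld1b/ld1rqb is hoisted so it issues several instructions before the first dot-product that consumes its result, and the b_ptr increments and addvl updates are re-slotted into the gaps. The following scalar C++ sketch illustrates the same load-ahead (software pipelining) idea in miniature; the function name and the flat buffer layout are illustrative only, not taken from arm_gemm:

    #include <cstddef>
    #include <cstdint>

    // Load-ahead sketch: fetch element i+1 while the multiply-accumulate for
    // element i is still "in flight", mirroring in scalar form how the SVE
    // loops above hoist ld1b/ld1rqb ahead of the sdot/udot consumers.
    // Hypothetical helper, not part of the library.
    static int32_t dot_s8_pipelined(const int8_t *a, const int8_t *b, std::size_t n) {
        if (n == 0) return 0;
        int32_t acc = 0;
        int8_t a_cur = a[0];
        int8_t b_cur = b[0];                         // prime the pipeline
        for (std::size_t i = 1; i < n; i++) {
            const int8_t a_next = a[i];              // issue the next loads early...
            const int8_t b_next = b[i];
            acc += int32_t(a_cur) * int32_t(b_cur);  // ...while computing on the current pair
            a_cur = a_next;
            b_cur = b_next;
        }
        acc += int32_t(a_cur) * int32_t(b_cur);      // drain the pipeline
        return acc;
    }

An out-of-order core can do some of this rescheduling itself, but placing loads early in the static instruction stream keeps the load pipes busy even on narrower in-order machines, which is presumably why the hunks rebalance loads, dot-products and pointer arithmetic rather than changing any arithmetic.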
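The C++ change to sve_native_u8u32_dot_4VLx4 replaces the fixed four-row stride with a rows_to_compute step: while more than four rows remain and the remainder is not a multiple of four, the loop takes three-row bites, so for M > 1 the final block always keeps at least two rows instead of degenerating to a one-row tail. A standalone sketch of just that partitioning, with a printf standing in for the real switch(rows_to_compute) kernel dispatch:

    #include <cstdio>

    // Row-blocking heuristic from the hunk above: step by 3 when the rows
    // remaining exceed 4 and are not a multiple of 4, otherwise step by 4.
    int main() {
        const int M = 10;  // illustrative row count
        int rows_to_compute;
        for (int y = 0; y < M; y += rows_to_compute) {
            rows_to_compute = M - y;
            if (rows_to_compute > 4) {
                rows_to_compute = (rows_to_compute % 4) ? 3 : 4;
            }
            std::printf("rows %d..%d (%d-row block)\n",
                        y, y + rows_to_compute - 1, rows_to_compute);
        }
        return 0;
    }

For M == 10 this produces 3-, 3- and 4-row blocks where the old fixed stride produced 4, 4 and 2; the switch on rows_to_compute then picks the matching 1- to 4-row assembly path each iteration.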
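The constructor hunks in the three smallK_hybrid headers are the same mechanical cleanup applied throughout the commit: the CPUInfo argument is left unnamed rather than named and fed to UNUSED(). An unnamed parameter keeps the public signature intact while making an unused-parameter warning impossible by construction, so the helper macro can go. A minimal illustration, using a hypothetical wrapper class rather than one of the kernel classes:

    // Forward declaration stands in for the library's real CPUInfo type.
    struct CPUInfo;

    class example_kernel {  // hypothetical, shaped like the wrappers above
    public:
        // Unnamed parameter: callers still pass a CPUInfo*, but there is no
        // name to be "unused", so no UNUSED() macro is required.
        example_kernel(const CPUInfo *) { }
    };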