diff options
Diffstat (limited to 'src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_12x8/generic.cpp')
-rw-r--r-- | src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_12x8/generic.cpp | 53 |
1 files changed, 39 insertions, 14 deletions
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_12x8/generic.cpp index be87f442ea..7f0eff29af 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_12x8/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_12x8/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -59,42 +59,65 @@ void a64_interleaved_bf16fp32_mmla_12x8(const bfloat16 *Apanel, const bfloat16 * "movi v13.4s, #0\n" "ldr q6, [%[b_ptr], #0x20]\n" "movi v14.4s, #0\n" - "ldr q3, [%[a_ptr], #0x30]\n" + "prfm PLDL1KEEP, [%[a_ptr], #0x40]\n" "movi v15.4s, #0\n" - "ldr q7, [%[b_ptr], #0x30]\n" + "prfm PLDL1KEEP, [%[b_ptr], #0x40]\n" "movi v16.4s, #0\n" - "add %[a_ptr], %[a_ptr], #0x40\n" + "prfm PLDL1KEEP, [%[a_ptr], #0x80]\n" "movi v17.4s, #0\n" - "add %[b_ptr], %[b_ptr], #0x40\n" + "prfm PLDL1KEEP, [%[b_ptr], #0x80]\n" "movi v18.4s, #0\n" + "prfm PLDL1KEEP, [%[a_ptr], #0xc0]\n" "movi v19.4s, #0\n" + "prfm PLDL1KEEP, [%[b_ptr], #0xc0]\n" "movi v20.4s, #0\n" + "prfm PLDL1KEEP, [%[a_ptr], #0x100]\n" "movi v21.4s, #0\n" + "prfm PLDL1KEEP, [%[b_ptr], #0x100]\n" "movi v22.4s, #0\n" + "prfm PLDL1KEEP, [%[a_ptr], #0x140]\n" "movi v23.4s, #0\n" + "prfm PLDL1KEEP, [%[b_ptr], #0x140]\n" "movi v24.4s, #0\n" + "prfm PLDL1KEEP, [%[a_ptr], #0x180]\n" "movi v25.4s, #0\n" + "prfm PLDL1KEEP, [%[b_ptr], #0x180]\n" "movi v26.4s, #0\n" + "prfm PLDL1KEEP, [%[a_ptr], #0x1c0]\n" "movi v27.4s, #0\n" + "prfm PLDL1KEEP, [%[b_ptr], #0x1c0]\n" "movi v28.4s, #0\n" + "prfm PLDL1KEEP, [%[b_ptr], #0x200]\n" "movi v29.4s, #0\n" + "prfm PLDL1KEEP, [%[b_ptr], #0x240]\n" "movi v30.4s, #0\n" + "prfm PLDL1KEEP, [%[b_ptr], #0x280]\n" "movi v31.4s, #0\n" + "prfm PLDL1KEEP, [%[b_ptr], #0x2c0]\n" + "add %[a_ptr], %[a_ptr], #0x40\n" + "add %[b_ptr], %[b_ptr], #0x40\n" "cbz %[loops], 1f\n" "2:\n" ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n" - "subs %[loops], %[loops], #0x1\n" + "ldr q7, [%[b_ptr], #-0x10]\n" ".inst 0x6e44ec2e // bfmmla v14.4s, v1.8h, v4.8h\n" + "ldr q3, [%[a_ptr], #-0x10]\n" ".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n" + "subs %[loops], %[loops], #0x1\n" + ".inst 0x6e45ec09 // bfmmla v9.4s, v0.8h, v5.8h\n" + "prfm PLDL1KEEP, [%[a_ptr], #0x1c0]\n" ".inst 0x6e44ec7a // bfmmla v26.4s, v3.8h, v4.8h\n" "ldr q4, [%[b_ptr]]\n" - ".inst 0x6e45ec09 // bfmmla v9.4s, v0.8h, v5.8h\n" ".inst 0x6e45ec2f // bfmmla v15.4s, v1.8h, v5.8h\n" + "prfm PLDL1KEEP, [%[b_ptr], #0x2c0]\n" ".inst 0x6e45ec55 // bfmmla v21.4s, v2.8h, v5.8h\n" + "prfm PLDL1KEEP, [%[a_ptr], #0x200]\n" ".inst 0x6e45ec7b // bfmmla v27.4s, v3.8h, v5.8h\n" "ldr q5, [%[b_ptr], #0x10]\n" ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n" + "prfm PLDL1KEEP, [%[b_ptr], #0x300]\n" ".inst 0x6e46ec30 // bfmmla v16.4s, v1.8h, v6.8h\n" + "prfm PLDL1KEEP, [%[b_ptr], #0x340]\n" ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n" ".inst 0x6e46ec7c // bfmmla v28.4s, v3.8h, v6.8h\n" "ldr q6, [%[b_ptr], #0x20]\n" @@ -151,18 +174,18 @@ void a64_interleaved_bf16fp32_mmla_12x8(const bfloat16 *Apanel, const bfloat16 * ".inst 0x6e47ec59 // bfmmla v25.4s, v2.8h, v7.8h\n" "ldr q2, [%[a_ptr], #-0x20]\n" ".inst 0x6e47ec7f // bfmmla v31.4s, v3.8h, v7.8h\n" - "ldr q7, [%[b_ptr], #-0x10]\n" - "ldr q3, [%[a_ptr], #-0x10]\n" "b.ne 2b\n" "1:\n" "cbz %[tails], 3f\n" ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n" + "ldr q7, [%[b_ptr], #-0x10]\n" ".inst 0x6e44ec2e // bfmmla v14.4s, v1.8h, v4.8h\n" + "ldr q3, [%[a_ptr], #-0x10]\n" ".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n" - ".inst 0x6e44ec7a // bfmmla v26.4s, v3.8h, v4.8h\n" - "ldr q4, [%[b_ptr]]\n" ".inst 0x6e45ec09 // bfmmla v9.4s, v0.8h, v5.8h\n" ".inst 0x6e45ec2f // bfmmla v15.4s, v1.8h, v5.8h\n" + ".inst 0x6e44ec7a // bfmmla v26.4s, v3.8h, v4.8h\n" + "ldr q4, [%[b_ptr]]\n" ".inst 0x6e45ec55 // bfmmla v21.4s, v2.8h, v5.8h\n" ".inst 0x6e45ec7b // bfmmla v27.4s, v3.8h, v5.8h\n" "ldr q5, [%[b_ptr], #0x10]\n" @@ -268,13 +291,15 @@ void a64_interleaved_bf16fp32_mmla_12x8(const bfloat16 *Apanel, const bfloat16 * "b 4f\n" "3:\n" ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n" - "add %[a_ptr], %[a_ptr], #0x40\n" + "ldr q7, [%[b_ptr], #-0x10]\n" ".inst 0x6e44ec2e // bfmmla v14.4s, v1.8h, v4.8h\n" - "add %[b_ptr], %[b_ptr], #0x80\n" + "ldr q3, [%[a_ptr], #-0x10]\n" ".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n" + "add %[a_ptr], %[a_ptr], #0x40\n" + ".inst 0x6e45ec09 // bfmmla v9.4s, v0.8h, v5.8h\n" + "add %[b_ptr], %[b_ptr], #0x80\n" ".inst 0x6e44ec7a // bfmmla v26.4s, v3.8h, v4.8h\n" "ldr q4, [%[b_ptr], #-0x80]\n" - ".inst 0x6e45ec09 // bfmmla v9.4s, v0.8h, v5.8h\n" ".inst 0x6e45ec2f // bfmmla v15.4s, v1.8h, v5.8h\n" ".inst 0x6e45ec55 // bfmmla v21.4s, v2.8h, v5.8h\n" ".inst 0x6e45ec7b // bfmmla v27.4s, v3.8h, v5.8h\n" |