From a00180a7a5e957540493d666565eb39c0cd1f122 Mon Sep 17 00:00:00 2001 From: Michalis Spyrou Date: Fri, 26 Mar 2021 14:47:24 +0000 Subject: Update U8 mmla kernel Change-Id: Ia12a8761ea77a467f3382d06bb222fe5f165925e Signed-off-by: Michalis Spyrou Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5333 Tested-by: Arm Jenkins Reviewed-by: Georgios Pinitas --- .../a64_interleaved_u8u32_mmla_8x12/generic.cpp | 102 +++++++++++++-------- 1 file changed, 62 insertions(+), 40 deletions(-) (limited to 'src/core/NEON/kernels') diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/generic.cpp index 568e5d1098..238a703708 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/generic.cpp @@ -57,13 +57,12 @@ void a64_interleaved_u8u32_mmla_8x12(const uint8_t *Apanel, const uint8_t *Bpane "movi v12.4s, #0\n" "ldr q2, [%[a_ptr], #0x20]\n" "movi v13.4s, #0\n" - "ldr q6, [%[b_ptr], #0x20]\n" "movi v14.4s, #0\n" - "add %[a_ptr], %[a_ptr], #0x40\n" "movi v15.4s, #0\n" - "add %[b_ptr], %[b_ptr], #0x40\n" "movi v16.4s, #0\n" + "add %[a_ptr], %[a_ptr], #0x40\n" "movi v17.4s, #0\n" + "add %[b_ptr], %[b_ptr], #0x40\n" "movi v18.4s, #0\n" "movi v19.4s, #0\n" "movi v20.4s, #0\n" @@ -80,88 +79,111 @@ void a64_interleaved_u8u32_mmla_8x12(const uint8_t *Apanel, const uint8_t *Bpane "movi v31.4s, #0\n" "cbz %[loops], 1f\n" "2:\n" + "ldr q3, [%[a_ptr], #-0x10]\n" ".inst 0x6e84a408 // ummla v8.4s, v0.16b, v4.16b\n" - "ldr q7, [%[b_ptr], #-0x10]\n" + ".inst 0x6e85a409 // ummla v9.4s, v0.16b, v5.16b\n" + + "ldp q6, q7, [%[b_ptr], #-0x20]\n" ".inst 0x6e84a42e // ummla v14.4s, v1.16b, v4.16b\n" - "ldr q3, [%[a_ptr], #-0x10]\n" + + ".inst 0x6e85a42f // ummla v15.4s, v1.16b, v5.16b\n" ".inst 0x6e84a454 // ummla v20.4s, v2.16b, v4.16b\n" + + ".inst 0x6e85a455 // ummla v21.4s, v2.16b, v5.16b\n" "subs %[loops], %[loops], #0x1\n" - ".inst 0x6e85a409 // ummla v9.4s, v0.16b, v5.16b\n" + ".inst 0x6e84a47a // ummla v26.4s, v3.16b, v4.16b\n" - "ldr q4, [%[b_ptr]]\n" - ".inst 0x6e85a42f // ummla v15.4s, v1.16b, v5.16b\n" - ".inst 0x6e85a455 // ummla v21.4s, v2.16b, v5.16b\n" ".inst 0x6e85a47b // ummla v27.4s, v3.16b, v5.16b\n" - "ldr q5, [%[b_ptr], #0x10]\n" + + "ldp q4, q5, [%[b_ptr]]\n" ".inst 0x6e86a40a // ummla v10.4s, v0.16b, v6.16b\n" + ".inst 0x6e86a430 // ummla v16.4s, v1.16b, v6.16b\n" ".inst 0x6e86a456 // ummla v22.4s, v2.16b, v6.16b\n" + ".inst 0x6e86a47c // ummla v28.4s, v3.16b, v6.16b\n" - "ldr q6, [%[b_ptr], #0x20]\n" ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n" + ".inst 0x6e87a431 // ummla v17.4s, v1.16b, v7.16b\n" ".inst 0x6e87a457 // ummla v23.4s, v2.16b, v7.16b\n" + ".inst 0x6e87a47d // ummla v29.4s, v3.16b, v7.16b\n" - "ldr q7, [%[b_ptr], #0x30]\n" + "ldp q6, q7, [%[b_ptr], #0x20]\n" + ".inst 0x6e84a40c // ummla v12.4s, v0.16b, v4.16b\n" - ".inst 0x6e84a432 // ummla v18.4s, v1.16b, v4.16b\n" - ".inst 0x6e84a458 // ummla v24.4s, v2.16b, v4.16b\n" - ".inst 0x6e84a47e // ummla v30.4s, v3.16b, v4.16b\n" - "ldr q4, [%[b_ptr], #0x40]\n" ".inst 0x6e85a40d // ummla v13.4s, v0.16b, v5.16b\n" "ldr q0, [%[a_ptr]]\n" + + ".inst 0x6e84a432 // ummla v18.4s, v1.16b, v4.16b\n" ".inst 0x6e85a433 // ummla v19.4s, v1.16b, v5.16b\n" "ldr q1, [%[a_ptr], #0x10]\n" + + ".inst 0x6e84a458 // ummla v24.4s, v2.16b, v4.16b\n" ".inst 0x6e85a459 // ummla v25.4s, v2.16b, v5.16b\n" "ldr q2, [%[a_ptr], #0x20]\n" + + ".inst 0x6e84a47e // ummla v30.4s, v3.16b, v4.16b\n" ".inst 0x6e85a47f // ummla v31.4s, v3.16b, v5.16b\n" - "ldr q5, [%[b_ptr], #0x50]\n" - ".inst 0x6e86a408 // ummla v8.4s, v0.16b, v6.16b\n" "ldr q3, [%[a_ptr], #0x30]\n" - ".inst 0x6e86a42e // ummla v14.4s, v1.16b, v6.16b\n" - "add %[a_ptr], %[a_ptr], #0x80\n" - ".inst 0x6e86a454 // ummla v20.4s, v2.16b, v6.16b\n" - "add %[b_ptr], %[b_ptr], #0xc0\n" - ".inst 0x6e86a47a // ummla v26.4s, v3.16b, v6.16b\n" - "ldr q6, [%[b_ptr], #-0x60]\n" + + ".inst 0x6e86a408 // ummla v8.4s, v0.16b, v6.16b\n" ".inst 0x6e87a409 // ummla v9.4s, v0.16b, v7.16b\n" + + "ldp q4, q5, [%[b_ptr], #0x40]\n" + ".inst 0x6e86a42e // ummla v14.4s, v1.16b, v6.16b\n" + ".inst 0x6e87a42f // ummla v15.4s, v1.16b, v7.16b\n" + ".inst 0x6e86a454 // ummla v20.4s, v2.16b, v6.16b\n" + ".inst 0x6e87a455 // ummla v21.4s, v2.16b, v7.16b\n" + "add %[a_ptr], %[a_ptr], #0x80\n" + ".inst 0x6e86a47a // ummla v26.4s, v3.16b, v6.16b\n" + ".inst 0x6e87a47b // ummla v27.4s, v3.16b, v7.16b\n" - "ldr q7, [%[b_ptr], #-0x50]\n" + "add %[b_ptr], %[b_ptr], #0xc0\n" + "ldp q6, q7, [%[b_ptr], #-0x60]\n" + ".inst 0x6e84a40a // ummla v10.4s, v0.16b, v4.16b\n" ".inst 0x6e84a430 // ummla v16.4s, v1.16b, v4.16b\n" + ".inst 0x6e84a456 // ummla v22.4s, v2.16b, v4.16b\n" ".inst 0x6e84a47c // ummla v28.4s, v3.16b, v4.16b\n" - "ldr q4, [%[b_ptr], #-0x40]\n" + ".inst 0x6e85a40b // ummla v11.4s, v0.16b, v5.16b\n" ".inst 0x6e85a431 // ummla v17.4s, v1.16b, v5.16b\n" + ".inst 0x6e85a457 // ummla v23.4s, v2.16b, v5.16b\n" ".inst 0x6e85a47d // ummla v29.4s, v3.16b, v5.16b\n" - "ldr q5, [%[b_ptr], #-0x30]\n" + + "ldp q4, q5, [%[b_ptr], #-0x40]\n" ".inst 0x6e86a40c // ummla v12.4s, v0.16b, v6.16b\n" - ".inst 0x6e86a432 // ummla v18.4s, v1.16b, v6.16b\n" - ".inst 0x6e86a458 // ummla v24.4s, v2.16b, v6.16b\n" - ".inst 0x6e86a47e // ummla v30.4s, v3.16b, v6.16b\n" - "ldr q6, [%[b_ptr], #-0x20]\n" + ".inst 0x6e87a40d // ummla v13.4s, v0.16b, v7.16b\n" "ldr q0, [%[a_ptr], #-0x40]\n" + ".inst 0x6e86a432 // ummla v18.4s, v1.16b, v6.16b\n" + ".inst 0x6e87a433 // ummla v19.4s, v1.16b, v7.16b\n" "ldr q1, [%[a_ptr], #-0x30]\n" + ".inst 0x6e86a458 // ummla v24.4s, v2.16b, v6.16b\n" + ".inst 0x6e87a459 // ummla v25.4s, v2.16b, v7.16b\n" "ldr q2, [%[a_ptr], #-0x20]\n" + ".inst 0x6e86a47e // ummla v30.4s, v3.16b, v6.16b\n" + ".inst 0x6e87a47f // ummla v31.4s, v3.16b, v7.16b\n" "b.ne 2b\n" + "1:\n" + "ldr q3, [%[a_ptr], #-0x10]\n" "cbz %[tails], 3f\n" ".inst 0x6e84a408 // ummla v8.4s, v0.16b, v4.16b\n" - "ldr q7, [%[b_ptr], #-0x10]\n" ".inst 0x6e84a42e // ummla v14.4s, v1.16b, v4.16b\n" - "ldr q3, [%[a_ptr], #-0x10]\n" + "ldr q6, [%[b_ptr], #-0x20]\n" ".inst 0x6e84a454 // ummla v20.4s, v2.16b, v4.16b\n" + ".inst 0x6e84a47a // ummla v26.4s, v3.16b, v4.16b\n" + "ldr q7, [%[b_ptr], #-0x10]\n" ".inst 0x6e85a409 // ummla v9.4s, v0.16b, v5.16b\n" ".inst 0x6e85a42f // ummla v15.4s, v1.16b, v5.16b\n" - ".inst 0x6e84a47a // ummla v26.4s, v3.16b, v4.16b\n" "ldr q4, [%[b_ptr]]\n" ".inst 0x6e85a455 // ummla v21.4s, v2.16b, v5.16b\n" ".inst 0x6e85a47b // ummla v27.4s, v3.16b, v5.16b\n" @@ -267,16 +289,16 @@ void a64_interleaved_u8u32_mmla_8x12(const uint8_t *Apanel, const uint8_t *Bpane "str q2, [%[c_ptr], #0x20]\n" "b 4f\n" "3:\n" - ".inst 0x6e84a408 // ummla v8.4s, v0.16b, v4.16b\n" + "ldr q6, [%[b_ptr], #-0x20]\n" "ldr q7, [%[b_ptr], #-0x10]\n" - ".inst 0x6e84a42e // ummla v14.4s, v1.16b, v4.16b\n" - "ldr q3, [%[a_ptr], #-0x10]\n" - ".inst 0x6e84a454 // ummla v20.4s, v2.16b, v4.16b\n" + ".inst 0x6e84a408 // ummla v8.4s, v0.16b, v4.16b\n" "add %[a_ptr], %[a_ptr], #0x40\n" - ".inst 0x6e85a409 // ummla v9.4s, v0.16b, v5.16b\n" + ".inst 0x6e84a42e // ummla v14.4s, v1.16b, v4.16b\n" "add %[b_ptr], %[b_ptr], #0x80\n" + ".inst 0x6e84a454 // ummla v20.4s, v2.16b, v4.16b\n" ".inst 0x6e84a47a // ummla v26.4s, v3.16b, v4.16b\n" "ldr q4, [%[b_ptr], #-0x80]\n" + ".inst 0x6e85a409 // ummla v9.4s, v0.16b, v5.16b\n" ".inst 0x6e85a42f // ummla v15.4s, v1.16b, v5.16b\n" ".inst 0x6e85a455 // ummla v21.4s, v2.16b, v5.16b\n" ".inst 0x6e85a47b // ummla v27.4s, v3.16b, v5.16b\n" @@ -392,4 +414,4 @@ void a64_interleaved_u8u32_mmla_8x12(const uint8_t *Apanel, const uint8_t *Bpane } // namespace arm_gemm -#endif // __aarch64__ +#endif // __aarch64__ \ No newline at end of file -- cgit v1.2.1