about summary refs log tree commit diff
path: root/src/core/NEON
diff options
context:
space:
mode:
author Michalis Spyrou <michalis.spyrou@arm.com> 2021-03-26 14:47:24 +0000
committer Georgios Pinitas <georgios.pinitas@arm.com> 2021-03-29 19:50:18 +0000
commit a00180a7a5e957540493d666565eb39c0cd1f122 (patch)
tree 1da92febf7ea233c72fe8b49dbe0154a58fff96c /src/core/NEON
parent 702dc0c71f2b2830b63e3b4079ede0ef76377f0a (diff)
download ComputeLibrary-a00180a7a5e957540493d666565eb39c0cd1f122.tar.gz
Update U8 mmla kernel
Change-Id: Ia12a8761ea77a467f3382d06bb222fe5f165925e
Signed-off-by: Michalis Spyrou <michalis.spyrou@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5333
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Diffstat (limited to 'src/core/NEON')
-rw-r--r-- src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/generic.cpp | 102
1 file changed, 62 insertions, 40 deletions
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/generic.cpp
index 568e5d1098..238a703708 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/generic.cpp
@@ -57,13 +57,12 @@ void a64_interleaved_u8u32_mmla_8x12(const uint8_t *Apanel, const uint8_t *Bpane
"movi v12.4s, #0\n"
"ldr q2, [%[a_ptr], #0x20]\n"
"movi v13.4s, #0\n"
- "ldr q6, [%[b_ptr], #0x20]\n"
"movi v14.4s, #0\n"
- "add %[a_ptr], %[a_ptr], #0x40\n"
"movi v15.4s, #0\n"
- "add %[b_ptr], %[b_ptr], #0x40\n"
"movi v16.4s, #0\n"
+ "add %[a_ptr], %[a_ptr], #0x40\n"
"movi v17.4s, #0\n"
+ "add %[b_ptr], %[b_ptr], #0x40\n"
"movi v18.4s, #0\n"
"movi v19.4s, #0\n"
"movi v20.4s, #0\n"
@@ -80,88 +79,111 @@ void a64_interleaved_u8u32_mmla_8x12(const uint8_t *Apanel, const uint8_t *Bpane
"movi v31.4s, #0\n"
"cbz %[loops], 1f\n"
"2:\n"
+ "ldr q3, [%[a_ptr], #-0x10]\n"
".inst 0x6e84a408 // ummla v8.4s, v0.16b, v4.16b\n"
- "ldr q7, [%[b_ptr], #-0x10]\n"
+ ".inst 0x6e85a409 // ummla v9.4s, v0.16b, v5.16b\n"
+
+ "ldp q6, q7, [%[b_ptr], #-0x20]\n"
".inst 0x6e84a42e // ummla v14.4s, v1.16b, v4.16b\n"
- "ldr q3, [%[a_ptr], #-0x10]\n"
+
+ ".inst 0x6e85a42f // ummla v15.4s, v1.16b, v5.16b\n"
".inst 0x6e84a454 // ummla v20.4s, v2.16b, v4.16b\n"
+
+ ".inst 0x6e85a455 // ummla v21.4s, v2.16b, v5.16b\n"
"subs %[loops], %[loops], #0x1\n"
- ".inst 0x6e85a409 // ummla v9.4s, v0.16b, v5.16b\n"
+
".inst 0x6e84a47a // ummla v26.4s, v3.16b, v4.16b\n"
- "ldr q4, [%[b_ptr]]\n"
- ".inst 0x6e85a42f // ummla v15.4s, v1.16b, v5.16b\n"
- ".inst 0x6e85a455 // ummla v21.4s, v2.16b, v5.16b\n"
".inst 0x6e85a47b // ummla v27.4s, v3.16b, v5.16b\n"
- "ldr q5, [%[b_ptr], #0x10]\n"
+
+ "ldp q4, q5, [%[b_ptr]]\n"
".inst 0x6e86a40a // ummla v10.4s, v0.16b, v6.16b\n"
+
".inst 0x6e86a430 // ummla v16.4s, v1.16b, v6.16b\n"
".inst 0x6e86a456 // ummla v22.4s, v2.16b, v6.16b\n"
+
".inst 0x6e86a47c // ummla v28.4s, v3.16b, v6.16b\n"
- "ldr q6, [%[b_ptr], #0x20]\n"
".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n"
+
".inst 0x6e87a431 // ummla v17.4s, v1.16b, v7.16b\n"
".inst 0x6e87a457 // ummla v23.4s, v2.16b, v7.16b\n"
+
".inst 0x6e87a47d // ummla v29.4s, v3.16b, v7.16b\n"
- "ldr q7, [%[b_ptr], #0x30]\n"
+ "ldp q6, q7, [%[b_ptr], #0x20]\n"
+
".inst 0x6e84a40c // ummla v12.4s, v0.16b, v4.16b\n"
- ".inst 0x6e84a432 // ummla v18.4s, v1.16b, v4.16b\n"
- ".inst 0x6e84a458 // ummla v24.4s, v2.16b, v4.16b\n"
- ".inst 0x6e84a47e // ummla v30.4s, v3.16b, v4.16b\n"
- "ldr q4, [%[b_ptr], #0x40]\n"
".inst 0x6e85a40d // ummla v13.4s, v0.16b, v5.16b\n"
"ldr q0, [%[a_ptr]]\n"
+
+ ".inst 0x6e84a432 // ummla v18.4s, v1.16b, v4.16b\n"
".inst 0x6e85a433 // ummla v19.4s, v1.16b, v5.16b\n"
"ldr q1, [%[a_ptr], #0x10]\n"
+
+ ".inst 0x6e84a458 // ummla v24.4s, v2.16b, v4.16b\n"
".inst 0x6e85a459 // ummla v25.4s, v2.16b, v5.16b\n"
"ldr q2, [%[a_ptr], #0x20]\n"
+
+ ".inst 0x6e84a47e // ummla v30.4s, v3.16b, v4.16b\n"
".inst 0x6e85a47f // ummla v31.4s, v3.16b, v5.16b\n"
- "ldr q5, [%[b_ptr], #0x50]\n"
- ".inst 0x6e86a408 // ummla v8.4s, v0.16b, v6.16b\n"
"ldr q3, [%[a_ptr], #0x30]\n"
- ".inst 0x6e86a42e // ummla v14.4s, v1.16b, v6.16b\n"
- "add %[a_ptr], %[a_ptr], #0x80\n"
- ".inst 0x6e86a454 // ummla v20.4s, v2.16b, v6.16b\n"
- "add %[b_ptr], %[b_ptr], #0xc0\n"
- ".inst 0x6e86a47a // ummla v26.4s, v3.16b, v6.16b\n"
- "ldr q6, [%[b_ptr], #-0x60]\n"
+
+ ".inst 0x6e86a408 // ummla v8.4s, v0.16b, v6.16b\n"
".inst 0x6e87a409 // ummla v9.4s, v0.16b, v7.16b\n"
+
+ "ldp q4, q5, [%[b_ptr], #0x40]\n"
+ ".inst 0x6e86a42e // ummla v14.4s, v1.16b, v6.16b\n"
+
".inst 0x6e87a42f // ummla v15.4s, v1.16b, v7.16b\n"
+ ".inst 0x6e86a454 // ummla v20.4s, v2.16b, v6.16b\n"
+
".inst 0x6e87a455 // ummla v21.4s, v2.16b, v7.16b\n"
+ "add %[a_ptr], %[a_ptr], #0x80\n"
+ ".inst 0x6e86a47a // ummla v26.4s, v3.16b, v6.16b\n"
+
".inst 0x6e87a47b // ummla v27.4s, v3.16b, v7.16b\n"
- "ldr q7, [%[b_ptr], #-0x50]\n"
+ "add %[b_ptr], %[b_ptr], #0xc0\n"
+ "ldp q6, q7, [%[b_ptr], #-0x60]\n"
+
".inst 0x6e84a40a // ummla v10.4s, v0.16b, v4.16b\n"
".inst 0x6e84a430 // ummla v16.4s, v1.16b, v4.16b\n"
+
".inst 0x6e84a456 // ummla v22.4s, v2.16b, v4.16b\n"
".inst 0x6e84a47c // ummla v28.4s, v3.16b, v4.16b\n"
- "ldr q4, [%[b_ptr], #-0x40]\n"
+
".inst 0x6e85a40b // ummla v11.4s, v0.16b, v5.16b\n"
".inst 0x6e85a431 // ummla v17.4s, v1.16b, v5.16b\n"
+
".inst 0x6e85a457 // ummla v23.4s, v2.16b, v5.16b\n"
".inst 0x6e85a47d // ummla v29.4s, v3.16b, v5.16b\n"
- "ldr q5, [%[b_ptr], #-0x30]\n"
+
+ "ldp q4, q5, [%[b_ptr], #-0x40]\n"
".inst 0x6e86a40c // ummla v12.4s, v0.16b, v6.16b\n"
- ".inst 0x6e86a432 // ummla v18.4s, v1.16b, v6.16b\n"
- ".inst 0x6e86a458 // ummla v24.4s, v2.16b, v6.16b\n"
- ".inst 0x6e86a47e // ummla v30.4s, v3.16b, v6.16b\n"
- "ldr q6, [%[b_ptr], #-0x20]\n"
+
".inst 0x6e87a40d // ummla v13.4s, v0.16b, v7.16b\n"
"ldr q0, [%[a_ptr], #-0x40]\n"
+ ".inst 0x6e86a432 // ummla v18.4s, v1.16b, v6.16b\n"
+
".inst 0x6e87a433 // ummla v19.4s, v1.16b, v7.16b\n"
"ldr q1, [%[a_ptr], #-0x30]\n"
+ ".inst 0x6e86a458 // ummla v24.4s, v2.16b, v6.16b\n"
+
".inst 0x6e87a459 // ummla v25.4s, v2.16b, v7.16b\n"
"ldr q2, [%[a_ptr], #-0x20]\n"
+ ".inst 0x6e86a47e // ummla v30.4s, v3.16b, v6.16b\n"
+
".inst 0x6e87a47f // ummla v31.4s, v3.16b, v7.16b\n"
"b.ne 2b\n"
+
"1:\n"
+ "ldr q3, [%[a_ptr], #-0x10]\n"
"cbz %[tails], 3f\n"
".inst 0x6e84a408 // ummla v8.4s, v0.16b, v4.16b\n"
- "ldr q7, [%[b_ptr], #-0x10]\n"
".inst 0x6e84a42e // ummla v14.4s, v1.16b, v4.16b\n"
- "ldr q3, [%[a_ptr], #-0x10]\n"
+ "ldr q6, [%[b_ptr], #-0x20]\n"
".inst 0x6e84a454 // ummla v20.4s, v2.16b, v4.16b\n"
+ ".inst 0x6e84a47a // ummla v26.4s, v3.16b, v4.16b\n"
+ "ldr q7, [%[b_ptr], #-0x10]\n"
".inst 0x6e85a409 // ummla v9.4s, v0.16b, v5.16b\n"
".inst 0x6e85a42f // ummla v15.4s, v1.16b, v5.16b\n"
- ".inst 0x6e84a47a // ummla v26.4s, v3.16b, v4.16b\n"
"ldr q4, [%[b_ptr]]\n"
".inst 0x6e85a455 // ummla v21.4s, v2.16b, v5.16b\n"
".inst 0x6e85a47b // ummla v27.4s, v3.16b, v5.16b\n"
@@ -267,16 +289,16 @@ void a64_interleaved_u8u32_mmla_8x12(const uint8_t *Apanel, const uint8_t *Bpane
"str q2, [%[c_ptr], #0x20]\n"
"b 4f\n"
"3:\n"
- ".inst 0x6e84a408 // ummla v8.4s, v0.16b, v4.16b\n"
+ "ldr q6, [%[b_ptr], #-0x20]\n"
"ldr q7, [%[b_ptr], #-0x10]\n"
- ".inst 0x6e84a42e // ummla v14.4s, v1.16b, v4.16b\n"
- "ldr q3, [%[a_ptr], #-0x10]\n"
- ".inst 0x6e84a454 // ummla v20.4s, v2.16b, v4.16b\n"
+ ".inst 0x6e84a408 // ummla v8.4s, v0.16b, v4.16b\n"
"add %[a_ptr], %[a_ptr], #0x40\n"
- ".inst 0x6e85a409 // ummla v9.4s, v0.16b, v5.16b\n"
+ ".inst 0x6e84a42e // ummla v14.4s, v1.16b, v4.16b\n"
"add %[b_ptr], %[b_ptr], #0x80\n"
+ ".inst 0x6e84a454 // ummla v20.4s, v2.16b, v4.16b\n"
".inst 0x6e84a47a // ummla v26.4s, v3.16b, v4.16b\n"
"ldr q4, [%[b_ptr], #-0x80]\n"
+ ".inst 0x6e85a409 // ummla v9.4s, v0.16b, v5.16b\n"
".inst 0x6e85a42f // ummla v15.4s, v1.16b, v5.16b\n"
".inst 0x6e85a455 // ummla v21.4s, v2.16b, v5.16b\n"
".inst 0x6e85a47b // ummla v27.4s, v3.16b, v5.16b\n"
@@ -392,4 +414,4 @@ void a64_interleaved_u8u32_mmla_8x12(const uint8_t *Apanel, const uint8_t *Bpane
} // namespace arm_gemm
-#endif // __aarch64__
+#endif // __aarch64__
\ No newline at end of file