about summary refs log tree commit diff
path: root/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_12x8/generic.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_12x8/generic.cpp')
-rw-r--r-- src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_12x8/generic.cpp | 30
1 files changed, 16 insertions, 14 deletions
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_12x8/generic.cpp
index e182a425f4..dcd15f0345 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_12x8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_12x8/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -59,13 +59,11 @@ void a64_interleaved_u8u32_mmla_12x8(const uint8_t *Apanel, const uint8_t *Bpane
"movi v13.4s, #0\n"
"ldr q6, [%[b_ptr], #0x20]\n"
"movi v14.4s, #0\n"
- "ldr q3, [%[a_ptr], #0x30]\n"
+ "add %[a_ptr], %[a_ptr], #0x40\n"
"movi v15.4s, #0\n"
- "ldr q7, [%[b_ptr], #0x30]\n"
+ "add %[b_ptr], %[b_ptr], #0x40\n"
"movi v16.4s, #0\n"
- "add %[a_ptr], %[a_ptr], #0x40\n"
"movi v17.4s, #0\n"
- "add %[b_ptr], %[b_ptr], #0x40\n"
"movi v18.4s, #0\n"
"movi v19.4s, #0\n"
"movi v20.4s, #0\n"
@@ -83,12 +81,14 @@ void a64_interleaved_u8u32_mmla_12x8(const uint8_t *Apanel, const uint8_t *Bpane
"cbz %[loops], 1f\n"
"2:\n"
".inst 0x6e84a408 // ummla v8.4s, v0.16b, v4.16b\n"
- "subs %[loops], %[loops], #0x1\n"
+ "ldr q7, [%[b_ptr], #-0x10]\n"
".inst 0x6e84a42e // ummla v14.4s, v1.16b, v4.16b\n"
+ "ldr q3, [%[a_ptr], #-0x10]\n"
".inst 0x6e84a454 // ummla v20.4s, v2.16b, v4.16b\n"
+ "subs %[loops], %[loops], #0x1\n"
+ ".inst 0x6e85a409 // ummla v9.4s, v0.16b, v5.16b\n"
".inst 0x6e84a47a // ummla v26.4s, v3.16b, v4.16b\n"
"ldr q4, [%[b_ptr]]\n"
- ".inst 0x6e85a409 // ummla v9.4s, v0.16b, v5.16b\n"
".inst 0x6e85a42f // ummla v15.4s, v1.16b, v5.16b\n"
".inst 0x6e85a455 // ummla v21.4s, v2.16b, v5.16b\n"
".inst 0x6e85a47b // ummla v27.4s, v3.16b, v5.16b\n"
@@ -151,18 +151,18 @@ void a64_interleaved_u8u32_mmla_12x8(const uint8_t *Apanel, const uint8_t *Bpane
".inst 0x6e87a459 // ummla v25.4s, v2.16b, v7.16b\n"
"ldr q2, [%[a_ptr], #-0x20]\n"
".inst 0x6e87a47f // ummla v31.4s, v3.16b, v7.16b\n"
- "ldr q7, [%[b_ptr], #-0x10]\n"
- "ldr q3, [%[a_ptr], #-0x10]\n"
"b.ne 2b\n"
"1:\n"
"cbz %[tails], 3f\n"
".inst 0x6e84a408 // ummla v8.4s, v0.16b, v4.16b\n"
+ "ldr q7, [%[b_ptr], #-0x10]\n"
".inst 0x6e84a42e // ummla v14.4s, v1.16b, v4.16b\n"
+ "ldr q3, [%[a_ptr], #-0x10]\n"
".inst 0x6e84a454 // ummla v20.4s, v2.16b, v4.16b\n"
- ".inst 0x6e84a47a // ummla v26.4s, v3.16b, v4.16b\n"
- "ldr q4, [%[b_ptr]]\n"
".inst 0x6e85a409 // ummla v9.4s, v0.16b, v5.16b\n"
".inst 0x6e85a42f // ummla v15.4s, v1.16b, v5.16b\n"
+ ".inst 0x6e84a47a // ummla v26.4s, v3.16b, v4.16b\n"
+ "ldr q4, [%[b_ptr]]\n"
".inst 0x6e85a455 // ummla v21.4s, v2.16b, v5.16b\n"
".inst 0x6e85a47b // ummla v27.4s, v3.16b, v5.16b\n"
"ldr q5, [%[b_ptr], #0x10]\n"
@@ -268,13 +268,15 @@ void a64_interleaved_u8u32_mmla_12x8(const uint8_t *Apanel, const uint8_t *Bpane
"b 4f\n"
"3:\n"
".inst 0x6e84a408 // ummla v8.4s, v0.16b, v4.16b\n"
- "add %[a_ptr], %[a_ptr], #0x40\n"
+ "ldr q7, [%[b_ptr], #-0x10]\n"
".inst 0x6e84a42e // ummla v14.4s, v1.16b, v4.16b\n"
- "add %[b_ptr], %[b_ptr], #0x80\n"
+ "ldr q3, [%[a_ptr], #-0x10]\n"
".inst 0x6e84a454 // ummla v20.4s, v2.16b, v4.16b\n"
+ "add %[a_ptr], %[a_ptr], #0x40\n"
+ ".inst 0x6e85a409 // ummla v9.4s, v0.16b, v5.16b\n"
+ "add %[b_ptr], %[b_ptr], #0x80\n"
".inst 0x6e84a47a // ummla v26.4s, v3.16b, v4.16b\n"
"ldr q4, [%[b_ptr], #-0x80]\n"
- ".inst 0x6e85a409 // ummla v9.4s, v0.16b, v5.16b\n"
".inst 0x6e85a42f // ummla v15.4s, v1.16b, v5.16b\n"
".inst 0x6e85a455 // ummla v21.4s, v2.16b, v5.16b\n"
".inst 0x6e85a47b // ummla v27.4s, v3.16b, v5.16b\n"