1 files changed, 20 insertions, 18 deletions
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8/generic.cpp
index 8ce6a601fd..7ffae524dc 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8/generic.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -57,13 +57,11 @@ void a64_interleaved_bf16fp32_dot_12x8(const bfloat16 *Apanel, const bfloat16 *B
                 "movi v12.4s, #0\n"
                 "ldr q2, [%[a_ptr], #0x20]\n"
                 "movi v13.4s, #0\n"
-                "ldr q6, [%[b_ptr], #0x20]\n"
+                "add %[a_ptr], %[a_ptr], #0x40\n"
                 "movi v14.4s, #0\n"
-                "ldr q3, [%[a_ptr], #0x30]\n"
+                "add %[b_ptr], %[b_ptr], #0x30\n"
                 "movi v15.4s, #0\n"
-                "add %[a_ptr], %[a_ptr], #0x40\n"
                 "movi v16.4s, #0\n"
-                "add %[b_ptr], %[b_ptr], #0x30\n"
                 "movi v17.4s, #0\n"
                 "movi v18.4s, #0\n"
                 "movi v19.4s, #0\n"
@@ -82,9 +80,11 @@ void a64_interleaved_bf16fp32_dot_12x8(const bfloat16 *Apanel, const bfloat16 *B
                 "cbz %[loops], 1f\n"
                 "2:\n"
                 ".inst 0x4f40f088 // bfdot v8.4s, v4.8h, v0.h[0]\n"
-                "subs %[loops], %[loops], #0x1\n"
+                "ldr q6, [%[b_ptr], #-0x10]\n"
                 ".inst 0x4f60f089 // bfdot v9.4s, v4.8h, v0.h[1]\n"
+                "ldr q3, [%[a_ptr], #-0x10]\n"
                 ".inst 0x4f40f88a // bfdot v10.4s, v4.8h, v0.h[2]\n"
+                "subs %[loops], %[loops], #0x1\n"
                 ".inst 0x4f60f88b // bfdot v11.4s, v4.8h, v0.h[3]\n"
                 ".inst 0x4f41f094 // bfdot v20.4s, v4.8h, v1.h[0]\n"
                 ".inst 0x4f61f095 // bfdot v21.4s, v4.8h, v1.h[1]\n"
@@ -140,13 +140,13 @@ void a64_interleaved_bf16fp32_dot_12x8(const bfloat16 *Apanel, const bfloat16 *B
                 ".inst 0x4f63f0dd // bfdot v29.4s, v6.8h, v3.h[1]\n"
                 ".inst 0x4f43f8de // bfdot v30.4s, v6.8h, v3.h[2]\n"
                 ".inst 0x4f63f8df // bfdot v31.4s, v6.8h, v3.h[3]\n"
-                "ldr q6, [%[b_ptr], #-0x10]\n"
-                "ldr q3, [%[a_ptr], #-0x10]\n"
                 "b.ne 2b\n"
                 "1:\n"
                 "cbz %[tails], 3f\n"
                 ".inst 0x4f40f088 // bfdot v8.4s, v4.8h, v0.h[0]\n"
+                "ldr q6, [%[b_ptr], #-0x10]\n"
                 ".inst 0x4f60f089 // bfdot v9.4s, v4.8h, v0.h[1]\n"
+                "ldr q3, [%[a_ptr], #-0x10]\n"
                 ".inst 0x4f40f88a // bfdot v10.4s, v4.8h, v0.h[2]\n"
                 ".inst 0x4f60f88b // bfdot v11.4s, v4.8h, v0.h[3]\n"
                 ".inst 0x4f41f094 // bfdot v20.4s, v4.8h, v1.h[0]\n"
@@ -178,12 +178,13 @@ void a64_interleaved_bf16fp32_dot_12x8(const bfloat16 *Apanel, const bfloat16 *B
                 ".inst 0x4f62f089 // bfdot v9.4s, v4.8h, v2.h[1]\n"
                 "add %[a_ptr], %[a_ptr], #0x20\n"
                 ".inst 0x4f42f88a // bfdot v10.4s, v4.8h, v2.h[2]\n"
+                "add %[b_ptr], %[b_ptr], #0x60\n"
                 ".inst 0x4f62f88b // bfdot v11.4s, v4.8h, v2.h[3]\n"
                 ".inst 0x4f43f094 // bfdot v20.4s, v4.8h, v3.h[0]\n"
                 ".inst 0x4f63f095 // bfdot v21.4s, v4.8h, v3.h[1]\n"
                 ".inst 0x4f43f896 // bfdot v22.4s, v4.8h, v3.h[2]\n"
                 ".inst 0x4f63f897 // bfdot v23.4s, v4.8h, v3.h[3]\n"
-                "ldr q4, [%[b_ptr], #0x30]\n"
+                "ldr q4, [%[b_ptr], #-0x30]\n"
                 ".inst 0x4f42f0ac // bfdot v12.4s, v5.8h, v2.h[0]\n"
                 ".inst 0x4f62f0ad // bfdot v13.4s, v5.8h, v2.h[1]\n"
                 ".inst 0x4f42f8ae // bfdot v14.4s, v5.8h, v2.h[2]\n"
@@ -192,7 +193,7 @@ void a64_interleaved_bf16fp32_dot_12x8(const bfloat16 *Apanel, const bfloat16 *B
                 ".inst 0x4f63f0b9 // bfdot v25.4s, v5.8h, v3.h[1]\n"
                 ".inst 0x4f43f8ba // bfdot v26.4s, v5.8h, v3.h[2]\n"
                 ".inst 0x4f63f8bb // bfdot v27.4s, v5.8h, v3.h[3]\n"
-                "ldr q5, [%[b_ptr], #0x40]\n"
+                "ldr q5, [%[b_ptr], #-0x20]\n"
                 ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n"
                 ".inst 0x4f62f0d1 // bfdot v17.4s, v6.8h, v2.h[1]\n"
                 ".inst 0x4f42f8d2 // bfdot v18.4s, v6.8h, v2.h[2]\n"
@@ -201,13 +202,12 @@ void a64_interleaved_bf16fp32_dot_12x8(const bfloat16 *Apanel, const bfloat16 *B
                 ".inst 0x4f63f0dd // bfdot v29.4s, v6.8h, v3.h[1]\n"
                 ".inst 0x4f43f8de // bfdot v30.4s, v6.8h, v3.h[2]\n"
                 ".inst 0x4f63f8df // bfdot v31.4s, v6.8h, v3.h[3]\n"
-                "ldr q6, [%[b_ptr], #0x50]\n"
+                "ldr q6, [%[b_ptr], #-0x10]\n"
                 ".inst 0x4f40f088 // bfdot v8.4s, v4.8h, v0.h[0]\n"
-                "add %[b_ptr], %[b_ptr], #0x60\n"
                 ".inst 0x4f60f089 // bfdot v9.4s, v4.8h, v0.h[1]\n"
                 ".inst 0x4f40f88a // bfdot v10.4s, v4.8h, v0.h[2]\n"
-                "str q8, [%[c_ptr]]\n"
                 ".inst 0x4f60f88b // bfdot v11.4s, v4.8h, v0.h[3]\n"
+                "str q8, [%[c_ptr]]\n"
                 ".inst 0x4f41f094 // bfdot v20.4s, v4.8h, v1.h[0]\n"
                 ".inst 0x4f61f095 // bfdot v21.4s, v4.8h, v1.h[1]\n"
                 ".inst 0x4f41f896 // bfdot v22.4s, v4.8h, v1.h[2]\n"
@@ -234,14 +234,17 @@ void a64_interleaved_bf16fp32_dot_12x8(const bfloat16 *Apanel, const bfloat16 *B
                 "b 4f\n"
                 "3:\n"
                 ".inst 0x4f40f088 // bfdot v8.4s, v4.8h, v0.h[0]\n"
+                "ldr q6, [%[b_ptr], #-0x10]\n"
                 ".inst 0x4f60f089 // bfdot v9.4s, v4.8h, v0.h[1]\n"
+                "ldr q3, [%[a_ptr], #-0x10]\n"
                 ".inst 0x4f40f88a // bfdot v10.4s, v4.8h, v0.h[2]\n"
+                "add %[b_ptr], %[b_ptr], #0x30\n"
                 ".inst 0x4f60f88b // bfdot v11.4s, v4.8h, v0.h[3]\n"
                 ".inst 0x4f41f094 // bfdot v20.4s, v4.8h, v1.h[0]\n"
                 ".inst 0x4f61f095 // bfdot v21.4s, v4.8h, v1.h[1]\n"
                 ".inst 0x4f41f896 // bfdot v22.4s, v4.8h, v1.h[2]\n"
                 ".inst 0x4f61f897 // bfdot v23.4s, v4.8h, v1.h[3]\n"
-                "ldr q4, [%[b_ptr]]\n"
+                "ldr q4, [%[b_ptr], #-0x30]\n"
                 ".inst 0x4f40f0ac // bfdot v12.4s, v5.8h, v0.h[0]\n"
                 ".inst 0x4f60f0ad // bfdot v13.4s, v5.8h, v0.h[1]\n"
                 ".inst 0x4f40f8ae // bfdot v14.4s, v5.8h, v0.h[2]\n"
@@ -250,7 +253,7 @@ void a64_interleaved_bf16fp32_dot_12x8(const bfloat16 *Apanel, const bfloat16 *B
                 ".inst 0x4f61f0b9 // bfdot v25.4s, v5.8h, v1.h[1]\n"
                 ".inst 0x4f41f8ba // bfdot v26.4s, v5.8h, v1.h[2]\n"
                 ".inst 0x4f61f8bb // bfdot v27.4s, v5.8h, v1.h[3]\n"
-                "ldr q5, [%[b_ptr], #0x10]\n"
+                "ldr q5, [%[b_ptr], #-0x20]\n"
                 ".inst 0x4f40f0d0 // bfdot v16.4s, v6.8h, v0.h[0]\n"
                 ".inst 0x4f60f0d1 // bfdot v17.4s, v6.8h, v0.h[1]\n"
                 ".inst 0x4f40f8d2 // bfdot v18.4s, v6.8h, v0.h[2]\n"
@@ -259,13 +262,12 @@ void a64_interleaved_bf16fp32_dot_12x8(const bfloat16 *Apanel, const bfloat16 *B
                 ".inst 0x4f61f0dd // bfdot v29.4s, v6.8h, v1.h[1]\n"
                 ".inst 0x4f41f8de // bfdot v30.4s, v6.8h, v1.h[2]\n"
                 ".inst 0x4f61f8df // bfdot v31.4s, v6.8h, v1.h[3]\n"
-                "ldr q6, [%[b_ptr], #0x20]\n"
+                "ldr q6, [%[b_ptr], #-0x10]\n"
                 ".inst 0x4f42f088 // bfdot v8.4s, v4.8h, v2.h[0]\n"
-                "add %[b_ptr], %[b_ptr], #0x30\n"
                 ".inst 0x4f62f089 // bfdot v9.4s, v4.8h, v2.h[1]\n"
                 ".inst 0x4f42f88a // bfdot v10.4s, v4.8h, v2.h[2]\n"
-                "str q8, [%[c_ptr]]\n"
                 ".inst 0x4f62f88b // bfdot v11.4s, v4.8h, v2.h[3]\n"
+                "str q8, [%[c_ptr]]\n"
                 ".inst 0x4f43f094 // bfdot v20.4s, v4.8h, v3.h[0]\n"
                 ".inst 0x4f63f095 // bfdot v21.4s, v4.8h, v3.h[1]\n"
                 ".inst 0x4f43f896 // bfdot v22.4s, v4.8h, v3.h[2]\n"