author     Pablo Tello <pablo.tello@arm.com>          2018-03-20 16:46:55 +0000
committer  Anthony Barbier <anthony.barbier@arm.com>  2018-11-02 16:49:16 +0000
commit     99ef8407cd5b27fdec6f8dfaf8b55f820b6dea71 (patch)
tree       7d7448ebc71d20c15611076375eb0cbb22f83f5a /src/core/NEON/kernels/arm_gemm/kernels
parent     2d9de0a3fa6ad858e70040124f362799a962bb6a (diff)
download   ComputeLibrary-99ef8407cd5b27fdec6f8dfaf8b55f820b6dea71.tar.gz
COMPMID-881: Updated arm_gemm to the latest version
Change-Id: Iba2664f33320e79bd15ca9c1399e65e4cc165be6
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/125265
Tested-by: Jenkins <bsgcomp@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Diffstat (limited to 'src/core/NEON/kernels/arm_gemm/kernels')
-rw-r--r-- src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8.hpp                 |   4
-rw-r--r-- src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/a55r1.cpp           |   6
-rw-r--r-- src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/generic.cpp         |   6
-rw-r--r-- src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_native_16x4/generic.cpp  | 147
-rw-r--r-- src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans/generic.cpp        | 239
5 files changed, 271 insertions(+), 131 deletions(-)
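
In short: the FP16 hgemm kernels (first three files) get a relaxed compile guard; the native 16x4 sgemm kernel (fourth file) gains tail code for K values that are not a multiple of 4; and the transposed sgemv kernel (fifth file) switches from a y += alpha*(A^T x) update to y = A^T x + beta*y, renaming its scalar parameter to match. Sketches of each change follow the corresponding file's diff below.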
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8.hpp
index 77ec59aa35..5fc0a7b707 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8.hpp
@@ -23,7 +23,7 @@
*/
#pragma once
-#if defined(__aarch64__) && defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC)
+#if defined(__aarch64__) && (defined(FP16_KERNELS) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC))
#include "arm_gemm.hpp"
@@ -71,4 +71,4 @@ public:
} // namespace arm_gemm
-#endif // __aarch64__
+#endif // __aarch64__ && (FP16_KERNELS || __ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/a55r1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/a55r1.cpp
index d59618dd54..2186117536 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/a55r1.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/a55r1.cpp
@@ -21,7 +21,9 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#if defined(__aarch64__) && defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC)
+
+// Build on AArch64 where either FP16_KERNELS is set or FP16 is explicitly supported.
+#if defined(__aarch64__) && (defined(FP16_KERNELS) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC))
#include <arm_neon.h>
@@ -357,4 +359,4 @@ void a64_hgemm_asimd_24x8_a55r1(const __fp16 *Apanel, const __fp16 *Bpanel, __fp
} // namespace arm_gemm
-#endif // __aarch64__ && __ARM_FEATURE_FP16_SCALAR_ARITHMETIC
+#endif // __aarch64__ && (FP16_KERNELS || __ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/generic.cpp
index 468d603484..65a5d43d1d 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/generic.cpp
@@ -21,7 +21,9 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#if defined(__aarch64__) && defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC)
+
+// Build on AArch64 where either FP16_KERNELS is set or FP16 is explicitly supported.
+#if defined(__aarch64__) && (defined(FP16_KERNELS) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC))
#include <arm_neon.h>
@@ -334,4 +336,4 @@ void a64_hgemm_asimd_24x8(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cp
} // namespace arm_gemm
-#endif // __aarch64__ && __ARM_FEATURE_FP16_SCALAR_ARITHMETIC
+#endif // __aarch64__ && (FP16_KERNELS || __ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
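
All three hgemm files above relax the same guard: the kernels previously required the compiler to predefine __ARM_FEATURE_FP16_SCALAR_ARITHMETIC; they now build either when the target advertises FP16 vector arithmetic, or when the build opts in explicitly with FP16_KERNELS. A minimal sketch of the new pattern, assuming a hypothetical kernel declaration (only the preprocessor condition comes from the patch):

// Hypothetical declaration illustrating the relaxed guard.
// FP16_KERNELS is a build-time opt-in (e.g. -DFP16_KERNELS);
// __ARM_FEATURE_FP16_VECTOR_ARITHMETIC is predefined by the compiler when
// the target has native FP16 vector arithmetic (e.g. -march=armv8.2-a+fp16).
// The old guard tested the *scalar* FP16 feature macro instead.
#if defined(__aarch64__) && (defined(FP16_KERNELS) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC))
void hypothetical_hgemm_kernel(const __fp16 *Apanel, const __fp16 *Bpanel,
                               __fp16 *Cpanel, int ablocks, int bblocks, int K);
#endif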
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_native_16x4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_native_16x4/generic.cpp
index 1b5787ce7c..8d4a38c36d 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_native_16x4/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_native_16x4/generic.cpp
@@ -31,8 +31,9 @@ namespace arm_gemm
{
void a64_sgemm_native_16x4(const float *A, int lda, const float *B, int ldb, float *C, int ldc, float beta, int M, int N, int K)
{
- int oddk = (K % 8) ? 1 : 0;
- int beta0 = (beta == 0.0f) ? 1 : 0;
+ const int oddk = ((K % 8) >= 4) ? 1 : 0;
+ const int beta0 = (beta == 0.0f) ? 1 : 0;
+ const int oddones = (K % 4);
/* For now, very naive with no blocking */
for(int y = 0; y < M; y += 4)
@@ -52,6 +53,7 @@ void a64_sgemm_native_16x4(const float *A, int lda, const float *B, int ldb, flo
float *c_ptr3 = c_ptr2 + ldc;
int loops = ((K + 4) / 8) - 1;
+ int odds = oddones;
size_t ldbb = ldb * sizeof(float);
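
The hunks above change the K decomposition: the old code treated any K % 8 remainder as one 4-wide tail pass (oddk), which only covers K that is a multiple of 4; the new code takes the 4-wide tail only when K % 8 >= 4 and mops up the last K % 4 elements with the new "odd ones" paths further down. A self-checking sketch of the split as I read it (all names descriptive, not from the source):

#include <cassert>

int main() {
    // For each K the kernel supports (it appears to assume K >= 4):
    // `loops` full 8-wide iterations, an 8-wide or 4-wide tail, then
    // 0..3 single "odd ones" iterations.
    for (int K = 4; K < 1024; K++) {
        const int loops   = ((K + 4) / 8) - 1;      // full 8-wide iterations
        const int oddk    = ((K % 8) >= 4) ? 1 : 0; // 4-wide tail instead of 8-wide
        const int oddones = K % 4;                  // single leftover elements
        assert(8 * loops + (oddk ? 4 : 8) + oddones == K);
    }
    return 0;
}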
@@ -434,14 +436,17 @@ void a64_sgemm_native_16x4(const float *A, int lda, const float *B, int ldb, flo
"ldr b0aq, [%[b_ptr]]\n"
"fmla v17.4s, b1a.4s, a0.s[1]\n"
+ "add %[a_ptr0], %[a_ptr0], #32\n"
"fmla v21.4s, b1a.4s, a1.s[1]\n"
- "subs %w[loops], %w[loops], #1\n"
+ "add %[a_ptr1], %[a_ptr1], #32\n"
"fmla v25.4s, b1a.4s, a2.s[1]\n"
+ "add %[a_ptr2], %[a_ptr2], #32\n"
"fmla v29.4s, b1a.4s, a3.s[1]\n"
"ldr b1aq, [%[b_ptr], #16]\n"
"fmla v18.4s, b2a.4s, a0.s[1]\n"
"fmla v22.4s, b2a.4s, a1.s[1]\n"
+ "add %[a_ptr3], %[a_ptr3], #32\n"
"fmla v26.4s, b2a.4s, a2.s[1]\n"
"fmla v30.4s, b2a.4s, a3.s[1]\n"
"ldr b2aq, [%[b_ptr], #32]\n"
@@ -488,7 +493,6 @@ void a64_sgemm_native_16x4(const float *A, int lda, const float *B, int ldb, flo
"fmla v17.4s, b1a.4s, a0.s[3]\n"
"fmla v21.4s, b1a.4s, a1.s[3]\n"
- "ldr a3aq, [%[a_ptr3], #16]\n"
"fmla v25.4s, b1a.4s, a2.s[3]\n"
"fmla v29.4s, b1a.4s, a3.s[3]\n"
"ldr b1aq, [%[b_ptr], #16]\n"
@@ -560,6 +564,7 @@ void a64_sgemm_native_16x4(const float *A, int lda, const float *B, int ldb, flo
// Unroll 6
"fmla v16.4s, bb0.4s, a0a.s[2]\n"
"fmla v20.4s, bb0.4s, a1a.s[2]\n"
+ "add %[b_ptr], %[b_ptr], %[ldb]\n"
"fmla v24.4s, bb0.4s, a2a.s[2]\n"
"fmla v28.4s, bb0.4s, a3a.s[2]\n"
@@ -583,6 +588,7 @@ void a64_sgemm_native_16x4(const float *A, int lda, const float *B, int ldb, flo
"fmla v17.4s, b1a.4s, a0a.s[3]\n"
"fmla v18.4s, b2a.4s, a0a.s[3]\n"
"fmla v19.4s, b3a.4s, a0a.s[3]\n"
+ "cbnz %w[odds], 6f\n"
"fmla v20.4s, b0a.4s, a1a.s[3]\n"
"str q16, [%[c_ptr0]]\n"
@@ -615,12 +621,16 @@ void a64_sgemm_native_16x4(const float *A, int lda, const float *B, int ldb, flo
// Odd K case: Just do 4 more.
"2:\n"
"fmla v21.4s, bb1.4s, a1.s[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #16\n"
"fmla v25.4s, bb1.4s, a2.s[0]\n"
+ "add %[a_ptr1], %[a_ptr1], #16\n"
"fmla v29.4s, bb1.4s, a3.s[0]\n"
"ldr b1q, [%[b_ptr], #16]\n"
"fmla v18.4s, bb2.4s, a0.s[0]\n"
+ "add %[a_ptr2], %[a_ptr2], #16\n"
"fmla v22.4s, bb2.4s, a1.s[0]\n"
+ "add %[a_ptr3], %[a_ptr3], #16\n"
"fmla v26.4s, bb2.4s, a2.s[0]\n"
"fmla v30.4s, bb2.4s, a3.s[0]\n"
"ldr b2q, [%[b_ptr], #32]\n"
@@ -641,7 +651,6 @@ void a64_sgemm_native_16x4(const float *A, int lda, const float *B, int ldb, flo
"fmla v17.4s, b1a.4s, a0.s[1]\n"
"fmla v21.4s, b1a.4s, a1.s[1]\n"
- "subs %w[loops], %w[loops], #1\n"
"fmla v25.4s, b1a.4s, a2.s[1]\n"
"fmla v29.4s, b1a.4s, a3.s[1]\n"
"ldr b1aq, [%[b_ptr], #16]\n"
@@ -660,6 +669,7 @@ void a64_sgemm_native_16x4(const float *A, int lda, const float *B, int ldb, flo
// Unroll 2
"fmla v16.4s, bb0.4s, a0.s[2]\n"
+ "add %[b_ptr], %[b_ptr], %[ldb]\n"
"fmla v20.4s, bb0.4s, a1.s[2]\n"
"fmla v24.4s, bb0.4s, a2.s[2]\n"
"fmla v28.4s, bb0.4s, a3.s[2]\n"
@@ -684,6 +694,7 @@ void a64_sgemm_native_16x4(const float *A, int lda, const float *B, int ldb, flo
"fmla v17.4s, b1a.4s, a0.s[3]\n"
"fmla v18.4s, b2a.4s, a0.s[3]\n"
"fmla v19.4s, b3a.4s, a0.s[3]\n"
+ "cbnz %w[odds], 7f\n"
"fmla v20.4s, b0a.4s, a1.s[3]\n"
"str q16, [%[c_ptr0]]\n"
@@ -711,6 +722,130 @@ void a64_sgemm_native_16x4(const float *A, int lda, const float *B, int ldb, flo
"str q26, [%[c_ptr2], #32]\n"
"fmla v31.4s, b3a.4s, a3.s[3]\n"
"str q27, [%[c_ptr2], #48]\n"
+ "b 3f\n"
+
+ // "Odd ones" - lead in from even
+ "6:\n"
+ "fmla v20.4s, b0a.4s, a1a.s[3]\n"
+ "fmla v21.4s, b1a.4s, a1a.s[3]\n"
+ "ldr b0q, [%[b_ptr]]\n"
+ "fmla v22.4s, b2a.4s, a1a.s[3]\n"
+ "subs %w[odds], %w[odds], #1\n"
+ "fmla v23.4s, b3a.4s, a1a.s[3]\n"
+ "ldr b1q, [%[b_ptr], #16]\n"
+
+ "fmla v24.4s, b0a.4s, a2a.s[3]\n"
+ "fmla v25.4s, b1a.4s, a2a.s[3]\n"
+ "ldr b2q, [%[b_ptr], #32]\n"
+ "fmla v26.4s, b2a.4s, a2a.s[3]\n"
+ "fmla v27.4s, b3a.4s, a2a.s[3]\n"
+ "ldr b3q, [%[b_ptr], #48]\n"
+
+ "fmla v28.4s, b0a.4s, a3a.s[3]\n"
+ "ld1r {a0.4s}, [%[a_ptr0]], #4\n"
+ "fmla v29.4s, b1a.4s, a3a.s[3]\n"
+ "fmla v30.4s, b2a.4s, a3a.s[3]\n"
+ "ld1r {a1.4s}, [%[a_ptr1]], #4\n"
+ "fmla v31.4s, b3a.4s, a3a.s[3]\n"
+
+ "fmla v16.4s, bb0.4s, a0.4s\n"
+ "beq 9f\n"
+ "b 8f\n"
+
+ // "Odd ones" - lead in from odd
+ "7:\n"
+ "fmla v20.4s, b0a.4s, a1.s[3]\n"
+ "subs %w[odds], %w[odds], #1\n"
+ "fmla v21.4s, b1a.4s, a1.s[3]\n"
+ "ldr b0q, [%[b_ptr]]\n"
+ "fmla v22.4s, b2a.4s, a1.s[3]\n"
+ "fmla v23.4s, b3a.4s, a1.s[3]\n"
+ "ldr b1q, [%[b_ptr], #16]\n"
+
+ "fmla v24.4s, b0a.4s, a2.s[3]\n"
+ "fmla v25.4s, b1a.4s, a2.s[3]\n"
+ "ldr b2q, [%[b_ptr], #32]\n"
+ "fmla v26.4s, b2a.4s, a2.s[3]\n"
+ "fmla v27.4s, b3a.4s, a2.s[3]\n"
+ "ldr b3q, [%[b_ptr], #48]\n"
+
+ "fmla v28.4s, b0a.4s, a3.s[3]\n"
+ "ld1r {a0.4s}, [%[a_ptr0]], #4\n"
+ "fmla v29.4s, b1a.4s, a3.s[3]\n"
+ "fmla v30.4s, b2a.4s, a3.s[3]\n"
+ "ld1r {a1.4s}, [%[a_ptr1]], #4\n"
+ "fmla v31.4s, b3a.4s, a3.s[3]\n"
+
+ "fmla v16.4s, bb0.4s, a0.4s\n"
+ "beq 9f\n"
+
+ // "Odd ones" - loop
+ "8:\n"
+ "fmla v17.4s, bb1.4s, a0.4s\n"
+ "ld1r {a2.4s}, [%[a_ptr2]], #4\n"
+ "fmla v18.4s, bb2.4s, a0.4s\n"
+ "add %[b_ptr], %[b_ptr], %[ldb]\n"
+ "fmla v19.4s, bb3.4s, a0.4s\n"
+ "ld1r {a3.4s}, [%[a_ptr3]], #4\n"
+
+ "fmla v20.4s, bb0.4s, a1.4s\n"
+ "subs %w[odds], %w[odds], #1\n"
+ "fmla v21.4s, bb1.4s, a1.4s\n"
+ "ld1r {a0.4s}, [%[a_ptr0]], #4\n"
+ "fmla v22.4s, bb2.4s, a1.4s\n"
+ "fmla v23.4s, bb3.4s, a1.4s\n"
+ "ld1r {a1.4s}, [%[a_ptr1]], #4\n"
+
+ "fmla v24.4s, bb0.4s, a2.4s\n"
+ "fmla v28.4s, bb0.4s, a3.4s\n"
+ "ldr b0q, [%[b_ptr]]\n"
+ "fmla v25.4s, bb1.4s, a2.4s\n"
+ "fmla v29.4s, bb1.4s, a3.4s\n"
+ "ldr b1q, [%[b_ptr], #16]\n"
+
+ "fmla v26.4s, bb2.4s, a2.4s\n"
+ "fmla v30.4s, bb2.4s, a3.4s\n"
+ "ldr b2q, [%[b_ptr], #32]\n"
+ "fmla v27.4s, bb3.4s, a2.4s\n"
+ "fmla v31.4s, bb3.4s, a3.4s\n"
+ "ldr b3q, [%[b_ptr], #48]\n"
+ "fmla v16.4s, bb0.4s, a0.4s\n"
+ "bne 8b\n"
+
+ // "Odd ones" - detached final iteration
+ "9:\n"
+ "fmla v17.4s, bb1.4s, a0.4s\n"
+ "ld1r {a2.4s}, [%[a_ptr2]], #4\n"
+ "fmla v18.4s, bb2.4s, a0.4s\n"
+ "fmla v19.4s, bb3.4s, a0.4s\n"
+ "ld1r {a3.4s}, [%[a_ptr3]], #4\n"
+
+ "fmla v20.4s, bb0.4s, a1.4s\n"
+ "str q16, [%[c_ptr0]]\n"
+ "fmla v21.4s, bb1.4s, a1.4s\n"
+ "str q17, [%[c_ptr0], #16]\n"
+ "fmla v22.4s, bb2.4s, a1.4s\n"
+ "str q18, [%[c_ptr0], #32]\n"
+ "fmla v23.4s, bb3.4s, a1.4s\n"
+ "str q19, [%[c_ptr0], #48]\n"
+
+ "fmla v24.4s, bb0.4s, a2.4s\n"
+ "str q20, [%[c_ptr1]]\n"
+ "fmla v25.4s, bb1.4s, a2.4s\n"
+ "str q21, [%[c_ptr1], #16]\n"
+ "fmla v26.4s, bb2.4s, a2.4s\n"
+ "str q22, [%[c_ptr1], #32]\n"
+ "fmla v27.4s, bb3.4s, a2.4s\n"
+ "str q23, [%[c_ptr1], #48]\n"
+
+ "fmla v28.4s, bb0.4s, a3.4s\n"
+ "str q24, [%[c_ptr2]]\n"
+ "fmla v29.4s, bb1.4s, a3.4s\n"
+ "str q25, [%[c_ptr2], #16]\n"
+ "fmla v30.4s, bb2.4s, a3.4s\n"
+ "str q26, [%[c_ptr2], #32]\n"
+ "fmla v31.4s, bb3.4s, a3.4s\n"
+ "str q27, [%[c_ptr2], #48]\n"
"3:\n"
"str q28, [%[c_ptr3]]\n"
@@ -719,7 +854,7 @@ void a64_sgemm_native_16x4(const float *A, int lda, const float *B, int ldb, flo
"str q31, [%[c_ptr3], #48]\n"
: [a_ptr0] "+r"(a_ptr0), [a_ptr1] "+r"(a_ptr1), [a_ptr2] "+r"(a_ptr2), [a_ptr3] "+r"(a_ptr3),
- [b_ptr] "+r"(b_ptr), [loops] "+r"(loops)
+ [b_ptr] "+r"(b_ptr), [loops] "+r"(loops), [odds] "+r"(odds)
: [ldb] "r"(ldbb), [oddk] "r"(oddk), [beta0] "r"(beta0), [betaptr] "r"(&beta),
[c_ptr0] "r"(c_ptr0), [c_ptr1] "r"(c_ptr1), [c_ptr2] "r"(c_ptr2), [c_ptr3] "r"(c_ptr3)
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15",
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans/generic.cpp
index 3309baff3a..8fa403bf02 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans/generic.cpp
@@ -50,12 +50,12 @@
namespace arm_gemm
{
-void a64_sgemv_trans(const float *Astart, const float *Xstart, float *Ystart, float alpha, int lda, int M, int N)
+void a64_sgemv_trans(const float *Astart, const float *Xstart, float *Ystart, float beta, int lda, int M, int N)
{
const float *a_ptr_base = Astart;
float *y_ptr = Ystart;
- register const float32x4_t va asm("v1") = vdupq_n_f32(alpha);
+ register const float32x4_t vb asm("v1") = vdupq_n_f32(beta);
int firstpfd = FIRST_PFD;
if(firstpfd > M)
@@ -344,6 +344,7 @@ void a64_sgemv_trans(const float *Astart, const float *Xstart, float *Ystart, fl
"fmla v17.4s, v5.4s, v0.4s\n"
"ldr q5, [%[a_ptr], #0xf0]\n"
"fmla v18.4s, v6.4s, v0.4s\n"
+
"ldr q6, [%[a_ptr], #0x100]\n"
"fmla v19.4s, v7.4s, v0.4s\n"
"ldr q7, [%[a_ptr], #0x110]\n"
@@ -372,75 +373,75 @@ void a64_sgemv_trans(const float *Astart, const float *Xstart, float *Ystart, fl
"fmla v31.4s, v7.4s, v0.4s\n"
"ldr q7, [%[y_ptr], #0x50]\n"
- "fmla v2.4s, v8.4s, %[va].4s\n"
- "ldr q8, [%[y_ptr], #0x60]\n"
- "fmla v3.4s, v9.4s, %[va].4s\n"
- "ldr q9, [%[y_ptr], #0x70]\n"
- "fmla v4.4s, v10.4s, %[va].4s\n"
- "ldr q10, [%[y_ptr], #0x80]\n"
- "fmla v5.4s, v11.4s, %[va].4s\n"
- "ldr q11, [%[y_ptr], #0x90]\n"
- "fmla v6.4s, v12.4s, %[va].4s\n"
- "ldr q12, [%[y_ptr], #0xa0]\n"
- "str q2, [%[y_ptr], #0x00]\n"
- "fmla v7.4s, v13.4s, %[va].4s\n"
- "ldr q13, [%[y_ptr], #0xb0]\n"
- "str q3, [%[y_ptr], #0x10]\n"
- "fmla v8.4s, v14.4s, %[va].4s\n"
- "ldr q14, [%[y_ptr], #0xc0]\n"
- "str q4, [%[y_ptr], #0x20]\n"
- "fmla v9.4s, v15.4s, %[va].4s\n"
- "ldr q15, [%[y_ptr], #0xd0]\n"
- "str q5, [%[y_ptr], #0x30]\n"
- "fmla v10.4s, v16.4s, %[va].4s\n"
- "ldr q16, [%[y_ptr], #0xe0]\n"
- "str q6, [%[y_ptr], #0x40]\n"
- "fmla v11.4s, v17.4s, %[va].4s\n"
- "ldr q17, [%[y_ptr], #0xf0]\n"
- "str q7, [%[y_ptr], #0x50]\n"
- "fmla v12.4s, v18.4s, %[va].4s\n"
- "ldr q18, [%[y_ptr], #0x100]\n"
- "str q8, [%[y_ptr], #0x60]\n"
- "fmla v13.4s, v19.4s, %[va].4s\n"
- "ldr q19, [%[y_ptr], #0x110]\n"
- "str q9, [%[y_ptr], #0x70]\n"
- "fmla v14.4s, v20.4s, %[va].4s\n"
- "ldr q20, [%[y_ptr], #0x120]\n"
- "str q10, [%[y_ptr], #0x80]\n"
- "fmla v15.4s, v21.4s, %[va].4s\n"
- "ldr q21, [%[y_ptr], #0x130]\n"
- "str q11, [%[y_ptr], #0x90]\n"
- "fmla v16.4s, v22.4s, %[va].4s\n"
- "ldr q22, [%[y_ptr], #0x140]\n"
- "str q12, [%[y_ptr], #0xa0]\n"
- "fmla v17.4s, v23.4s, %[va].4s\n"
- "ldr q23, [%[y_ptr], #0x150]\n"
- "str q13, [%[y_ptr], #0xb0]\n"
- "fmla v18.4s, v24.4s, %[va].4s\n"
- "ldr q24, [%[y_ptr], #0x160]\n"
- "str q14, [%[y_ptr], #0xc0]\n"
- "fmla v19.4s, v25.4s, %[va].4s\n"
- "ldr q25, [%[y_ptr], #0x170]\n"
- "str q15, [%[y_ptr], #0xd0]\n"
- "fmla v20.4s, v26.4s, %[va].4s\n"
- "str q16, [%[y_ptr], #0xe0]\n"
- "fmla v21.4s, v27.4s, %[va].4s\n"
- "str q17, [%[y_ptr], #0xf0]\n"
- "fmla v22.4s, v28.4s, %[va].4s\n"
- "str q18, [%[y_ptr], #0x100]\n"
- "fmla v23.4s, v29.4s, %[va].4s\n"
- "str q19, [%[y_ptr], #0x110]\n"
- "fmla v24.4s, v30.4s, %[va].4s\n"
- "str q20, [%[y_ptr], #0x120]\n"
- "fmla v25.4s, v31.4s, %[va].4s\n"
- "str q21, [%[y_ptr], #0x130]\n"
-
- "stp q22, q23, [%[y_ptr], #0x140]\n"
- "stp q24, q25, [%[y_ptr], #0x160]\n"
+ "fmla v8.4s, v2.4s, %[vb].4s\n"
+ "ldr q2, [%[y_ptr], #0x60]\n"
+ "fmla v9.4s, v3.4s, %[vb].4s\n"
+ "ldr q3, [%[y_ptr], #0x70]\n"
+ "fmla v10.4s, v4.4s, %[vb].4s\n"
+ "ldr q4, [%[y_ptr], #0x80]\n"
+ "fmla v11.4s, v5.4s, %[vb].4s\n"
+ "ldr q5, [%[y_ptr], #0x90]\n"
+ "fmla v12.4s, v6.4s, %[vb].4s\n"
+ "ldr q6, [%[y_ptr], #0xa0]\n"
+ "str q8, [%[y_ptr], #0x00]\n"
+ "fmla v13.4s, v7.4s, %[vb].4s\n"
+ "ldr q7, [%[y_ptr], #0xb0]\n"
+ "str q9, [%[y_ptr], #0x10]\n"
+ "fmla v14.4s, v2.4s, %[vb].4s\n"
+ "ldr q2, [%[y_ptr], #0xc0]\n"
+ "str q10, [%[y_ptr], #0x20]\n"
+ "fmla v15.4s, v3.4s, %[vb].4s\n"
+ "ldr q3, [%[y_ptr], #0xd0]\n"
+ "str q11, [%[y_ptr], #0x30]\n"
+ "fmla v16.4s, v4.4s, %[vb].4s\n"
+ "ldr q4, [%[y_ptr], #0xe0]\n"
+ "str q12, [%[y_ptr], #0x40]\n"
+ "fmla v17.4s, v5.4s, %[vb].4s\n"
+ "ldr q5, [%[y_ptr], #0xf0]\n"
+ "str q13, [%[y_ptr], #0x50]\n"
+ "fmla v18.4s, v6.4s, %[vb].4s\n"
+ "ldr q6, [%[y_ptr], #0x100]\n"
+ "str q14, [%[y_ptr], #0x60]\n"
+ "fmla v19.4s, v7.4s, %[vb].4s\n"
+ "ldr q7, [%[y_ptr], #0x110]\n"
+ "str q15, [%[y_ptr], #0x70]\n"
+ "fmla v20.4s, v2.4s, %[vb].4s\n"
+ "ldr q2, [%[y_ptr], #0x120]\n"
+ "str q16, [%[y_ptr], #0x80]\n"
+ "fmla v21.4s, v3.4s, %[vb].4s\n"
+ "ldr q3, [%[y_ptr], #0x130]\n"
+ "str q17, [%[y_ptr], #0x90]\n"
+ "fmla v22.4s, v4.4s, %[vb].4s\n"
+ "ldr q4, [%[y_ptr], #0x140]\n"
+ "str q18, [%[y_ptr], #0xa0]\n"
+ "fmla v23.4s, v5.4s, %[vb].4s\n"
+ "ldr q5, [%[y_ptr], #0x150]\n"
+ "str q19, [%[y_ptr], #0xb0]\n"
+ "fmla v24.4s, v6.4s, %[vb].4s\n"
+ "ldr q6, [%[y_ptr], #0x160]\n"
+ "str q20, [%[y_ptr], #0xc0]\n"
+ "fmla v25.4s, v7.4s, %[vb].4s\n"
+ "ldr q7, [%[y_ptr], #0x170]\n"
+ "str q21, [%[y_ptr], #0xd0]\n"
+ "fmla v26.4s, v2.4s, %[vb].4s\n"
+ "str q22, [%[y_ptr], #0xe0]\n"
+ "fmla v27.4s, v3.4s, %[vb].4s\n"
+ "str q23, [%[y_ptr], #0xf0]\n"
+ "fmla v28.4s, v4.4s, %[vb].4s\n"
+ "str q24, [%[y_ptr], #0x100]\n"
+ "fmla v29.4s, v5.4s, %[vb].4s\n"
+ "str q25, [%[y_ptr], #0x110]\n"
+ "fmla v30.4s, v6.4s, %[vb].4s\n"
+ "str q26, [%[y_ptr], #0x120]\n"
+ "fmla v31.4s, v7.4s, %[vb].4s\n"
+ "str q27, [%[y_ptr], #0x130]\n"
+
+ "stp q28, q29, [%[y_ptr], #0x140]\n"
+ "stp q30, q31, [%[y_ptr], #0x160]\n"
"add %[y_ptr], %[y_ptr], #0x180\n"
: [a_ptr] "+r"(a_ptr), [x_ptr] "+r"(x_ptr), [y_ptr] "+r"(y_ptr), [k] "+r"(k), [pf_ptr] "+r"(pf_ptr), [firstpf_ptr] "+r"(firstpf_ptr)
- : [jump] "r"(jump), [va] "w"(va), [pf_limit] "r"(pf_limit)
+ : [jump] "r"(jump), [vb] "w"(vb), [pf_limit] "r"(pf_limit)
: "w0", "v0", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13",
"v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
"v27", "v28", "v29", "v30", "v31", "cc");
@@ -747,161 +748,161 @@ void a64_sgemv_trans(const float *Astart, const float *Xstart, float *Ystart, fl
// Vector 0
"subs %w[vecs], %w[vecs], #1\n"
"ldr q7, [%[y_ptr]]\n"
- "fmla v7.4s, v8.4s, %[va].4s\n"
- "str q7, [%[y_ptr]], #0x10\n"
+ "fmla v8.4s, v7.4s, %[vb].4s\n"
+ "str q8, [%[y_ptr]], #0x10\n"
"beq 12f\n"
// Vector 1
"subs %w[vecs], %w[vecs], #1\n"
"ldr q7, [%[y_ptr]]\n"
- "fmla v7.4s, v9.4s, %[va].4s\n"
- "str q7, [%[y_ptr]], #0x10\n"
+ "fmla v9.4s, v7.4s, %[vb].4s\n"
+ "str q9, [%[y_ptr]], #0x10\n"
"beq 12f\n"
// Vector 2
"subs %w[vecs], %w[vecs], #1\n"
"ldr q7, [%[y_ptr]]\n"
- "fmla v7.4s, v10.4s, %[va].4s\n"
- "str q7, [%[y_ptr]], #0x10\n"
+ "fmla v10.4s, v7.4s, %[vb].4s\n"
+ "str q10, [%[y_ptr]], #0x10\n"
"beq 12f\n"
// Vector 3
"subs %w[vecs], %w[vecs], #1\n"
"ldr q7, [%[y_ptr]]\n"
- "fmla v7.4s, v11.4s, %[va].4s\n"
- "str q7, [%[y_ptr]], #0x10\n"
+ "fmla v11.4s, v7.4s, %[vb].4s\n"
+ "str q11, [%[y_ptr]], #0x10\n"
"beq 12f\n"
// Vector 4
"subs %w[vecs], %w[vecs], #1\n"
"ldr q7, [%[y_ptr]]\n"
- "fmla v7.4s, v12.4s, %[va].4s\n"
- "str q7, [%[y_ptr]], #0x10\n"
+ "fmla v12.4s, v7.4s, %[vb].4s\n"
+ "str q12, [%[y_ptr]], #0x10\n"
"beq 12f\n"
// Vector 5
"subs %w[vecs], %w[vecs], #1\n"
"ldr q7, [%[y_ptr]]\n"
- "fmla v7.4s, v13.4s, %[va].4s\n"
- "str q7, [%[y_ptr]], #0x10\n"
+ "fmla v13.4s, v7.4s, %[vb].4s\n"
+ "str q13, [%[y_ptr]], #0x10\n"
"beq 12f\n"
// Vector 6
"subs %w[vecs], %w[vecs], #1\n"
"ldr q7, [%[y_ptr]]\n"
- "fmla v7.4s, v14.4s, %[va].4s\n"
- "str q7, [%[y_ptr]], #0x10\n"
+ "fmla v14.4s, v7.4s, %[vb].4s\n"
+ "str q14, [%[y_ptr]], #0x10\n"
"beq 12f\n"
// Vector 7
"subs %w[vecs], %w[vecs], #1\n"
"ldr q7, [%[y_ptr]]\n"
- "fmla v7.4s, v15.4s, %[va].4s\n"
- "str q7, [%[y_ptr]], #0x10\n"
+ "fmla v15.4s, v7.4s, %[vb].4s\n"
+ "str q15, [%[y_ptr]], #0x10\n"
"beq 12f\n"
// Vector 8
"subs %w[vecs], %w[vecs], #1\n"
"ldr q7, [%[y_ptr]]\n"
- "fmla v7.4s, v16.4s, %[va].4s\n"
- "str q7, [%[y_ptr]], #0x10\n"
+ "fmla v16.4s, v7.4s, %[vb].4s\n"
+ "str q16, [%[y_ptr]], #0x10\n"
"beq 12f\n"
// Vector 9
"subs %w[vecs], %w[vecs], #1\n"
"ldr q7, [%[y_ptr]]\n"
- "fmla v7.4s, v17.4s, %[va].4s\n"
- "str q7, [%[y_ptr]], #0x10\n"
+ "fmla v17.4s, v7.4s, %[vb].4s\n"
+ "str q17, [%[y_ptr]], #0x10\n"
"beq 12f\n"
// Vector 10
"subs %w[vecs], %w[vecs], #1\n"
"ldr q7, [%[y_ptr]]\n"
- "fmla v7.4s, v18.4s, %[va].4s\n"
- "str q7, [%[y_ptr]], #0x10\n"
+ "fmla v18.4s, v7.4s, %[vb].4s\n"
+ "str q18, [%[y_ptr]], #0x10\n"
"beq 12f\n"
// Vector 11
"subs %w[vecs], %w[vecs], #1\n"
"ldr q7, [%[y_ptr]]\n"
- "fmla v7.4s, v19.4s, %[va].4s\n"
- "str q7, [%[y_ptr]], #0x10\n"
+ "fmla v19.4s, v7.4s, %[vb].4s\n"
+ "str q19, [%[y_ptr]], #0x10\n"
"beq 12f\n"
// Vector 12
"subs %w[vecs], %w[vecs], #1\n"
"ldr q7, [%[y_ptr]]\n"
- "fmla v7.4s, v20.4s, %[va].4s\n"
- "str q7, [%[y_ptr]], #0x10\n"
+ "fmla v20.4s, v7.4s, %[vb].4s\n"
+ "str q20, [%[y_ptr]], #0x10\n"
"beq 12f\n"
// Vector 13
"subs %w[vecs], %w[vecs], #1\n"
"ldr q7, [%[y_ptr]]\n"
- "fmla v7.4s, v21.4s, %[va].4s\n"
- "str q7, [%[y_ptr]], #0x10\n"
+ "fmla v21.4s, v7.4s, %[vb].4s\n"
+ "str q21, [%[y_ptr]], #0x10\n"
"beq 12f\n"
// Vector 14
"subs %w[vecs], %w[vecs], #1\n"
"ldr q7, [%[y_ptr]]\n"
- "fmla v7.4s, v22.4s, %[va].4s\n"
- "str q7, [%[y_ptr]], #0x10\n"
+ "fmla v22.4s, v7.4s, %[vb].4s\n"
+ "str q22, [%[y_ptr]], #0x10\n"
"beq 12f\n"
// Vector 15
"subs %w[vecs], %w[vecs], #1\n"
"ldr q7, [%[y_ptr]]\n"
- "fmla v7.4s, v23.4s, %[va].4s\n"
- "str q7, [%[y_ptr]], #0x10\n"
+ "fmla v23.4s, v7.4s, %[vb].4s\n"
+ "str q23, [%[y_ptr]], #0x10\n"
"beq 12f\n"
// Vector 16
"subs %w[vecs], %w[vecs], #1\n"
"ldr q7, [%[y_ptr]]\n"
- "fmla v7.4s, v24.4s, %[va].4s\n"
- "str q7, [%[y_ptr]], #0x10\n"
+ "fmla v24.4s, v7.4s, %[vb].4s\n"
+ "str q24, [%[y_ptr]], #0x10\n"
"beq 12f\n"
// Vector 17
"subs %w[vecs], %w[vecs], #1\n"
"ldr q7, [%[y_ptr]]\n"
- "fmla v7.4s, v25.4s, %[va].4s\n"
- "str q7, [%[y_ptr]], #0x10\n"
+ "fmla v25.4s, v7.4s, %[vb].4s\n"
+ "str q25, [%[y_ptr]], #0x10\n"
"beq 12f\n"
// Vector 18
"subs %w[vecs], %w[vecs], #1\n"
"ldr q7, [%[y_ptr]]\n"
- "fmla v7.4s, v26.4s, %[va].4s\n"
- "str q7, [%[y_ptr]], #0x10\n"
+ "fmla v26.4s, v7.4s, %[vb].4s\n"
+ "str q26, [%[y_ptr]], #0x10\n"
"beq 12f\n"
// Vector 19
"subs %w[vecs], %w[vecs], #1\n"
"ldr q7, [%[y_ptr]]\n"
- "fmla v7.4s, v27.4s, %[va].4s\n"
- "str q7, [%[y_ptr]], #0x10\n"
+ "fmla v27.4s, v7.4s, %[vb].4s\n"
+ "str q27, [%[y_ptr]], #0x10\n"
"beq 12f\n"
// Vector 20
"subs %w[vecs], %w[vecs], #1\n"
"ldr q7, [%[y_ptr]]\n"
- "fmla v7.4s, v28.4s, %[va].4s\n"
- "str q7, [%[y_ptr]], #0x10\n"
+ "fmla v28.4s, v7.4s, %[vb].4s\n"
+ "str q28, [%[y_ptr]], #0x10\n"
"beq 12f\n"
// Vector 21
"subs %w[vecs], %w[vecs], #1\n"
"ldr q7, [%[y_ptr]]\n"
- "fmla v7.4s, v29.4s, %[va].4s\n"
- "str q7, [%[y_ptr]], #0x10\n"
+ "fmla v29.4s, v7.4s, %[vb].4s\n"
+ "str q29, [%[y_ptr]], #0x10\n"
"beq 12f\n"
// Vector 22
"subs %w[vecs], %w[vecs], #1\n"
"ldr q7, [%[y_ptr]]\n"
- "fmla v7.4s, v30.4s, %[va].4s\n"
- "str q7, [%[y_ptr]], #0x10\n"
+ "fmla v30.4s, v7.4s, %[vb].4s\n"
+ "str q30, [%[y_ptr]], #0x10\n"
// Odd 2
"12:\n"
"cbz %[odd2_aptr], 13f\n"
"ldr d7, [%[y_ptr]]\n"
- "fmla v7.2s, v6.2s, %[va].2s\n"
- "str d7, [%[y_ptr]], #0x8\n"
+ "fmla v6.2s, v7.2s, %[vb].2s\n"
+ "str d6, [%[y_ptr]], #0x8\n"
// Odd 1
"13:\n"
"cbz %[odd1_aptr], 14f\n"
"ldr s7, [%[y_ptr]]\n"
- "fmla v7.2s, v5.2s, %[va].2s\n"
- "str s7, [%[y_ptr]]\n"
+ "fmla v5.2s, v7.2s, %[vb].2s\n"
+ "str s5, [%[y_ptr]]\n"
"14:\n"
: [a_ptr] "+r"(a_ptr), [x_ptr] "+r"(x_ptr), [y_ptr] "+r"(y_ptr), [k] "+r"(k),
[pf_ptr] "+r"(pf_ptr), [firstpf_ptr] "+r"(firstpf_ptr),
[odd1_aptr] "+r"(odd1_aptr), [odd2_aptr] "+r"(odd2_aptr),
[dopf] "+r"(dopf), [vecs] "+r"(vecs)
- : [jump] "r"(jump), [va] "w"(va), [pf_limit] "r"(pf_limit), [numvecs] "r"(numvecs)
+ : [jump] "r"(jump), [vb] "w"(vb), [pf_limit] "r"(pf_limit), [numvecs] "r"(numvecs)
: "w0", "v0", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13",
"v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
"v27", "v28", "v29", "v30", "v31", "cc");