path: root/src/core/NEON/kernels/arm_gemm/merges/a64_merge_fp16_24x8.hpp
author     Georgios Pinitas <georgios.pinitas@arm.com>  2020-01-22 18:36:27 +0000
committer  Georgios Pinitas <georgios.pinitas@arm.com>  2020-01-31 11:36:14 +0000
commit     94672fb2af6535adc6ea7fe8b8498580ad8cf3f4 (patch)
tree       189c68e7ff2c29d7800e09239da6d9a6364d5b05 /src/core/NEON/kernels/arm_gemm/merges/a64_merge_fp16_24x8.hpp
parent     6a342648ae50beb8457871862f14fc9baef6b74f (diff)
download   ComputeLibrary-94672fb2af6535adc6ea7fe8b8498580ad8cf3f4.tar.gz
COMPMID-3003: Integrate assembly kernels utilizing MMLA instruction.
MMLA is a matrix-multiply instruction introduced in armv8.6-A.

Signed-off-by: Georgios Pinitas <georgios.pinitas@arm.com>
Change-Id: I572a54981d48f5a1e0e9e51102cb7ae28ad87806
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/2663
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Michalis Spyrou <michalis.spyrou@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src/core/NEON/kernels/arm_gemm/merges/a64_merge_fp16_24x8.hpp')
-rw-r--r--  src/core/NEON/kernels/arm_gemm/merges/a64_merge_fp16_24x8.hpp | 45
1 file changed, 45 insertions, 0 deletions
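The change in this file is mechanical: every inline-assembly block in MergeResults<24, 8, false> gains an ".arch armv8.2-a+fp16" assembler directive, guarded so it is emitted only when the translation unit was not compiled with __ARM_FEATURE_FP16_VECTOR_ARITHMETIC defined. The directive raises the architecture the assembler will accept, so the FP16 vector instructions in these blocks assemble even when the compiler's own flags do not enable FP16 arithmetic. A minimal sketch of the same pattern follows; the helper name and operand choice are illustrative, not taken from the patch, and it assumes an AArch64 compiler where __fp16 is a built-in type.

// Hypothetical helper showing the guarded .arch directive pattern.
// Clamps eight half-precision lanes in place between minval and maxval.
static void clamp8_fp16(__fp16 *data, __fp16 minval, __fp16 maxval)
{
    __asm __volatile (
#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
        /* Without this directive the assembler rejects the FP16 vector
         * instructions below unless the whole file is built with
         * -march=armv8.2-a+fp16. */
        ".arch armv8.2-a+fp16\n"
#endif
        "ldr  q2, [%[data]]\n"            // load 8 __fp16 lanes
        "dup  v0.8h, %[maxval].h[0]\n"    // broadcast upper bound
        "dup  v1.8h, %[minval].h[0]\n"    // broadcast lower bound
        "fmin v2.8h, v2.8h, v0.8h\n"      // clamp from above
        "fmax v2.8h, v2.8h, v1.8h\n"      // clamp from below
        "str  q2, [%[data]]\n"
        :
        : [data] "r" (data), [minval] "w" (minval), [maxval] "w" (maxval)
        : "v0", "v1", "v2", "memory"
    );
}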
diff --git a/src/core/NEON/kernels/arm_gemm/merges/a64_merge_fp16_24x8.hpp b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_fp16_24x8.hpp
index 7bfab412ca..f82e7b4e47 100644
--- a/src/core/NEON/kernels/arm_gemm/merges/a64_merge_fp16_24x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_fp16_24x8.hpp
@@ -140,6 +140,9 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout,
} else {
/* Optimized routine to copy an entire block */
__asm __volatile (
+#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ ".arch armv8.2-a+fp16\n"
+#endif
"dup v0.8h, %[maxval].h[0]\n"
"ldr q2, [%[outptr0]]\n"
"dup v1.8h, %[minval].h[0]\n"
@@ -214,6 +217,9 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout,
} else {
/* Optimized routine to copy an entire block */
__asm __volatile (
+#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ ".arch armv8.2-a+fp16\n"
+#endif
"dup v0.8h, %[maxval].h[0]\n"
"ldr q2, [%[outptr0]]\n"
"dup v1.8h, %[minval].h[0]\n"
@@ -311,6 +317,9 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout,
} else {
/* Optimized routine to copy an entire block */
__asm __volatile (
+#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ ".arch armv8.2-a+fp16\n"
+#endif
"dup v0.8h, %[maxval].h[0]\n"
"ldr q2, [%[outptr0]]\n"
"dup v1.8h, %[minval].h[0]\n"
@@ -430,6 +439,9 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout,
} else {
/* Optimized routine to copy an entire block */
__asm __volatile (
+#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ ".arch armv8.2-a+fp16\n"
+#endif
"dup v0.8h, %[maxval].h[0]\n"
"ldr q2, [%[outptr0]]\n"
"dup v1.8h, %[minval].h[0]\n"
@@ -572,6 +584,9 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout,
} else {
/* Optimized routine to copy an entire block */
__asm __volatile (
+#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ ".arch armv8.2-a+fp16\n"
+#endif
"dup v0.8h, %[maxval].h[0]\n"
"ldr q2, [%[outptr0]]\n"
"dup v1.8h, %[minval].h[0]\n"
@@ -737,6 +752,9 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout,
} else {
/* Optimized routine to copy an entire block */
__asm __volatile (
+#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ ".arch armv8.2-a+fp16\n"
+#endif
"dup v0.8h, %[maxval].h[0]\n"
"ldr q2, [%[outptr0]]\n"
"dup v1.8h, %[minval].h[0]\n"
@@ -926,6 +944,9 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout,
} else {
/* Optimized routine to copy an entire block */
__asm __volatile (
+#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ ".arch armv8.2-a+fp16\n"
+#endif
"dup v0.8h, %[maxval].h[0]\n"
"ldr q2, [%[outptr0]]\n"
"dup v1.8h, %[minval].h[0]\n"
@@ -1133,6 +1154,9 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout,
} else {
/* Optimized routine to copy an entire block */
__asm __volatile (
+#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ ".arch armv8.2-a+fp16\n"
+#endif
"dup v0.8h, %[maxval].h[0]\n"
"ldr q2, [%[biasptr]]\n"
"dup v1.8h, %[minval].h[0]\n"
@@ -1184,6 +1208,9 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout,
} else {
/* Optimized routine to copy an entire block */
__asm __volatile (
+#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ ".arch armv8.2-a+fp16\n"
+#endif
"dup v0.8h, %[maxval].h[0]\n"
"ldr q2, [%[biasptr]]\n"
"dup v1.8h, %[minval].h[0]\n"
@@ -1255,6 +1282,9 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout,
} else {
/* Optimized routine to copy an entire block */
__asm __volatile (
+#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ ".arch armv8.2-a+fp16\n"
+#endif
"dup v0.8h, %[maxval].h[0]\n"
"ldr q2, [%[biasptr]]\n"
"dup v1.8h, %[minval].h[0]\n"
@@ -1346,6 +1376,9 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout,
} else {
/* Optimized routine to copy an entire block */
__asm __volatile (
+#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ ".arch armv8.2-a+fp16\n"
+#endif
"dup v0.8h, %[maxval].h[0]\n"
"ldr q2, [%[biasptr]]\n"
"dup v1.8h, %[minval].h[0]\n"
@@ -1456,6 +1489,9 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout,
} else {
/* Optimized routine to copy an entire block */
__asm __volatile (
+#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ ".arch armv8.2-a+fp16\n"
+#endif
"dup v0.8h, %[maxval].h[0]\n"
"ldr q2, [%[biasptr]]\n"
"dup v1.8h, %[minval].h[0]\n"
@@ -1586,6 +1622,9 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout,
} else {
/* Optimized routine to copy an entire block */
__asm __volatile (
+#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ ".arch armv8.2-a+fp16\n"
+#endif
"dup v0.8h, %[maxval].h[0]\n"
"ldr q2, [%[biasptr]]\n"
"dup v1.8h, %[minval].h[0]\n"
@@ -1736,6 +1775,9 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout,
} else {
/* Optimized routine to copy an entire block */
__asm __volatile (
+#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ ".arch armv8.2-a+fp16\n"
+#endif
"dup v0.8h, %[maxval].h[0]\n"
"ldr q2, [%[biasptr]]\n"
"dup v1.8h, %[minval].h[0]\n"
@@ -1907,6 +1949,9 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout,
} else {
/* Optimized routine to copy an entire block */
__asm __volatile (
+#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ ".arch armv8.2-a+fp16\n"
+#endif
"dup v0.8h, %[maxval].h[0]\n"
"ldr q2, [%[biasptr]]\n"
"dup v1.8h, %[minval].h[0]\n"