From 94672fb2af6535adc6ea7fe8b8498580ad8cf3f4 Mon Sep 17 00:00:00 2001 From: Georgios Pinitas Date: Wed, 22 Jan 2020 18:36:27 +0000 Subject: COMPMID-3003: Integrate assembly kernels utilizing MMLA instruction. MMLA is a matrix-multiply instruction introduced on armv8.6-A Signed-off-by: Georgios Pinitas Change-Id: I572a54981d48f5a1e0e9e51102cb7ae28ad87806 Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/2663 Tested-by: Arm Jenkins Reviewed-by: Michalis Spyrou Comments-Addressed: Arm Jenkins --- .../arm_gemm/merges/a64_merge_fp16_24x8.hpp | 45 ++++++++++++++++++++++ 1 file changed, 45 insertions(+) (limited to 'src/core/NEON/kernels/arm_gemm/merges/a64_merge_fp16_24x8.hpp') diff --git a/src/core/NEON/kernels/arm_gemm/merges/a64_merge_fp16_24x8.hpp b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_fp16_24x8.hpp index 7bfab412ca..f82e7b4e47 100644 --- a/src/core/NEON/kernels/arm_gemm/merges/a64_merge_fp16_24x8.hpp +++ b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_fp16_24x8.hpp @@ -140,6 +140,9 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout, } else { /* Optimized routine to copy an entire block */ __asm __volatile ( +#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + ".arch armv8.2-a+fp16\n" +#endif "dup v0.8h, %[maxval].h[0]\n" "ldr q2, [%[outptr0]]\n" "dup v1.8h, %[minval].h[0]\n" @@ -214,6 +217,9 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout, } else { /* Optimized routine to copy an entire block */ __asm __volatile ( +#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + ".arch armv8.2-a+fp16\n" +#endif "dup v0.8h, %[maxval].h[0]\n" "ldr q2, [%[outptr0]]\n" "dup v1.8h, %[minval].h[0]\n" @@ -311,6 +317,9 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout, } else { /* Optimized routine to copy an entire block */ __asm __volatile ( +#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + ".arch armv8.2-a+fp16\n" +#endif "dup v0.8h, %[maxval].h[0]\n" "ldr q2, [%[outptr0]]\n" "dup v1.8h, %[minval].h[0]\n" @@ -430,6 +439,9 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout, } else { /* Optimized routine to copy an entire block */ __asm __volatile ( +#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + ".arch armv8.2-a+fp16\n" +#endif "dup v0.8h, %[maxval].h[0]\n" "ldr q2, [%[outptr0]]\n" "dup v1.8h, %[minval].h[0]\n" @@ -572,6 +584,9 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout, } else { /* Optimized routine to copy an entire block */ __asm __volatile ( +#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + ".arch armv8.2-a+fp16\n" +#endif "dup v0.8h, %[maxval].h[0]\n" "ldr q2, [%[outptr0]]\n" "dup v1.8h, %[minval].h[0]\n" @@ -737,6 +752,9 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout, } else { /* Optimized routine to copy an entire block */ __asm __volatile ( +#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + ".arch armv8.2-a+fp16\n" +#endif "dup v0.8h, %[maxval].h[0]\n" "ldr q2, [%[outptr0]]\n" "dup v1.8h, %[minval].h[0]\n" @@ -926,6 +944,9 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout, } else { /* Optimized routine to copy an entire block */ __asm __volatile ( +#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + ".arch armv8.2-a+fp16\n" +#endif "dup v0.8h, %[maxval].h[0]\n" "ldr q2, [%[outptr0]]\n" "dup v1.8h, %[minval].h[0]\n" @@ -1133,6 +1154,9 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout, } else { /* Optimized routine to copy an entire block */ __asm __volatile ( +#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + ".arch armv8.2-a+fp16\n" +#endif "dup v0.8h, %[maxval].h[0]\n" "ldr q2, [%[biasptr]]\n" "dup v1.8h, %[minval].h[0]\n" @@ -1184,6 +1208,9 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout, } else { /* Optimized routine to copy an entire block */ __asm __volatile ( +#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + ".arch armv8.2-a+fp16\n" +#endif "dup v0.8h, %[maxval].h[0]\n" "ldr q2, [%[biasptr]]\n" "dup v1.8h, %[minval].h[0]\n" @@ -1255,6 +1282,9 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout, } else { /* Optimized routine to copy an entire block */ __asm __volatile ( +#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + ".arch armv8.2-a+fp16\n" +#endif "dup v0.8h, %[maxval].h[0]\n" "ldr q2, [%[biasptr]]\n" "dup v1.8h, %[minval].h[0]\n" @@ -1346,6 +1376,9 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout, } else { /* Optimized routine to copy an entire block */ __asm __volatile ( +#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + ".arch armv8.2-a+fp16\n" +#endif "dup v0.8h, %[maxval].h[0]\n" "ldr q2, [%[biasptr]]\n" "dup v1.8h, %[minval].h[0]\n" @@ -1456,6 +1489,9 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout, } else { /* Optimized routine to copy an entire block */ __asm __volatile ( +#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + ".arch armv8.2-a+fp16\n" +#endif "dup v0.8h, %[maxval].h[0]\n" "ldr q2, [%[biasptr]]\n" "dup v1.8h, %[minval].h[0]\n" @@ -1586,6 +1622,9 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout, } else { /* Optimized routine to copy an entire block */ __asm __volatile ( +#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + ".arch armv8.2-a+fp16\n" +#endif "dup v0.8h, %[maxval].h[0]\n" "ldr q2, [%[biasptr]]\n" "dup v1.8h, %[minval].h[0]\n" @@ -1736,6 +1775,9 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout, } else { /* Optimized routine to copy an entire block */ __asm __volatile ( +#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + ".arch armv8.2-a+fp16\n" +#endif "dup v0.8h, %[maxval].h[0]\n" "ldr q2, [%[biasptr]]\n" "dup v1.8h, %[minval].h[0]\n" @@ -1907,6 +1949,9 @@ void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout, } else { /* Optimized routine to copy an entire block */ __asm __volatile ( +#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + ".arch armv8.2-a+fp16\n" +#endif "dup v0.8h, %[maxval].h[0]\n" "ldr q2, [%[biasptr]]\n" "dup v1.8h, %[minval].h[0]\n" -- cgit v1.2.1