aboutsummaryrefslogtreecommitdiff
path: root/src/core/NEON/kernels/arm_gemm
diff options
context:
space:
mode:
authorramelg01 <ramy.elgammal@arm.com>2021-12-08 02:00:16 +0000
committerSheri Zhang <sheri.zhang@arm.com>2021-12-14 10:53:54 +0000
commitb6e49ebb5b83edbf45513978c0c77a8871c6b36c (patch)
tree411724910a6db7ae8d3451b457796ea6ec4a86a2 /src/core/NEON/kernels/arm_gemm
parent6f3a9f5f4ef6ec7aa8e91df3c1f373d95931dd7b (diff)
downloadComputeLibrary-b6e49ebb5b83edbf45513978c0c77a8871c6b36c.tar.gz
Update A510 arm_gemm cpu Kernels
Resolves: COMPMID-4910 Signed-off-by: Ramy Elgammal <ramy.elgammal@arm.com> Change-Id: I79b4aa51e07ad1fe81d9218ed8a8f34f0ec5ab06 Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/6803 Tested-by: Arm Jenkins <bsgcomp@arm.com> Reviewed-by: Sheri Zhang <sheri.zhang@arm.com>
Diffstat (limited to 'src/core/NEON/kernels/arm_gemm')
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16.hpp24
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12.hpp12
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/a510.cpp275
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12.hpp13
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/a510.cpp275
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12.hpp12
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/a510.cpp275
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_mmla_6x4VL.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_mmla_6x4VL.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL.hpp2
25 files changed, 886 insertions, 36 deletions
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12.hpp
index 758f2b1f8c..9af1b4df12 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12.hpp
@@ -72,7 +72,7 @@ public:
return { 15.361, 0.9341, 0.1636 };
case CPUModel::V1:
- return { 62.40, 4.71, 0.67 };
+ return { 51.14, 7.38, 0.65 };
default:
return { 29.0698, 3.9793, 0.4003 };
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12.hpp
index 21c9f59661..6d333f3449 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12.hpp
@@ -80,7 +80,7 @@ public:
return { 15.361, 0.9341, 0.1636 };
case CPUModel::V1:
- return { 62.40, 4.71, 0.67 };
+ return { 51.14, 7.38, 0.65 };
default:
return { 29.0698, 3.9793, 0.4003 };
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24.hpp
index 090dd5855e..e6e7950979 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24.hpp
@@ -82,7 +82,7 @@ public:
case CPUModel::A510:
return { 6.81 };
case CPUModel::V1:
- return { 28.40 };
+ return { 22.33 };
}
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16.hpp
index f5e9009f6d..39ffcbef12 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16.hpp
@@ -82,7 +82,7 @@ public:
case CPUModel::A510:
return { 6.70 };
case CPUModel::V1:
- return { 26.64 };
+ return { 21.28 };
}
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16.hpp
index 94f5783686..905a60265c 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16.hpp
@@ -84,7 +84,7 @@ public:
case CPUModel::A510:
return { 14.81 };
case CPUModel::V1:
- return { 48.34 };
+ return { 44.54 };
}
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16.hpp
index bc933afd9a..69ea87bc9e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16.hpp
@@ -81,7 +81,7 @@ public:
case CPUModel::A510:
return { 27.99 };
case CPUModel::V1:
- return { 68.76 };
+ return { 62.26 };
}
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16.hpp
index c5105a6d4a..ce96c1b28f 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16.hpp
@@ -75,29 +75,29 @@ public:
template<typename T>
static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
- if (std::is_same<T, uint8_t>::value) {
+ if (std::is_same<T, uint32_t>::value) {
switch (ci->get_cpu_model()) {
- case CPUModel::A55r1:
- return { 9.5238, 2.0799, 0.2279 };
default:
- return { 29.6736, 11.4025, 0.5591 };
+ return { 31.63 };
case CPUModel::A510:
- return { 16.65, 3.92, 0.48 };
+ return { 15.89 };
case CPUModel::V1:
- return { 55.42, 19.29, 0.92 };
+ return { 53.87 };
+ case CPUModel::A55r1:
+ return { 9.217 };
}
}
- if (std::is_same<T, uint32_t>::value) {
+ if (std::is_same<T, uint8_t>::value) {
switch (ci->get_cpu_model()) {
- default:
- return { 31.63 };
case CPUModel::A55r1:
- return { 9.217 };
+ return { 9.5238, 2.0799, 0.2279 };
+ default:
+ return { 29.6736, 11.4025, 0.5591 };
case CPUModel::A510:
- return { 15.89 };
+ return { 16.65, 3.92, 0.48 };
case CPUModel::V1:
- return { 53.87 };
+ return { 42.62, 16.32, 0.83 };
}
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16.hpp
index 24bad3c63e..b5cedc7e98 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16.hpp
@@ -92,7 +92,7 @@ public:
case CPUModel::A510:
return { 33.64, 3.92, 0.48 };
case CPUModel::V1:
- return { 86.71, 19.00, 0.93 };
+ return { 63.94, 16.18, 0.83 };
}
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12.hpp
index 9b3517a802..6ec6bd2ed8 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12.hpp
@@ -36,6 +36,7 @@ namespace arm_gemm
{
// Actual kernel implementations
void a64_interleaved_bf16fp32_mmla_8x12( ARGLIST );
+void a64_interleaved_bf16fp32_mmla_8x12_a510( ARGLIST );
class cls_a64_interleaved_bf16fp32_mmla_8x12
{
@@ -78,7 +79,7 @@ public:
default:
return { 31.54, 4.30, 7.33 };
case CPUModel::V1:
- return { 59.94, 5.08, 9.83 };
+ return { 41.44, 5.01, 5.64 };
case CPUModel::A510:
return { 7.82, 4.05, 3.07 };
}
@@ -101,8 +102,15 @@ public:
// Default to the generic kernel
kern_type kernel=a64_interleaved_bf16fp32_mmla_8x12;
- cls_a64_interleaved_bf16fp32_mmla_8x12(const CPUInfo *)
+ cls_a64_interleaved_bf16fp32_mmla_8x12(const CPUInfo *ci)
{
+ switch(ci->get_cpu_model()) {
+ default:
+ break;
+ case CPUModel::A510:
+ kernel=a64_interleaved_bf16fp32_mmla_8x12_a510;
+ break;
+ }
}
};
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/a510.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/a510.cpp
new file mode 100644
index 0000000000..0235e91bfe
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/a510.cpp
@@ -0,0 +1,275 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <cstddef>
+#include "../../bfloat.hpp"
+
+namespace arm_gemm {
+
+void a64_interleaved_bf16fp32_mmla_8x12_a510(
+ const bfloat16 *Apanel, const bfloat16 *Bpanel,
+ float *Cpanel, int ablocks, int bblocks, int K) {
+
+ struct KernelArgs {
+ size_t bblocks = {};
+ size_t K = {};
+ const bfloat16 *Bpanel = {};
+ } ka;
+
+ ka.bblocks = bblocks;
+ ka.K = (K/4) - 1;
+ ka.Bpanel = Bpanel;
+
+ __asm__ __volatile__(
+
+ "1:" // Height loop
+ "ldr x22, [%x[args_ptr], %[offsetof_bblocks]]\n"
+ "mov x21, %x[Apanel]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n"
+ "2:" // Width loop
+ "mov %x[Apanel], x21\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_K]]\n"
+ "ld1 { v0.8h }, [%x[Apanel]], #0x10\n"
+ "ld1 { v4.8h }, [x20], #0x10\n"
+ "ld1 { v1.8h }, [%x[Apanel]], #0x10\n"
+ "cmp x19, #0x2\n"
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "ld1 { v5.8h }, [x20], #0x10\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "ld1 { v2.8h }, [%x[Apanel]], #0x10\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "movi v20.16b, #0x0\n"
+ "movi v21.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v26.16b, #0x0\n"
+ "movi v27.16b, #0x0\n"
+ "movi v28.16b, #0x0\n"
+ "movi v29.16b, #0x0\n"
+ "movi v30.16b, #0x0\n"
+ "movi v31.16b, #0x0\n"
+ "blt 4f\n"
+ "3:" // main loop head
+ "ld1 { v3.8h }, [%x[Apanel]], #0x10\n"
+ ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e44ec2e // bfmmla v14.4s, v1.8h, v4.8h\n"
+ "ldp q6, q7, [x20], #0x20\n"
+ ".inst 0x6e45ec0b // bfmmla v11.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e45ec31 // bfmmla v17.4s, v1.8h, v5.8h\n"
+ ".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n"
+ "sub x19, x19, #0x2\n"
+ ".inst 0x6e45ec57 // bfmmla v23.4s, v2.8h, v5.8h\n"
+ ".inst 0x6e44ec7a // bfmmla v26.4s, v3.8h, v4.8h\n"
+ "cmp x19, #0x2\n"
+ ".inst 0x6e45ec7d // bfmmla v29.4s, v3.8h, v5.8h\n"
+ "ldp q4, q5, [x20], #0x20\n"
+ ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e45ec0d // bfmmla v13.4s, v0.8h, v5.8h\n"
+ "ld1 { v0.8h }, [%x[Apanel]], #0x10\n"
+ ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e46ec7b // bfmmla v27.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e47ec32 // bfmmla v18.4s, v1.8h, v7.8h\n"
+ "ld1 { v6.8h }, [x20], #0x10\n"
+ ".inst 0x6e44ec30 // bfmmla v16.4s, v1.8h, v4.8h\n"
+ ".inst 0x6e45ec33 // bfmmla v19.4s, v1.8h, v5.8h\n"
+ "ld1 { v1.8h }, [%x[Apanel]], #0x10\n"
+ ".inst 0x6e47ec58 // bfmmla v24.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e47ec7e // bfmmla v30.4s, v3.8h, v7.8h\n"
+ "ld1 { v7.8h }, [x20], #0x10\n"
+ ".inst 0x6e44ec56 // bfmmla v22.4s, v2.8h, v4.8h\n"
+ ".inst 0x6e45ec59 // bfmmla v25.4s, v2.8h, v5.8h\n"
+ "ld1 { v2.8h }, [%x[Apanel]], #0x10\n"
+ ".inst 0x6e44ec7c // bfmmla v28.4s, v3.8h, v4.8h\n"
+ ".inst 0x6e45ec7f // bfmmla v31.4s, v3.8h, v5.8h\n"
+ "ld1 { v3.8h }, [%x[Apanel]], #0x10\n"
+ "ldp q4, q5, [x20], #0x20\n"
+ ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec31 // bfmmla v17.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e46ec7a // bfmmla v26.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e47ec7d // bfmmla v29.4s, v3.8h, v7.8h\n"
+ "ldp q6, q7, [x20], #0x20\n"
+ ".inst 0x6e44ec09 // bfmmla v9.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e45ec0c // bfmmla v12.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n"
+ "ld1 { v0.8h }, [%x[Apanel]], #0x10\n"
+ ".inst 0x6e44ec2f // bfmmla v15.4s, v1.8h, v4.8h\n"
+ ".inst 0x6e44ec55 // bfmmla v21.4s, v2.8h, v4.8h\n"
+ ".inst 0x6e44ec7b // bfmmla v27.4s, v3.8h, v4.8h\n"
+ ".inst 0x6e45ec32 // bfmmla v18.4s, v1.8h, v5.8h\n"
+ "ld1 { v4.8h }, [x20], #0x10\n"
+ ".inst 0x6e46ec30 // bfmmla v16.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e47ec33 // bfmmla v19.4s, v1.8h, v7.8h\n"
+ "ld1 { v1.8h }, [%x[Apanel]], #0x10\n"
+ ".inst 0x6e45ec58 // bfmmla v24.4s, v2.8h, v5.8h\n"
+ ".inst 0x6e45ec7e // bfmmla v30.4s, v3.8h, v5.8h\n"
+ "ld1 { v5.8h }, [x20], #0x10\n"
+ ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e47ec59 // bfmmla v25.4s, v2.8h, v7.8h\n"
+ "ld1 { v2.8h }, [%x[Apanel]], #0x10\n"
+ ".inst 0x6e46ec7c // bfmmla v28.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e47ec7f // bfmmla v31.4s, v3.8h, v7.8h\n"
+ "bge 3b\n"
+ "4:" // main loop skip
+ "ld1 { v3.8h }, [%x[Apanel]], #0x10\n"
+ ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e44ec2e // bfmmla v14.4s, v1.8h, v4.8h\n"
+ "ldp q6, q7, [x20], #0x20\n"
+ ".inst 0x6e45ec0b // bfmmla v11.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e45ec31 // bfmmla v17.4s, v1.8h, v5.8h\n"
+ ".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n"
+ ".inst 0x6e45ec57 // bfmmla v23.4s, v2.8h, v5.8h\n"
+ ".inst 0x6e44ec7a // bfmmla v26.4s, v3.8h, v4.8h\n"
+ ".inst 0x6e45ec7d // bfmmla v29.4s, v3.8h, v5.8h\n"
+ "ldp q4, q5, [x20], #0x20\n"
+ ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e46ec7b // bfmmla v27.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e45ec0d // bfmmla v13.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e47ec32 // bfmmla v18.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e44ec30 // bfmmla v16.4s, v1.8h, v4.8h\n"
+ ".inst 0x6e45ec33 // bfmmla v19.4s, v1.8h, v5.8h\n"
+ ".inst 0x6e47ec58 // bfmmla v24.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e47ec7e // bfmmla v30.4s, v3.8h, v7.8h\n"
+ ".inst 0x6e44ec56 // bfmmla v22.4s, v2.8h, v4.8h\n"
+ ".inst 0x6e45ec59 // bfmmla v25.4s, v2.8h, v5.8h\n"
+ ".inst 0x6e44ec7c // bfmmla v28.4s, v3.8h, v4.8h\n"
+ ".inst 0x6e45ec7f // bfmmla v31.4s, v3.8h, v5.8h\n"
+ "cbz x19, 5f\n"
+ "ld1 { v0.8h }, [%x[Apanel]], #0x10\n"
+ "ld1 { v6.8h }, [x20], #0x10\n"
+ ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
+ "ld1 { v1.8h }, [%x[Apanel]], #0x10\n"
+ "ld1 { v7.8h }, [x20], #0x10\n"
+ ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n"
+ "ld1 { v2.8h }, [%x[Apanel]], #0x10\n"
+ "ld1 { v3.8h }, [%x[Apanel]], #0x10\n"
+ ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n"
+ "ldp q4, q5, [x20], #0x20\n"
+ ".inst 0x6e47ec31 // bfmmla v17.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e46ec7a // bfmmla v26.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e47ec7d // bfmmla v29.4s, v3.8h, v7.8h\n"
+ "ldp q6, q7, [x20], #0x20\n"
+ ".inst 0x6e44ec09 // bfmmla v9.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e44ec2f // bfmmla v15.4s, v1.8h, v4.8h\n"
+ ".inst 0x6e44ec55 // bfmmla v21.4s, v2.8h, v4.8h\n"
+ ".inst 0x6e44ec7b // bfmmla v27.4s, v3.8h, v4.8h\n"
+ ".inst 0x6e45ec0c // bfmmla v12.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e45ec32 // bfmmla v18.4s, v1.8h, v5.8h\n"
+ ".inst 0x6e46ec30 // bfmmla v16.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e47ec33 // bfmmla v19.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e45ec58 // bfmmla v24.4s, v2.8h, v5.8h\n"
+ ".inst 0x6e45ec7e // bfmmla v30.4s, v3.8h, v5.8h\n"
+ ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e47ec59 // bfmmla v25.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e46ec7c // bfmmla v28.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e47ec7f // bfmmla v31.4s, v3.8h, v7.8h\n"
+ "5:" // multiply loop done
+ "subs x22, x22, #0x1\n"
+ "uzp1 v4.2d, v8.2d, v11.2d\n"
+ "uzp2 v8.2d, v8.2d, v11.2d\n"
+ "uzp1 v11.2d, v9.2d, v12.2d\n"
+ "uzp2 v9.2d, v9.2d, v12.2d\n"
+ "str q4, [%x[Cpanel], #0x0]\n"
+ "uzp1 v12.2d, v10.2d, v13.2d\n"
+ "uzp2 v10.2d, v10.2d, v13.2d\n"
+ "str q11, [%x[Cpanel], #0x10]\n"
+ "str q12, [%x[Cpanel], #0x20]\n"
+ "uzp1 v13.2d, v14.2d, v17.2d\n"
+ "uzp2 v14.2d, v14.2d, v17.2d\n"
+ "str q8, [%x[Cpanel], #0x30]\n"
+ "uzp1 v17.2d, v15.2d, v18.2d\n"
+ "uzp2 v15.2d, v15.2d, v18.2d\n"
+ "str q9, [%x[Cpanel], #0x40]\n"
+ "uzp1 v18.2d, v16.2d, v19.2d\n"
+ "uzp2 v16.2d, v16.2d, v19.2d\n"
+ "str q10, [%x[Cpanel], #0x50]\n"
+ "uzp1 v19.2d, v20.2d, v23.2d\n"
+ "uzp2 v20.2d, v20.2d, v23.2d\n"
+ "str q13, [%x[Cpanel], #0x60]\n"
+ "uzp1 v23.2d, v21.2d, v24.2d\n"
+ "uzp2 v21.2d, v21.2d, v24.2d\n"
+ "str q17, [%x[Cpanel], #0x70]\n"
+ "uzp1 v24.2d, v22.2d, v25.2d\n"
+ "uzp2 v22.2d, v22.2d, v25.2d\n"
+ "str q18, [%x[Cpanel], #0x80]\n"
+ "uzp1 v25.2d, v26.2d, v29.2d\n"
+ "uzp2 v26.2d, v26.2d, v29.2d\n"
+ "str q14, [%x[Cpanel], #0x90]\n"
+ "uzp1 v29.2d, v27.2d, v30.2d\n"
+ "uzp2 v27.2d, v27.2d, v30.2d\n"
+ "str q15, [%x[Cpanel], #0xa0]\n"
+ "uzp1 v30.2d, v28.2d, v31.2d\n"
+ "uzp2 v28.2d, v28.2d, v31.2d\n"
+ "str q16, [%x[Cpanel], #0xb0]\n"
+ "str q19, [%x[Cpanel], #0xc0]\n"
+ "str q23, [%x[Cpanel], #0xd0]\n"
+ "str q24, [%x[Cpanel], #0xe0]\n"
+ "str q20, [%x[Cpanel], #0xf0]\n"
+ "str q21, [%x[Cpanel], #0x100]\n"
+ "str q22, [%x[Cpanel], #0x110]\n"
+ "str q25, [%x[Cpanel], #0x120]\n"
+ "str q29, [%x[Cpanel], #0x130]\n"
+ "str q30, [%x[Cpanel], #0x140]\n"
+ "str q26, [%x[Cpanel], #0x150]\n"
+ "str q27, [%x[Cpanel], #0x160]\n"
+ "str q28, [%x[Cpanel], #0x170]\n"
+ "add %x[Cpanel], %x[Cpanel], #0x180\n"
+ "bgt 2b\n"
+ "subs %x[ablocks], %x[ablocks], #0x1\n"
+ "bne 1b\n"
+ : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
+ : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks))
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12.hpp
index ff69bc8f53..4cc3ed040a 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12.hpp
@@ -35,6 +35,7 @@ namespace arm_gemm
{
// Actual kernel implementations
void a64_interleaved_s8s32_mmla_8x12( ARGLIST );
+void a64_interleaved_s8s32_mmla_8x12_a510( ARGLIST );
class cls_a64_interleaved_s8s32_mmla_8x12
{
@@ -91,7 +92,7 @@ public:
case CPUModel::A510:
return { 48.22, 2.49, 0.29 };
case CPUModel::V1:
- return { 116.76, 4.67, 0.60 };
+ return { 75.54, 8.06, 0.63 };
}
}
@@ -100,9 +101,17 @@ public:
// Default to the generic kernel
kern_type kernel=a64_interleaved_s8s32_mmla_8x12;
- cls_a64_interleaved_s8s32_mmla_8x12(const CPUInfo *)
+ cls_a64_interleaved_s8s32_mmla_8x12(const CPUInfo *ci)
{
+ switch(ci->get_cpu_model()) {
+ default:
+ break;
+ case CPUModel::A510:
+ kernel=a64_interleaved_s8s32_mmla_8x12_a510;
+ break;
+ }
}
+
};
} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/a510.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/a510.cpp
new file mode 100644
index 0000000000..a4d8c0ace7
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/a510.cpp
@@ -0,0 +1,275 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_gemm {
+
+void a64_interleaved_s8s32_mmla_8x12_a510(
+ const int8_t *Apanel, const int8_t *Bpanel,
+ int32_t *Cpanel, int ablocks, int bblocks, int K) {
+
+ struct KernelArgs {
+ size_t bblocks = {};
+ size_t K = {};
+ const int8_t *Bpanel = {};
+ } ka;
+
+ ka.bblocks = bblocks;
+ ka.K = (K/8) - 1;
+ ka.Bpanel = Bpanel;
+
+ __asm__ __volatile__(
+
+ "1:" // Height loop
+ "ldr x22, [%x[args_ptr], %[offsetof_bblocks]]\n"
+ "mov x21, %x[Apanel]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n"
+ "2:" // Width loop
+ "mov %x[Apanel], x21\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_K]]\n"
+ "ld1 { v0.16b }, [%x[Apanel]], #0x10\n"
+ "ld1 { v4.16b }, [x20], #0x10\n"
+ "ld1 { v1.16b }, [%x[Apanel]], #0x10\n"
+ "cmp x19, #0x2\n"
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "ld1 { v5.16b }, [x20], #0x10\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "ld1 { v2.16b }, [%x[Apanel]], #0x10\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+ "blt 4f\n"
+ "3:" // main loop head
+ "ld1 { v3.16b }, [%x[Apanel]], #0x10\n"
+ ".inst 0x4e84a408 // smmla v8.4s, v0.16b, v4.16b\n"
+ ".inst 0x4e84a42e // smmla v14.4s, v1.16b, v4.16b\n"
+ "ldp q6, q7, [x20], #0x20\n"
+ ".inst 0x4e85a40b // smmla v11.4s, v0.16b, v5.16b\n"
+ ".inst 0x4e85a431 // smmla v17.4s, v1.16b, v5.16b\n"
+ ".inst 0x4e84a454 // smmla v20.4s, v2.16b, v4.16b\n"
+ "sub x19, x19, #0x2\n"
+ ".inst 0x4e85a457 // smmla v23.4s, v2.16b, v5.16b\n"
+ ".inst 0x4e84a47a // smmla v26.4s, v3.16b, v4.16b\n"
+ "cmp x19, #0x2\n"
+ ".inst 0x4e85a47d // smmla v29.4s, v3.16b, v5.16b\n"
+ "ldp q4, q5, [x20], #0x20\n"
+ ".inst 0x4e86a409 // smmla v9.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e87a40c // smmla v12.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e84a40a // smmla v10.4s, v0.16b, v4.16b\n"
+ ".inst 0x4e85a40d // smmla v13.4s, v0.16b, v5.16b\n"
+ "ld1 { v0.16b }, [%x[Apanel]], #0x10\n"
+ ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e86a47b // smmla v27.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e87a432 // smmla v18.4s, v1.16b, v7.16b\n"
+ "ld1 { v6.16b }, [x20], #0x10\n"
+ ".inst 0x4e84a430 // smmla v16.4s, v1.16b, v4.16b\n"
+ ".inst 0x4e85a433 // smmla v19.4s, v1.16b, v5.16b\n"
+ "ld1 { v1.16b }, [%x[Apanel]], #0x10\n"
+ ".inst 0x4e87a458 // smmla v24.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e87a47e // smmla v30.4s, v3.16b, v7.16b\n"
+ "ld1 { v7.16b }, [x20], #0x10\n"
+ ".inst 0x4e84a456 // smmla v22.4s, v2.16b, v4.16b\n"
+ ".inst 0x4e85a459 // smmla v25.4s, v2.16b, v5.16b\n"
+ "ld1 { v2.16b }, [%x[Apanel]], #0x10\n"
+ ".inst 0x4e84a47c // smmla v28.4s, v3.16b, v4.16b\n"
+ ".inst 0x4e85a47f // smmla v31.4s, v3.16b, v5.16b\n"
+ "ld1 { v3.16b }, [%x[Apanel]], #0x10\n"
+ "ldp q4, q5, [x20], #0x20\n"
+ ".inst 0x4e86a408 // smmla v8.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a431 // smmla v17.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e87a457 // smmla v23.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e86a47a // smmla v26.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e87a47d // smmla v29.4s, v3.16b, v7.16b\n"
+ "ldp q6, q7, [x20], #0x20\n"
+ ".inst 0x4e84a409 // smmla v9.4s, v0.16b, v4.16b\n"
+ ".inst 0x4e85a40c // smmla v12.4s, v0.16b, v5.16b\n"
+ ".inst 0x4e86a40a // smmla v10.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e87a40d // smmla v13.4s, v0.16b, v7.16b\n"
+ "ld1 { v0.16b }, [%x[Apanel]], #0x10\n"
+ ".inst 0x4e84a42f // smmla v15.4s, v1.16b, v4.16b\n"
+ ".inst 0x4e84a455 // smmla v21.4s, v2.16b, v4.16b\n"
+ ".inst 0x4e84a47b // smmla v27.4s, v3.16b, v4.16b\n"
+ ".inst 0x4e85a432 // smmla v18.4s, v1.16b, v5.16b\n"
+ "ld1 { v4.16b }, [x20], #0x10\n"
+ ".inst 0x4e86a430 // smmla v16.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e87a433 // smmla v19.4s, v1.16b, v7.16b\n"
+ "ld1 { v1.16b }, [%x[Apanel]], #0x10\n"
+ ".inst 0x4e85a458 // smmla v24.4s, v2.16b, v5.16b\n"
+ ".inst 0x4e85a47e // smmla v30.4s, v3.16b, v5.16b\n"
+ "ld1 { v5.16b }, [x20], #0x10\n"
+ ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e87a459 // smmla v25.4s, v2.16b, v7.16b\n"
+ "ld1 { v2.16b }, [%x[Apanel]], #0x10\n"
+ ".inst 0x4e86a47c // smmla v28.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e87a47f // smmla v31.4s, v3.16b, v7.16b\n"
+ "bge 3b\n"
+ "4:" // main loop skip
+ "ld1 { v3.16b }, [%x[Apanel]], #0x10\n"
+ ".inst 0x4e84a408 // smmla v8.4s, v0.16b, v4.16b\n"
+ ".inst 0x4e84a42e // smmla v14.4s, v1.16b, v4.16b\n"
+ "ldp q6, q7, [x20], #0x20\n"
+ ".inst 0x4e85a40b // smmla v11.4s, v0.16b, v5.16b\n"
+ ".inst 0x4e85a431 // smmla v17.4s, v1.16b, v5.16b\n"
+ ".inst 0x4e84a454 // smmla v20.4s, v2.16b, v4.16b\n"
+ ".inst 0x4e85a457 // smmla v23.4s, v2.16b, v5.16b\n"
+ ".inst 0x4e84a47a // smmla v26.4s, v3.16b, v4.16b\n"
+ ".inst 0x4e85a47d // smmla v29.4s, v3.16b, v5.16b\n"
+ "ldp q4, q5, [x20], #0x20\n"
+ ".inst 0x4e86a409 // smmla v9.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e86a47b // smmla v27.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e87a40c // smmla v12.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e84a40a // smmla v10.4s, v0.16b, v4.16b\n"
+ ".inst 0x4e85a40d // smmla v13.4s, v0.16b, v5.16b\n"
+ ".inst 0x4e87a432 // smmla v18.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e84a430 // smmla v16.4s, v1.16b, v4.16b\n"
+ ".inst 0x4e85a433 // smmla v19.4s, v1.16b, v5.16b\n"
+ ".inst 0x4e87a458 // smmla v24.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e87a47e // smmla v30.4s, v3.16b, v7.16b\n"
+ ".inst 0x4e84a456 // smmla v22.4s, v2.16b, v4.16b\n"
+ ".inst 0x4e85a459 // smmla v25.4s, v2.16b, v5.16b\n"
+ ".inst 0x4e84a47c // smmla v28.4s, v3.16b, v4.16b\n"
+ ".inst 0x4e85a47f // smmla v31.4s, v3.16b, v5.16b\n"
+ "cbz x19, 5f\n"
+ "ld1 { v0.16b }, [%x[Apanel]], #0x10\n"
+ "ld1 { v6.16b }, [x20], #0x10\n"
+ ".inst 0x4e86a408 // smmla v8.4s, v0.16b, v6.16b\n"
+ "ld1 { v1.16b }, [%x[Apanel]], #0x10\n"
+ "ld1 { v7.16b }, [x20], #0x10\n"
+ ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n"
+ "ld1 { v2.16b }, [%x[Apanel]], #0x10\n"
+ "ld1 { v3.16b }, [%x[Apanel]], #0x10\n"
+ ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
+ "ldp q4, q5, [x20], #0x20\n"
+ ".inst 0x4e87a431 // smmla v17.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e87a457 // smmla v23.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e86a47a // smmla v26.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e87a47d // smmla v29.4s, v3.16b, v7.16b\n"
+ "ldp q6, q7, [x20], #0x20\n"
+ ".inst 0x4e84a409 // smmla v9.4s, v0.16b, v4.16b\n"
+ ".inst 0x4e84a42f // smmla v15.4s, v1.16b, v4.16b\n"
+ ".inst 0x4e84a455 // smmla v21.4s, v2.16b, v4.16b\n"
+ ".inst 0x4e84a47b // smmla v27.4s, v3.16b, v4.16b\n"
+ ".inst 0x4e85a40c // smmla v12.4s, v0.16b, v5.16b\n"
+ ".inst 0x4e86a40a // smmla v10.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e87a40d // smmla v13.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e85a432 // smmla v18.4s, v1.16b, v5.16b\n"
+ ".inst 0x4e86a430 // smmla v16.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e87a433 // smmla v19.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e85a458 // smmla v24.4s, v2.16b, v5.16b\n"
+ ".inst 0x4e85a47e // smmla v30.4s, v3.16b, v5.16b\n"
+ ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e87a459 // smmla v25.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e86a47c // smmla v28.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e87a47f // smmla v31.4s, v3.16b, v7.16b\n"
+ "5:" // multiply loop done
+ "subs x22, x22, #0x1\n"
+ "uzp1 v4.2d, v8.2d, v11.2d\n"
+ "uzp2 v8.2d, v8.2d, v11.2d\n"
+ "uzp1 v11.2d, v9.2d, v12.2d\n"
+ "uzp2 v9.2d, v9.2d, v12.2d\n"
+ "str q4, [%x[Cpanel], #0x0]\n"
+ "uzp1 v12.2d, v10.2d, v13.2d\n"
+ "uzp2 v10.2d, v10.2d, v13.2d\n"
+ "str q11, [%x[Cpanel], #0x10]\n"
+ "str q12, [%x[Cpanel], #0x20]\n"
+ "uzp1 v13.2d, v14.2d, v17.2d\n"
+ "uzp2 v14.2d, v14.2d, v17.2d\n"
+ "str q8, [%x[Cpanel], #0x30]\n"
+ "uzp1 v17.2d, v15.2d, v18.2d\n"
+ "uzp2 v15.2d, v15.2d, v18.2d\n"
+ "str q9, [%x[Cpanel], #0x40]\n"
+ "uzp1 v18.2d, v16.2d, v19.2d\n"
+ "uzp2 v16.2d, v16.2d, v19.2d\n"
+ "str q10, [%x[Cpanel], #0x50]\n"
+ "uzp1 v19.2d, v20.2d, v23.2d\n"
+ "uzp2 v20.2d, v20.2d, v23.2d\n"
+ "str q13, [%x[Cpanel], #0x60]\n"
+ "uzp1 v23.2d, v21.2d, v24.2d\n"
+ "uzp2 v21.2d, v21.2d, v24.2d\n"
+ "str q17, [%x[Cpanel], #0x70]\n"
+ "uzp1 v24.2d, v22.2d, v25.2d\n"
+ "uzp2 v22.2d, v22.2d, v25.2d\n"
+ "str q18, [%x[Cpanel], #0x80]\n"
+ "uzp1 v25.2d, v26.2d, v29.2d\n"
+ "uzp2 v26.2d, v26.2d, v29.2d\n"
+ "str q14, [%x[Cpanel], #0x90]\n"
+ "uzp1 v29.2d, v27.2d, v30.2d\n"
+ "uzp2 v27.2d, v27.2d, v30.2d\n"
+ "str q15, [%x[Cpanel], #0xa0]\n"
+ "uzp1 v30.2d, v28.2d, v31.2d\n"
+ "uzp2 v28.2d, v28.2d, v31.2d\n"
+ "str q16, [%x[Cpanel], #0xb0]\n"
+ "str q19, [%x[Cpanel], #0xc0]\n"
+ "str q23, [%x[Cpanel], #0xd0]\n"
+ "str q24, [%x[Cpanel], #0xe0]\n"
+ "str q20, [%x[Cpanel], #0xf0]\n"
+ "str q21, [%x[Cpanel], #0x100]\n"
+ "str q22, [%x[Cpanel], #0x110]\n"
+ "str q25, [%x[Cpanel], #0x120]\n"
+ "str q29, [%x[Cpanel], #0x130]\n"
+ "str q30, [%x[Cpanel], #0x140]\n"
+ "str q26, [%x[Cpanel], #0x150]\n"
+ "str q27, [%x[Cpanel], #0x160]\n"
+ "str q28, [%x[Cpanel], #0x170]\n"
+ "add %x[Cpanel], %x[Cpanel], #0x180\n"
+ "bgt 2b\n"
+ "subs %x[ablocks], %x[ablocks], #0x1\n"
+ "bne 1b\n"
+ : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
+ : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks))
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12.hpp
index f492a474ae..fa93c1d90d 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12.hpp
@@ -35,6 +35,7 @@ namespace arm_gemm
{
// Actual kernel implementations
void a64_interleaved_u8u32_mmla_8x12( ARGLIST );
+void a64_interleaved_u8u32_mmla_8x12_a510( ARGLIST );
class cls_a64_interleaved_u8u32_mmla_8x12
{
@@ -91,7 +92,7 @@ public:
case CPUModel::A510:
return { 47.66, 2.47, 0.29 };
case CPUModel::V1:
- return { 111.60, 4.95, 0.66 };
+ return { 75.54, 8.06, 0.63 };
}
}
@@ -100,8 +101,15 @@ public:
// Default to the generic kernel
kern_type kernel=a64_interleaved_u8u32_mmla_8x12;
- cls_a64_interleaved_u8u32_mmla_8x12(const CPUInfo *)
+ cls_a64_interleaved_u8u32_mmla_8x12(const CPUInfo *ci)
{
+ switch(ci->get_cpu_model()) {
+ default:
+ break;
+ case CPUModel::A510:
+ kernel=a64_interleaved_u8u32_mmla_8x12_a510;
+ break;
+ }
}
};
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/a510.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/a510.cpp
new file mode 100644
index 0000000000..3fe1a9bd04
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/a510.cpp
@@ -0,0 +1,275 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_gemm {
+
+void a64_interleaved_u8u32_mmla_8x12_a510(
+ const uint8_t *Apanel, const uint8_t *Bpanel,
+ uint32_t *Cpanel, int ablocks, int bblocks, int K) {
+
+ struct KernelArgs {
+ size_t bblocks = {};
+ size_t K = {};
+ const uint8_t *Bpanel = {};
+ } ka;
+
+ ka.bblocks = bblocks;
+ ka.K = (K/8) - 1;
+ ka.Bpanel = Bpanel;
+
+ __asm__ __volatile__(
+
+ "1:" // Height loop
+ "ldr x22, [%x[args_ptr], %[offsetof_bblocks]]\n"
+ "mov x21, %x[Apanel]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n"
+ "2:" // Width loop
+ "mov %x[Apanel], x21\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_K]]\n"
+ "ld1 { v0.16b }, [%x[Apanel]], #0x10\n"
+ "ld1 { v4.16b }, [x20], #0x10\n"
+ "ld1 { v1.16b }, [%x[Apanel]], #0x10\n"
+ "cmp x19, #0x2\n"
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "ld1 { v5.16b }, [x20], #0x10\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "ld1 { v2.16b }, [%x[Apanel]], #0x10\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+ "blt 4f\n"
+ "3:" // main loop head
+ "ld1 { v3.16b }, [%x[Apanel]], #0x10\n"
+ ".inst 0x6e84a408 // ummla v8.4s, v0.16b, v4.16b\n"
+ ".inst 0x6e84a42e // ummla v14.4s, v1.16b, v4.16b\n"
+ "ldp q6, q7, [x20], #0x20\n"
+ ".inst 0x6e85a40b // ummla v11.4s, v0.16b, v5.16b\n"
+ ".inst 0x6e85a431 // ummla v17.4s, v1.16b, v5.16b\n"
+ ".inst 0x6e84a454 // ummla v20.4s, v2.16b, v4.16b\n"
+ "sub x19, x19, #0x2\n"
+ ".inst 0x6e85a457 // ummla v23.4s, v2.16b, v5.16b\n"
+ ".inst 0x6e84a47a // ummla v26.4s, v3.16b, v4.16b\n"
+ "cmp x19, #0x2\n"
+ ".inst 0x6e85a47d // ummla v29.4s, v3.16b, v5.16b\n"
+ "ldp q4, q5, [x20], #0x20\n"
+ ".inst 0x6e86a409 // ummla v9.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e87a40c // ummla v12.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e84a40a // ummla v10.4s, v0.16b, v4.16b\n"
+ ".inst 0x6e85a40d // ummla v13.4s, v0.16b, v5.16b\n"
+ "ld1 { v0.16b }, [%x[Apanel]], #0x10\n"
+ ".inst 0x6e86a42f // ummla v15.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e86a455 // ummla v21.4s, v2.16b, v6.16b\n"
+ ".inst 0x6e86a47b // ummla v27.4s, v3.16b, v6.16b\n"
+ ".inst 0x6e87a432 // ummla v18.4s, v1.16b, v7.16b\n"
+ "ld1 { v6.16b }, [x20], #0x10\n"
+ ".inst 0x6e84a430 // ummla v16.4s, v1.16b, v4.16b\n"
+ ".inst 0x6e85a433 // ummla v19.4s, v1.16b, v5.16b\n"
+ "ld1 { v1.16b }, [%x[Apanel]], #0x10\n"
+ ".inst 0x6e87a458 // ummla v24.4s, v2.16b, v7.16b\n"
+ ".inst 0x6e87a47e // ummla v30.4s, v3.16b, v7.16b\n"
+ "ld1 { v7.16b }, [x20], #0x10\n"
+ ".inst 0x6e84a456 // ummla v22.4s, v2.16b, v4.16b\n"
+ ".inst 0x6e85a459 // ummla v25.4s, v2.16b, v5.16b\n"
+ "ld1 { v2.16b }, [%x[Apanel]], #0x10\n"
+ ".inst 0x6e84a47c // ummla v28.4s, v3.16b, v4.16b\n"
+ ".inst 0x6e85a47f // ummla v31.4s, v3.16b, v5.16b\n"
+ "ld1 { v3.16b }, [%x[Apanel]], #0x10\n"
+ "ldp q4, q5, [x20], #0x20\n"
+ ".inst 0x6e86a408 // ummla v8.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a42e // ummla v14.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e87a431 // ummla v17.4s, v1.16b, v7.16b\n"
+ ".inst 0x6e86a454 // ummla v20.4s, v2.16b, v6.16b\n"
+ ".inst 0x6e87a457 // ummla v23.4s, v2.16b, v7.16b\n"
+ ".inst 0x6e86a47a // ummla v26.4s, v3.16b, v6.16b\n"
+ ".inst 0x6e87a47d // ummla v29.4s, v3.16b, v7.16b\n"
+ "ldp q6, q7, [x20], #0x20\n"
+ ".inst 0x6e84a409 // ummla v9.4s, v0.16b, v4.16b\n"
+ ".inst 0x6e85a40c // ummla v12.4s, v0.16b, v5.16b\n"
+ ".inst 0x6e86a40a // ummla v10.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e87a40d // ummla v13.4s, v0.16b, v7.16b\n"
+ "ld1 { v0.16b }, [%x[Apanel]], #0x10\n"
+ ".inst 0x6e84a42f // ummla v15.4s, v1.16b, v4.16b\n"
+ ".inst 0x6e84a455 // ummla v21.4s, v2.16b, v4.16b\n"
+ ".inst 0x6e84a47b // ummla v27.4s, v3.16b, v4.16b\n"
+ ".inst 0x6e85a432 // ummla v18.4s, v1.16b, v5.16b\n"
+ "ld1 { v4.16b }, [x20], #0x10\n"
+ ".inst 0x6e86a430 // ummla v16.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e87a433 // ummla v19.4s, v1.16b, v7.16b\n"
+ "ld1 { v1.16b }, [%x[Apanel]], #0x10\n"
+ ".inst 0x6e85a458 // ummla v24.4s, v2.16b, v5.16b\n"
+ ".inst 0x6e85a47e // ummla v30.4s, v3.16b, v5.16b\n"
+ "ld1 { v5.16b }, [x20], #0x10\n"
+ ".inst 0x6e86a456 // ummla v22.4s, v2.16b, v6.16b\n"
+ ".inst 0x6e87a459 // ummla v25.4s, v2.16b, v7.16b\n"
+ "ld1 { v2.16b }, [%x[Apanel]], #0x10\n"
+ ".inst 0x6e86a47c // ummla v28.4s, v3.16b, v6.16b\n"
+ ".inst 0x6e87a47f // ummla v31.4s, v3.16b, v7.16b\n"
+ "bge 3b\n"
+ "4:" // main loop skip
+ "ld1 { v3.16b }, [%x[Apanel]], #0x10\n"
+ ".inst 0x6e84a408 // ummla v8.4s, v0.16b, v4.16b\n"
+ ".inst 0x6e84a42e // ummla v14.4s, v1.16b, v4.16b\n"
+ "ldp q6, q7, [x20], #0x20\n"
+ ".inst 0x6e85a40b // ummla v11.4s, v0.16b, v5.16b\n"
+ ".inst 0x6e85a431 // ummla v17.4s, v1.16b, v5.16b\n"
+ ".inst 0x6e84a454 // ummla v20.4s, v2.16b, v4.16b\n"
+ ".inst 0x6e85a457 // ummla v23.4s, v2.16b, v5.16b\n"
+ ".inst 0x6e84a47a // ummla v26.4s, v3.16b, v4.16b\n"
+ ".inst 0x6e85a47d // ummla v29.4s, v3.16b, v5.16b\n"
+ "ldp q4, q5, [x20], #0x20\n"
+ ".inst 0x6e86a409 // ummla v9.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a42f // ummla v15.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e86a455 // ummla v21.4s, v2.16b, v6.16b\n"
+ ".inst 0x6e86a47b // ummla v27.4s, v3.16b, v6.16b\n"
+ ".inst 0x6e87a40c // ummla v12.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e84a40a // ummla v10.4s, v0.16b, v4.16b\n"
+ ".inst 0x6e85a40d // ummla v13.4s, v0.16b, v5.16b\n"
+ ".inst 0x6e87a432 // ummla v18.4s, v1.16b, v7.16b\n"
+ ".inst 0x6e84a430 // ummla v16.4s, v1.16b, v4.16b\n"
+ ".inst 0x6e85a433 // ummla v19.4s, v1.16b, v5.16b\n"
+ ".inst 0x6e87a458 // ummla v24.4s, v2.16b, v7.16b\n"
+ ".inst 0x6e87a47e // ummla v30.4s, v3.16b, v7.16b\n"
+ ".inst 0x6e84a456 // ummla v22.4s, v2.16b, v4.16b\n"
+ ".inst 0x6e85a459 // ummla v25.4s, v2.16b, v5.16b\n"
+ ".inst 0x6e84a47c // ummla v28.4s, v3.16b, v4.16b\n"
+ ".inst 0x6e85a47f // ummla v31.4s, v3.16b, v5.16b\n"
+ "cbz x19, 5f\n"
+ "ld1 { v0.16b }, [%x[Apanel]], #0x10\n"
+ "ld1 { v6.16b }, [x20], #0x10\n"
+ ".inst 0x6e86a408 // ummla v8.4s, v0.16b, v6.16b\n"
+ "ld1 { v1.16b }, [%x[Apanel]], #0x10\n"
+ "ld1 { v7.16b }, [x20], #0x10\n"
+ ".inst 0x6e86a42e // ummla v14.4s, v1.16b, v6.16b\n"
+ "ld1 { v2.16b }, [%x[Apanel]], #0x10\n"
+ "ld1 { v3.16b }, [%x[Apanel]], #0x10\n"
+ ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n"
+ "ldp q4, q5, [x20], #0x20\n"
+ ".inst 0x6e87a431 // ummla v17.4s, v1.16b, v7.16b\n"
+ ".inst 0x6e86a454 // ummla v20.4s, v2.16b, v6.16b\n"
+ ".inst 0x6e87a457 // ummla v23.4s, v2.16b, v7.16b\n"
+ ".inst 0x6e86a47a // ummla v26.4s, v3.16b, v6.16b\n"
+ ".inst 0x6e87a47d // ummla v29.4s, v3.16b, v7.16b\n"
+ "ldp q6, q7, [x20], #0x20\n"
+ ".inst 0x6e84a409 // ummla v9.4s, v0.16b, v4.16b\n"
+ ".inst 0x6e84a42f // ummla v15.4s, v1.16b, v4.16b\n"
+ ".inst 0x6e84a455 // ummla v21.4s, v2.16b, v4.16b\n"
+ ".inst 0x6e84a47b // ummla v27.4s, v3.16b, v4.16b\n"
+ ".inst 0x6e85a40c // ummla v12.4s, v0.16b, v5.16b\n"
+ ".inst 0x6e86a40a // ummla v10.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e87a40d // ummla v13.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e85a432 // ummla v18.4s, v1.16b, v5.16b\n"
+ ".inst 0x6e86a430 // ummla v16.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e87a433 // ummla v19.4s, v1.16b, v7.16b\n"
+ ".inst 0x6e85a458 // ummla v24.4s, v2.16b, v5.16b\n"
+ ".inst 0x6e85a47e // ummla v30.4s, v3.16b, v5.16b\n"
+ ".inst 0x6e86a456 // ummla v22.4s, v2.16b, v6.16b\n"
+ ".inst 0x6e87a459 // ummla v25.4s, v2.16b, v7.16b\n"
+ ".inst 0x6e86a47c // ummla v28.4s, v3.16b, v6.16b\n"
+ ".inst 0x6e87a47f // ummla v31.4s, v3.16b, v7.16b\n"
+ "5:" // multiply loop done
+ "subs x22, x22, #0x1\n"
+ "uzp1 v4.2d, v8.2d, v11.2d\n"
+ "uzp2 v8.2d, v8.2d, v11.2d\n"
+ "uzp1 v11.2d, v9.2d, v12.2d\n"
+ "uzp2 v9.2d, v9.2d, v12.2d\n"
+ "str q4, [%x[Cpanel], #0x0]\n"
+ "uzp1 v12.2d, v10.2d, v13.2d\n"
+ "uzp2 v10.2d, v10.2d, v13.2d\n"
+ "str q11, [%x[Cpanel], #0x10]\n"
+ "str q12, [%x[Cpanel], #0x20]\n"
+ "uzp1 v13.2d, v14.2d, v17.2d\n"
+ "uzp2 v14.2d, v14.2d, v17.2d\n"
+ "str q8, [%x[Cpanel], #0x30]\n"
+ "uzp1 v17.2d, v15.2d, v18.2d\n"
+ "uzp2 v15.2d, v15.2d, v18.2d\n"
+ "str q9, [%x[Cpanel], #0x40]\n"
+ "uzp1 v18.2d, v16.2d, v19.2d\n"
+ "uzp2 v16.2d, v16.2d, v19.2d\n"
+ "str q10, [%x[Cpanel], #0x50]\n"
+ "uzp1 v19.2d, v20.2d, v23.2d\n"
+ "uzp2 v20.2d, v20.2d, v23.2d\n"
+ "str q13, [%x[Cpanel], #0x60]\n"
+ "uzp1 v23.2d, v21.2d, v24.2d\n"
+ "uzp2 v21.2d, v21.2d, v24.2d\n"
+ "str q17, [%x[Cpanel], #0x70]\n"
+ "uzp1 v24.2d, v22.2d, v25.2d\n"
+ "uzp2 v22.2d, v22.2d, v25.2d\n"
+ "str q18, [%x[Cpanel], #0x80]\n"
+ "uzp1 v25.2d, v26.2d, v29.2d\n"
+ "uzp2 v26.2d, v26.2d, v29.2d\n"
+ "str q14, [%x[Cpanel], #0x90]\n"
+ "uzp1 v29.2d, v27.2d, v30.2d\n"
+ "uzp2 v27.2d, v27.2d, v30.2d\n"
+ "str q15, [%x[Cpanel], #0xa0]\n"
+ "uzp1 v30.2d, v28.2d, v31.2d\n"
+ "uzp2 v28.2d, v28.2d, v31.2d\n"
+ "str q16, [%x[Cpanel], #0xb0]\n"
+ "str q19, [%x[Cpanel], #0xc0]\n"
+ "str q23, [%x[Cpanel], #0xd0]\n"
+ "str q24, [%x[Cpanel], #0xe0]\n"
+ "str q20, [%x[Cpanel], #0xf0]\n"
+ "str q21, [%x[Cpanel], #0x100]\n"
+ "str q22, [%x[Cpanel], #0x110]\n"
+ "str q25, [%x[Cpanel], #0x120]\n"
+ "str q29, [%x[Cpanel], #0x130]\n"
+ "str q30, [%x[Cpanel], #0x140]\n"
+ "str q26, [%x[Cpanel], #0x150]\n"
+ "str q27, [%x[Cpanel], #0x160]\n"
+ "str q28, [%x[Cpanel], #0x170]\n"
+ "add %x[Cpanel], %x[Cpanel], #0x180\n"
+ "bgt 2b\n"
+ "subs %x[ablocks], %x[ablocks], #0x1\n"
+ "bne 1b\n"
+ : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
+ : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks))
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL.hpp
index 30e265fbc0..ab175a3758 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL.hpp
@@ -83,7 +83,7 @@ public:
case CPUModel::A510:
return { 5.42 };
case CPUModel::V1:
- return { 34.56 };
+ return { 20.83 };
}
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL.hpp
index 61c7ad17e7..b7c9aca9dd 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL.hpp
@@ -83,7 +83,7 @@ public:
case CPUModel::A510:
return { 5.31 };
case CPUModel::V1:
- return { 28.93 };
+ return { 17.32 };
}
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL.hpp
index b8ca7c5456..28057aa961 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL.hpp
@@ -94,7 +94,7 @@ public:
case CPUModel::A510:
return { 22.77, 3.90, 0.47 };
case CPUModel::V1:
- return { 62.97, 19.14, 0.92 };
+ return { 48.09, 16.24, 0.83 };
}
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_mmla_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_mmla_6x4VL.hpp
index b88ef14f25..c08977570e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_mmla_6x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_mmla_6x4VL.hpp
@@ -94,7 +94,7 @@ public:
case CPUModel::A510:
return { 23.87, 3.89, 0.37 };
case CPUModel::V1:
- return { 107.63, 19.24, 0.92 };
+ return { 75.14, 15.87, 0.83 };
}
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL.hpp
index d870711c6e..901cc6d63e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL.hpp
@@ -94,7 +94,7 @@ public:
case CPUModel::A510:
return { 22.75, 3.90, 0.47 };
case CPUModel::V1:
- return { 62.97, 19.27, 0.92 };
+ return { 48.09, 16.24, 0.83 };
}
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_mmla_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_mmla_6x4VL.hpp
index 7f8eadc528..c0d089278e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_mmla_6x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_mmla_6x4VL.hpp
@@ -94,7 +94,7 @@ public:
case CPUModel::A510:
return { 26.80, 3.89, 0.47 };
case CPUModel::V1:
- return { 108.33, 18.66, 0.92 };
+ return { 75.14, 15.87, 0.83 };
}
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL.hpp
index fa44bdbd31..fc91dd71ad 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL.hpp
@@ -80,7 +80,7 @@ public:
case CPUModel::A510:
return { 7.78, 4.01, 2.43 };
case CPUModel::V1:
- return { 62.50, 5.09, 11.32 };
+ return { 47.63, 5.11, 6.80 };
}
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL.hpp
index 1924b2cf75..0d707b0391 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL.hpp
@@ -89,7 +89,7 @@ public:
default:
return { 31.67, 3.57, 0.50 };
case CPUModel::V1:
- return { 63.35, 4.76, 0.77 };
+ return { 52.24, 7.49, 0.80 };
case CPUModel::A510:
return { 27.47, 1.70, 0.28 };
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL.hpp
index bd1764bb7f..4e65296f8b 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL.hpp
@@ -89,7 +89,7 @@ public:
default:
return { 61.97, 3.64, 0.50 };
case CPUModel::V1:
- return { 123.84, 4.93, 0.76 };
+ return { 95.28, 7.99, 0.79 };
case CPUModel::A510:
return { 43.36, 1.86, 0.28 };
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL.hpp
index f66a9bf51e..0afcdd2ce4 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL.hpp
@@ -91,7 +91,7 @@ public:
case CPUModel::A510:
return { 27.45, 1.65, 0.28 };
case CPUModel::V1:
- return { 63.35, 4.96, 0.77 };
+ return { 52.24, 7.49, 0.80 };
}
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL.hpp
index b530202bd7..58d21d6c40 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL.hpp
@@ -91,7 +91,7 @@ public:
case CPUModel::A510:
return { 38.02, 1.85, 0.28 };
case CPUModel::V1:
- return { 123.84, 4.98, 0.76 };
+ return { 95.28, 7.99, 0.79 };
}
}