aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Georgios Pinitas <georgios.pinitas@arm.com>, 2021-07-27 19:04:51 +0100
committer: Georgios Pinitas <georgios.pinitas@arm.com>, 2021-07-28 10:37:51 +0000
commit: 97426a707467a2e025a669fc5b36cc6f6274c23a (patch)
tree: 7898df0dd00627b34a293a5609bfb8845a51551a
parent: 980558373f14c135a331fa23b61558c7d177edf2 (diff)
download: ComputeLibrary-97426a707467a2e025a669fc5b36cc6f6274c23a.tar.gz
Remove generated kernels that overlap hand-written ones
Generated kernels are not used at the moment.

Signed-off-by: Georgios Pinitas <georgios.pinitas@arm.com>
Change-Id: I3ba767a53f78e4409c70a850c8051f6ee7453358
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/6008
Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
-rw-r--r-- Android.bp 12
-rw-r--r-- filelist.json 12
-rw-r--r-- src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16.hpp 2
-rw-r--r-- src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16.hpp 2
-rw-r--r-- src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp16_mla_8x24.hpp 110
-rw-r--r-- src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp16_mla_8x24/a55.cpp 263
-rw-r--r-- src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp16_mla_8x24/generic.cpp 247
-rw-r--r-- src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp16_mla_8x24/x1.cpp 247
-rw-r--r-- src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp32_mla_8x12.hpp 115
-rw-r--r-- src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp32_mla_8x12/a55.cpp 360
-rw-r--r-- src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp32_mla_8x12/generic.cpp 320
-rw-r--r-- src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp32_mla_8x12/x1.cpp 320
-rw-r--r-- src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_dot_8x12.hpp 110
-rw-r--r-- src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_dot_8x12/a55.cpp 273
-rw-r--r-- src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_dot_8x12/generic.cpp 253
-rw-r--r-- src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_dot_8x12/x1.cpp 253
-rw-r--r-- src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_dot_8x12.hpp 110
-rw-r--r-- src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_dot_8x12/a55.cpp 273
-rw-r--r-- src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_dot_8x12/generic.cpp 253
-rw-r--r-- src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_dot_8x12/x1.cpp 253
20 files changed, 4 insertions, 3784 deletions
diff --git a/Android.bp b/Android.bp
index 0502e841f1..09383551d5 100644
--- a/Android.bp
+++ b/Android.bp
@@ -877,19 +877,7 @@ cc_library_static {
"src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12/generic.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12/x1.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp16_mla_8x24/a55.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp16_mla_8x24/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp16_mla_8x24/x1.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp32_mla_8x12/a55.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp32_mla_8x12/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp32_mla_8x12/x1.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_dot_8x12/a55.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_dot_8x12/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_dot_8x12/x1.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_dot_8x12/a55.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_dot_8x12/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_dot_8x12/x1.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/generic.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a53.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a55.cpp",
diff --git a/filelist.json b/filelist.json
index 394ec0441a..4c7c81c63e 100644
--- a/filelist.json
+++ b/filelist.json
@@ -1287,19 +1287,7 @@
"src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12/generic.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12/x1.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp16_mla_8x24/a55.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp16_mla_8x24/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp16_mla_8x24/x1.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp32_mla_8x12/a55.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp32_mla_8x12/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp32_mla_8x12/x1.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_dot_8x12/a55.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_dot_8x12/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_dot_8x12/x1.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_dot_8x12/a55.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_dot_8x12/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_dot_8x12/x1.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/generic.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a53.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a55.cpp",
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16.hpp
index d91c69b8a0..48ce67613e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16.hpp
@@ -79,6 +79,8 @@ public:
switch (ci->get_cpu_model()) {
default:
return { 31.65 };
+ case CPUModel::A55r1:
+ return { 9.217 };
case CPUModel::A510:
return { 15.87 };
case CPUModel::V1:
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16.hpp
index 3a77397632..c5105a6d4a 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16.hpp
@@ -92,6 +92,8 @@ public:
switch (ci->get_cpu_model()) {
default:
return { 31.63 };
+ case CPUModel::A55r1:
+ return { 9.217 };
case CPUModel::A510:
return { 15.89 };
case CPUModel::V1:
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp16_mla_8x24.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp16_mla_8x24.hpp
deleted file mode 100644
index ce63600424..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp16_mla_8x24.hpp
+++ /dev/null
@@ -1,110 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-#pragma once
-
-#ifdef __aarch64__
-#include "../std_transforms_fixed.hpp"
-#include "../performance_parameters.hpp"
-
-#define ARGLIST \
- const __fp16 *, const __fp16 *, \
- __fp16 *, int, int, int
-
-namespace arm_gemm
-{
-// Actual kernel implementations
-void a64_interleaved_fp16_mla_8x24( ARGLIST );
-void a64_interleaved_fp16_mla_8x24_a55( ARGLIST );
-void a64_interleaved_fp16_mla_8x24_x1( ARGLIST );
-
-class cls_a64_interleaved_fp16_mla_8x24
-{
-public:
- typedef __fp16 operand_type;
- typedef __fp16 result_type;
-
- typedef void (*kern_type)( ARGLIST );
-
- /* Kernel blocking parameters */
- static constexpr unsigned int out_height()
- {
- return 8;
- }
-
- static unsigned int out_width()
- {
- return 24;
- }
-
- static unsigned int stripe_width()
- {
- return 8;
- }
-
- static constexpr unsigned int k_unroll()
- {
- return 1;
- }
-
-
- StdTransformsFixed<operand_type, result_type, 8, 24, 1> transforms = {};
- StdTransformsFixed<operand_type, result_type, 8, 24, 1, true> transforms_quantized = {};
- template<typename T>
- static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
- {
-
- if (std::is_same<T, __fp16>::value) {
- switch (ci->get_cpu_model()) {
- case CPUModel::A55r1:
- return { 7.16, 1.14, 0.67 };
- default:
- return { 12.67, 3.98, 1.16 };
- }
- }
-
- return { 1.0 };
- }
-
- // Default to the generic kernel
- kern_type kernel=a64_interleaved_fp16_mla_8x24;
- cls_a64_interleaved_fp16_mla_8x24(const CPUInfo *ci)
- {
- switch(ci->get_cpu_model()) {
- default:
- break;
- case CPUModel::A55r1:
- kernel=a64_interleaved_fp16_mla_8x24_a55;
- break;
- case CPUModel::X1:
- kernel=a64_interleaved_fp16_mla_8x24_x1;
- break;
- }
- }
-};
-
-} // namespace arm_gemm
-
-#undef ARGLIST
-
-#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp16_mla_8x24/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp16_mla_8x24/a55.cpp
deleted file mode 100644
index 49500f2d18..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp16_mla_8x24/a55.cpp
+++ /dev/null
@@ -1,263 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-#if defined(__aarch64__) && (defined(FP16_KERNELS) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC))
-
-#include <cstddef>
-
-namespace arm_gemm {
-
-void a64_interleaved_fp16_mla_8x24_a55(
- const __fp16 *Apanel, const __fp16 *Bpanel,
- __fp16 *Cpanel, int ablocks, int bblocks, int K) {
-
- struct KernelArgs {
- size_t bblocks = {};
- size_t K = {};
- const __fp16 *Bpanel = {};
- } ka;
-
- ka.bblocks = bblocks;
- ka.K = (K/1) - 1;
- ka.Bpanel = Bpanel;
-
- __asm__ __volatile__(
-
- "1:" // Height loop
- "ldr x10, [%x[args_ptr], %[offsetof_bblocks]]\n"
- "mov x9, %x[Apanel]\n"
- "ldr x28, [%x[args_ptr], %[offsetof_Bpanel]]\n"
- "2:" // Width loop
- "ldr x27, [%x[args_ptr], %[offsetof_K]]\n"
- "mov %x[Apanel], x9\n"
- "cmp x27, #0x2\n"
- "movi v8.16b, #0x0\n"
- "movi v9.16b, #0x0\n"
- "prfm pldl1keep, [%x[Apanel], #0x0]\n"
- "movi v10.16b, #0x0\n"
- "prfm pldl1keep, [x28, #0x0]\n"
- "movi v11.16b, #0x0\n"
- "prfm pldl1keep, [x28, #0x40]\n"
- "movi v12.16b, #0x0\n"
- "prfm pldl1keep, [x28, #0x80]\n"
- "movi v13.16b, #0x0\n"
- "ldr q0, [%x[Apanel], #0x0]\n"
- "movi v14.16b, #0x0\n"
- "ldr q2, [x28, #0x0]\n"
- "movi v15.16b, #0x0\n"
- "ldr q3, [x28, #0x10]\n"
- "movi v16.16b, #0x0\n"
- "ldr q4, [x28, #0x20]\n"
- "movi v17.16b, #0x0\n"
- "movi v18.16b, #0x0\n"
- "movi v19.16b, #0x0\n"
- "movi v20.16b, #0x0\n"
- "movi v21.16b, #0x0\n"
- "movi v22.16b, #0x0\n"
- "movi v23.16b, #0x0\n"
- "movi v24.16b, #0x0\n"
- "movi v25.16b, #0x0\n"
- "movi v26.16b, #0x0\n"
- "movi v27.16b, #0x0\n"
- "movi v28.16b, #0x0\n"
- "movi v29.16b, #0x0\n"
- "movi v30.16b, #0x0\n"
- "movi v31.16b, #0x0\n"
- "blt 4f\n"
- "3:" // main loop head
- "ldr d1, [%x[Apanel], #0x10]\n"
- "fmla v8.8h, v2.8h, v0.h[0]\n"
- "ldr x26, [%x[Apanel], #0x18]\n"
- "fmla v11.8h, v2.8h, v0.h[1]\n"
- "ldr d5, [x28, #0x30]\n"
- "fmla v14.8h, v2.8h, v0.h[2]\n"
- "ldr x25, [x28, #0x38]\n"
- "fmla v17.8h, v2.8h, v0.h[3]\n"
- "ldr d6, [x28, #0x40]\n"
- "fmla v20.8h, v2.8h, v0.h[4]\n"
- "ldr x24, [x28, #0x48]\n"
- "fmla v23.8h, v2.8h, v0.h[5]\n"
- "ldr d7, [x28, #0x50]\n"
- "fmla v26.8h, v2.8h, v0.h[6]\n"
- "ldr x23, [x28, #0x58]\n"
- "fmla v29.8h, v2.8h, v0.h[7]\n"
- "prfm pldl1keep, [%x[Apanel], #0x80]\n"
- "add %x[Apanel], %x[Apanel], #0x20\n"
- "fmla v9.8h, v3.8h, v0.h[0]\n"
- "prfm pldl1keep, [x28, #0x100]\n"
- "fmla v12.8h, v3.8h, v0.h[1]\n"
- "prfm pldl1keep, [x28, #0x140]\n"
- "fmla v15.8h, v3.8h, v0.h[2]\n"
- "add x28, x28, #0x60\n"
- "fmla v18.8h, v3.8h, v0.h[3]\n"
- "ldr d2, [x28, #0x0]\n"
- "fmla v21.8h, v3.8h, v0.h[4]\n"
- "ldr x22, [x28, #0x8]\n"
- "fmla v24.8h, v3.8h, v0.h[5]\n"
- "ldr x21, [x28, #0x18]\n"
- "fmla v27.8h, v3.8h, v0.h[6]\n"
- "ldr x20, [%x[Apanel], #0x8]\n"
- "fmla v30.8h, v3.8h, v0.h[7]\n"
- "ldr d3, [x28, #0x10]\n"
- "fmla v10.8h, v4.8h, v0.h[0]\n"
- "ldr x19, [x28, #0x28]\n"
- "fmla v13.8h, v4.8h, v0.h[1]\n"
- "mov v1.d[1], x26\n"
- "fmla v16.8h, v4.8h, v0.h[2]\n"
- "mov v5.d[1], x25\n"
- "fmla v19.8h, v4.8h, v0.h[3]\n"
- "mov v6.d[1], x24\n"
- "fmla v22.8h, v4.8h, v0.h[4]\n"
- "mov v7.d[1], x23\n"
- "fmla v25.8h, v4.8h, v0.h[5]\n"
- "sub x27, x27, #0x2\n"
- "fmla v28.8h, v4.8h, v0.h[6]\n"
- "cmp x27, #0x2\n"
- "fmla v31.8h, v4.8h, v0.h[7]\n"
- "ldr d0, [%x[Apanel], #0x0]\n"
- "ldr d4, [x28, #0x20]\n"
- "mov v2.d[1], x22\n"
- "mov v3.d[1], x21\n"
- "fmla v8.8h, v5.8h, v1.h[0]\n"
- "mov v0.d[1], x20\n"
- "fmla v11.8h, v5.8h, v1.h[1]\n"
- "mov v4.d[1], x19\n"
- "fmla v14.8h, v5.8h, v1.h[2]\n"
- "fmla v17.8h, v5.8h, v1.h[3]\n"
- "fmla v20.8h, v5.8h, v1.h[4]\n"
- "fmla v23.8h, v5.8h, v1.h[5]\n"
- "fmla v26.8h, v5.8h, v1.h[6]\n"
- "fmla v29.8h, v5.8h, v1.h[7]\n"
- "fmla v9.8h, v6.8h, v1.h[0]\n"
- "fmla v12.8h, v6.8h, v1.h[1]\n"
- "fmla v15.8h, v6.8h, v1.h[2]\n"
- "fmla v18.8h, v6.8h, v1.h[3]\n"
- "fmla v21.8h, v6.8h, v1.h[4]\n"
- "fmla v24.8h, v6.8h, v1.h[5]\n"
- "fmla v27.8h, v6.8h, v1.h[6]\n"
- "fmla v30.8h, v6.8h, v1.h[7]\n"
- "fmla v10.8h, v7.8h, v1.h[0]\n"
- "fmla v13.8h, v7.8h, v1.h[1]\n"
- "fmla v16.8h, v7.8h, v1.h[2]\n"
- "fmla v19.8h, v7.8h, v1.h[3]\n"
- "fmla v22.8h, v7.8h, v1.h[4]\n"
- "fmla v25.8h, v7.8h, v1.h[5]\n"
- "fmla v28.8h, v7.8h, v1.h[6]\n"
- "fmla v31.8h, v7.8h, v1.h[7]\n"
- "bge 3b\n"
- "4:" // main loop skip
- "add %x[Apanel], %x[Apanel], #0x10\n"
- "fmla v8.8h, v2.8h, v0.h[0]\n"
- "add x28, x28, #0x30\n"
- "fmla v11.8h, v2.8h, v0.h[1]\n"
- "fmla v14.8h, v2.8h, v0.h[2]\n"
- "fmla v17.8h, v2.8h, v0.h[3]\n"
- "fmla v20.8h, v2.8h, v0.h[4]\n"
- "fmla v23.8h, v2.8h, v0.h[5]\n"
- "fmla v26.8h, v2.8h, v0.h[6]\n"
- "fmla v29.8h, v2.8h, v0.h[7]\n"
- "fmla v9.8h, v3.8h, v0.h[0]\n"
- "fmla v12.8h, v3.8h, v0.h[1]\n"
- "fmla v15.8h, v3.8h, v0.h[2]\n"
- "fmla v18.8h, v3.8h, v0.h[3]\n"
- "fmla v21.8h, v3.8h, v0.h[4]\n"
- "fmla v24.8h, v3.8h, v0.h[5]\n"
- "fmla v27.8h, v3.8h, v0.h[6]\n"
- "fmla v30.8h, v3.8h, v0.h[7]\n"
- "fmla v10.8h, v4.8h, v0.h[0]\n"
- "fmla v13.8h, v4.8h, v0.h[1]\n"
- "fmla v16.8h, v4.8h, v0.h[2]\n"
- "fmla v19.8h, v4.8h, v0.h[3]\n"
- "fmla v22.8h, v4.8h, v0.h[4]\n"
- "fmla v25.8h, v4.8h, v0.h[5]\n"
- "fmla v28.8h, v4.8h, v0.h[6]\n"
- "fmla v31.8h, v4.8h, v0.h[7]\n"
- "cbz x27, 5f\n"
- "ldr q0, [%x[Apanel], #0x0]\n"
- "add %x[Apanel], %x[Apanel], #0x10\n"
- "ldr q5, [x28, #0x0]\n"
- "fmla v8.8h, v5.8h, v0.h[0]\n"
- "ldr q6, [x28, #0x10]\n"
- "fmla v11.8h, v5.8h, v0.h[1]\n"
- "ldr q7, [x28, #0x20]\n"
- "fmla v14.8h, v5.8h, v0.h[2]\n"
- "fmla v17.8h, v5.8h, v0.h[3]\n"
- "add x28, x28, #0x30\n"
- "fmla v20.8h, v5.8h, v0.h[4]\n"
- "fmla v23.8h, v5.8h, v0.h[5]\n"
- "fmla v26.8h, v5.8h, v0.h[6]\n"
- "fmla v29.8h, v5.8h, v0.h[7]\n"
- "fmla v9.8h, v6.8h, v0.h[0]\n"
- "fmla v12.8h, v6.8h, v0.h[1]\n"
- "fmla v15.8h, v6.8h, v0.h[2]\n"
- "fmla v18.8h, v6.8h, v0.h[3]\n"
- "fmla v21.8h, v6.8h, v0.h[4]\n"
- "fmla v24.8h, v6.8h, v0.h[5]\n"
- "fmla v27.8h, v6.8h, v0.h[6]\n"
- "fmla v30.8h, v6.8h, v0.h[7]\n"
- "fmla v10.8h, v7.8h, v0.h[0]\n"
- "fmla v13.8h, v7.8h, v0.h[1]\n"
- "fmla v16.8h, v7.8h, v0.h[2]\n"
- "fmla v19.8h, v7.8h, v0.h[3]\n"
- "fmla v22.8h, v7.8h, v0.h[4]\n"
- "fmla v25.8h, v7.8h, v0.h[5]\n"
- "fmla v28.8h, v7.8h, v0.h[6]\n"
- "fmla v31.8h, v7.8h, v0.h[7]\n"
- "5:" // multiply loop done
- "subs x10, x10, #0x1\n"
- "str q8, [%x[Cpanel], #0x0]\n"
- "str q9, [%x[Cpanel], #0x10]\n"
- "str q10, [%x[Cpanel], #0x20]\n"
- "str q11, [%x[Cpanel], #0x30]\n"
- "str q12, [%x[Cpanel], #0x40]\n"
- "str q13, [%x[Cpanel], #0x50]\n"
- "str q14, [%x[Cpanel], #0x60]\n"
- "str q15, [%x[Cpanel], #0x70]\n"
- "str q16, [%x[Cpanel], #0x80]\n"
- "str q17, [%x[Cpanel], #0x90]\n"
- "str q18, [%x[Cpanel], #0xa0]\n"
- "str q19, [%x[Cpanel], #0xb0]\n"
- "str q20, [%x[Cpanel], #0xc0]\n"
- "str q21, [%x[Cpanel], #0xd0]\n"
- "str q22, [%x[Cpanel], #0xe0]\n"
- "str q23, [%x[Cpanel], #0xf0]\n"
- "str q24, [%x[Cpanel], #0x100]\n"
- "str q25, [%x[Cpanel], #0x110]\n"
- "str q26, [%x[Cpanel], #0x120]\n"
- "str q27, [%x[Cpanel], #0x130]\n"
- "str q28, [%x[Cpanel], #0x140]\n"
- "str q29, [%x[Cpanel], #0x150]\n"
- "str q30, [%x[Cpanel], #0x160]\n"
- "str q31, [%x[Cpanel], #0x170]\n"
- "add %x[Cpanel], %x[Cpanel], #0x180\n"
- "bgt 2b\n"
- "subs %x[ablocks], %x[ablocks], #0x1\n"
- "bne 1b\n"
- : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
- : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks))
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
- );
-}
-
-} // namespace arm_gemm
-#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp16_mla_8x24/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp16_mla_8x24/generic.cpp
deleted file mode 100644
index a9da6956ed..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp16_mla_8x24/generic.cpp
+++ /dev/null
@@ -1,247 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-#if defined(__aarch64__) && (defined(FP16_KERNELS) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC))
-
-#include <cstddef>
-
-namespace arm_gemm {
-
-void a64_interleaved_fp16_mla_8x24(
- const __fp16 *Apanel, const __fp16 *Bpanel,
- __fp16 *Cpanel, int ablocks, int bblocks, int K) {
-
- struct KernelArgs {
- size_t bblocks = {};
- size_t K = {};
- const __fp16 *Bpanel = {};
- } ka;
-
- ka.bblocks = bblocks;
- ka.K = (K/1) - 1;
- ka.Bpanel = Bpanel;
-
- __asm__ __volatile__(
-
- "1:" // Height loop
- "ldr x22, [%x[args_ptr], %[offsetof_bblocks]]\n"
- "mov x21, %x[Apanel]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n"
- "2:" // Width loop
- "ldr x19, [%x[args_ptr], %[offsetof_K]]\n"
- "mov %x[Apanel], x21\n"
- "cmp x19, #0x2\n"
- "movi v8.16b, #0x0\n"
- "movi v9.16b, #0x0\n"
- "prfm pldl1keep, [%x[Apanel], #0x0]\n"
- "movi v10.16b, #0x0\n"
- "movi v11.16b, #0x0\n"
- "prfm pldl1keep, [x20, #0x0]\n"
- "movi v12.16b, #0x0\n"
- "movi v13.16b, #0x0\n"
- "prfm pldl1keep, [x20, #0x40]\n"
- "movi v14.16b, #0x0\n"
- "movi v15.16b, #0x0\n"
- "prfm pldl1keep, [x20, #0x80]\n"
- "movi v16.16b, #0x0\n"
- "movi v17.16b, #0x0\n"
- "ldr q0, [%x[Apanel], #0x0]\n"
- "movi v18.16b, #0x0\n"
- "movi v19.16b, #0x0\n"
- "ldr q2, [x20, #0x0]\n"
- "movi v20.16b, #0x0\n"
- "movi v21.16b, #0x0\n"
- "ldr q3, [x20, #0x10]\n"
- "movi v22.16b, #0x0\n"
- "movi v23.16b, #0x0\n"
- "ldr q4, [x20, #0x20]\n"
- "movi v24.16b, #0x0\n"
- "movi v25.16b, #0x0\n"
- "movi v26.16b, #0x0\n"
- "movi v27.16b, #0x0\n"
- "movi v28.16b, #0x0\n"
- "movi v29.16b, #0x0\n"
- "movi v30.16b, #0x0\n"
- "movi v31.16b, #0x0\n"
- "blt 4f\n"
- "3:" // main loop head
- "fmla v8.8h, v2.8h, v0.h[0]\n"
- "fmla v11.8h, v2.8h, v0.h[1]\n"
- "ldr q1, [%x[Apanel], #0x10]\n"
- "fmla v14.8h, v2.8h, v0.h[2]\n"
- "fmla v17.8h, v2.8h, v0.h[3]\n"
- "ldr q5, [x20, #0x30]\n"
- "fmla v20.8h, v2.8h, v0.h[4]\n"
- "fmla v23.8h, v2.8h, v0.h[5]\n"
- "ldr q6, [x20, #0x40]\n"
- "fmla v26.8h, v2.8h, v0.h[6]\n"
- "fmla v29.8h, v2.8h, v0.h[7]\n"
- "ldr q7, [x20, #0x50]\n"
- "fmla v9.8h, v3.8h, v0.h[0]\n"
- "fmla v12.8h, v3.8h, v0.h[1]\n"
- "sub x19, x19, #0x2\n"
- "fmla v15.8h, v3.8h, v0.h[2]\n"
- "fmla v18.8h, v3.8h, v0.h[3]\n"
- "cmp x19, #0x2\n"
- "fmla v21.8h, v3.8h, v0.h[4]\n"
- "fmla v24.8h, v3.8h, v0.h[5]\n"
- "prfm pldl1keep, [%x[Apanel], #0x80]\n"
- "fmla v27.8h, v3.8h, v0.h[6]\n"
- "fmla v30.8h, v3.8h, v0.h[7]\n"
- "add %x[Apanel], %x[Apanel], #0x20\n"
- "fmla v10.8h, v4.8h, v0.h[0]\n"
- "fmla v13.8h, v4.8h, v0.h[1]\n"
- "prfm pldl1keep, [x20, #0x100]\n"
- "fmla v16.8h, v4.8h, v0.h[2]\n"
- "fmla v19.8h, v4.8h, v0.h[3]\n"
- "prfm pldl1keep, [x20, #0x140]\n"
- "fmla v22.8h, v4.8h, v0.h[4]\n"
- "fmla v25.8h, v4.8h, v0.h[5]\n"
- "add x20, x20, #0x60\n"
- "fmla v28.8h, v4.8h, v0.h[6]\n"
- "fmla v31.8h, v4.8h, v0.h[7]\n"
- "ldr q0, [%x[Apanel], #0x0]\n"
- "fmla v8.8h, v5.8h, v1.h[0]\n"
- "fmla v11.8h, v5.8h, v1.h[1]\n"
- "ldr q2, [x20, #0x0]\n"
- "fmla v14.8h, v5.8h, v1.h[2]\n"
- "fmla v17.8h, v5.8h, v1.h[3]\n"
- "ldr q3, [x20, #0x10]\n"
- "fmla v20.8h, v5.8h, v1.h[4]\n"
- "fmla v23.8h, v5.8h, v1.h[5]\n"
- "ldr q4, [x20, #0x20]\n"
- "fmla v26.8h, v5.8h, v1.h[6]\n"
- "fmla v29.8h, v5.8h, v1.h[7]\n"
- "fmla v9.8h, v6.8h, v1.h[0]\n"
- "fmla v12.8h, v6.8h, v1.h[1]\n"
- "fmla v15.8h, v6.8h, v1.h[2]\n"
- "fmla v18.8h, v6.8h, v1.h[3]\n"
- "fmla v21.8h, v6.8h, v1.h[4]\n"
- "fmla v24.8h, v6.8h, v1.h[5]\n"
- "fmla v27.8h, v6.8h, v1.h[6]\n"
- "fmla v30.8h, v6.8h, v1.h[7]\n"
- "fmla v10.8h, v7.8h, v1.h[0]\n"
- "fmla v13.8h, v7.8h, v1.h[1]\n"
- "fmla v16.8h, v7.8h, v1.h[2]\n"
- "fmla v19.8h, v7.8h, v1.h[3]\n"
- "fmla v22.8h, v7.8h, v1.h[4]\n"
- "fmla v25.8h, v7.8h, v1.h[5]\n"
- "fmla v28.8h, v7.8h, v1.h[6]\n"
- "fmla v31.8h, v7.8h, v1.h[7]\n"
- "bge 3b\n"
- "4:" // main loop skip
- "fmla v8.8h, v2.8h, v0.h[0]\n"
- "fmla v11.8h, v2.8h, v0.h[1]\n"
- "add %x[Apanel], %x[Apanel], #0x10\n"
- "fmla v14.8h, v2.8h, v0.h[2]\n"
- "fmla v17.8h, v2.8h, v0.h[3]\n"
- "add x20, x20, #0x30\n"
- "fmla v20.8h, v2.8h, v0.h[4]\n"
- "fmla v23.8h, v2.8h, v0.h[5]\n"
- "fmla v26.8h, v2.8h, v0.h[6]\n"
- "fmla v29.8h, v2.8h, v0.h[7]\n"
- "fmla v9.8h, v3.8h, v0.h[0]\n"
- "fmla v12.8h, v3.8h, v0.h[1]\n"
- "fmla v15.8h, v3.8h, v0.h[2]\n"
- "fmla v18.8h, v3.8h, v0.h[3]\n"
- "fmla v21.8h, v3.8h, v0.h[4]\n"
- "fmla v24.8h, v3.8h, v0.h[5]\n"
- "fmla v27.8h, v3.8h, v0.h[6]\n"
- "fmla v30.8h, v3.8h, v0.h[7]\n"
- "fmla v10.8h, v4.8h, v0.h[0]\n"
- "fmla v13.8h, v4.8h, v0.h[1]\n"
- "fmla v16.8h, v4.8h, v0.h[2]\n"
- "fmla v19.8h, v4.8h, v0.h[3]\n"
- "fmla v22.8h, v4.8h, v0.h[4]\n"
- "fmla v25.8h, v4.8h, v0.h[5]\n"
- "fmla v28.8h, v4.8h, v0.h[6]\n"
- "fmla v31.8h, v4.8h, v0.h[7]\n"
- "cbz x19, 5f\n"
- "ldr q0, [%x[Apanel], #0x0]\n"
- "ldr q5, [x20, #0x0]\n"
- "fmla v8.8h, v5.8h, v0.h[0]\n"
- "ldr q6, [x20, #0x10]\n"
- "ldr q7, [x20, #0x20]\n"
- "fmla v11.8h, v5.8h, v0.h[1]\n"
- "fmla v14.8h, v5.8h, v0.h[2]\n"
- "fmla v17.8h, v5.8h, v0.h[3]\n"
- "add %x[Apanel], %x[Apanel], #0x10\n"
- "fmla v20.8h, v5.8h, v0.h[4]\n"
- "fmla v23.8h, v5.8h, v0.h[5]\n"
- "add x20, x20, #0x30\n"
- "fmla v26.8h, v5.8h, v0.h[6]\n"
- "fmla v29.8h, v5.8h, v0.h[7]\n"
- "fmla v9.8h, v6.8h, v0.h[0]\n"
- "fmla v12.8h, v6.8h, v0.h[1]\n"
- "fmla v15.8h, v6.8h, v0.h[2]\n"
- "fmla v18.8h, v6.8h, v0.h[3]\n"
- "fmla v21.8h, v6.8h, v0.h[4]\n"
- "fmla v24.8h, v6.8h, v0.h[5]\n"
- "fmla v27.8h, v6.8h, v0.h[6]\n"
- "fmla v30.8h, v6.8h, v0.h[7]\n"
- "fmla v10.8h, v7.8h, v0.h[0]\n"
- "fmla v13.8h, v7.8h, v0.h[1]\n"
- "fmla v16.8h, v7.8h, v0.h[2]\n"
- "fmla v19.8h, v7.8h, v0.h[3]\n"
- "fmla v22.8h, v7.8h, v0.h[4]\n"
- "fmla v25.8h, v7.8h, v0.h[5]\n"
- "fmla v28.8h, v7.8h, v0.h[6]\n"
- "fmla v31.8h, v7.8h, v0.h[7]\n"
- "5:" // multiply loop done
- "subs x22, x22, #0x1\n"
- "str q8, [%x[Cpanel], #0x0]\n"
- "str q9, [%x[Cpanel], #0x10]\n"
- "str q10, [%x[Cpanel], #0x20]\n"
- "str q11, [%x[Cpanel], #0x30]\n"
- "str q12, [%x[Cpanel], #0x40]\n"
- "str q13, [%x[Cpanel], #0x50]\n"
- "str q14, [%x[Cpanel], #0x60]\n"
- "str q15, [%x[Cpanel], #0x70]\n"
- "str q16, [%x[Cpanel], #0x80]\n"
- "str q17, [%x[Cpanel], #0x90]\n"
- "str q18, [%x[Cpanel], #0xa0]\n"
- "str q19, [%x[Cpanel], #0xb0]\n"
- "str q20, [%x[Cpanel], #0xc0]\n"
- "str q21, [%x[Cpanel], #0xd0]\n"
- "str q22, [%x[Cpanel], #0xe0]\n"
- "str q23, [%x[Cpanel], #0xf0]\n"
- "str q24, [%x[Cpanel], #0x100]\n"
- "str q25, [%x[Cpanel], #0x110]\n"
- "str q26, [%x[Cpanel], #0x120]\n"
- "str q27, [%x[Cpanel], #0x130]\n"
- "str q28, [%x[Cpanel], #0x140]\n"
- "str q29, [%x[Cpanel], #0x150]\n"
- "str q30, [%x[Cpanel], #0x160]\n"
- "str q31, [%x[Cpanel], #0x170]\n"
- "add %x[Cpanel], %x[Cpanel], #0x180\n"
- "bgt 2b\n"
- "subs %x[ablocks], %x[ablocks], #0x1\n"
- "bne 1b\n"
- : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
- : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks))
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22"
- );
-}
-
-} // namespace arm_gemm
-#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp16_mla_8x24/x1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp16_mla_8x24/x1.cpp
deleted file mode 100644
index efaedeb33f..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp16_mla_8x24/x1.cpp
+++ /dev/null
@@ -1,247 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-#if defined(__aarch64__) && (defined(FP16_KERNELS) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC))
-
-#include <cstddef>
-
-namespace arm_gemm {
-
-void a64_interleaved_fp16_mla_8x24_x1(
- const __fp16 *Apanel, const __fp16 *Bpanel,
- __fp16 *Cpanel, int ablocks, int bblocks, int K) {
-
- struct KernelArgs {
- size_t bblocks = {};
- size_t K = {};
- const __fp16 *Bpanel = {};
- } ka;
-
- ka.bblocks = bblocks;
- ka.K = (K/1) - 1;
- ka.Bpanel = Bpanel;
-
- __asm__ __volatile__(
-
- "1:" // Height loop
- "ldr x22, [%x[args_ptr], %[offsetof_bblocks]]\n"
- "mov x21, %x[Apanel]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n"
- "2:" // Width loop
- "ldr x19, [%x[args_ptr], %[offsetof_K]]\n"
- "mov %x[Apanel], x21\n"
- "cmp x19, #0x2\n"
- "movi v8.16b, #0x0\n"
- "movi v9.16b, #0x0\n"
- "prfm pldl1keep, [%x[Apanel], #0x0]\n"
- "movi v10.16b, #0x0\n"
- "movi v11.16b, #0x0\n"
- "prfm pldl1keep, [x20, #0x0]\n"
- "movi v12.16b, #0x0\n"
- "movi v13.16b, #0x0\n"
- "prfm pldl1keep, [x20, #0x40]\n"
- "movi v14.16b, #0x0\n"
- "movi v15.16b, #0x0\n"
- "prfm pldl1keep, [x20, #0x80]\n"
- "movi v16.16b, #0x0\n"
- "movi v17.16b, #0x0\n"
- "ldr q0, [%x[Apanel], #0x0]\n"
- "movi v18.16b, #0x0\n"
- "movi v19.16b, #0x0\n"
- "ldr q1, [x20, #0x0]\n"
- "movi v20.16b, #0x0\n"
- "movi v21.16b, #0x0\n"
- "ldr q2, [x20, #0x10]\n"
- "movi v22.16b, #0x0\n"
- "movi v23.16b, #0x0\n"
- "ldr q3, [x20, #0x20]\n"
- "movi v24.16b, #0x0\n"
- "movi v25.16b, #0x0\n"
- "movi v26.16b, #0x0\n"
- "movi v27.16b, #0x0\n"
- "movi v28.16b, #0x0\n"
- "movi v29.16b, #0x0\n"
- "movi v30.16b, #0x0\n"
- "movi v31.16b, #0x0\n"
- "blt 4f\n"
- "3:" // main loop head
- "fmla v8.8h, v1.8h, v0.h[0]\n"
- "fmla v11.8h, v1.8h, v0.h[1]\n"
- "sub x19, x19, #0x2\n"
- "fmla v14.8h, v1.8h, v0.h[2]\n"
- "fmla v17.8h, v1.8h, v0.h[3]\n"
- "cmp x19, #0x2\n"
- "fmla v20.8h, v1.8h, v0.h[4]\n"
- "fmla v23.8h, v1.8h, v0.h[5]\n"
- "prfm pldl1keep, [%x[Apanel], #0x80]\n"
- "fmla v26.8h, v1.8h, v0.h[6]\n"
- "fmla v29.8h, v1.8h, v0.h[7]\n"
- "ldr q1, [x20, #0x30]\n"
- "fmla v9.8h, v2.8h, v0.h[0]\n"
- "fmla v12.8h, v2.8h, v0.h[1]\n"
- "prfm pldl1keep, [x20, #0x100]\n"
- "fmla v15.8h, v2.8h, v0.h[2]\n"
- "fmla v18.8h, v2.8h, v0.h[3]\n"
- "prfm pldl1keep, [x20, #0x140]\n"
- "fmla v21.8h, v2.8h, v0.h[4]\n"
- "fmla v24.8h, v2.8h, v0.h[5]\n"
- "fmla v27.8h, v2.8h, v0.h[6]\n"
- "fmla v30.8h, v2.8h, v0.h[7]\n"
- "ldr q2, [x20, #0x40]\n"
- "fmla v10.8h, v3.8h, v0.h[0]\n"
- "fmla v13.8h, v3.8h, v0.h[1]\n"
- "fmla v16.8h, v3.8h, v0.h[2]\n"
- "fmla v19.8h, v3.8h, v0.h[3]\n"
- "fmla v22.8h, v3.8h, v0.h[4]\n"
- "fmla v25.8h, v3.8h, v0.h[5]\n"
- "fmla v28.8h, v3.8h, v0.h[6]\n"
- "fmla v31.8h, v3.8h, v0.h[7]\n"
- "ldr q0, [%x[Apanel], #0x10]\n"
- "ldr q3, [x20, #0x50]\n"
- "add %x[Apanel], %x[Apanel], #0x20\n"
- "add x20, x20, #0x60\n"
- "fmla v8.8h, v1.8h, v0.h[0]\n"
- "fmla v11.8h, v1.8h, v0.h[1]\n"
- "fmla v14.8h, v1.8h, v0.h[2]\n"
- "fmla v17.8h, v1.8h, v0.h[3]\n"
- "fmla v20.8h, v1.8h, v0.h[4]\n"
- "fmla v23.8h, v1.8h, v0.h[5]\n"
- "fmla v26.8h, v1.8h, v0.h[6]\n"
- "fmla v29.8h, v1.8h, v0.h[7]\n"
- "ldr q1, [x20, #0x0]\n"
- "fmla v9.8h, v2.8h, v0.h[0]\n"
- "fmla v12.8h, v2.8h, v0.h[1]\n"
- "fmla v15.8h, v2.8h, v0.h[2]\n"
- "fmla v18.8h, v2.8h, v0.h[3]\n"
- "fmla v21.8h, v2.8h, v0.h[4]\n"
- "fmla v24.8h, v2.8h, v0.h[5]\n"
- "fmla v27.8h, v2.8h, v0.h[6]\n"
- "fmla v30.8h, v2.8h, v0.h[7]\n"
- "ldr q2, [x20, #0x10]\n"
- "fmla v10.8h, v3.8h, v0.h[0]\n"
- "fmla v13.8h, v3.8h, v0.h[1]\n"
- "fmla v16.8h, v3.8h, v0.h[2]\n"
- "fmla v19.8h, v3.8h, v0.h[3]\n"
- "fmla v22.8h, v3.8h, v0.h[4]\n"
- "fmla v25.8h, v3.8h, v0.h[5]\n"
- "fmla v28.8h, v3.8h, v0.h[6]\n"
- "fmla v31.8h, v3.8h, v0.h[7]\n"
- "ldr q0, [%x[Apanel], #0x0]\n"
- "ldr q3, [x20, #0x20]\n"
- "bge 3b\n"
- "4:" // main loop skip
- "fmla v8.8h, v1.8h, v0.h[0]\n"
- "fmla v11.8h, v1.8h, v0.h[1]\n"
- "add %x[Apanel], %x[Apanel], #0x10\n"
- "fmla v14.8h, v1.8h, v0.h[2]\n"
- "fmla v17.8h, v1.8h, v0.h[3]\n"
- "add x20, x20, #0x30\n"
- "fmla v20.8h, v1.8h, v0.h[4]\n"
- "fmla v23.8h, v1.8h, v0.h[5]\n"
- "fmla v26.8h, v1.8h, v0.h[6]\n"
- "fmla v29.8h, v1.8h, v0.h[7]\n"
- "fmla v9.8h, v2.8h, v0.h[0]\n"
- "fmla v12.8h, v2.8h, v0.h[1]\n"
- "fmla v15.8h, v2.8h, v0.h[2]\n"
- "fmla v18.8h, v2.8h, v0.h[3]\n"
- "fmla v21.8h, v2.8h, v0.h[4]\n"
- "fmla v24.8h, v2.8h, v0.h[5]\n"
- "fmla v27.8h, v2.8h, v0.h[6]\n"
- "fmla v30.8h, v2.8h, v0.h[7]\n"
- "fmla v10.8h, v3.8h, v0.h[0]\n"
- "fmla v13.8h, v3.8h, v0.h[1]\n"
- "fmla v16.8h, v3.8h, v0.h[2]\n"
- "fmla v19.8h, v3.8h, v0.h[3]\n"
- "fmla v22.8h, v3.8h, v0.h[4]\n"
- "fmla v25.8h, v3.8h, v0.h[5]\n"
- "fmla v28.8h, v3.8h, v0.h[6]\n"
- "fmla v31.8h, v3.8h, v0.h[7]\n"
- "cbz x19, 5f\n"
- "ldr q0, [%x[Apanel], #0x0]\n"
- "ldr q4, [x20, #0x0]\n"
- "fmla v8.8h, v4.8h, v0.h[0]\n"
- "ldr q5, [x20, #0x10]\n"
- "ldr q6, [x20, #0x20]\n"
- "fmla v11.8h, v4.8h, v0.h[1]\n"
- "fmla v14.8h, v4.8h, v0.h[2]\n"
- "fmla v17.8h, v4.8h, v0.h[3]\n"
- "add %x[Apanel], %x[Apanel], #0x10\n"
- "fmla v20.8h, v4.8h, v0.h[4]\n"
- "fmla v23.8h, v4.8h, v0.h[5]\n"
- "add x20, x20, #0x30\n"
- "fmla v26.8h, v4.8h, v0.h[6]\n"
- "fmla v29.8h, v4.8h, v0.h[7]\n"
- "fmla v9.8h, v5.8h, v0.h[0]\n"
- "fmla v12.8h, v5.8h, v0.h[1]\n"
- "fmla v15.8h, v5.8h, v0.h[2]\n"
- "fmla v18.8h, v5.8h, v0.h[3]\n"
- "fmla v21.8h, v5.8h, v0.h[4]\n"
- "fmla v24.8h, v5.8h, v0.h[5]\n"
- "fmla v27.8h, v5.8h, v0.h[6]\n"
- "fmla v30.8h, v5.8h, v0.h[7]\n"
- "fmla v10.8h, v6.8h, v0.h[0]\n"
- "fmla v13.8h, v6.8h, v0.h[1]\n"
- "fmla v16.8h, v6.8h, v0.h[2]\n"
- "fmla v19.8h, v6.8h, v0.h[3]\n"
- "fmla v22.8h, v6.8h, v0.h[4]\n"
- "fmla v25.8h, v6.8h, v0.h[5]\n"
- "fmla v28.8h, v6.8h, v0.h[6]\n"
- "fmla v31.8h, v6.8h, v0.h[7]\n"
- "5:" // multiply loop done
- "subs x22, x22, #0x1\n"
- "str q8, [%x[Cpanel], #0x0]\n"
- "str q9, [%x[Cpanel], #0x10]\n"
- "str q10, [%x[Cpanel], #0x20]\n"
- "str q11, [%x[Cpanel], #0x30]\n"
- "str q12, [%x[Cpanel], #0x40]\n"
- "str q13, [%x[Cpanel], #0x50]\n"
- "str q14, [%x[Cpanel], #0x60]\n"
- "str q15, [%x[Cpanel], #0x70]\n"
- "str q16, [%x[Cpanel], #0x80]\n"
- "str q17, [%x[Cpanel], #0x90]\n"
- "str q18, [%x[Cpanel], #0xa0]\n"
- "str q19, [%x[Cpanel], #0xb0]\n"
- "str q20, [%x[Cpanel], #0xc0]\n"
- "str q21, [%x[Cpanel], #0xd0]\n"
- "str q22, [%x[Cpanel], #0xe0]\n"
- "str q23, [%x[Cpanel], #0xf0]\n"
- "str q24, [%x[Cpanel], #0x100]\n"
- "str q25, [%x[Cpanel], #0x110]\n"
- "str q26, [%x[Cpanel], #0x120]\n"
- "str q27, [%x[Cpanel], #0x130]\n"
- "str q28, [%x[Cpanel], #0x140]\n"
- "str q29, [%x[Cpanel], #0x150]\n"
- "str q30, [%x[Cpanel], #0x160]\n"
- "str q31, [%x[Cpanel], #0x170]\n"
- "add %x[Cpanel], %x[Cpanel], #0x180\n"
- "bgt 2b\n"
- "subs %x[ablocks], %x[ablocks], #0x1\n"
- "bne 1b\n"
- : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
- : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks))
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22"
- );
-}
-
-} // namespace arm_gemm
-#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp32_mla_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp32_mla_8x12.hpp
deleted file mode 100644
index 465a5b4e0f..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp32_mla_8x12.hpp
+++ /dev/null
@@ -1,115 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-#pragma once
-
-#ifdef __aarch64__
-#include "../std_transforms_fixed.hpp"
-#include "../performance_parameters.hpp"
-
-#define ARGLIST \
- const float *, const float *, \
- float *, int, int, int
-
-namespace arm_gemm
-{
-// Actual kernel implementations
-void a64_interleaved_fp32_mla_8x12( ARGLIST );
-void a64_interleaved_fp32_mla_8x12_a55( ARGLIST );
-void a64_interleaved_fp32_mla_8x12_x1( ARGLIST );
-
-class cls_a64_interleaved_fp32_mla_8x12
-{
-public:
- typedef float operand_type;
- typedef float result_type;
-
- typedef void (*kern_type)( ARGLIST );
-
- /* Kernel blocking parameters */
- static constexpr unsigned int out_height()
- {
- return 8;
- }
-
- static unsigned int out_width()
- {
- return 12;
- }
-
- static unsigned int stripe_width()
- {
- return 4;
- }
-
- static constexpr unsigned int k_unroll()
- {
- return 1;
- }
-
-
- StdTransformsFixed<operand_type, result_type, 8, 12, 1> transforms = {};
- StdTransformsFixed<operand_type, result_type, 8, 12, 1, true> transforms_quantized = {};
- template<typename T>
- static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
- {
-
- if (std::is_same<T, float>::value) {
- switch (ci->get_cpu_model()) {
- case CPUModel::A55r1:
- return { 3.954, 1.252, 1.141 };
- default:
- return { 7.2307, 3.876, 2.932 };
- case CPUModel::A73:
- return { 2.885, 1.429, 1.163 };
- case CPUModel::A53:
- return { 2.7, 0.9, 0.8 };
- }
- }
-
- return { 1.0 };
- }
-
- // Default to the generic kernel
- kern_type kernel=a64_interleaved_fp32_mla_8x12;
- cls_a64_interleaved_fp32_mla_8x12(const CPUInfo *ci)
- {
- switch(ci->get_cpu_model()) {
- default:
- break;
- case CPUModel::A55r1:
- case CPUModel::A53:
- kernel=a64_interleaved_fp32_mla_8x12_a55;
- break;
- case CPUModel::X1:
- kernel=a64_interleaved_fp32_mla_8x12_x1;
- break;
- }
- }
-};
-
-} // namespace arm_gemm
-
-#undef ARGLIST
-
-#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp32_mla_8x12/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp32_mla_8x12/a55.cpp
deleted file mode 100644
index 46d9ff73b9..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp32_mla_8x12/a55.cpp
+++ /dev/null
@@ -1,360 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-#ifdef __aarch64__
-
-#include <cstddef>
-
-namespace arm_gemm {
-
-void a64_interleaved_fp32_mla_8x12_a55(
- const float *Apanel, const float *Bpanel,
- float *Cpanel, int ablocks, int bblocks, int K) {
-
- struct KernelArgs {
- size_t bblocks = {};
- size_t K = {};
- const float *Bpanel = {};
- } ka;
-
- ka.bblocks = bblocks;
- ka.K = (K/1) - 1;
- ka.Bpanel = Bpanel;
-
- __asm__ __volatile__(
-
- "1:" // Height loop
- "ldr x28, [%x[args_ptr], %[offsetof_bblocks]]\n"
- "mov x27, %x[Apanel]\n"
- "ldr x26, [%x[args_ptr], %[offsetof_Bpanel]]\n"
- "2:" // Width loop
- "ldr x25, [%x[args_ptr], %[offsetof_K]]\n"
- "mov %x[Apanel], x27\n"
- "cmp x25, #0x4\n"
- "movi v8.16b, #0x0\n"
- "movi v9.16b, #0x0\n"
- "prfm pldl1keep, [%x[Apanel], #0x0]\n"
- "movi v10.16b, #0x0\n"
- "prfm pldl1keep, [x26, #0x0]\n"
- "movi v11.16b, #0x0\n"
- "prfm pldl1keep, [x26, #0x40]\n"
- "movi v12.16b, #0x0\n"
- "prfm pldl1keep, [%x[Apanel], #0x40]\n"
- "movi v13.16b, #0x0\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "movi v14.16b, #0x0\n"
- "prfm pldl1keep, [%x[Apanel], #0x80]\n"
- "movi v15.16b, #0x0\n"
- "prfm pldl1keep, [x26, #0xc0]\n"
- "movi v16.16b, #0x0\n"
- "prfm pldl1keep, [x26, #0x100]\n"
- "movi v17.16b, #0x0\n"
- "prfm pldl1keep, [%x[Apanel], #0xc0]\n"
- "movi v18.16b, #0x0\n"
- "prfm pldl1keep, [x26, #0x140]\n"
- "movi v19.16b, #0x0\n"
- "ldr q0, [%x[Apanel], #0x0]\n"
- "movi v20.16b, #0x0\n"
- "ldr q1, [%x[Apanel], #0x10]\n"
- "movi v21.16b, #0x0\n"
- "ldr q4, [x26, #0x0]\n"
- "movi v22.16b, #0x0\n"
- "ldr q5, [x26, #0x10]\n"
- "movi v23.16b, #0x0\n"
- "ldr q6, [x26, #0x20]\n"
- "movi v24.16b, #0x0\n"
- "movi v25.16b, #0x0\n"
- "movi v26.16b, #0x0\n"
- "movi v27.16b, #0x0\n"
- "movi v28.16b, #0x0\n"
- "movi v29.16b, #0x0\n"
- "movi v30.16b, #0x0\n"
- "movi v31.16b, #0x0\n"
- "blt 4f\n"
- "3:" // main loop head
- "ldr d2, [%x[Apanel], #0x20]\n"
- "fmla v8.4s, v4.4s, v0.s[0]\n"
- "ldr x21, [%x[Apanel], #0x28]\n"
- "fmla v11.4s, v4.4s, v0.s[1]\n"
- "ldr d3, [%x[Apanel], #0x30]\n"
- "fmla v14.4s, v4.4s, v0.s[2]\n"
- "ldr x20, [%x[Apanel], #0x38]\n"
- "fmla v17.4s, v4.4s, v0.s[3]\n"
- "ldr d7, [x26, #0x30]\n"
- "fmla v20.4s, v4.4s, v1.s[0]\n"
- "ldr x24, [x26, #0x38]\n"
- "fmla v23.4s, v4.4s, v1.s[1]\n"
- "fmla v26.4s, v4.4s, v1.s[2]\n"
- "ldr x23, [x26, #0x48]\n"
- "fmla v29.4s, v4.4s, v1.s[3]\n"
- "ldr d4, [x26, #0x40]\n"
- "fmla v9.4s, v5.4s, v0.s[0]\n"
- "mov v2.d[1], x21\n"
- "fmla v12.4s, v5.4s, v0.s[1]\n"
- "mov v3.d[1], x20\n"
- "fmla v15.4s, v5.4s, v0.s[2]\n"
- "mov v7.d[1], x24\n"
- "fmla v18.4s, v5.4s, v0.s[3]\n"
- "mov v4.d[1], x23\n"
- "fmla v21.4s, v5.4s, v1.s[0]\n"
- "ldr x22, [x26, #0x58]\n"
- "fmla v24.4s, v5.4s, v1.s[1]\n"
- "ldr x21, [%x[Apanel], #0x48]\n"
- "fmla v27.4s, v5.4s, v1.s[2]\n"
- "ldr x20, [%x[Apanel], #0x58]\n"
- "fmla v30.4s, v5.4s, v1.s[3]\n"
- "ldr d5, [x26, #0x50]\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
- "ldr x19, [x26, #0x68]\n"
- "fmla v13.4s, v6.4s, v0.s[1]\n"
- "ldr x24, [x26, #0x78]\n"
- "fmla v16.4s, v6.4s, v0.s[2]\n"
- "mov v5.d[1], x22\n"
- "fmla v19.4s, v6.4s, v0.s[3]\n"
- "ldr d0, [%x[Apanel], #0x40]\n"
- "fmla v22.4s, v6.4s, v1.s[0]\n"
- "mov v0.d[1], x21\n"
- "fmla v25.4s, v6.4s, v1.s[1]\n"
- "ldr x23, [x26, #0x88]\n"
- "fmla v28.4s, v6.4s, v1.s[2]\n"
- "ldr x21, [%x[Apanel], #0x68]\n"
- "fmla v31.4s, v6.4s, v1.s[3]\n"
- "ldr d1, [%x[Apanel], #0x50]\n"
- "ldr d6, [x26, #0x60]\n"
- "fmla v8.4s, v7.4s, v2.s[0]\n"
- "fmla v11.4s, v7.4s, v2.s[1]\n"
- "mov v1.d[1], x20\n"
- "fmla v14.4s, v7.4s, v2.s[2]\n"
- "mov v6.d[1], x19\n"
- "fmla v17.4s, v7.4s, v2.s[3]\n"
- "ldr x20, [%x[Apanel], #0x78]\n"
- "fmla v20.4s, v7.4s, v3.s[0]\n"
- "ldr x22, [x26, #0x98]\n"
- "fmla v23.4s, v7.4s, v3.s[1]\n"
- "ldr x19, [x26, #0xa8]\n"
- "fmla v26.4s, v7.4s, v3.s[2]\n"
- "prfm pldl1keep, [%x[Apanel], #0x100]\n"
- "fmla v29.4s, v7.4s, v3.s[3]\n"
- "ldr d7, [x26, #0x70]\n"
- "fmla v9.4s, v4.4s, v2.s[0]\n"
- "mov v7.d[1], x24\n"
- "fmla v12.4s, v4.4s, v2.s[1]\n"
- "ldr x24, [x26, #0xb8]\n"
- "fmla v15.4s, v4.4s, v2.s[2]\n"
- "prfm pldl1keep, [x26, #0x180]\n"
- "fmla v18.4s, v4.4s, v2.s[3]\n"
- "prfm pldl1keep, [x26, #0x1c0]\n"
- "fmla v21.4s, v4.4s, v3.s[0]\n"
- "prfm pldl1keep, [%x[Apanel], #0x140]\n"
- "fmla v24.4s, v4.4s, v3.s[1]\n"
- "prfm pldl1keep, [x26, #0x200]\n"
- "fmla v27.4s, v4.4s, v3.s[2]\n"
- "sub x25, x25, #0x4\n"
- "fmla v30.4s, v4.4s, v3.s[3]\n"
- "ldr d4, [x26, #0x80]\n"
- "fmla v10.4s, v5.4s, v2.s[0]\n"
- "mov v4.d[1], x23\n"
- "fmla v13.4s, v5.4s, v2.s[1]\n"
- "cmp x25, #0x4\n"
- "fmla v16.4s, v5.4s, v2.s[2]\n"
- "fmla v19.4s, v5.4s, v2.s[3]\n"
- "ldr d2, [%x[Apanel], #0x60]\n"
- "fmla v22.4s, v5.4s, v3.s[0]\n"
- "mov v2.d[1], x21\n"
- "fmla v25.4s, v5.4s, v3.s[1]\n"
- "fmla v28.4s, v5.4s, v3.s[2]\n"
- "fmla v31.4s, v5.4s, v3.s[3]\n"
- "ldr d3, [%x[Apanel], #0x70]\n"
- "fmla v8.4s, v6.4s, v0.s[0]\n"
- "ldr d5, [x26, #0x90]\n"
- "fmla v11.4s, v6.4s, v0.s[1]\n"
- "mov v3.d[1], x20\n"
- "fmla v14.4s, v6.4s, v0.s[2]\n"
- "mov v5.d[1], x22\n"
- "fmla v17.4s, v6.4s, v0.s[3]\n"
- "add %x[Apanel], %x[Apanel], #0x80\n"
- "fmla v20.4s, v6.4s, v1.s[0]\n"
- "ldr x21, [%x[Apanel], #0x8]\n"
- "fmla v23.4s, v6.4s, v1.s[1]\n"
- "ldr x20, [%x[Apanel], #0x18]\n"
- "fmla v26.4s, v6.4s, v1.s[2]\n"
- "fmla v29.4s, v6.4s, v1.s[3]\n"
- "ldr d6, [x26, #0xa0]\n"
- "fmla v9.4s, v7.4s, v0.s[0]\n"
- "mov v6.d[1], x19\n"
- "fmla v12.4s, v7.4s, v0.s[1]\n"
- "fmla v15.4s, v7.4s, v0.s[2]\n"
- "fmla v18.4s, v7.4s, v0.s[3]\n"
- "fmla v21.4s, v7.4s, v1.s[0]\n"
- "fmla v24.4s, v7.4s, v1.s[1]\n"
- "fmla v27.4s, v7.4s, v1.s[2]\n"
- "fmla v30.4s, v7.4s, v1.s[3]\n"
- "ldr d7, [x26, #0xb0]\n"
- "fmla v10.4s, v4.4s, v0.s[0]\n"
- "add x26, x26, #0xc0\n"
- "fmla v13.4s, v4.4s, v0.s[1]\n"
- "ldr x23, [x26, #0x8]\n"
- "fmla v16.4s, v4.4s, v0.s[2]\n"
- "ldr x22, [x26, #0x18]\n"
- "fmla v19.4s, v4.4s, v0.s[3]\n"
- "ldr d0, [%x[Apanel], #0x0]\n"
- "fmla v22.4s, v4.4s, v1.s[0]\n"
- "ldr x19, [x26, #0x28]\n"
- "fmla v25.4s, v4.4s, v1.s[1]\n"
- "mov v7.d[1], x24\n"
- "fmla v28.4s, v4.4s, v1.s[2]\n"
- "mov v0.d[1], x21\n"
- "fmla v31.4s, v4.4s, v1.s[3]\n"
- "ldr d1, [%x[Apanel], #0x10]\n"
- "fmla v8.4s, v5.4s, v2.s[0]\n"
- "ldr d4, [x26, #0x0]\n"
- "fmla v11.4s, v5.4s, v2.s[1]\n"
- "mov v1.d[1], x20\n"
- "fmla v14.4s, v5.4s, v2.s[2]\n"
- "mov v4.d[1], x23\n"
- "fmla v17.4s, v5.4s, v2.s[3]\n"
- "fmla v20.4s, v5.4s, v3.s[0]\n"
- "fmla v23.4s, v5.4s, v3.s[1]\n"
- "fmla v26.4s, v5.4s, v3.s[2]\n"
- "fmla v29.4s, v5.4s, v3.s[3]\n"
- "ldr d5, [x26, #0x10]\n"
- "fmla v9.4s, v6.4s, v2.s[0]\n"
- "mov v5.d[1], x22\n"
- "fmla v12.4s, v6.4s, v2.s[1]\n"
- "fmla v15.4s, v6.4s, v2.s[2]\n"
- "fmla v18.4s, v6.4s, v2.s[3]\n"
- "fmla v21.4s, v6.4s, v3.s[0]\n"
- "fmla v24.4s, v6.4s, v3.s[1]\n"
- "fmla v27.4s, v6.4s, v3.s[2]\n"
- "fmla v30.4s, v6.4s, v3.s[3]\n"
- "ldr d6, [x26, #0x20]\n"
- "mov v6.d[1], x19\n"
- "fmla v10.4s, v7.4s, v2.s[0]\n"
- "fmla v13.4s, v7.4s, v2.s[1]\n"
- "fmla v16.4s, v7.4s, v2.s[2]\n"
- "fmla v19.4s, v7.4s, v2.s[3]\n"
- "fmla v22.4s, v7.4s, v3.s[0]\n"
- "fmla v25.4s, v7.4s, v3.s[1]\n"
- "fmla v28.4s, v7.4s, v3.s[2]\n"
- "fmla v31.4s, v7.4s, v3.s[3]\n"
- "bge 3b\n"
- "4:" // main loop skip
- "add %x[Apanel], %x[Apanel], #0x20\n"
- "fmla v8.4s, v4.4s, v0.s[0]\n"
- "add x26, x26, #0x30\n"
- "fmla v11.4s, v4.4s, v0.s[1]\n"
- "fmla v14.4s, v4.4s, v0.s[2]\n"
- "fmla v17.4s, v4.4s, v0.s[3]\n"
- "fmla v20.4s, v4.4s, v1.s[0]\n"
- "fmla v23.4s, v4.4s, v1.s[1]\n"
- "fmla v26.4s, v4.4s, v1.s[2]\n"
- "fmla v29.4s, v4.4s, v1.s[3]\n"
- "fmla v9.4s, v5.4s, v0.s[0]\n"
- "fmla v12.4s, v5.4s, v0.s[1]\n"
- "fmla v15.4s, v5.4s, v0.s[2]\n"
- "fmla v18.4s, v5.4s, v0.s[3]\n"
- "fmla v21.4s, v5.4s, v1.s[0]\n"
- "fmla v24.4s, v5.4s, v1.s[1]\n"
- "fmla v27.4s, v5.4s, v1.s[2]\n"
- "fmla v30.4s, v5.4s, v1.s[3]\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
- "fmla v13.4s, v6.4s, v0.s[1]\n"
- "fmla v16.4s, v6.4s, v0.s[2]\n"
- "fmla v19.4s, v6.4s, v0.s[3]\n"
- "fmla v22.4s, v6.4s, v1.s[0]\n"
- "fmla v25.4s, v6.4s, v1.s[1]\n"
- "fmla v28.4s, v6.4s, v1.s[2]\n"
- "fmla v31.4s, v6.4s, v1.s[3]\n"
- "cbz x25, 6f\n"
- "5:" // odd loop
- "ldr q0, [%x[Apanel], #0x0]\n"
- "subs x25, x25, #0x1\n"
- "ldr q1, [%x[Apanel], #0x10]\n"
- "add %x[Apanel], %x[Apanel], #0x20\n"
- "ldr q7, [x26, #0x0]\n"
- "fmla v8.4s, v7.4s, v0.s[0]\n"
- "ldr q4, [x26, #0x10]\n"
- "fmla v11.4s, v7.4s, v0.s[1]\n"
- "ldr q5, [x26, #0x20]\n"
- "fmla v14.4s, v7.4s, v0.s[2]\n"
- "fmla v17.4s, v7.4s, v0.s[3]\n"
- "add x26, x26, #0x30\n"
- "fmla v20.4s, v7.4s, v1.s[0]\n"
- "fmla v23.4s, v7.4s, v1.s[1]\n"
- "fmla v26.4s, v7.4s, v1.s[2]\n"
- "fmla v29.4s, v7.4s, v1.s[3]\n"
- "fmla v9.4s, v4.4s, v0.s[0]\n"
- "fmla v12.4s, v4.4s, v0.s[1]\n"
- "fmla v15.4s, v4.4s, v0.s[2]\n"
- "fmla v18.4s, v4.4s, v0.s[3]\n"
- "fmla v21.4s, v4.4s, v1.s[0]\n"
- "fmla v24.4s, v4.4s, v1.s[1]\n"
- "fmla v27.4s, v4.4s, v1.s[2]\n"
- "fmla v30.4s, v4.4s, v1.s[3]\n"
- "fmla v10.4s, v5.4s, v0.s[0]\n"
- "fmla v13.4s, v5.4s, v0.s[1]\n"
- "fmla v16.4s, v5.4s, v0.s[2]\n"
- "fmla v19.4s, v5.4s, v0.s[3]\n"
- "fmla v22.4s, v5.4s, v1.s[0]\n"
- "fmla v25.4s, v5.4s, v1.s[1]\n"
- "fmla v28.4s, v5.4s, v1.s[2]\n"
- "fmla v31.4s, v5.4s, v1.s[3]\n"
- "bne 5b\n"
- "6:" // multiply loop done
- "subs x28, x28, #0x1\n"
- "str q8, [%x[Cpanel], #0x0]\n"
- "str q9, [%x[Cpanel], #0x10]\n"
- "str q10, [%x[Cpanel], #0x20]\n"
- "str q11, [%x[Cpanel], #0x30]\n"
- "str q12, [%x[Cpanel], #0x40]\n"
- "str q13, [%x[Cpanel], #0x50]\n"
- "str q14, [%x[Cpanel], #0x60]\n"
- "str q15, [%x[Cpanel], #0x70]\n"
- "str q16, [%x[Cpanel], #0x80]\n"
- "str q17, [%x[Cpanel], #0x90]\n"
- "str q18, [%x[Cpanel], #0xa0]\n"
- "str q19, [%x[Cpanel], #0xb0]\n"
- "str q20, [%x[Cpanel], #0xc0]\n"
- "str q21, [%x[Cpanel], #0xd0]\n"
- "str q22, [%x[Cpanel], #0xe0]\n"
- "str q23, [%x[Cpanel], #0xf0]\n"
- "str q24, [%x[Cpanel], #0x100]\n"
- "str q25, [%x[Cpanel], #0x110]\n"
- "str q26, [%x[Cpanel], #0x120]\n"
- "str q27, [%x[Cpanel], #0x130]\n"
- "str q28, [%x[Cpanel], #0x140]\n"
- "str q29, [%x[Cpanel], #0x150]\n"
- "str q30, [%x[Cpanel], #0x160]\n"
- "str q31, [%x[Cpanel], #0x170]\n"
- "add %x[Cpanel], %x[Cpanel], #0x180\n"
- "bgt 2b\n"
- "subs %x[ablocks], %x[ablocks], #0x1\n"
- "bne 1b\n"
- : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
- : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks))
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
- );
-}
-
-} // namespace arm_gemm
-#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp32_mla_8x12/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp32_mla_8x12/generic.cpp
deleted file mode 100644
index 06dc1534c1..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp32_mla_8x12/generic.cpp
+++ /dev/null
@@ -1,320 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-#ifdef __aarch64__
-
-#include <cstddef>
-
-namespace arm_gemm {
-
-void a64_interleaved_fp32_mla_8x12(
- const float *Apanel, const float *Bpanel,
- float *Cpanel, int ablocks, int bblocks, int K) {
-
- struct KernelArgs {
- size_t bblocks = {};
- size_t K = {};
- const float *Bpanel = {};
- } ka;
-
- ka.bblocks = bblocks;
- ka.K = (K/1) - 1;
- ka.Bpanel = Bpanel;
-
- __asm__ __volatile__(
-
- "1:" // Height loop
- "ldr x22, [%x[args_ptr], %[offsetof_bblocks]]\n"
- "mov x21, %x[Apanel]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n"
- "2:" // Width loop
- "ldr x19, [%x[args_ptr], %[offsetof_K]]\n"
- "mov %x[Apanel], x21\n"
- "cmp x19, #0x4\n"
- "movi v8.16b, #0x0\n"
- "movi v9.16b, #0x0\n"
- "prfm pldl1keep, [%x[Apanel], #0x0]\n"
- "movi v10.16b, #0x0\n"
- "movi v11.16b, #0x0\n"
- "prfm pldl1keep, [x20, #0x0]\n"
- "movi v12.16b, #0x0\n"
- "movi v13.16b, #0x0\n"
- "prfm pldl1keep, [x20, #0x40]\n"
- "movi v14.16b, #0x0\n"
- "movi v15.16b, #0x0\n"
- "prfm pldl1keep, [%x[Apanel], #0x40]\n"
- "movi v16.16b, #0x0\n"
- "movi v17.16b, #0x0\n"
- "prfm pldl1keep, [x20, #0x80]\n"
- "movi v18.16b, #0x0\n"
- "movi v19.16b, #0x0\n"
- "prfm pldl1keep, [%x[Apanel], #0x80]\n"
- "movi v20.16b, #0x0\n"
- "movi v21.16b, #0x0\n"
- "prfm pldl1keep, [x20, #0xc0]\n"
- "movi v22.16b, #0x0\n"
- "movi v23.16b, #0x0\n"
- "prfm pldl1keep, [x20, #0x100]\n"
- "movi v24.16b, #0x0\n"
- "movi v25.16b, #0x0\n"
- "prfm pldl1keep, [%x[Apanel], #0xc0]\n"
- "movi v26.16b, #0x0\n"
- "movi v27.16b, #0x0\n"
- "prfm pldl1keep, [x20, #0x140]\n"
- "movi v28.16b, #0x0\n"
- "movi v29.16b, #0x0\n"
- "ldr q0, [%x[Apanel], #0x0]\n"
- "movi v30.16b, #0x0\n"
- "movi v31.16b, #0x0\n"
- "ldr q1, [%x[Apanel], #0x10]\n"
- "ldr q4, [x20, #0x0]\n"
- "ldr q5, [x20, #0x10]\n"
- "ldr q6, [x20, #0x20]\n"
- "blt 4f\n"
- "3:" // main loop head
- "fmla v8.4s, v4.4s, v0.s[0]\n"
- "fmla v11.4s, v4.4s, v0.s[1]\n"
- "ldr q2, [%x[Apanel], #0x20]\n"
- "fmla v14.4s, v4.4s, v0.s[2]\n"
- "fmla v17.4s, v4.4s, v0.s[3]\n"
- "ldr q3, [%x[Apanel], #0x30]\n"
- "fmla v20.4s, v4.4s, v1.s[0]\n"
- "fmla v23.4s, v4.4s, v1.s[1]\n"
- "ldr q7, [x20, #0x30]\n"
- "fmla v26.4s, v4.4s, v1.s[2]\n"
- "fmla v29.4s, v4.4s, v1.s[3]\n"
- "ldr q4, [x20, #0x40]\n"
- "fmla v9.4s, v5.4s, v0.s[0]\n"
- "fmla v12.4s, v5.4s, v0.s[1]\n"
- "sub x19, x19, #0x4\n"
- "fmla v15.4s, v5.4s, v0.s[2]\n"
- "fmla v18.4s, v5.4s, v0.s[3]\n"
- "cmp x19, #0x4\n"
- "fmla v21.4s, v5.4s, v1.s[0]\n"
- "fmla v24.4s, v5.4s, v1.s[1]\n"
- "prfm pldl1keep, [%x[Apanel], #0x100]\n"
- "fmla v27.4s, v5.4s, v1.s[2]\n"
- "fmla v30.4s, v5.4s, v1.s[3]\n"
- "ldr q5, [x20, #0x50]\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
- "fmla v13.4s, v6.4s, v0.s[1]\n"
- "prfm pldl1keep, [x20, #0x180]\n"
- "fmla v16.4s, v6.4s, v0.s[2]\n"
- "fmla v19.4s, v6.4s, v0.s[3]\n"
- "ldr q0, [%x[Apanel], #0x40]\n"
- "fmla v22.4s, v6.4s, v1.s[0]\n"
- "fmla v25.4s, v6.4s, v1.s[1]\n"
- "prfm pldl1keep, [x20, #0x1c0]\n"
- "fmla v28.4s, v6.4s, v1.s[2]\n"
- "fmla v31.4s, v6.4s, v1.s[3]\n"
- "ldr q1, [%x[Apanel], #0x50]\n"
- "fmla v8.4s, v7.4s, v2.s[0]\n"
- "fmla v11.4s, v7.4s, v2.s[1]\n"
- "ldr q6, [x20, #0x60]\n"
- "fmla v14.4s, v7.4s, v2.s[2]\n"
- "fmla v17.4s, v7.4s, v2.s[3]\n"
- "prfm pldl1keep, [%x[Apanel], #0x140]\n"
- "fmla v20.4s, v7.4s, v3.s[0]\n"
- "fmla v23.4s, v7.4s, v3.s[1]\n"
- "prfm pldl1keep, [x20, #0x200]\n"
- "fmla v26.4s, v7.4s, v3.s[2]\n"
- "fmla v29.4s, v7.4s, v3.s[3]\n"
- "ldr q7, [x20, #0x70]\n"
- "fmla v9.4s, v4.4s, v2.s[0]\n"
- "fmla v12.4s, v4.4s, v2.s[1]\n"
- "fmla v15.4s, v4.4s, v2.s[2]\n"
- "fmla v18.4s, v4.4s, v2.s[3]\n"
- "fmla v21.4s, v4.4s, v3.s[0]\n"
- "fmla v24.4s, v4.4s, v3.s[1]\n"
- "fmla v27.4s, v4.4s, v3.s[2]\n"
- "fmla v30.4s, v4.4s, v3.s[3]\n"
- "ldr q4, [x20, #0x80]\n"
- "fmla v10.4s, v5.4s, v2.s[0]\n"
- "fmla v13.4s, v5.4s, v2.s[1]\n"
- "fmla v16.4s, v5.4s, v2.s[2]\n"
- "fmla v19.4s, v5.4s, v2.s[3]\n"
- "ldr q2, [%x[Apanel], #0x60]\n"
- "fmla v22.4s, v5.4s, v3.s[0]\n"
- "fmla v25.4s, v5.4s, v3.s[1]\n"
- "fmla v28.4s, v5.4s, v3.s[2]\n"
- "fmla v31.4s, v5.4s, v3.s[3]\n"
- "ldr q3, [%x[Apanel], #0x70]\n"
- "fmla v8.4s, v6.4s, v0.s[0]\n"
- "fmla v11.4s, v6.4s, v0.s[1]\n"
- "ldr q5, [x20, #0x90]\n"
- "fmla v14.4s, v6.4s, v0.s[2]\n"
- "fmla v17.4s, v6.4s, v0.s[3]\n"
- "add %x[Apanel], %x[Apanel], #0x80\n"
- "fmla v20.4s, v6.4s, v1.s[0]\n"
- "fmla v23.4s, v6.4s, v1.s[1]\n"
- "fmla v26.4s, v6.4s, v1.s[2]\n"
- "fmla v29.4s, v6.4s, v1.s[3]\n"
- "ldr q6, [x20, #0xa0]\n"
- "fmla v9.4s, v7.4s, v0.s[0]\n"
- "fmla v12.4s, v7.4s, v0.s[1]\n"
- "fmla v15.4s, v7.4s, v0.s[2]\n"
- "fmla v18.4s, v7.4s, v0.s[3]\n"
- "fmla v21.4s, v7.4s, v1.s[0]\n"
- "fmla v24.4s, v7.4s, v1.s[1]\n"
- "fmla v27.4s, v7.4s, v1.s[2]\n"
- "fmla v30.4s, v7.4s, v1.s[3]\n"
- "ldr q7, [x20, #0xb0]\n"
- "fmla v10.4s, v4.4s, v0.s[0]\n"
- "fmla v13.4s, v4.4s, v0.s[1]\n"
- "add x20, x20, #0xc0\n"
- "fmla v16.4s, v4.4s, v0.s[2]\n"
- "fmla v19.4s, v4.4s, v0.s[3]\n"
- "ldr q0, [%x[Apanel], #0x0]\n"
- "fmla v22.4s, v4.4s, v1.s[0]\n"
- "fmla v25.4s, v4.4s, v1.s[1]\n"
- "fmla v28.4s, v4.4s, v1.s[2]\n"
- "fmla v31.4s, v4.4s, v1.s[3]\n"
- "ldr q1, [%x[Apanel], #0x10]\n"
- "fmla v8.4s, v5.4s, v2.s[0]\n"
- "fmla v11.4s, v5.4s, v2.s[1]\n"
- "ldr q4, [x20, #0x0]\n"
- "fmla v14.4s, v5.4s, v2.s[2]\n"
- "fmla v17.4s, v5.4s, v2.s[3]\n"
- "fmla v20.4s, v5.4s, v3.s[0]\n"
- "fmla v23.4s, v5.4s, v3.s[1]\n"
- "fmla v26.4s, v5.4s, v3.s[2]\n"
- "fmla v29.4s, v5.4s, v3.s[3]\n"
- "ldr q5, [x20, #0x10]\n"
- "fmla v9.4s, v6.4s, v2.s[0]\n"
- "fmla v12.4s, v6.4s, v2.s[1]\n"
- "fmla v15.4s, v6.4s, v2.s[2]\n"
- "fmla v18.4s, v6.4s, v2.s[3]\n"
- "fmla v21.4s, v6.4s, v3.s[0]\n"
- "fmla v24.4s, v6.4s, v3.s[1]\n"
- "fmla v27.4s, v6.4s, v3.s[2]\n"
- "fmla v30.4s, v6.4s, v3.s[3]\n"
- "ldr q6, [x20, #0x20]\n"
- "fmla v10.4s, v7.4s, v2.s[0]\n"
- "fmla v13.4s, v7.4s, v2.s[1]\n"
- "fmla v16.4s, v7.4s, v2.s[2]\n"
- "fmla v19.4s, v7.4s, v2.s[3]\n"
- "fmla v22.4s, v7.4s, v3.s[0]\n"
- "fmla v25.4s, v7.4s, v3.s[1]\n"
- "fmla v28.4s, v7.4s, v3.s[2]\n"
- "fmla v31.4s, v7.4s, v3.s[3]\n"
- "bge 3b\n"
- "4:" // main loop skip
- "add %x[Apanel], %x[Apanel], #0x20\n"
- "fmla v8.4s, v4.4s, v0.s[0]\n"
- "fmla v11.4s, v4.4s, v0.s[1]\n"
- "add x20, x20, #0x30\n"
- "fmla v14.4s, v4.4s, v0.s[2]\n"
- "fmla v17.4s, v4.4s, v0.s[3]\n"
- "fmla v20.4s, v4.4s, v1.s[0]\n"
- "fmla v23.4s, v4.4s, v1.s[1]\n"
- "fmla v26.4s, v4.4s, v1.s[2]\n"
- "fmla v29.4s, v4.4s, v1.s[3]\n"
- "fmla v9.4s, v5.4s, v0.s[0]\n"
- "fmla v12.4s, v5.4s, v0.s[1]\n"
- "fmla v15.4s, v5.4s, v0.s[2]\n"
- "fmla v18.4s, v5.4s, v0.s[3]\n"
- "fmla v21.4s, v5.4s, v1.s[0]\n"
- "fmla v24.4s, v5.4s, v1.s[1]\n"
- "fmla v27.4s, v5.4s, v1.s[2]\n"
- "fmla v30.4s, v5.4s, v1.s[3]\n"
- "fmla v10.4s, v6.4s, v0.s[0]\n"
- "fmla v13.4s, v6.4s, v0.s[1]\n"
- "fmla v16.4s, v6.4s, v0.s[2]\n"
- "fmla v19.4s, v6.4s, v0.s[3]\n"
- "fmla v22.4s, v6.4s, v1.s[0]\n"
- "fmla v25.4s, v6.4s, v1.s[1]\n"
- "fmla v28.4s, v6.4s, v1.s[2]\n"
- "fmla v31.4s, v6.4s, v1.s[3]\n"
- "cbz x19, 6f\n"
- "5:" // odd loop
- "ldr q0, [%x[Apanel], #0x0]\n"
- "ldr q1, [%x[Apanel], #0x10]\n"
- "subs x19, x19, #0x1\n"
- "ldr q7, [x20, #0x0]\n"
- "ldr q4, [x20, #0x10]\n"
- "fmla v8.4s, v7.4s, v0.s[0]\n"
- "ldr q5, [x20, #0x20]\n"
- "fmla v11.4s, v7.4s, v0.s[1]\n"
- "fmla v14.4s, v7.4s, v0.s[2]\n"
- "fmla v17.4s, v7.4s, v0.s[3]\n"
- "fmla v20.4s, v7.4s, v1.s[0]\n"
- "add %x[Apanel], %x[Apanel], #0x20\n"
- "fmla v23.4s, v7.4s, v1.s[1]\n"
- "fmla v26.4s, v7.4s, v1.s[2]\n"
- "add x20, x20, #0x30\n"
- "fmla v29.4s, v7.4s, v1.s[3]\n"
- "fmla v9.4s, v4.4s, v0.s[0]\n"
- "fmla v12.4s, v4.4s, v0.s[1]\n"
- "fmla v15.4s, v4.4s, v0.s[2]\n"
- "fmla v18.4s, v4.4s, v0.s[3]\n"
- "fmla v21.4s, v4.4s, v1.s[0]\n"
- "fmla v24.4s, v4.4s, v1.s[1]\n"
- "fmla v27.4s, v4.4s, v1.s[2]\n"
- "fmla v30.4s, v4.4s, v1.s[3]\n"
- "fmla v10.4s, v5.4s, v0.s[0]\n"
- "fmla v13.4s, v5.4s, v0.s[1]\n"
- "fmla v16.4s, v5.4s, v0.s[2]\n"
- "fmla v19.4s, v5.4s, v0.s[3]\n"
- "fmla v22.4s, v5.4s, v1.s[0]\n"
- "fmla v25.4s, v5.4s, v1.s[1]\n"
- "fmla v28.4s, v5.4s, v1.s[2]\n"
- "fmla v31.4s, v5.4s, v1.s[3]\n"
- "bne 5b\n"
- "6:" // multiply loop done
- "subs x22, x22, #0x1\n"
- "str q8, [%x[Cpanel], #0x0]\n"
- "str q9, [%x[Cpanel], #0x10]\n"
- "str q10, [%x[Cpanel], #0x20]\n"
- "str q11, [%x[Cpanel], #0x30]\n"
- "str q12, [%x[Cpanel], #0x40]\n"
- "str q13, [%x[Cpanel], #0x50]\n"
- "str q14, [%x[Cpanel], #0x60]\n"
- "str q15, [%x[Cpanel], #0x70]\n"
- "str q16, [%x[Cpanel], #0x80]\n"
- "str q17, [%x[Cpanel], #0x90]\n"
- "str q18, [%x[Cpanel], #0xa0]\n"
- "str q19, [%x[Cpanel], #0xb0]\n"
- "str q20, [%x[Cpanel], #0xc0]\n"
- "str q21, [%x[Cpanel], #0xd0]\n"
- "str q22, [%x[Cpanel], #0xe0]\n"
- "str q23, [%x[Cpanel], #0xf0]\n"
- "str q24, [%x[Cpanel], #0x100]\n"
- "str q25, [%x[Cpanel], #0x110]\n"
- "str q26, [%x[Cpanel], #0x120]\n"
- "str q27, [%x[Cpanel], #0x130]\n"
- "str q28, [%x[Cpanel], #0x140]\n"
- "str q29, [%x[Cpanel], #0x150]\n"
- "str q30, [%x[Cpanel], #0x160]\n"
- "str q31, [%x[Cpanel], #0x170]\n"
- "add %x[Cpanel], %x[Cpanel], #0x180\n"
- "bgt 2b\n"
- "subs %x[ablocks], %x[ablocks], #0x1\n"
- "bne 1b\n"
- : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
- : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks))
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22"
- );
-}
-
-} // namespace arm_gemm
-#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp32_mla_8x12/x1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp32_mla_8x12/x1.cpp
deleted file mode 100644
index 8ba36cb87d..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp32_mla_8x12/x1.cpp
+++ /dev/null
@@ -1,320 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-#ifdef __aarch64__
-
-#include <cstddef>
-
-namespace arm_gemm {
-
-void a64_interleaved_fp32_mla_8x12_x1(
- const float *Apanel, const float *Bpanel,
- float *Cpanel, int ablocks, int bblocks, int K) {
-
- struct KernelArgs {
- size_t bblocks = {};
- size_t K = {};
- const float *Bpanel = {};
- } ka;
-
- ka.bblocks = bblocks;
- ka.K = (K/1) - 1;
- ka.Bpanel = Bpanel;
-
- __asm__ __volatile__(
-
- "1:" // Height loop
- "ldr x22, [%x[args_ptr], %[offsetof_bblocks]]\n"
- "mov x21, %x[Apanel]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n"
- "2:" // Width loop
- "ldr x19, [%x[args_ptr], %[offsetof_K]]\n"
- "mov %x[Apanel], x21\n"
- "cmp x19, #0x4\n"
- "movi v8.16b, #0x0\n"
- "movi v9.16b, #0x0\n"
- "prfm pldl1keep, [%x[Apanel], #0x0]\n"
- "movi v10.16b, #0x0\n"
- "movi v11.16b, #0x0\n"
- "prfm pldl1keep, [x20, #0x0]\n"
- "movi v12.16b, #0x0\n"
- "movi v13.16b, #0x0\n"
- "prfm pldl1keep, [x20, #0x40]\n"
- "movi v14.16b, #0x0\n"
- "movi v15.16b, #0x0\n"
- "prfm pldl1keep, [%x[Apanel], #0x40]\n"
- "movi v16.16b, #0x0\n"
- "movi v17.16b, #0x0\n"
- "prfm pldl1keep, [x20, #0x80]\n"
- "movi v18.16b, #0x0\n"
- "movi v19.16b, #0x0\n"
- "prfm pldl1keep, [%x[Apanel], #0x80]\n"
- "movi v20.16b, #0x0\n"
- "movi v21.16b, #0x0\n"
- "prfm pldl1keep, [x20, #0xc0]\n"
- "movi v22.16b, #0x0\n"
- "movi v23.16b, #0x0\n"
- "prfm pldl1keep, [x20, #0x100]\n"
- "movi v24.16b, #0x0\n"
- "movi v25.16b, #0x0\n"
- "prfm pldl1keep, [%x[Apanel], #0xc0]\n"
- "movi v26.16b, #0x0\n"
- "movi v27.16b, #0x0\n"
- "prfm pldl1keep, [x20, #0x140]\n"
- "movi v28.16b, #0x0\n"
- "movi v29.16b, #0x0\n"
- "ldr q0, [%x[Apanel], #0x0]\n"
- "movi v30.16b, #0x0\n"
- "movi v31.16b, #0x0\n"
- "ldr q1, [%x[Apanel], #0x10]\n"
- "ldr q2, [x20, #0x0]\n"
- "ldr q3, [x20, #0x10]\n"
- "ldr q4, [x20, #0x20]\n"
- "blt 4f\n"
- "3:" // main loop head
- "fmla v8.4s, v2.4s, v0.s[0]\n"
- "fmla v11.4s, v2.4s, v0.s[1]\n"
- "sub x19, x19, #0x4\n"
- "fmla v14.4s, v2.4s, v0.s[2]\n"
- "fmla v17.4s, v2.4s, v0.s[3]\n"
- "cmp x19, #0x4\n"
- "fmla v20.4s, v2.4s, v1.s[0]\n"
- "fmla v23.4s, v2.4s, v1.s[1]\n"
- "prfm pldl1keep, [%x[Apanel], #0x100]\n"
- "fmla v26.4s, v2.4s, v1.s[2]\n"
- "fmla v29.4s, v2.4s, v1.s[3]\n"
- "ldr q2, [x20, #0x30]\n"
- "fmla v9.4s, v3.4s, v0.s[0]\n"
- "fmla v12.4s, v3.4s, v0.s[1]\n"
- "prfm pldl1keep, [x20, #0x180]\n"
- "fmla v15.4s, v3.4s, v0.s[2]\n"
- "fmla v18.4s, v3.4s, v0.s[3]\n"
- "prfm pldl1keep, [x20, #0x1c0]\n"
- "fmla v21.4s, v3.4s, v1.s[0]\n"
- "fmla v24.4s, v3.4s, v1.s[1]\n"
- "prfm pldl1keep, [%x[Apanel], #0x140]\n"
- "fmla v27.4s, v3.4s, v1.s[2]\n"
- "fmla v30.4s, v3.4s, v1.s[3]\n"
- "ldr q3, [x20, #0x40]\n"
- "fmla v10.4s, v4.4s, v0.s[0]\n"
- "fmla v13.4s, v4.4s, v0.s[1]\n"
- "prfm pldl1keep, [x20, #0x200]\n"
- "fmla v16.4s, v4.4s, v0.s[2]\n"
- "fmla v19.4s, v4.4s, v0.s[3]\n"
- "ldr q0, [%x[Apanel], #0x20]\n"
- "fmla v22.4s, v4.4s, v1.s[0]\n"
- "fmla v25.4s, v4.4s, v1.s[1]\n"
- "fmla v28.4s, v4.4s, v1.s[2]\n"
- "fmla v31.4s, v4.4s, v1.s[3]\n"
- "ldr q1, [%x[Apanel], #0x30]\n"
- "ldr q4, [x20, #0x50]\n"
- "fmla v8.4s, v2.4s, v0.s[0]\n"
- "fmla v11.4s, v2.4s, v0.s[1]\n"
- "fmla v14.4s, v2.4s, v0.s[2]\n"
- "fmla v17.4s, v2.4s, v0.s[3]\n"
- "fmla v20.4s, v2.4s, v1.s[0]\n"
- "fmla v23.4s, v2.4s, v1.s[1]\n"
- "fmla v26.4s, v2.4s, v1.s[2]\n"
- "fmla v29.4s, v2.4s, v1.s[3]\n"
- "ldr q2, [x20, #0x60]\n"
- "fmla v9.4s, v3.4s, v0.s[0]\n"
- "fmla v12.4s, v3.4s, v0.s[1]\n"
- "fmla v15.4s, v3.4s, v0.s[2]\n"
- "fmla v18.4s, v3.4s, v0.s[3]\n"
- "fmla v21.4s, v3.4s, v1.s[0]\n"
- "fmla v24.4s, v3.4s, v1.s[1]\n"
- "fmla v27.4s, v3.4s, v1.s[2]\n"
- "fmla v30.4s, v3.4s, v1.s[3]\n"
- "ldr q3, [x20, #0x70]\n"
- "fmla v10.4s, v4.4s, v0.s[0]\n"
- "fmla v13.4s, v4.4s, v0.s[1]\n"
- "fmla v16.4s, v4.4s, v0.s[2]\n"
- "fmla v19.4s, v4.4s, v0.s[3]\n"
- "ldr q0, [%x[Apanel], #0x40]\n"
- "fmla v22.4s, v4.4s, v1.s[0]\n"
- "fmla v25.4s, v4.4s, v1.s[1]\n"
- "fmla v28.4s, v4.4s, v1.s[2]\n"
- "fmla v31.4s, v4.4s, v1.s[3]\n"
- "ldr q1, [%x[Apanel], #0x50]\n"
- "ldr q4, [x20, #0x80]\n"
- "fmla v8.4s, v2.4s, v0.s[0]\n"
- "fmla v11.4s, v2.4s, v0.s[1]\n"
- "fmla v14.4s, v2.4s, v0.s[2]\n"
- "fmla v17.4s, v2.4s, v0.s[3]\n"
- "fmla v20.4s, v2.4s, v1.s[0]\n"
- "fmla v23.4s, v2.4s, v1.s[1]\n"
- "fmla v26.4s, v2.4s, v1.s[2]\n"
- "fmla v29.4s, v2.4s, v1.s[3]\n"
- "ldr q2, [x20, #0x90]\n"
- "fmla v9.4s, v3.4s, v0.s[0]\n"
- "fmla v12.4s, v3.4s, v0.s[1]\n"
- "fmla v15.4s, v3.4s, v0.s[2]\n"
- "fmla v18.4s, v3.4s, v0.s[3]\n"
- "fmla v21.4s, v3.4s, v1.s[0]\n"
- "fmla v24.4s, v3.4s, v1.s[1]\n"
- "fmla v27.4s, v3.4s, v1.s[2]\n"
- "fmla v30.4s, v3.4s, v1.s[3]\n"
- "ldr q3, [x20, #0xa0]\n"
- "fmla v10.4s, v4.4s, v0.s[0]\n"
- "fmla v13.4s, v4.4s, v0.s[1]\n"
- "fmla v16.4s, v4.4s, v0.s[2]\n"
- "fmla v19.4s, v4.4s, v0.s[3]\n"
- "ldr q0, [%x[Apanel], #0x60]\n"
- "fmla v22.4s, v4.4s, v1.s[0]\n"
- "fmla v25.4s, v4.4s, v1.s[1]\n"
- "fmla v28.4s, v4.4s, v1.s[2]\n"
- "fmla v31.4s, v4.4s, v1.s[3]\n"
- "ldr q1, [%x[Apanel], #0x70]\n"
- "ldr q4, [x20, #0xb0]\n"
- "add %x[Apanel], %x[Apanel], #0x80\n"
- "add x20, x20, #0xc0\n"
- "fmla v8.4s, v2.4s, v0.s[0]\n"
- "fmla v11.4s, v2.4s, v0.s[1]\n"
- "fmla v14.4s, v2.4s, v0.s[2]\n"
- "fmla v17.4s, v2.4s, v0.s[3]\n"
- "fmla v20.4s, v2.4s, v1.s[0]\n"
- "fmla v23.4s, v2.4s, v1.s[1]\n"
- "fmla v26.4s, v2.4s, v1.s[2]\n"
- "fmla v29.4s, v2.4s, v1.s[3]\n"
- "ldr q2, [x20, #0x0]\n"
- "fmla v9.4s, v3.4s, v0.s[0]\n"
- "fmla v12.4s, v3.4s, v0.s[1]\n"
- "fmla v15.4s, v3.4s, v0.s[2]\n"
- "fmla v18.4s, v3.4s, v0.s[3]\n"
- "fmla v21.4s, v3.4s, v1.s[0]\n"
- "fmla v24.4s, v3.4s, v1.s[1]\n"
- "fmla v27.4s, v3.4s, v1.s[2]\n"
- "fmla v30.4s, v3.4s, v1.s[3]\n"
- "ldr q3, [x20, #0x10]\n"
- "fmla v10.4s, v4.4s, v0.s[0]\n"
- "fmla v13.4s, v4.4s, v0.s[1]\n"
- "fmla v16.4s, v4.4s, v0.s[2]\n"
- "fmla v19.4s, v4.4s, v0.s[3]\n"
- "ldr q0, [%x[Apanel], #0x0]\n"
- "fmla v22.4s, v4.4s, v1.s[0]\n"
- "fmla v25.4s, v4.4s, v1.s[1]\n"
- "fmla v28.4s, v4.4s, v1.s[2]\n"
- "fmla v31.4s, v4.4s, v1.s[3]\n"
- "ldr q1, [%x[Apanel], #0x10]\n"
- "ldr q4, [x20, #0x20]\n"
- "bge 3b\n"
- "4:" // main loop skip
- "add %x[Apanel], %x[Apanel], #0x20\n"
- "fmla v8.4s, v2.4s, v0.s[0]\n"
- "fmla v11.4s, v2.4s, v0.s[1]\n"
- "add x20, x20, #0x30\n"
- "fmla v14.4s, v2.4s, v0.s[2]\n"
- "fmla v17.4s, v2.4s, v0.s[3]\n"
- "fmla v20.4s, v2.4s, v1.s[0]\n"
- "fmla v23.4s, v2.4s, v1.s[1]\n"
- "fmla v26.4s, v2.4s, v1.s[2]\n"
- "fmla v29.4s, v2.4s, v1.s[3]\n"
- "fmla v9.4s, v3.4s, v0.s[0]\n"
- "fmla v12.4s, v3.4s, v0.s[1]\n"
- "fmla v15.4s, v3.4s, v0.s[2]\n"
- "fmla v18.4s, v3.4s, v0.s[3]\n"
- "fmla v21.4s, v3.4s, v1.s[0]\n"
- "fmla v24.4s, v3.4s, v1.s[1]\n"
- "fmla v27.4s, v3.4s, v1.s[2]\n"
- "fmla v30.4s, v3.4s, v1.s[3]\n"
- "fmla v10.4s, v4.4s, v0.s[0]\n"
- "fmla v13.4s, v4.4s, v0.s[1]\n"
- "fmla v16.4s, v4.4s, v0.s[2]\n"
- "fmla v19.4s, v4.4s, v0.s[3]\n"
- "fmla v22.4s, v4.4s, v1.s[0]\n"
- "fmla v25.4s, v4.4s, v1.s[1]\n"
- "fmla v28.4s, v4.4s, v1.s[2]\n"
- "fmla v31.4s, v4.4s, v1.s[3]\n"
- "cbz x19, 6f\n"
- "5:" // odd loop
- "ldr q0, [%x[Apanel], #0x0]\n"
- "ldr q1, [%x[Apanel], #0x10]\n"
- "subs x19, x19, #0x1\n"
- "ldr q5, [x20, #0x0]\n"
- "ldr q6, [x20, #0x10]\n"
- "fmla v8.4s, v5.4s, v0.s[0]\n"
- "ldr q7, [x20, #0x20]\n"
- "fmla v11.4s, v5.4s, v0.s[1]\n"
- "fmla v14.4s, v5.4s, v0.s[2]\n"
- "fmla v17.4s, v5.4s, v0.s[3]\n"
- "fmla v20.4s, v5.4s, v1.s[0]\n"
- "add %x[Apanel], %x[Apanel], #0x20\n"
- "fmla v23.4s, v5.4s, v1.s[1]\n"
- "fmla v26.4s, v5.4s, v1.s[2]\n"
- "add x20, x20, #0x30\n"
- "fmla v29.4s, v5.4s, v1.s[3]\n"
- "fmla v9.4s, v6.4s, v0.s[0]\n"
- "fmla v12.4s, v6.4s, v0.s[1]\n"
- "fmla v15.4s, v6.4s, v0.s[2]\n"
- "fmla v18.4s, v6.4s, v0.s[3]\n"
- "fmla v21.4s, v6.4s, v1.s[0]\n"
- "fmla v24.4s, v6.4s, v1.s[1]\n"
- "fmla v27.4s, v6.4s, v1.s[2]\n"
- "fmla v30.4s, v6.4s, v1.s[3]\n"
- "fmla v10.4s, v7.4s, v0.s[0]\n"
- "fmla v13.4s, v7.4s, v0.s[1]\n"
- "fmla v16.4s, v7.4s, v0.s[2]\n"
- "fmla v19.4s, v7.4s, v0.s[3]\n"
- "fmla v22.4s, v7.4s, v1.s[0]\n"
- "fmla v25.4s, v7.4s, v1.s[1]\n"
- "fmla v28.4s, v7.4s, v1.s[2]\n"
- "fmla v31.4s, v7.4s, v1.s[3]\n"
- "bne 5b\n"
- "6:" // multiply loop done
- "subs x22, x22, #0x1\n"
- "str q8, [%x[Cpanel], #0x0]\n"
- "str q9, [%x[Cpanel], #0x10]\n"
- "str q10, [%x[Cpanel], #0x20]\n"
- "str q11, [%x[Cpanel], #0x30]\n"
- "str q12, [%x[Cpanel], #0x40]\n"
- "str q13, [%x[Cpanel], #0x50]\n"
- "str q14, [%x[Cpanel], #0x60]\n"
- "str q15, [%x[Cpanel], #0x70]\n"
- "str q16, [%x[Cpanel], #0x80]\n"
- "str q17, [%x[Cpanel], #0x90]\n"
- "str q18, [%x[Cpanel], #0xa0]\n"
- "str q19, [%x[Cpanel], #0xb0]\n"
- "str q20, [%x[Cpanel], #0xc0]\n"
- "str q21, [%x[Cpanel], #0xd0]\n"
- "str q22, [%x[Cpanel], #0xe0]\n"
- "str q23, [%x[Cpanel], #0xf0]\n"
- "str q24, [%x[Cpanel], #0x100]\n"
- "str q25, [%x[Cpanel], #0x110]\n"
- "str q26, [%x[Cpanel], #0x120]\n"
- "str q27, [%x[Cpanel], #0x130]\n"
- "str q28, [%x[Cpanel], #0x140]\n"
- "str q29, [%x[Cpanel], #0x150]\n"
- "str q30, [%x[Cpanel], #0x160]\n"
- "str q31, [%x[Cpanel], #0x170]\n"
- "add %x[Cpanel], %x[Cpanel], #0x180\n"
- "bgt 2b\n"
- "subs %x[ablocks], %x[ablocks], #0x1\n"
- "bne 1b\n"
- : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
- : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks))
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22"
- );
-}
-
-} // namespace arm_gemm
-#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_dot_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_dot_8x12.hpp
deleted file mode 100644
index bc6b9931e1..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_dot_8x12.hpp
+++ /dev/null
@@ -1,110 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-#pragma once
-
-#ifdef __aarch64__
-#include "../std_transforms_fixed.hpp"
-#include "../performance_parameters.hpp"
-
-#define ARGLIST \
- const int8_t *, const int8_t *, \
- int32_t *, int, int, int
-
-namespace arm_gemm
-{
-// Actual kernel implementations
-void a64_interleaved_s8s32_dot_8x12( ARGLIST );
-void a64_interleaved_s8s32_dot_8x12_a55( ARGLIST );
-void a64_interleaved_s8s32_dot_8x12_x1( ARGLIST );
-
-class cls_a64_interleaved_s8s32_dot_8x12
-{
-public:
- typedef int8_t operand_type;
- typedef int32_t result_type;
-
- typedef void (*kern_type)( ARGLIST );
-
- /* Kernel blocking parameters */
- static constexpr unsigned int out_height()
- {
- return 8;
- }
-
- static unsigned int out_width()
- {
- return 12;
- }
-
- static unsigned int stripe_width()
- {
- return 4;
- }
-
- static constexpr unsigned int k_unroll()
- {
- return 4;
- }
-
-
- StdTransformsFixed<operand_type, result_type, 8, 12, 4> transforms = {};
- StdTransformsFixed<operand_type, result_type, 8, 12, 4, true> transforms_quantized = {};
- template<typename T>
- static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
- {
-
- if (std::is_same<T, int8_t>::value) {
- switch (ci->get_cpu_model()) {
- case CPUModel::A55r1:
- return { 15.361, 0.9341, 0.1636 };
- default:
- return { 29.0698, 3.9793, 0.4003 };
- }
- }
-
- return { 1.0 };
- }
-
- // Default to the generic kernel
- kern_type kernel=a64_interleaved_s8s32_dot_8x12;
- cls_a64_interleaved_s8s32_dot_8x12(const CPUInfo *ci)
- {
- switch(ci->get_cpu_model()) {
- default:
- break;
- case CPUModel::A55r1:
- kernel=a64_interleaved_s8s32_dot_8x12_a55;
- break;
- case CPUModel::X1:
- kernel=a64_interleaved_s8s32_dot_8x12_x1;
- break;
- }
- }
-};
-
-} // namespace arm_gemm
-
-#undef ARGLIST
-
-#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_dot_8x12/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_dot_8x12/a55.cpp
deleted file mode 100644
index 3acd61c88c..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_dot_8x12/a55.cpp
+++ /dev/null
@@ -1,273 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-#ifdef __aarch64__
-
-#include <cstddef>
-#include <cstdint>
-
-namespace arm_gemm {
-
-void a64_interleaved_s8s32_dot_8x12_a55(
- const int8_t *Apanel, const int8_t *Bpanel,
- int32_t *Cpanel, int ablocks, int bblocks, int K) {
-
- struct KernelArgs {
- size_t bblocks = {};
- size_t K = {};
- const int8_t *Bpanel = {};
- } ka;
-
- ka.bblocks = bblocks;
- ka.K = (K/4) - 1;
- ka.Bpanel = Bpanel;
-
- __asm__ __volatile__(
-
- "1:" // Height loop
- "ldr x27, [%x[args_ptr], %[offsetof_bblocks]]\n"
- "mov x26, %x[Apanel]\n"
- "ldr x25, [%x[args_ptr], %[offsetof_Bpanel]]\n"
- "2:" // Width loop
- "ldr x24, [%x[args_ptr], %[offsetof_K]]\n"
- "mov %x[Apanel], x26\n"
- "cmp x24, #0x2\n"
- "movi v8.4s, #0x0\n"
- "movi v9.4s, #0x0\n"
- "prfm pldl1keep, [%x[Apanel], #0x0]\n"
- "movi v10.4s, #0x0\n"
- "prfm pldl1keep, [x25, #0x0]\n"
- "movi v11.4s, #0x0\n"
- "prfm pldl1keep, [x25, #0x40]\n"
- "movi v12.4s, #0x0\n"
- "prfm pldl1keep, [%x[Apanel], #0x40]\n"
- "movi v13.4s, #0x0\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "movi v14.4s, #0x0\n"
- "ldr q0, [%x[Apanel], #0x0]\n"
- "movi v15.4s, #0x0\n"
- "ldr q1, [%x[Apanel], #0x10]\n"
- "movi v16.4s, #0x0\n"
- "ldr q4, [x25, #0x0]\n"
- "movi v17.4s, #0x0\n"
- "ldr q5, [x25, #0x10]\n"
- "movi v18.4s, #0x0\n"
- "ldr q6, [x25, #0x20]\n"
- "movi v19.4s, #0x0\n"
- "movi v20.4s, #0x0\n"
- "movi v21.4s, #0x0\n"
- "movi v22.4s, #0x0\n"
- "movi v23.4s, #0x0\n"
- "movi v24.4s, #0x0\n"
- "movi v25.4s, #0x0\n"
- "movi v26.4s, #0x0\n"
- "movi v27.4s, #0x0\n"
- "movi v28.4s, #0x0\n"
- "movi v29.4s, #0x0\n"
- "movi v30.4s, #0x0\n"
- "movi v31.4s, #0x0\n"
- "blt 4f\n"
- "3:" // main loop head
- ".inst 0x4f80e088 // sdot v8.4s, v4.16b, v0.4b[0]\n"
- "ldr d2, [%x[Apanel], #0x20]\n"
- "ldr x23, [%x[Apanel], #0x28]\n"
- ".inst 0x4fa0e08b // sdot v11.4s, v4.16b, v0.4b[1]\n"
- "ldr d3, [%x[Apanel], #0x30]\n"
- ".inst 0x4f80e88e // sdot v14.4s, v4.16b, v0.4b[2]\n"
- "ldr x19, [%x[Apanel], #0x38]\n"
- ".inst 0x4fa0e891 // sdot v17.4s, v4.16b, v0.4b[3]\n"
- ".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n"
- "ldr x22, [x25, #0x38]\n"
- ".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n"
- "ldr x20, [x25, #0x48]\n"
- ".inst 0x4f81e89a // sdot v26.4s, v4.16b, v1.4b[2]\n"
- "ldr x21, [x25, #0x58]\n"
- ".inst 0x4fa1e89d // sdot v29.4s, v4.16b, v1.4b[3]\n"
- "ldr d4, [x25, #0x30]\n"
- ".inst 0x4f80e0a9 // sdot v9.4s, v5.16b, v0.4b[0]\n"
- "mov v2.d[1], x23\n"
- ".inst 0x4fa0e0ac // sdot v12.4s, v5.16b, v0.4b[1]\n"
- "mov v3.d[1], x19\n"
- ".inst 0x4f80e8af // sdot v15.4s, v5.16b, v0.4b[2]\n"
- "mov v4.d[1], x22\n"
- ".inst 0x4fa0e8b2 // sdot v18.4s, v5.16b, v0.4b[3]\n"
- "prfm pldl1keep, [%x[Apanel], #0x80]\n"
- ".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n"
- "add %x[Apanel], %x[Apanel], #0x40\n"
- ".inst 0x4fa1e0b8 // sdot v24.4s, v5.16b, v1.4b[1]\n"
- "prfm pldl1keep, [x25, #0x100]\n"
- ".inst 0x4f81e8bb // sdot v27.4s, v5.16b, v1.4b[2]\n"
- "prfm pldl1keep, [x25, #0x140]\n"
- ".inst 0x4fa1e8be // sdot v30.4s, v5.16b, v1.4b[3]\n"
- "ldr d5, [x25, #0x40]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- "mov v5.d[1], x20\n"
- ".inst 0x4fa0e0cd // sdot v13.4s, v6.16b, v0.4b[1]\n"
- "ldr x20, [%x[Apanel], #0x8]\n"
- ".inst 0x4f80e8d0 // sdot v16.4s, v6.16b, v0.4b[2]\n"
- "ldr x19, [%x[Apanel], #0x18]\n"
- ".inst 0x4fa0e8d3 // sdot v19.4s, v6.16b, v0.4b[3]\n"
- "ldr d0, [%x[Apanel], #0x0]\n"
- ".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n"
- "sub x24, x24, #0x2\n"
- ".inst 0x4fa1e0d9 // sdot v25.4s, v6.16b, v1.4b[1]\n"
- "cmp x24, #0x2\n"
- ".inst 0x4f81e8dc // sdot v28.4s, v6.16b, v1.4b[2]\n"
- "mov v0.d[1], x20\n"
- ".inst 0x4fa1e8df // sdot v31.4s, v6.16b, v1.4b[3]\n"
- "ldr d6, [x25, #0x50]\n"
- "mov v6.d[1], x21\n"
- "add x25, x25, #0x60\n"
- ".inst 0x4f82e088 // sdot v8.4s, v4.16b, v2.4b[0]\n"
- "ldr d1, [%x[Apanel], #0x10]\n"
- ".inst 0x4fa2e08b // sdot v11.4s, v4.16b, v2.4b[1]\n"
- "ldr x22, [x25, #0x8]\n"
- ".inst 0x4f82e88e // sdot v14.4s, v4.16b, v2.4b[2]\n"
- "ldr x20, [x25, #0x18]\n"
- ".inst 0x4fa2e891 // sdot v17.4s, v4.16b, v2.4b[3]\n"
- "ldr x21, [x25, #0x28]\n"
- ".inst 0x4f83e094 // sdot v20.4s, v4.16b, v3.4b[0]\n"
- "mov v1.d[1], x19\n"
- ".inst 0x4fa3e097 // sdot v23.4s, v4.16b, v3.4b[1]\n"
- ".inst 0x4f83e89a // sdot v26.4s, v4.16b, v3.4b[2]\n"
- ".inst 0x4fa3e89d // sdot v29.4s, v4.16b, v3.4b[3]\n"
- "ldr d4, [x25, #0x0]\n"
- ".inst 0x4f82e0a9 // sdot v9.4s, v5.16b, v2.4b[0]\n"
- "mov v4.d[1], x22\n"
- ".inst 0x4fa2e0ac // sdot v12.4s, v5.16b, v2.4b[1]\n"
- ".inst 0x4f82e8af // sdot v15.4s, v5.16b, v2.4b[2]\n"
- ".inst 0x4fa2e8b2 // sdot v18.4s, v5.16b, v2.4b[3]\n"
- ".inst 0x4f83e0b5 // sdot v21.4s, v5.16b, v3.4b[0]\n"
- ".inst 0x4fa3e0b8 // sdot v24.4s, v5.16b, v3.4b[1]\n"
- ".inst 0x4f83e8bb // sdot v27.4s, v5.16b, v3.4b[2]\n"
- ".inst 0x4fa3e8be // sdot v30.4s, v5.16b, v3.4b[3]\n"
- "ldr d5, [x25, #0x10]\n"
- ".inst 0x4f82e0ca // sdot v10.4s, v6.16b, v2.4b[0]\n"
- "mov v5.d[1], x20\n"
- ".inst 0x4fa2e0cd // sdot v13.4s, v6.16b, v2.4b[1]\n"
- ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x4fa2e8d3 // sdot v19.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4fa3e0d9 // sdot v25.4s, v6.16b, v3.4b[1]\n"
- ".inst 0x4f83e8dc // sdot v28.4s, v6.16b, v3.4b[2]\n"
- ".inst 0x4fa3e8df // sdot v31.4s, v6.16b, v3.4b[3]\n"
- "ldr d6, [x25, #0x20]\n"
- "mov v6.d[1], x21\n"
- "bge 3b\n"
- "4:" // main loop skip
- "add %x[Apanel], %x[Apanel], #0x20\n"
- ".inst 0x4f80e088 // sdot v8.4s, v4.16b, v0.4b[0]\n"
- "add x25, x25, #0x30\n"
- ".inst 0x4fa0e08b // sdot v11.4s, v4.16b, v0.4b[1]\n"
- ".inst 0x4f80e88e // sdot v14.4s, v4.16b, v0.4b[2]\n"
- ".inst 0x4fa0e891 // sdot v17.4s, v4.16b, v0.4b[3]\n"
- ".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n"
- ".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n"
- ".inst 0x4f81e89a // sdot v26.4s, v4.16b, v1.4b[2]\n"
- ".inst 0x4fa1e89d // sdot v29.4s, v4.16b, v1.4b[3]\n"
- ".inst 0x4f80e0a9 // sdot v9.4s, v5.16b, v0.4b[0]\n"
- ".inst 0x4fa0e0ac // sdot v12.4s, v5.16b, v0.4b[1]\n"
- ".inst 0x4f80e8af // sdot v15.4s, v5.16b, v0.4b[2]\n"
- ".inst 0x4fa0e8b2 // sdot v18.4s, v5.16b, v0.4b[3]\n"
- ".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n"
- ".inst 0x4fa1e0b8 // sdot v24.4s, v5.16b, v1.4b[1]\n"
- ".inst 0x4f81e8bb // sdot v27.4s, v5.16b, v1.4b[2]\n"
- ".inst 0x4fa1e8be // sdot v30.4s, v5.16b, v1.4b[3]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4fa0e0cd // sdot v13.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4f80e8d0 // sdot v16.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4fa0e8d3 // sdot v19.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4fa1e0d9 // sdot v25.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x4f81e8dc // sdot v28.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4fa1e8df // sdot v31.4s, v6.16b, v1.4b[3]\n"
- "cbz x24, 5f\n"
- "ldr q0, [%x[Apanel], #0x0]\n"
- "ldr q1, [%x[Apanel], #0x10]\n"
- "add %x[Apanel], %x[Apanel], #0x20\n"
- "ldr q7, [x25, #0x0]\n"
- ".inst 0x4f80e0e8 // sdot v8.4s, v7.16b, v0.4b[0]\n"
- "ldr q4, [x25, #0x10]\n"
- ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
- "ldr q5, [x25, #0x20]\n"
- ".inst 0x4f80e8ee // sdot v14.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4fa0e8f1 // sdot v17.4s, v7.16b, v0.4b[3]\n"
- "add x25, x25, #0x30\n"
- ".inst 0x4f81e0f4 // sdot v20.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4fa1e0f7 // sdot v23.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x4f81e8fa // sdot v26.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4fa1e8fd // sdot v29.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x4f80e089 // sdot v9.4s, v4.16b, v0.4b[0]\n"
- ".inst 0x4fa0e08c // sdot v12.4s, v4.16b, v0.4b[1]\n"
- ".inst 0x4f80e88f // sdot v15.4s, v4.16b, v0.4b[2]\n"
- ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n"
- ".inst 0x4f81e095 // sdot v21.4s, v4.16b, v1.4b[0]\n"
- ".inst 0x4fa1e098 // sdot v24.4s, v4.16b, v1.4b[1]\n"
- ".inst 0x4f81e89b // sdot v27.4s, v4.16b, v1.4b[2]\n"
- ".inst 0x4fa1e89e // sdot v30.4s, v4.16b, v1.4b[3]\n"
- ".inst 0x4f80e0aa // sdot v10.4s, v5.16b, v0.4b[0]\n"
- ".inst 0x4fa0e0ad // sdot v13.4s, v5.16b, v0.4b[1]\n"
- ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n"
- ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n"
- ".inst 0x4f81e0b6 // sdot v22.4s, v5.16b, v1.4b[0]\n"
- ".inst 0x4fa1e0b9 // sdot v25.4s, v5.16b, v1.4b[1]\n"
- ".inst 0x4f81e8bc // sdot v28.4s, v5.16b, v1.4b[2]\n"
- ".inst 0x4fa1e8bf // sdot v31.4s, v5.16b, v1.4b[3]\n"
- "5:" // multiply loop done
- "subs x27, x27, #0x1\n"
- "str q8, [%x[Cpanel], #0x0]\n"
- "str q9, [%x[Cpanel], #0x10]\n"
- "str q10, [%x[Cpanel], #0x20]\n"
- "str q11, [%x[Cpanel], #0x30]\n"
- "str q12, [%x[Cpanel], #0x40]\n"
- "str q13, [%x[Cpanel], #0x50]\n"
- "str q14, [%x[Cpanel], #0x60]\n"
- "str q15, [%x[Cpanel], #0x70]\n"
- "str q16, [%x[Cpanel], #0x80]\n"
- "str q17, [%x[Cpanel], #0x90]\n"
- "str q18, [%x[Cpanel], #0xa0]\n"
- "str q19, [%x[Cpanel], #0xb0]\n"
- "str q20, [%x[Cpanel], #0xc0]\n"
- "str q21, [%x[Cpanel], #0xd0]\n"
- "str q22, [%x[Cpanel], #0xe0]\n"
- "str q23, [%x[Cpanel], #0xf0]\n"
- "str q24, [%x[Cpanel], #0x100]\n"
- "str q25, [%x[Cpanel], #0x110]\n"
- "str q26, [%x[Cpanel], #0x120]\n"
- "str q27, [%x[Cpanel], #0x130]\n"
- "str q28, [%x[Cpanel], #0x140]\n"
- "str q29, [%x[Cpanel], #0x150]\n"
- "str q30, [%x[Cpanel], #0x160]\n"
- "str q31, [%x[Cpanel], #0x170]\n"
- "add %x[Cpanel], %x[Cpanel], #0x180\n"
- "bgt 2b\n"
- "subs %x[ablocks], %x[ablocks], #0x1\n"
- "bne 1b\n"
- : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
- : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks))
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
- );
-}
-
-} // namespace arm_gemm
-#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_dot_8x12/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_dot_8x12/generic.cpp
deleted file mode 100644
index 267f62ae8a..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_dot_8x12/generic.cpp
+++ /dev/null
@@ -1,253 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-#ifdef __aarch64__
-
-#include <cstddef>
-#include <cstdint>
-
-namespace arm_gemm {
-
-void a64_interleaved_s8s32_dot_8x12(
- const int8_t *Apanel, const int8_t *Bpanel,
- int32_t *Cpanel, int ablocks, int bblocks, int K) {
-
- struct KernelArgs {
- size_t bblocks = {};
- size_t K = {};
- const int8_t *Bpanel = {};
- } ka;
-
- ka.bblocks = bblocks;
- ka.K = (K/4) - 1;
- ka.Bpanel = Bpanel;
-
- __asm__ __volatile__(
-
- "1:" // Height loop
- "ldr x22, [%x[args_ptr], %[offsetof_bblocks]]\n"
- "mov x21, %x[Apanel]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n"
- "2:" // Width loop
- "ldr x19, [%x[args_ptr], %[offsetof_K]]\n"
- "mov %x[Apanel], x21\n"
- "cmp x19, #0x2\n"
- "movi v8.4s, #0x0\n"
- "movi v9.4s, #0x0\n"
- "prfm pldl1keep, [%x[Apanel], #0x0]\n"
- "movi v10.4s, #0x0\n"
- "movi v11.4s, #0x0\n"
- "prfm pldl1keep, [x20, #0x0]\n"
- "movi v12.4s, #0x0\n"
- "movi v13.4s, #0x0\n"
- "prfm pldl1keep, [x20, #0x40]\n"
- "movi v14.4s, #0x0\n"
- "movi v15.4s, #0x0\n"
- "prfm pldl1keep, [%x[Apanel], #0x40]\n"
- "movi v16.4s, #0x0\n"
- "movi v17.4s, #0x0\n"
- "prfm pldl1keep, [x20, #0x80]\n"
- "movi v18.4s, #0x0\n"
- "movi v19.4s, #0x0\n"
- "ldr q0, [%x[Apanel], #0x0]\n"
- "movi v20.4s, #0x0\n"
- "movi v21.4s, #0x0\n"
- "ldr q1, [%x[Apanel], #0x10]\n"
- "movi v22.4s, #0x0\n"
- "movi v23.4s, #0x0\n"
- "ldr q4, [x20, #0x0]\n"
- "movi v24.4s, #0x0\n"
- "movi v25.4s, #0x0\n"
- "ldr q5, [x20, #0x10]\n"
- "movi v26.4s, #0x0\n"
- "movi v27.4s, #0x0\n"
- "ldr q6, [x20, #0x20]\n"
- "movi v28.4s, #0x0\n"
- "movi v29.4s, #0x0\n"
- "movi v30.4s, #0x0\n"
- "movi v31.4s, #0x0\n"
- "blt 4f\n"
- "3:" // main loop head
- ".inst 0x4f80e088 // sdot v8.4s, v4.16b, v0.4b[0]\n"
- ".inst 0x4fa0e08b // sdot v11.4s, v4.16b, v0.4b[1]\n"
- "ldr q2, [%x[Apanel], #0x20]\n"
- ".inst 0x4f80e88e // sdot v14.4s, v4.16b, v0.4b[2]\n"
- ".inst 0x4fa0e891 // sdot v17.4s, v4.16b, v0.4b[3]\n"
- "ldr q3, [%x[Apanel], #0x30]\n"
- ".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n"
- ".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n"
- "sub x19, x19, #0x2\n"
- ".inst 0x4f81e89a // sdot v26.4s, v4.16b, v1.4b[2]\n"
- ".inst 0x4fa1e89d // sdot v29.4s, v4.16b, v1.4b[3]\n"
- "ldr q4, [x20, #0x30]\n"
- ".inst 0x4f80e0a9 // sdot v9.4s, v5.16b, v0.4b[0]\n"
- ".inst 0x4fa0e0ac // sdot v12.4s, v5.16b, v0.4b[1]\n"
- "cmp x19, #0x2\n"
- ".inst 0x4f80e8af // sdot v15.4s, v5.16b, v0.4b[2]\n"
- ".inst 0x4fa0e8b2 // sdot v18.4s, v5.16b, v0.4b[3]\n"
- "prfm pldl1keep, [%x[Apanel], #0x80]\n"
- ".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n"
- ".inst 0x4fa1e0b8 // sdot v24.4s, v5.16b, v1.4b[1]\n"
- "add %x[Apanel], %x[Apanel], #0x40\n"
- ".inst 0x4f81e8bb // sdot v27.4s, v5.16b, v1.4b[2]\n"
- ".inst 0x4fa1e8be // sdot v30.4s, v5.16b, v1.4b[3]\n"
- "ldr q5, [x20, #0x40]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4fa0e0cd // sdot v13.4s, v6.16b, v0.4b[1]\n"
- "prfm pldl1keep, [x20, #0x100]\n"
- ".inst 0x4f80e8d0 // sdot v16.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4fa0e8d3 // sdot v19.4s, v6.16b, v0.4b[3]\n"
- "prfm pldl1keep, [x20, #0x140]\n"
- ".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4fa1e0d9 // sdot v25.4s, v6.16b, v1.4b[1]\n"
- "ldr q0, [%x[Apanel], #0x0]\n"
- ".inst 0x4f81e8dc // sdot v28.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4fa1e8df // sdot v31.4s, v6.16b, v1.4b[3]\n"
- "ldr q6, [x20, #0x50]\n"
- "add x20, x20, #0x60\n"
- ".inst 0x4f82e088 // sdot v8.4s, v4.16b, v2.4b[0]\n"
- ".inst 0x4fa2e08b // sdot v11.4s, v4.16b, v2.4b[1]\n"
- "ldr q1, [%x[Apanel], #0x10]\n"
- ".inst 0x4f82e88e // sdot v14.4s, v4.16b, v2.4b[2]\n"
- ".inst 0x4fa2e891 // sdot v17.4s, v4.16b, v2.4b[3]\n"
- ".inst 0x4f83e094 // sdot v20.4s, v4.16b, v3.4b[0]\n"
- ".inst 0x4fa3e097 // sdot v23.4s, v4.16b, v3.4b[1]\n"
- ".inst 0x4f83e89a // sdot v26.4s, v4.16b, v3.4b[2]\n"
- ".inst 0x4fa3e89d // sdot v29.4s, v4.16b, v3.4b[3]\n"
- "ldr q4, [x20, #0x0]\n"
- ".inst 0x4f82e0a9 // sdot v9.4s, v5.16b, v2.4b[0]\n"
- ".inst 0x4fa2e0ac // sdot v12.4s, v5.16b, v2.4b[1]\n"
- ".inst 0x4f82e8af // sdot v15.4s, v5.16b, v2.4b[2]\n"
- ".inst 0x4fa2e8b2 // sdot v18.4s, v5.16b, v2.4b[3]\n"
- ".inst 0x4f83e0b5 // sdot v21.4s, v5.16b, v3.4b[0]\n"
- ".inst 0x4fa3e0b8 // sdot v24.4s, v5.16b, v3.4b[1]\n"
- ".inst 0x4f83e8bb // sdot v27.4s, v5.16b, v3.4b[2]\n"
- ".inst 0x4fa3e8be // sdot v30.4s, v5.16b, v3.4b[3]\n"
- "ldr q5, [x20, #0x10]\n"
- ".inst 0x4f82e0ca // sdot v10.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x4fa2e0cd // sdot v13.4s, v6.16b, v2.4b[1]\n"
- ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x4fa2e8d3 // sdot v19.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x4fa3e0d9 // sdot v25.4s, v6.16b, v3.4b[1]\n"
- ".inst 0x4f83e8dc // sdot v28.4s, v6.16b, v3.4b[2]\n"
- ".inst 0x4fa3e8df // sdot v31.4s, v6.16b, v3.4b[3]\n"
- "ldr q6, [x20, #0x20]\n"
- "bge 3b\n"
- "4:" // main loop skip
- "add %x[Apanel], %x[Apanel], #0x20\n"
- ".inst 0x4f80e088 // sdot v8.4s, v4.16b, v0.4b[0]\n"
- ".inst 0x4fa0e08b // sdot v11.4s, v4.16b, v0.4b[1]\n"
- "add x20, x20, #0x30\n"
- ".inst 0x4f80e88e // sdot v14.4s, v4.16b, v0.4b[2]\n"
- ".inst 0x4fa0e891 // sdot v17.4s, v4.16b, v0.4b[3]\n"
- ".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n"
- ".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n"
- ".inst 0x4f81e89a // sdot v26.4s, v4.16b, v1.4b[2]\n"
- ".inst 0x4fa1e89d // sdot v29.4s, v4.16b, v1.4b[3]\n"
- ".inst 0x4f80e0a9 // sdot v9.4s, v5.16b, v0.4b[0]\n"
- ".inst 0x4fa0e0ac // sdot v12.4s, v5.16b, v0.4b[1]\n"
- ".inst 0x4f80e8af // sdot v15.4s, v5.16b, v0.4b[2]\n"
- ".inst 0x4fa0e8b2 // sdot v18.4s, v5.16b, v0.4b[3]\n"
- ".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n"
- ".inst 0x4fa1e0b8 // sdot v24.4s, v5.16b, v1.4b[1]\n"
- ".inst 0x4f81e8bb // sdot v27.4s, v5.16b, v1.4b[2]\n"
- ".inst 0x4fa1e8be // sdot v30.4s, v5.16b, v1.4b[3]\n"
- ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4fa0e0cd // sdot v13.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4f80e8d0 // sdot v16.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4fa0e8d3 // sdot v19.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4fa1e0d9 // sdot v25.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x4f81e8dc // sdot v28.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4fa1e8df // sdot v31.4s, v6.16b, v1.4b[3]\n"
- "cbz x19, 5f\n"
- "ldr q0, [%x[Apanel], #0x0]\n"
- "ldr q1, [%x[Apanel], #0x10]\n"
- "add %x[Apanel], %x[Apanel], #0x20\n"
- "ldr q7, [x20, #0x0]\n"
- "ldr q4, [x20, #0x10]\n"
- ".inst 0x4f80e0e8 // sdot v8.4s, v7.16b, v0.4b[0]\n"
- "ldr q5, [x20, #0x20]\n"
- ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4f80e8ee // sdot v14.4s, v7.16b, v0.4b[2]\n"
- "add x20, x20, #0x30\n"
- ".inst 0x4fa0e8f1 // sdot v17.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x4f81e0f4 // sdot v20.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4fa1e0f7 // sdot v23.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x4f81e8fa // sdot v26.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4fa1e8fd // sdot v29.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x4f80e089 // sdot v9.4s, v4.16b, v0.4b[0]\n"
- ".inst 0x4fa0e08c // sdot v12.4s, v4.16b, v0.4b[1]\n"
- ".inst 0x4f80e88f // sdot v15.4s, v4.16b, v0.4b[2]\n"
- ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n"
- ".inst 0x4f81e095 // sdot v21.4s, v4.16b, v1.4b[0]\n"
- ".inst 0x4fa1e098 // sdot v24.4s, v4.16b, v1.4b[1]\n"
- ".inst 0x4f81e89b // sdot v27.4s, v4.16b, v1.4b[2]\n"
- ".inst 0x4fa1e89e // sdot v30.4s, v4.16b, v1.4b[3]\n"
- ".inst 0x4f80e0aa // sdot v10.4s, v5.16b, v0.4b[0]\n"
- ".inst 0x4fa0e0ad // sdot v13.4s, v5.16b, v0.4b[1]\n"
- ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n"
- ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n"
- ".inst 0x4f81e0b6 // sdot v22.4s, v5.16b, v1.4b[0]\n"
- ".inst 0x4fa1e0b9 // sdot v25.4s, v5.16b, v1.4b[1]\n"
- ".inst 0x4f81e8bc // sdot v28.4s, v5.16b, v1.4b[2]\n"
- ".inst 0x4fa1e8bf // sdot v31.4s, v5.16b, v1.4b[3]\n"
- "5:" // multiply loop done
- "subs x22, x22, #0x1\n"
- "str q8, [%x[Cpanel], #0x0]\n"
- "str q9, [%x[Cpanel], #0x10]\n"
- "str q10, [%x[Cpanel], #0x20]\n"
- "str q11, [%x[Cpanel], #0x30]\n"
- "str q12, [%x[Cpanel], #0x40]\n"
- "str q13, [%x[Cpanel], #0x50]\n"
- "str q14, [%x[Cpanel], #0x60]\n"
- "str q15, [%x[Cpanel], #0x70]\n"
- "str q16, [%x[Cpanel], #0x80]\n"
- "str q17, [%x[Cpanel], #0x90]\n"
- "str q18, [%x[Cpanel], #0xa0]\n"
- "str q19, [%x[Cpanel], #0xb0]\n"
- "str q20, [%x[Cpanel], #0xc0]\n"
- "str q21, [%x[Cpanel], #0xd0]\n"
- "str q22, [%x[Cpanel], #0xe0]\n"
- "str q23, [%x[Cpanel], #0xf0]\n"
- "str q24, [%x[Cpanel], #0x100]\n"
- "str q25, [%x[Cpanel], #0x110]\n"
- "str q26, [%x[Cpanel], #0x120]\n"
- "str q27, [%x[Cpanel], #0x130]\n"
- "str q28, [%x[Cpanel], #0x140]\n"
- "str q29, [%x[Cpanel], #0x150]\n"
- "str q30, [%x[Cpanel], #0x160]\n"
- "str q31, [%x[Cpanel], #0x170]\n"
- "add %x[Cpanel], %x[Cpanel], #0x180\n"
- "bgt 2b\n"
- "subs %x[ablocks], %x[ablocks], #0x1\n"
- "bne 1b\n"
- : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
- : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks))
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22"
- );
-}
-
-} // namespace arm_gemm
-#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_dot_8x12/x1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_dot_8x12/x1.cpp
deleted file mode 100644
index 4804c059a3..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_dot_8x12/x1.cpp
+++ /dev/null
@@ -1,253 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-#ifdef __aarch64__
-
-#include <cstddef>
-#include <cstdint>
-
-namespace arm_gemm {
-
-void a64_interleaved_s8s32_dot_8x12_x1(
- const int8_t *Apanel, const int8_t *Bpanel,
- int32_t *Cpanel, int ablocks, int bblocks, int K) {
-
- struct KernelArgs {
- size_t bblocks = {};
- size_t K = {};
- const int8_t *Bpanel = {};
- } ka;
-
- ka.bblocks = bblocks;
- ka.K = (K/4) - 1;
- ka.Bpanel = Bpanel;
-
- __asm__ __volatile__(
-
- "1:" // Height loop
- "ldr x22, [%x[args_ptr], %[offsetof_bblocks]]\n"
- "mov x21, %x[Apanel]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n"
- "2:" // Width loop
- "ldr x19, [%x[args_ptr], %[offsetof_K]]\n"
- "mov %x[Apanel], x21\n"
- "cmp x19, #0x2\n"
- "movi v8.4s, #0x0\n"
- "movi v9.4s, #0x0\n"
- "prfm pldl1keep, [%x[Apanel], #0x0]\n"
- "movi v10.4s, #0x0\n"
- "movi v11.4s, #0x0\n"
- "prfm pldl1keep, [x20, #0x0]\n"
- "movi v12.4s, #0x0\n"
- "movi v13.4s, #0x0\n"
- "prfm pldl1keep, [x20, #0x40]\n"
- "movi v14.4s, #0x0\n"
- "movi v15.4s, #0x0\n"
- "prfm pldl1keep, [%x[Apanel], #0x40]\n"
- "movi v16.4s, #0x0\n"
- "movi v17.4s, #0x0\n"
- "prfm pldl1keep, [x20, #0x80]\n"
- "movi v18.4s, #0x0\n"
- "movi v19.4s, #0x0\n"
- "ldr q0, [%x[Apanel], #0x0]\n"
- "movi v20.4s, #0x0\n"
- "movi v21.4s, #0x0\n"
- "ldr q1, [%x[Apanel], #0x10]\n"
- "movi v22.4s, #0x0\n"
- "movi v23.4s, #0x0\n"
- "ldr q2, [x20, #0x0]\n"
- "movi v24.4s, #0x0\n"
- "movi v25.4s, #0x0\n"
- "ldr q3, [x20, #0x10]\n"
- "movi v26.4s, #0x0\n"
- "movi v27.4s, #0x0\n"
- "ldr q4, [x20, #0x20]\n"
- "movi v28.4s, #0x0\n"
- "movi v29.4s, #0x0\n"
- "movi v30.4s, #0x0\n"
- "movi v31.4s, #0x0\n"
- "blt 4f\n"
- "3:" // main loop head
- ".inst 0x4f80e048 // sdot v8.4s, v2.16b, v0.4b[0]\n"
- ".inst 0x4fa0e04b // sdot v11.4s, v2.16b, v0.4b[1]\n"
- "sub x19, x19, #0x2\n"
- ".inst 0x4f80e84e // sdot v14.4s, v2.16b, v0.4b[2]\n"
- ".inst 0x4fa0e851 // sdot v17.4s, v2.16b, v0.4b[3]\n"
- "cmp x19, #0x2\n"
- ".inst 0x4f81e054 // sdot v20.4s, v2.16b, v1.4b[0]\n"
- ".inst 0x4fa1e057 // sdot v23.4s, v2.16b, v1.4b[1]\n"
- "prfm pldl1keep, [%x[Apanel], #0x80]\n"
- ".inst 0x4f81e85a // sdot v26.4s, v2.16b, v1.4b[2]\n"
- ".inst 0x4fa1e85d // sdot v29.4s, v2.16b, v1.4b[3]\n"
- "ldr q2, [x20, #0x30]\n"
- ".inst 0x4f80e069 // sdot v9.4s, v3.16b, v0.4b[0]\n"
- ".inst 0x4fa0e06c // sdot v12.4s, v3.16b, v0.4b[1]\n"
- "prfm pldl1keep, [x20, #0x100]\n"
- ".inst 0x4f80e86f // sdot v15.4s, v3.16b, v0.4b[2]\n"
- ".inst 0x4fa0e872 // sdot v18.4s, v3.16b, v0.4b[3]\n"
- "prfm pldl1keep, [x20, #0x140]\n"
- ".inst 0x4f81e075 // sdot v21.4s, v3.16b, v1.4b[0]\n"
- ".inst 0x4fa1e078 // sdot v24.4s, v3.16b, v1.4b[1]\n"
- ".inst 0x4f81e87b // sdot v27.4s, v3.16b, v1.4b[2]\n"
- ".inst 0x4fa1e87e // sdot v30.4s, v3.16b, v1.4b[3]\n"
- "ldr q3, [x20, #0x40]\n"
- ".inst 0x4f80e08a // sdot v10.4s, v4.16b, v0.4b[0]\n"
- ".inst 0x4fa0e08d // sdot v13.4s, v4.16b, v0.4b[1]\n"
- ".inst 0x4f80e890 // sdot v16.4s, v4.16b, v0.4b[2]\n"
- ".inst 0x4fa0e893 // sdot v19.4s, v4.16b, v0.4b[3]\n"
- "ldr q0, [%x[Apanel], #0x20]\n"
- ".inst 0x4f81e096 // sdot v22.4s, v4.16b, v1.4b[0]\n"
- ".inst 0x4fa1e099 // sdot v25.4s, v4.16b, v1.4b[1]\n"
- ".inst 0x4f81e89c // sdot v28.4s, v4.16b, v1.4b[2]\n"
- ".inst 0x4fa1e89f // sdot v31.4s, v4.16b, v1.4b[3]\n"
- "ldr q1, [%x[Apanel], #0x30]\n"
- "ldr q4, [x20, #0x50]\n"
- "add %x[Apanel], %x[Apanel], #0x40\n"
- "add x20, x20, #0x60\n"
- ".inst 0x4f80e048 // sdot v8.4s, v2.16b, v0.4b[0]\n"
- ".inst 0x4fa0e04b // sdot v11.4s, v2.16b, v0.4b[1]\n"
- ".inst 0x4f80e84e // sdot v14.4s, v2.16b, v0.4b[2]\n"
- ".inst 0x4fa0e851 // sdot v17.4s, v2.16b, v0.4b[3]\n"
- ".inst 0x4f81e054 // sdot v20.4s, v2.16b, v1.4b[0]\n"
- ".inst 0x4fa1e057 // sdot v23.4s, v2.16b, v1.4b[1]\n"
- ".inst 0x4f81e85a // sdot v26.4s, v2.16b, v1.4b[2]\n"
- ".inst 0x4fa1e85d // sdot v29.4s, v2.16b, v1.4b[3]\n"
- "ldr q2, [x20, #0x0]\n"
- ".inst 0x4f80e069 // sdot v9.4s, v3.16b, v0.4b[0]\n"
- ".inst 0x4fa0e06c // sdot v12.4s, v3.16b, v0.4b[1]\n"
- ".inst 0x4f80e86f // sdot v15.4s, v3.16b, v0.4b[2]\n"
- ".inst 0x4fa0e872 // sdot v18.4s, v3.16b, v0.4b[3]\n"
- ".inst 0x4f81e075 // sdot v21.4s, v3.16b, v1.4b[0]\n"
- ".inst 0x4fa1e078 // sdot v24.4s, v3.16b, v1.4b[1]\n"
- ".inst 0x4f81e87b // sdot v27.4s, v3.16b, v1.4b[2]\n"
- ".inst 0x4fa1e87e // sdot v30.4s, v3.16b, v1.4b[3]\n"
- "ldr q3, [x20, #0x10]\n"
- ".inst 0x4f80e08a // sdot v10.4s, v4.16b, v0.4b[0]\n"
- ".inst 0x4fa0e08d // sdot v13.4s, v4.16b, v0.4b[1]\n"
- ".inst 0x4f80e890 // sdot v16.4s, v4.16b, v0.4b[2]\n"
- ".inst 0x4fa0e893 // sdot v19.4s, v4.16b, v0.4b[3]\n"
- "ldr q0, [%x[Apanel], #0x0]\n"
- ".inst 0x4f81e096 // sdot v22.4s, v4.16b, v1.4b[0]\n"
- ".inst 0x4fa1e099 // sdot v25.4s, v4.16b, v1.4b[1]\n"
- ".inst 0x4f81e89c // sdot v28.4s, v4.16b, v1.4b[2]\n"
- ".inst 0x4fa1e89f // sdot v31.4s, v4.16b, v1.4b[3]\n"
- "ldr q1, [%x[Apanel], #0x10]\n"
- "ldr q4, [x20, #0x20]\n"
- "bge 3b\n"
- "4:" // main loop skip
- "add %x[Apanel], %x[Apanel], #0x20\n"
- ".inst 0x4f80e048 // sdot v8.4s, v2.16b, v0.4b[0]\n"
- ".inst 0x4fa0e04b // sdot v11.4s, v2.16b, v0.4b[1]\n"
- "add x20, x20, #0x30\n"
- ".inst 0x4f80e84e // sdot v14.4s, v2.16b, v0.4b[2]\n"
- ".inst 0x4fa0e851 // sdot v17.4s, v2.16b, v0.4b[3]\n"
- ".inst 0x4f81e054 // sdot v20.4s, v2.16b, v1.4b[0]\n"
- ".inst 0x4fa1e057 // sdot v23.4s, v2.16b, v1.4b[1]\n"
- ".inst 0x4f81e85a // sdot v26.4s, v2.16b, v1.4b[2]\n"
- ".inst 0x4fa1e85d // sdot v29.4s, v2.16b, v1.4b[3]\n"
- ".inst 0x4f80e069 // sdot v9.4s, v3.16b, v0.4b[0]\n"
- ".inst 0x4fa0e06c // sdot v12.4s, v3.16b, v0.4b[1]\n"
- ".inst 0x4f80e86f // sdot v15.4s, v3.16b, v0.4b[2]\n"
- ".inst 0x4fa0e872 // sdot v18.4s, v3.16b, v0.4b[3]\n"
- ".inst 0x4f81e075 // sdot v21.4s, v3.16b, v1.4b[0]\n"
- ".inst 0x4fa1e078 // sdot v24.4s, v3.16b, v1.4b[1]\n"
- ".inst 0x4f81e87b // sdot v27.4s, v3.16b, v1.4b[2]\n"
- ".inst 0x4fa1e87e // sdot v30.4s, v3.16b, v1.4b[3]\n"
- ".inst 0x4f80e08a // sdot v10.4s, v4.16b, v0.4b[0]\n"
- ".inst 0x4fa0e08d // sdot v13.4s, v4.16b, v0.4b[1]\n"
- ".inst 0x4f80e890 // sdot v16.4s, v4.16b, v0.4b[2]\n"
- ".inst 0x4fa0e893 // sdot v19.4s, v4.16b, v0.4b[3]\n"
- ".inst 0x4f81e096 // sdot v22.4s, v4.16b, v1.4b[0]\n"
- ".inst 0x4fa1e099 // sdot v25.4s, v4.16b, v1.4b[1]\n"
- ".inst 0x4f81e89c // sdot v28.4s, v4.16b, v1.4b[2]\n"
- ".inst 0x4fa1e89f // sdot v31.4s, v4.16b, v1.4b[3]\n"
- "cbz x19, 5f\n"
- "ldr q0, [%x[Apanel], #0x0]\n"
- "ldr q1, [%x[Apanel], #0x10]\n"
- "add %x[Apanel], %x[Apanel], #0x20\n"
- "ldr q5, [x20, #0x0]\n"
- "ldr q6, [x20, #0x10]\n"
- ".inst 0x4f80e0a8 // sdot v8.4s, v5.16b, v0.4b[0]\n"
- "ldr q7, [x20, #0x20]\n"
- ".inst 0x4fa0e0ab // sdot v11.4s, v5.16b, v0.4b[1]\n"
- ".inst 0x4f80e8ae // sdot v14.4s, v5.16b, v0.4b[2]\n"
- "add x20, x20, #0x30\n"
- ".inst 0x4fa0e8b1 // sdot v17.4s, v5.16b, v0.4b[3]\n"
- ".inst 0x4f81e0b4 // sdot v20.4s, v5.16b, v1.4b[0]\n"
- ".inst 0x4fa1e0b7 // sdot v23.4s, v5.16b, v1.4b[1]\n"
- ".inst 0x4f81e8ba // sdot v26.4s, v5.16b, v1.4b[2]\n"
- ".inst 0x4fa1e8bd // sdot v29.4s, v5.16b, v1.4b[3]\n"
- ".inst 0x4f80e0c9 // sdot v9.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x4fa0e0cc // sdot v12.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x4f80e8cf // sdot v15.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x4fa0e8d2 // sdot v18.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x4f81e0d5 // sdot v21.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x4fa1e0d8 // sdot v24.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x4f81e8db // sdot v27.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x4fa1e8de // sdot v30.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x4f80e0ea // sdot v10.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x4fa0e0ed // sdot v13.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x4f80e8f0 // sdot v16.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x4fa0e8f3 // sdot v19.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x4f81e0f6 // sdot v22.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x4fa1e0f9 // sdot v25.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x4f81e8fc // sdot v28.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x4fa1e8ff // sdot v31.4s, v7.16b, v1.4b[3]\n"
- "5:" // multiply loop done
- "subs x22, x22, #0x1\n"
- "str q8, [%x[Cpanel], #0x0]\n"
- "str q9, [%x[Cpanel], #0x10]\n"
- "str q10, [%x[Cpanel], #0x20]\n"
- "str q11, [%x[Cpanel], #0x30]\n"
- "str q12, [%x[Cpanel], #0x40]\n"
- "str q13, [%x[Cpanel], #0x50]\n"
- "str q14, [%x[Cpanel], #0x60]\n"
- "str q15, [%x[Cpanel], #0x70]\n"
- "str q16, [%x[Cpanel], #0x80]\n"
- "str q17, [%x[Cpanel], #0x90]\n"
- "str q18, [%x[Cpanel], #0xa0]\n"
- "str q19, [%x[Cpanel], #0xb0]\n"
- "str q20, [%x[Cpanel], #0xc0]\n"
- "str q21, [%x[Cpanel], #0xd0]\n"
- "str q22, [%x[Cpanel], #0xe0]\n"
- "str q23, [%x[Cpanel], #0xf0]\n"
- "str q24, [%x[Cpanel], #0x100]\n"
- "str q25, [%x[Cpanel], #0x110]\n"
- "str q26, [%x[Cpanel], #0x120]\n"
- "str q27, [%x[Cpanel], #0x130]\n"
- "str q28, [%x[Cpanel], #0x140]\n"
- "str q29, [%x[Cpanel], #0x150]\n"
- "str q30, [%x[Cpanel], #0x160]\n"
- "str q31, [%x[Cpanel], #0x170]\n"
- "add %x[Cpanel], %x[Cpanel], #0x180\n"
- "bgt 2b\n"
- "subs %x[ablocks], %x[ablocks], #0x1\n"
- "bne 1b\n"
- : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
- : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks))
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22"
- );
-}
-
-} // namespace arm_gemm
-#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_dot_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_dot_8x12.hpp
deleted file mode 100644
index 000cc680da..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_dot_8x12.hpp
+++ /dev/null
@@ -1,110 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-#pragma once
-
-#ifdef __aarch64__
-#include "../std_transforms_fixed.hpp"
-#include "../performance_parameters.hpp"
-
-#define ARGLIST \
- const uint8_t *, const uint8_t *, \
- uint32_t *, int, int, int
-
-namespace arm_gemm
-{
-// Actual kernel implementations
-void a64_interleaved_u8u32_dot_8x12( ARGLIST );
-void a64_interleaved_u8u32_dot_8x12_a55( ARGLIST );
-void a64_interleaved_u8u32_dot_8x12_x1( ARGLIST );
-
-class cls_a64_interleaved_u8u32_dot_8x12
-{
-public:
- typedef uint8_t operand_type;
- typedef uint32_t result_type;
-
- typedef void (*kern_type)( ARGLIST );
-
- /* Kernel blocking parameters */
- static constexpr unsigned int out_height()
- {
- return 8;
- }
-
- static unsigned int out_width()
- {
- return 12;
- }
-
- static unsigned int stripe_width()
- {
- return 4;
- }
-
- static constexpr unsigned int k_unroll()
- {
- return 4;
- }
-
-
- StdTransformsFixed<operand_type, result_type, 8, 12, 4> transforms = {};
- StdTransformsFixed<operand_type, result_type, 8, 12, 4, true> transforms_quantized = {};
- template<typename T>
- static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
- {
-
- if (std::is_same<T, uint8_t>::value) {
- switch (ci->get_cpu_model()) {
- case CPUModel::A55r1:
- return { 15.361, 0.9341, 0.1636 };
- default:
- return { 29.0698, 3.9793, 0.4003 };
- }
- }
-
- return { 1.0 };
- }
-
- // Default to the generic kernel
- kern_type kernel=a64_interleaved_u8u32_dot_8x12;
- cls_a64_interleaved_u8u32_dot_8x12(const CPUInfo *ci)
- {
- switch(ci->get_cpu_model()) {
- default:
- break;
- case CPUModel::A55r1:
- kernel=a64_interleaved_u8u32_dot_8x12_a55;
- break;
- case CPUModel::X1:
- kernel=a64_interleaved_u8u32_dot_8x12_x1;
- break;
- }
- }
-};
-
-} // namespace arm_gemm
-
-#undef ARGLIST
-
-#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_dot_8x12/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_dot_8x12/a55.cpp
deleted file mode 100644
index 7892306153..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_dot_8x12/a55.cpp
+++ /dev/null
@@ -1,273 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-#ifdef __aarch64__
-
-#include <cstddef>
-#include <cstdint>
-
-namespace arm_gemm {
-
-void a64_interleaved_u8u32_dot_8x12_a55(
- const uint8_t *Apanel, const uint8_t *Bpanel,
- uint32_t *Cpanel, int ablocks, int bblocks, int K) {
-
- struct KernelArgs {
- size_t bblocks = {};
- size_t K = {};
- const uint8_t *Bpanel = {};
- } ka;
-
- ka.bblocks = bblocks;
- ka.K = (K/4) - 1;
- ka.Bpanel = Bpanel;
-
- __asm__ __volatile__(
-
- "1:" // Height loop
- "ldr x27, [%x[args_ptr], %[offsetof_bblocks]]\n"
- "mov x26, %x[Apanel]\n"
- "ldr x25, [%x[args_ptr], %[offsetof_Bpanel]]\n"
- "2:" // Width loop
- "ldr x24, [%x[args_ptr], %[offsetof_K]]\n"
- "mov %x[Apanel], x26\n"
- "cmp x24, #0x2\n"
- "movi v8.4s, #0x0\n"
- "movi v9.4s, #0x0\n"
- "prfm pldl1keep, [%x[Apanel], #0x0]\n"
- "movi v10.4s, #0x0\n"
- "prfm pldl1keep, [x25, #0x0]\n"
- "movi v11.4s, #0x0\n"
- "prfm pldl1keep, [x25, #0x40]\n"
- "movi v12.4s, #0x0\n"
- "prfm pldl1keep, [%x[Apanel], #0x40]\n"
- "movi v13.4s, #0x0\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "movi v14.4s, #0x0\n"
- "ldr q0, [%x[Apanel], #0x0]\n"
- "movi v15.4s, #0x0\n"
- "ldr q1, [%x[Apanel], #0x10]\n"
- "movi v16.4s, #0x0\n"
- "ldr q4, [x25, #0x0]\n"
- "movi v17.4s, #0x0\n"
- "ldr q5, [x25, #0x10]\n"
- "movi v18.4s, #0x0\n"
- "ldr q6, [x25, #0x20]\n"
- "movi v19.4s, #0x0\n"
- "movi v20.4s, #0x0\n"
- "movi v21.4s, #0x0\n"
- "movi v22.4s, #0x0\n"
- "movi v23.4s, #0x0\n"
- "movi v24.4s, #0x0\n"
- "movi v25.4s, #0x0\n"
- "movi v26.4s, #0x0\n"
- "movi v27.4s, #0x0\n"
- "movi v28.4s, #0x0\n"
- "movi v29.4s, #0x0\n"
- "movi v30.4s, #0x0\n"
- "movi v31.4s, #0x0\n"
- "blt 4f\n"
- "3:" // main loop head
- ".inst 0x6f80e088 // udot v8.4s, v4.16b, v0.4b[0]\n"
- "ldr d2, [%x[Apanel], #0x20]\n"
- "ldr x23, [%x[Apanel], #0x28]\n"
- ".inst 0x6fa0e08b // udot v11.4s, v4.16b, v0.4b[1]\n"
- "ldr d3, [%x[Apanel], #0x30]\n"
- ".inst 0x6f80e88e // udot v14.4s, v4.16b, v0.4b[2]\n"
- "ldr x19, [%x[Apanel], #0x38]\n"
- ".inst 0x6fa0e891 // udot v17.4s, v4.16b, v0.4b[3]\n"
- ".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n"
- "ldr x22, [x25, #0x38]\n"
- ".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n"
- "ldr x20, [x25, #0x48]\n"
- ".inst 0x6f81e89a // udot v26.4s, v4.16b, v1.4b[2]\n"
- "ldr x21, [x25, #0x58]\n"
- ".inst 0x6fa1e89d // udot v29.4s, v4.16b, v1.4b[3]\n"
- "ldr d4, [x25, #0x30]\n"
- ".inst 0x6f80e0a9 // udot v9.4s, v5.16b, v0.4b[0]\n"
- "mov v2.d[1], x23\n"
- ".inst 0x6fa0e0ac // udot v12.4s, v5.16b, v0.4b[1]\n"
- "mov v3.d[1], x19\n"
- ".inst 0x6f80e8af // udot v15.4s, v5.16b, v0.4b[2]\n"
- "mov v4.d[1], x22\n"
- ".inst 0x6fa0e8b2 // udot v18.4s, v5.16b, v0.4b[3]\n"
- "prfm pldl1keep, [%x[Apanel], #0x80]\n"
- ".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n"
- "add %x[Apanel], %x[Apanel], #0x40\n"
- ".inst 0x6fa1e0b8 // udot v24.4s, v5.16b, v1.4b[1]\n"
- "prfm pldl1keep, [x25, #0x100]\n"
- ".inst 0x6f81e8bb // udot v27.4s, v5.16b, v1.4b[2]\n"
- "prfm pldl1keep, [x25, #0x140]\n"
- ".inst 0x6fa1e8be // udot v30.4s, v5.16b, v1.4b[3]\n"
- "ldr d5, [x25, #0x40]\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- "mov v5.d[1], x20\n"
- ".inst 0x6fa0e0cd // udot v13.4s, v6.16b, v0.4b[1]\n"
- "ldr x20, [%x[Apanel], #0x8]\n"
- ".inst 0x6f80e8d0 // udot v16.4s, v6.16b, v0.4b[2]\n"
- "ldr x19, [%x[Apanel], #0x18]\n"
- ".inst 0x6fa0e8d3 // udot v19.4s, v6.16b, v0.4b[3]\n"
- "ldr d0, [%x[Apanel], #0x0]\n"
- ".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n"
- "sub x24, x24, #0x2\n"
- ".inst 0x6fa1e0d9 // udot v25.4s, v6.16b, v1.4b[1]\n"
- "cmp x24, #0x2\n"
- ".inst 0x6f81e8dc // udot v28.4s, v6.16b, v1.4b[2]\n"
- "mov v0.d[1], x20\n"
- ".inst 0x6fa1e8df // udot v31.4s, v6.16b, v1.4b[3]\n"
- "ldr d6, [x25, #0x50]\n"
- "mov v6.d[1], x21\n"
- "add x25, x25, #0x60\n"
- ".inst 0x6f82e088 // udot v8.4s, v4.16b, v2.4b[0]\n"
- "ldr d1, [%x[Apanel], #0x10]\n"
- ".inst 0x6fa2e08b // udot v11.4s, v4.16b, v2.4b[1]\n"
- "ldr x22, [x25, #0x8]\n"
- ".inst 0x6f82e88e // udot v14.4s, v4.16b, v2.4b[2]\n"
- "ldr x20, [x25, #0x18]\n"
- ".inst 0x6fa2e891 // udot v17.4s, v4.16b, v2.4b[3]\n"
- "ldr x21, [x25, #0x28]\n"
- ".inst 0x6f83e094 // udot v20.4s, v4.16b, v3.4b[0]\n"
- "mov v1.d[1], x19\n"
- ".inst 0x6fa3e097 // udot v23.4s, v4.16b, v3.4b[1]\n"
- ".inst 0x6f83e89a // udot v26.4s, v4.16b, v3.4b[2]\n"
- ".inst 0x6fa3e89d // udot v29.4s, v4.16b, v3.4b[3]\n"
- "ldr d4, [x25, #0x0]\n"
- ".inst 0x6f82e0a9 // udot v9.4s, v5.16b, v2.4b[0]\n"
- "mov v4.d[1], x22\n"
- ".inst 0x6fa2e0ac // udot v12.4s, v5.16b, v2.4b[1]\n"
- ".inst 0x6f82e8af // udot v15.4s, v5.16b, v2.4b[2]\n"
- ".inst 0x6fa2e8b2 // udot v18.4s, v5.16b, v2.4b[3]\n"
- ".inst 0x6f83e0b5 // udot v21.4s, v5.16b, v3.4b[0]\n"
- ".inst 0x6fa3e0b8 // udot v24.4s, v5.16b, v3.4b[1]\n"
- ".inst 0x6f83e8bb // udot v27.4s, v5.16b, v3.4b[2]\n"
- ".inst 0x6fa3e8be // udot v30.4s, v5.16b, v3.4b[3]\n"
- "ldr d5, [x25, #0x10]\n"
- ".inst 0x6f82e0ca // udot v10.4s, v6.16b, v2.4b[0]\n"
- "mov v5.d[1], x20\n"
- ".inst 0x6fa2e0cd // udot v13.4s, v6.16b, v2.4b[1]\n"
- ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x6fa2e8d3 // udot v19.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x6fa3e0d9 // udot v25.4s, v6.16b, v3.4b[1]\n"
- ".inst 0x6f83e8dc // udot v28.4s, v6.16b, v3.4b[2]\n"
- ".inst 0x6fa3e8df // udot v31.4s, v6.16b, v3.4b[3]\n"
- "ldr d6, [x25, #0x20]\n"
- "mov v6.d[1], x21\n"
- "bge 3b\n"
- "4:" // main loop skip
- "add %x[Apanel], %x[Apanel], #0x20\n"
- ".inst 0x6f80e088 // udot v8.4s, v4.16b, v0.4b[0]\n"
- "add x25, x25, #0x30\n"
- ".inst 0x6fa0e08b // udot v11.4s, v4.16b, v0.4b[1]\n"
- ".inst 0x6f80e88e // udot v14.4s, v4.16b, v0.4b[2]\n"
- ".inst 0x6fa0e891 // udot v17.4s, v4.16b, v0.4b[3]\n"
- ".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n"
- ".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n"
- ".inst 0x6f81e89a // udot v26.4s, v4.16b, v1.4b[2]\n"
- ".inst 0x6fa1e89d // udot v29.4s, v4.16b, v1.4b[3]\n"
- ".inst 0x6f80e0a9 // udot v9.4s, v5.16b, v0.4b[0]\n"
- ".inst 0x6fa0e0ac // udot v12.4s, v5.16b, v0.4b[1]\n"
- ".inst 0x6f80e8af // udot v15.4s, v5.16b, v0.4b[2]\n"
- ".inst 0x6fa0e8b2 // udot v18.4s, v5.16b, v0.4b[3]\n"
- ".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n"
- ".inst 0x6fa1e0b8 // udot v24.4s, v5.16b, v1.4b[1]\n"
- ".inst 0x6f81e8bb // udot v27.4s, v5.16b, v1.4b[2]\n"
- ".inst 0x6fa1e8be // udot v30.4s, v5.16b, v1.4b[3]\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x6fa0e0cd // udot v13.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x6f80e8d0 // udot v16.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x6fa0e8d3 // udot v19.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6fa1e0d9 // udot v25.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x6f81e8dc // udot v28.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x6fa1e8df // udot v31.4s, v6.16b, v1.4b[3]\n"
- "cbz x24, 5f\n"
- "ldr q0, [%x[Apanel], #0x0]\n"
- "ldr q1, [%x[Apanel], #0x10]\n"
- "add %x[Apanel], %x[Apanel], #0x20\n"
- "ldr q7, [x25, #0x0]\n"
- ".inst 0x6f80e0e8 // udot v8.4s, v7.16b, v0.4b[0]\n"
- "ldr q4, [x25, #0x10]\n"
- ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
- "ldr q5, [x25, #0x20]\n"
- ".inst 0x6f80e8ee // udot v14.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x6fa0e8f1 // udot v17.4s, v7.16b, v0.4b[3]\n"
- "add x25, x25, #0x30\n"
- ".inst 0x6f81e0f4 // udot v20.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6fa1e0f7 // udot v23.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x6f81e8fa // udot v26.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x6fa1e8fd // udot v29.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x6f80e089 // udot v9.4s, v4.16b, v0.4b[0]\n"
- ".inst 0x6fa0e08c // udot v12.4s, v4.16b, v0.4b[1]\n"
- ".inst 0x6f80e88f // udot v15.4s, v4.16b, v0.4b[2]\n"
- ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n"
- ".inst 0x6f81e095 // udot v21.4s, v4.16b, v1.4b[0]\n"
- ".inst 0x6fa1e098 // udot v24.4s, v4.16b, v1.4b[1]\n"
- ".inst 0x6f81e89b // udot v27.4s, v4.16b, v1.4b[2]\n"
- ".inst 0x6fa1e89e // udot v30.4s, v4.16b, v1.4b[3]\n"
- ".inst 0x6f80e0aa // udot v10.4s, v5.16b, v0.4b[0]\n"
- ".inst 0x6fa0e0ad // udot v13.4s, v5.16b, v0.4b[1]\n"
- ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n"
- ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n"
- ".inst 0x6f81e0b6 // udot v22.4s, v5.16b, v1.4b[0]\n"
- ".inst 0x6fa1e0b9 // udot v25.4s, v5.16b, v1.4b[1]\n"
- ".inst 0x6f81e8bc // udot v28.4s, v5.16b, v1.4b[2]\n"
- ".inst 0x6fa1e8bf // udot v31.4s, v5.16b, v1.4b[3]\n"
- "5:" // multiply loop done
- "subs x27, x27, #0x1\n"
- "str q8, [%x[Cpanel], #0x0]\n"
- "str q9, [%x[Cpanel], #0x10]\n"
- "str q10, [%x[Cpanel], #0x20]\n"
- "str q11, [%x[Cpanel], #0x30]\n"
- "str q12, [%x[Cpanel], #0x40]\n"
- "str q13, [%x[Cpanel], #0x50]\n"
- "str q14, [%x[Cpanel], #0x60]\n"
- "str q15, [%x[Cpanel], #0x70]\n"
- "str q16, [%x[Cpanel], #0x80]\n"
- "str q17, [%x[Cpanel], #0x90]\n"
- "str q18, [%x[Cpanel], #0xa0]\n"
- "str q19, [%x[Cpanel], #0xb0]\n"
- "str q20, [%x[Cpanel], #0xc0]\n"
- "str q21, [%x[Cpanel], #0xd0]\n"
- "str q22, [%x[Cpanel], #0xe0]\n"
- "str q23, [%x[Cpanel], #0xf0]\n"
- "str q24, [%x[Cpanel], #0x100]\n"
- "str q25, [%x[Cpanel], #0x110]\n"
- "str q26, [%x[Cpanel], #0x120]\n"
- "str q27, [%x[Cpanel], #0x130]\n"
- "str q28, [%x[Cpanel], #0x140]\n"
- "str q29, [%x[Cpanel], #0x150]\n"
- "str q30, [%x[Cpanel], #0x160]\n"
- "str q31, [%x[Cpanel], #0x170]\n"
- "add %x[Cpanel], %x[Cpanel], #0x180\n"
- "bgt 2b\n"
- "subs %x[ablocks], %x[ablocks], #0x1\n"
- "bne 1b\n"
- : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
- : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks))
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
- );
-}
-
-} // namespace arm_gemm
-#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_dot_8x12/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_dot_8x12/generic.cpp
deleted file mode 100644
index 42226e90f5..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_dot_8x12/generic.cpp
+++ /dev/null
@@ -1,253 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-#ifdef __aarch64__
-
-#include <cstddef>
-#include <cstdint>
-
-namespace arm_gemm {
-
-void a64_interleaved_u8u32_dot_8x12(
- const uint8_t *Apanel, const uint8_t *Bpanel,
- uint32_t *Cpanel, int ablocks, int bblocks, int K) {
-
- struct KernelArgs {
- size_t bblocks = {};
- size_t K = {};
- const uint8_t *Bpanel = {};
- } ka;
-
- ka.bblocks = bblocks;
- ka.K = (K/4) - 1;
- ka.Bpanel = Bpanel;
-
- __asm__ __volatile__(
-
- "1:" // Height loop
- "ldr x22, [%x[args_ptr], %[offsetof_bblocks]]\n"
- "mov x21, %x[Apanel]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n"
- "2:" // Width loop
- "ldr x19, [%x[args_ptr], %[offsetof_K]]\n"
- "mov %x[Apanel], x21\n"
- "cmp x19, #0x2\n"
- "movi v8.4s, #0x0\n"
- "movi v9.4s, #0x0\n"
- "prfm pldl1keep, [%x[Apanel], #0x0]\n"
- "movi v10.4s, #0x0\n"
- "movi v11.4s, #0x0\n"
- "prfm pldl1keep, [x20, #0x0]\n"
- "movi v12.4s, #0x0\n"
- "movi v13.4s, #0x0\n"
- "prfm pldl1keep, [x20, #0x40]\n"
- "movi v14.4s, #0x0\n"
- "movi v15.4s, #0x0\n"
- "prfm pldl1keep, [%x[Apanel], #0x40]\n"
- "movi v16.4s, #0x0\n"
- "movi v17.4s, #0x0\n"
- "prfm pldl1keep, [x20, #0x80]\n"
- "movi v18.4s, #0x0\n"
- "movi v19.4s, #0x0\n"
- "ldr q0, [%x[Apanel], #0x0]\n"
- "movi v20.4s, #0x0\n"
- "movi v21.4s, #0x0\n"
- "ldr q1, [%x[Apanel], #0x10]\n"
- "movi v22.4s, #0x0\n"
- "movi v23.4s, #0x0\n"
- "ldr q4, [x20, #0x0]\n"
- "movi v24.4s, #0x0\n"
- "movi v25.4s, #0x0\n"
- "ldr q5, [x20, #0x10]\n"
- "movi v26.4s, #0x0\n"
- "movi v27.4s, #0x0\n"
- "ldr q6, [x20, #0x20]\n"
- "movi v28.4s, #0x0\n"
- "movi v29.4s, #0x0\n"
- "movi v30.4s, #0x0\n"
- "movi v31.4s, #0x0\n"
- "blt 4f\n"
- "3:" // main loop head
- ".inst 0x6f80e088 // udot v8.4s, v4.16b, v0.4b[0]\n"
- ".inst 0x6fa0e08b // udot v11.4s, v4.16b, v0.4b[1]\n"
- "ldr q2, [%x[Apanel], #0x20]\n"
- ".inst 0x6f80e88e // udot v14.4s, v4.16b, v0.4b[2]\n"
- ".inst 0x6fa0e891 // udot v17.4s, v4.16b, v0.4b[3]\n"
- "ldr q3, [%x[Apanel], #0x30]\n"
- ".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n"
- ".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n"
- "sub x19, x19, #0x2\n"
- ".inst 0x6f81e89a // udot v26.4s, v4.16b, v1.4b[2]\n"
- ".inst 0x6fa1e89d // udot v29.4s, v4.16b, v1.4b[3]\n"
- "ldr q4, [x20, #0x30]\n"
- ".inst 0x6f80e0a9 // udot v9.4s, v5.16b, v0.4b[0]\n"
- ".inst 0x6fa0e0ac // udot v12.4s, v5.16b, v0.4b[1]\n"
- "cmp x19, #0x2\n"
- ".inst 0x6f80e8af // udot v15.4s, v5.16b, v0.4b[2]\n"
- ".inst 0x6fa0e8b2 // udot v18.4s, v5.16b, v0.4b[3]\n"
- "prfm pldl1keep, [%x[Apanel], #0x80]\n"
- ".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n"
- ".inst 0x6fa1e0b8 // udot v24.4s, v5.16b, v1.4b[1]\n"
- "add %x[Apanel], %x[Apanel], #0x40\n"
- ".inst 0x6f81e8bb // udot v27.4s, v5.16b, v1.4b[2]\n"
- ".inst 0x6fa1e8be // udot v30.4s, v5.16b, v1.4b[3]\n"
- "ldr q5, [x20, #0x40]\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x6fa0e0cd // udot v13.4s, v6.16b, v0.4b[1]\n"
- "prfm pldl1keep, [x20, #0x100]\n"
- ".inst 0x6f80e8d0 // udot v16.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x6fa0e8d3 // udot v19.4s, v6.16b, v0.4b[3]\n"
- "prfm pldl1keep, [x20, #0x140]\n"
- ".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6fa1e0d9 // udot v25.4s, v6.16b, v1.4b[1]\n"
- "ldr q0, [%x[Apanel], #0x0]\n"
- ".inst 0x6f81e8dc // udot v28.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x6fa1e8df // udot v31.4s, v6.16b, v1.4b[3]\n"
- "ldr q6, [x20, #0x50]\n"
- "add x20, x20, #0x60\n"
- ".inst 0x6f82e088 // udot v8.4s, v4.16b, v2.4b[0]\n"
- ".inst 0x6fa2e08b // udot v11.4s, v4.16b, v2.4b[1]\n"
- "ldr q1, [%x[Apanel], #0x10]\n"
- ".inst 0x6f82e88e // udot v14.4s, v4.16b, v2.4b[2]\n"
- ".inst 0x6fa2e891 // udot v17.4s, v4.16b, v2.4b[3]\n"
- ".inst 0x6f83e094 // udot v20.4s, v4.16b, v3.4b[0]\n"
- ".inst 0x6fa3e097 // udot v23.4s, v4.16b, v3.4b[1]\n"
- ".inst 0x6f83e89a // udot v26.4s, v4.16b, v3.4b[2]\n"
- ".inst 0x6fa3e89d // udot v29.4s, v4.16b, v3.4b[3]\n"
- "ldr q4, [x20, #0x0]\n"
- ".inst 0x6f82e0a9 // udot v9.4s, v5.16b, v2.4b[0]\n"
- ".inst 0x6fa2e0ac // udot v12.4s, v5.16b, v2.4b[1]\n"
- ".inst 0x6f82e8af // udot v15.4s, v5.16b, v2.4b[2]\n"
- ".inst 0x6fa2e8b2 // udot v18.4s, v5.16b, v2.4b[3]\n"
- ".inst 0x6f83e0b5 // udot v21.4s, v5.16b, v3.4b[0]\n"
- ".inst 0x6fa3e0b8 // udot v24.4s, v5.16b, v3.4b[1]\n"
- ".inst 0x6f83e8bb // udot v27.4s, v5.16b, v3.4b[2]\n"
- ".inst 0x6fa3e8be // udot v30.4s, v5.16b, v3.4b[3]\n"
- "ldr q5, [x20, #0x10]\n"
- ".inst 0x6f82e0ca // udot v10.4s, v6.16b, v2.4b[0]\n"
- ".inst 0x6fa2e0cd // udot v13.4s, v6.16b, v2.4b[1]\n"
- ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n"
- ".inst 0x6fa2e8d3 // udot v19.4s, v6.16b, v2.4b[3]\n"
- ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
- ".inst 0x6fa3e0d9 // udot v25.4s, v6.16b, v3.4b[1]\n"
- ".inst 0x6f83e8dc // udot v28.4s, v6.16b, v3.4b[2]\n"
- ".inst 0x6fa3e8df // udot v31.4s, v6.16b, v3.4b[3]\n"
- "ldr q6, [x20, #0x20]\n"
- "bge 3b\n"
- "4:" // main loop skip
- "add %x[Apanel], %x[Apanel], #0x20\n"
- ".inst 0x6f80e088 // udot v8.4s, v4.16b, v0.4b[0]\n"
- ".inst 0x6fa0e08b // udot v11.4s, v4.16b, v0.4b[1]\n"
- "add x20, x20, #0x30\n"
- ".inst 0x6f80e88e // udot v14.4s, v4.16b, v0.4b[2]\n"
- ".inst 0x6fa0e891 // udot v17.4s, v4.16b, v0.4b[3]\n"
- ".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n"
- ".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n"
- ".inst 0x6f81e89a // udot v26.4s, v4.16b, v1.4b[2]\n"
- ".inst 0x6fa1e89d // udot v29.4s, v4.16b, v1.4b[3]\n"
- ".inst 0x6f80e0a9 // udot v9.4s, v5.16b, v0.4b[0]\n"
- ".inst 0x6fa0e0ac // udot v12.4s, v5.16b, v0.4b[1]\n"
- ".inst 0x6f80e8af // udot v15.4s, v5.16b, v0.4b[2]\n"
- ".inst 0x6fa0e8b2 // udot v18.4s, v5.16b, v0.4b[3]\n"
- ".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n"
- ".inst 0x6fa1e0b8 // udot v24.4s, v5.16b, v1.4b[1]\n"
- ".inst 0x6f81e8bb // udot v27.4s, v5.16b, v1.4b[2]\n"
- ".inst 0x6fa1e8be // udot v30.4s, v5.16b, v1.4b[3]\n"
- ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x6fa0e0cd // udot v13.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x6f80e8d0 // udot v16.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x6fa0e8d3 // udot v19.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6fa1e0d9 // udot v25.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x6f81e8dc // udot v28.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x6fa1e8df // udot v31.4s, v6.16b, v1.4b[3]\n"
- "cbz x19, 5f\n"
- "ldr q0, [%x[Apanel], #0x0]\n"
- "ldr q1, [%x[Apanel], #0x10]\n"
- "add %x[Apanel], %x[Apanel], #0x20\n"
- "ldr q7, [x20, #0x0]\n"
- "ldr q4, [x20, #0x10]\n"
- ".inst 0x6f80e0e8 // udot v8.4s, v7.16b, v0.4b[0]\n"
- "ldr q5, [x20, #0x20]\n"
- ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x6f80e8ee // udot v14.4s, v7.16b, v0.4b[2]\n"
- "add x20, x20, #0x30\n"
- ".inst 0x6fa0e8f1 // udot v17.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x6f81e0f4 // udot v20.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6fa1e0f7 // udot v23.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x6f81e8fa // udot v26.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x6fa1e8fd // udot v29.4s, v7.16b, v1.4b[3]\n"
- ".inst 0x6f80e089 // udot v9.4s, v4.16b, v0.4b[0]\n"
- ".inst 0x6fa0e08c // udot v12.4s, v4.16b, v0.4b[1]\n"
- ".inst 0x6f80e88f // udot v15.4s, v4.16b, v0.4b[2]\n"
- ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n"
- ".inst 0x6f81e095 // udot v21.4s, v4.16b, v1.4b[0]\n"
- ".inst 0x6fa1e098 // udot v24.4s, v4.16b, v1.4b[1]\n"
- ".inst 0x6f81e89b // udot v27.4s, v4.16b, v1.4b[2]\n"
- ".inst 0x6fa1e89e // udot v30.4s, v4.16b, v1.4b[3]\n"
- ".inst 0x6f80e0aa // udot v10.4s, v5.16b, v0.4b[0]\n"
- ".inst 0x6fa0e0ad // udot v13.4s, v5.16b, v0.4b[1]\n"
- ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n"
- ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n"
- ".inst 0x6f81e0b6 // udot v22.4s, v5.16b, v1.4b[0]\n"
- ".inst 0x6fa1e0b9 // udot v25.4s, v5.16b, v1.4b[1]\n"
- ".inst 0x6f81e8bc // udot v28.4s, v5.16b, v1.4b[2]\n"
- ".inst 0x6fa1e8bf // udot v31.4s, v5.16b, v1.4b[3]\n"
- "5:" // multiply loop done
- "subs x22, x22, #0x1\n"
- "str q8, [%x[Cpanel], #0x0]\n"
- "str q9, [%x[Cpanel], #0x10]\n"
- "str q10, [%x[Cpanel], #0x20]\n"
- "str q11, [%x[Cpanel], #0x30]\n"
- "str q12, [%x[Cpanel], #0x40]\n"
- "str q13, [%x[Cpanel], #0x50]\n"
- "str q14, [%x[Cpanel], #0x60]\n"
- "str q15, [%x[Cpanel], #0x70]\n"
- "str q16, [%x[Cpanel], #0x80]\n"
- "str q17, [%x[Cpanel], #0x90]\n"
- "str q18, [%x[Cpanel], #0xa0]\n"
- "str q19, [%x[Cpanel], #0xb0]\n"
- "str q20, [%x[Cpanel], #0xc0]\n"
- "str q21, [%x[Cpanel], #0xd0]\n"
- "str q22, [%x[Cpanel], #0xe0]\n"
- "str q23, [%x[Cpanel], #0xf0]\n"
- "str q24, [%x[Cpanel], #0x100]\n"
- "str q25, [%x[Cpanel], #0x110]\n"
- "str q26, [%x[Cpanel], #0x120]\n"
- "str q27, [%x[Cpanel], #0x130]\n"
- "str q28, [%x[Cpanel], #0x140]\n"
- "str q29, [%x[Cpanel], #0x150]\n"
- "str q30, [%x[Cpanel], #0x160]\n"
- "str q31, [%x[Cpanel], #0x170]\n"
- "add %x[Cpanel], %x[Cpanel], #0x180\n"
- "bgt 2b\n"
- "subs %x[ablocks], %x[ablocks], #0x1\n"
- "bne 1b\n"
- : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
- : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks))
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22"
- );
-}
-
-} // namespace arm_gemm
-#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_dot_8x12/x1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_dot_8x12/x1.cpp
deleted file mode 100644
index 652f2bffc5..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_dot_8x12/x1.cpp
+++ /dev/null
@@ -1,253 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-#ifdef __aarch64__
-
-#include <cstddef>
-#include <cstdint>
-
-namespace arm_gemm {
-
-void a64_interleaved_u8u32_dot_8x12_x1(
- const uint8_t *Apanel, const uint8_t *Bpanel,
- uint32_t *Cpanel, int ablocks, int bblocks, int K) {
-
- struct KernelArgs {
- size_t bblocks = {};
- size_t K = {};
- const uint8_t *Bpanel = {};
- } ka;
-
- ka.bblocks = bblocks;
- ka.K = (K/4) - 1;
- ka.Bpanel = Bpanel;
-
- __asm__ __volatile__(
-
- "1:" // Height loop
- "ldr x22, [%x[args_ptr], %[offsetof_bblocks]]\n"
- "mov x21, %x[Apanel]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n"
- "2:" // Width loop
- "ldr x19, [%x[args_ptr], %[offsetof_K]]\n"
- "mov %x[Apanel], x21\n"
- "cmp x19, #0x2\n"
- "movi v8.4s, #0x0\n"
- "movi v9.4s, #0x0\n"
- "prfm pldl1keep, [%x[Apanel], #0x0]\n"
- "movi v10.4s, #0x0\n"
- "movi v11.4s, #0x0\n"
- "prfm pldl1keep, [x20, #0x0]\n"
- "movi v12.4s, #0x0\n"
- "movi v13.4s, #0x0\n"
- "prfm pldl1keep, [x20, #0x40]\n"
- "movi v14.4s, #0x0\n"
- "movi v15.4s, #0x0\n"
- "prfm pldl1keep, [%x[Apanel], #0x40]\n"
- "movi v16.4s, #0x0\n"
- "movi v17.4s, #0x0\n"
- "prfm pldl1keep, [x20, #0x80]\n"
- "movi v18.4s, #0x0\n"
- "movi v19.4s, #0x0\n"
- "ldr q0, [%x[Apanel], #0x0]\n"
- "movi v20.4s, #0x0\n"
- "movi v21.4s, #0x0\n"
- "ldr q1, [%x[Apanel], #0x10]\n"
- "movi v22.4s, #0x0\n"
- "movi v23.4s, #0x0\n"
- "ldr q2, [x20, #0x0]\n"
- "movi v24.4s, #0x0\n"
- "movi v25.4s, #0x0\n"
- "ldr q3, [x20, #0x10]\n"
- "movi v26.4s, #0x0\n"
- "movi v27.4s, #0x0\n"
- "ldr q4, [x20, #0x20]\n"
- "movi v28.4s, #0x0\n"
- "movi v29.4s, #0x0\n"
- "movi v30.4s, #0x0\n"
- "movi v31.4s, #0x0\n"
- "blt 4f\n"
- "3:" // main loop head
- ".inst 0x6f80e048 // udot v8.4s, v2.16b, v0.4b[0]\n"
- ".inst 0x6fa0e04b // udot v11.4s, v2.16b, v0.4b[1]\n"
- "sub x19, x19, #0x2\n"
- ".inst 0x6f80e84e // udot v14.4s, v2.16b, v0.4b[2]\n"
- ".inst 0x6fa0e851 // udot v17.4s, v2.16b, v0.4b[3]\n"
- "cmp x19, #0x2\n"
- ".inst 0x6f81e054 // udot v20.4s, v2.16b, v1.4b[0]\n"
- ".inst 0x6fa1e057 // udot v23.4s, v2.16b, v1.4b[1]\n"
- "prfm pldl1keep, [%x[Apanel], #0x80]\n"
- ".inst 0x6f81e85a // udot v26.4s, v2.16b, v1.4b[2]\n"
- ".inst 0x6fa1e85d // udot v29.4s, v2.16b, v1.4b[3]\n"
- "ldr q2, [x20, #0x30]\n"
- ".inst 0x6f80e069 // udot v9.4s, v3.16b, v0.4b[0]\n"
- ".inst 0x6fa0e06c // udot v12.4s, v3.16b, v0.4b[1]\n"
- "prfm pldl1keep, [x20, #0x100]\n"
- ".inst 0x6f80e86f // udot v15.4s, v3.16b, v0.4b[2]\n"
- ".inst 0x6fa0e872 // udot v18.4s, v3.16b, v0.4b[3]\n"
- "prfm pldl1keep, [x20, #0x140]\n"
- ".inst 0x6f81e075 // udot v21.4s, v3.16b, v1.4b[0]\n"
- ".inst 0x6fa1e078 // udot v24.4s, v3.16b, v1.4b[1]\n"
- ".inst 0x6f81e87b // udot v27.4s, v3.16b, v1.4b[2]\n"
- ".inst 0x6fa1e87e // udot v30.4s, v3.16b, v1.4b[3]\n"
- "ldr q3, [x20, #0x40]\n"
- ".inst 0x6f80e08a // udot v10.4s, v4.16b, v0.4b[0]\n"
- ".inst 0x6fa0e08d // udot v13.4s, v4.16b, v0.4b[1]\n"
- ".inst 0x6f80e890 // udot v16.4s, v4.16b, v0.4b[2]\n"
- ".inst 0x6fa0e893 // udot v19.4s, v4.16b, v0.4b[3]\n"
- "ldr q0, [%x[Apanel], #0x20]\n"
- ".inst 0x6f81e096 // udot v22.4s, v4.16b, v1.4b[0]\n"
- ".inst 0x6fa1e099 // udot v25.4s, v4.16b, v1.4b[1]\n"
- ".inst 0x6f81e89c // udot v28.4s, v4.16b, v1.4b[2]\n"
- ".inst 0x6fa1e89f // udot v31.4s, v4.16b, v1.4b[3]\n"
- "ldr q1, [%x[Apanel], #0x30]\n"
- "ldr q4, [x20, #0x50]\n"
- "add %x[Apanel], %x[Apanel], #0x40\n"
- "add x20, x20, #0x60\n"
- ".inst 0x6f80e048 // udot v8.4s, v2.16b, v0.4b[0]\n"
- ".inst 0x6fa0e04b // udot v11.4s, v2.16b, v0.4b[1]\n"
- ".inst 0x6f80e84e // udot v14.4s, v2.16b, v0.4b[2]\n"
- ".inst 0x6fa0e851 // udot v17.4s, v2.16b, v0.4b[3]\n"
- ".inst 0x6f81e054 // udot v20.4s, v2.16b, v1.4b[0]\n"
- ".inst 0x6fa1e057 // udot v23.4s, v2.16b, v1.4b[1]\n"
- ".inst 0x6f81e85a // udot v26.4s, v2.16b, v1.4b[2]\n"
- ".inst 0x6fa1e85d // udot v29.4s, v2.16b, v1.4b[3]\n"
- "ldr q2, [x20, #0x0]\n"
- ".inst 0x6f80e069 // udot v9.4s, v3.16b, v0.4b[0]\n"
- ".inst 0x6fa0e06c // udot v12.4s, v3.16b, v0.4b[1]\n"
- ".inst 0x6f80e86f // udot v15.4s, v3.16b, v0.4b[2]\n"
- ".inst 0x6fa0e872 // udot v18.4s, v3.16b, v0.4b[3]\n"
- ".inst 0x6f81e075 // udot v21.4s, v3.16b, v1.4b[0]\n"
- ".inst 0x6fa1e078 // udot v24.4s, v3.16b, v1.4b[1]\n"
- ".inst 0x6f81e87b // udot v27.4s, v3.16b, v1.4b[2]\n"
- ".inst 0x6fa1e87e // udot v30.4s, v3.16b, v1.4b[3]\n"
- "ldr q3, [x20, #0x10]\n"
- ".inst 0x6f80e08a // udot v10.4s, v4.16b, v0.4b[0]\n"
- ".inst 0x6fa0e08d // udot v13.4s, v4.16b, v0.4b[1]\n"
- ".inst 0x6f80e890 // udot v16.4s, v4.16b, v0.4b[2]\n"
- ".inst 0x6fa0e893 // udot v19.4s, v4.16b, v0.4b[3]\n"
- "ldr q0, [%x[Apanel], #0x0]\n"
- ".inst 0x6f81e096 // udot v22.4s, v4.16b, v1.4b[0]\n"
- ".inst 0x6fa1e099 // udot v25.4s, v4.16b, v1.4b[1]\n"
- ".inst 0x6f81e89c // udot v28.4s, v4.16b, v1.4b[2]\n"
- ".inst 0x6fa1e89f // udot v31.4s, v4.16b, v1.4b[3]\n"
- "ldr q1, [%x[Apanel], #0x10]\n"
- "ldr q4, [x20, #0x20]\n"
- "bge 3b\n"
- "4:" // main loop skip
- "add %x[Apanel], %x[Apanel], #0x20\n"
- ".inst 0x6f80e048 // udot v8.4s, v2.16b, v0.4b[0]\n"
- ".inst 0x6fa0e04b // udot v11.4s, v2.16b, v0.4b[1]\n"
- "add x20, x20, #0x30\n"
- ".inst 0x6f80e84e // udot v14.4s, v2.16b, v0.4b[2]\n"
- ".inst 0x6fa0e851 // udot v17.4s, v2.16b, v0.4b[3]\n"
- ".inst 0x6f81e054 // udot v20.4s, v2.16b, v1.4b[0]\n"
- ".inst 0x6fa1e057 // udot v23.4s, v2.16b, v1.4b[1]\n"
- ".inst 0x6f81e85a // udot v26.4s, v2.16b, v1.4b[2]\n"
- ".inst 0x6fa1e85d // udot v29.4s, v2.16b, v1.4b[3]\n"
- ".inst 0x6f80e069 // udot v9.4s, v3.16b, v0.4b[0]\n"
- ".inst 0x6fa0e06c // udot v12.4s, v3.16b, v0.4b[1]\n"
- ".inst 0x6f80e86f // udot v15.4s, v3.16b, v0.4b[2]\n"
- ".inst 0x6fa0e872 // udot v18.4s, v3.16b, v0.4b[3]\n"
- ".inst 0x6f81e075 // udot v21.4s, v3.16b, v1.4b[0]\n"
- ".inst 0x6fa1e078 // udot v24.4s, v3.16b, v1.4b[1]\n"
- ".inst 0x6f81e87b // udot v27.4s, v3.16b, v1.4b[2]\n"
- ".inst 0x6fa1e87e // udot v30.4s, v3.16b, v1.4b[3]\n"
- ".inst 0x6f80e08a // udot v10.4s, v4.16b, v0.4b[0]\n"
- ".inst 0x6fa0e08d // udot v13.4s, v4.16b, v0.4b[1]\n"
- ".inst 0x6f80e890 // udot v16.4s, v4.16b, v0.4b[2]\n"
- ".inst 0x6fa0e893 // udot v19.4s, v4.16b, v0.4b[3]\n"
- ".inst 0x6f81e096 // udot v22.4s, v4.16b, v1.4b[0]\n"
- ".inst 0x6fa1e099 // udot v25.4s, v4.16b, v1.4b[1]\n"
- ".inst 0x6f81e89c // udot v28.4s, v4.16b, v1.4b[2]\n"
- ".inst 0x6fa1e89f // udot v31.4s, v4.16b, v1.4b[3]\n"
- "cbz x19, 5f\n"
- "ldr q0, [%x[Apanel], #0x0]\n"
- "ldr q1, [%x[Apanel], #0x10]\n"
- "add %x[Apanel], %x[Apanel], #0x20\n"
- "ldr q5, [x20, #0x0]\n"
- "ldr q6, [x20, #0x10]\n"
- ".inst 0x6f80e0a8 // udot v8.4s, v5.16b, v0.4b[0]\n"
- "ldr q7, [x20, #0x20]\n"
- ".inst 0x6fa0e0ab // udot v11.4s, v5.16b, v0.4b[1]\n"
- ".inst 0x6f80e8ae // udot v14.4s, v5.16b, v0.4b[2]\n"
- "add x20, x20, #0x30\n"
- ".inst 0x6fa0e8b1 // udot v17.4s, v5.16b, v0.4b[3]\n"
- ".inst 0x6f81e0b4 // udot v20.4s, v5.16b, v1.4b[0]\n"
- ".inst 0x6fa1e0b7 // udot v23.4s, v5.16b, v1.4b[1]\n"
- ".inst 0x6f81e8ba // udot v26.4s, v5.16b, v1.4b[2]\n"
- ".inst 0x6fa1e8bd // udot v29.4s, v5.16b, v1.4b[3]\n"
- ".inst 0x6f80e0c9 // udot v9.4s, v6.16b, v0.4b[0]\n"
- ".inst 0x6fa0e0cc // udot v12.4s, v6.16b, v0.4b[1]\n"
- ".inst 0x6f80e8cf // udot v15.4s, v6.16b, v0.4b[2]\n"
- ".inst 0x6fa0e8d2 // udot v18.4s, v6.16b, v0.4b[3]\n"
- ".inst 0x6f81e0d5 // udot v21.4s, v6.16b, v1.4b[0]\n"
- ".inst 0x6fa1e0d8 // udot v24.4s, v6.16b, v1.4b[1]\n"
- ".inst 0x6f81e8db // udot v27.4s, v6.16b, v1.4b[2]\n"
- ".inst 0x6fa1e8de // udot v30.4s, v6.16b, v1.4b[3]\n"
- ".inst 0x6f80e0ea // udot v10.4s, v7.16b, v0.4b[0]\n"
- ".inst 0x6fa0e0ed // udot v13.4s, v7.16b, v0.4b[1]\n"
- ".inst 0x6f80e8f0 // udot v16.4s, v7.16b, v0.4b[2]\n"
- ".inst 0x6fa0e8f3 // udot v19.4s, v7.16b, v0.4b[3]\n"
- ".inst 0x6f81e0f6 // udot v22.4s, v7.16b, v1.4b[0]\n"
- ".inst 0x6fa1e0f9 // udot v25.4s, v7.16b, v1.4b[1]\n"
- ".inst 0x6f81e8fc // udot v28.4s, v7.16b, v1.4b[2]\n"
- ".inst 0x6fa1e8ff // udot v31.4s, v7.16b, v1.4b[3]\n"
- "5:" // multiply loop done
- "subs x22, x22, #0x1\n"
- "str q8, [%x[Cpanel], #0x0]\n"
- "str q9, [%x[Cpanel], #0x10]\n"
- "str q10, [%x[Cpanel], #0x20]\n"
- "str q11, [%x[Cpanel], #0x30]\n"
- "str q12, [%x[Cpanel], #0x40]\n"
- "str q13, [%x[Cpanel], #0x50]\n"
- "str q14, [%x[Cpanel], #0x60]\n"
- "str q15, [%x[Cpanel], #0x70]\n"
- "str q16, [%x[Cpanel], #0x80]\n"
- "str q17, [%x[Cpanel], #0x90]\n"
- "str q18, [%x[Cpanel], #0xa0]\n"
- "str q19, [%x[Cpanel], #0xb0]\n"
- "str q20, [%x[Cpanel], #0xc0]\n"
- "str q21, [%x[Cpanel], #0xd0]\n"
- "str q22, [%x[Cpanel], #0xe0]\n"
- "str q23, [%x[Cpanel], #0xf0]\n"
- "str q24, [%x[Cpanel], #0x100]\n"
- "str q25, [%x[Cpanel], #0x110]\n"
- "str q26, [%x[Cpanel], #0x120]\n"
- "str q27, [%x[Cpanel], #0x130]\n"
- "str q28, [%x[Cpanel], #0x140]\n"
- "str q29, [%x[Cpanel], #0x150]\n"
- "str q30, [%x[Cpanel], #0x160]\n"
- "str q31, [%x[Cpanel], #0x170]\n"
- "add %x[Cpanel], %x[Cpanel], #0x180\n"
- "bgt 2b\n"
- "subs %x[ablocks], %x[ablocks], #0x1\n"
- "bne 1b\n"
- : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
- : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks))
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22"
- );
-}
-
-} // namespace arm_gemm
-#endif // __aarch64__