From 97426a707467a2e025a669fc5b36cc6f6274c23a Mon Sep 17 00:00:00 2001 From: Georgios Pinitas Date: Tue, 27 Jul 2021 19:04:51 +0100 Subject: Remove generated kernels that overlap hand-written ones Generated kernels are not used at the moment. Signed-off-by: Georgios Pinitas Change-Id: I3ba767a53f78e4409c70a850c8051f6ee7453358 Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/6008 Reviewed-by: Michele Di Giorgio Comments-Addressed: Arm Jenkins Tested-by: Arm Jenkins --- Android.bp | 12 - filelist.json | 12 - .../arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16.hpp | 2 + .../arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16.hpp | 2 + .../kernels/a64_interleaved_fp16_mla_8x24.hpp | 110 ------- .../kernels/a64_interleaved_fp16_mla_8x24/a55.cpp | 263 --------------- .../a64_interleaved_fp16_mla_8x24/generic.cpp | 247 -------------- .../kernels/a64_interleaved_fp16_mla_8x24/x1.cpp | 247 -------------- .../kernels/a64_interleaved_fp32_mla_8x12.hpp | 115 ------- .../kernels/a64_interleaved_fp32_mla_8x12/a55.cpp | 360 --------------------- .../a64_interleaved_fp32_mla_8x12/generic.cpp | 320 ------------------ .../kernels/a64_interleaved_fp32_mla_8x12/x1.cpp | 320 ------------------ .../kernels/a64_interleaved_s8s32_dot_8x12.hpp | 110 ------- .../kernels/a64_interleaved_s8s32_dot_8x12/a55.cpp | 273 ---------------- .../a64_interleaved_s8s32_dot_8x12/generic.cpp | 253 --------------- .../kernels/a64_interleaved_s8s32_dot_8x12/x1.cpp | 253 --------------- .../kernels/a64_interleaved_u8u32_dot_8x12.hpp | 110 ------- .../kernels/a64_interleaved_u8u32_dot_8x12/a55.cpp | 273 ---------------- .../a64_interleaved_u8u32_dot_8x12/generic.cpp | 253 --------------- .../kernels/a64_interleaved_u8u32_dot_8x12/x1.cpp | 253 --------------- 20 files changed, 4 insertions(+), 3784 deletions(-) delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp16_mla_8x24.hpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp16_mla_8x24/a55.cpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp16_mla_8x24/generic.cpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp16_mla_8x24/x1.cpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp32_mla_8x12.hpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp32_mla_8x12/a55.cpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp32_mla_8x12/generic.cpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp32_mla_8x12/x1.cpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_dot_8x12.hpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_dot_8x12/a55.cpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_dot_8x12/generic.cpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_dot_8x12/x1.cpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_dot_8x12.hpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_dot_8x12/a55.cpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_dot_8x12/generic.cpp delete mode 100644 src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_dot_8x12/x1.cpp diff --git a/Android.bp b/Android.bp index 0502e841f1..09383551d5 100644 --- a/Android.bp +++ b/Android.bp @@ -877,19 +877,7 @@ cc_library_static { "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12/x1.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp16_mla_8x24/a55.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp16_mla_8x24/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp16_mla_8x24/x1.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp32_mla_8x12/a55.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp32_mla_8x12/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp32_mla_8x12/x1.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_dot_8x12/a55.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_dot_8x12/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_dot_8x12/x1.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_dot_8x12/a55.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_dot_8x12/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_dot_8x12/x1.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a53.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a55.cpp", diff --git a/filelist.json b/filelist.json index 394ec0441a..4c7c81c63e 100644 --- a/filelist.json +++ b/filelist.json @@ -1287,19 +1287,7 @@ "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12/x1.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp16_mla_8x24/a55.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp16_mla_8x24/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp16_mla_8x24/x1.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp32_mla_8x12/a55.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp32_mla_8x12/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp32_mla_8x12/x1.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_dot_8x12/a55.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_dot_8x12/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_dot_8x12/x1.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_dot_8x12/a55.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_dot_8x12/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_dot_8x12/x1.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a53.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a55.cpp", diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16.hpp index d91c69b8a0..48ce67613e 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16.hpp @@ -79,6 +79,8 @@ public: switch (ci->get_cpu_model()) { default: return { 31.65 }; + case CPUModel::A55r1: + return { 9.217 }; case CPUModel::A510: return { 15.87 }; case CPUModel::V1: diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16.hpp index 3a77397632..c5105a6d4a 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16.hpp @@ -92,6 +92,8 @@ public: switch (ci->get_cpu_model()) { default: return { 31.63 }; + case CPUModel::A55r1: + return { 9.217 }; case CPUModel::A510: return { 15.89 }; case CPUModel::V1: diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp16_mla_8x24.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp16_mla_8x24.hpp deleted file mode 100644 index ce63600424..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp16_mla_8x24.hpp +++ /dev/null @@ -1,110 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - */ -#pragma once - -#ifdef __aarch64__ -#include "../std_transforms_fixed.hpp" -#include "../performance_parameters.hpp" - -#define ARGLIST \ - const __fp16 *, const __fp16 *, \ - __fp16 *, int, int, int - -namespace arm_gemm -{ -// Actual kernel implementations -void a64_interleaved_fp16_mla_8x24( ARGLIST ); -void a64_interleaved_fp16_mla_8x24_a55( ARGLIST ); -void a64_interleaved_fp16_mla_8x24_x1( ARGLIST ); - -class cls_a64_interleaved_fp16_mla_8x24 -{ -public: - typedef __fp16 operand_type; - typedef __fp16 result_type; - - typedef void (*kern_type)( ARGLIST ); - - /* Kernel blocking parameters */ - static constexpr unsigned int out_height() - { - return 8; - } - - static unsigned int out_width() - { - return 24; - } - - static unsigned int stripe_width() - { - return 8; - } - - static constexpr unsigned int k_unroll() - { - return 1; - } - - - StdTransformsFixed transforms = {}; - StdTransformsFixed transforms_quantized = {}; - template - static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci) - { - - if (std::is_same::value) { - switch (ci->get_cpu_model()) { - case CPUModel::A55r1: - return { 7.16, 1.14, 0.67 }; - default: - return { 12.67, 3.98, 1.16 }; - } - } - - return { 1.0 }; - } - - // Default to the generic kernel - kern_type kernel=a64_interleaved_fp16_mla_8x24; - cls_a64_interleaved_fp16_mla_8x24(const CPUInfo *ci) - { - switch(ci->get_cpu_model()) { - default: - break; - case CPUModel::A55r1: - kernel=a64_interleaved_fp16_mla_8x24_a55; - break; - case CPUModel::X1: - kernel=a64_interleaved_fp16_mla_8x24_x1; - break; - } - } -}; - -} // namespace arm_gemm - -#undef ARGLIST - -#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp16_mla_8x24/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp16_mla_8x24/a55.cpp deleted file mode 100644 index 49500f2d18..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp16_mla_8x24/a55.cpp +++ /dev/null @@ -1,263 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - */ -#if defined(__aarch64__) && (defined(FP16_KERNELS) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)) - -#include - -namespace arm_gemm { - -void a64_interleaved_fp16_mla_8x24_a55( - const __fp16 *Apanel, const __fp16 *Bpanel, - __fp16 *Cpanel, int ablocks, int bblocks, int K) { - - struct KernelArgs { - size_t bblocks = {}; - size_t K = {}; - const __fp16 *Bpanel = {}; - } ka; - - ka.bblocks = bblocks; - ka.K = (K/1) - 1; - ka.Bpanel = Bpanel; - - __asm__ __volatile__( - - "1:" // Height loop - "ldr x10, [%x[args_ptr], %[offsetof_bblocks]]\n" - "mov x9, %x[Apanel]\n" - "ldr x28, [%x[args_ptr], %[offsetof_Bpanel]]\n" - "2:" // Width loop - "ldr x27, [%x[args_ptr], %[offsetof_K]]\n" - "mov %x[Apanel], x9\n" - "cmp x27, #0x2\n" - "movi v8.16b, #0x0\n" - "movi v9.16b, #0x0\n" - "prfm pldl1keep, [%x[Apanel], #0x0]\n" - "movi v10.16b, #0x0\n" - "prfm pldl1keep, [x28, #0x0]\n" - "movi v11.16b, #0x0\n" - "prfm pldl1keep, [x28, #0x40]\n" - "movi v12.16b, #0x0\n" - "prfm pldl1keep, [x28, #0x80]\n" - "movi v13.16b, #0x0\n" - "ldr q0, [%x[Apanel], #0x0]\n" - "movi v14.16b, #0x0\n" - "ldr q2, [x28, #0x0]\n" - "movi v15.16b, #0x0\n" - "ldr q3, [x28, #0x10]\n" - "movi v16.16b, #0x0\n" - "ldr q4, [x28, #0x20]\n" - "movi v17.16b, #0x0\n" - "movi v18.16b, #0x0\n" - "movi v19.16b, #0x0\n" - "movi v20.16b, #0x0\n" - "movi v21.16b, #0x0\n" - "movi v22.16b, #0x0\n" - "movi v23.16b, #0x0\n" - "movi v24.16b, #0x0\n" - "movi v25.16b, #0x0\n" - "movi v26.16b, #0x0\n" - "movi v27.16b, #0x0\n" - "movi v28.16b, #0x0\n" - "movi v29.16b, #0x0\n" - "movi v30.16b, #0x0\n" - "movi v31.16b, #0x0\n" - "blt 4f\n" - "3:" // main loop head - "ldr d1, [%x[Apanel], #0x10]\n" - "fmla v8.8h, v2.8h, v0.h[0]\n" - "ldr x26, [%x[Apanel], #0x18]\n" - "fmla v11.8h, v2.8h, v0.h[1]\n" - "ldr d5, [x28, #0x30]\n" - "fmla v14.8h, v2.8h, v0.h[2]\n" - "ldr x25, [x28, #0x38]\n" - "fmla v17.8h, v2.8h, v0.h[3]\n" - "ldr d6, [x28, #0x40]\n" - "fmla v20.8h, v2.8h, v0.h[4]\n" - "ldr x24, [x28, #0x48]\n" - "fmla v23.8h, v2.8h, v0.h[5]\n" - "ldr d7, [x28, #0x50]\n" - "fmla v26.8h, v2.8h, v0.h[6]\n" - "ldr x23, [x28, #0x58]\n" - "fmla v29.8h, v2.8h, v0.h[7]\n" - "prfm pldl1keep, [%x[Apanel], #0x80]\n" - "add %x[Apanel], %x[Apanel], #0x20\n" - "fmla v9.8h, v3.8h, v0.h[0]\n" - "prfm pldl1keep, [x28, #0x100]\n" - "fmla v12.8h, v3.8h, v0.h[1]\n" - "prfm pldl1keep, [x28, #0x140]\n" - "fmla v15.8h, v3.8h, v0.h[2]\n" - "add x28, x28, #0x60\n" - "fmla v18.8h, v3.8h, v0.h[3]\n" - "ldr d2, [x28, #0x0]\n" - "fmla v21.8h, v3.8h, v0.h[4]\n" - "ldr x22, [x28, #0x8]\n" - "fmla v24.8h, v3.8h, v0.h[5]\n" - "ldr x21, [x28, #0x18]\n" - "fmla v27.8h, v3.8h, v0.h[6]\n" - "ldr x20, [%x[Apanel], #0x8]\n" - "fmla v30.8h, v3.8h, v0.h[7]\n" - "ldr d3, [x28, #0x10]\n" - "fmla v10.8h, v4.8h, v0.h[0]\n" - "ldr x19, [x28, #0x28]\n" - "fmla v13.8h, v4.8h, v0.h[1]\n" - "mov v1.d[1], x26\n" - "fmla v16.8h, v4.8h, v0.h[2]\n" - "mov v5.d[1], x25\n" - "fmla v19.8h, v4.8h, v0.h[3]\n" - "mov v6.d[1], x24\n" - "fmla v22.8h, v4.8h, v0.h[4]\n" - "mov v7.d[1], x23\n" - "fmla v25.8h, v4.8h, v0.h[5]\n" - "sub x27, x27, #0x2\n" - "fmla v28.8h, v4.8h, v0.h[6]\n" - "cmp x27, #0x2\n" - "fmla v31.8h, v4.8h, v0.h[7]\n" - "ldr d0, [%x[Apanel], #0x0]\n" - "ldr d4, [x28, #0x20]\n" - "mov v2.d[1], x22\n" - "mov v3.d[1], x21\n" - "fmla v8.8h, v5.8h, v1.h[0]\n" - "mov v0.d[1], x20\n" - "fmla v11.8h, v5.8h, v1.h[1]\n" - "mov v4.d[1], x19\n" - "fmla v14.8h, v5.8h, v1.h[2]\n" - "fmla v17.8h, v5.8h, v1.h[3]\n" - "fmla v20.8h, v5.8h, v1.h[4]\n" - "fmla v23.8h, v5.8h, v1.h[5]\n" - "fmla v26.8h, v5.8h, v1.h[6]\n" - "fmla v29.8h, v5.8h, v1.h[7]\n" - "fmla v9.8h, v6.8h, v1.h[0]\n" - "fmla v12.8h, v6.8h, v1.h[1]\n" - "fmla v15.8h, v6.8h, v1.h[2]\n" - "fmla v18.8h, v6.8h, v1.h[3]\n" - "fmla v21.8h, v6.8h, v1.h[4]\n" - "fmla v24.8h, v6.8h, v1.h[5]\n" - "fmla v27.8h, v6.8h, v1.h[6]\n" - "fmla v30.8h, v6.8h, v1.h[7]\n" - "fmla v10.8h, v7.8h, v1.h[0]\n" - "fmla v13.8h, v7.8h, v1.h[1]\n" - "fmla v16.8h, v7.8h, v1.h[2]\n" - "fmla v19.8h, v7.8h, v1.h[3]\n" - "fmla v22.8h, v7.8h, v1.h[4]\n" - "fmla v25.8h, v7.8h, v1.h[5]\n" - "fmla v28.8h, v7.8h, v1.h[6]\n" - "fmla v31.8h, v7.8h, v1.h[7]\n" - "bge 3b\n" - "4:" // main loop skip - "add %x[Apanel], %x[Apanel], #0x10\n" - "fmla v8.8h, v2.8h, v0.h[0]\n" - "add x28, x28, #0x30\n" - "fmla v11.8h, v2.8h, v0.h[1]\n" - "fmla v14.8h, v2.8h, v0.h[2]\n" - "fmla v17.8h, v2.8h, v0.h[3]\n" - "fmla v20.8h, v2.8h, v0.h[4]\n" - "fmla v23.8h, v2.8h, v0.h[5]\n" - "fmla v26.8h, v2.8h, v0.h[6]\n" - "fmla v29.8h, v2.8h, v0.h[7]\n" - "fmla v9.8h, v3.8h, v0.h[0]\n" - "fmla v12.8h, v3.8h, v0.h[1]\n" - "fmla v15.8h, v3.8h, v0.h[2]\n" - "fmla v18.8h, v3.8h, v0.h[3]\n" - "fmla v21.8h, v3.8h, v0.h[4]\n" - "fmla v24.8h, v3.8h, v0.h[5]\n" - "fmla v27.8h, v3.8h, v0.h[6]\n" - "fmla v30.8h, v3.8h, v0.h[7]\n" - "fmla v10.8h, v4.8h, v0.h[0]\n" - "fmla v13.8h, v4.8h, v0.h[1]\n" - "fmla v16.8h, v4.8h, v0.h[2]\n" - "fmla v19.8h, v4.8h, v0.h[3]\n" - "fmla v22.8h, v4.8h, v0.h[4]\n" - "fmla v25.8h, v4.8h, v0.h[5]\n" - "fmla v28.8h, v4.8h, v0.h[6]\n" - "fmla v31.8h, v4.8h, v0.h[7]\n" - "cbz x27, 5f\n" - "ldr q0, [%x[Apanel], #0x0]\n" - "add %x[Apanel], %x[Apanel], #0x10\n" - "ldr q5, [x28, #0x0]\n" - "fmla v8.8h, v5.8h, v0.h[0]\n" - "ldr q6, [x28, #0x10]\n" - "fmla v11.8h, v5.8h, v0.h[1]\n" - "ldr q7, [x28, #0x20]\n" - "fmla v14.8h, v5.8h, v0.h[2]\n" - "fmla v17.8h, v5.8h, v0.h[3]\n" - "add x28, x28, #0x30\n" - "fmla v20.8h, v5.8h, v0.h[4]\n" - "fmla v23.8h, v5.8h, v0.h[5]\n" - "fmla v26.8h, v5.8h, v0.h[6]\n" - "fmla v29.8h, v5.8h, v0.h[7]\n" - "fmla v9.8h, v6.8h, v0.h[0]\n" - "fmla v12.8h, v6.8h, v0.h[1]\n" - "fmla v15.8h, v6.8h, v0.h[2]\n" - "fmla v18.8h, v6.8h, v0.h[3]\n" - "fmla v21.8h, v6.8h, v0.h[4]\n" - "fmla v24.8h, v6.8h, v0.h[5]\n" - "fmla v27.8h, v6.8h, v0.h[6]\n" - "fmla v30.8h, v6.8h, v0.h[7]\n" - "fmla v10.8h, v7.8h, v0.h[0]\n" - "fmla v13.8h, v7.8h, v0.h[1]\n" - "fmla v16.8h, v7.8h, v0.h[2]\n" - "fmla v19.8h, v7.8h, v0.h[3]\n" - "fmla v22.8h, v7.8h, v0.h[4]\n" - "fmla v25.8h, v7.8h, v0.h[5]\n" - "fmla v28.8h, v7.8h, v0.h[6]\n" - "fmla v31.8h, v7.8h, v0.h[7]\n" - "5:" // multiply loop done - "subs x10, x10, #0x1\n" - "str q8, [%x[Cpanel], #0x0]\n" - "str q9, [%x[Cpanel], #0x10]\n" - "str q10, [%x[Cpanel], #0x20]\n" - "str q11, [%x[Cpanel], #0x30]\n" - "str q12, [%x[Cpanel], #0x40]\n" - "str q13, [%x[Cpanel], #0x50]\n" - "str q14, [%x[Cpanel], #0x60]\n" - "str q15, [%x[Cpanel], #0x70]\n" - "str q16, [%x[Cpanel], #0x80]\n" - "str q17, [%x[Cpanel], #0x90]\n" - "str q18, [%x[Cpanel], #0xa0]\n" - "str q19, [%x[Cpanel], #0xb0]\n" - "str q20, [%x[Cpanel], #0xc0]\n" - "str q21, [%x[Cpanel], #0xd0]\n" - "str q22, [%x[Cpanel], #0xe0]\n" - "str q23, [%x[Cpanel], #0xf0]\n" - "str q24, [%x[Cpanel], #0x100]\n" - "str q25, [%x[Cpanel], #0x110]\n" - "str q26, [%x[Cpanel], #0x120]\n" - "str q27, [%x[Cpanel], #0x130]\n" - "str q28, [%x[Cpanel], #0x140]\n" - "str q29, [%x[Cpanel], #0x150]\n" - "str q30, [%x[Cpanel], #0x160]\n" - "str q31, [%x[Cpanel], #0x170]\n" - "add %x[Cpanel], %x[Cpanel], #0x180\n" - "bgt 2b\n" - "subs %x[ablocks], %x[ablocks], #0x1\n" - "bne 1b\n" - : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks) - : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks)) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" - ); -} - -} // namespace arm_gemm -#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp16_mla_8x24/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp16_mla_8x24/generic.cpp deleted file mode 100644 index a9da6956ed..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp16_mla_8x24/generic.cpp +++ /dev/null @@ -1,247 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - */ -#if defined(__aarch64__) && (defined(FP16_KERNELS) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)) - -#include - -namespace arm_gemm { - -void a64_interleaved_fp16_mla_8x24( - const __fp16 *Apanel, const __fp16 *Bpanel, - __fp16 *Cpanel, int ablocks, int bblocks, int K) { - - struct KernelArgs { - size_t bblocks = {}; - size_t K = {}; - const __fp16 *Bpanel = {}; - } ka; - - ka.bblocks = bblocks; - ka.K = (K/1) - 1; - ka.Bpanel = Bpanel; - - __asm__ __volatile__( - - "1:" // Height loop - "ldr x22, [%x[args_ptr], %[offsetof_bblocks]]\n" - "mov x21, %x[Apanel]\n" - "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n" - "2:" // Width loop - "ldr x19, [%x[args_ptr], %[offsetof_K]]\n" - "mov %x[Apanel], x21\n" - "cmp x19, #0x2\n" - "movi v8.16b, #0x0\n" - "movi v9.16b, #0x0\n" - "prfm pldl1keep, [%x[Apanel], #0x0]\n" - "movi v10.16b, #0x0\n" - "movi v11.16b, #0x0\n" - "prfm pldl1keep, [x20, #0x0]\n" - "movi v12.16b, #0x0\n" - "movi v13.16b, #0x0\n" - "prfm pldl1keep, [x20, #0x40]\n" - "movi v14.16b, #0x0\n" - "movi v15.16b, #0x0\n" - "prfm pldl1keep, [x20, #0x80]\n" - "movi v16.16b, #0x0\n" - "movi v17.16b, #0x0\n" - "ldr q0, [%x[Apanel], #0x0]\n" - "movi v18.16b, #0x0\n" - "movi v19.16b, #0x0\n" - "ldr q2, [x20, #0x0]\n" - "movi v20.16b, #0x0\n" - "movi v21.16b, #0x0\n" - "ldr q3, [x20, #0x10]\n" - "movi v22.16b, #0x0\n" - "movi v23.16b, #0x0\n" - "ldr q4, [x20, #0x20]\n" - "movi v24.16b, #0x0\n" - "movi v25.16b, #0x0\n" - "movi v26.16b, #0x0\n" - "movi v27.16b, #0x0\n" - "movi v28.16b, #0x0\n" - "movi v29.16b, #0x0\n" - "movi v30.16b, #0x0\n" - "movi v31.16b, #0x0\n" - "blt 4f\n" - "3:" // main loop head - "fmla v8.8h, v2.8h, v0.h[0]\n" - "fmla v11.8h, v2.8h, v0.h[1]\n" - "ldr q1, [%x[Apanel], #0x10]\n" - "fmla v14.8h, v2.8h, v0.h[2]\n" - "fmla v17.8h, v2.8h, v0.h[3]\n" - "ldr q5, [x20, #0x30]\n" - "fmla v20.8h, v2.8h, v0.h[4]\n" - "fmla v23.8h, v2.8h, v0.h[5]\n" - "ldr q6, [x20, #0x40]\n" - "fmla v26.8h, v2.8h, v0.h[6]\n" - "fmla v29.8h, v2.8h, v0.h[7]\n" - "ldr q7, [x20, #0x50]\n" - "fmla v9.8h, v3.8h, v0.h[0]\n" - "fmla v12.8h, v3.8h, v0.h[1]\n" - "sub x19, x19, #0x2\n" - "fmla v15.8h, v3.8h, v0.h[2]\n" - "fmla v18.8h, v3.8h, v0.h[3]\n" - "cmp x19, #0x2\n" - "fmla v21.8h, v3.8h, v0.h[4]\n" - "fmla v24.8h, v3.8h, v0.h[5]\n" - "prfm pldl1keep, [%x[Apanel], #0x80]\n" - "fmla v27.8h, v3.8h, v0.h[6]\n" - "fmla v30.8h, v3.8h, v0.h[7]\n" - "add %x[Apanel], %x[Apanel], #0x20\n" - "fmla v10.8h, v4.8h, v0.h[0]\n" - "fmla v13.8h, v4.8h, v0.h[1]\n" - "prfm pldl1keep, [x20, #0x100]\n" - "fmla v16.8h, v4.8h, v0.h[2]\n" - "fmla v19.8h, v4.8h, v0.h[3]\n" - "prfm pldl1keep, [x20, #0x140]\n" - "fmla v22.8h, v4.8h, v0.h[4]\n" - "fmla v25.8h, v4.8h, v0.h[5]\n" - "add x20, x20, #0x60\n" - "fmla v28.8h, v4.8h, v0.h[6]\n" - "fmla v31.8h, v4.8h, v0.h[7]\n" - "ldr q0, [%x[Apanel], #0x0]\n" - "fmla v8.8h, v5.8h, v1.h[0]\n" - "fmla v11.8h, v5.8h, v1.h[1]\n" - "ldr q2, [x20, #0x0]\n" - "fmla v14.8h, v5.8h, v1.h[2]\n" - "fmla v17.8h, v5.8h, v1.h[3]\n" - "ldr q3, [x20, #0x10]\n" - "fmla v20.8h, v5.8h, v1.h[4]\n" - "fmla v23.8h, v5.8h, v1.h[5]\n" - "ldr q4, [x20, #0x20]\n" - "fmla v26.8h, v5.8h, v1.h[6]\n" - "fmla v29.8h, v5.8h, v1.h[7]\n" - "fmla v9.8h, v6.8h, v1.h[0]\n" - "fmla v12.8h, v6.8h, v1.h[1]\n" - "fmla v15.8h, v6.8h, v1.h[2]\n" - "fmla v18.8h, v6.8h, v1.h[3]\n" - "fmla v21.8h, v6.8h, v1.h[4]\n" - "fmla v24.8h, v6.8h, v1.h[5]\n" - "fmla v27.8h, v6.8h, v1.h[6]\n" - "fmla v30.8h, v6.8h, v1.h[7]\n" - "fmla v10.8h, v7.8h, v1.h[0]\n" - "fmla v13.8h, v7.8h, v1.h[1]\n" - "fmla v16.8h, v7.8h, v1.h[2]\n" - "fmla v19.8h, v7.8h, v1.h[3]\n" - "fmla v22.8h, v7.8h, v1.h[4]\n" - "fmla v25.8h, v7.8h, v1.h[5]\n" - "fmla v28.8h, v7.8h, v1.h[6]\n" - "fmla v31.8h, v7.8h, v1.h[7]\n" - "bge 3b\n" - "4:" // main loop skip - "fmla v8.8h, v2.8h, v0.h[0]\n" - "fmla v11.8h, v2.8h, v0.h[1]\n" - "add %x[Apanel], %x[Apanel], #0x10\n" - "fmla v14.8h, v2.8h, v0.h[2]\n" - "fmla v17.8h, v2.8h, v0.h[3]\n" - "add x20, x20, #0x30\n" - "fmla v20.8h, v2.8h, v0.h[4]\n" - "fmla v23.8h, v2.8h, v0.h[5]\n" - "fmla v26.8h, v2.8h, v0.h[6]\n" - "fmla v29.8h, v2.8h, v0.h[7]\n" - "fmla v9.8h, v3.8h, v0.h[0]\n" - "fmla v12.8h, v3.8h, v0.h[1]\n" - "fmla v15.8h, v3.8h, v0.h[2]\n" - "fmla v18.8h, v3.8h, v0.h[3]\n" - "fmla v21.8h, v3.8h, v0.h[4]\n" - "fmla v24.8h, v3.8h, v0.h[5]\n" - "fmla v27.8h, v3.8h, v0.h[6]\n" - "fmla v30.8h, v3.8h, v0.h[7]\n" - "fmla v10.8h, v4.8h, v0.h[0]\n" - "fmla v13.8h, v4.8h, v0.h[1]\n" - "fmla v16.8h, v4.8h, v0.h[2]\n" - "fmla v19.8h, v4.8h, v0.h[3]\n" - "fmla v22.8h, v4.8h, v0.h[4]\n" - "fmla v25.8h, v4.8h, v0.h[5]\n" - "fmla v28.8h, v4.8h, v0.h[6]\n" - "fmla v31.8h, v4.8h, v0.h[7]\n" - "cbz x19, 5f\n" - "ldr q0, [%x[Apanel], #0x0]\n" - "ldr q5, [x20, #0x0]\n" - "fmla v8.8h, v5.8h, v0.h[0]\n" - "ldr q6, [x20, #0x10]\n" - "ldr q7, [x20, #0x20]\n" - "fmla v11.8h, v5.8h, v0.h[1]\n" - "fmla v14.8h, v5.8h, v0.h[2]\n" - "fmla v17.8h, v5.8h, v0.h[3]\n" - "add %x[Apanel], %x[Apanel], #0x10\n" - "fmla v20.8h, v5.8h, v0.h[4]\n" - "fmla v23.8h, v5.8h, v0.h[5]\n" - "add x20, x20, #0x30\n" - "fmla v26.8h, v5.8h, v0.h[6]\n" - "fmla v29.8h, v5.8h, v0.h[7]\n" - "fmla v9.8h, v6.8h, v0.h[0]\n" - "fmla v12.8h, v6.8h, v0.h[1]\n" - "fmla v15.8h, v6.8h, v0.h[2]\n" - "fmla v18.8h, v6.8h, v0.h[3]\n" - "fmla v21.8h, v6.8h, v0.h[4]\n" - "fmla v24.8h, v6.8h, v0.h[5]\n" - "fmla v27.8h, v6.8h, v0.h[6]\n" - "fmla v30.8h, v6.8h, v0.h[7]\n" - "fmla v10.8h, v7.8h, v0.h[0]\n" - "fmla v13.8h, v7.8h, v0.h[1]\n" - "fmla v16.8h, v7.8h, v0.h[2]\n" - "fmla v19.8h, v7.8h, v0.h[3]\n" - "fmla v22.8h, v7.8h, v0.h[4]\n" - "fmla v25.8h, v7.8h, v0.h[5]\n" - "fmla v28.8h, v7.8h, v0.h[6]\n" - "fmla v31.8h, v7.8h, v0.h[7]\n" - "5:" // multiply loop done - "subs x22, x22, #0x1\n" - "str q8, [%x[Cpanel], #0x0]\n" - "str q9, [%x[Cpanel], #0x10]\n" - "str q10, [%x[Cpanel], #0x20]\n" - "str q11, [%x[Cpanel], #0x30]\n" - "str q12, [%x[Cpanel], #0x40]\n" - "str q13, [%x[Cpanel], #0x50]\n" - "str q14, [%x[Cpanel], #0x60]\n" - "str q15, [%x[Cpanel], #0x70]\n" - "str q16, [%x[Cpanel], #0x80]\n" - "str q17, [%x[Cpanel], #0x90]\n" - "str q18, [%x[Cpanel], #0xa0]\n" - "str q19, [%x[Cpanel], #0xb0]\n" - "str q20, [%x[Cpanel], #0xc0]\n" - "str q21, [%x[Cpanel], #0xd0]\n" - "str q22, [%x[Cpanel], #0xe0]\n" - "str q23, [%x[Cpanel], #0xf0]\n" - "str q24, [%x[Cpanel], #0x100]\n" - "str q25, [%x[Cpanel], #0x110]\n" - "str q26, [%x[Cpanel], #0x120]\n" - "str q27, [%x[Cpanel], #0x130]\n" - "str q28, [%x[Cpanel], #0x140]\n" - "str q29, [%x[Cpanel], #0x150]\n" - "str q30, [%x[Cpanel], #0x160]\n" - "str q31, [%x[Cpanel], #0x170]\n" - "add %x[Cpanel], %x[Cpanel], #0x180\n" - "bgt 2b\n" - "subs %x[ablocks], %x[ablocks], #0x1\n" - "bne 1b\n" - : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks) - : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks)) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22" - ); -} - -} // namespace arm_gemm -#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp16_mla_8x24/x1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp16_mla_8x24/x1.cpp deleted file mode 100644 index efaedeb33f..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp16_mla_8x24/x1.cpp +++ /dev/null @@ -1,247 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - */ -#if defined(__aarch64__) && (defined(FP16_KERNELS) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)) - -#include - -namespace arm_gemm { - -void a64_interleaved_fp16_mla_8x24_x1( - const __fp16 *Apanel, const __fp16 *Bpanel, - __fp16 *Cpanel, int ablocks, int bblocks, int K) { - - struct KernelArgs { - size_t bblocks = {}; - size_t K = {}; - const __fp16 *Bpanel = {}; - } ka; - - ka.bblocks = bblocks; - ka.K = (K/1) - 1; - ka.Bpanel = Bpanel; - - __asm__ __volatile__( - - "1:" // Height loop - "ldr x22, [%x[args_ptr], %[offsetof_bblocks]]\n" - "mov x21, %x[Apanel]\n" - "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n" - "2:" // Width loop - "ldr x19, [%x[args_ptr], %[offsetof_K]]\n" - "mov %x[Apanel], x21\n" - "cmp x19, #0x2\n" - "movi v8.16b, #0x0\n" - "movi v9.16b, #0x0\n" - "prfm pldl1keep, [%x[Apanel], #0x0]\n" - "movi v10.16b, #0x0\n" - "movi v11.16b, #0x0\n" - "prfm pldl1keep, [x20, #0x0]\n" - "movi v12.16b, #0x0\n" - "movi v13.16b, #0x0\n" - "prfm pldl1keep, [x20, #0x40]\n" - "movi v14.16b, #0x0\n" - "movi v15.16b, #0x0\n" - "prfm pldl1keep, [x20, #0x80]\n" - "movi v16.16b, #0x0\n" - "movi v17.16b, #0x0\n" - "ldr q0, [%x[Apanel], #0x0]\n" - "movi v18.16b, #0x0\n" - "movi v19.16b, #0x0\n" - "ldr q1, [x20, #0x0]\n" - "movi v20.16b, #0x0\n" - "movi v21.16b, #0x0\n" - "ldr q2, [x20, #0x10]\n" - "movi v22.16b, #0x0\n" - "movi v23.16b, #0x0\n" - "ldr q3, [x20, #0x20]\n" - "movi v24.16b, #0x0\n" - "movi v25.16b, #0x0\n" - "movi v26.16b, #0x0\n" - "movi v27.16b, #0x0\n" - "movi v28.16b, #0x0\n" - "movi v29.16b, #0x0\n" - "movi v30.16b, #0x0\n" - "movi v31.16b, #0x0\n" - "blt 4f\n" - "3:" // main loop head - "fmla v8.8h, v1.8h, v0.h[0]\n" - "fmla v11.8h, v1.8h, v0.h[1]\n" - "sub x19, x19, #0x2\n" - "fmla v14.8h, v1.8h, v0.h[2]\n" - "fmla v17.8h, v1.8h, v0.h[3]\n" - "cmp x19, #0x2\n" - "fmla v20.8h, v1.8h, v0.h[4]\n" - "fmla v23.8h, v1.8h, v0.h[5]\n" - "prfm pldl1keep, [%x[Apanel], #0x80]\n" - "fmla v26.8h, v1.8h, v0.h[6]\n" - "fmla v29.8h, v1.8h, v0.h[7]\n" - "ldr q1, [x20, #0x30]\n" - "fmla v9.8h, v2.8h, v0.h[0]\n" - "fmla v12.8h, v2.8h, v0.h[1]\n" - "prfm pldl1keep, [x20, #0x100]\n" - "fmla v15.8h, v2.8h, v0.h[2]\n" - "fmla v18.8h, v2.8h, v0.h[3]\n" - "prfm pldl1keep, [x20, #0x140]\n" - "fmla v21.8h, v2.8h, v0.h[4]\n" - "fmla v24.8h, v2.8h, v0.h[5]\n" - "fmla v27.8h, v2.8h, v0.h[6]\n" - "fmla v30.8h, v2.8h, v0.h[7]\n" - "ldr q2, [x20, #0x40]\n" - "fmla v10.8h, v3.8h, v0.h[0]\n" - "fmla v13.8h, v3.8h, v0.h[1]\n" - "fmla v16.8h, v3.8h, v0.h[2]\n" - "fmla v19.8h, v3.8h, v0.h[3]\n" - "fmla v22.8h, v3.8h, v0.h[4]\n" - "fmla v25.8h, v3.8h, v0.h[5]\n" - "fmla v28.8h, v3.8h, v0.h[6]\n" - "fmla v31.8h, v3.8h, v0.h[7]\n" - "ldr q0, [%x[Apanel], #0x10]\n" - "ldr q3, [x20, #0x50]\n" - "add %x[Apanel], %x[Apanel], #0x20\n" - "add x20, x20, #0x60\n" - "fmla v8.8h, v1.8h, v0.h[0]\n" - "fmla v11.8h, v1.8h, v0.h[1]\n" - "fmla v14.8h, v1.8h, v0.h[2]\n" - "fmla v17.8h, v1.8h, v0.h[3]\n" - "fmla v20.8h, v1.8h, v0.h[4]\n" - "fmla v23.8h, v1.8h, v0.h[5]\n" - "fmla v26.8h, v1.8h, v0.h[6]\n" - "fmla v29.8h, v1.8h, v0.h[7]\n" - "ldr q1, [x20, #0x0]\n" - "fmla v9.8h, v2.8h, v0.h[0]\n" - "fmla v12.8h, v2.8h, v0.h[1]\n" - "fmla v15.8h, v2.8h, v0.h[2]\n" - "fmla v18.8h, v2.8h, v0.h[3]\n" - "fmla v21.8h, v2.8h, v0.h[4]\n" - "fmla v24.8h, v2.8h, v0.h[5]\n" - "fmla v27.8h, v2.8h, v0.h[6]\n" - "fmla v30.8h, v2.8h, v0.h[7]\n" - "ldr q2, [x20, #0x10]\n" - "fmla v10.8h, v3.8h, v0.h[0]\n" - "fmla v13.8h, v3.8h, v0.h[1]\n" - "fmla v16.8h, v3.8h, v0.h[2]\n" - "fmla v19.8h, v3.8h, v0.h[3]\n" - "fmla v22.8h, v3.8h, v0.h[4]\n" - "fmla v25.8h, v3.8h, v0.h[5]\n" - "fmla v28.8h, v3.8h, v0.h[6]\n" - "fmla v31.8h, v3.8h, v0.h[7]\n" - "ldr q0, [%x[Apanel], #0x0]\n" - "ldr q3, [x20, #0x20]\n" - "bge 3b\n" - "4:" // main loop skip - "fmla v8.8h, v1.8h, v0.h[0]\n" - "fmla v11.8h, v1.8h, v0.h[1]\n" - "add %x[Apanel], %x[Apanel], #0x10\n" - "fmla v14.8h, v1.8h, v0.h[2]\n" - "fmla v17.8h, v1.8h, v0.h[3]\n" - "add x20, x20, #0x30\n" - "fmla v20.8h, v1.8h, v0.h[4]\n" - "fmla v23.8h, v1.8h, v0.h[5]\n" - "fmla v26.8h, v1.8h, v0.h[6]\n" - "fmla v29.8h, v1.8h, v0.h[7]\n" - "fmla v9.8h, v2.8h, v0.h[0]\n" - "fmla v12.8h, v2.8h, v0.h[1]\n" - "fmla v15.8h, v2.8h, v0.h[2]\n" - "fmla v18.8h, v2.8h, v0.h[3]\n" - "fmla v21.8h, v2.8h, v0.h[4]\n" - "fmla v24.8h, v2.8h, v0.h[5]\n" - "fmla v27.8h, v2.8h, v0.h[6]\n" - "fmla v30.8h, v2.8h, v0.h[7]\n" - "fmla v10.8h, v3.8h, v0.h[0]\n" - "fmla v13.8h, v3.8h, v0.h[1]\n" - "fmla v16.8h, v3.8h, v0.h[2]\n" - "fmla v19.8h, v3.8h, v0.h[3]\n" - "fmla v22.8h, v3.8h, v0.h[4]\n" - "fmla v25.8h, v3.8h, v0.h[5]\n" - "fmla v28.8h, v3.8h, v0.h[6]\n" - "fmla v31.8h, v3.8h, v0.h[7]\n" - "cbz x19, 5f\n" - "ldr q0, [%x[Apanel], #0x0]\n" - "ldr q4, [x20, #0x0]\n" - "fmla v8.8h, v4.8h, v0.h[0]\n" - "ldr q5, [x20, #0x10]\n" - "ldr q6, [x20, #0x20]\n" - "fmla v11.8h, v4.8h, v0.h[1]\n" - "fmla v14.8h, v4.8h, v0.h[2]\n" - "fmla v17.8h, v4.8h, v0.h[3]\n" - "add %x[Apanel], %x[Apanel], #0x10\n" - "fmla v20.8h, v4.8h, v0.h[4]\n" - "fmla v23.8h, v4.8h, v0.h[5]\n" - "add x20, x20, #0x30\n" - "fmla v26.8h, v4.8h, v0.h[6]\n" - "fmla v29.8h, v4.8h, v0.h[7]\n" - "fmla v9.8h, v5.8h, v0.h[0]\n" - "fmla v12.8h, v5.8h, v0.h[1]\n" - "fmla v15.8h, v5.8h, v0.h[2]\n" - "fmla v18.8h, v5.8h, v0.h[3]\n" - "fmla v21.8h, v5.8h, v0.h[4]\n" - "fmla v24.8h, v5.8h, v0.h[5]\n" - "fmla v27.8h, v5.8h, v0.h[6]\n" - "fmla v30.8h, v5.8h, v0.h[7]\n" - "fmla v10.8h, v6.8h, v0.h[0]\n" - "fmla v13.8h, v6.8h, v0.h[1]\n" - "fmla v16.8h, v6.8h, v0.h[2]\n" - "fmla v19.8h, v6.8h, v0.h[3]\n" - "fmla v22.8h, v6.8h, v0.h[4]\n" - "fmla v25.8h, v6.8h, v0.h[5]\n" - "fmla v28.8h, v6.8h, v0.h[6]\n" - "fmla v31.8h, v6.8h, v0.h[7]\n" - "5:" // multiply loop done - "subs x22, x22, #0x1\n" - "str q8, [%x[Cpanel], #0x0]\n" - "str q9, [%x[Cpanel], #0x10]\n" - "str q10, [%x[Cpanel], #0x20]\n" - "str q11, [%x[Cpanel], #0x30]\n" - "str q12, [%x[Cpanel], #0x40]\n" - "str q13, [%x[Cpanel], #0x50]\n" - "str q14, [%x[Cpanel], #0x60]\n" - "str q15, [%x[Cpanel], #0x70]\n" - "str q16, [%x[Cpanel], #0x80]\n" - "str q17, [%x[Cpanel], #0x90]\n" - "str q18, [%x[Cpanel], #0xa0]\n" - "str q19, [%x[Cpanel], #0xb0]\n" - "str q20, [%x[Cpanel], #0xc0]\n" - "str q21, [%x[Cpanel], #0xd0]\n" - "str q22, [%x[Cpanel], #0xe0]\n" - "str q23, [%x[Cpanel], #0xf0]\n" - "str q24, [%x[Cpanel], #0x100]\n" - "str q25, [%x[Cpanel], #0x110]\n" - "str q26, [%x[Cpanel], #0x120]\n" - "str q27, [%x[Cpanel], #0x130]\n" - "str q28, [%x[Cpanel], #0x140]\n" - "str q29, [%x[Cpanel], #0x150]\n" - "str q30, [%x[Cpanel], #0x160]\n" - "str q31, [%x[Cpanel], #0x170]\n" - "add %x[Cpanel], %x[Cpanel], #0x180\n" - "bgt 2b\n" - "subs %x[ablocks], %x[ablocks], #0x1\n" - "bne 1b\n" - : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks) - : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks)) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22" - ); -} - -} // namespace arm_gemm -#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp32_mla_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp32_mla_8x12.hpp deleted file mode 100644 index 465a5b4e0f..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp32_mla_8x12.hpp +++ /dev/null @@ -1,115 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - */ -#pragma once - -#ifdef __aarch64__ -#include "../std_transforms_fixed.hpp" -#include "../performance_parameters.hpp" - -#define ARGLIST \ - const float *, const float *, \ - float *, int, int, int - -namespace arm_gemm -{ -// Actual kernel implementations -void a64_interleaved_fp32_mla_8x12( ARGLIST ); -void a64_interleaved_fp32_mla_8x12_a55( ARGLIST ); -void a64_interleaved_fp32_mla_8x12_x1( ARGLIST ); - -class cls_a64_interleaved_fp32_mla_8x12 -{ -public: - typedef float operand_type; - typedef float result_type; - - typedef void (*kern_type)( ARGLIST ); - - /* Kernel blocking parameters */ - static constexpr unsigned int out_height() - { - return 8; - } - - static unsigned int out_width() - { - return 12; - } - - static unsigned int stripe_width() - { - return 4; - } - - static constexpr unsigned int k_unroll() - { - return 1; - } - - - StdTransformsFixed transforms = {}; - StdTransformsFixed transforms_quantized = {}; - template - static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci) - { - - if (std::is_same::value) { - switch (ci->get_cpu_model()) { - case CPUModel::A55r1: - return { 3.954, 1.252, 1.141 }; - default: - return { 7.2307, 3.876, 2.932 }; - case CPUModel::A73: - return { 2.885, 1.429, 1.163 }; - case CPUModel::A53: - return { 2.7, 0.9, 0.8 }; - } - } - - return { 1.0 }; - } - - // Default to the generic kernel - kern_type kernel=a64_interleaved_fp32_mla_8x12; - cls_a64_interleaved_fp32_mla_8x12(const CPUInfo *ci) - { - switch(ci->get_cpu_model()) { - default: - break; - case CPUModel::A55r1: - case CPUModel::A53: - kernel=a64_interleaved_fp32_mla_8x12_a55; - break; - case CPUModel::X1: - kernel=a64_interleaved_fp32_mla_8x12_x1; - break; - } - } -}; - -} // namespace arm_gemm - -#undef ARGLIST - -#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp32_mla_8x12/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp32_mla_8x12/a55.cpp deleted file mode 100644 index 46d9ff73b9..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp32_mla_8x12/a55.cpp +++ /dev/null @@ -1,360 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - */ -#ifdef __aarch64__ - -#include - -namespace arm_gemm { - -void a64_interleaved_fp32_mla_8x12_a55( - const float *Apanel, const float *Bpanel, - float *Cpanel, int ablocks, int bblocks, int K) { - - struct KernelArgs { - size_t bblocks = {}; - size_t K = {}; - const float *Bpanel = {}; - } ka; - - ka.bblocks = bblocks; - ka.K = (K/1) - 1; - ka.Bpanel = Bpanel; - - __asm__ __volatile__( - - "1:" // Height loop - "ldr x28, [%x[args_ptr], %[offsetof_bblocks]]\n" - "mov x27, %x[Apanel]\n" - "ldr x26, [%x[args_ptr], %[offsetof_Bpanel]]\n" - "2:" // Width loop - "ldr x25, [%x[args_ptr], %[offsetof_K]]\n" - "mov %x[Apanel], x27\n" - "cmp x25, #0x4\n" - "movi v8.16b, #0x0\n" - "movi v9.16b, #0x0\n" - "prfm pldl1keep, [%x[Apanel], #0x0]\n" - "movi v10.16b, #0x0\n" - "prfm pldl1keep, [x26, #0x0]\n" - "movi v11.16b, #0x0\n" - "prfm pldl1keep, [x26, #0x40]\n" - "movi v12.16b, #0x0\n" - "prfm pldl1keep, [%x[Apanel], #0x40]\n" - "movi v13.16b, #0x0\n" - "prfm pldl1keep, [x26, #0x80]\n" - "movi v14.16b, #0x0\n" - "prfm pldl1keep, [%x[Apanel], #0x80]\n" - "movi v15.16b, #0x0\n" - "prfm pldl1keep, [x26, #0xc0]\n" - "movi v16.16b, #0x0\n" - "prfm pldl1keep, [x26, #0x100]\n" - "movi v17.16b, #0x0\n" - "prfm pldl1keep, [%x[Apanel], #0xc0]\n" - "movi v18.16b, #0x0\n" - "prfm pldl1keep, [x26, #0x140]\n" - "movi v19.16b, #0x0\n" - "ldr q0, [%x[Apanel], #0x0]\n" - "movi v20.16b, #0x0\n" - "ldr q1, [%x[Apanel], #0x10]\n" - "movi v21.16b, #0x0\n" - "ldr q4, [x26, #0x0]\n" - "movi v22.16b, #0x0\n" - "ldr q5, [x26, #0x10]\n" - "movi v23.16b, #0x0\n" - "ldr q6, [x26, #0x20]\n" - "movi v24.16b, #0x0\n" - "movi v25.16b, #0x0\n" - "movi v26.16b, #0x0\n" - "movi v27.16b, #0x0\n" - "movi v28.16b, #0x0\n" - "movi v29.16b, #0x0\n" - "movi v30.16b, #0x0\n" - "movi v31.16b, #0x0\n" - "blt 4f\n" - "3:" // main loop head - "ldr d2, [%x[Apanel], #0x20]\n" - "fmla v8.4s, v4.4s, v0.s[0]\n" - "ldr x21, [%x[Apanel], #0x28]\n" - "fmla v11.4s, v4.4s, v0.s[1]\n" - "ldr d3, [%x[Apanel], #0x30]\n" - "fmla v14.4s, v4.4s, v0.s[2]\n" - "ldr x20, [%x[Apanel], #0x38]\n" - "fmla v17.4s, v4.4s, v0.s[3]\n" - "ldr d7, [x26, #0x30]\n" - "fmla v20.4s, v4.4s, v1.s[0]\n" - "ldr x24, [x26, #0x38]\n" - "fmla v23.4s, v4.4s, v1.s[1]\n" - "fmla v26.4s, v4.4s, v1.s[2]\n" - "ldr x23, [x26, #0x48]\n" - "fmla v29.4s, v4.4s, v1.s[3]\n" - "ldr d4, [x26, #0x40]\n" - "fmla v9.4s, v5.4s, v0.s[0]\n" - "mov v2.d[1], x21\n" - "fmla v12.4s, v5.4s, v0.s[1]\n" - "mov v3.d[1], x20\n" - "fmla v15.4s, v5.4s, v0.s[2]\n" - "mov v7.d[1], x24\n" - "fmla v18.4s, v5.4s, v0.s[3]\n" - "mov v4.d[1], x23\n" - "fmla v21.4s, v5.4s, v1.s[0]\n" - "ldr x22, [x26, #0x58]\n" - "fmla v24.4s, v5.4s, v1.s[1]\n" - "ldr x21, [%x[Apanel], #0x48]\n" - "fmla v27.4s, v5.4s, v1.s[2]\n" - "ldr x20, [%x[Apanel], #0x58]\n" - "fmla v30.4s, v5.4s, v1.s[3]\n" - "ldr d5, [x26, #0x50]\n" - "fmla v10.4s, v6.4s, v0.s[0]\n" - "ldr x19, [x26, #0x68]\n" - "fmla v13.4s, v6.4s, v0.s[1]\n" - "ldr x24, [x26, #0x78]\n" - "fmla v16.4s, v6.4s, v0.s[2]\n" - "mov v5.d[1], x22\n" - "fmla v19.4s, v6.4s, v0.s[3]\n" - "ldr d0, [%x[Apanel], #0x40]\n" - "fmla v22.4s, v6.4s, v1.s[0]\n" - "mov v0.d[1], x21\n" - "fmla v25.4s, v6.4s, v1.s[1]\n" - "ldr x23, [x26, #0x88]\n" - "fmla v28.4s, v6.4s, v1.s[2]\n" - "ldr x21, [%x[Apanel], #0x68]\n" - "fmla v31.4s, v6.4s, v1.s[3]\n" - "ldr d1, [%x[Apanel], #0x50]\n" - "ldr d6, [x26, #0x60]\n" - "fmla v8.4s, v7.4s, v2.s[0]\n" - "fmla v11.4s, v7.4s, v2.s[1]\n" - "mov v1.d[1], x20\n" - "fmla v14.4s, v7.4s, v2.s[2]\n" - "mov v6.d[1], x19\n" - "fmla v17.4s, v7.4s, v2.s[3]\n" - "ldr x20, [%x[Apanel], #0x78]\n" - "fmla v20.4s, v7.4s, v3.s[0]\n" - "ldr x22, [x26, #0x98]\n" - "fmla v23.4s, v7.4s, v3.s[1]\n" - "ldr x19, [x26, #0xa8]\n" - "fmla v26.4s, v7.4s, v3.s[2]\n" - "prfm pldl1keep, [%x[Apanel], #0x100]\n" - "fmla v29.4s, v7.4s, v3.s[3]\n" - "ldr d7, [x26, #0x70]\n" - "fmla v9.4s, v4.4s, v2.s[0]\n" - "mov v7.d[1], x24\n" - "fmla v12.4s, v4.4s, v2.s[1]\n" - "ldr x24, [x26, #0xb8]\n" - "fmla v15.4s, v4.4s, v2.s[2]\n" - "prfm pldl1keep, [x26, #0x180]\n" - "fmla v18.4s, v4.4s, v2.s[3]\n" - "prfm pldl1keep, [x26, #0x1c0]\n" - "fmla v21.4s, v4.4s, v3.s[0]\n" - "prfm pldl1keep, [%x[Apanel], #0x140]\n" - "fmla v24.4s, v4.4s, v3.s[1]\n" - "prfm pldl1keep, [x26, #0x200]\n" - "fmla v27.4s, v4.4s, v3.s[2]\n" - "sub x25, x25, #0x4\n" - "fmla v30.4s, v4.4s, v3.s[3]\n" - "ldr d4, [x26, #0x80]\n" - "fmla v10.4s, v5.4s, v2.s[0]\n" - "mov v4.d[1], x23\n" - "fmla v13.4s, v5.4s, v2.s[1]\n" - "cmp x25, #0x4\n" - "fmla v16.4s, v5.4s, v2.s[2]\n" - "fmla v19.4s, v5.4s, v2.s[3]\n" - "ldr d2, [%x[Apanel], #0x60]\n" - "fmla v22.4s, v5.4s, v3.s[0]\n" - "mov v2.d[1], x21\n" - "fmla v25.4s, v5.4s, v3.s[1]\n" - "fmla v28.4s, v5.4s, v3.s[2]\n" - "fmla v31.4s, v5.4s, v3.s[3]\n" - "ldr d3, [%x[Apanel], #0x70]\n" - "fmla v8.4s, v6.4s, v0.s[0]\n" - "ldr d5, [x26, #0x90]\n" - "fmla v11.4s, v6.4s, v0.s[1]\n" - "mov v3.d[1], x20\n" - "fmla v14.4s, v6.4s, v0.s[2]\n" - "mov v5.d[1], x22\n" - "fmla v17.4s, v6.4s, v0.s[3]\n" - "add %x[Apanel], %x[Apanel], #0x80\n" - "fmla v20.4s, v6.4s, v1.s[0]\n" - "ldr x21, [%x[Apanel], #0x8]\n" - "fmla v23.4s, v6.4s, v1.s[1]\n" - "ldr x20, [%x[Apanel], #0x18]\n" - "fmla v26.4s, v6.4s, v1.s[2]\n" - "fmla v29.4s, v6.4s, v1.s[3]\n" - "ldr d6, [x26, #0xa0]\n" - "fmla v9.4s, v7.4s, v0.s[0]\n" - "mov v6.d[1], x19\n" - "fmla v12.4s, v7.4s, v0.s[1]\n" - "fmla v15.4s, v7.4s, v0.s[2]\n" - "fmla v18.4s, v7.4s, v0.s[3]\n" - "fmla v21.4s, v7.4s, v1.s[0]\n" - "fmla v24.4s, v7.4s, v1.s[1]\n" - "fmla v27.4s, v7.4s, v1.s[2]\n" - "fmla v30.4s, v7.4s, v1.s[3]\n" - "ldr d7, [x26, #0xb0]\n" - "fmla v10.4s, v4.4s, v0.s[0]\n" - "add x26, x26, #0xc0\n" - "fmla v13.4s, v4.4s, v0.s[1]\n" - "ldr x23, [x26, #0x8]\n" - "fmla v16.4s, v4.4s, v0.s[2]\n" - "ldr x22, [x26, #0x18]\n" - "fmla v19.4s, v4.4s, v0.s[3]\n" - "ldr d0, [%x[Apanel], #0x0]\n" - "fmla v22.4s, v4.4s, v1.s[0]\n" - "ldr x19, [x26, #0x28]\n" - "fmla v25.4s, v4.4s, v1.s[1]\n" - "mov v7.d[1], x24\n" - "fmla v28.4s, v4.4s, v1.s[2]\n" - "mov v0.d[1], x21\n" - "fmla v31.4s, v4.4s, v1.s[3]\n" - "ldr d1, [%x[Apanel], #0x10]\n" - "fmla v8.4s, v5.4s, v2.s[0]\n" - "ldr d4, [x26, #0x0]\n" - "fmla v11.4s, v5.4s, v2.s[1]\n" - "mov v1.d[1], x20\n" - "fmla v14.4s, v5.4s, v2.s[2]\n" - "mov v4.d[1], x23\n" - "fmla v17.4s, v5.4s, v2.s[3]\n" - "fmla v20.4s, v5.4s, v3.s[0]\n" - "fmla v23.4s, v5.4s, v3.s[1]\n" - "fmla v26.4s, v5.4s, v3.s[2]\n" - "fmla v29.4s, v5.4s, v3.s[3]\n" - "ldr d5, [x26, #0x10]\n" - "fmla v9.4s, v6.4s, v2.s[0]\n" - "mov v5.d[1], x22\n" - "fmla v12.4s, v6.4s, v2.s[1]\n" - "fmla v15.4s, v6.4s, v2.s[2]\n" - "fmla v18.4s, v6.4s, v2.s[3]\n" - "fmla v21.4s, v6.4s, v3.s[0]\n" - "fmla v24.4s, v6.4s, v3.s[1]\n" - "fmla v27.4s, v6.4s, v3.s[2]\n" - "fmla v30.4s, v6.4s, v3.s[3]\n" - "ldr d6, [x26, #0x20]\n" - "mov v6.d[1], x19\n" - "fmla v10.4s, v7.4s, v2.s[0]\n" - "fmla v13.4s, v7.4s, v2.s[1]\n" - "fmla v16.4s, v7.4s, v2.s[2]\n" - "fmla v19.4s, v7.4s, v2.s[3]\n" - "fmla v22.4s, v7.4s, v3.s[0]\n" - "fmla v25.4s, v7.4s, v3.s[1]\n" - "fmla v28.4s, v7.4s, v3.s[2]\n" - "fmla v31.4s, v7.4s, v3.s[3]\n" - "bge 3b\n" - "4:" // main loop skip - "add %x[Apanel], %x[Apanel], #0x20\n" - "fmla v8.4s, v4.4s, v0.s[0]\n" - "add x26, x26, #0x30\n" - "fmla v11.4s, v4.4s, v0.s[1]\n" - "fmla v14.4s, v4.4s, v0.s[2]\n" - "fmla v17.4s, v4.4s, v0.s[3]\n" - "fmla v20.4s, v4.4s, v1.s[0]\n" - "fmla v23.4s, v4.4s, v1.s[1]\n" - "fmla v26.4s, v4.4s, v1.s[2]\n" - "fmla v29.4s, v4.4s, v1.s[3]\n" - "fmla v9.4s, v5.4s, v0.s[0]\n" - "fmla v12.4s, v5.4s, v0.s[1]\n" - "fmla v15.4s, v5.4s, v0.s[2]\n" - "fmla v18.4s, v5.4s, v0.s[3]\n" - "fmla v21.4s, v5.4s, v1.s[0]\n" - "fmla v24.4s, v5.4s, v1.s[1]\n" - "fmla v27.4s, v5.4s, v1.s[2]\n" - "fmla v30.4s, v5.4s, v1.s[3]\n" - "fmla v10.4s, v6.4s, v0.s[0]\n" - "fmla v13.4s, v6.4s, v0.s[1]\n" - "fmla v16.4s, v6.4s, v0.s[2]\n" - "fmla v19.4s, v6.4s, v0.s[3]\n" - "fmla v22.4s, v6.4s, v1.s[0]\n" - "fmla v25.4s, v6.4s, v1.s[1]\n" - "fmla v28.4s, v6.4s, v1.s[2]\n" - "fmla v31.4s, v6.4s, v1.s[3]\n" - "cbz x25, 6f\n" - "5:" // odd loop - "ldr q0, [%x[Apanel], #0x0]\n" - "subs x25, x25, #0x1\n" - "ldr q1, [%x[Apanel], #0x10]\n" - "add %x[Apanel], %x[Apanel], #0x20\n" - "ldr q7, [x26, #0x0]\n" - "fmla v8.4s, v7.4s, v0.s[0]\n" - "ldr q4, [x26, #0x10]\n" - "fmla v11.4s, v7.4s, v0.s[1]\n" - "ldr q5, [x26, #0x20]\n" - "fmla v14.4s, v7.4s, v0.s[2]\n" - "fmla v17.4s, v7.4s, v0.s[3]\n" - "add x26, x26, #0x30\n" - "fmla v20.4s, v7.4s, v1.s[0]\n" - "fmla v23.4s, v7.4s, v1.s[1]\n" - "fmla v26.4s, v7.4s, v1.s[2]\n" - "fmla v29.4s, v7.4s, v1.s[3]\n" - "fmla v9.4s, v4.4s, v0.s[0]\n" - "fmla v12.4s, v4.4s, v0.s[1]\n" - "fmla v15.4s, v4.4s, v0.s[2]\n" - "fmla v18.4s, v4.4s, v0.s[3]\n" - "fmla v21.4s, v4.4s, v1.s[0]\n" - "fmla v24.4s, v4.4s, v1.s[1]\n" - "fmla v27.4s, v4.4s, v1.s[2]\n" - "fmla v30.4s, v4.4s, v1.s[3]\n" - "fmla v10.4s, v5.4s, v0.s[0]\n" - "fmla v13.4s, v5.4s, v0.s[1]\n" - "fmla v16.4s, v5.4s, v0.s[2]\n" - "fmla v19.4s, v5.4s, v0.s[3]\n" - "fmla v22.4s, v5.4s, v1.s[0]\n" - "fmla v25.4s, v5.4s, v1.s[1]\n" - "fmla v28.4s, v5.4s, v1.s[2]\n" - "fmla v31.4s, v5.4s, v1.s[3]\n" - "bne 5b\n" - "6:" // multiply loop done - "subs x28, x28, #0x1\n" - "str q8, [%x[Cpanel], #0x0]\n" - "str q9, [%x[Cpanel], #0x10]\n" - "str q10, [%x[Cpanel], #0x20]\n" - "str q11, [%x[Cpanel], #0x30]\n" - "str q12, [%x[Cpanel], #0x40]\n" - "str q13, [%x[Cpanel], #0x50]\n" - "str q14, [%x[Cpanel], #0x60]\n" - "str q15, [%x[Cpanel], #0x70]\n" - "str q16, [%x[Cpanel], #0x80]\n" - "str q17, [%x[Cpanel], #0x90]\n" - "str q18, [%x[Cpanel], #0xa0]\n" - "str q19, [%x[Cpanel], #0xb0]\n" - "str q20, [%x[Cpanel], #0xc0]\n" - "str q21, [%x[Cpanel], #0xd0]\n" - "str q22, [%x[Cpanel], #0xe0]\n" - "str q23, [%x[Cpanel], #0xf0]\n" - "str q24, [%x[Cpanel], #0x100]\n" - "str q25, [%x[Cpanel], #0x110]\n" - "str q26, [%x[Cpanel], #0x120]\n" - "str q27, [%x[Cpanel], #0x130]\n" - "str q28, [%x[Cpanel], #0x140]\n" - "str q29, [%x[Cpanel], #0x150]\n" - "str q30, [%x[Cpanel], #0x160]\n" - "str q31, [%x[Cpanel], #0x170]\n" - "add %x[Cpanel], %x[Cpanel], #0x180\n" - "bgt 2b\n" - "subs %x[ablocks], %x[ablocks], #0x1\n" - "bne 1b\n" - : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks) - : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks)) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" - ); -} - -} // namespace arm_gemm -#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp32_mla_8x12/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp32_mla_8x12/generic.cpp deleted file mode 100644 index 06dc1534c1..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp32_mla_8x12/generic.cpp +++ /dev/null @@ -1,320 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - */ -#ifdef __aarch64__ - -#include - -namespace arm_gemm { - -void a64_interleaved_fp32_mla_8x12( - const float *Apanel, const float *Bpanel, - float *Cpanel, int ablocks, int bblocks, int K) { - - struct KernelArgs { - size_t bblocks = {}; - size_t K = {}; - const float *Bpanel = {}; - } ka; - - ka.bblocks = bblocks; - ka.K = (K/1) - 1; - ka.Bpanel = Bpanel; - - __asm__ __volatile__( - - "1:" // Height loop - "ldr x22, [%x[args_ptr], %[offsetof_bblocks]]\n" - "mov x21, %x[Apanel]\n" - "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n" - "2:" // Width loop - "ldr x19, [%x[args_ptr], %[offsetof_K]]\n" - "mov %x[Apanel], x21\n" - "cmp x19, #0x4\n" - "movi v8.16b, #0x0\n" - "movi v9.16b, #0x0\n" - "prfm pldl1keep, [%x[Apanel], #0x0]\n" - "movi v10.16b, #0x0\n" - "movi v11.16b, #0x0\n" - "prfm pldl1keep, [x20, #0x0]\n" - "movi v12.16b, #0x0\n" - "movi v13.16b, #0x0\n" - "prfm pldl1keep, [x20, #0x40]\n" - "movi v14.16b, #0x0\n" - "movi v15.16b, #0x0\n" - "prfm pldl1keep, [%x[Apanel], #0x40]\n" - "movi v16.16b, #0x0\n" - "movi v17.16b, #0x0\n" - "prfm pldl1keep, [x20, #0x80]\n" - "movi v18.16b, #0x0\n" - "movi v19.16b, #0x0\n" - "prfm pldl1keep, [%x[Apanel], #0x80]\n" - "movi v20.16b, #0x0\n" - "movi v21.16b, #0x0\n" - "prfm pldl1keep, [x20, #0xc0]\n" - "movi v22.16b, #0x0\n" - "movi v23.16b, #0x0\n" - "prfm pldl1keep, [x20, #0x100]\n" - "movi v24.16b, #0x0\n" - "movi v25.16b, #0x0\n" - "prfm pldl1keep, [%x[Apanel], #0xc0]\n" - "movi v26.16b, #0x0\n" - "movi v27.16b, #0x0\n" - "prfm pldl1keep, [x20, #0x140]\n" - "movi v28.16b, #0x0\n" - "movi v29.16b, #0x0\n" - "ldr q0, [%x[Apanel], #0x0]\n" - "movi v30.16b, #0x0\n" - "movi v31.16b, #0x0\n" - "ldr q1, [%x[Apanel], #0x10]\n" - "ldr q4, [x20, #0x0]\n" - "ldr q5, [x20, #0x10]\n" - "ldr q6, [x20, #0x20]\n" - "blt 4f\n" - "3:" // main loop head - "fmla v8.4s, v4.4s, v0.s[0]\n" - "fmla v11.4s, v4.4s, v0.s[1]\n" - "ldr q2, [%x[Apanel], #0x20]\n" - "fmla v14.4s, v4.4s, v0.s[2]\n" - "fmla v17.4s, v4.4s, v0.s[3]\n" - "ldr q3, [%x[Apanel], #0x30]\n" - "fmla v20.4s, v4.4s, v1.s[0]\n" - "fmla v23.4s, v4.4s, v1.s[1]\n" - "ldr q7, [x20, #0x30]\n" - "fmla v26.4s, v4.4s, v1.s[2]\n" - "fmla v29.4s, v4.4s, v1.s[3]\n" - "ldr q4, [x20, #0x40]\n" - "fmla v9.4s, v5.4s, v0.s[0]\n" - "fmla v12.4s, v5.4s, v0.s[1]\n" - "sub x19, x19, #0x4\n" - "fmla v15.4s, v5.4s, v0.s[2]\n" - "fmla v18.4s, v5.4s, v0.s[3]\n" - "cmp x19, #0x4\n" - "fmla v21.4s, v5.4s, v1.s[0]\n" - "fmla v24.4s, v5.4s, v1.s[1]\n" - "prfm pldl1keep, [%x[Apanel], #0x100]\n" - "fmla v27.4s, v5.4s, v1.s[2]\n" - "fmla v30.4s, v5.4s, v1.s[3]\n" - "ldr q5, [x20, #0x50]\n" - "fmla v10.4s, v6.4s, v0.s[0]\n" - "fmla v13.4s, v6.4s, v0.s[1]\n" - "prfm pldl1keep, [x20, #0x180]\n" - "fmla v16.4s, v6.4s, v0.s[2]\n" - "fmla v19.4s, v6.4s, v0.s[3]\n" - "ldr q0, [%x[Apanel], #0x40]\n" - "fmla v22.4s, v6.4s, v1.s[0]\n" - "fmla v25.4s, v6.4s, v1.s[1]\n" - "prfm pldl1keep, [x20, #0x1c0]\n" - "fmla v28.4s, v6.4s, v1.s[2]\n" - "fmla v31.4s, v6.4s, v1.s[3]\n" - "ldr q1, [%x[Apanel], #0x50]\n" - "fmla v8.4s, v7.4s, v2.s[0]\n" - "fmla v11.4s, v7.4s, v2.s[1]\n" - "ldr q6, [x20, #0x60]\n" - "fmla v14.4s, v7.4s, v2.s[2]\n" - "fmla v17.4s, v7.4s, v2.s[3]\n" - "prfm pldl1keep, [%x[Apanel], #0x140]\n" - "fmla v20.4s, v7.4s, v3.s[0]\n" - "fmla v23.4s, v7.4s, v3.s[1]\n" - "prfm pldl1keep, [x20, #0x200]\n" - "fmla v26.4s, v7.4s, v3.s[2]\n" - "fmla v29.4s, v7.4s, v3.s[3]\n" - "ldr q7, [x20, #0x70]\n" - "fmla v9.4s, v4.4s, v2.s[0]\n" - "fmla v12.4s, v4.4s, v2.s[1]\n" - "fmla v15.4s, v4.4s, v2.s[2]\n" - "fmla v18.4s, v4.4s, v2.s[3]\n" - "fmla v21.4s, v4.4s, v3.s[0]\n" - "fmla v24.4s, v4.4s, v3.s[1]\n" - "fmla v27.4s, v4.4s, v3.s[2]\n" - "fmla v30.4s, v4.4s, v3.s[3]\n" - "ldr q4, [x20, #0x80]\n" - "fmla v10.4s, v5.4s, v2.s[0]\n" - "fmla v13.4s, v5.4s, v2.s[1]\n" - "fmla v16.4s, v5.4s, v2.s[2]\n" - "fmla v19.4s, v5.4s, v2.s[3]\n" - "ldr q2, [%x[Apanel], #0x60]\n" - "fmla v22.4s, v5.4s, v3.s[0]\n" - "fmla v25.4s, v5.4s, v3.s[1]\n" - "fmla v28.4s, v5.4s, v3.s[2]\n" - "fmla v31.4s, v5.4s, v3.s[3]\n" - "ldr q3, [%x[Apanel], #0x70]\n" - "fmla v8.4s, v6.4s, v0.s[0]\n" - "fmla v11.4s, v6.4s, v0.s[1]\n" - "ldr q5, [x20, #0x90]\n" - "fmla v14.4s, v6.4s, v0.s[2]\n" - "fmla v17.4s, v6.4s, v0.s[3]\n" - "add %x[Apanel], %x[Apanel], #0x80\n" - "fmla v20.4s, v6.4s, v1.s[0]\n" - "fmla v23.4s, v6.4s, v1.s[1]\n" - "fmla v26.4s, v6.4s, v1.s[2]\n" - "fmla v29.4s, v6.4s, v1.s[3]\n" - "ldr q6, [x20, #0xa0]\n" - "fmla v9.4s, v7.4s, v0.s[0]\n" - "fmla v12.4s, v7.4s, v0.s[1]\n" - "fmla v15.4s, v7.4s, v0.s[2]\n" - "fmla v18.4s, v7.4s, v0.s[3]\n" - "fmla v21.4s, v7.4s, v1.s[0]\n" - "fmla v24.4s, v7.4s, v1.s[1]\n" - "fmla v27.4s, v7.4s, v1.s[2]\n" - "fmla v30.4s, v7.4s, v1.s[3]\n" - "ldr q7, [x20, #0xb0]\n" - "fmla v10.4s, v4.4s, v0.s[0]\n" - "fmla v13.4s, v4.4s, v0.s[1]\n" - "add x20, x20, #0xc0\n" - "fmla v16.4s, v4.4s, v0.s[2]\n" - "fmla v19.4s, v4.4s, v0.s[3]\n" - "ldr q0, [%x[Apanel], #0x0]\n" - "fmla v22.4s, v4.4s, v1.s[0]\n" - "fmla v25.4s, v4.4s, v1.s[1]\n" - "fmla v28.4s, v4.4s, v1.s[2]\n" - "fmla v31.4s, v4.4s, v1.s[3]\n" - "ldr q1, [%x[Apanel], #0x10]\n" - "fmla v8.4s, v5.4s, v2.s[0]\n" - "fmla v11.4s, v5.4s, v2.s[1]\n" - "ldr q4, [x20, #0x0]\n" - "fmla v14.4s, v5.4s, v2.s[2]\n" - "fmla v17.4s, v5.4s, v2.s[3]\n" - "fmla v20.4s, v5.4s, v3.s[0]\n" - "fmla v23.4s, v5.4s, v3.s[1]\n" - "fmla v26.4s, v5.4s, v3.s[2]\n" - "fmla v29.4s, v5.4s, v3.s[3]\n" - "ldr q5, [x20, #0x10]\n" - "fmla v9.4s, v6.4s, v2.s[0]\n" - "fmla v12.4s, v6.4s, v2.s[1]\n" - "fmla v15.4s, v6.4s, v2.s[2]\n" - "fmla v18.4s, v6.4s, v2.s[3]\n" - "fmla v21.4s, v6.4s, v3.s[0]\n" - "fmla v24.4s, v6.4s, v3.s[1]\n" - "fmla v27.4s, v6.4s, v3.s[2]\n" - "fmla v30.4s, v6.4s, v3.s[3]\n" - "ldr q6, [x20, #0x20]\n" - "fmla v10.4s, v7.4s, v2.s[0]\n" - "fmla v13.4s, v7.4s, v2.s[1]\n" - "fmla v16.4s, v7.4s, v2.s[2]\n" - "fmla v19.4s, v7.4s, v2.s[3]\n" - "fmla v22.4s, v7.4s, v3.s[0]\n" - "fmla v25.4s, v7.4s, v3.s[1]\n" - "fmla v28.4s, v7.4s, v3.s[2]\n" - "fmla v31.4s, v7.4s, v3.s[3]\n" - "bge 3b\n" - "4:" // main loop skip - "add %x[Apanel], %x[Apanel], #0x20\n" - "fmla v8.4s, v4.4s, v0.s[0]\n" - "fmla v11.4s, v4.4s, v0.s[1]\n" - "add x20, x20, #0x30\n" - "fmla v14.4s, v4.4s, v0.s[2]\n" - "fmla v17.4s, v4.4s, v0.s[3]\n" - "fmla v20.4s, v4.4s, v1.s[0]\n" - "fmla v23.4s, v4.4s, v1.s[1]\n" - "fmla v26.4s, v4.4s, v1.s[2]\n" - "fmla v29.4s, v4.4s, v1.s[3]\n" - "fmla v9.4s, v5.4s, v0.s[0]\n" - "fmla v12.4s, v5.4s, v0.s[1]\n" - "fmla v15.4s, v5.4s, v0.s[2]\n" - "fmla v18.4s, v5.4s, v0.s[3]\n" - "fmla v21.4s, v5.4s, v1.s[0]\n" - "fmla v24.4s, v5.4s, v1.s[1]\n" - "fmla v27.4s, v5.4s, v1.s[2]\n" - "fmla v30.4s, v5.4s, v1.s[3]\n" - "fmla v10.4s, v6.4s, v0.s[0]\n" - "fmla v13.4s, v6.4s, v0.s[1]\n" - "fmla v16.4s, v6.4s, v0.s[2]\n" - "fmla v19.4s, v6.4s, v0.s[3]\n" - "fmla v22.4s, v6.4s, v1.s[0]\n" - "fmla v25.4s, v6.4s, v1.s[1]\n" - "fmla v28.4s, v6.4s, v1.s[2]\n" - "fmla v31.4s, v6.4s, v1.s[3]\n" - "cbz x19, 6f\n" - "5:" // odd loop - "ldr q0, [%x[Apanel], #0x0]\n" - "ldr q1, [%x[Apanel], #0x10]\n" - "subs x19, x19, #0x1\n" - "ldr q7, [x20, #0x0]\n" - "ldr q4, [x20, #0x10]\n" - "fmla v8.4s, v7.4s, v0.s[0]\n" - "ldr q5, [x20, #0x20]\n" - "fmla v11.4s, v7.4s, v0.s[1]\n" - "fmla v14.4s, v7.4s, v0.s[2]\n" - "fmla v17.4s, v7.4s, v0.s[3]\n" - "fmla v20.4s, v7.4s, v1.s[0]\n" - "add %x[Apanel], %x[Apanel], #0x20\n" - "fmla v23.4s, v7.4s, v1.s[1]\n" - "fmla v26.4s, v7.4s, v1.s[2]\n" - "add x20, x20, #0x30\n" - "fmla v29.4s, v7.4s, v1.s[3]\n" - "fmla v9.4s, v4.4s, v0.s[0]\n" - "fmla v12.4s, v4.4s, v0.s[1]\n" - "fmla v15.4s, v4.4s, v0.s[2]\n" - "fmla v18.4s, v4.4s, v0.s[3]\n" - "fmla v21.4s, v4.4s, v1.s[0]\n" - "fmla v24.4s, v4.4s, v1.s[1]\n" - "fmla v27.4s, v4.4s, v1.s[2]\n" - "fmla v30.4s, v4.4s, v1.s[3]\n" - "fmla v10.4s, v5.4s, v0.s[0]\n" - "fmla v13.4s, v5.4s, v0.s[1]\n" - "fmla v16.4s, v5.4s, v0.s[2]\n" - "fmla v19.4s, v5.4s, v0.s[3]\n" - "fmla v22.4s, v5.4s, v1.s[0]\n" - "fmla v25.4s, v5.4s, v1.s[1]\n" - "fmla v28.4s, v5.4s, v1.s[2]\n" - "fmla v31.4s, v5.4s, v1.s[3]\n" - "bne 5b\n" - "6:" // multiply loop done - "subs x22, x22, #0x1\n" - "str q8, [%x[Cpanel], #0x0]\n" - "str q9, [%x[Cpanel], #0x10]\n" - "str q10, [%x[Cpanel], #0x20]\n" - "str q11, [%x[Cpanel], #0x30]\n" - "str q12, [%x[Cpanel], #0x40]\n" - "str q13, [%x[Cpanel], #0x50]\n" - "str q14, [%x[Cpanel], #0x60]\n" - "str q15, [%x[Cpanel], #0x70]\n" - "str q16, [%x[Cpanel], #0x80]\n" - "str q17, [%x[Cpanel], #0x90]\n" - "str q18, [%x[Cpanel], #0xa0]\n" - "str q19, [%x[Cpanel], #0xb0]\n" - "str q20, [%x[Cpanel], #0xc0]\n" - "str q21, [%x[Cpanel], #0xd0]\n" - "str q22, [%x[Cpanel], #0xe0]\n" - "str q23, [%x[Cpanel], #0xf0]\n" - "str q24, [%x[Cpanel], #0x100]\n" - "str q25, [%x[Cpanel], #0x110]\n" - "str q26, [%x[Cpanel], #0x120]\n" - "str q27, [%x[Cpanel], #0x130]\n" - "str q28, [%x[Cpanel], #0x140]\n" - "str q29, [%x[Cpanel], #0x150]\n" - "str q30, [%x[Cpanel], #0x160]\n" - "str q31, [%x[Cpanel], #0x170]\n" - "add %x[Cpanel], %x[Cpanel], #0x180\n" - "bgt 2b\n" - "subs %x[ablocks], %x[ablocks], #0x1\n" - "bne 1b\n" - : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks) - : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks)) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22" - ); -} - -} // namespace arm_gemm -#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp32_mla_8x12/x1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp32_mla_8x12/x1.cpp deleted file mode 100644 index 8ba36cb87d..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp32_mla_8x12/x1.cpp +++ /dev/null @@ -1,320 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - */ -#ifdef __aarch64__ - -#include - -namespace arm_gemm { - -void a64_interleaved_fp32_mla_8x12_x1( - const float *Apanel, const float *Bpanel, - float *Cpanel, int ablocks, int bblocks, int K) { - - struct KernelArgs { - size_t bblocks = {}; - size_t K = {}; - const float *Bpanel = {}; - } ka; - - ka.bblocks = bblocks; - ka.K = (K/1) - 1; - ka.Bpanel = Bpanel; - - __asm__ __volatile__( - - "1:" // Height loop - "ldr x22, [%x[args_ptr], %[offsetof_bblocks]]\n" - "mov x21, %x[Apanel]\n" - "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n" - "2:" // Width loop - "ldr x19, [%x[args_ptr], %[offsetof_K]]\n" - "mov %x[Apanel], x21\n" - "cmp x19, #0x4\n" - "movi v8.16b, #0x0\n" - "movi v9.16b, #0x0\n" - "prfm pldl1keep, [%x[Apanel], #0x0]\n" - "movi v10.16b, #0x0\n" - "movi v11.16b, #0x0\n" - "prfm pldl1keep, [x20, #0x0]\n" - "movi v12.16b, #0x0\n" - "movi v13.16b, #0x0\n" - "prfm pldl1keep, [x20, #0x40]\n" - "movi v14.16b, #0x0\n" - "movi v15.16b, #0x0\n" - "prfm pldl1keep, [%x[Apanel], #0x40]\n" - "movi v16.16b, #0x0\n" - "movi v17.16b, #0x0\n" - "prfm pldl1keep, [x20, #0x80]\n" - "movi v18.16b, #0x0\n" - "movi v19.16b, #0x0\n" - "prfm pldl1keep, [%x[Apanel], #0x80]\n" - "movi v20.16b, #0x0\n" - "movi v21.16b, #0x0\n" - "prfm pldl1keep, [x20, #0xc0]\n" - "movi v22.16b, #0x0\n" - "movi v23.16b, #0x0\n" - "prfm pldl1keep, [x20, #0x100]\n" - "movi v24.16b, #0x0\n" - "movi v25.16b, #0x0\n" - "prfm pldl1keep, [%x[Apanel], #0xc0]\n" - "movi v26.16b, #0x0\n" - "movi v27.16b, #0x0\n" - "prfm pldl1keep, [x20, #0x140]\n" - "movi v28.16b, #0x0\n" - "movi v29.16b, #0x0\n" - "ldr q0, [%x[Apanel], #0x0]\n" - "movi v30.16b, #0x0\n" - "movi v31.16b, #0x0\n" - "ldr q1, [%x[Apanel], #0x10]\n" - "ldr q2, [x20, #0x0]\n" - "ldr q3, [x20, #0x10]\n" - "ldr q4, [x20, #0x20]\n" - "blt 4f\n" - "3:" // main loop head - "fmla v8.4s, v2.4s, v0.s[0]\n" - "fmla v11.4s, v2.4s, v0.s[1]\n" - "sub x19, x19, #0x4\n" - "fmla v14.4s, v2.4s, v0.s[2]\n" - "fmla v17.4s, v2.4s, v0.s[3]\n" - "cmp x19, #0x4\n" - "fmla v20.4s, v2.4s, v1.s[0]\n" - "fmla v23.4s, v2.4s, v1.s[1]\n" - "prfm pldl1keep, [%x[Apanel], #0x100]\n" - "fmla v26.4s, v2.4s, v1.s[2]\n" - "fmla v29.4s, v2.4s, v1.s[3]\n" - "ldr q2, [x20, #0x30]\n" - "fmla v9.4s, v3.4s, v0.s[0]\n" - "fmla v12.4s, v3.4s, v0.s[1]\n" - "prfm pldl1keep, [x20, #0x180]\n" - "fmla v15.4s, v3.4s, v0.s[2]\n" - "fmla v18.4s, v3.4s, v0.s[3]\n" - "prfm pldl1keep, [x20, #0x1c0]\n" - "fmla v21.4s, v3.4s, v1.s[0]\n" - "fmla v24.4s, v3.4s, v1.s[1]\n" - "prfm pldl1keep, [%x[Apanel], #0x140]\n" - "fmla v27.4s, v3.4s, v1.s[2]\n" - "fmla v30.4s, v3.4s, v1.s[3]\n" - "ldr q3, [x20, #0x40]\n" - "fmla v10.4s, v4.4s, v0.s[0]\n" - "fmla v13.4s, v4.4s, v0.s[1]\n" - "prfm pldl1keep, [x20, #0x200]\n" - "fmla v16.4s, v4.4s, v0.s[2]\n" - "fmla v19.4s, v4.4s, v0.s[3]\n" - "ldr q0, [%x[Apanel], #0x20]\n" - "fmla v22.4s, v4.4s, v1.s[0]\n" - "fmla v25.4s, v4.4s, v1.s[1]\n" - "fmla v28.4s, v4.4s, v1.s[2]\n" - "fmla v31.4s, v4.4s, v1.s[3]\n" - "ldr q1, [%x[Apanel], #0x30]\n" - "ldr q4, [x20, #0x50]\n" - "fmla v8.4s, v2.4s, v0.s[0]\n" - "fmla v11.4s, v2.4s, v0.s[1]\n" - "fmla v14.4s, v2.4s, v0.s[2]\n" - "fmla v17.4s, v2.4s, v0.s[3]\n" - "fmla v20.4s, v2.4s, v1.s[0]\n" - "fmla v23.4s, v2.4s, v1.s[1]\n" - "fmla v26.4s, v2.4s, v1.s[2]\n" - "fmla v29.4s, v2.4s, v1.s[3]\n" - "ldr q2, [x20, #0x60]\n" - "fmla v9.4s, v3.4s, v0.s[0]\n" - "fmla v12.4s, v3.4s, v0.s[1]\n" - "fmla v15.4s, v3.4s, v0.s[2]\n" - "fmla v18.4s, v3.4s, v0.s[3]\n" - "fmla v21.4s, v3.4s, v1.s[0]\n" - "fmla v24.4s, v3.4s, v1.s[1]\n" - "fmla v27.4s, v3.4s, v1.s[2]\n" - "fmla v30.4s, v3.4s, v1.s[3]\n" - "ldr q3, [x20, #0x70]\n" - "fmla v10.4s, v4.4s, v0.s[0]\n" - "fmla v13.4s, v4.4s, v0.s[1]\n" - "fmla v16.4s, v4.4s, v0.s[2]\n" - "fmla v19.4s, v4.4s, v0.s[3]\n" - "ldr q0, [%x[Apanel], #0x40]\n" - "fmla v22.4s, v4.4s, v1.s[0]\n" - "fmla v25.4s, v4.4s, v1.s[1]\n" - "fmla v28.4s, v4.4s, v1.s[2]\n" - "fmla v31.4s, v4.4s, v1.s[3]\n" - "ldr q1, [%x[Apanel], #0x50]\n" - "ldr q4, [x20, #0x80]\n" - "fmla v8.4s, v2.4s, v0.s[0]\n" - "fmla v11.4s, v2.4s, v0.s[1]\n" - "fmla v14.4s, v2.4s, v0.s[2]\n" - "fmla v17.4s, v2.4s, v0.s[3]\n" - "fmla v20.4s, v2.4s, v1.s[0]\n" - "fmla v23.4s, v2.4s, v1.s[1]\n" - "fmla v26.4s, v2.4s, v1.s[2]\n" - "fmla v29.4s, v2.4s, v1.s[3]\n" - "ldr q2, [x20, #0x90]\n" - "fmla v9.4s, v3.4s, v0.s[0]\n" - "fmla v12.4s, v3.4s, v0.s[1]\n" - "fmla v15.4s, v3.4s, v0.s[2]\n" - "fmla v18.4s, v3.4s, v0.s[3]\n" - "fmla v21.4s, v3.4s, v1.s[0]\n" - "fmla v24.4s, v3.4s, v1.s[1]\n" - "fmla v27.4s, v3.4s, v1.s[2]\n" - "fmla v30.4s, v3.4s, v1.s[3]\n" - "ldr q3, [x20, #0xa0]\n" - "fmla v10.4s, v4.4s, v0.s[0]\n" - "fmla v13.4s, v4.4s, v0.s[1]\n" - "fmla v16.4s, v4.4s, v0.s[2]\n" - "fmla v19.4s, v4.4s, v0.s[3]\n" - "ldr q0, [%x[Apanel], #0x60]\n" - "fmla v22.4s, v4.4s, v1.s[0]\n" - "fmla v25.4s, v4.4s, v1.s[1]\n" - "fmla v28.4s, v4.4s, v1.s[2]\n" - "fmla v31.4s, v4.4s, v1.s[3]\n" - "ldr q1, [%x[Apanel], #0x70]\n" - "ldr q4, [x20, #0xb0]\n" - "add %x[Apanel], %x[Apanel], #0x80\n" - "add x20, x20, #0xc0\n" - "fmla v8.4s, v2.4s, v0.s[0]\n" - "fmla v11.4s, v2.4s, v0.s[1]\n" - "fmla v14.4s, v2.4s, v0.s[2]\n" - "fmla v17.4s, v2.4s, v0.s[3]\n" - "fmla v20.4s, v2.4s, v1.s[0]\n" - "fmla v23.4s, v2.4s, v1.s[1]\n" - "fmla v26.4s, v2.4s, v1.s[2]\n" - "fmla v29.4s, v2.4s, v1.s[3]\n" - "ldr q2, [x20, #0x0]\n" - "fmla v9.4s, v3.4s, v0.s[0]\n" - "fmla v12.4s, v3.4s, v0.s[1]\n" - "fmla v15.4s, v3.4s, v0.s[2]\n" - "fmla v18.4s, v3.4s, v0.s[3]\n" - "fmla v21.4s, v3.4s, v1.s[0]\n" - "fmla v24.4s, v3.4s, v1.s[1]\n" - "fmla v27.4s, v3.4s, v1.s[2]\n" - "fmla v30.4s, v3.4s, v1.s[3]\n" - "ldr q3, [x20, #0x10]\n" - "fmla v10.4s, v4.4s, v0.s[0]\n" - "fmla v13.4s, v4.4s, v0.s[1]\n" - "fmla v16.4s, v4.4s, v0.s[2]\n" - "fmla v19.4s, v4.4s, v0.s[3]\n" - "ldr q0, [%x[Apanel], #0x0]\n" - "fmla v22.4s, v4.4s, v1.s[0]\n" - "fmla v25.4s, v4.4s, v1.s[1]\n" - "fmla v28.4s, v4.4s, v1.s[2]\n" - "fmla v31.4s, v4.4s, v1.s[3]\n" - "ldr q1, [%x[Apanel], #0x10]\n" - "ldr q4, [x20, #0x20]\n" - "bge 3b\n" - "4:" // main loop skip - "add %x[Apanel], %x[Apanel], #0x20\n" - "fmla v8.4s, v2.4s, v0.s[0]\n" - "fmla v11.4s, v2.4s, v0.s[1]\n" - "add x20, x20, #0x30\n" - "fmla v14.4s, v2.4s, v0.s[2]\n" - "fmla v17.4s, v2.4s, v0.s[3]\n" - "fmla v20.4s, v2.4s, v1.s[0]\n" - "fmla v23.4s, v2.4s, v1.s[1]\n" - "fmla v26.4s, v2.4s, v1.s[2]\n" - "fmla v29.4s, v2.4s, v1.s[3]\n" - "fmla v9.4s, v3.4s, v0.s[0]\n" - "fmla v12.4s, v3.4s, v0.s[1]\n" - "fmla v15.4s, v3.4s, v0.s[2]\n" - "fmla v18.4s, v3.4s, v0.s[3]\n" - "fmla v21.4s, v3.4s, v1.s[0]\n" - "fmla v24.4s, v3.4s, v1.s[1]\n" - "fmla v27.4s, v3.4s, v1.s[2]\n" - "fmla v30.4s, v3.4s, v1.s[3]\n" - "fmla v10.4s, v4.4s, v0.s[0]\n" - "fmla v13.4s, v4.4s, v0.s[1]\n" - "fmla v16.4s, v4.4s, v0.s[2]\n" - "fmla v19.4s, v4.4s, v0.s[3]\n" - "fmla v22.4s, v4.4s, v1.s[0]\n" - "fmla v25.4s, v4.4s, v1.s[1]\n" - "fmla v28.4s, v4.4s, v1.s[2]\n" - "fmla v31.4s, v4.4s, v1.s[3]\n" - "cbz x19, 6f\n" - "5:" // odd loop - "ldr q0, [%x[Apanel], #0x0]\n" - "ldr q1, [%x[Apanel], #0x10]\n" - "subs x19, x19, #0x1\n" - "ldr q5, [x20, #0x0]\n" - "ldr q6, [x20, #0x10]\n" - "fmla v8.4s, v5.4s, v0.s[0]\n" - "ldr q7, [x20, #0x20]\n" - "fmla v11.4s, v5.4s, v0.s[1]\n" - "fmla v14.4s, v5.4s, v0.s[2]\n" - "fmla v17.4s, v5.4s, v0.s[3]\n" - "fmla v20.4s, v5.4s, v1.s[0]\n" - "add %x[Apanel], %x[Apanel], #0x20\n" - "fmla v23.4s, v5.4s, v1.s[1]\n" - "fmla v26.4s, v5.4s, v1.s[2]\n" - "add x20, x20, #0x30\n" - "fmla v29.4s, v5.4s, v1.s[3]\n" - "fmla v9.4s, v6.4s, v0.s[0]\n" - "fmla v12.4s, v6.4s, v0.s[1]\n" - "fmla v15.4s, v6.4s, v0.s[2]\n" - "fmla v18.4s, v6.4s, v0.s[3]\n" - "fmla v21.4s, v6.4s, v1.s[0]\n" - "fmla v24.4s, v6.4s, v1.s[1]\n" - "fmla v27.4s, v6.4s, v1.s[2]\n" - "fmla v30.4s, v6.4s, v1.s[3]\n" - "fmla v10.4s, v7.4s, v0.s[0]\n" - "fmla v13.4s, v7.4s, v0.s[1]\n" - "fmla v16.4s, v7.4s, v0.s[2]\n" - "fmla v19.4s, v7.4s, v0.s[3]\n" - "fmla v22.4s, v7.4s, v1.s[0]\n" - "fmla v25.4s, v7.4s, v1.s[1]\n" - "fmla v28.4s, v7.4s, v1.s[2]\n" - "fmla v31.4s, v7.4s, v1.s[3]\n" - "bne 5b\n" - "6:" // multiply loop done - "subs x22, x22, #0x1\n" - "str q8, [%x[Cpanel], #0x0]\n" - "str q9, [%x[Cpanel], #0x10]\n" - "str q10, [%x[Cpanel], #0x20]\n" - "str q11, [%x[Cpanel], #0x30]\n" - "str q12, [%x[Cpanel], #0x40]\n" - "str q13, [%x[Cpanel], #0x50]\n" - "str q14, [%x[Cpanel], #0x60]\n" - "str q15, [%x[Cpanel], #0x70]\n" - "str q16, [%x[Cpanel], #0x80]\n" - "str q17, [%x[Cpanel], #0x90]\n" - "str q18, [%x[Cpanel], #0xa0]\n" - "str q19, [%x[Cpanel], #0xb0]\n" - "str q20, [%x[Cpanel], #0xc0]\n" - "str q21, [%x[Cpanel], #0xd0]\n" - "str q22, [%x[Cpanel], #0xe0]\n" - "str q23, [%x[Cpanel], #0xf0]\n" - "str q24, [%x[Cpanel], #0x100]\n" - "str q25, [%x[Cpanel], #0x110]\n" - "str q26, [%x[Cpanel], #0x120]\n" - "str q27, [%x[Cpanel], #0x130]\n" - "str q28, [%x[Cpanel], #0x140]\n" - "str q29, [%x[Cpanel], #0x150]\n" - "str q30, [%x[Cpanel], #0x160]\n" - "str q31, [%x[Cpanel], #0x170]\n" - "add %x[Cpanel], %x[Cpanel], #0x180\n" - "bgt 2b\n" - "subs %x[ablocks], %x[ablocks], #0x1\n" - "bne 1b\n" - : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks) - : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks)) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22" - ); -} - -} // namespace arm_gemm -#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_dot_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_dot_8x12.hpp deleted file mode 100644 index bc6b9931e1..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_dot_8x12.hpp +++ /dev/null @@ -1,110 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - */ -#pragma once - -#ifdef __aarch64__ -#include "../std_transforms_fixed.hpp" -#include "../performance_parameters.hpp" - -#define ARGLIST \ - const int8_t *, const int8_t *, \ - int32_t *, int, int, int - -namespace arm_gemm -{ -// Actual kernel implementations -void a64_interleaved_s8s32_dot_8x12( ARGLIST ); -void a64_interleaved_s8s32_dot_8x12_a55( ARGLIST ); -void a64_interleaved_s8s32_dot_8x12_x1( ARGLIST ); - -class cls_a64_interleaved_s8s32_dot_8x12 -{ -public: - typedef int8_t operand_type; - typedef int32_t result_type; - - typedef void (*kern_type)( ARGLIST ); - - /* Kernel blocking parameters */ - static constexpr unsigned int out_height() - { - return 8; - } - - static unsigned int out_width() - { - return 12; - } - - static unsigned int stripe_width() - { - return 4; - } - - static constexpr unsigned int k_unroll() - { - return 4; - } - - - StdTransformsFixed transforms = {}; - StdTransformsFixed transforms_quantized = {}; - template - static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci) - { - - if (std::is_same::value) { - switch (ci->get_cpu_model()) { - case CPUModel::A55r1: - return { 15.361, 0.9341, 0.1636 }; - default: - return { 29.0698, 3.9793, 0.4003 }; - } - } - - return { 1.0 }; - } - - // Default to the generic kernel - kern_type kernel=a64_interleaved_s8s32_dot_8x12; - cls_a64_interleaved_s8s32_dot_8x12(const CPUInfo *ci) - { - switch(ci->get_cpu_model()) { - default: - break; - case CPUModel::A55r1: - kernel=a64_interleaved_s8s32_dot_8x12_a55; - break; - case CPUModel::X1: - kernel=a64_interleaved_s8s32_dot_8x12_x1; - break; - } - } -}; - -} // namespace arm_gemm - -#undef ARGLIST - -#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_dot_8x12/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_dot_8x12/a55.cpp deleted file mode 100644 index 3acd61c88c..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_dot_8x12/a55.cpp +++ /dev/null @@ -1,273 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - */ -#ifdef __aarch64__ - -#include -#include - -namespace arm_gemm { - -void a64_interleaved_s8s32_dot_8x12_a55( - const int8_t *Apanel, const int8_t *Bpanel, - int32_t *Cpanel, int ablocks, int bblocks, int K) { - - struct KernelArgs { - size_t bblocks = {}; - size_t K = {}; - const int8_t *Bpanel = {}; - } ka; - - ka.bblocks = bblocks; - ka.K = (K/4) - 1; - ka.Bpanel = Bpanel; - - __asm__ __volatile__( - - "1:" // Height loop - "ldr x27, [%x[args_ptr], %[offsetof_bblocks]]\n" - "mov x26, %x[Apanel]\n" - "ldr x25, [%x[args_ptr], %[offsetof_Bpanel]]\n" - "2:" // Width loop - "ldr x24, [%x[args_ptr], %[offsetof_K]]\n" - "mov %x[Apanel], x26\n" - "cmp x24, #0x2\n" - "movi v8.4s, #0x0\n" - "movi v9.4s, #0x0\n" - "prfm pldl1keep, [%x[Apanel], #0x0]\n" - "movi v10.4s, #0x0\n" - "prfm pldl1keep, [x25, #0x0]\n" - "movi v11.4s, #0x0\n" - "prfm pldl1keep, [x25, #0x40]\n" - "movi v12.4s, #0x0\n" - "prfm pldl1keep, [%x[Apanel], #0x40]\n" - "movi v13.4s, #0x0\n" - "prfm pldl1keep, [x25, #0x80]\n" - "movi v14.4s, #0x0\n" - "ldr q0, [%x[Apanel], #0x0]\n" - "movi v15.4s, #0x0\n" - "ldr q1, [%x[Apanel], #0x10]\n" - "movi v16.4s, #0x0\n" - "ldr q4, [x25, #0x0]\n" - "movi v17.4s, #0x0\n" - "ldr q5, [x25, #0x10]\n" - "movi v18.4s, #0x0\n" - "ldr q6, [x25, #0x20]\n" - "movi v19.4s, #0x0\n" - "movi v20.4s, #0x0\n" - "movi v21.4s, #0x0\n" - "movi v22.4s, #0x0\n" - "movi v23.4s, #0x0\n" - "movi v24.4s, #0x0\n" - "movi v25.4s, #0x0\n" - "movi v26.4s, #0x0\n" - "movi v27.4s, #0x0\n" - "movi v28.4s, #0x0\n" - "movi v29.4s, #0x0\n" - "movi v30.4s, #0x0\n" - "movi v31.4s, #0x0\n" - "blt 4f\n" - "3:" // main loop head - ".inst 0x4f80e088 // sdot v8.4s, v4.16b, v0.4b[0]\n" - "ldr d2, [%x[Apanel], #0x20]\n" - "ldr x23, [%x[Apanel], #0x28]\n" - ".inst 0x4fa0e08b // sdot v11.4s, v4.16b, v0.4b[1]\n" - "ldr d3, [%x[Apanel], #0x30]\n" - ".inst 0x4f80e88e // sdot v14.4s, v4.16b, v0.4b[2]\n" - "ldr x19, [%x[Apanel], #0x38]\n" - ".inst 0x4fa0e891 // sdot v17.4s, v4.16b, v0.4b[3]\n" - ".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n" - "ldr x22, [x25, #0x38]\n" - ".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n" - "ldr x20, [x25, #0x48]\n" - ".inst 0x4f81e89a // sdot v26.4s, v4.16b, v1.4b[2]\n" - "ldr x21, [x25, #0x58]\n" - ".inst 0x4fa1e89d // sdot v29.4s, v4.16b, v1.4b[3]\n" - "ldr d4, [x25, #0x30]\n" - ".inst 0x4f80e0a9 // sdot v9.4s, v5.16b, v0.4b[0]\n" - "mov v2.d[1], x23\n" - ".inst 0x4fa0e0ac // sdot v12.4s, v5.16b, v0.4b[1]\n" - "mov v3.d[1], x19\n" - ".inst 0x4f80e8af // sdot v15.4s, v5.16b, v0.4b[2]\n" - "mov v4.d[1], x22\n" - ".inst 0x4fa0e8b2 // sdot v18.4s, v5.16b, v0.4b[3]\n" - "prfm pldl1keep, [%x[Apanel], #0x80]\n" - ".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n" - "add %x[Apanel], %x[Apanel], #0x40\n" - ".inst 0x4fa1e0b8 // sdot v24.4s, v5.16b, v1.4b[1]\n" - "prfm pldl1keep, [x25, #0x100]\n" - ".inst 0x4f81e8bb // sdot v27.4s, v5.16b, v1.4b[2]\n" - "prfm pldl1keep, [x25, #0x140]\n" - ".inst 0x4fa1e8be // sdot v30.4s, v5.16b, v1.4b[3]\n" - "ldr d5, [x25, #0x40]\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - "mov v5.d[1], x20\n" - ".inst 0x4fa0e0cd // sdot v13.4s, v6.16b, v0.4b[1]\n" - "ldr x20, [%x[Apanel], #0x8]\n" - ".inst 0x4f80e8d0 // sdot v16.4s, v6.16b, v0.4b[2]\n" - "ldr x19, [%x[Apanel], #0x18]\n" - ".inst 0x4fa0e8d3 // sdot v19.4s, v6.16b, v0.4b[3]\n" - "ldr d0, [%x[Apanel], #0x0]\n" - ".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n" - "sub x24, x24, #0x2\n" - ".inst 0x4fa1e0d9 // sdot v25.4s, v6.16b, v1.4b[1]\n" - "cmp x24, #0x2\n" - ".inst 0x4f81e8dc // sdot v28.4s, v6.16b, v1.4b[2]\n" - "mov v0.d[1], x20\n" - ".inst 0x4fa1e8df // sdot v31.4s, v6.16b, v1.4b[3]\n" - "ldr d6, [x25, #0x50]\n" - "mov v6.d[1], x21\n" - "add x25, x25, #0x60\n" - ".inst 0x4f82e088 // sdot v8.4s, v4.16b, v2.4b[0]\n" - "ldr d1, [%x[Apanel], #0x10]\n" - ".inst 0x4fa2e08b // sdot v11.4s, v4.16b, v2.4b[1]\n" - "ldr x22, [x25, #0x8]\n" - ".inst 0x4f82e88e // sdot v14.4s, v4.16b, v2.4b[2]\n" - "ldr x20, [x25, #0x18]\n" - ".inst 0x4fa2e891 // sdot v17.4s, v4.16b, v2.4b[3]\n" - "ldr x21, [x25, #0x28]\n" - ".inst 0x4f83e094 // sdot v20.4s, v4.16b, v3.4b[0]\n" - "mov v1.d[1], x19\n" - ".inst 0x4fa3e097 // sdot v23.4s, v4.16b, v3.4b[1]\n" - ".inst 0x4f83e89a // sdot v26.4s, v4.16b, v3.4b[2]\n" - ".inst 0x4fa3e89d // sdot v29.4s, v4.16b, v3.4b[3]\n" - "ldr d4, [x25, #0x0]\n" - ".inst 0x4f82e0a9 // sdot v9.4s, v5.16b, v2.4b[0]\n" - "mov v4.d[1], x22\n" - ".inst 0x4fa2e0ac // sdot v12.4s, v5.16b, v2.4b[1]\n" - ".inst 0x4f82e8af // sdot v15.4s, v5.16b, v2.4b[2]\n" - ".inst 0x4fa2e8b2 // sdot v18.4s, v5.16b, v2.4b[3]\n" - ".inst 0x4f83e0b5 // sdot v21.4s, v5.16b, v3.4b[0]\n" - ".inst 0x4fa3e0b8 // sdot v24.4s, v5.16b, v3.4b[1]\n" - ".inst 0x4f83e8bb // sdot v27.4s, v5.16b, v3.4b[2]\n" - ".inst 0x4fa3e8be // sdot v30.4s, v5.16b, v3.4b[3]\n" - "ldr d5, [x25, #0x10]\n" - ".inst 0x4f82e0ca // sdot v10.4s, v6.16b, v2.4b[0]\n" - "mov v5.d[1], x20\n" - ".inst 0x4fa2e0cd // sdot v13.4s, v6.16b, v2.4b[1]\n" - ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" - ".inst 0x4fa2e8d3 // sdot v19.4s, v6.16b, v2.4b[3]\n" - ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" - ".inst 0x4fa3e0d9 // sdot v25.4s, v6.16b, v3.4b[1]\n" - ".inst 0x4f83e8dc // sdot v28.4s, v6.16b, v3.4b[2]\n" - ".inst 0x4fa3e8df // sdot v31.4s, v6.16b, v3.4b[3]\n" - "ldr d6, [x25, #0x20]\n" - "mov v6.d[1], x21\n" - "bge 3b\n" - "4:" // main loop skip - "add %x[Apanel], %x[Apanel], #0x20\n" - ".inst 0x4f80e088 // sdot v8.4s, v4.16b, v0.4b[0]\n" - "add x25, x25, #0x30\n" - ".inst 0x4fa0e08b // sdot v11.4s, v4.16b, v0.4b[1]\n" - ".inst 0x4f80e88e // sdot v14.4s, v4.16b, v0.4b[2]\n" - ".inst 0x4fa0e891 // sdot v17.4s, v4.16b, v0.4b[3]\n" - ".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n" - ".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n" - ".inst 0x4f81e89a // sdot v26.4s, v4.16b, v1.4b[2]\n" - ".inst 0x4fa1e89d // sdot v29.4s, v4.16b, v1.4b[3]\n" - ".inst 0x4f80e0a9 // sdot v9.4s, v5.16b, v0.4b[0]\n" - ".inst 0x4fa0e0ac // sdot v12.4s, v5.16b, v0.4b[1]\n" - ".inst 0x4f80e8af // sdot v15.4s, v5.16b, v0.4b[2]\n" - ".inst 0x4fa0e8b2 // sdot v18.4s, v5.16b, v0.4b[3]\n" - ".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n" - ".inst 0x4fa1e0b8 // sdot v24.4s, v5.16b, v1.4b[1]\n" - ".inst 0x4f81e8bb // sdot v27.4s, v5.16b, v1.4b[2]\n" - ".inst 0x4fa1e8be // sdot v30.4s, v5.16b, v1.4b[3]\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4fa0e0cd // sdot v13.4s, v6.16b, v0.4b[1]\n" - ".inst 0x4f80e8d0 // sdot v16.4s, v6.16b, v0.4b[2]\n" - ".inst 0x4fa0e8d3 // sdot v19.4s, v6.16b, v0.4b[3]\n" - ".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n" - ".inst 0x4fa1e0d9 // sdot v25.4s, v6.16b, v1.4b[1]\n" - ".inst 0x4f81e8dc // sdot v28.4s, v6.16b, v1.4b[2]\n" - ".inst 0x4fa1e8df // sdot v31.4s, v6.16b, v1.4b[3]\n" - "cbz x24, 5f\n" - "ldr q0, [%x[Apanel], #0x0]\n" - "ldr q1, [%x[Apanel], #0x10]\n" - "add %x[Apanel], %x[Apanel], #0x20\n" - "ldr q7, [x25, #0x0]\n" - ".inst 0x4f80e0e8 // sdot v8.4s, v7.16b, v0.4b[0]\n" - "ldr q4, [x25, #0x10]\n" - ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" - "ldr q5, [x25, #0x20]\n" - ".inst 0x4f80e8ee // sdot v14.4s, v7.16b, v0.4b[2]\n" - ".inst 0x4fa0e8f1 // sdot v17.4s, v7.16b, v0.4b[3]\n" - "add x25, x25, #0x30\n" - ".inst 0x4f81e0f4 // sdot v20.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4fa1e0f7 // sdot v23.4s, v7.16b, v1.4b[1]\n" - ".inst 0x4f81e8fa // sdot v26.4s, v7.16b, v1.4b[2]\n" - ".inst 0x4fa1e8fd // sdot v29.4s, v7.16b, v1.4b[3]\n" - ".inst 0x4f80e089 // sdot v9.4s, v4.16b, v0.4b[0]\n" - ".inst 0x4fa0e08c // sdot v12.4s, v4.16b, v0.4b[1]\n" - ".inst 0x4f80e88f // sdot v15.4s, v4.16b, v0.4b[2]\n" - ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n" - ".inst 0x4f81e095 // sdot v21.4s, v4.16b, v1.4b[0]\n" - ".inst 0x4fa1e098 // sdot v24.4s, v4.16b, v1.4b[1]\n" - ".inst 0x4f81e89b // sdot v27.4s, v4.16b, v1.4b[2]\n" - ".inst 0x4fa1e89e // sdot v30.4s, v4.16b, v1.4b[3]\n" - ".inst 0x4f80e0aa // sdot v10.4s, v5.16b, v0.4b[0]\n" - ".inst 0x4fa0e0ad // sdot v13.4s, v5.16b, v0.4b[1]\n" - ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n" - ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n" - ".inst 0x4f81e0b6 // sdot v22.4s, v5.16b, v1.4b[0]\n" - ".inst 0x4fa1e0b9 // sdot v25.4s, v5.16b, v1.4b[1]\n" - ".inst 0x4f81e8bc // sdot v28.4s, v5.16b, v1.4b[2]\n" - ".inst 0x4fa1e8bf // sdot v31.4s, v5.16b, v1.4b[3]\n" - "5:" // multiply loop done - "subs x27, x27, #0x1\n" - "str q8, [%x[Cpanel], #0x0]\n" - "str q9, [%x[Cpanel], #0x10]\n" - "str q10, [%x[Cpanel], #0x20]\n" - "str q11, [%x[Cpanel], #0x30]\n" - "str q12, [%x[Cpanel], #0x40]\n" - "str q13, [%x[Cpanel], #0x50]\n" - "str q14, [%x[Cpanel], #0x60]\n" - "str q15, [%x[Cpanel], #0x70]\n" - "str q16, [%x[Cpanel], #0x80]\n" - "str q17, [%x[Cpanel], #0x90]\n" - "str q18, [%x[Cpanel], #0xa0]\n" - "str q19, [%x[Cpanel], #0xb0]\n" - "str q20, [%x[Cpanel], #0xc0]\n" - "str q21, [%x[Cpanel], #0xd0]\n" - "str q22, [%x[Cpanel], #0xe0]\n" - "str q23, [%x[Cpanel], #0xf0]\n" - "str q24, [%x[Cpanel], #0x100]\n" - "str q25, [%x[Cpanel], #0x110]\n" - "str q26, [%x[Cpanel], #0x120]\n" - "str q27, [%x[Cpanel], #0x130]\n" - "str q28, [%x[Cpanel], #0x140]\n" - "str q29, [%x[Cpanel], #0x150]\n" - "str q30, [%x[Cpanel], #0x160]\n" - "str q31, [%x[Cpanel], #0x170]\n" - "add %x[Cpanel], %x[Cpanel], #0x180\n" - "bgt 2b\n" - "subs %x[ablocks], %x[ablocks], #0x1\n" - "bne 1b\n" - : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks) - : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks)) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" - ); -} - -} // namespace arm_gemm -#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_dot_8x12/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_dot_8x12/generic.cpp deleted file mode 100644 index 267f62ae8a..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_dot_8x12/generic.cpp +++ /dev/null @@ -1,253 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - */ -#ifdef __aarch64__ - -#include -#include - -namespace arm_gemm { - -void a64_interleaved_s8s32_dot_8x12( - const int8_t *Apanel, const int8_t *Bpanel, - int32_t *Cpanel, int ablocks, int bblocks, int K) { - - struct KernelArgs { - size_t bblocks = {}; - size_t K = {}; - const int8_t *Bpanel = {}; - } ka; - - ka.bblocks = bblocks; - ka.K = (K/4) - 1; - ka.Bpanel = Bpanel; - - __asm__ __volatile__( - - "1:" // Height loop - "ldr x22, [%x[args_ptr], %[offsetof_bblocks]]\n" - "mov x21, %x[Apanel]\n" - "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n" - "2:" // Width loop - "ldr x19, [%x[args_ptr], %[offsetof_K]]\n" - "mov %x[Apanel], x21\n" - "cmp x19, #0x2\n" - "movi v8.4s, #0x0\n" - "movi v9.4s, #0x0\n" - "prfm pldl1keep, [%x[Apanel], #0x0]\n" - "movi v10.4s, #0x0\n" - "movi v11.4s, #0x0\n" - "prfm pldl1keep, [x20, #0x0]\n" - "movi v12.4s, #0x0\n" - "movi v13.4s, #0x0\n" - "prfm pldl1keep, [x20, #0x40]\n" - "movi v14.4s, #0x0\n" - "movi v15.4s, #0x0\n" - "prfm pldl1keep, [%x[Apanel], #0x40]\n" - "movi v16.4s, #0x0\n" - "movi v17.4s, #0x0\n" - "prfm pldl1keep, [x20, #0x80]\n" - "movi v18.4s, #0x0\n" - "movi v19.4s, #0x0\n" - "ldr q0, [%x[Apanel], #0x0]\n" - "movi v20.4s, #0x0\n" - "movi v21.4s, #0x0\n" - "ldr q1, [%x[Apanel], #0x10]\n" - "movi v22.4s, #0x0\n" - "movi v23.4s, #0x0\n" - "ldr q4, [x20, #0x0]\n" - "movi v24.4s, #0x0\n" - "movi v25.4s, #0x0\n" - "ldr q5, [x20, #0x10]\n" - "movi v26.4s, #0x0\n" - "movi v27.4s, #0x0\n" - "ldr q6, [x20, #0x20]\n" - "movi v28.4s, #0x0\n" - "movi v29.4s, #0x0\n" - "movi v30.4s, #0x0\n" - "movi v31.4s, #0x0\n" - "blt 4f\n" - "3:" // main loop head - ".inst 0x4f80e088 // sdot v8.4s, v4.16b, v0.4b[0]\n" - ".inst 0x4fa0e08b // sdot v11.4s, v4.16b, v0.4b[1]\n" - "ldr q2, [%x[Apanel], #0x20]\n" - ".inst 0x4f80e88e // sdot v14.4s, v4.16b, v0.4b[2]\n" - ".inst 0x4fa0e891 // sdot v17.4s, v4.16b, v0.4b[3]\n" - "ldr q3, [%x[Apanel], #0x30]\n" - ".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n" - ".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n" - "sub x19, x19, #0x2\n" - ".inst 0x4f81e89a // sdot v26.4s, v4.16b, v1.4b[2]\n" - ".inst 0x4fa1e89d // sdot v29.4s, v4.16b, v1.4b[3]\n" - "ldr q4, [x20, #0x30]\n" - ".inst 0x4f80e0a9 // sdot v9.4s, v5.16b, v0.4b[0]\n" - ".inst 0x4fa0e0ac // sdot v12.4s, v5.16b, v0.4b[1]\n" - "cmp x19, #0x2\n" - ".inst 0x4f80e8af // sdot v15.4s, v5.16b, v0.4b[2]\n" - ".inst 0x4fa0e8b2 // sdot v18.4s, v5.16b, v0.4b[3]\n" - "prfm pldl1keep, [%x[Apanel], #0x80]\n" - ".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n" - ".inst 0x4fa1e0b8 // sdot v24.4s, v5.16b, v1.4b[1]\n" - "add %x[Apanel], %x[Apanel], #0x40\n" - ".inst 0x4f81e8bb // sdot v27.4s, v5.16b, v1.4b[2]\n" - ".inst 0x4fa1e8be // sdot v30.4s, v5.16b, v1.4b[3]\n" - "ldr q5, [x20, #0x40]\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4fa0e0cd // sdot v13.4s, v6.16b, v0.4b[1]\n" - "prfm pldl1keep, [x20, #0x100]\n" - ".inst 0x4f80e8d0 // sdot v16.4s, v6.16b, v0.4b[2]\n" - ".inst 0x4fa0e8d3 // sdot v19.4s, v6.16b, v0.4b[3]\n" - "prfm pldl1keep, [x20, #0x140]\n" - ".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n" - ".inst 0x4fa1e0d9 // sdot v25.4s, v6.16b, v1.4b[1]\n" - "ldr q0, [%x[Apanel], #0x0]\n" - ".inst 0x4f81e8dc // sdot v28.4s, v6.16b, v1.4b[2]\n" - ".inst 0x4fa1e8df // sdot v31.4s, v6.16b, v1.4b[3]\n" - "ldr q6, [x20, #0x50]\n" - "add x20, x20, #0x60\n" - ".inst 0x4f82e088 // sdot v8.4s, v4.16b, v2.4b[0]\n" - ".inst 0x4fa2e08b // sdot v11.4s, v4.16b, v2.4b[1]\n" - "ldr q1, [%x[Apanel], #0x10]\n" - ".inst 0x4f82e88e // sdot v14.4s, v4.16b, v2.4b[2]\n" - ".inst 0x4fa2e891 // sdot v17.4s, v4.16b, v2.4b[3]\n" - ".inst 0x4f83e094 // sdot v20.4s, v4.16b, v3.4b[0]\n" - ".inst 0x4fa3e097 // sdot v23.4s, v4.16b, v3.4b[1]\n" - ".inst 0x4f83e89a // sdot v26.4s, v4.16b, v3.4b[2]\n" - ".inst 0x4fa3e89d // sdot v29.4s, v4.16b, v3.4b[3]\n" - "ldr q4, [x20, #0x0]\n" - ".inst 0x4f82e0a9 // sdot v9.4s, v5.16b, v2.4b[0]\n" - ".inst 0x4fa2e0ac // sdot v12.4s, v5.16b, v2.4b[1]\n" - ".inst 0x4f82e8af // sdot v15.4s, v5.16b, v2.4b[2]\n" - ".inst 0x4fa2e8b2 // sdot v18.4s, v5.16b, v2.4b[3]\n" - ".inst 0x4f83e0b5 // sdot v21.4s, v5.16b, v3.4b[0]\n" - ".inst 0x4fa3e0b8 // sdot v24.4s, v5.16b, v3.4b[1]\n" - ".inst 0x4f83e8bb // sdot v27.4s, v5.16b, v3.4b[2]\n" - ".inst 0x4fa3e8be // sdot v30.4s, v5.16b, v3.4b[3]\n" - "ldr q5, [x20, #0x10]\n" - ".inst 0x4f82e0ca // sdot v10.4s, v6.16b, v2.4b[0]\n" - ".inst 0x4fa2e0cd // sdot v13.4s, v6.16b, v2.4b[1]\n" - ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" - ".inst 0x4fa2e8d3 // sdot v19.4s, v6.16b, v2.4b[3]\n" - ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" - ".inst 0x4fa3e0d9 // sdot v25.4s, v6.16b, v3.4b[1]\n" - ".inst 0x4f83e8dc // sdot v28.4s, v6.16b, v3.4b[2]\n" - ".inst 0x4fa3e8df // sdot v31.4s, v6.16b, v3.4b[3]\n" - "ldr q6, [x20, #0x20]\n" - "bge 3b\n" - "4:" // main loop skip - "add %x[Apanel], %x[Apanel], #0x20\n" - ".inst 0x4f80e088 // sdot v8.4s, v4.16b, v0.4b[0]\n" - ".inst 0x4fa0e08b // sdot v11.4s, v4.16b, v0.4b[1]\n" - "add x20, x20, #0x30\n" - ".inst 0x4f80e88e // sdot v14.4s, v4.16b, v0.4b[2]\n" - ".inst 0x4fa0e891 // sdot v17.4s, v4.16b, v0.4b[3]\n" - ".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n" - ".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n" - ".inst 0x4f81e89a // sdot v26.4s, v4.16b, v1.4b[2]\n" - ".inst 0x4fa1e89d // sdot v29.4s, v4.16b, v1.4b[3]\n" - ".inst 0x4f80e0a9 // sdot v9.4s, v5.16b, v0.4b[0]\n" - ".inst 0x4fa0e0ac // sdot v12.4s, v5.16b, v0.4b[1]\n" - ".inst 0x4f80e8af // sdot v15.4s, v5.16b, v0.4b[2]\n" - ".inst 0x4fa0e8b2 // sdot v18.4s, v5.16b, v0.4b[3]\n" - ".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n" - ".inst 0x4fa1e0b8 // sdot v24.4s, v5.16b, v1.4b[1]\n" - ".inst 0x4f81e8bb // sdot v27.4s, v5.16b, v1.4b[2]\n" - ".inst 0x4fa1e8be // sdot v30.4s, v5.16b, v1.4b[3]\n" - ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4fa0e0cd // sdot v13.4s, v6.16b, v0.4b[1]\n" - ".inst 0x4f80e8d0 // sdot v16.4s, v6.16b, v0.4b[2]\n" - ".inst 0x4fa0e8d3 // sdot v19.4s, v6.16b, v0.4b[3]\n" - ".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n" - ".inst 0x4fa1e0d9 // sdot v25.4s, v6.16b, v1.4b[1]\n" - ".inst 0x4f81e8dc // sdot v28.4s, v6.16b, v1.4b[2]\n" - ".inst 0x4fa1e8df // sdot v31.4s, v6.16b, v1.4b[3]\n" - "cbz x19, 5f\n" - "ldr q0, [%x[Apanel], #0x0]\n" - "ldr q1, [%x[Apanel], #0x10]\n" - "add %x[Apanel], %x[Apanel], #0x20\n" - "ldr q7, [x20, #0x0]\n" - "ldr q4, [x20, #0x10]\n" - ".inst 0x4f80e0e8 // sdot v8.4s, v7.16b, v0.4b[0]\n" - "ldr q5, [x20, #0x20]\n" - ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" - ".inst 0x4f80e8ee // sdot v14.4s, v7.16b, v0.4b[2]\n" - "add x20, x20, #0x30\n" - ".inst 0x4fa0e8f1 // sdot v17.4s, v7.16b, v0.4b[3]\n" - ".inst 0x4f81e0f4 // sdot v20.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4fa1e0f7 // sdot v23.4s, v7.16b, v1.4b[1]\n" - ".inst 0x4f81e8fa // sdot v26.4s, v7.16b, v1.4b[2]\n" - ".inst 0x4fa1e8fd // sdot v29.4s, v7.16b, v1.4b[3]\n" - ".inst 0x4f80e089 // sdot v9.4s, v4.16b, v0.4b[0]\n" - ".inst 0x4fa0e08c // sdot v12.4s, v4.16b, v0.4b[1]\n" - ".inst 0x4f80e88f // sdot v15.4s, v4.16b, v0.4b[2]\n" - ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n" - ".inst 0x4f81e095 // sdot v21.4s, v4.16b, v1.4b[0]\n" - ".inst 0x4fa1e098 // sdot v24.4s, v4.16b, v1.4b[1]\n" - ".inst 0x4f81e89b // sdot v27.4s, v4.16b, v1.4b[2]\n" - ".inst 0x4fa1e89e // sdot v30.4s, v4.16b, v1.4b[3]\n" - ".inst 0x4f80e0aa // sdot v10.4s, v5.16b, v0.4b[0]\n" - ".inst 0x4fa0e0ad // sdot v13.4s, v5.16b, v0.4b[1]\n" - ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n" - ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n" - ".inst 0x4f81e0b6 // sdot v22.4s, v5.16b, v1.4b[0]\n" - ".inst 0x4fa1e0b9 // sdot v25.4s, v5.16b, v1.4b[1]\n" - ".inst 0x4f81e8bc // sdot v28.4s, v5.16b, v1.4b[2]\n" - ".inst 0x4fa1e8bf // sdot v31.4s, v5.16b, v1.4b[3]\n" - "5:" // multiply loop done - "subs x22, x22, #0x1\n" - "str q8, [%x[Cpanel], #0x0]\n" - "str q9, [%x[Cpanel], #0x10]\n" - "str q10, [%x[Cpanel], #0x20]\n" - "str q11, [%x[Cpanel], #0x30]\n" - "str q12, [%x[Cpanel], #0x40]\n" - "str q13, [%x[Cpanel], #0x50]\n" - "str q14, [%x[Cpanel], #0x60]\n" - "str q15, [%x[Cpanel], #0x70]\n" - "str q16, [%x[Cpanel], #0x80]\n" - "str q17, [%x[Cpanel], #0x90]\n" - "str q18, [%x[Cpanel], #0xa0]\n" - "str q19, [%x[Cpanel], #0xb0]\n" - "str q20, [%x[Cpanel], #0xc0]\n" - "str q21, [%x[Cpanel], #0xd0]\n" - "str q22, [%x[Cpanel], #0xe0]\n" - "str q23, [%x[Cpanel], #0xf0]\n" - "str q24, [%x[Cpanel], #0x100]\n" - "str q25, [%x[Cpanel], #0x110]\n" - "str q26, [%x[Cpanel], #0x120]\n" - "str q27, [%x[Cpanel], #0x130]\n" - "str q28, [%x[Cpanel], #0x140]\n" - "str q29, [%x[Cpanel], #0x150]\n" - "str q30, [%x[Cpanel], #0x160]\n" - "str q31, [%x[Cpanel], #0x170]\n" - "add %x[Cpanel], %x[Cpanel], #0x180\n" - "bgt 2b\n" - "subs %x[ablocks], %x[ablocks], #0x1\n" - "bne 1b\n" - : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks) - : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks)) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22" - ); -} - -} // namespace arm_gemm -#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_dot_8x12/x1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_dot_8x12/x1.cpp deleted file mode 100644 index 4804c059a3..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_dot_8x12/x1.cpp +++ /dev/null @@ -1,253 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - */ -#ifdef __aarch64__ - -#include -#include - -namespace arm_gemm { - -void a64_interleaved_s8s32_dot_8x12_x1( - const int8_t *Apanel, const int8_t *Bpanel, - int32_t *Cpanel, int ablocks, int bblocks, int K) { - - struct KernelArgs { - size_t bblocks = {}; - size_t K = {}; - const int8_t *Bpanel = {}; - } ka; - - ka.bblocks = bblocks; - ka.K = (K/4) - 1; - ka.Bpanel = Bpanel; - - __asm__ __volatile__( - - "1:" // Height loop - "ldr x22, [%x[args_ptr], %[offsetof_bblocks]]\n" - "mov x21, %x[Apanel]\n" - "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n" - "2:" // Width loop - "ldr x19, [%x[args_ptr], %[offsetof_K]]\n" - "mov %x[Apanel], x21\n" - "cmp x19, #0x2\n" - "movi v8.4s, #0x0\n" - "movi v9.4s, #0x0\n" - "prfm pldl1keep, [%x[Apanel], #0x0]\n" - "movi v10.4s, #0x0\n" - "movi v11.4s, #0x0\n" - "prfm pldl1keep, [x20, #0x0]\n" - "movi v12.4s, #0x0\n" - "movi v13.4s, #0x0\n" - "prfm pldl1keep, [x20, #0x40]\n" - "movi v14.4s, #0x0\n" - "movi v15.4s, #0x0\n" - "prfm pldl1keep, [%x[Apanel], #0x40]\n" - "movi v16.4s, #0x0\n" - "movi v17.4s, #0x0\n" - "prfm pldl1keep, [x20, #0x80]\n" - "movi v18.4s, #0x0\n" - "movi v19.4s, #0x0\n" - "ldr q0, [%x[Apanel], #0x0]\n" - "movi v20.4s, #0x0\n" - "movi v21.4s, #0x0\n" - "ldr q1, [%x[Apanel], #0x10]\n" - "movi v22.4s, #0x0\n" - "movi v23.4s, #0x0\n" - "ldr q2, [x20, #0x0]\n" - "movi v24.4s, #0x0\n" - "movi v25.4s, #0x0\n" - "ldr q3, [x20, #0x10]\n" - "movi v26.4s, #0x0\n" - "movi v27.4s, #0x0\n" - "ldr q4, [x20, #0x20]\n" - "movi v28.4s, #0x0\n" - "movi v29.4s, #0x0\n" - "movi v30.4s, #0x0\n" - "movi v31.4s, #0x0\n" - "blt 4f\n" - "3:" // main loop head - ".inst 0x4f80e048 // sdot v8.4s, v2.16b, v0.4b[0]\n" - ".inst 0x4fa0e04b // sdot v11.4s, v2.16b, v0.4b[1]\n" - "sub x19, x19, #0x2\n" - ".inst 0x4f80e84e // sdot v14.4s, v2.16b, v0.4b[2]\n" - ".inst 0x4fa0e851 // sdot v17.4s, v2.16b, v0.4b[3]\n" - "cmp x19, #0x2\n" - ".inst 0x4f81e054 // sdot v20.4s, v2.16b, v1.4b[0]\n" - ".inst 0x4fa1e057 // sdot v23.4s, v2.16b, v1.4b[1]\n" - "prfm pldl1keep, [%x[Apanel], #0x80]\n" - ".inst 0x4f81e85a // sdot v26.4s, v2.16b, v1.4b[2]\n" - ".inst 0x4fa1e85d // sdot v29.4s, v2.16b, v1.4b[3]\n" - "ldr q2, [x20, #0x30]\n" - ".inst 0x4f80e069 // sdot v9.4s, v3.16b, v0.4b[0]\n" - ".inst 0x4fa0e06c // sdot v12.4s, v3.16b, v0.4b[1]\n" - "prfm pldl1keep, [x20, #0x100]\n" - ".inst 0x4f80e86f // sdot v15.4s, v3.16b, v0.4b[2]\n" - ".inst 0x4fa0e872 // sdot v18.4s, v3.16b, v0.4b[3]\n" - "prfm pldl1keep, [x20, #0x140]\n" - ".inst 0x4f81e075 // sdot v21.4s, v3.16b, v1.4b[0]\n" - ".inst 0x4fa1e078 // sdot v24.4s, v3.16b, v1.4b[1]\n" - ".inst 0x4f81e87b // sdot v27.4s, v3.16b, v1.4b[2]\n" - ".inst 0x4fa1e87e // sdot v30.4s, v3.16b, v1.4b[3]\n" - "ldr q3, [x20, #0x40]\n" - ".inst 0x4f80e08a // sdot v10.4s, v4.16b, v0.4b[0]\n" - ".inst 0x4fa0e08d // sdot v13.4s, v4.16b, v0.4b[1]\n" - ".inst 0x4f80e890 // sdot v16.4s, v4.16b, v0.4b[2]\n" - ".inst 0x4fa0e893 // sdot v19.4s, v4.16b, v0.4b[3]\n" - "ldr q0, [%x[Apanel], #0x20]\n" - ".inst 0x4f81e096 // sdot v22.4s, v4.16b, v1.4b[0]\n" - ".inst 0x4fa1e099 // sdot v25.4s, v4.16b, v1.4b[1]\n" - ".inst 0x4f81e89c // sdot v28.4s, v4.16b, v1.4b[2]\n" - ".inst 0x4fa1e89f // sdot v31.4s, v4.16b, v1.4b[3]\n" - "ldr q1, [%x[Apanel], #0x30]\n" - "ldr q4, [x20, #0x50]\n" - "add %x[Apanel], %x[Apanel], #0x40\n" - "add x20, x20, #0x60\n" - ".inst 0x4f80e048 // sdot v8.4s, v2.16b, v0.4b[0]\n" - ".inst 0x4fa0e04b // sdot v11.4s, v2.16b, v0.4b[1]\n" - ".inst 0x4f80e84e // sdot v14.4s, v2.16b, v0.4b[2]\n" - ".inst 0x4fa0e851 // sdot v17.4s, v2.16b, v0.4b[3]\n" - ".inst 0x4f81e054 // sdot v20.4s, v2.16b, v1.4b[0]\n" - ".inst 0x4fa1e057 // sdot v23.4s, v2.16b, v1.4b[1]\n" - ".inst 0x4f81e85a // sdot v26.4s, v2.16b, v1.4b[2]\n" - ".inst 0x4fa1e85d // sdot v29.4s, v2.16b, v1.4b[3]\n" - "ldr q2, [x20, #0x0]\n" - ".inst 0x4f80e069 // sdot v9.4s, v3.16b, v0.4b[0]\n" - ".inst 0x4fa0e06c // sdot v12.4s, v3.16b, v0.4b[1]\n" - ".inst 0x4f80e86f // sdot v15.4s, v3.16b, v0.4b[2]\n" - ".inst 0x4fa0e872 // sdot v18.4s, v3.16b, v0.4b[3]\n" - ".inst 0x4f81e075 // sdot v21.4s, v3.16b, v1.4b[0]\n" - ".inst 0x4fa1e078 // sdot v24.4s, v3.16b, v1.4b[1]\n" - ".inst 0x4f81e87b // sdot v27.4s, v3.16b, v1.4b[2]\n" - ".inst 0x4fa1e87e // sdot v30.4s, v3.16b, v1.4b[3]\n" - "ldr q3, [x20, #0x10]\n" - ".inst 0x4f80e08a // sdot v10.4s, v4.16b, v0.4b[0]\n" - ".inst 0x4fa0e08d // sdot v13.4s, v4.16b, v0.4b[1]\n" - ".inst 0x4f80e890 // sdot v16.4s, v4.16b, v0.4b[2]\n" - ".inst 0x4fa0e893 // sdot v19.4s, v4.16b, v0.4b[3]\n" - "ldr q0, [%x[Apanel], #0x0]\n" - ".inst 0x4f81e096 // sdot v22.4s, v4.16b, v1.4b[0]\n" - ".inst 0x4fa1e099 // sdot v25.4s, v4.16b, v1.4b[1]\n" - ".inst 0x4f81e89c // sdot v28.4s, v4.16b, v1.4b[2]\n" - ".inst 0x4fa1e89f // sdot v31.4s, v4.16b, v1.4b[3]\n" - "ldr q1, [%x[Apanel], #0x10]\n" - "ldr q4, [x20, #0x20]\n" - "bge 3b\n" - "4:" // main loop skip - "add %x[Apanel], %x[Apanel], #0x20\n" - ".inst 0x4f80e048 // sdot v8.4s, v2.16b, v0.4b[0]\n" - ".inst 0x4fa0e04b // sdot v11.4s, v2.16b, v0.4b[1]\n" - "add x20, x20, #0x30\n" - ".inst 0x4f80e84e // sdot v14.4s, v2.16b, v0.4b[2]\n" - ".inst 0x4fa0e851 // sdot v17.4s, v2.16b, v0.4b[3]\n" - ".inst 0x4f81e054 // sdot v20.4s, v2.16b, v1.4b[0]\n" - ".inst 0x4fa1e057 // sdot v23.4s, v2.16b, v1.4b[1]\n" - ".inst 0x4f81e85a // sdot v26.4s, v2.16b, v1.4b[2]\n" - ".inst 0x4fa1e85d // sdot v29.4s, v2.16b, v1.4b[3]\n" - ".inst 0x4f80e069 // sdot v9.4s, v3.16b, v0.4b[0]\n" - ".inst 0x4fa0e06c // sdot v12.4s, v3.16b, v0.4b[1]\n" - ".inst 0x4f80e86f // sdot v15.4s, v3.16b, v0.4b[2]\n" - ".inst 0x4fa0e872 // sdot v18.4s, v3.16b, v0.4b[3]\n" - ".inst 0x4f81e075 // sdot v21.4s, v3.16b, v1.4b[0]\n" - ".inst 0x4fa1e078 // sdot v24.4s, v3.16b, v1.4b[1]\n" - ".inst 0x4f81e87b // sdot v27.4s, v3.16b, v1.4b[2]\n" - ".inst 0x4fa1e87e // sdot v30.4s, v3.16b, v1.4b[3]\n" - ".inst 0x4f80e08a // sdot v10.4s, v4.16b, v0.4b[0]\n" - ".inst 0x4fa0e08d // sdot v13.4s, v4.16b, v0.4b[1]\n" - ".inst 0x4f80e890 // sdot v16.4s, v4.16b, v0.4b[2]\n" - ".inst 0x4fa0e893 // sdot v19.4s, v4.16b, v0.4b[3]\n" - ".inst 0x4f81e096 // sdot v22.4s, v4.16b, v1.4b[0]\n" - ".inst 0x4fa1e099 // sdot v25.4s, v4.16b, v1.4b[1]\n" - ".inst 0x4f81e89c // sdot v28.4s, v4.16b, v1.4b[2]\n" - ".inst 0x4fa1e89f // sdot v31.4s, v4.16b, v1.4b[3]\n" - "cbz x19, 5f\n" - "ldr q0, [%x[Apanel], #0x0]\n" - "ldr q1, [%x[Apanel], #0x10]\n" - "add %x[Apanel], %x[Apanel], #0x20\n" - "ldr q5, [x20, #0x0]\n" - "ldr q6, [x20, #0x10]\n" - ".inst 0x4f80e0a8 // sdot v8.4s, v5.16b, v0.4b[0]\n" - "ldr q7, [x20, #0x20]\n" - ".inst 0x4fa0e0ab // sdot v11.4s, v5.16b, v0.4b[1]\n" - ".inst 0x4f80e8ae // sdot v14.4s, v5.16b, v0.4b[2]\n" - "add x20, x20, #0x30\n" - ".inst 0x4fa0e8b1 // sdot v17.4s, v5.16b, v0.4b[3]\n" - ".inst 0x4f81e0b4 // sdot v20.4s, v5.16b, v1.4b[0]\n" - ".inst 0x4fa1e0b7 // sdot v23.4s, v5.16b, v1.4b[1]\n" - ".inst 0x4f81e8ba // sdot v26.4s, v5.16b, v1.4b[2]\n" - ".inst 0x4fa1e8bd // sdot v29.4s, v5.16b, v1.4b[3]\n" - ".inst 0x4f80e0c9 // sdot v9.4s, v6.16b, v0.4b[0]\n" - ".inst 0x4fa0e0cc // sdot v12.4s, v6.16b, v0.4b[1]\n" - ".inst 0x4f80e8cf // sdot v15.4s, v6.16b, v0.4b[2]\n" - ".inst 0x4fa0e8d2 // sdot v18.4s, v6.16b, v0.4b[3]\n" - ".inst 0x4f81e0d5 // sdot v21.4s, v6.16b, v1.4b[0]\n" - ".inst 0x4fa1e0d8 // sdot v24.4s, v6.16b, v1.4b[1]\n" - ".inst 0x4f81e8db // sdot v27.4s, v6.16b, v1.4b[2]\n" - ".inst 0x4fa1e8de // sdot v30.4s, v6.16b, v1.4b[3]\n" - ".inst 0x4f80e0ea // sdot v10.4s, v7.16b, v0.4b[0]\n" - ".inst 0x4fa0e0ed // sdot v13.4s, v7.16b, v0.4b[1]\n" - ".inst 0x4f80e8f0 // sdot v16.4s, v7.16b, v0.4b[2]\n" - ".inst 0x4fa0e8f3 // sdot v19.4s, v7.16b, v0.4b[3]\n" - ".inst 0x4f81e0f6 // sdot v22.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4fa1e0f9 // sdot v25.4s, v7.16b, v1.4b[1]\n" - ".inst 0x4f81e8fc // sdot v28.4s, v7.16b, v1.4b[2]\n" - ".inst 0x4fa1e8ff // sdot v31.4s, v7.16b, v1.4b[3]\n" - "5:" // multiply loop done - "subs x22, x22, #0x1\n" - "str q8, [%x[Cpanel], #0x0]\n" - "str q9, [%x[Cpanel], #0x10]\n" - "str q10, [%x[Cpanel], #0x20]\n" - "str q11, [%x[Cpanel], #0x30]\n" - "str q12, [%x[Cpanel], #0x40]\n" - "str q13, [%x[Cpanel], #0x50]\n" - "str q14, [%x[Cpanel], #0x60]\n" - "str q15, [%x[Cpanel], #0x70]\n" - "str q16, [%x[Cpanel], #0x80]\n" - "str q17, [%x[Cpanel], #0x90]\n" - "str q18, [%x[Cpanel], #0xa0]\n" - "str q19, [%x[Cpanel], #0xb0]\n" - "str q20, [%x[Cpanel], #0xc0]\n" - "str q21, [%x[Cpanel], #0xd0]\n" - "str q22, [%x[Cpanel], #0xe0]\n" - "str q23, [%x[Cpanel], #0xf0]\n" - "str q24, [%x[Cpanel], #0x100]\n" - "str q25, [%x[Cpanel], #0x110]\n" - "str q26, [%x[Cpanel], #0x120]\n" - "str q27, [%x[Cpanel], #0x130]\n" - "str q28, [%x[Cpanel], #0x140]\n" - "str q29, [%x[Cpanel], #0x150]\n" - "str q30, [%x[Cpanel], #0x160]\n" - "str q31, [%x[Cpanel], #0x170]\n" - "add %x[Cpanel], %x[Cpanel], #0x180\n" - "bgt 2b\n" - "subs %x[ablocks], %x[ablocks], #0x1\n" - "bne 1b\n" - : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks) - : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks)) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22" - ); -} - -} // namespace arm_gemm -#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_dot_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_dot_8x12.hpp deleted file mode 100644 index 000cc680da..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_dot_8x12.hpp +++ /dev/null @@ -1,110 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - */ -#pragma once - -#ifdef __aarch64__ -#include "../std_transforms_fixed.hpp" -#include "../performance_parameters.hpp" - -#define ARGLIST \ - const uint8_t *, const uint8_t *, \ - uint32_t *, int, int, int - -namespace arm_gemm -{ -// Actual kernel implementations -void a64_interleaved_u8u32_dot_8x12( ARGLIST ); -void a64_interleaved_u8u32_dot_8x12_a55( ARGLIST ); -void a64_interleaved_u8u32_dot_8x12_x1( ARGLIST ); - -class cls_a64_interleaved_u8u32_dot_8x12 -{ -public: - typedef uint8_t operand_type; - typedef uint32_t result_type; - - typedef void (*kern_type)( ARGLIST ); - - /* Kernel blocking parameters */ - static constexpr unsigned int out_height() - { - return 8; - } - - static unsigned int out_width() - { - return 12; - } - - static unsigned int stripe_width() - { - return 4; - } - - static constexpr unsigned int k_unroll() - { - return 4; - } - - - StdTransformsFixed transforms = {}; - StdTransformsFixed transforms_quantized = {}; - template - static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci) - { - - if (std::is_same::value) { - switch (ci->get_cpu_model()) { - case CPUModel::A55r1: - return { 15.361, 0.9341, 0.1636 }; - default: - return { 29.0698, 3.9793, 0.4003 }; - } - } - - return { 1.0 }; - } - - // Default to the generic kernel - kern_type kernel=a64_interleaved_u8u32_dot_8x12; - cls_a64_interleaved_u8u32_dot_8x12(const CPUInfo *ci) - { - switch(ci->get_cpu_model()) { - default: - break; - case CPUModel::A55r1: - kernel=a64_interleaved_u8u32_dot_8x12_a55; - break; - case CPUModel::X1: - kernel=a64_interleaved_u8u32_dot_8x12_x1; - break; - } - } -}; - -} // namespace arm_gemm - -#undef ARGLIST - -#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_dot_8x12/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_dot_8x12/a55.cpp deleted file mode 100644 index 7892306153..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_dot_8x12/a55.cpp +++ /dev/null @@ -1,273 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - */ -#ifdef __aarch64__ - -#include -#include - -namespace arm_gemm { - -void a64_interleaved_u8u32_dot_8x12_a55( - const uint8_t *Apanel, const uint8_t *Bpanel, - uint32_t *Cpanel, int ablocks, int bblocks, int K) { - - struct KernelArgs { - size_t bblocks = {}; - size_t K = {}; - const uint8_t *Bpanel = {}; - } ka; - - ka.bblocks = bblocks; - ka.K = (K/4) - 1; - ka.Bpanel = Bpanel; - - __asm__ __volatile__( - - "1:" // Height loop - "ldr x27, [%x[args_ptr], %[offsetof_bblocks]]\n" - "mov x26, %x[Apanel]\n" - "ldr x25, [%x[args_ptr], %[offsetof_Bpanel]]\n" - "2:" // Width loop - "ldr x24, [%x[args_ptr], %[offsetof_K]]\n" - "mov %x[Apanel], x26\n" - "cmp x24, #0x2\n" - "movi v8.4s, #0x0\n" - "movi v9.4s, #0x0\n" - "prfm pldl1keep, [%x[Apanel], #0x0]\n" - "movi v10.4s, #0x0\n" - "prfm pldl1keep, [x25, #0x0]\n" - "movi v11.4s, #0x0\n" - "prfm pldl1keep, [x25, #0x40]\n" - "movi v12.4s, #0x0\n" - "prfm pldl1keep, [%x[Apanel], #0x40]\n" - "movi v13.4s, #0x0\n" - "prfm pldl1keep, [x25, #0x80]\n" - "movi v14.4s, #0x0\n" - "ldr q0, [%x[Apanel], #0x0]\n" - "movi v15.4s, #0x0\n" - "ldr q1, [%x[Apanel], #0x10]\n" - "movi v16.4s, #0x0\n" - "ldr q4, [x25, #0x0]\n" - "movi v17.4s, #0x0\n" - "ldr q5, [x25, #0x10]\n" - "movi v18.4s, #0x0\n" - "ldr q6, [x25, #0x20]\n" - "movi v19.4s, #0x0\n" - "movi v20.4s, #0x0\n" - "movi v21.4s, #0x0\n" - "movi v22.4s, #0x0\n" - "movi v23.4s, #0x0\n" - "movi v24.4s, #0x0\n" - "movi v25.4s, #0x0\n" - "movi v26.4s, #0x0\n" - "movi v27.4s, #0x0\n" - "movi v28.4s, #0x0\n" - "movi v29.4s, #0x0\n" - "movi v30.4s, #0x0\n" - "movi v31.4s, #0x0\n" - "blt 4f\n" - "3:" // main loop head - ".inst 0x6f80e088 // udot v8.4s, v4.16b, v0.4b[0]\n" - "ldr d2, [%x[Apanel], #0x20]\n" - "ldr x23, [%x[Apanel], #0x28]\n" - ".inst 0x6fa0e08b // udot v11.4s, v4.16b, v0.4b[1]\n" - "ldr d3, [%x[Apanel], #0x30]\n" - ".inst 0x6f80e88e // udot v14.4s, v4.16b, v0.4b[2]\n" - "ldr x19, [%x[Apanel], #0x38]\n" - ".inst 0x6fa0e891 // udot v17.4s, v4.16b, v0.4b[3]\n" - ".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n" - "ldr x22, [x25, #0x38]\n" - ".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n" - "ldr x20, [x25, #0x48]\n" - ".inst 0x6f81e89a // udot v26.4s, v4.16b, v1.4b[2]\n" - "ldr x21, [x25, #0x58]\n" - ".inst 0x6fa1e89d // udot v29.4s, v4.16b, v1.4b[3]\n" - "ldr d4, [x25, #0x30]\n" - ".inst 0x6f80e0a9 // udot v9.4s, v5.16b, v0.4b[0]\n" - "mov v2.d[1], x23\n" - ".inst 0x6fa0e0ac // udot v12.4s, v5.16b, v0.4b[1]\n" - "mov v3.d[1], x19\n" - ".inst 0x6f80e8af // udot v15.4s, v5.16b, v0.4b[2]\n" - "mov v4.d[1], x22\n" - ".inst 0x6fa0e8b2 // udot v18.4s, v5.16b, v0.4b[3]\n" - "prfm pldl1keep, [%x[Apanel], #0x80]\n" - ".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n" - "add %x[Apanel], %x[Apanel], #0x40\n" - ".inst 0x6fa1e0b8 // udot v24.4s, v5.16b, v1.4b[1]\n" - "prfm pldl1keep, [x25, #0x100]\n" - ".inst 0x6f81e8bb // udot v27.4s, v5.16b, v1.4b[2]\n" - "prfm pldl1keep, [x25, #0x140]\n" - ".inst 0x6fa1e8be // udot v30.4s, v5.16b, v1.4b[3]\n" - "ldr d5, [x25, #0x40]\n" - ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" - "mov v5.d[1], x20\n" - ".inst 0x6fa0e0cd // udot v13.4s, v6.16b, v0.4b[1]\n" - "ldr x20, [%x[Apanel], #0x8]\n" - ".inst 0x6f80e8d0 // udot v16.4s, v6.16b, v0.4b[2]\n" - "ldr x19, [%x[Apanel], #0x18]\n" - ".inst 0x6fa0e8d3 // udot v19.4s, v6.16b, v0.4b[3]\n" - "ldr d0, [%x[Apanel], #0x0]\n" - ".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n" - "sub x24, x24, #0x2\n" - ".inst 0x6fa1e0d9 // udot v25.4s, v6.16b, v1.4b[1]\n" - "cmp x24, #0x2\n" - ".inst 0x6f81e8dc // udot v28.4s, v6.16b, v1.4b[2]\n" - "mov v0.d[1], x20\n" - ".inst 0x6fa1e8df // udot v31.4s, v6.16b, v1.4b[3]\n" - "ldr d6, [x25, #0x50]\n" - "mov v6.d[1], x21\n" - "add x25, x25, #0x60\n" - ".inst 0x6f82e088 // udot v8.4s, v4.16b, v2.4b[0]\n" - "ldr d1, [%x[Apanel], #0x10]\n" - ".inst 0x6fa2e08b // udot v11.4s, v4.16b, v2.4b[1]\n" - "ldr x22, [x25, #0x8]\n" - ".inst 0x6f82e88e // udot v14.4s, v4.16b, v2.4b[2]\n" - "ldr x20, [x25, #0x18]\n" - ".inst 0x6fa2e891 // udot v17.4s, v4.16b, v2.4b[3]\n" - "ldr x21, [x25, #0x28]\n" - ".inst 0x6f83e094 // udot v20.4s, v4.16b, v3.4b[0]\n" - "mov v1.d[1], x19\n" - ".inst 0x6fa3e097 // udot v23.4s, v4.16b, v3.4b[1]\n" - ".inst 0x6f83e89a // udot v26.4s, v4.16b, v3.4b[2]\n" - ".inst 0x6fa3e89d // udot v29.4s, v4.16b, v3.4b[3]\n" - "ldr d4, [x25, #0x0]\n" - ".inst 0x6f82e0a9 // udot v9.4s, v5.16b, v2.4b[0]\n" - "mov v4.d[1], x22\n" - ".inst 0x6fa2e0ac // udot v12.4s, v5.16b, v2.4b[1]\n" - ".inst 0x6f82e8af // udot v15.4s, v5.16b, v2.4b[2]\n" - ".inst 0x6fa2e8b2 // udot v18.4s, v5.16b, v2.4b[3]\n" - ".inst 0x6f83e0b5 // udot v21.4s, v5.16b, v3.4b[0]\n" - ".inst 0x6fa3e0b8 // udot v24.4s, v5.16b, v3.4b[1]\n" - ".inst 0x6f83e8bb // udot v27.4s, v5.16b, v3.4b[2]\n" - ".inst 0x6fa3e8be // udot v30.4s, v5.16b, v3.4b[3]\n" - "ldr d5, [x25, #0x10]\n" - ".inst 0x6f82e0ca // udot v10.4s, v6.16b, v2.4b[0]\n" - "mov v5.d[1], x20\n" - ".inst 0x6fa2e0cd // udot v13.4s, v6.16b, v2.4b[1]\n" - ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n" - ".inst 0x6fa2e8d3 // udot v19.4s, v6.16b, v2.4b[3]\n" - ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n" - ".inst 0x6fa3e0d9 // udot v25.4s, v6.16b, v3.4b[1]\n" - ".inst 0x6f83e8dc // udot v28.4s, v6.16b, v3.4b[2]\n" - ".inst 0x6fa3e8df // udot v31.4s, v6.16b, v3.4b[3]\n" - "ldr d6, [x25, #0x20]\n" - "mov v6.d[1], x21\n" - "bge 3b\n" - "4:" // main loop skip - "add %x[Apanel], %x[Apanel], #0x20\n" - ".inst 0x6f80e088 // udot v8.4s, v4.16b, v0.4b[0]\n" - "add x25, x25, #0x30\n" - ".inst 0x6fa0e08b // udot v11.4s, v4.16b, v0.4b[1]\n" - ".inst 0x6f80e88e // udot v14.4s, v4.16b, v0.4b[2]\n" - ".inst 0x6fa0e891 // udot v17.4s, v4.16b, v0.4b[3]\n" - ".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n" - ".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n" - ".inst 0x6f81e89a // udot v26.4s, v4.16b, v1.4b[2]\n" - ".inst 0x6fa1e89d // udot v29.4s, v4.16b, v1.4b[3]\n" - ".inst 0x6f80e0a9 // udot v9.4s, v5.16b, v0.4b[0]\n" - ".inst 0x6fa0e0ac // udot v12.4s, v5.16b, v0.4b[1]\n" - ".inst 0x6f80e8af // udot v15.4s, v5.16b, v0.4b[2]\n" - ".inst 0x6fa0e8b2 // udot v18.4s, v5.16b, v0.4b[3]\n" - ".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n" - ".inst 0x6fa1e0b8 // udot v24.4s, v5.16b, v1.4b[1]\n" - ".inst 0x6f81e8bb // udot v27.4s, v5.16b, v1.4b[2]\n" - ".inst 0x6fa1e8be // udot v30.4s, v5.16b, v1.4b[3]\n" - ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" - ".inst 0x6fa0e0cd // udot v13.4s, v6.16b, v0.4b[1]\n" - ".inst 0x6f80e8d0 // udot v16.4s, v6.16b, v0.4b[2]\n" - ".inst 0x6fa0e8d3 // udot v19.4s, v6.16b, v0.4b[3]\n" - ".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n" - ".inst 0x6fa1e0d9 // udot v25.4s, v6.16b, v1.4b[1]\n" - ".inst 0x6f81e8dc // udot v28.4s, v6.16b, v1.4b[2]\n" - ".inst 0x6fa1e8df // udot v31.4s, v6.16b, v1.4b[3]\n" - "cbz x24, 5f\n" - "ldr q0, [%x[Apanel], #0x0]\n" - "ldr q1, [%x[Apanel], #0x10]\n" - "add %x[Apanel], %x[Apanel], #0x20\n" - "ldr q7, [x25, #0x0]\n" - ".inst 0x6f80e0e8 // udot v8.4s, v7.16b, v0.4b[0]\n" - "ldr q4, [x25, #0x10]\n" - ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n" - "ldr q5, [x25, #0x20]\n" - ".inst 0x6f80e8ee // udot v14.4s, v7.16b, v0.4b[2]\n" - ".inst 0x6fa0e8f1 // udot v17.4s, v7.16b, v0.4b[3]\n" - "add x25, x25, #0x30\n" - ".inst 0x6f81e0f4 // udot v20.4s, v7.16b, v1.4b[0]\n" - ".inst 0x6fa1e0f7 // udot v23.4s, v7.16b, v1.4b[1]\n" - ".inst 0x6f81e8fa // udot v26.4s, v7.16b, v1.4b[2]\n" - ".inst 0x6fa1e8fd // udot v29.4s, v7.16b, v1.4b[3]\n" - ".inst 0x6f80e089 // udot v9.4s, v4.16b, v0.4b[0]\n" - ".inst 0x6fa0e08c // udot v12.4s, v4.16b, v0.4b[1]\n" - ".inst 0x6f80e88f // udot v15.4s, v4.16b, v0.4b[2]\n" - ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n" - ".inst 0x6f81e095 // udot v21.4s, v4.16b, v1.4b[0]\n" - ".inst 0x6fa1e098 // udot v24.4s, v4.16b, v1.4b[1]\n" - ".inst 0x6f81e89b // udot v27.4s, v4.16b, v1.4b[2]\n" - ".inst 0x6fa1e89e // udot v30.4s, v4.16b, v1.4b[3]\n" - ".inst 0x6f80e0aa // udot v10.4s, v5.16b, v0.4b[0]\n" - ".inst 0x6fa0e0ad // udot v13.4s, v5.16b, v0.4b[1]\n" - ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n" - ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n" - ".inst 0x6f81e0b6 // udot v22.4s, v5.16b, v1.4b[0]\n" - ".inst 0x6fa1e0b9 // udot v25.4s, v5.16b, v1.4b[1]\n" - ".inst 0x6f81e8bc // udot v28.4s, v5.16b, v1.4b[2]\n" - ".inst 0x6fa1e8bf // udot v31.4s, v5.16b, v1.4b[3]\n" - "5:" // multiply loop done - "subs x27, x27, #0x1\n" - "str q8, [%x[Cpanel], #0x0]\n" - "str q9, [%x[Cpanel], #0x10]\n" - "str q10, [%x[Cpanel], #0x20]\n" - "str q11, [%x[Cpanel], #0x30]\n" - "str q12, [%x[Cpanel], #0x40]\n" - "str q13, [%x[Cpanel], #0x50]\n" - "str q14, [%x[Cpanel], #0x60]\n" - "str q15, [%x[Cpanel], #0x70]\n" - "str q16, [%x[Cpanel], #0x80]\n" - "str q17, [%x[Cpanel], #0x90]\n" - "str q18, [%x[Cpanel], #0xa0]\n" - "str q19, [%x[Cpanel], #0xb0]\n" - "str q20, [%x[Cpanel], #0xc0]\n" - "str q21, [%x[Cpanel], #0xd0]\n" - "str q22, [%x[Cpanel], #0xe0]\n" - "str q23, [%x[Cpanel], #0xf0]\n" - "str q24, [%x[Cpanel], #0x100]\n" - "str q25, [%x[Cpanel], #0x110]\n" - "str q26, [%x[Cpanel], #0x120]\n" - "str q27, [%x[Cpanel], #0x130]\n" - "str q28, [%x[Cpanel], #0x140]\n" - "str q29, [%x[Cpanel], #0x150]\n" - "str q30, [%x[Cpanel], #0x160]\n" - "str q31, [%x[Cpanel], #0x170]\n" - "add %x[Cpanel], %x[Cpanel], #0x180\n" - "bgt 2b\n" - "subs %x[ablocks], %x[ablocks], #0x1\n" - "bne 1b\n" - : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks) - : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks)) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" - ); -} - -} // namespace arm_gemm -#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_dot_8x12/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_dot_8x12/generic.cpp deleted file mode 100644 index 42226e90f5..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_dot_8x12/generic.cpp +++ /dev/null @@ -1,253 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - */ -#ifdef __aarch64__ - -#include -#include - -namespace arm_gemm { - -void a64_interleaved_u8u32_dot_8x12( - const uint8_t *Apanel, const uint8_t *Bpanel, - uint32_t *Cpanel, int ablocks, int bblocks, int K) { - - struct KernelArgs { - size_t bblocks = {}; - size_t K = {}; - const uint8_t *Bpanel = {}; - } ka; - - ka.bblocks = bblocks; - ka.K = (K/4) - 1; - ka.Bpanel = Bpanel; - - __asm__ __volatile__( - - "1:" // Height loop - "ldr x22, [%x[args_ptr], %[offsetof_bblocks]]\n" - "mov x21, %x[Apanel]\n" - "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n" - "2:" // Width loop - "ldr x19, [%x[args_ptr], %[offsetof_K]]\n" - "mov %x[Apanel], x21\n" - "cmp x19, #0x2\n" - "movi v8.4s, #0x0\n" - "movi v9.4s, #0x0\n" - "prfm pldl1keep, [%x[Apanel], #0x0]\n" - "movi v10.4s, #0x0\n" - "movi v11.4s, #0x0\n" - "prfm pldl1keep, [x20, #0x0]\n" - "movi v12.4s, #0x0\n" - "movi v13.4s, #0x0\n" - "prfm pldl1keep, [x20, #0x40]\n" - "movi v14.4s, #0x0\n" - "movi v15.4s, #0x0\n" - "prfm pldl1keep, [%x[Apanel], #0x40]\n" - "movi v16.4s, #0x0\n" - "movi v17.4s, #0x0\n" - "prfm pldl1keep, [x20, #0x80]\n" - "movi v18.4s, #0x0\n" - "movi v19.4s, #0x0\n" - "ldr q0, [%x[Apanel], #0x0]\n" - "movi v20.4s, #0x0\n" - "movi v21.4s, #0x0\n" - "ldr q1, [%x[Apanel], #0x10]\n" - "movi v22.4s, #0x0\n" - "movi v23.4s, #0x0\n" - "ldr q4, [x20, #0x0]\n" - "movi v24.4s, #0x0\n" - "movi v25.4s, #0x0\n" - "ldr q5, [x20, #0x10]\n" - "movi v26.4s, #0x0\n" - "movi v27.4s, #0x0\n" - "ldr q6, [x20, #0x20]\n" - "movi v28.4s, #0x0\n" - "movi v29.4s, #0x0\n" - "movi v30.4s, #0x0\n" - "movi v31.4s, #0x0\n" - "blt 4f\n" - "3:" // main loop head - ".inst 0x6f80e088 // udot v8.4s, v4.16b, v0.4b[0]\n" - ".inst 0x6fa0e08b // udot v11.4s, v4.16b, v0.4b[1]\n" - "ldr q2, [%x[Apanel], #0x20]\n" - ".inst 0x6f80e88e // udot v14.4s, v4.16b, v0.4b[2]\n" - ".inst 0x6fa0e891 // udot v17.4s, v4.16b, v0.4b[3]\n" - "ldr q3, [%x[Apanel], #0x30]\n" - ".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n" - ".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n" - "sub x19, x19, #0x2\n" - ".inst 0x6f81e89a // udot v26.4s, v4.16b, v1.4b[2]\n" - ".inst 0x6fa1e89d // udot v29.4s, v4.16b, v1.4b[3]\n" - "ldr q4, [x20, #0x30]\n" - ".inst 0x6f80e0a9 // udot v9.4s, v5.16b, v0.4b[0]\n" - ".inst 0x6fa0e0ac // udot v12.4s, v5.16b, v0.4b[1]\n" - "cmp x19, #0x2\n" - ".inst 0x6f80e8af // udot v15.4s, v5.16b, v0.4b[2]\n" - ".inst 0x6fa0e8b2 // udot v18.4s, v5.16b, v0.4b[3]\n" - "prfm pldl1keep, [%x[Apanel], #0x80]\n" - ".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n" - ".inst 0x6fa1e0b8 // udot v24.4s, v5.16b, v1.4b[1]\n" - "add %x[Apanel], %x[Apanel], #0x40\n" - ".inst 0x6f81e8bb // udot v27.4s, v5.16b, v1.4b[2]\n" - ".inst 0x6fa1e8be // udot v30.4s, v5.16b, v1.4b[3]\n" - "ldr q5, [x20, #0x40]\n" - ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" - ".inst 0x6fa0e0cd // udot v13.4s, v6.16b, v0.4b[1]\n" - "prfm pldl1keep, [x20, #0x100]\n" - ".inst 0x6f80e8d0 // udot v16.4s, v6.16b, v0.4b[2]\n" - ".inst 0x6fa0e8d3 // udot v19.4s, v6.16b, v0.4b[3]\n" - "prfm pldl1keep, [x20, #0x140]\n" - ".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n" - ".inst 0x6fa1e0d9 // udot v25.4s, v6.16b, v1.4b[1]\n" - "ldr q0, [%x[Apanel], #0x0]\n" - ".inst 0x6f81e8dc // udot v28.4s, v6.16b, v1.4b[2]\n" - ".inst 0x6fa1e8df // udot v31.4s, v6.16b, v1.4b[3]\n" - "ldr q6, [x20, #0x50]\n" - "add x20, x20, #0x60\n" - ".inst 0x6f82e088 // udot v8.4s, v4.16b, v2.4b[0]\n" - ".inst 0x6fa2e08b // udot v11.4s, v4.16b, v2.4b[1]\n" - "ldr q1, [%x[Apanel], #0x10]\n" - ".inst 0x6f82e88e // udot v14.4s, v4.16b, v2.4b[2]\n" - ".inst 0x6fa2e891 // udot v17.4s, v4.16b, v2.4b[3]\n" - ".inst 0x6f83e094 // udot v20.4s, v4.16b, v3.4b[0]\n" - ".inst 0x6fa3e097 // udot v23.4s, v4.16b, v3.4b[1]\n" - ".inst 0x6f83e89a // udot v26.4s, v4.16b, v3.4b[2]\n" - ".inst 0x6fa3e89d // udot v29.4s, v4.16b, v3.4b[3]\n" - "ldr q4, [x20, #0x0]\n" - ".inst 0x6f82e0a9 // udot v9.4s, v5.16b, v2.4b[0]\n" - ".inst 0x6fa2e0ac // udot v12.4s, v5.16b, v2.4b[1]\n" - ".inst 0x6f82e8af // udot v15.4s, v5.16b, v2.4b[2]\n" - ".inst 0x6fa2e8b2 // udot v18.4s, v5.16b, v2.4b[3]\n" - ".inst 0x6f83e0b5 // udot v21.4s, v5.16b, v3.4b[0]\n" - ".inst 0x6fa3e0b8 // udot v24.4s, v5.16b, v3.4b[1]\n" - ".inst 0x6f83e8bb // udot v27.4s, v5.16b, v3.4b[2]\n" - ".inst 0x6fa3e8be // udot v30.4s, v5.16b, v3.4b[3]\n" - "ldr q5, [x20, #0x10]\n" - ".inst 0x6f82e0ca // udot v10.4s, v6.16b, v2.4b[0]\n" - ".inst 0x6fa2e0cd // udot v13.4s, v6.16b, v2.4b[1]\n" - ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n" - ".inst 0x6fa2e8d3 // udot v19.4s, v6.16b, v2.4b[3]\n" - ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n" - ".inst 0x6fa3e0d9 // udot v25.4s, v6.16b, v3.4b[1]\n" - ".inst 0x6f83e8dc // udot v28.4s, v6.16b, v3.4b[2]\n" - ".inst 0x6fa3e8df // udot v31.4s, v6.16b, v3.4b[3]\n" - "ldr q6, [x20, #0x20]\n" - "bge 3b\n" - "4:" // main loop skip - "add %x[Apanel], %x[Apanel], #0x20\n" - ".inst 0x6f80e088 // udot v8.4s, v4.16b, v0.4b[0]\n" - ".inst 0x6fa0e08b // udot v11.4s, v4.16b, v0.4b[1]\n" - "add x20, x20, #0x30\n" - ".inst 0x6f80e88e // udot v14.4s, v4.16b, v0.4b[2]\n" - ".inst 0x6fa0e891 // udot v17.4s, v4.16b, v0.4b[3]\n" - ".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n" - ".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n" - ".inst 0x6f81e89a // udot v26.4s, v4.16b, v1.4b[2]\n" - ".inst 0x6fa1e89d // udot v29.4s, v4.16b, v1.4b[3]\n" - ".inst 0x6f80e0a9 // udot v9.4s, v5.16b, v0.4b[0]\n" - ".inst 0x6fa0e0ac // udot v12.4s, v5.16b, v0.4b[1]\n" - ".inst 0x6f80e8af // udot v15.4s, v5.16b, v0.4b[2]\n" - ".inst 0x6fa0e8b2 // udot v18.4s, v5.16b, v0.4b[3]\n" - ".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n" - ".inst 0x6fa1e0b8 // udot v24.4s, v5.16b, v1.4b[1]\n" - ".inst 0x6f81e8bb // udot v27.4s, v5.16b, v1.4b[2]\n" - ".inst 0x6fa1e8be // udot v30.4s, v5.16b, v1.4b[3]\n" - ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" - ".inst 0x6fa0e0cd // udot v13.4s, v6.16b, v0.4b[1]\n" - ".inst 0x6f80e8d0 // udot v16.4s, v6.16b, v0.4b[2]\n" - ".inst 0x6fa0e8d3 // udot v19.4s, v6.16b, v0.4b[3]\n" - ".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n" - ".inst 0x6fa1e0d9 // udot v25.4s, v6.16b, v1.4b[1]\n" - ".inst 0x6f81e8dc // udot v28.4s, v6.16b, v1.4b[2]\n" - ".inst 0x6fa1e8df // udot v31.4s, v6.16b, v1.4b[3]\n" - "cbz x19, 5f\n" - "ldr q0, [%x[Apanel], #0x0]\n" - "ldr q1, [%x[Apanel], #0x10]\n" - "add %x[Apanel], %x[Apanel], #0x20\n" - "ldr q7, [x20, #0x0]\n" - "ldr q4, [x20, #0x10]\n" - ".inst 0x6f80e0e8 // udot v8.4s, v7.16b, v0.4b[0]\n" - "ldr q5, [x20, #0x20]\n" - ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n" - ".inst 0x6f80e8ee // udot v14.4s, v7.16b, v0.4b[2]\n" - "add x20, x20, #0x30\n" - ".inst 0x6fa0e8f1 // udot v17.4s, v7.16b, v0.4b[3]\n" - ".inst 0x6f81e0f4 // udot v20.4s, v7.16b, v1.4b[0]\n" - ".inst 0x6fa1e0f7 // udot v23.4s, v7.16b, v1.4b[1]\n" - ".inst 0x6f81e8fa // udot v26.4s, v7.16b, v1.4b[2]\n" - ".inst 0x6fa1e8fd // udot v29.4s, v7.16b, v1.4b[3]\n" - ".inst 0x6f80e089 // udot v9.4s, v4.16b, v0.4b[0]\n" - ".inst 0x6fa0e08c // udot v12.4s, v4.16b, v0.4b[1]\n" - ".inst 0x6f80e88f // udot v15.4s, v4.16b, v0.4b[2]\n" - ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n" - ".inst 0x6f81e095 // udot v21.4s, v4.16b, v1.4b[0]\n" - ".inst 0x6fa1e098 // udot v24.4s, v4.16b, v1.4b[1]\n" - ".inst 0x6f81e89b // udot v27.4s, v4.16b, v1.4b[2]\n" - ".inst 0x6fa1e89e // udot v30.4s, v4.16b, v1.4b[3]\n" - ".inst 0x6f80e0aa // udot v10.4s, v5.16b, v0.4b[0]\n" - ".inst 0x6fa0e0ad // udot v13.4s, v5.16b, v0.4b[1]\n" - ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n" - ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n" - ".inst 0x6f81e0b6 // udot v22.4s, v5.16b, v1.4b[0]\n" - ".inst 0x6fa1e0b9 // udot v25.4s, v5.16b, v1.4b[1]\n" - ".inst 0x6f81e8bc // udot v28.4s, v5.16b, v1.4b[2]\n" - ".inst 0x6fa1e8bf // udot v31.4s, v5.16b, v1.4b[3]\n" - "5:" // multiply loop done - "subs x22, x22, #0x1\n" - "str q8, [%x[Cpanel], #0x0]\n" - "str q9, [%x[Cpanel], #0x10]\n" - "str q10, [%x[Cpanel], #0x20]\n" - "str q11, [%x[Cpanel], #0x30]\n" - "str q12, [%x[Cpanel], #0x40]\n" - "str q13, [%x[Cpanel], #0x50]\n" - "str q14, [%x[Cpanel], #0x60]\n" - "str q15, [%x[Cpanel], #0x70]\n" - "str q16, [%x[Cpanel], #0x80]\n" - "str q17, [%x[Cpanel], #0x90]\n" - "str q18, [%x[Cpanel], #0xa0]\n" - "str q19, [%x[Cpanel], #0xb0]\n" - "str q20, [%x[Cpanel], #0xc0]\n" - "str q21, [%x[Cpanel], #0xd0]\n" - "str q22, [%x[Cpanel], #0xe0]\n" - "str q23, [%x[Cpanel], #0xf0]\n" - "str q24, [%x[Cpanel], #0x100]\n" - "str q25, [%x[Cpanel], #0x110]\n" - "str q26, [%x[Cpanel], #0x120]\n" - "str q27, [%x[Cpanel], #0x130]\n" - "str q28, [%x[Cpanel], #0x140]\n" - "str q29, [%x[Cpanel], #0x150]\n" - "str q30, [%x[Cpanel], #0x160]\n" - "str q31, [%x[Cpanel], #0x170]\n" - "add %x[Cpanel], %x[Cpanel], #0x180\n" - "bgt 2b\n" - "subs %x[ablocks], %x[ablocks], #0x1\n" - "bne 1b\n" - : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks) - : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks)) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22" - ); -} - -} // namespace arm_gemm -#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_dot_8x12/x1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_dot_8x12/x1.cpp deleted file mode 100644 index 652f2bffc5..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_dot_8x12/x1.cpp +++ /dev/null @@ -1,253 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - */ -#ifdef __aarch64__ - -#include -#include - -namespace arm_gemm { - -void a64_interleaved_u8u32_dot_8x12_x1( - const uint8_t *Apanel, const uint8_t *Bpanel, - uint32_t *Cpanel, int ablocks, int bblocks, int K) { - - struct KernelArgs { - size_t bblocks = {}; - size_t K = {}; - const uint8_t *Bpanel = {}; - } ka; - - ka.bblocks = bblocks; - ka.K = (K/4) - 1; - ka.Bpanel = Bpanel; - - __asm__ __volatile__( - - "1:" // Height loop - "ldr x22, [%x[args_ptr], %[offsetof_bblocks]]\n" - "mov x21, %x[Apanel]\n" - "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n" - "2:" // Width loop - "ldr x19, [%x[args_ptr], %[offsetof_K]]\n" - "mov %x[Apanel], x21\n" - "cmp x19, #0x2\n" - "movi v8.4s, #0x0\n" - "movi v9.4s, #0x0\n" - "prfm pldl1keep, [%x[Apanel], #0x0]\n" - "movi v10.4s, #0x0\n" - "movi v11.4s, #0x0\n" - "prfm pldl1keep, [x20, #0x0]\n" - "movi v12.4s, #0x0\n" - "movi v13.4s, #0x0\n" - "prfm pldl1keep, [x20, #0x40]\n" - "movi v14.4s, #0x0\n" - "movi v15.4s, #0x0\n" - "prfm pldl1keep, [%x[Apanel], #0x40]\n" - "movi v16.4s, #0x0\n" - "movi v17.4s, #0x0\n" - "prfm pldl1keep, [x20, #0x80]\n" - "movi v18.4s, #0x0\n" - "movi v19.4s, #0x0\n" - "ldr q0, [%x[Apanel], #0x0]\n" - "movi v20.4s, #0x0\n" - "movi v21.4s, #0x0\n" - "ldr q1, [%x[Apanel], #0x10]\n" - "movi v22.4s, #0x0\n" - "movi v23.4s, #0x0\n" - "ldr q2, [x20, #0x0]\n" - "movi v24.4s, #0x0\n" - "movi v25.4s, #0x0\n" - "ldr q3, [x20, #0x10]\n" - "movi v26.4s, #0x0\n" - "movi v27.4s, #0x0\n" - "ldr q4, [x20, #0x20]\n" - "movi v28.4s, #0x0\n" - "movi v29.4s, #0x0\n" - "movi v30.4s, #0x0\n" - "movi v31.4s, #0x0\n" - "blt 4f\n" - "3:" // main loop head - ".inst 0x6f80e048 // udot v8.4s, v2.16b, v0.4b[0]\n" - ".inst 0x6fa0e04b // udot v11.4s, v2.16b, v0.4b[1]\n" - "sub x19, x19, #0x2\n" - ".inst 0x6f80e84e // udot v14.4s, v2.16b, v0.4b[2]\n" - ".inst 0x6fa0e851 // udot v17.4s, v2.16b, v0.4b[3]\n" - "cmp x19, #0x2\n" - ".inst 0x6f81e054 // udot v20.4s, v2.16b, v1.4b[0]\n" - ".inst 0x6fa1e057 // udot v23.4s, v2.16b, v1.4b[1]\n" - "prfm pldl1keep, [%x[Apanel], #0x80]\n" - ".inst 0x6f81e85a // udot v26.4s, v2.16b, v1.4b[2]\n" - ".inst 0x6fa1e85d // udot v29.4s, v2.16b, v1.4b[3]\n" - "ldr q2, [x20, #0x30]\n" - ".inst 0x6f80e069 // udot v9.4s, v3.16b, v0.4b[0]\n" - ".inst 0x6fa0e06c // udot v12.4s, v3.16b, v0.4b[1]\n" - "prfm pldl1keep, [x20, #0x100]\n" - ".inst 0x6f80e86f // udot v15.4s, v3.16b, v0.4b[2]\n" - ".inst 0x6fa0e872 // udot v18.4s, v3.16b, v0.4b[3]\n" - "prfm pldl1keep, [x20, #0x140]\n" - ".inst 0x6f81e075 // udot v21.4s, v3.16b, v1.4b[0]\n" - ".inst 0x6fa1e078 // udot v24.4s, v3.16b, v1.4b[1]\n" - ".inst 0x6f81e87b // udot v27.4s, v3.16b, v1.4b[2]\n" - ".inst 0x6fa1e87e // udot v30.4s, v3.16b, v1.4b[3]\n" - "ldr q3, [x20, #0x40]\n" - ".inst 0x6f80e08a // udot v10.4s, v4.16b, v0.4b[0]\n" - ".inst 0x6fa0e08d // udot v13.4s, v4.16b, v0.4b[1]\n" - ".inst 0x6f80e890 // udot v16.4s, v4.16b, v0.4b[2]\n" - ".inst 0x6fa0e893 // udot v19.4s, v4.16b, v0.4b[3]\n" - "ldr q0, [%x[Apanel], #0x20]\n" - ".inst 0x6f81e096 // udot v22.4s, v4.16b, v1.4b[0]\n" - ".inst 0x6fa1e099 // udot v25.4s, v4.16b, v1.4b[1]\n" - ".inst 0x6f81e89c // udot v28.4s, v4.16b, v1.4b[2]\n" - ".inst 0x6fa1e89f // udot v31.4s, v4.16b, v1.4b[3]\n" - "ldr q1, [%x[Apanel], #0x30]\n" - "ldr q4, [x20, #0x50]\n" - "add %x[Apanel], %x[Apanel], #0x40\n" - "add x20, x20, #0x60\n" - ".inst 0x6f80e048 // udot v8.4s, v2.16b, v0.4b[0]\n" - ".inst 0x6fa0e04b // udot v11.4s, v2.16b, v0.4b[1]\n" - ".inst 0x6f80e84e // udot v14.4s, v2.16b, v0.4b[2]\n" - ".inst 0x6fa0e851 // udot v17.4s, v2.16b, v0.4b[3]\n" - ".inst 0x6f81e054 // udot v20.4s, v2.16b, v1.4b[0]\n" - ".inst 0x6fa1e057 // udot v23.4s, v2.16b, v1.4b[1]\n" - ".inst 0x6f81e85a // udot v26.4s, v2.16b, v1.4b[2]\n" - ".inst 0x6fa1e85d // udot v29.4s, v2.16b, v1.4b[3]\n" - "ldr q2, [x20, #0x0]\n" - ".inst 0x6f80e069 // udot v9.4s, v3.16b, v0.4b[0]\n" - ".inst 0x6fa0e06c // udot v12.4s, v3.16b, v0.4b[1]\n" - ".inst 0x6f80e86f // udot v15.4s, v3.16b, v0.4b[2]\n" - ".inst 0x6fa0e872 // udot v18.4s, v3.16b, v0.4b[3]\n" - ".inst 0x6f81e075 // udot v21.4s, v3.16b, v1.4b[0]\n" - ".inst 0x6fa1e078 // udot v24.4s, v3.16b, v1.4b[1]\n" - ".inst 0x6f81e87b // udot v27.4s, v3.16b, v1.4b[2]\n" - ".inst 0x6fa1e87e // udot v30.4s, v3.16b, v1.4b[3]\n" - "ldr q3, [x20, #0x10]\n" - ".inst 0x6f80e08a // udot v10.4s, v4.16b, v0.4b[0]\n" - ".inst 0x6fa0e08d // udot v13.4s, v4.16b, v0.4b[1]\n" - ".inst 0x6f80e890 // udot v16.4s, v4.16b, v0.4b[2]\n" - ".inst 0x6fa0e893 // udot v19.4s, v4.16b, v0.4b[3]\n" - "ldr q0, [%x[Apanel], #0x0]\n" - ".inst 0x6f81e096 // udot v22.4s, v4.16b, v1.4b[0]\n" - ".inst 0x6fa1e099 // udot v25.4s, v4.16b, v1.4b[1]\n" - ".inst 0x6f81e89c // udot v28.4s, v4.16b, v1.4b[2]\n" - ".inst 0x6fa1e89f // udot v31.4s, v4.16b, v1.4b[3]\n" - "ldr q1, [%x[Apanel], #0x10]\n" - "ldr q4, [x20, #0x20]\n" - "bge 3b\n" - "4:" // main loop skip - "add %x[Apanel], %x[Apanel], #0x20\n" - ".inst 0x6f80e048 // udot v8.4s, v2.16b, v0.4b[0]\n" - ".inst 0x6fa0e04b // udot v11.4s, v2.16b, v0.4b[1]\n" - "add x20, x20, #0x30\n" - ".inst 0x6f80e84e // udot v14.4s, v2.16b, v0.4b[2]\n" - ".inst 0x6fa0e851 // udot v17.4s, v2.16b, v0.4b[3]\n" - ".inst 0x6f81e054 // udot v20.4s, v2.16b, v1.4b[0]\n" - ".inst 0x6fa1e057 // udot v23.4s, v2.16b, v1.4b[1]\n" - ".inst 0x6f81e85a // udot v26.4s, v2.16b, v1.4b[2]\n" - ".inst 0x6fa1e85d // udot v29.4s, v2.16b, v1.4b[3]\n" - ".inst 0x6f80e069 // udot v9.4s, v3.16b, v0.4b[0]\n" - ".inst 0x6fa0e06c // udot v12.4s, v3.16b, v0.4b[1]\n" - ".inst 0x6f80e86f // udot v15.4s, v3.16b, v0.4b[2]\n" - ".inst 0x6fa0e872 // udot v18.4s, v3.16b, v0.4b[3]\n" - ".inst 0x6f81e075 // udot v21.4s, v3.16b, v1.4b[0]\n" - ".inst 0x6fa1e078 // udot v24.4s, v3.16b, v1.4b[1]\n" - ".inst 0x6f81e87b // udot v27.4s, v3.16b, v1.4b[2]\n" - ".inst 0x6fa1e87e // udot v30.4s, v3.16b, v1.4b[3]\n" - ".inst 0x6f80e08a // udot v10.4s, v4.16b, v0.4b[0]\n" - ".inst 0x6fa0e08d // udot v13.4s, v4.16b, v0.4b[1]\n" - ".inst 0x6f80e890 // udot v16.4s, v4.16b, v0.4b[2]\n" - ".inst 0x6fa0e893 // udot v19.4s, v4.16b, v0.4b[3]\n" - ".inst 0x6f81e096 // udot v22.4s, v4.16b, v1.4b[0]\n" - ".inst 0x6fa1e099 // udot v25.4s, v4.16b, v1.4b[1]\n" - ".inst 0x6f81e89c // udot v28.4s, v4.16b, v1.4b[2]\n" - ".inst 0x6fa1e89f // udot v31.4s, v4.16b, v1.4b[3]\n" - "cbz x19, 5f\n" - "ldr q0, [%x[Apanel], #0x0]\n" - "ldr q1, [%x[Apanel], #0x10]\n" - "add %x[Apanel], %x[Apanel], #0x20\n" - "ldr q5, [x20, #0x0]\n" - "ldr q6, [x20, #0x10]\n" - ".inst 0x6f80e0a8 // udot v8.4s, v5.16b, v0.4b[0]\n" - "ldr q7, [x20, #0x20]\n" - ".inst 0x6fa0e0ab // udot v11.4s, v5.16b, v0.4b[1]\n" - ".inst 0x6f80e8ae // udot v14.4s, v5.16b, v0.4b[2]\n" - "add x20, x20, #0x30\n" - ".inst 0x6fa0e8b1 // udot v17.4s, v5.16b, v0.4b[3]\n" - ".inst 0x6f81e0b4 // udot v20.4s, v5.16b, v1.4b[0]\n" - ".inst 0x6fa1e0b7 // udot v23.4s, v5.16b, v1.4b[1]\n" - ".inst 0x6f81e8ba // udot v26.4s, v5.16b, v1.4b[2]\n" - ".inst 0x6fa1e8bd // udot v29.4s, v5.16b, v1.4b[3]\n" - ".inst 0x6f80e0c9 // udot v9.4s, v6.16b, v0.4b[0]\n" - ".inst 0x6fa0e0cc // udot v12.4s, v6.16b, v0.4b[1]\n" - ".inst 0x6f80e8cf // udot v15.4s, v6.16b, v0.4b[2]\n" - ".inst 0x6fa0e8d2 // udot v18.4s, v6.16b, v0.4b[3]\n" - ".inst 0x6f81e0d5 // udot v21.4s, v6.16b, v1.4b[0]\n" - ".inst 0x6fa1e0d8 // udot v24.4s, v6.16b, v1.4b[1]\n" - ".inst 0x6f81e8db // udot v27.4s, v6.16b, v1.4b[2]\n" - ".inst 0x6fa1e8de // udot v30.4s, v6.16b, v1.4b[3]\n" - ".inst 0x6f80e0ea // udot v10.4s, v7.16b, v0.4b[0]\n" - ".inst 0x6fa0e0ed // udot v13.4s, v7.16b, v0.4b[1]\n" - ".inst 0x6f80e8f0 // udot v16.4s, v7.16b, v0.4b[2]\n" - ".inst 0x6fa0e8f3 // udot v19.4s, v7.16b, v0.4b[3]\n" - ".inst 0x6f81e0f6 // udot v22.4s, v7.16b, v1.4b[0]\n" - ".inst 0x6fa1e0f9 // udot v25.4s, v7.16b, v1.4b[1]\n" - ".inst 0x6f81e8fc // udot v28.4s, v7.16b, v1.4b[2]\n" - ".inst 0x6fa1e8ff // udot v31.4s, v7.16b, v1.4b[3]\n" - "5:" // multiply loop done - "subs x22, x22, #0x1\n" - "str q8, [%x[Cpanel], #0x0]\n" - "str q9, [%x[Cpanel], #0x10]\n" - "str q10, [%x[Cpanel], #0x20]\n" - "str q11, [%x[Cpanel], #0x30]\n" - "str q12, [%x[Cpanel], #0x40]\n" - "str q13, [%x[Cpanel], #0x50]\n" - "str q14, [%x[Cpanel], #0x60]\n" - "str q15, [%x[Cpanel], #0x70]\n" - "str q16, [%x[Cpanel], #0x80]\n" - "str q17, [%x[Cpanel], #0x90]\n" - "str q18, [%x[Cpanel], #0xa0]\n" - "str q19, [%x[Cpanel], #0xb0]\n" - "str q20, [%x[Cpanel], #0xc0]\n" - "str q21, [%x[Cpanel], #0xd0]\n" - "str q22, [%x[Cpanel], #0xe0]\n" - "str q23, [%x[Cpanel], #0xf0]\n" - "str q24, [%x[Cpanel], #0x100]\n" - "str q25, [%x[Cpanel], #0x110]\n" - "str q26, [%x[Cpanel], #0x120]\n" - "str q27, [%x[Cpanel], #0x130]\n" - "str q28, [%x[Cpanel], #0x140]\n" - "str q29, [%x[Cpanel], #0x150]\n" - "str q30, [%x[Cpanel], #0x160]\n" - "str q31, [%x[Cpanel], #0x170]\n" - "add %x[Cpanel], %x[Cpanel], #0x180\n" - "bgt 2b\n" - "subs %x[ablocks], %x[ablocks], #0x1\n" - "bne 1b\n" - : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks) - : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks)) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22" - ); -} - -} // namespace arm_gemm -#endif // __aarch64__ -- cgit v1.2.1