aboutsummaryrefslogtreecommitdiff
path: root/src/core
diff options
context:
space:
mode:
authorGeorgios Pinitas <georgios.pinitas@arm.com>2019-03-01 19:07:11 +0000
committerGeorgios Pinitas <georgios.pinitas@arm.com>2019-03-19 11:53:45 +0000
commit146138378c1587b7297d245b7177641315f6180b (patch)
tree4689218b48a0884418099f373015429f2845ceaf /src/core
parent3dd5b6884a65c06bcb9d15589ee2dc2978e3b336 (diff)
downloadComputeLibrary-146138378c1587b7297d245b7177641315f6180b.tar.gz
COMPMID-1995: Update RSH GEMM assembly kernels.
-Updates u8/s8 hybrid dot product kernels to work for any N and any K >=16. -Adds hybrid FP32 kernels with generic and A55 variants. -Adds SVE native kernels for fp16/u8/s8. Change-Id: Ifc0eaba9e3c8ea5bb19d334e870e1b39e4e7e728 Signed-off-by: Georgios Pinitas <georgios.pinitas@arm.com> Reviewed-on: https://review.mlplatform.org/c/863 Tested-by: Arm Jenkins <bsgcomp@arm.com> Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com> Comments-Addressed: Arm Jenkins <bsgcomp@arm.com> Reviewed-by: Michalis Spyrou <michalis.spyrou@arm.com>
Diffstat (limited to 'src/core')
-rw-r--r--src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp51
-rw-r--r--src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp12
-rw-r--r--src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp1
-rw-r--r--src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp10
-rw-r--r--src/core/NEON/kernels/arm_gemm/gemm_int16.cpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/gemm_int8.cpp14
-rw-r--r--src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp1
-rw-r--r--src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp14
-rw-r--r--src/core/NEON/kernels/arm_gemm/gemv_batched.hpp4
-rw-r--r--src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp1
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4.hpp77
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/a55.cpp2352
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/generic.cpp1726
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/a55.cpp4581
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/generic.cpp3289
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/a55.cpp4581
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/generic.cpp3289
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4.hpp74
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4/generic.cpp3681
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4.hpp74
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4/generic.cpp2150
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4.hpp74
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4/generic.cpp2150
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8/generic.cpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp16_mla_4VLx4.hpp74
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp16_mla_4VLx4/generic.cpp3821
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4/generic.cpp1875
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4/generic.cpp1875
29 files changed, 26537 insertions, 9320 deletions
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp
index b56165927d..0927123f7c 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp
@@ -28,49 +28,76 @@
#include "arm_gemm.hpp"
#include "gemm_common.hpp"
+#include "gemm_hybrid.hpp"
#include "gemm_implementation.hpp"
#include "gemm_interleaved.hpp"
+#include "gemm_native.hpp"
+#include "kernels/a32_sgemm_8x6.hpp"
#include "kernels/a64_hgemm_24x8.hpp"
#include "kernels/a64_sgemm_12x8.hpp"
-#include "kernels/a32_sgemm_8x6.hpp"
+#include "kernels/sve_hybrid_fp16_mla_4VLx4.hpp"
#include "kernels/sve_interleaved_fp16_mla_3VLx8.hpp"
+#include "kernels/sve_native_fp16_mla_4VLx4.hpp"
namespace arm_gemm {
static const GemmImplementation<__fp16, __fp16> gemm_fp16_methods[] = {
#if defined(__ARM_FEATURE_SVE)
{
+ GemmMethod::GEMM_HYBRID,
+ "hybrid_fp16_mla_4VLx4",
+ [](const GemmArgs<__fp16> &args) { return (args._Ksize >= 8) && (args._alpha == 1.0f) && !args._trA && args._pretransposed_hint; },
+ [](const GemmArgs<__fp16> &args) { return ((args._Ksize <= 256) && (args._Nsize <= 256)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8)); },
+ [](const GemmArgs<__fp16> &args) { return new GemmHybrid<hybrid_fp16_mla_4VLx4, __fp16, __fp16>(args); }
+},
+{
+ GemmMethod::GEMM_NATIVE,
+ "native_fp16_mla_4VLx4",
+ [](const GemmArgs<__fp16> &args) { return (args._Ksize >= 8 && args._alpha==1.0f && !args._trA && !args._trB); },
+ [](const GemmArgs<__fp16> &args) { return ((args._Ksize <= 128) && (args._Nsize <= 128)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8)); },
+ [](const GemmArgs<__fp16> &args) { return new GemmNative<native_fp16_mla_4VLx4, __fp16, __fp16>(args); }
+},
+{
GemmMethod::GEMM_INTERLEAVED,
"interleaved_fp16_mla_3VLx8",
[](const GemmArgs<__fp16> &args) { return (args._Ksize > 4); },
- [](const GemmArgs<__fp16> &args) { return true; },
+ nullptr,
[](const GemmArgs<__fp16> &args) { return new GemmInterleaved<interleaved_fp16_mla_3VLx8, __fp16, __fp16>(args); }
},
#endif
+
#if defined(__aarch64__) && (defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) || defined(FP16_KERNELS))
{
GemmMethod::GEMM_INTERLEAVED,
"hgemm_24x8",
- [](const GemmArgs<__fp16> &args) {
#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- return args._ci->has_fp16();
+ [](const GemmArgs<__fp16> &args) { return args._ci->has_fp16(); },
#else
- return true;
+ nullptr,
#endif
- },
- [](const GemmArgs<__fp16> &args) { return true; },
+ nullptr,
[](const GemmArgs<__fp16> &args) { return new GemmInterleaved<hgemm_24x8, __fp16, __fp16>(args); }
},
#endif
-#if defined(__arm__)
+#ifdef __aarch64__
+{
+ GemmMethod::GEMM_INTERLEAVED,
+ "sgemm_12x8",
+ nullptr,
+ nullptr,
+ [](const GemmArgs<__fp16> &args) { return new GemmInterleaved<sgemm_12x8, __fp16, __fp16>(args); }
+},
+#elif defined(__arm__)
{
GemmMethod::GEMM_INTERLEAVED,
"sgemm_8x6",
- [](const GemmArgs<__fp16> &args) { return true; },
- [](const GemmArgs<__fp16> &args) { return true; },
+ nullptr,
+ nullptr,
[](const GemmArgs<__fp16> &args) { return new GemmInterleaved<sgemm_8x6, __fp16, __fp16>(args); }
},
+#else // not AArch64 or AArch32
+# error Unknown Architecture
#endif
{
GemmMethod::DEFAULT,
@@ -90,8 +117,8 @@ const GemmImplementation<__fp16, __fp16> *gemm_implementation_list<__fp16, __fp1
template UniqueGemmCommon<__fp16, __fp16> gemm<__fp16, __fp16>(const GemmArgs<__fp16> &args);
template KernelDescription get_gemm_method<__fp16, __fp16>(const GemmArgs<__fp16> &args);
template bool method_is_compatible<__fp16, __fp16>(GemmMethod method, const GemmArgs<__fp16> &args);
-template std::vector<std::string> get_compatible_kernels<__fp16, __fp16> (const GemmArgs<__fp16> &args);
+template std::vector<KernelDescription> get_compatible_kernels<__fp16, __fp16> (const GemmArgs<__fp16> &args);
} // namespace arm_gemm
-#endif // __ARM_FP16_ARGS \ No newline at end of file
+#endif // __ARM_FP16_ARGS
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp b/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp
index 8bc33ccb69..6869279bb9 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp
@@ -32,6 +32,7 @@
#include "gemv_pretransposed.hpp"
#include "kernels/a32_sgemm_8x6.hpp"
+#include "kernels/a64_hybrid_fp32_mla_16x4.hpp"
#include "kernels/a64_sgemm_12x8.hpp"
#include "kernels/a64_sgemm_native_16x4.hpp"
#include "kernels/a64_sgemm_nativeA_pretransposeB_16x4.hpp"
@@ -112,6 +113,13 @@ static const GemmImplementation<float, float> gemm_fp32_methods[] =
[](const GemmArgs<float> &args) { return new GemmHybrid<sgemm_nativeA_pretransposeB_16x4, float, float>(args); }
},
{
+ GemmMethod::GEMM_HYBRID,
+ "hybrid_fp32_mla_16x4",
+ [](const GemmArgs<float> &args) { return (args._Ksize >= 4) && (args._alpha == 1.0f) && !args._trA && args._pretransposed_hint; },
+ [](const GemmArgs<float> &args) { return ((args._Ksize <= 256) && (args._Nsize <= 256)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8)); },
+ [](const GemmArgs<float> &args) { return new GemmHybrid<hybrid_fp32_mla_16x4, float, float>(args); }
+},
+{
GemmMethod::GEMM_NATIVE,
"sgemm_native_16x4",
[](const GemmArgs<float> &args) { return (args._Ksize>4 && (args._Nsize % 16)==0 && args._alpha==1.0f && !args._trA && !args._trB); },
@@ -165,6 +173,6 @@ const GemmImplementation<float, float> *gemm_implementation_list<float, float>()
template UniqueGemmCommon<float, float> gemm<float, float>(const GemmArgs<float> &args);
template KernelDescription get_gemm_method<float, float>(const GemmArgs<float> &args);
template bool method_is_compatible<float, float>(GemmMethod method, const GemmArgs<float> &args);
-template std::vector<std::string> get_compatible_kernels<float, float> (const GemmArgs<float> &args);
+template std::vector<KernelDescription> get_compatible_kernels<float, float> (const GemmArgs<float> &args);
-} // namespace arm_gemm \ No newline at end of file
+} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp b/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp
index c2bd0bb882..82e0625b68 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp
@@ -208,7 +208,6 @@ public:
return roundup(_Nsize, strategy::out_width()) * roundup(_Ksize, strategy::k_unroll()) * _nmulti * sizeof(Toi);
}
- using GemmCommon<To, Tr>::pretranspose_B_array;
void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override {
Toi *buffer = reinterpret_cast<Toi *>(in_buffer);
_B_transposed = buffer;
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp b/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp
index bf80784b79..d952140959 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp
@@ -112,8 +112,12 @@ bool find_implementation(const GemmArgs<Tret> &args, const GemmImplementation<To
}
template<typename Top, typename Tret>
-std::vector<std::string> get_compatible_kernels(const GemmArgs<Tret> &args) {
- std::vector<std::string> res;
+std::vector<KernelDescription> get_compatible_kernels(const GemmArgs<Tret> &args) {
+ std::vector<KernelDescription> res;
+
+ /* Find out what the default implementation is so we can set the flag accordingly later. */
+ const GemmImplementation<Top, Tret> *default_impl;
+ find_implementation(args, default_impl);
auto gemms = gemm_implementation_list<Top, Tret>();
@@ -123,7 +127,7 @@ std::vector<std::string> get_compatible_kernels(const GemmArgs<Tret> &args) {
continue;
}
- res.push_back(i->name);
+ res.push_back(KernelDescription(i->method, i->name, i==default_impl));
}
return res;
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp
index b4503dd6a2..0db0654f81 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp
@@ -58,7 +58,7 @@ const GemmImplementation<int16_t, int32_t> *gemm_implementation_list<int16_t, in
template UniqueGemmCommon<int16_t, int32_t> gemm<int16_t, int32_t>(const GemmArgs<int32_t> &args);
template KernelDescription get_gemm_method<int16_t, int32_t>(const GemmArgs<int32_t> &args);
template bool method_is_compatible<int16_t, int32_t>(GemmMethod method, const GemmArgs<int32_t> &args);
-template std::vector<std::string> get_compatible_kernels<int16_t, int32_t> (const GemmArgs<int32_t> &args);
+template std::vector<KernelDescription> get_compatible_kernels<int16_t, int32_t> (const GemmArgs<int32_t> &args);
} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp
index 5811c2a1ce..9e49df1c28 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -34,6 +34,7 @@
#include "kernels/a64_gemm_s8_12x8.hpp"
#include "kernels/a64_gemm_s8_4x4.hpp"
#include "kernels/a64_hybrid_s8s32_dot_16x4.hpp"
+#include "kernels/sve_hybrid_s8s32_dot_4VLx4.hpp"
#include "kernels/sve_interleaved_s8s32_dot_3VLx8.hpp"
#include "kernels/sve_native_s8s32_dot_4VLx4.hpp"
@@ -42,6 +43,13 @@ namespace arm_gemm {
static const GemmImplementation<int8_t, int32_t> gemm_s8_methods[] = {
#ifdef __ARM_FEATURE_SVE
{
+ GemmMethod::GEMM_HYBRID,
+ "hybrid_s8s32_dot_4VLx4",
+ [](const GemmArgs<int32_t> &args) { return args._Ksize>=16 && args._alpha==1 && !args._trA && !args._trB && args._pretransposed_hint; },
+ [](const GemmArgs<int32_t> &args) { return ((args._Ksize <= 128) && (args._Nsize <= 128)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8)); },
+ [](const GemmArgs<int32_t> &args) { return new GemmHybrid<hybrid_s8s32_dot_4VLx4, int8_t, int32_t>(args); }
+},
+{
GemmMethod::GEMM_NATIVE,
"native_s8s32_dot_4VLx4",
[](const GemmArgs<int32_t> &args) { return (args._Ksize>=16 && args._alpha==1 && !args._trA && !args._trB); },
@@ -59,7 +67,7 @@ static const GemmImplementation<int8_t, int32_t> gemm_s8_methods[] = {
{
GemmMethod::GEMM_HYBRID,
"hybrid_s8s32_dot_16x4",
- [](const GemmArgs<int32_t> &args) { return args._ci->has_dotprod() && args._Ksize>=16 && (args._Ksize % 16 == 0) && (args._Nsize % 16 == 0) && !args._trA && !args._trB && args._pretransposed_hint; },
+ [](const GemmArgs<int32_t> &args) { return args._ci->has_dotprod() && args._Ksize>=16 && !args._trA && !args._trB && args._pretransposed_hint; },
[](const GemmArgs<int32_t> &args) { return args._Nsize<=256 && args._Ksize>128; },
[](const GemmArgs<int32_t> &args) { return new GemmHybrid<hybrid_s8s32_dot_16x4, int8_t, int32_t>(args); }
},
@@ -95,7 +103,7 @@ const GemmImplementation<int8_t, int32_t> *gemm_implementation_list<int8_t, int3
template UniqueGemmCommon<int8_t, int32_t> gemm<int8_t, int32_t>(const GemmArgs<int32_t> &args);
template KernelDescription get_gemm_method<int8_t, int32_t>(const GemmArgs<int32_t> &args);
template bool method_is_compatible<int8_t, int32_t>(GemmMethod method, const GemmArgs<int32_t> &args);
-template std::vector<std::string> get_compatible_kernels<int8_t, int32_t> (const GemmArgs<int32_t> &args);
+template std::vector<KernelDescription> get_compatible_kernels<int8_t, int32_t> (const GemmArgs<int32_t> &args);
} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
index b83ccd3407..a7731666ec 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
@@ -480,7 +480,6 @@ public:
return total;
}
- using GemmCommon<To, Tr>::pretranspose_B_array;
void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override {
blockwalker current(*this);
Toi *buffer = reinterpret_cast<Toi *>(in_buffer);
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp
index 6bcbca9e8b..9e3e4e43b3 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp
@@ -58,7 +58,7 @@ const GemmImplementation<uint16_t, uint32_t> *gemm_implementation_list<uint16_t,
template UniqueGemmCommon<uint16_t, uint32_t> gemm<uint16_t, uint32_t>(const GemmArgs<uint32_t> &args);
template KernelDescription get_gemm_method<uint16_t, uint32_t>(const GemmArgs<uint32_t> &args);
template bool method_is_compatible<uint16_t, uint32_t>(GemmMethod method, const GemmArgs<uint32_t> &args);
-template std::vector<std::string> get_compatible_kernels<uint16_t, uint32_t> (const GemmArgs<uint32_t> &args);
+template std::vector<KernelDescription> get_compatible_kernels<uint16_t, uint32_t> (const GemmArgs<uint32_t> &args);
} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp
index b95ca8016b..9321bfccfd 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -34,6 +34,7 @@
#include "kernels/a64_gemm_u8_12x8.hpp"
#include "kernels/a64_gemm_u8_4x4.hpp"
#include "kernels/a64_hybrid_u8u32_dot_16x4.hpp"
+#include "kernels/sve_hybrid_u8u32_dot_4VLx4.hpp"
#include "kernels/sve_interleaved_u8u32_dot_3VLx8.hpp"
#include "kernels/sve_native_u8u32_dot_4VLx4.hpp"
@@ -42,6 +43,13 @@ namespace arm_gemm {
static const GemmImplementation<uint8_t, uint32_t> gemm_u8_methods[] = {
#ifdef __ARM_FEATURE_SVE
{
+ GemmMethod::GEMM_HYBRID,
+ "hybrid_u8u32_dot_4VLx4",
+ [](const GemmArgs<uint32_t> &args) { return args._Ksize>=16 && args._alpha==1 && !args._trA && !args._trB && args._pretransposed_hint; },
+ [](const GemmArgs<uint32_t> &args) { return ((args._Ksize <= 128) && (args._Nsize <= 128)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8)); },
+ [](const GemmArgs<uint32_t> &args) { return new GemmHybrid<hybrid_u8u32_dot_4VLx4, uint8_t, uint32_t>(args); }
+},
+{
GemmMethod::GEMM_NATIVE,
"native_u8u32_dot_4VLx4",
[](const GemmArgs<uint32_t> &args) { return (args._Ksize>=16 && args._alpha==1 && !args._trA && !args._trB); },
@@ -59,7 +67,7 @@ static const GemmImplementation<uint8_t, uint32_t> gemm_u8_methods[] = {
{
GemmMethod::GEMM_HYBRID,
"hybrid_u8u32_dot_16x4",
- [](const GemmArgs<uint32_t> &args) { return args._ci->has_dotprod() && args._Ksize>=16 && (args._Ksize % 16 == 0) && (args._Nsize % 16 == 0) && !args._trA && !args._trB && args._pretransposed_hint; },
+ [](const GemmArgs<uint32_t> &args) { return args._ci->has_dotprod() && args._Ksize>=16 && !args._trA && !args._trB && args._pretransposed_hint; },
[](const GemmArgs<uint32_t> &args) { return args._Nsize<=256 && args._Ksize>128; },
[](const GemmArgs<uint32_t> &args) { return new GemmHybrid<hybrid_u8u32_dot_16x4, uint8_t, uint32_t>(args); }
},
@@ -95,7 +103,7 @@ const GemmImplementation<uint8_t, uint32_t> *gemm_implementation_list<uint8_t, u
template UniqueGemmCommon<uint8_t, uint32_t> gemm<uint8_t, uint32_t>(const GemmArgs<uint32_t> &args);
template KernelDescription get_gemm_method<uint8_t, uint32_t>(const GemmArgs<uint32_t> &args);
template bool method_is_compatible<uint8_t, uint32_t>(GemmMethod method, const GemmArgs<uint32_t> &args);
-template std::vector<std::string> get_compatible_kernels<uint8_t, uint32_t> (const GemmArgs<uint32_t> &args);
+template std::vector<KernelDescription> get_compatible_kernels<uint8_t, uint32_t> (const GemmArgs<uint32_t> &args);
} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp b/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp
index 32d668f66d..b7f9de85c4 100644
--- a/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp
@@ -44,10 +44,9 @@ public:
_subgemm = gemm<To,Tr>(newargs);
}
- using GemmCommon<To, Tr>::set_arrays;
void set_arrays(const To *A, const int lda, const int A_batch_stride, const int A_multi_stride,
const To *B, const int ldb, const int B_multi_stride,
- Tr *C, const int ldc, const int C_batch_stride, const int C_multi_stride) override {
+ Tr *C, const int ldc, const int C_batch_stride, const int C_multi_stride) override {
/* A and C's batch stride becomes their new row stride. New batch stride is 0 as nbatches for subgemm is always 1. */
_subgemm->set_arrays(A, A_batch_stride, 0, A_multi_stride,
B, ldb, B_multi_stride,
@@ -86,7 +85,6 @@ public:
return _subgemm->get_B_pretransposed_array_size();
}
- using GemmCommon<To, Tr>::pretranspose_B_array;
void pretranspose_B_array(void *buffer, const To *B, const int ldb, const int B_multi_stride) override {
_subgemm->pretranspose_B_array(buffer, B, ldb, B_multi_stride);
}
diff --git a/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp b/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp
index f7beb0a34c..21f8278529 100644
--- a/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp
@@ -148,7 +148,6 @@ public:
return _buffer_per_multi * _nmultis * sizeof(To);
}
- using GemmCommon<To, Tr>::pretranspose_B_array;
void pretranspose_B_array(void *buffer, const To *B, const int ldb, const int B_multi_stride) override {
Toi *A_buffer = reinterpret_cast<Toi *>(buffer);
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4.hpp
new file mode 100644
index 0000000000..560593958a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4.hpp
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+
+#include "../std_transforms_fixed.hpp"
+
+namespace arm_gemm
+{
+
+// Actual kernel implementations
+void a64_hybrid_fp32_mla_16x4(const float *, int, const float *, float *, int, float, int, int, int);
+void a64_hybrid_fp32_mla_16x4_a55(const float *, int, const float *, float *, int, float, int, int, int);
+
+class hybrid_fp32_mla_16x4 // Strategy type passed as the first template argument of GemmHybrid in gemm_fp32.cpp; bundles blocking constants with a kernel function pointer.
+{
+public:
+ typedef float operand_type; // element type of the A/B inputs
+ typedef float result_type; // element type of the C output
+
+ typedef void (*kern_type)(const float *, int, const float *, float *, int, float, int, int, int); // same signature as the two kernels declared above; the a55 definition names the arguments (A, lda, B, C, ldc, beta, M, N, K)
+
+ /* Kernel blocking parameters: GemmHybrid rounds N up to out_width() and K up to k_unroll() when sizing the pretransposed B buffer. */
+ static unsigned int out_height()
+ {
+ return 4; // the a55 kernel steps through M four rows at a time (y += 4) — presumably rows per output tile; confirm against GemmHybrid
+ }
+
+ static unsigned int out_width()
+ {
+ return 16; // the a55 kernel steps through N sixteen columns at a time (x0 += 16)
+ }
+
+ static unsigned int k_unroll()
+ {
+ return 1; // K is rounded up to a multiple of this, so 1 means no K padding
+ }
+
+ StdTransformsFixed<operand_type, result_type, 4, 16, 1> transforms = {}; // template arguments repeat out_height / out_width / k_unroll; NOTE(review): keep in sync with the accessors above
+
+ // Default to the generic kernel; the constructor may replace it with a CPU-specific variant.
+ kern_type kernel=a64_hybrid_fp32_mla_16x4;
+
+ hybrid_fp32_mla_16x4(const CPUInfo *ci) // ci is dereferenced unconditionally, so it must be non-null
+ {
+ if (ci->get_cpu_model() == CPUModel::A55r1) { // only the A55r1 model selects the A55 kernel; every other model keeps the generic one
+ kernel = a64_hybrid_fp32_mla_16x4_a55;
+ }
+ }
+};
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/a55.cpp
new file mode 100644
index 0000000000..7261761d7e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/a55.cpp
@@ -0,0 +1,2352 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <algorithm>
+
+
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void a64_hybrid_fp32_mla_16x4_a55(const float *A, int lda, const float *B, float *C, int ldc, float beta, int M, int N, int K) {
+ const long beta0 = (beta == 0.0f);
+ const int K_stride = K;
+ const long loops_count = ((K + 4) / 8) - 1;
+ K -= loops_count * 8;
+ const long regs_count = (K / 4) - 1;
+ K -= (regs_count + 1) * 4;
+ const long blocks_count = K / 1;
+
+ for (int y=0; y<M; y+=4) {
+ const float * const a_ptr0_base = A + (y * lda);
+ const unsigned long ldab = lda * sizeof(float);
+
+ float *c_ptr0 = C + (y * ldc);
+
+ for (int x0=0; x0<N; x0+=16ul) {
+ const long width = std::min((unsigned long)N-x0, 16ul);
+ const float *betaptr = &beta;
+ long loops = loops_count;
+ long regs = regs_count;
+ long blocks = blocks_count;
+ const float *a_ptr0 = a_ptr0_base;
+ const float *b_ptr0 = B + (K_stride * x0);
+ const bool use_result_buffer = (width < 16);
+ float result_buffer[64];
+ const unsigned long ldcb = (use_result_buffer ? 16 : ldc) * sizeof(float);
+ float *c_ptr_real = c_ptr0;
+ if (use_result_buffer && !beta0) {
+ for(int cy=0; cy<std::min(M-y, 4); cy++) {
+ for(unsigned int cx=0; cx<width; cx++) {
+ result_buffer[cy * 16 + cx] = c_ptr_real[cy * ldc + cx];
+ }
+ }
+ }
+ if (use_result_buffer) {
+ c_ptr0 = result_buffer;
+ }
+
+ switch(M-y) {
+ case 1:
+ __asm __volatile (
+ "temploadreg0 .req X0\n"
+ "temploadreg1 .req X1\n"
+ "temploadreg2 .req X2\n"
+ "temploadreg3 .req X3\n"
+ "cbz %[beta0], 1f\n"
+ "movi v16.4s, #0\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "movi v17.4s, #0\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "movi v18.4s, #0\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "movi v19.4s, #0\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1r {v15.4s}, [%[betaptr]]\n"
+ "ldr q16, [%[c_ptr0]]\n"
+ "ldr q17, [%[c_ptr0], #0x10]\n"
+ "ldr q18, [%[c_ptr0], #0x20]\n"
+ "ldr q19, [%[c_ptr0], #0x30]\n"
+ "fmul v16.4s, v16.4s, v15.4s\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "fmul v17.4s, v17.4s, v15.4s\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "fmul v18.4s, v18.4s, v15.4s\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "fmul v19.4s, v19.4s, v15.4s\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ "fmla v16.4s, v8.4s, v0.s[0]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "fmla v17.4s, v9.4s, v0.s[0]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "fmla v18.4s, v10.4s, v0.s[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ "fmla v19.4s, v11.4s, v0.s[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ "fmla v16.4s, v12.4s, v0.s[1]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ "fmla v17.4s, v13.4s, v0.s[1]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ "fmla v18.4s, v14.4s, v0.s[1]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ins v4.d[1], temploadreg0\n"
+ "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "fmla v19.4s, v15.4s, v0.s[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "ins v9.d[1], temploadreg1\n"
+ "fmla v16.4s, v8.4s, v0.s[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "fmla v17.4s, v9.4s, v0.s[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ "fmla v18.4s, v10.4s, v0.s[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ "fmla v19.4s, v11.4s, v0.s[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ "fmla v16.4s, v12.4s, v0.s[3]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "fmla v17.4s, v13.4s, v0.s[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "fmla v18.4s, v14.4s, v0.s[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "fmla v19.4s, v15.4s, v0.s[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ "ins v9.d[1], temploadreg1\n"
+ "fmla v16.4s, v8.4s, v4.s[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "fmla v17.4s, v9.4s, v4.s[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "ins v11.d[1], temploadreg3\n"
+ "fmla v18.4s, v10.4s, v4.s[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ "ldr d0, [%[a_ptr0], #-0x10]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "fmla v19.4s, v11.4s, v4.s[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ "fmla v16.4s, v12.4s, v4.s[1]\n"
+ "ins v0.d[1], temploadreg0\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "fmla v17.4s, v13.4s, v4.s[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "fmla v18.4s, v14.4s, v4.s[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "fmla v19.4s, v15.4s, v4.s[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "ins v9.d[1], temploadreg1\n"
+ "fmla v16.4s, v8.4s, v4.s[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "fmla v17.4s, v9.4s, v4.s[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ "fmla v18.4s, v10.4s, v4.s[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ "fmla v19.4s, v11.4s, v4.s[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ "fmla v16.4s, v12.4s, v4.s[3]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "fmla v17.4s, v13.4s, v4.s[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "fmla v18.4s, v14.4s, v4.s[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "fmla v19.4s, v15.4s, v4.s[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ "ins v9.d[1], temploadreg1\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ "ins v11.d[1], temploadreg3\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "ins v14.d[1], temploadreg2\n"
+ "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "cbz %[regs], 4f\n"
+ "fmla v16.4s, v8.4s, v0.s[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ "fmla v17.4s, v9.4s, v0.s[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ "fmla v18.4s, v10.4s, v0.s[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ "fmla v19.4s, v11.4s, v0.s[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ "fmla v16.4s, v12.4s, v0.s[1]\n"
+ "ins v4.d[1], temploadreg0\n"
+ "fmla v17.4s, v13.4s, v0.s[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ "fmla v18.4s, v14.4s, v0.s[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ "fmla v19.4s, v15.4s, v0.s[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "ins v9.d[1], temploadreg1\n"
+ "fmla v16.4s, v8.4s, v0.s[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "fmla v17.4s, v9.4s, v0.s[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ "fmla v18.4s, v10.4s, v0.s[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ "fmla v19.4s, v11.4s, v0.s[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ "fmla v16.4s, v12.4s, v0.s[3]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "fmla v17.4s, v13.4s, v0.s[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "fmla v18.4s, v14.4s, v0.s[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "fmla v19.4s, v15.4s, v0.s[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ "ins v9.d[1], temploadreg1\n"
+ "fmla v16.4s, v8.4s, v4.s[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "fmla v17.4s, v9.4s, v4.s[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "ins v11.d[1], temploadreg3\n"
+ "fmla v18.4s, v10.4s, v4.s[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "fmla v19.4s, v11.4s, v4.s[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "fmla v16.4s, v12.4s, v4.s[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "fmla v17.4s, v13.4s, v4.s[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "fmla v18.4s, v14.4s, v4.s[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "fmla v19.4s, v15.4s, v4.s[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "ins v9.d[1], temploadreg1\n"
+ "fmla v16.4s, v8.4s, v4.s[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "fmla v17.4s, v9.4s, v4.s[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ "fmla v18.4s, v10.4s, v4.s[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "fmla v19.4s, v11.4s, v4.s[2]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "ins v14.d[1], temploadreg2\n"
+ "fmla v16.4s, v12.4s, v4.s[3]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "fmla v17.4s, v13.4s, v4.s[3]\n"
+ "fmla v18.4s, v14.4s, v4.s[3]\n"
+ "fmla v19.4s, v15.4s, v4.s[3]\n"
+ "b 5f\n"
+ "4:\n"
+ "fmla v16.4s, v8.4s, v0.s[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ "fmla v17.4s, v9.4s, v0.s[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ "fmla v18.4s, v10.4s, v0.s[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ "fmla v19.4s, v11.4s, v0.s[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ "fmla v16.4s, v12.4s, v0.s[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "fmla v17.4s, v13.4s, v0.s[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "fmla v18.4s, v14.4s, v0.s[1]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "fmla v19.4s, v15.4s, v0.s[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "ins v9.d[1], temploadreg1\n"
+ "fmla v16.4s, v8.4s, v0.s[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "fmla v17.4s, v9.4s, v0.s[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ "fmla v18.4s, v10.4s, v0.s[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "fmla v19.4s, v11.4s, v0.s[2]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "ins v14.d[1], temploadreg2\n"
+ "fmla v16.4s, v12.4s, v0.s[3]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "fmla v17.4s, v13.4s, v0.s[3]\n"
+ "fmla v18.4s, v14.4s, v0.s[3]\n"
+ "fmla v19.4s, v15.4s, v0.s[3]\n"
+ "5:\n"
+ "cbz %[blocks], 6f\n"
+ "7:\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr s0, [%[a_ptr0]]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x4\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
+ "fmla v16.4s, v8.4s, v0.s[0]\n"
+ "fmla v17.4s, v9.4s, v0.s[0]\n"
+ "fmla v18.4s, v10.4s, v0.s[0]\n"
+ "fmla v19.4s, v11.4s, v0.s[0]\n"
+ "b.ne 7b\n"
+ "6:\n"
+ "str q16, [%[c_ptr0]]\n"
+ "str q17, [%[c_ptr0], #0x10]\n"
+ "str q18, [%[c_ptr0], #0x20]\n"
+ "str q19, [%[c_ptr0], #0x30]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x40\n"
+ ".unreq temploadreg0\n"
+ ".unreq temploadreg1\n"
+ ".unreq temploadreg2\n"
+ ".unreq temploadreg3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory"
+ );
+ break;
+ case 2: // Two A/C pointer pairs: pair 0 via a_ptr0/c_ptr0, pair 1 at a_ptr0+lda / c_ptr0+ldc (presumably adjacent rows - confirm against caller).
+ __asm __volatile (
+ "a_ptr1 .req X0\n" // x0: second A pointer
+ "c_ptr1 .req X1\n" // x1: second C pointer
+ "temploadreg0 .req X2\n" // x2-x5: scratch holding the upper 64 bits of loads that are split into "ldr d" + "ldr x" + "ins"
+ "temploadreg1 .req X3\n"
+ "temploadreg2 .req X4\n"
+ "temploadreg3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n" // lda/ldc operands are added directly to the addresses, i.e. they are byte offsets
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "cbz %[beta0], 1f\n" // beta0 == 0: go to label 1 (load C and scale it); otherwise fall through and zero the accumulators
+ "movi v16.4s, #0\n" // v16-v19 accumulate the c_ptr0 outputs, v20-v23 the c_ptr1 outputs (see the stores at label 6)
+ "ldr q0, [%[a_ptr0]]\n" // v0/v1: first four floats from each A pointer
+ "movi v17.4s, #0\n"
+ "ldr q1, [a_ptr1]\n"
+ "movi v18.4s, #0\n"
+ "ldr q8, [%[b_ptr0]]\n" // v8-v15 hold B data
+ "movi v19.4s, #0\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "movi v20.4s, #0\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "movi v21.4s, #0\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "movi v22.4s, #0\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "movi v23.4s, #0\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n" // v14 is loaded in two halves; the upper half is inserted at label 3 or label 2
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n" // v15 (B bytes 0x70-0x7f) is fetched later via negative offsets
+ "cbz %[loops], 2f\n" // no main-loop passes requested: go straight to the tail
+ "b 3f\n"
+ "1:\n" // label 1: start from the existing C contents multiplied by the scalar at betaptr
+ "ld1r {v15.4s}, [%[betaptr]]\n" // broadcast the float at betaptr to all four lanes of v15
+ "ldr q16, [%[c_ptr0]]\n"
+ "ldr q17, [%[c_ptr0], #0x10]\n"
+ "ldr q18, [%[c_ptr0], #0x20]\n"
+ "ldr q19, [%[c_ptr0], #0x30]\n"
+ "fmul v16.4s, v16.4s, v15.4s\n"
+ "ldr q20, [c_ptr1]\n"
+ "fmul v17.4s, v17.4s, v15.4s\n"
+ "ldr q21, [c_ptr1, #0x10]\n"
+ "fmul v18.4s, v18.4s, v15.4s\n"
+ "ldr q22, [c_ptr1, #0x20]\n"
+ "fmul v19.4s, v19.4s, v15.4s\n"
+ "ldr q23, [c_ptr1, #0x30]\n"
+ "fmul v20.4s, v20.4s, v15.4s\n"
+ "ldr q0, [%[a_ptr0]]\n" // same A/B preload as the zero-initialised path above
+ "fmul v21.4s, v21.4s, v15.4s\n"
+ "ldr q1, [a_ptr1]\n"
+ "fmul v22.4s, v22.4s, v15.4s\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "fmul v23.4s, v23.4s, v15.4s\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "3:\n" // label 3: main loop; each pass advances both A pointers by 0x20 bytes and b_ptr0 by 0x200 bytes
+ "fmla v16.4s, v8.4s, v0.s[0]\n"
+ "ins v14.d[1], temploadreg2\n" // complete v14 from the half loaded before entering / at the end of the previous pass
+ "fmla v20.4s, v8.4s, v1.s[0]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "fmla v17.4s, v9.4s, v0.s[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ "fmla v21.4s, v9.4s, v1.s[0]\n"
+ "ldr d4, [%[a_ptr0]]\n" // v4/v5: next four floats from each A pointer, used in the second half of the pass
+ "fmla v18.4s, v10.4s, v0.s[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ "fmla v22.4s, v10.4s, v1.s[0]\n"
+ "ldr d5, [a_ptr1]\n"
+ "fmla v19.4s, v11.4s, v0.s[0]\n"
+ "ldr temploadreg1, [a_ptr1, #0x8]\n"
+ "fmla v23.4s, v11.4s, v1.s[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ "fmla v16.4s, v12.4s, v0.s[1]\n"
+ "ins v4.d[1], temploadreg0\n"
+ "fmla v20.4s, v12.4s, v1.s[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ "fmla v17.4s, v13.4s, v0.s[1]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ "fmla v21.4s, v13.4s, v1.s[1]\n"
+ "ins v5.d[1], temploadreg1\n"
+ "fmla v18.4s, v14.4s, v0.s[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ "fmla v22.4s, v14.4s, v1.s[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "subs %[loops], %[loops], #0x1\n" // sets the flags tested by "b.ne 3b"; nothing in between modifies them
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n" // prefetch-for-load hint on upcoming a_ptr0 data
+ "ins v15.d[1], temploadreg3\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ "fmla v19.4s, v15.4s, v0.s[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "fmla v23.4s, v15.4s, v1.s[1]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "prfm PLDL1KEEP, [a_ptr1, #0x40]\n" // prefetch-for-load hint on upcoming a_ptr1 data
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "fmla v16.4s, v8.4s, v0.s[2]\n"
+ "ins v9.d[1], temploadreg1\n"
+ "fmla v20.4s, v8.4s, v1.s[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "fmla v17.4s, v9.4s, v0.s[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "fmla v21.4s, v9.4s, v1.s[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ "fmla v18.4s, v10.4s, v0.s[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ "fmla v22.4s, v10.4s, v1.s[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n" // first of two 0x100-byte B advances per pass; following loads use negative offsets
+ "fmla v19.4s, v11.4s, v0.s[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ "fmla v23.4s, v11.4s, v1.s[2]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ "fmla v16.4s, v12.4s, v0.s[3]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ "fmla v20.4s, v12.4s, v1.s[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ "fmla v17.4s, v13.4s, v0.s[3]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ "fmla v21.4s, v13.4s, v1.s[3]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "fmla v18.4s, v14.4s, v0.s[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ "fmla v22.4s, v14.4s, v1.s[3]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "fmla v19.4s, v15.4s, v0.s[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ "fmla v23.4s, v15.4s, v1.s[3]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ "ins v9.d[1], temploadreg1\n"
+ "fmla v16.4s, v8.4s, v4.s[0]\n" // second half of the pass: multiply by the v4/v5 lanes
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ "fmla v20.4s, v8.4s, v5.s[0]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "fmla v17.4s, v9.4s, v4.s[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ "fmla v21.4s, v9.4s, v5.s[0]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "ins v11.d[1], temploadreg3\n"
+ "fmla v18.4s, v10.4s, v4.s[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ "fmla v22.4s, v10.4s, v5.s[0]\n"
+ "ldr d0, [%[a_ptr0], #-0x10]\n" // reload v0/v1 for the next pass (A pointers were already advanced by 0x20)
+ "ins v12.d[1], temploadreg0\n"
+ "fmla v19.4s, v11.4s, v4.s[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
+ "fmla v23.4s, v11.4s, v5.s[0]\n"
+ "ldr d1, [a_ptr1, #-0x10]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "fmla v16.4s, v12.4s, v4.s[1]\n"
+ "ldr temploadreg1, [a_ptr1, #-0x8]\n"
+ "fmla v20.4s, v12.4s, v5.s[1]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ "ins v0.d[1], temploadreg0\n"
+ "fmla v17.4s, v13.4s, v4.s[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ "fmla v21.4s, v13.4s, v5.s[1]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ "ins v1.d[1], temploadreg1\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "fmla v18.4s, v14.4s, v4.s[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "fmla v22.4s, v14.4s, v5.s[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "fmla v19.4s, v15.4s, v4.s[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "fmla v23.4s, v15.4s, v5.s[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "ins v9.d[1], temploadreg1\n"
+ "fmla v16.4s, v8.4s, v4.s[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "fmla v20.4s, v8.4s, v5.s[2]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "fmla v17.4s, v9.4s, v4.s[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "fmla v21.4s, v9.4s, v5.s[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ "fmla v18.4s, v10.4s, v4.s[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ "fmla v22.4s, v10.4s, v5.s[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n" // second 0x100-byte B advance of the pass
+ "fmla v19.4s, v11.4s, v4.s[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ "fmla v23.4s, v11.4s, v5.s[2]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ "fmla v16.4s, v12.4s, v4.s[3]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ "fmla v20.4s, v12.4s, v5.s[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ "fmla v17.4s, v13.4s, v4.s[3]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ "fmla v21.4s, v13.4s, v5.s[3]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "fmla v18.4s, v14.4s, v4.s[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ "fmla v22.4s, v14.4s, v5.s[3]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "fmla v19.4s, v15.4s, v4.s[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ "fmla v23.4s, v15.4s, v5.s[3]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ "ins v9.d[1], temploadreg1\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n" // v14's upper half is left in temploadreg2; it is inserted at label 3 or label 2
+ "ins v10.d[1], temploadreg2\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ "ins v11.d[1], temploadreg3\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ "b.ne 3b\n" // repeat while the loops counter decremented above is non-zero
+ "2:\n" // label 2: tail after the main loop; first complete the pending v14/v15 loads
+ "ins v14.d[1], temploadreg2\n"
+ "prfm PSTL1KEEP, [%[c_ptr0]]\n" // prefetch-for-store hints on both output locations
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "prfm PSTL1KEEP, [c_ptr1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "cbz %[regs], 4f\n" // regs == 0: only v0/v1 remain (label 4); otherwise fall through and also load/consume v4/v5
+ "fmla v16.4s, v8.4s, v0.s[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ "fmla v20.4s, v8.4s, v1.s[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ "fmla v17.4s, v9.4s, v0.s[0]\n"
+ "ldr d5, [a_ptr1]\n"
+ "fmla v21.4s, v9.4s, v1.s[0]\n"
+ "ldr temploadreg1, [a_ptr1, #0x8]\n"
+ "fmla v18.4s, v10.4s, v0.s[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ "fmla v22.4s, v10.4s, v1.s[0]\n"
+ "ins v4.d[1], temploadreg0\n"
+ "fmla v19.4s, v11.4s, v0.s[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ "fmla v23.4s, v11.4s, v1.s[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ "fmla v16.4s, v12.4s, v0.s[1]\n"
+ "ins v5.d[1], temploadreg1\n"
+ "fmla v20.4s, v12.4s, v1.s[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ "fmla v17.4s, v13.4s, v0.s[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "fmla v21.4s, v13.4s, v1.s[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "fmla v18.4s, v14.4s, v0.s[1]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "fmla v22.4s, v14.4s, v1.s[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "fmla v19.4s, v15.4s, v0.s[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "fmla v23.4s, v15.4s, v1.s[1]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n" // step past the v4/v5 data so label 7 starts at the next unread float
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "fmla v16.4s, v8.4s, v0.s[2]\n"
+ "ins v9.d[1], temploadreg1\n"
+ "fmla v20.4s, v8.4s, v1.s[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "fmla v17.4s, v9.4s, v0.s[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "fmla v21.4s, v9.4s, v1.s[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ "fmla v18.4s, v10.4s, v0.s[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ "fmla v22.4s, v10.4s, v1.s[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ "fmla v19.4s, v11.4s, v0.s[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ "fmla v23.4s, v11.4s, v1.s[2]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ "fmla v16.4s, v12.4s, v0.s[3]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ "fmla v20.4s, v12.4s, v1.s[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ "fmla v17.4s, v13.4s, v0.s[3]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ "fmla v21.4s, v13.4s, v1.s[3]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "fmla v18.4s, v14.4s, v0.s[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ "fmla v22.4s, v14.4s, v1.s[3]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "fmla v19.4s, v15.4s, v0.s[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ "fmla v23.4s, v15.4s, v1.s[3]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ "ins v9.d[1], temploadreg1\n"
+ "fmla v16.4s, v8.4s, v4.s[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ "fmla v20.4s, v8.4s, v5.s[0]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "fmla v17.4s, v9.4s, v4.s[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ "fmla v21.4s, v9.4s, v5.s[0]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "ins v11.d[1], temploadreg3\n"
+ "fmla v18.4s, v10.4s, v4.s[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ "fmla v22.4s, v10.4s, v5.s[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "fmla v19.4s, v11.4s, v4.s[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ "fmla v23.4s, v11.4s, v5.s[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "fmla v16.4s, v12.4s, v4.s[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ "fmla v20.4s, v12.4s, v5.s[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "fmla v17.4s, v13.4s, v4.s[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "fmla v21.4s, v13.4s, v5.s[1]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "fmla v18.4s, v14.4s, v4.s[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "fmla v22.4s, v14.4s, v5.s[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "fmla v19.4s, v15.4s, v4.s[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "fmla v23.4s, v15.4s, v5.s[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "ins v9.d[1], temploadreg1\n"
+ "fmla v16.4s, v8.4s, v4.s[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "fmla v20.4s, v8.4s, v5.s[2]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "fmla v17.4s, v9.4s, v4.s[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "fmla v21.4s, v9.4s, v5.s[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ "fmla v18.4s, v10.4s, v4.s[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ "fmla v22.4s, v10.4s, v5.s[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n" // leave b_ptr0 just past the last B data consumed by this tail
+ "fmla v19.4s, v11.4s, v4.s[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "fmla v23.4s, v11.4s, v5.s[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "fmla v16.4s, v12.4s, v4.s[3]\n"
+ "fmla v20.4s, v12.4s, v5.s[3]\n"
+ "fmla v17.4s, v13.4s, v4.s[3]\n"
+ "fmla v21.4s, v13.4s, v5.s[3]\n"
+ "fmla v18.4s, v14.4s, v4.s[3]\n"
+ "fmla v22.4s, v14.4s, v5.s[3]\n"
+ "fmla v19.4s, v15.4s, v4.s[3]\n"
+ "fmla v23.4s, v15.4s, v5.s[3]\n"
+ "b 5f\n"
+ "4:\n" // label 4: tail that consumes only the four lanes already held in v0/v1
+ "fmla v16.4s, v8.4s, v0.s[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ "fmla v20.4s, v8.4s, v1.s[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ "fmla v17.4s, v9.4s, v0.s[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ "fmla v21.4s, v9.4s, v1.s[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ "fmla v18.4s, v10.4s, v0.s[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "fmla v22.4s, v10.4s, v1.s[0]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "fmla v19.4s, v11.4s, v0.s[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "fmla v23.4s, v11.4s, v1.s[0]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "fmla v16.4s, v12.4s, v0.s[1]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "fmla v20.4s, v12.4s, v1.s[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "fmla v17.4s, v13.4s, v0.s[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "fmla v21.4s, v13.4s, v1.s[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "fmla v18.4s, v14.4s, v0.s[1]\n"
+ "ins v9.d[1], temploadreg1\n"
+ "fmla v22.4s, v14.4s, v1.s[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "fmla v19.4s, v15.4s, v0.s[1]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "fmla v23.4s, v15.4s, v1.s[1]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "fmla v16.4s, v8.4s, v0.s[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "fmla v20.4s, v8.4s, v1.s[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "fmla v17.4s, v9.4s, v0.s[2]\n"
+ "ins v11.d[1], temploadreg3\n"
+ "fmla v21.4s, v9.4s, v1.s[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ "fmla v18.4s, v10.4s, v0.s[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "fmla v22.4s, v10.4s, v1.s[2]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "fmla v19.4s, v11.4s, v0.s[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "fmla v23.4s, v11.4s, v1.s[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "fmla v16.4s, v12.4s, v0.s[3]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "fmla v20.4s, v12.4s, v1.s[3]\n"
+ "fmla v17.4s, v13.4s, v0.s[3]\n"
+ "fmla v21.4s, v13.4s, v1.s[3]\n"
+ "fmla v18.4s, v14.4s, v0.s[3]\n"
+ "fmla v22.4s, v14.4s, v1.s[3]\n"
+ "fmla v19.4s, v15.4s, v0.s[3]\n"
+ "fmla v23.4s, v15.4s, v1.s[3]\n"
+ "5:\n" // label 5: both tails rejoin here
+ "cbz %[blocks], 6f\n" // blocks == 0: nothing left, go store
+ "7:\n" // label 7: remainder loop; each iteration reads one float per A pointer and 0x40 bytes of B
+ "ldr q8, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n" // flags consumed by "b.ne 7b" below
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr s0, [%[a_ptr0]]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x4\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
+ "fmla v16.4s, v8.4s, v0.s[0]\n"
+ "ldr s1, [a_ptr1]\n"
+ "fmla v17.4s, v9.4s, v0.s[0]\n"
+ "add a_ptr1, a_ptr1, #0x4\n"
+ "fmla v18.4s, v10.4s, v0.s[0]\n"
+ "fmla v20.4s, v8.4s, v1.s[0]\n"
+ "fmla v21.4s, v9.4s, v1.s[0]\n"
+ "fmla v22.4s, v10.4s, v1.s[0]\n"
+ "fmla v19.4s, v11.4s, v0.s[0]\n"
+ "fmla v23.4s, v11.4s, v1.s[0]\n"
+ "b.ne 7b\n"
+ "6:\n" // label 6: write the 2 x 16 accumulated floats back to C
+ "str q16, [%[c_ptr0]]\n"
+ "str q17, [%[c_ptr0], #0x10]\n"
+ "str q18, [%[c_ptr0], #0x20]\n"
+ "str q19, [%[c_ptr0], #0x30]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x40\n" // c_ptr0 is a read-write operand, so this 0x40-byte advance is visible after the asm
+ "str q20, [c_ptr1]\n"
+ "str q21, [c_ptr1, #0x10]\n"
+ "str q22, [c_ptr1, #0x20]\n"
+ "str q23, [c_ptr1, #0x30]\n"
+ ".unreq a_ptr1\n" // drop the aliases so the other asm blocks in this switch can bind the same names to different registers
+ ".unreq c_ptr1\n"
+ ".unreq temploadreg0\n"
+ ".unreq temploadreg1\n"
+ ".unreq temploadreg2\n"
+ ".unreq temploadreg3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks) // read-write operands
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb) // read-only inputs; %[width] is not referenced by this asm body
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory" // clobbers: all vector registers, x0-x5 (the .req aliases above), condition flags, memory
+ );
+ break;
+ case 3:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "c_ptr1 .req X2\n"
+ "c_ptr2 .req X3\n"
+ "temploadreg0 .req X4\n"
+ "temploadreg1 .req X5\n"
+ "temploadreg2 .req X6\n"
+ "temploadreg3 .req X7\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "cbz %[beta0], 1f\n"
+ "movi v16.4s, #0\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "movi v17.4s, #0\n"
+ "ldr q1, [a_ptr1]\n"
+ "movi v18.4s, #0\n"
+ "ldr q2, [a_ptr2]\n"
+ "movi v19.4s, #0\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "movi v20.4s, #0\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "movi v21.4s, #0\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "movi v22.4s, #0\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "movi v23.4s, #0\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "movi v24.4s, #0\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "movi v25.4s, #0\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "movi v26.4s, #0\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "movi v27.4s, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ins v14.d[1], temploadreg2\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1r {v15.4s}, [%[betaptr]]\n"
+ "ldr q16, [%[c_ptr0]]\n"
+ "ldr q17, [%[c_ptr0], #0x10]\n"
+ "ldr q18, [%[c_ptr0], #0x20]\n"
+ "ldr q19, [%[c_ptr0], #0x30]\n"
+ "fmul v16.4s, v16.4s, v15.4s\n"
+ "ldr q20, [c_ptr1]\n"
+ "fmul v17.4s, v17.4s, v15.4s\n"
+ "ldr q21, [c_ptr1, #0x10]\n"
+ "fmul v18.4s, v18.4s, v15.4s\n"
+ "ldr q22, [c_ptr1, #0x20]\n"
+ "fmul v19.4s, v19.4s, v15.4s\n"
+ "ldr q23, [c_ptr1, #0x30]\n"
+ "fmul v20.4s, v20.4s, v15.4s\n"
+ "ldr q24, [c_ptr2]\n"
+ "fmul v21.4s, v21.4s, v15.4s\n"
+ "ldr q25, [c_ptr2, #0x10]\n"
+ "fmul v22.4s, v22.4s, v15.4s\n"
+ "ldr q26, [c_ptr2, #0x20]\n"
+ "fmul v23.4s, v23.4s, v15.4s\n"
+ "ldr q27, [c_ptr2, #0x30]\n"
+ "fmul v24.4s, v24.4s, v15.4s\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "fmul v25.4s, v25.4s, v15.4s\n"
+ "ldr q1, [a_ptr1]\n"
+ "fmul v26.4s, v26.4s, v15.4s\n"
+ "ldr q2, [a_ptr2]\n"
+ "fmul v27.4s, v27.4s, v15.4s\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "ins v14.d[1], temploadreg2\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ "fmla v16.4s, v8.4s, v0.s[0]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "fmla v20.4s, v8.4s, v1.s[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ "fmla v24.4s, v8.4s, v2.s[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ "fmla v17.4s, v9.4s, v0.s[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ "fmla v21.4s, v9.4s, v1.s[0]\n"
+ "ldr d5, [a_ptr1]\n"
+ "fmla v25.4s, v9.4s, v2.s[0]\n"
+ "ldr temploadreg1, [a_ptr1, #0x8]\n"
+ "fmla v18.4s, v10.4s, v0.s[0]\n"
+ "ldr d6, [a_ptr2]\n"
+ "fmla v22.4s, v10.4s, v1.s[0]\n"
+ "ldr temploadreg2, [a_ptr2, #0x8]\n"
+ "fmla v26.4s, v10.4s, v2.s[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ "fmla v19.4s, v11.4s, v0.s[0]\n"
+ "ins v4.d[1], temploadreg0\n"
+ "fmla v23.4s, v11.4s, v1.s[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ "fmla v27.4s, v11.4s, v2.s[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ "fmla v16.4s, v12.4s, v0.s[1]\n"
+ "ins v5.d[1], temploadreg1\n"
+ "fmla v20.4s, v12.4s, v1.s[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ "fmla v24.4s, v12.4s, v2.s[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "fmla v17.4s, v13.4s, v0.s[1]\n"
+ "ins v6.d[1], temploadreg2\n"
+ "fmla v21.4s, v13.4s, v1.s[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "fmla v25.4s, v13.4s, v2.s[1]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "fmla v18.4s, v14.4s, v0.s[1]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "fmla v22.4s, v14.4s, v1.s[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "fmla v26.4s, v14.4s, v2.s[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "fmla v19.4s, v15.4s, v0.s[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "fmla v23.4s, v15.4s, v1.s[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "fmla v27.4s, v15.4s, v2.s[1]\n"
+ "ins v9.d[1], temploadreg1\n"
+ "fmla v16.4s, v8.4s, v0.s[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "fmla v20.4s, v8.4s, v1.s[2]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "fmla v24.4s, v8.4s, v2.s[2]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "fmla v17.4s, v9.4s, v0.s[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "fmla v21.4s, v9.4s, v1.s[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "fmla v25.4s, v9.4s, v2.s[2]\n"
+ "ins v11.d[1], temploadreg3\n"
+ "fmla v18.4s, v10.4s, v0.s[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ "fmla v22.4s, v10.4s, v1.s[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "fmla v26.4s, v10.4s, v2.s[2]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "fmla v19.4s, v11.4s, v0.s[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "fmla v23.4s, v11.4s, v1.s[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "fmla v27.4s, v11.4s, v2.s[2]\n"
+ "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+ "fmla v16.4s, v12.4s, v0.s[3]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ "fmla v20.4s, v12.4s, v1.s[3]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ "fmla v24.4s, v12.4s, v2.s[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ "fmla v17.4s, v13.4s, v0.s[3]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ "fmla v21.4s, v13.4s, v1.s[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ "fmla v25.4s, v13.4s, v2.s[3]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ "fmla v18.4s, v14.4s, v0.s[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ "fmla v22.4s, v14.4s, v1.s[3]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ "fmla v26.4s, v14.4s, v2.s[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ "fmla v19.4s, v15.4s, v0.s[3]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ "fmla v23.4s, v15.4s, v1.s[3]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "fmla v27.4s, v15.4s, v2.s[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "fmla v16.4s, v8.4s, v4.s[0]\n"
+ "ins v9.d[1], temploadreg1\n"
+ "fmla v20.4s, v8.4s, v5.s[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ "fmla v24.4s, v8.4s, v6.s[0]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ "fmla v17.4s, v9.4s, v4.s[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ "fmla v21.4s, v9.4s, v5.s[0]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "fmla v25.4s, v9.4s, v6.s[0]\n"
+ "ins v11.d[1], temploadreg3\n"
+ "fmla v18.4s, v10.4s, v4.s[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ "fmla v22.4s, v10.4s, v5.s[0]\n"
+ "ldr d0, [%[a_ptr0], #-0x10]\n"
+ "fmla v26.4s, v10.4s, v6.s[0]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "fmla v19.4s, v11.4s, v4.s[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
+ "fmla v23.4s, v11.4s, v5.s[0]\n"
+ "ldr d1, [a_ptr1, #-0x10]\n"
+ "fmla v27.4s, v11.4s, v6.s[0]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "fmla v16.4s, v12.4s, v4.s[1]\n"
+ "ldr temploadreg1, [a_ptr1, #-0x8]\n"
+ "fmla v20.4s, v12.4s, v5.s[1]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "fmla v24.4s, v12.4s, v6.s[1]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ "fmla v17.4s, v13.4s, v4.s[1]\n"
+ "ins v0.d[1], temploadreg0\n"
+ "fmla v21.4s, v13.4s, v5.s[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ "fmla v25.4s, v13.4s, v6.s[1]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ "fmla v18.4s, v14.4s, v4.s[1]\n"
+ "ins v1.d[1], temploadreg1\n"
+ "fmla v22.4s, v14.4s, v5.s[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ "fmla v26.4s, v14.4s, v6.s[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
+ "ins v15.d[1], temploadreg3\n"
+ "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+ "ldr d2, [a_ptr2, #-0x10]\n"
+ "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
+ "fmla v19.4s, v15.4s, v4.s[1]\n"
+ "ldr temploadreg2, [a_ptr2, #-0x8]\n"
+ "fmla v23.4s, v15.4s, v5.s[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "fmla v27.4s, v15.4s, v6.s[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "ins v2.d[1], temploadreg2\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "fmla v16.4s, v8.4s, v4.s[2]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "fmla v20.4s, v8.4s, v5.s[2]\n"
+ "ins v9.d[1], temploadreg1\n"
+ "fmla v24.4s, v8.4s, v6.s[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "fmla v17.4s, v9.4s, v4.s[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "fmla v21.4s, v9.4s, v5.s[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "fmla v25.4s, v9.4s, v6.s[2]\n"
+ "ins v11.d[1], temploadreg3\n"
+ "fmla v18.4s, v10.4s, v4.s[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ "fmla v22.4s, v10.4s, v5.s[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "fmla v26.4s, v10.4s, v6.s[2]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "fmla v19.4s, v11.4s, v4.s[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "fmla v23.4s, v11.4s, v5.s[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "fmla v27.4s, v11.4s, v6.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ "fmla v16.4s, v12.4s, v4.s[3]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ "fmla v20.4s, v12.4s, v5.s[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ "fmla v24.4s, v12.4s, v6.s[3]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ "fmla v17.4s, v13.4s, v4.s[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ "fmla v21.4s, v13.4s, v5.s[3]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ "fmla v25.4s, v13.4s, v6.s[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ "fmla v18.4s, v14.4s, v4.s[3]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ "fmla v22.4s, v14.4s, v5.s[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ "fmla v26.4s, v14.4s, v6.s[3]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ "fmla v19.4s, v15.4s, v4.s[3]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "fmla v23.4s, v15.4s, v5.s[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ "fmla v27.4s, v15.4s, v6.s[3]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ "ins v9.d[1], temploadreg1\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ "ins v11.d[1], temploadreg3\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ "ins v14.d[1], temploadreg2\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ "prfm PSTL1KEEP, [c_ptr1]\n"
+ "prfm PSTL1KEEP, [c_ptr2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "cbz %[regs], 4f\n"
+ "fmla v16.4s, v8.4s, v0.s[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ "fmla v20.4s, v8.4s, v1.s[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ "fmla v24.4s, v8.4s, v2.s[0]\n"
+ "ldr d5, [a_ptr1]\n"
+ "fmla v17.4s, v9.4s, v0.s[0]\n"
+ "ldr temploadreg1, [a_ptr1, #0x8]\n"
+ "fmla v21.4s, v9.4s, v1.s[0]\n"
+ "ldr d6, [a_ptr2]\n"
+ "fmla v25.4s, v9.4s, v2.s[0]\n"
+ "ldr temploadreg2, [a_ptr2, #0x8]\n"
+ "fmla v18.4s, v10.4s, v0.s[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ "fmla v22.4s, v10.4s, v1.s[0]\n"
+ "ins v4.d[1], temploadreg0\n"
+ "fmla v26.4s, v10.4s, v2.s[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ "fmla v19.4s, v11.4s, v0.s[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ "fmla v23.4s, v11.4s, v1.s[0]\n"
+ "ins v5.d[1], temploadreg1\n"
+ "fmla v27.4s, v11.4s, v2.s[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ "fmla v16.4s, v12.4s, v0.s[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "fmla v20.4s, v12.4s, v1.s[1]\n"
+ "ins v6.d[1], temploadreg2\n"
+ "fmla v24.4s, v12.4s, v2.s[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "fmla v17.4s, v13.4s, v0.s[1]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "fmla v21.4s, v13.4s, v1.s[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "fmla v25.4s, v13.4s, v2.s[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "fmla v18.4s, v14.4s, v0.s[1]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "fmla v22.4s, v14.4s, v1.s[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "fmla v26.4s, v14.4s, v2.s[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "fmla v19.4s, v15.4s, v0.s[1]\n"
+ "ins v9.d[1], temploadreg1\n"
+ "fmla v23.4s, v15.4s, v1.s[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "fmla v27.4s, v15.4s, v2.s[1]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "fmla v16.4s, v8.4s, v0.s[2]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "fmla v20.4s, v8.4s, v1.s[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "fmla v24.4s, v8.4s, v2.s[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "fmla v17.4s, v9.4s, v0.s[2]\n"
+ "ins v11.d[1], temploadreg3\n"
+ "fmla v21.4s, v9.4s, v1.s[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ "fmla v25.4s, v9.4s, v2.s[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "fmla v18.4s, v10.4s, v0.s[2]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "fmla v22.4s, v10.4s, v1.s[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "fmla v26.4s, v10.4s, v2.s[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "fmla v19.4s, v11.4s, v0.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ "fmla v23.4s, v11.4s, v1.s[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ "fmla v27.4s, v11.4s, v2.s[2]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ "fmla v16.4s, v12.4s, v0.s[3]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ "fmla v20.4s, v12.4s, v1.s[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ "fmla v24.4s, v12.4s, v2.s[3]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ "fmla v17.4s, v13.4s, v0.s[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ "fmla v21.4s, v13.4s, v1.s[3]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ "fmla v25.4s, v13.4s, v2.s[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ "fmla v18.4s, v14.4s, v0.s[3]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ "fmla v22.4s, v14.4s, v1.s[3]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "fmla v26.4s, v14.4s, v2.s[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ "fmla v19.4s, v15.4s, v0.s[3]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ "fmla v23.4s, v15.4s, v1.s[3]\n"
+ "ins v9.d[1], temploadreg1\n"
+ "fmla v27.4s, v15.4s, v2.s[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ "fmla v16.4s, v8.4s, v4.s[0]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ "fmla v20.4s, v8.4s, v5.s[0]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "fmla v24.4s, v8.4s, v6.s[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ "fmla v17.4s, v9.4s, v4.s[0]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "fmla v21.4s, v9.4s, v5.s[0]\n"
+ "ins v11.d[1], temploadreg3\n"
+ "fmla v25.4s, v9.4s, v6.s[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ "fmla v18.4s, v10.4s, v4.s[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ "fmla v22.4s, v10.4s, v5.s[0]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "fmla v26.4s, v10.4s, v6.s[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ "fmla v19.4s, v11.4s, v4.s[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ "fmla v23.4s, v11.4s, v5.s[0]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "fmla v27.4s, v11.4s, v6.s[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ "fmla v16.4s, v12.4s, v4.s[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "fmla v20.4s, v12.4s, v5.s[1]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "fmla v24.4s, v12.4s, v6.s[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "fmla v17.4s, v13.4s, v4.s[1]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "fmla v21.4s, v13.4s, v5.s[1]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "fmla v25.4s, v13.4s, v6.s[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "fmla v18.4s, v14.4s, v4.s[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "fmla v22.4s, v14.4s, v5.s[1]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "fmla v26.4s, v14.4s, v6.s[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "fmla v19.4s, v15.4s, v4.s[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "fmla v23.4s, v15.4s, v5.s[1]\n"
+ "ins v9.d[1], temploadreg1\n"
+ "fmla v27.4s, v15.4s, v6.s[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "fmla v16.4s, v8.4s, v4.s[2]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "fmla v20.4s, v8.4s, v5.s[2]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "fmla v24.4s, v8.4s, v6.s[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "fmla v17.4s, v9.4s, v4.s[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "fmla v21.4s, v9.4s, v5.s[2]\n"
+ "ins v11.d[1], temploadreg3\n"
+ "fmla v25.4s, v9.4s, v6.s[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ "fmla v18.4s, v10.4s, v4.s[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "fmla v22.4s, v10.4s, v5.s[2]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "fmla v26.4s, v10.4s, v6.s[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "fmla v19.4s, v11.4s, v4.s[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "fmla v23.4s, v11.4s, v5.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "fmla v27.4s, v11.4s, v6.s[2]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "fmla v16.4s, v12.4s, v4.s[3]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "fmla v20.4s, v12.4s, v5.s[3]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "fmla v24.4s, v12.4s, v6.s[3]\n"
+ "fmla v17.4s, v13.4s, v4.s[3]\n"
+ "fmla v21.4s, v13.4s, v5.s[3]\n"
+ "fmla v25.4s, v13.4s, v6.s[3]\n"
+ "fmla v18.4s, v14.4s, v4.s[3]\n"
+ "fmla v22.4s, v14.4s, v5.s[3]\n"
+ "fmla v26.4s, v14.4s, v6.s[3]\n"
+ "fmla v19.4s, v15.4s, v4.s[3]\n"
+ "fmla v23.4s, v15.4s, v5.s[3]\n"
+ "fmla v27.4s, v15.4s, v6.s[3]\n"
+ "b 5f\n"
+ "4:\n"
+ "fmla v16.4s, v8.4s, v0.s[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ "fmla v20.4s, v8.4s, v1.s[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ "fmla v24.4s, v8.4s, v2.s[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ "fmla v17.4s, v9.4s, v0.s[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "fmla v21.4s, v9.4s, v1.s[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "fmla v25.4s, v9.4s, v2.s[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ "fmla v18.4s, v10.4s, v0.s[0]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "fmla v22.4s, v10.4s, v1.s[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "fmla v26.4s, v10.4s, v2.s[0]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "fmla v19.4s, v11.4s, v0.s[0]\n"
+ "ins v9.d[1], temploadreg1\n"
+ "fmla v23.4s, v11.4s, v1.s[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "fmla v27.4s, v11.4s, v2.s[0]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "fmla v16.4s, v12.4s, v0.s[1]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "fmla v20.4s, v12.4s, v1.s[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "fmla v24.4s, v12.4s, v2.s[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "fmla v17.4s, v13.4s, v0.s[1]\n"
+ "ins v11.d[1], temploadreg3\n"
+ "fmla v21.4s, v13.4s, v1.s[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ "fmla v25.4s, v13.4s, v2.s[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "fmla v18.4s, v14.4s, v0.s[1]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "fmla v22.4s, v14.4s, v1.s[1]\n"
+ "fmla v26.4s, v14.4s, v2.s[1]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "fmla v19.4s, v15.4s, v0.s[1]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "fmla v23.4s, v15.4s, v1.s[1]\n"
+ "fmla v27.4s, v15.4s, v2.s[1]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "fmla v16.4s, v8.4s, v0.s[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "fmla v20.4s, v8.4s, v1.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "fmla v24.4s, v8.4s, v2.s[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "fmla v17.4s, v9.4s, v0.s[2]\n"
+ "fmla v21.4s, v9.4s, v1.s[2]\n"
+ "fmla v25.4s, v9.4s, v2.s[2]\n"
+ "fmla v18.4s, v10.4s, v0.s[2]\n"
+ "fmla v22.4s, v10.4s, v1.s[2]\n"
+ "fmla v26.4s, v10.4s, v2.s[2]\n"
+ "fmla v19.4s, v11.4s, v0.s[2]\n"
+ "fmla v23.4s, v11.4s, v1.s[2]\n"
+ "fmla v27.4s, v11.4s, v2.s[2]\n"
+ "fmla v16.4s, v12.4s, v0.s[3]\n"
+ "fmla v20.4s, v12.4s, v1.s[3]\n"
+ "fmla v24.4s, v12.4s, v2.s[3]\n"
+ "fmla v17.4s, v13.4s, v0.s[3]\n"
+ "fmla v21.4s, v13.4s, v1.s[3]\n"
+ "fmla v25.4s, v13.4s, v2.s[3]\n"
+ "fmla v18.4s, v14.4s, v0.s[3]\n"
+ "fmla v22.4s, v14.4s, v1.s[3]\n"
+ "fmla v26.4s, v14.4s, v2.s[3]\n"
+ "fmla v19.4s, v15.4s, v0.s[3]\n"
+ "fmla v23.4s, v15.4s, v1.s[3]\n"
+ "fmla v27.4s, v15.4s, v2.s[3]\n"
+ "5:\n"
+ "cbz %[blocks], 6f\n"
+ "7:\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr s0, [%[a_ptr0]]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x4\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
+ "fmla v16.4s, v8.4s, v0.s[0]\n"
+ "ldr s1, [a_ptr1]\n"
+ "fmla v17.4s, v9.4s, v0.s[0]\n"
+ "add a_ptr1, a_ptr1, #0x4\n"
+ "fmla v18.4s, v10.4s, v0.s[0]\n"
+ "ldr s2, [a_ptr2]\n"
+ "fmla v20.4s, v8.4s, v1.s[0]\n"
+ "add a_ptr2, a_ptr2, #0x4\n"
+ "fmla v21.4s, v9.4s, v1.s[0]\n"
+ "fmla v24.4s, v8.4s, v2.s[0]\n"
+ "fmla v25.4s, v9.4s, v2.s[0]\n"
+ "fmla v22.4s, v10.4s, v1.s[0]\n"
+ "fmla v26.4s, v10.4s, v2.s[0]\n"
+ "fmla v19.4s, v11.4s, v0.s[0]\n"
+ "fmla v23.4s, v11.4s, v1.s[0]\n"
+ "fmla v27.4s, v11.4s, v2.s[0]\n"
+ "b.ne 7b\n"
+ "6:\n"
+ "str q16, [%[c_ptr0]]\n"
+ "str q17, [%[c_ptr0], #0x10]\n"
+ "str q18, [%[c_ptr0], #0x20]\n"
+ "str q19, [%[c_ptr0], #0x30]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x40\n"
+ "str q20, [c_ptr1]\n"
+ "str q21, [c_ptr1, #0x10]\n"
+ "str q22, [c_ptr1, #0x20]\n"
+ "str q23, [c_ptr1, #0x30]\n"
+ "str q24, [c_ptr2]\n"
+ "str q25, [c_ptr2, #0x10]\n"
+ "str q26, [c_ptr2, #0x20]\n"
+ "str q27, [c_ptr2, #0x30]\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq temploadreg0\n"
+ ".unreq temploadreg1\n"
+ ".unreq temploadreg2\n"
+ ".unreq temploadreg3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "cc", "memory"
+ );
+ break;
+ default:
+ case 4:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "temploadreg0 .req X6\n"
+ "temploadreg1 .req X7\n"
+ "temploadreg2 .req X8\n"
+ "temploadreg3 .req X9\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "cbz %[beta0], 1f\n"
+ "movi v16.4s, #0\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "movi v17.4s, #0\n"
+ "ldr q1, [a_ptr1]\n"
+ "movi v18.4s, #0\n"
+ "ldr q2, [a_ptr2]\n"
+ "movi v19.4s, #0\n"
+ "ldr q3, [a_ptr3]\n"
+ "movi v20.4s, #0\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "movi v21.4s, #0\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "movi v22.4s, #0\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "movi v23.4s, #0\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "movi v24.4s, #0\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "movi v25.4s, #0\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "movi v26.4s, #0\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "movi v27.4s, #0\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "movi v28.4s, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "movi v29.4s, #0\n"
+ "ins v14.d[1], temploadreg2\n"
+ "movi v30.4s, #0\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "movi v31.4s, #0\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1r {v15.4s}, [%[betaptr]]\n"
+ "ldr q16, [%[c_ptr0]]\n"
+ "ldr q17, [%[c_ptr0], #0x10]\n"
+ "ldr q18, [%[c_ptr0], #0x20]\n"
+ "ldr q19, [%[c_ptr0], #0x30]\n"
+ "fmul v16.4s, v16.4s, v15.4s\n"
+ "ldr q20, [c_ptr1]\n"
+ "fmul v17.4s, v17.4s, v15.4s\n"
+ "ldr q21, [c_ptr1, #0x10]\n"
+ "fmul v18.4s, v18.4s, v15.4s\n"
+ "ldr q22, [c_ptr1, #0x20]\n"
+ "fmul v19.4s, v19.4s, v15.4s\n"
+ "ldr q23, [c_ptr1, #0x30]\n"
+ "fmul v20.4s, v20.4s, v15.4s\n"
+ "ldr q24, [c_ptr2]\n"
+ "fmul v21.4s, v21.4s, v15.4s\n"
+ "ldr q25, [c_ptr2, #0x10]\n"
+ "fmul v22.4s, v22.4s, v15.4s\n"
+ "ldr q26, [c_ptr2, #0x20]\n"
+ "fmul v23.4s, v23.4s, v15.4s\n"
+ "ldr q27, [c_ptr2, #0x30]\n"
+ "fmul v24.4s, v24.4s, v15.4s\n"
+ "ldr q28, [c_ptr3]\n"
+ "fmul v25.4s, v25.4s, v15.4s\n"
+ "ldr q29, [c_ptr3, #0x10]\n"
+ "fmul v26.4s, v26.4s, v15.4s\n"
+ "ldr q30, [c_ptr3, #0x20]\n"
+ "fmul v27.4s, v27.4s, v15.4s\n"
+ "ldr q31, [c_ptr3, #0x30]\n"
+ "fmul v28.4s, v28.4s, v15.4s\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "fmul v29.4s, v29.4s, v15.4s\n"
+ "ldr q1, [a_ptr1]\n"
+ "fmul v30.4s, v30.4s, v15.4s\n"
+ "ldr q2, [a_ptr2]\n"
+ "fmul v31.4s, v31.4s, v15.4s\n"
+ "ldr q3, [a_ptr3]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "ins v14.d[1], temploadreg2\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ "fmla v16.4s, v8.4s, v0.s[0]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "fmla v20.4s, v8.4s, v1.s[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ "fmla v24.4s, v8.4s, v2.s[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ "fmla v28.4s, v8.4s, v3.s[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ "fmla v17.4s, v9.4s, v0.s[0]\n"
+ "ldr d5, [a_ptr1]\n"
+ "fmla v21.4s, v9.4s, v1.s[0]\n"
+ "ldr temploadreg1, [a_ptr1, #0x8]\n"
+ "fmla v25.4s, v9.4s, v2.s[0]\n"
+ "ldr d6, [a_ptr2]\n"
+ "fmla v29.4s, v9.4s, v3.s[0]\n"
+ "ldr temploadreg2, [a_ptr2, #0x8]\n"
+ "fmla v18.4s, v10.4s, v0.s[0]\n"
+ "ldr d7, [a_ptr3]\n"
+ "fmla v22.4s, v10.4s, v1.s[0]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "fmla v26.4s, v10.4s, v2.s[0]\n"
+ "ldr temploadreg3, [a_ptr3, #0x8]\n"
+ "fmla v30.4s, v10.4s, v3.s[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ "fmla v19.4s, v11.4s, v0.s[0]\n"
+ "ins v4.d[1], temploadreg0\n"
+ "fmla v23.4s, v11.4s, v1.s[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ "fmla v27.4s, v11.4s, v2.s[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ "fmla v31.4s, v11.4s, v3.s[0]\n"
+ "ins v5.d[1], temploadreg1\n"
+ "fmla v16.4s, v12.4s, v0.s[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ "fmla v20.4s, v12.4s, v1.s[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "fmla v24.4s, v12.4s, v2.s[1]\n"
+ "ins v6.d[1], temploadreg2\n"
+ "fmla v28.4s, v12.4s, v3.s[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "fmla v17.4s, v13.4s, v0.s[1]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "fmla v21.4s, v13.4s, v1.s[1]\n"
+ "ins v7.d[1], temploadreg3\n"
+ "fmla v25.4s, v13.4s, v2.s[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "fmla v29.4s, v13.4s, v3.s[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "fmla v18.4s, v14.4s, v0.s[1]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "fmla v22.4s, v14.4s, v1.s[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "fmla v26.4s, v14.4s, v2.s[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "fmla v30.4s, v14.4s, v3.s[1]\n"
+ "ins v9.d[1], temploadreg1\n"
+ "fmla v19.4s, v15.4s, v0.s[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "fmla v23.4s, v15.4s, v1.s[1]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "fmla v27.4s, v15.4s, v2.s[1]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "fmla v31.4s, v15.4s, v3.s[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "fmla v16.4s, v8.4s, v0.s[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "fmla v20.4s, v8.4s, v1.s[2]\n"
+ "ins v11.d[1], temploadreg3\n"
+ "fmla v24.4s, v8.4s, v2.s[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ "fmla v28.4s, v8.4s, v3.s[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "fmla v17.4s, v9.4s, v0.s[2]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "fmla v21.4s, v9.4s, v1.s[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "fmla v25.4s, v9.4s, v2.s[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "fmla v29.4s, v9.4s, v3.s[2]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "fmla v18.4s, v10.4s, v0.s[2]\n"
+ "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+ "fmla v22.4s, v10.4s, v1.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ "fmla v26.4s, v10.4s, v2.s[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ "fmla v30.4s, v10.4s, v3.s[2]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ "fmla v19.4s, v11.4s, v0.s[2]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ "fmla v23.4s, v11.4s, v1.s[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ "fmla v27.4s, v11.4s, v2.s[2]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ "fmla v31.4s, v11.4s, v3.s[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ "fmla v16.4s, v12.4s, v0.s[3]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ "fmla v20.4s, v12.4s, v1.s[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ "fmla v24.4s, v12.4s, v2.s[3]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "fmla v28.4s, v12.4s, v3.s[3]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ "fmla v17.4s, v13.4s, v0.s[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ "fmla v21.4s, v13.4s, v1.s[3]\n"
+ "ins v9.d[1], temploadreg1\n"
+ "fmla v25.4s, v13.4s, v2.s[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ "fmla v29.4s, v13.4s, v3.s[3]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ "fmla v18.4s, v14.4s, v0.s[3]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "fmla v22.4s, v14.4s, v1.s[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ "fmla v26.4s, v14.4s, v2.s[3]\n"
+ "ins v11.d[1], temploadreg3\n"
+ "fmla v30.4s, v14.4s, v3.s[3]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ "fmla v19.4s, v15.4s, v0.s[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ "fmla v23.4s, v15.4s, v1.s[3]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "fmla v27.4s, v15.4s, v2.s[3]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "fmla v31.4s, v15.4s, v3.s[3]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "fmla v16.4s, v8.4s, v4.s[0]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "fmla v20.4s, v8.4s, v5.s[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "fmla v24.4s, v8.4s, v6.s[0]\n"
+ "ldr d0, [%[a_ptr0], #-0x10]\n"
+ "fmla v28.4s, v8.4s, v7.s[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
+ "fmla v17.4s, v9.4s, v4.s[0]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "fmla v21.4s, v9.4s, v5.s[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ "fmla v25.4s, v9.4s, v6.s[0]\n"
+ "ins v0.d[1], temploadreg0\n"
+ "fmla v29.4s, v9.4s, v7.s[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ "fmla v18.4s, v10.4s, v4.s[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ "fmla v22.4s, v10.4s, v5.s[0]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ "fmla v26.4s, v10.4s, v6.s[0]\n"
+ "ldr d1, [a_ptr1, #-0x10]\n"
+ "fmla v30.4s, v10.4s, v7.s[0]\n"
+ "ldr temploadreg1, [a_ptr1, #-0x8]\n"
+ "fmla v19.4s, v11.4s, v4.s[0]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "fmla v23.4s, v11.4s, v5.s[0]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "fmla v27.4s, v11.4s, v6.s[0]\n"
+ "ins v1.d[1], temploadreg1\n"
+ "fmla v31.4s, v11.4s, v7.s[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ "fmla v16.4s, v12.4s, v4.s[1]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "fmla v20.4s, v12.4s, v5.s[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "fmla v24.4s, v12.4s, v6.s[1]\n"
+ "ins v9.d[1], temploadreg1\n"
+ "fmla v28.4s, v12.4s, v7.s[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "fmla v17.4s, v13.4s, v4.s[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "fmla v21.4s, v13.4s, v5.s[1]\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
+ "fmla v25.4s, v13.4s, v6.s[1]\n"
+ "ldr d2, [a_ptr2, #-0x10]\n"
+ "fmla v29.4s, v13.4s, v7.s[1]\n"
+ "ldr temploadreg2, [a_ptr2, #-0x8]\n"
+ "fmla v18.4s, v14.4s, v4.s[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "fmla v22.4s, v14.4s, v5.s[1]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "fmla v26.4s, v14.4s, v6.s[1]\n"
+ "ins v2.d[1], temploadreg2\n"
+ "fmla v30.4s, v14.4s, v7.s[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "fmla v19.4s, v15.4s, v4.s[1]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "fmla v23.4s, v15.4s, v5.s[1]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "fmla v27.4s, v15.4s, v6.s[1]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "fmla v31.4s, v15.4s, v7.s[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "fmla v16.4s, v8.4s, v4.s[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "fmla v20.4s, v8.4s, v5.s[2]\n"
+ "add a_ptr3, a_ptr3, #0x20\n"
+ "fmla v24.4s, v8.4s, v6.s[2]\n"
+ "ldr d3, [a_ptr3, #-0x10]\n"
+ "fmla v28.4s, v8.4s, v7.s[2]\n"
+ "ldr temploadreg3, [a_ptr3, #-0x8]\n"
+ "fmla v17.4s, v9.4s, v4.s[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "fmla v21.4s, v9.4s, v5.s[2]\n"
+ "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+ "fmla v25.4s, v9.4s, v6.s[2]\n"
+ "ins v3.d[1], temploadreg3\n"
+ "fmla v29.4s, v9.4s, v7.s[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "fmla v18.4s, v10.4s, v4.s[2]\n"
+ "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
+ "fmla v22.4s, v10.4s, v5.s[2]\n"
+ "ins v11.d[1], temploadreg3\n"
+ "fmla v26.4s, v10.4s, v6.s[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ "fmla v30.4s, v10.4s, v7.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ "fmla v19.4s, v11.4s, v4.s[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ "fmla v23.4s, v11.4s, v5.s[2]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ "fmla v27.4s, v11.4s, v6.s[2]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ "fmla v31.4s, v11.4s, v7.s[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ "fmla v16.4s, v12.4s, v4.s[3]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ "fmla v20.4s, v12.4s, v5.s[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ "fmla v24.4s, v12.4s, v6.s[3]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ "fmla v28.4s, v12.4s, v7.s[3]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "fmla v17.4s, v13.4s, v4.s[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ "fmla v21.4s, v13.4s, v5.s[3]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ "fmla v25.4s, v13.4s, v6.s[3]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "fmla v29.4s, v13.4s, v7.s[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ "fmla v18.4s, v14.4s, v4.s[3]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ "fmla v22.4s, v14.4s, v5.s[3]\n"
+ "ins v9.d[1], temploadreg1\n"
+ "fmla v26.4s, v14.4s, v6.s[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ "fmla v30.4s, v14.4s, v7.s[3]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ "fmla v19.4s, v15.4s, v4.s[3]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "fmla v23.4s, v15.4s, v5.s[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ "fmla v27.4s, v15.4s, v6.s[3]\n"
+ "ins v11.d[1], temploadreg3\n"
+ "fmla v31.4s, v15.4s, v7.s[3]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ "prfm PSTL1KEEP, [c_ptr1]\n"
+ "prfm PSTL1KEEP, [c_ptr2]\n"
+ "prfm PSTL1KEEP, [c_ptr3]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "cbz %[regs], 4f\n"
+ "fmla v16.4s, v8.4s, v0.s[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ "fmla v20.4s, v8.4s, v1.s[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ "fmla v24.4s, v8.4s, v2.s[0]\n"
+ "ldr d5, [a_ptr1]\n"
+ "fmla v28.4s, v8.4s, v3.s[0]\n"
+ "ldr temploadreg1, [a_ptr1, #0x8]\n"
+ "fmla v17.4s, v9.4s, v0.s[0]\n"
+ "ldr d6, [a_ptr2]\n"
+ "fmla v21.4s, v9.4s, v1.s[0]\n"
+ "ldr temploadreg2, [a_ptr2, #0x8]\n"
+ "fmla v25.4s, v9.4s, v2.s[0]\n"
+ "ldr d7, [a_ptr3]\n"
+ "fmla v29.4s, v9.4s, v3.s[0]\n"
+ "ldr temploadreg3, [a_ptr3, #0x8]\n"
+ "fmla v18.4s, v10.4s, v0.s[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ "fmla v22.4s, v10.4s, v1.s[0]\n"
+ "ins v4.d[1], temploadreg0\n"
+ "fmla v26.4s, v10.4s, v2.s[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ "fmla v30.4s, v10.4s, v3.s[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ "fmla v19.4s, v11.4s, v0.s[0]\n"
+ "ins v5.d[1], temploadreg1\n"
+ "fmla v23.4s, v11.4s, v1.s[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ "fmla v27.4s, v11.4s, v2.s[0]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "fmla v31.4s, v11.4s, v3.s[0]\n"
+ "ins v6.d[1], temploadreg2\n"
+ "fmla v16.4s, v12.4s, v0.s[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "fmla v20.4s, v12.4s, v1.s[1]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "fmla v24.4s, v12.4s, v2.s[1]\n"
+ "ins v7.d[1], temploadreg3\n"
+ "fmla v28.4s, v12.4s, v3.s[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "fmla v17.4s, v13.4s, v0.s[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "fmla v21.4s, v13.4s, v1.s[1]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "fmla v25.4s, v13.4s, v2.s[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "fmla v29.4s, v13.4s, v3.s[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "fmla v18.4s, v14.4s, v0.s[1]\n"
+ "ins v9.d[1], temploadreg1\n"
+ "fmla v22.4s, v14.4s, v1.s[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "fmla v26.4s, v14.4s, v2.s[1]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "fmla v30.4s, v14.4s, v3.s[1]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "fmla v19.4s, v15.4s, v0.s[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "fmla v23.4s, v15.4s, v1.s[1]\n"
+ "ins v11.d[1], temploadreg3\n"
+ "fmla v27.4s, v15.4s, v2.s[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ "fmla v31.4s, v15.4s, v3.s[1]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "fmla v16.4s, v8.4s, v0.s[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "fmla v20.4s, v8.4s, v1.s[2]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "fmla v24.4s, v8.4s, v2.s[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "fmla v28.4s, v8.4s, v3.s[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "fmla v17.4s, v9.4s, v0.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ "fmla v21.4s, v9.4s, v1.s[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ "fmla v25.4s, v9.4s, v2.s[2]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ "fmla v29.4s, v9.4s, v3.s[2]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ "fmla v18.4s, v10.4s, v0.s[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ "fmla v22.4s, v10.4s, v1.s[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ "fmla v26.4s, v10.4s, v2.s[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ "fmla v30.4s, v10.4s, v3.s[2]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ "fmla v19.4s, v11.4s, v0.s[2]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "fmla v23.4s, v11.4s, v1.s[2]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ "fmla v27.4s, v11.4s, v2.s[2]\n"
+ "ins v9.d[1], temploadreg1\n"
+ "fmla v31.4s, v11.4s, v3.s[2]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ "fmla v16.4s, v12.4s, v0.s[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ "fmla v20.4s, v12.4s, v1.s[3]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "fmla v24.4s, v12.4s, v2.s[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ "fmla v28.4s, v12.4s, v3.s[3]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ "fmla v17.4s, v13.4s, v0.s[3]\n"
+ "ins v11.d[1], temploadreg3\n"
+ "fmla v21.4s, v13.4s, v1.s[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ "fmla v25.4s, v13.4s, v2.s[3]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "fmla v29.4s, v13.4s, v3.s[3]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ "fmla v18.4s, v14.4s, v0.s[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ "fmla v22.4s, v14.4s, v1.s[3]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "fmla v26.4s, v14.4s, v2.s[3]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "fmla v30.4s, v14.4s, v3.s[3]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ "fmla v19.4s, v15.4s, v0.s[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ "fmla v23.4s, v15.4s, v1.s[3]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "fmla v27.4s, v15.4s, v2.s[3]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "fmla v31.4s, v15.4s, v3.s[3]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "fmla v16.4s, v8.4s, v4.s[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "fmla v20.4s, v8.4s, v5.s[0]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "fmla v24.4s, v8.4s, v6.s[0]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "fmla v28.4s, v8.4s, v7.s[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ "fmla v17.4s, v9.4s, v4.s[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "fmla v21.4s, v9.4s, v5.s[0]\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "fmla v25.4s, v9.4s, v6.s[0]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "fmla v29.4s, v9.4s, v7.s[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ "fmla v18.4s, v10.4s, v4.s[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "fmla v22.4s, v10.4s, v5.s[0]\n"
+ "fmla v26.4s, v10.4s, v6.s[0]\n"
+ "ins v9.d[1], temploadreg1\n"
+ "fmla v30.4s, v10.4s, v7.s[0]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "fmla v19.4s, v11.4s, v4.s[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "fmla v23.4s, v11.4s, v5.s[0]\n"
+ "fmla v27.4s, v11.4s, v6.s[0]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "fmla v31.4s, v11.4s, v7.s[0]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "fmla v16.4s, v12.4s, v4.s[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "fmla v20.4s, v12.4s, v5.s[1]\n"
+ "fmla v24.4s, v12.4s, v6.s[1]\n"
+ "ins v11.d[1], temploadreg3\n"
+ "fmla v28.4s, v12.4s, v7.s[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "fmla v17.4s, v13.4s, v4.s[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ "fmla v21.4s, v13.4s, v5.s[1]\n"
+ "fmla v25.4s, v13.4s, v6.s[1]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "fmla v29.4s, v13.4s, v7.s[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "fmla v18.4s, v14.4s, v4.s[1]\n"
+ "fmla v22.4s, v14.4s, v5.s[1]\n"
+ "fmla v26.4s, v14.4s, v6.s[1]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "fmla v30.4s, v14.4s, v7.s[1]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "fmla v19.4s, v15.4s, v4.s[1]\n"
+ "fmla v23.4s, v15.4s, v5.s[1]\n"
+ "fmla v27.4s, v15.4s, v6.s[1]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "fmla v31.4s, v15.4s, v7.s[1]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "fmla v16.4s, v8.4s, v4.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "fmla v20.4s, v8.4s, v5.s[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "fmla v24.4s, v8.4s, v6.s[2]\n"
+ "fmla v28.4s, v8.4s, v7.s[2]\n"
+ "fmla v17.4s, v9.4s, v4.s[2]\n"
+ "fmla v21.4s, v9.4s, v5.s[2]\n"
+ "fmla v25.4s, v9.4s, v6.s[2]\n"
+ "fmla v29.4s, v9.4s, v7.s[2]\n"
+ "fmla v18.4s, v10.4s, v4.s[2]\n"
+ "fmla v22.4s, v10.4s, v5.s[2]\n"
+ "fmla v26.4s, v10.4s, v6.s[2]\n"
+ "fmla v30.4s, v10.4s, v7.s[2]\n"
+ "fmla v19.4s, v11.4s, v4.s[2]\n"
+ "fmla v23.4s, v11.4s, v5.s[2]\n"
+ "fmla v27.4s, v11.4s, v6.s[2]\n"
+ "fmla v31.4s, v11.4s, v7.s[2]\n"
+ "fmla v16.4s, v12.4s, v4.s[3]\n"
+ "fmla v20.4s, v12.4s, v5.s[3]\n"
+ "fmla v24.4s, v12.4s, v6.s[3]\n"
+ "fmla v28.4s, v12.4s, v7.s[3]\n"
+ "fmla v17.4s, v13.4s, v4.s[3]\n"
+ "fmla v21.4s, v13.4s, v5.s[3]\n"
+ "fmla v25.4s, v13.4s, v6.s[3]\n"
+ "fmla v29.4s, v13.4s, v7.s[3]\n"
+ "fmla v18.4s, v14.4s, v4.s[3]\n"
+ "fmla v22.4s, v14.4s, v5.s[3]\n"
+ "fmla v26.4s, v14.4s, v6.s[3]\n"
+ "fmla v30.4s, v14.4s, v7.s[3]\n"
+ "fmla v19.4s, v15.4s, v4.s[3]\n"
+ "fmla v23.4s, v15.4s, v5.s[3]\n"
+ "fmla v27.4s, v15.4s, v6.s[3]\n"
+ "fmla v31.4s, v15.4s, v7.s[3]\n"
+ "b 5f\n"
+ "4:\n"
+ "fmla v16.4s, v8.4s, v0.s[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ "fmla v20.4s, v8.4s, v1.s[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ "fmla v24.4s, v8.4s, v2.s[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "fmla v28.4s, v8.4s, v3.s[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ "fmla v17.4s, v9.4s, v0.s[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "fmla v21.4s, v9.4s, v1.s[0]\n"
+ "fmla v25.4s, v9.4s, v2.s[0]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "fmla v29.4s, v9.4s, v3.s[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ "fmla v18.4s, v10.4s, v0.s[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "fmla v22.4s, v10.4s, v1.s[0]\n"
+ "fmla v26.4s, v10.4s, v2.s[0]\n"
+ "ins v9.d[1], temploadreg1\n"
+ "fmla v30.4s, v10.4s, v3.s[0]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "fmla v19.4s, v11.4s, v0.s[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "fmla v23.4s, v11.4s, v1.s[0]\n"
+ "fmla v27.4s, v11.4s, v2.s[0]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "fmla v31.4s, v11.4s, v3.s[0]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "fmla v16.4s, v12.4s, v0.s[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "fmla v20.4s, v12.4s, v1.s[1]\n"
+ "fmla v24.4s, v12.4s, v2.s[1]\n"
+ "ins v11.d[1], temploadreg3\n"
+ "fmla v28.4s, v12.4s, v3.s[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "fmla v17.4s, v13.4s, v0.s[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ "fmla v21.4s, v13.4s, v1.s[1]\n"
+ "fmla v25.4s, v13.4s, v2.s[1]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "fmla v29.4s, v13.4s, v3.s[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "fmla v18.4s, v14.4s, v0.s[1]\n"
+ "fmla v22.4s, v14.4s, v1.s[1]\n"
+ "fmla v26.4s, v14.4s, v2.s[1]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "fmla v30.4s, v14.4s, v3.s[1]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "fmla v19.4s, v15.4s, v0.s[1]\n"
+ "fmla v23.4s, v15.4s, v1.s[1]\n"
+ "fmla v27.4s, v15.4s, v2.s[1]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "fmla v31.4s, v15.4s, v3.s[1]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "fmla v16.4s, v8.4s, v0.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "fmla v20.4s, v8.4s, v1.s[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "fmla v24.4s, v8.4s, v2.s[2]\n"
+ "fmla v28.4s, v8.4s, v3.s[2]\n"
+ "fmla v17.4s, v9.4s, v0.s[2]\n"
+ "fmla v21.4s, v9.4s, v1.s[2]\n"
+ "fmla v25.4s, v9.4s, v2.s[2]\n"
+ "fmla v29.4s, v9.4s, v3.s[2]\n"
+ "fmla v18.4s, v10.4s, v0.s[2]\n"
+ "fmla v22.4s, v10.4s, v1.s[2]\n"
+ "fmla v26.4s, v10.4s, v2.s[2]\n"
+ "fmla v30.4s, v10.4s, v3.s[2]\n"
+ "fmla v19.4s, v11.4s, v0.s[2]\n"
+ "fmla v23.4s, v11.4s, v1.s[2]\n"
+ "fmla v27.4s, v11.4s, v2.s[2]\n"
+ "fmla v31.4s, v11.4s, v3.s[2]\n"
+ "fmla v16.4s, v12.4s, v0.s[3]\n"
+ "fmla v20.4s, v12.4s, v1.s[3]\n"
+ "fmla v24.4s, v12.4s, v2.s[3]\n"
+ "fmla v28.4s, v12.4s, v3.s[3]\n"
+ "fmla v17.4s, v13.4s, v0.s[3]\n"
+ "fmla v21.4s, v13.4s, v1.s[3]\n"
+ "fmla v25.4s, v13.4s, v2.s[3]\n"
+ "fmla v29.4s, v13.4s, v3.s[3]\n"
+ "fmla v18.4s, v14.4s, v0.s[3]\n"
+ "fmla v22.4s, v14.4s, v1.s[3]\n"
+ "fmla v26.4s, v14.4s, v2.s[3]\n"
+ "fmla v30.4s, v14.4s, v3.s[3]\n"
+ "fmla v19.4s, v15.4s, v0.s[3]\n"
+ "fmla v23.4s, v15.4s, v1.s[3]\n"
+ "fmla v27.4s, v15.4s, v2.s[3]\n"
+ "fmla v31.4s, v15.4s, v3.s[3]\n"
+ "5:\n"
+ "cbz %[blocks], 6f\n"
+ "7:\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr s0, [%[a_ptr0]]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x4\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
+ "fmla v16.4s, v8.4s, v0.s[0]\n"
+ "ldr s1, [a_ptr1]\n"
+ "fmla v17.4s, v9.4s, v0.s[0]\n"
+ "add a_ptr1, a_ptr1, #0x4\n"
+ "fmla v18.4s, v10.4s, v0.s[0]\n"
+ "ldr s2, [a_ptr2]\n"
+ "fmla v20.4s, v8.4s, v1.s[0]\n"
+ "add a_ptr2, a_ptr2, #0x4\n"
+ "fmla v21.4s, v9.4s, v1.s[0]\n"
+ "ldr s3, [a_ptr3]\n"
+ "fmla v24.4s, v8.4s, v2.s[0]\n"
+ "add a_ptr3, a_ptr3, #0x4\n"
+ "fmla v25.4s, v9.4s, v2.s[0]\n"
+ "fmla v28.4s, v8.4s, v3.s[0]\n"
+ "fmla v29.4s, v9.4s, v3.s[0]\n"
+ "fmla v22.4s, v10.4s, v1.s[0]\n"
+ "fmla v26.4s, v10.4s, v2.s[0]\n"
+ "fmla v30.4s, v10.4s, v3.s[0]\n"
+ "fmla v19.4s, v11.4s, v0.s[0]\n"
+ "fmla v23.4s, v11.4s, v1.s[0]\n"
+ "fmla v27.4s, v11.4s, v2.s[0]\n"
+ "fmla v31.4s, v11.4s, v3.s[0]\n"
+ "b.ne 7b\n"
+ "6:\n"
+ "str q16, [%[c_ptr0]]\n"
+ "str q17, [%[c_ptr0], #0x10]\n"
+ "str q18, [%[c_ptr0], #0x20]\n"
+ "str q19, [%[c_ptr0], #0x30]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x40\n"
+ "str q20, [c_ptr1]\n"
+ "str q21, [c_ptr1, #0x10]\n"
+ "str q22, [c_ptr1, #0x20]\n"
+ "str q23, [c_ptr1, #0x30]\n"
+ "str q24, [c_ptr2]\n"
+ "str q25, [c_ptr2, #0x10]\n"
+ "str q26, [c_ptr2, #0x20]\n"
+ "str q27, [c_ptr2, #0x30]\n"
+ "str q28, [c_ptr3]\n"
+ "str q29, [c_ptr3, #0x10]\n"
+ "str q30, [c_ptr3, #0x20]\n"
+ "str q31, [c_ptr3, #0x30]\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ ".unreq temploadreg0\n"
+ ".unreq temploadreg1\n"
+ ".unreq temploadreg2\n"
+ ".unreq temploadreg3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "cc", "memory"
+ );
+ break;
+ }
+ if (use_result_buffer) {
+ for(int cy=0; cy<std::min(M-y, 4); cy++) {
+ for(unsigned int cx=0; cx<width; cx++) {
+ c_ptr_real[cy * ldc + cx] = result_buffer[cy * 16 + cx];
+ }
+ }
+ }
+ }
+ }
+}
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/generic.cpp
new file mode 100644
index 0000000000..504769b9f0
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/generic.cpp
@@ -0,0 +1,1726 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <algorithm>
+
+
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void a64_hybrid_fp32_mla_16x4(const float *A, int lda, const float *B, float *C, int ldc, float beta, int M, int N, int K) {
+ const long beta0 = (beta == 0.0f);
+ const int K_stride = K;
+ const long loops_count = ((K + 4) / 8) - 1;
+ K -= loops_count * 8;
+ const long regs_count = (K / 4) - 1;
+ K -= (regs_count + 1) * 4;
+ const long blocks_count = K / 1;
+
+ for (int y=0; y<M; y+=4) {
+ const float * const a_ptr0_base = A + (y * lda);
+ const unsigned long ldab = lda * sizeof(float);
+
+ float *c_ptr0 = C + (y * ldc);
+
+ for (int x0=0; x0<N; x0+=16ul) {
+ const long width = std::min((unsigned long)N-x0, 16ul);
+ const float *betaptr = &beta;
+ long loops = loops_count;
+ long regs = regs_count;
+ long blocks = blocks_count;
+ const float *a_ptr0 = a_ptr0_base;
+ const float *b_ptr0 = B + (K_stride * x0);
+ const bool use_result_buffer = (width < 16);
+ float result_buffer[64];
+ const unsigned long ldcb = (use_result_buffer ? 16 : ldc) * sizeof(float);
+ float *c_ptr_real = c_ptr0;
+ if (use_result_buffer && !beta0) {
+ for(int cy=0; cy<std::min(M-y, 4); cy++) {
+ for(unsigned int cx=0; cx<width; cx++) {
+ result_buffer[cy * 16 + cx] = c_ptr_real[cy * ldc + cx];
+ }
+ }
+ }
+ if (use_result_buffer) {
+ c_ptr0 = result_buffer;
+ }
+
+ switch(M-y) {
+ case 1:
+ __asm __volatile (
+ "cbz %[beta0], 1f\n"
+ "movi v16.4s, #0\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "movi v17.4s, #0\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "movi v18.4s, #0\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "movi v19.4s, #0\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1r {v15.4s}, [%[betaptr]]\n"
+ "ldr q16, [%[c_ptr0]]\n"
+ "ldr q17, [%[c_ptr0], #0x10]\n"
+ "ldr q18, [%[c_ptr0], #0x20]\n"
+ "ldr q19, [%[c_ptr0], #0x30]\n"
+ "fmul v16.4s, v16.4s, v15.4s\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "fmul v17.4s, v17.4s, v15.4s\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "fmul v18.4s, v18.4s, v15.4s\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "fmul v19.4s, v19.4s, v15.4s\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ "fmla v16.4s, v8.4s, v0.s[0]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ "fmla v17.4s, v9.4s, v0.s[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ "fmla v18.4s, v10.4s, v0.s[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "fmla v19.4s, v11.4s, v0.s[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "fmla v16.4s, v12.4s, v0.s[1]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "fmla v17.4s, v13.4s, v0.s[1]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "fmla v18.4s, v14.4s, v0.s[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "fmla v19.4s, v15.4s, v0.s[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "fmla v16.4s, v8.4s, v0.s[2]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "fmla v17.4s, v9.4s, v0.s[2]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ "fmla v18.4s, v10.4s, v0.s[2]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "fmla v19.4s, v11.4s, v0.s[2]\n"
+ "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+ "fmla v16.4s, v12.4s, v0.s[3]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ "fmla v17.4s, v13.4s, v0.s[3]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ "fmla v18.4s, v14.4s, v0.s[3]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ "fmla v19.4s, v15.4s, v0.s[3]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "fmla v16.4s, v8.4s, v4.s[0]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ "fmla v17.4s, v9.4s, v4.s[0]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ "fmla v18.4s, v10.4s, v4.s[0]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ "fmla v19.4s, v11.4s, v4.s[0]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ "fmla v16.4s, v12.4s, v4.s[1]\n"
+ "ldr q0, [%[a_ptr0], #-0x10]\n"
+ "fmla v17.4s, v13.4s, v4.s[1]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "fmla v18.4s, v14.4s, v4.s[1]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "fmla v19.4s, v15.4s, v4.s[1]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "fmla v16.4s, v8.4s, v4.s[2]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "fmla v17.4s, v9.4s, v4.s[2]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "fmla v18.4s, v10.4s, v4.s[2]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "fmla v19.4s, v11.4s, v4.s[2]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ "fmla v16.4s, v12.4s, v4.s[3]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ "fmla v17.4s, v13.4s, v4.s[3]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ "fmla v18.4s, v14.4s, v4.s[3]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ "fmla v19.4s, v15.4s, v4.s[3]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+ "cbz %[regs], 4f\n"
+ "fmla v16.4s, v8.4s, v0.s[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ "fmla v17.4s, v9.4s, v0.s[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "fmla v18.4s, v10.4s, v0.s[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "fmla v19.4s, v11.4s, v0.s[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "fmla v16.4s, v12.4s, v0.s[1]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "fmla v17.4s, v13.4s, v0.s[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "fmla v18.4s, v14.4s, v0.s[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "fmla v19.4s, v15.4s, v0.s[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "fmla v16.4s, v8.4s, v0.s[2]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ "fmla v17.4s, v9.4s, v0.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ "fmla v18.4s, v10.4s, v0.s[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ "fmla v19.4s, v11.4s, v0.s[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ "fmla v16.4s, v12.4s, v0.s[3]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ "fmla v17.4s, v13.4s, v0.s[3]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ "fmla v18.4s, v14.4s, v0.s[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ "fmla v19.4s, v15.4s, v0.s[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ "fmla v16.4s, v8.4s, v4.s[0]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ "fmla v17.4s, v9.4s, v4.s[0]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ "fmla v18.4s, v10.4s, v4.s[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "fmla v19.4s, v11.4s, v4.s[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "fmla v16.4s, v12.4s, v4.s[1]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "fmla v17.4s, v13.4s, v4.s[1]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "fmla v18.4s, v14.4s, v4.s[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "fmla v19.4s, v15.4s, v4.s[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "fmla v16.4s, v8.4s, v4.s[2]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "fmla v17.4s, v9.4s, v4.s[2]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ "fmla v18.4s, v10.4s, v4.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "fmla v19.4s, v11.4s, v4.s[2]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "fmla v16.4s, v12.4s, v4.s[3]\n"
+ "fmla v17.4s, v13.4s, v4.s[3]\n"
+ "fmla v18.4s, v14.4s, v4.s[3]\n"
+ "fmla v19.4s, v15.4s, v4.s[3]\n"
+ "b 5f\n"
+ "4:\n"
+ "fmla v16.4s, v8.4s, v0.s[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "fmla v17.4s, v9.4s, v0.s[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "fmla v18.4s, v10.4s, v0.s[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "fmla v19.4s, v11.4s, v0.s[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "fmla v16.4s, v12.4s, v0.s[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "fmla v17.4s, v13.4s, v0.s[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "fmla v18.4s, v14.4s, v0.s[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "fmla v19.4s, v15.4s, v0.s[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ "fmla v16.4s, v8.4s, v0.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "fmla v17.4s, v9.4s, v0.s[2]\n"
+ "fmla v18.4s, v10.4s, v0.s[2]\n"
+ "fmla v19.4s, v11.4s, v0.s[2]\n"
+ "fmla v16.4s, v12.4s, v0.s[3]\n"
+ "fmla v17.4s, v13.4s, v0.s[3]\n"
+ "fmla v18.4s, v14.4s, v0.s[3]\n"
+ "fmla v19.4s, v15.4s, v0.s[3]\n"
+ "5:\n"
+ "cbz %[blocks], 6f\n"
+ "7:\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr s0, [%[a_ptr0]]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x4\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
+ "fmla v16.4s, v8.4s, v0.s[0]\n"
+ "fmla v17.4s, v9.4s, v0.s[0]\n"
+ "fmla v18.4s, v10.4s, v0.s[0]\n"
+ "fmla v19.4s, v11.4s, v0.s[0]\n"
+ "b.ne 7b\n"
+ "6:\n"
+ "str q16, [%[c_ptr0]]\n"
+ "str q17, [%[c_ptr0], #0x10]\n"
+ "str q18, [%[c_ptr0], #0x20]\n"
+ "str q19, [%[c_ptr0], #0x30]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x40\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
+ );
+ break;
+ case 2:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "c_ptr1 .req X1\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "cbz %[beta0], 1f\n"
+ "movi v16.4s, #0\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "movi v17.4s, #0\n"
+ "ldr q1, [a_ptr1]\n"
+ "movi v18.4s, #0\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "movi v19.4s, #0\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "movi v20.4s, #0\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "movi v21.4s, #0\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "movi v22.4s, #0\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "movi v23.4s, #0\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1r {v15.4s}, [%[betaptr]]\n"
+ "ldr q16, [%[c_ptr0]]\n"
+ "ldr q17, [%[c_ptr0], #0x10]\n"
+ "ldr q18, [%[c_ptr0], #0x20]\n"
+ "ldr q19, [%[c_ptr0], #0x30]\n"
+ "fmul v16.4s, v16.4s, v15.4s\n"
+ "ldr q20, [c_ptr1]\n"
+ "fmul v17.4s, v17.4s, v15.4s\n"
+ "ldr q21, [c_ptr1, #0x10]\n"
+ "fmul v18.4s, v18.4s, v15.4s\n"
+ "ldr q22, [c_ptr1, #0x20]\n"
+ "fmul v19.4s, v19.4s, v15.4s\n"
+ "ldr q23, [c_ptr1, #0x30]\n"
+ "fmul v20.4s, v20.4s, v15.4s\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "fmul v21.4s, v21.4s, v15.4s\n"
+ "ldr q1, [a_ptr1]\n"
+ "fmul v22.4s, v22.4s, v15.4s\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "fmul v23.4s, v23.4s, v15.4s\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ "fmla v16.4s, v8.4s, v0.s[0]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ "fmla v20.4s, v8.4s, v1.s[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ "fmla v17.4s, v9.4s, v0.s[0]\n"
+ "ldr q5, [a_ptr1]\n"
+ "fmla v21.4s, v9.4s, v1.s[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "fmla v18.4s, v10.4s, v0.s[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "fmla v22.4s, v10.4s, v1.s[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "fmla v19.4s, v11.4s, v0.s[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "fmla v23.4s, v11.4s, v1.s[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "fmla v16.4s, v12.4s, v0.s[1]\n"
+ "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+ "fmla v20.4s, v12.4s, v1.s[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "fmla v17.4s, v13.4s, v0.s[1]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "fmla v21.4s, v13.4s, v1.s[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "fmla v18.4s, v14.4s, v0.s[1]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ "fmla v22.4s, v14.4s, v1.s[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "fmla v19.4s, v15.4s, v0.s[1]\n"
+ "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+ "fmla v23.4s, v15.4s, v1.s[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ "fmla v16.4s, v8.4s, v0.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ "fmla v20.4s, v8.4s, v1.s[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ "fmla v17.4s, v9.4s, v0.s[2]\n"
+ "fmla v21.4s, v9.4s, v1.s[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ "fmla v18.4s, v10.4s, v0.s[2]\n"
+ "fmla v22.4s, v10.4s, v1.s[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ "fmla v19.4s, v11.4s, v0.s[2]\n"
+ "fmla v23.4s, v11.4s, v1.s[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ "fmla v16.4s, v12.4s, v0.s[3]\n"
+ "fmla v20.4s, v12.4s, v1.s[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ "fmla v17.4s, v13.4s, v0.s[3]\n"
+ "fmla v21.4s, v13.4s, v1.s[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ "fmla v18.4s, v14.4s, v0.s[3]\n"
+ "fmla v22.4s, v14.4s, v1.s[3]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ "fmla v19.4s, v15.4s, v0.s[3]\n"
+ "ldr q0, [%[a_ptr0], #-0x10]\n"
+ "fmla v23.4s, v15.4s, v1.s[3]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ "fmla v16.4s, v8.4s, v4.s[0]\n"
+ "ldr q1, [a_ptr1, #-0x10]\n"
+ "fmla v20.4s, v8.4s, v5.s[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "fmla v17.4s, v9.4s, v4.s[0]\n"
+ "fmla v21.4s, v9.4s, v5.s[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "fmla v18.4s, v10.4s, v4.s[0]\n"
+ "fmla v22.4s, v10.4s, v5.s[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "fmla v19.4s, v11.4s, v4.s[0]\n"
+ "fmla v23.4s, v11.4s, v5.s[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "fmla v16.4s, v12.4s, v4.s[1]\n"
+ "fmla v20.4s, v12.4s, v5.s[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "fmla v17.4s, v13.4s, v4.s[1]\n"
+ "fmla v21.4s, v13.4s, v5.s[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "fmla v18.4s, v14.4s, v4.s[1]\n"
+ "fmla v22.4s, v14.4s, v5.s[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "fmla v19.4s, v15.4s, v4.s[1]\n"
+ "fmla v23.4s, v15.4s, v5.s[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ "fmla v16.4s, v8.4s, v4.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ "fmla v20.4s, v8.4s, v5.s[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ "fmla v17.4s, v9.4s, v4.s[2]\n"
+ "fmla v21.4s, v9.4s, v5.s[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ "fmla v18.4s, v10.4s, v4.s[2]\n"
+ "fmla v22.4s, v10.4s, v5.s[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ "fmla v19.4s, v11.4s, v4.s[2]\n"
+ "fmla v23.4s, v11.4s, v5.s[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ "fmla v16.4s, v12.4s, v4.s[3]\n"
+ "fmla v20.4s, v12.4s, v5.s[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ "fmla v17.4s, v13.4s, v4.s[3]\n"
+ "fmla v21.4s, v13.4s, v5.s[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ "fmla v18.4s, v14.4s, v4.s[3]\n"
+ "fmla v22.4s, v14.4s, v5.s[3]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ "fmla v19.4s, v15.4s, v4.s[3]\n"
+ "fmla v23.4s, v15.4s, v5.s[3]\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+ "prfm PSTL1KEEP, [c_ptr1]\n"
+ "cbz %[regs], 4f\n"
+ "fmla v16.4s, v8.4s, v0.s[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ "fmla v20.4s, v8.4s, v1.s[0]\n"
+ "ldr q5, [a_ptr1]\n"
+ "fmla v17.4s, v9.4s, v0.s[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "fmla v21.4s, v9.4s, v1.s[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "fmla v18.4s, v10.4s, v0.s[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "fmla v22.4s, v10.4s, v1.s[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "fmla v19.4s, v11.4s, v0.s[0]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "fmla v23.4s, v11.4s, v1.s[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "fmla v16.4s, v12.4s, v0.s[1]\n"
+ "fmla v20.4s, v12.4s, v1.s[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "fmla v17.4s, v13.4s, v0.s[1]\n"
+ "fmla v21.4s, v13.4s, v1.s[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "fmla v18.4s, v14.4s, v0.s[1]\n"
+ "fmla v22.4s, v14.4s, v1.s[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "fmla v19.4s, v15.4s, v0.s[1]\n"
+ "fmla v23.4s, v15.4s, v1.s[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ "fmla v16.4s, v8.4s, v0.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ "fmla v20.4s, v8.4s, v1.s[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ "fmla v17.4s, v9.4s, v0.s[2]\n"
+ "fmla v21.4s, v9.4s, v1.s[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ "fmla v18.4s, v10.4s, v0.s[2]\n"
+ "fmla v22.4s, v10.4s, v1.s[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ "fmla v19.4s, v11.4s, v0.s[2]\n"
+ "fmla v23.4s, v11.4s, v1.s[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ "fmla v16.4s, v12.4s, v0.s[3]\n"
+ "fmla v20.4s, v12.4s, v1.s[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ "fmla v17.4s, v13.4s, v0.s[3]\n"
+ "fmla v21.4s, v13.4s, v1.s[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ "fmla v18.4s, v14.4s, v0.s[3]\n"
+ "fmla v22.4s, v14.4s, v1.s[3]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ "fmla v19.4s, v15.4s, v0.s[3]\n"
+ "fmla v23.4s, v15.4s, v1.s[3]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ "fmla v16.4s, v8.4s, v4.s[0]\n"
+ "fmla v20.4s, v8.4s, v5.s[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "fmla v17.4s, v9.4s, v4.s[0]\n"
+ "fmla v21.4s, v9.4s, v5.s[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "fmla v18.4s, v10.4s, v4.s[0]\n"
+ "fmla v22.4s, v10.4s, v5.s[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "fmla v19.4s, v11.4s, v4.s[0]\n"
+ "fmla v23.4s, v11.4s, v5.s[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "fmla v16.4s, v12.4s, v4.s[1]\n"
+ "fmla v20.4s, v12.4s, v5.s[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "fmla v17.4s, v13.4s, v4.s[1]\n"
+ "fmla v21.4s, v13.4s, v5.s[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "fmla v18.4s, v14.4s, v4.s[1]\n"
+ "fmla v22.4s, v14.4s, v5.s[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "fmla v19.4s, v15.4s, v4.s[1]\n"
+ "fmla v23.4s, v15.4s, v5.s[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ "fmla v16.4s, v8.4s, v4.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "fmla v20.4s, v8.4s, v5.s[2]\n"
+ "fmla v17.4s, v9.4s, v4.s[2]\n"
+ "fmla v21.4s, v9.4s, v5.s[2]\n"
+ "fmla v18.4s, v10.4s, v4.s[2]\n"
+ "fmla v22.4s, v10.4s, v5.s[2]\n"
+ "fmla v19.4s, v11.4s, v4.s[2]\n"
+ "fmla v23.4s, v11.4s, v5.s[2]\n"
+ "fmla v16.4s, v12.4s, v4.s[3]\n"
+ "fmla v20.4s, v12.4s, v5.s[3]\n"
+ "fmla v17.4s, v13.4s, v4.s[3]\n"
+ "fmla v21.4s, v13.4s, v5.s[3]\n"
+ "fmla v18.4s, v14.4s, v4.s[3]\n"
+ "fmla v22.4s, v14.4s, v5.s[3]\n"
+ "fmla v19.4s, v15.4s, v4.s[3]\n"
+ "fmla v23.4s, v15.4s, v5.s[3]\n"
+ "b 5f\n"
+ "4:\n"
+ "fmla v16.4s, v8.4s, v0.s[0]\n"
+ "fmla v20.4s, v8.4s, v1.s[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "fmla v17.4s, v9.4s, v0.s[0]\n"
+ "fmla v21.4s, v9.4s, v1.s[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "fmla v18.4s, v10.4s, v0.s[0]\n"
+ "fmla v22.4s, v10.4s, v1.s[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "fmla v19.4s, v11.4s, v0.s[0]\n"
+ "fmla v23.4s, v11.4s, v1.s[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "fmla v16.4s, v12.4s, v0.s[1]\n"
+ "fmla v20.4s, v12.4s, v1.s[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "fmla v17.4s, v13.4s, v0.s[1]\n"
+ "fmla v21.4s, v13.4s, v1.s[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "fmla v18.4s, v14.4s, v0.s[1]\n"
+ "fmla v22.4s, v14.4s, v1.s[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "fmla v19.4s, v15.4s, v0.s[1]\n"
+ "fmla v23.4s, v15.4s, v1.s[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ "fmla v16.4s, v8.4s, v0.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "fmla v20.4s, v8.4s, v1.s[2]\n"
+ "fmla v17.4s, v9.4s, v0.s[2]\n"
+ "fmla v21.4s, v9.4s, v1.s[2]\n"
+ "fmla v18.4s, v10.4s, v0.s[2]\n"
+ "fmla v22.4s, v10.4s, v1.s[2]\n"
+ "fmla v19.4s, v11.4s, v0.s[2]\n"
+ "fmla v23.4s, v11.4s, v1.s[2]\n"
+ "fmla v16.4s, v12.4s, v0.s[3]\n"
+ "fmla v20.4s, v12.4s, v1.s[3]\n"
+ "fmla v17.4s, v13.4s, v0.s[3]\n"
+ "fmla v21.4s, v13.4s, v1.s[3]\n"
+ "fmla v18.4s, v14.4s, v0.s[3]\n"
+ "fmla v22.4s, v14.4s, v1.s[3]\n"
+ "fmla v19.4s, v15.4s, v0.s[3]\n"
+ "fmla v23.4s, v15.4s, v1.s[3]\n"
+ "5:\n"
+ "cbz %[blocks], 6f\n"
+ "7:\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr s0, [%[a_ptr0]]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x4\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
+ "fmla v16.4s, v8.4s, v0.s[0]\n"
+ "ldr s1, [a_ptr1]\n"
+ "fmla v17.4s, v9.4s, v0.s[0]\n"
+ "add a_ptr1, a_ptr1, #0x4\n"
+ "fmla v18.4s, v10.4s, v0.s[0]\n"
+ "fmla v20.4s, v8.4s, v1.s[0]\n"
+ "fmla v21.4s, v9.4s, v1.s[0]\n"
+ "fmla v22.4s, v10.4s, v1.s[0]\n"
+ "fmla v19.4s, v11.4s, v0.s[0]\n"
+ "fmla v23.4s, v11.4s, v1.s[0]\n"
+ "b.ne 7b\n"
+ "6:\n"
+ "str q16, [%[c_ptr0]]\n"
+ "str q17, [%[c_ptr0], #0x10]\n"
+ "str q18, [%[c_ptr0], #0x20]\n"
+ "str q19, [%[c_ptr0], #0x30]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x40\n"
+ "str q20, [c_ptr1]\n"
+ "str q21, [c_ptr1, #0x10]\n"
+ "str q22, [c_ptr1, #0x20]\n"
+ "str q23, [c_ptr1, #0x30]\n"
+ ".unreq a_ptr1\n"
+ ".unreq c_ptr1\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "cc", "memory"
+ );
+ break;
+ case 3:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "c_ptr1 .req X2\n"
+ "c_ptr2 .req X3\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "cbz %[beta0], 1f\n"
+ "movi v16.4s, #0\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "movi v17.4s, #0\n"
+ "ldr q1, [a_ptr1]\n"
+ "movi v18.4s, #0\n"
+ "ldr q2, [a_ptr2]\n"
+ "movi v19.4s, #0\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "movi v20.4s, #0\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "movi v21.4s, #0\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "movi v22.4s, #0\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "movi v23.4s, #0\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "movi v24.4s, #0\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "movi v25.4s, #0\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "movi v26.4s, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "movi v27.4s, #0\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1r {v15.4s}, [%[betaptr]]\n"
+ "ldr q16, [%[c_ptr0]]\n"
+ "ldr q17, [%[c_ptr0], #0x10]\n"
+ "ldr q18, [%[c_ptr0], #0x20]\n"
+ "ldr q19, [%[c_ptr0], #0x30]\n"
+ "fmul v16.4s, v16.4s, v15.4s\n"
+ "ldr q20, [c_ptr1]\n"
+ "fmul v17.4s, v17.4s, v15.4s\n"
+ "ldr q21, [c_ptr1, #0x10]\n"
+ "fmul v18.4s, v18.4s, v15.4s\n"
+ "ldr q22, [c_ptr1, #0x20]\n"
+ "fmul v19.4s, v19.4s, v15.4s\n"
+ "ldr q23, [c_ptr1, #0x30]\n"
+ "fmul v20.4s, v20.4s, v15.4s\n"
+ "ldr q24, [c_ptr2]\n"
+ "fmul v21.4s, v21.4s, v15.4s\n"
+ "ldr q25, [c_ptr2, #0x10]\n"
+ "fmul v22.4s, v22.4s, v15.4s\n"
+ "ldr q26, [c_ptr2, #0x20]\n"
+ "fmul v23.4s, v23.4s, v15.4s\n"
+ "ldr q27, [c_ptr2, #0x30]\n"
+ "fmul v24.4s, v24.4s, v15.4s\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "fmul v25.4s, v25.4s, v15.4s\n"
+ "ldr q1, [a_ptr1]\n"
+ "fmul v26.4s, v26.4s, v15.4s\n"
+ "ldr q2, [a_ptr2]\n"
+ "fmul v27.4s, v27.4s, v15.4s\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ "fmla v16.4s, v8.4s, v0.s[0]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ "fmla v20.4s, v8.4s, v1.s[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ "fmla v24.4s, v8.4s, v2.s[0]\n"
+ "ldr q5, [a_ptr1]\n"
+ "fmla v17.4s, v9.4s, v0.s[0]\n"
+ "ldr q6, [a_ptr2]\n"
+ "fmla v21.4s, v9.4s, v1.s[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "fmla v25.4s, v9.4s, v2.s[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "fmla v18.4s, v10.4s, v0.s[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "fmla v22.4s, v10.4s, v1.s[0]\n"
+ "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+ "fmla v26.4s, v10.4s, v2.s[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "fmla v19.4s, v11.4s, v0.s[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "fmla v23.4s, v11.4s, v1.s[0]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ "fmla v27.4s, v11.4s, v2.s[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "fmla v16.4s, v12.4s, v0.s[1]\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
+ "fmla v20.4s, v12.4s, v1.s[1]\n"
+ "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+ "fmla v24.4s, v12.4s, v2.s[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "fmla v17.4s, v13.4s, v0.s[1]\n"
+ "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
+ "fmla v21.4s, v13.4s, v1.s[1]\n"
+ "fmla v25.4s, v13.4s, v2.s[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "fmla v18.4s, v14.4s, v0.s[1]\n"
+ "fmla v22.4s, v14.4s, v1.s[1]\n"
+ "fmla v26.4s, v14.4s, v2.s[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "fmla v19.4s, v15.4s, v0.s[1]\n"
+ "fmla v23.4s, v15.4s, v1.s[1]\n"
+ "fmla v27.4s, v15.4s, v2.s[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ "fmla v16.4s, v8.4s, v0.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ "fmla v20.4s, v8.4s, v1.s[2]\n"
+ "fmla v24.4s, v8.4s, v2.s[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ "fmla v17.4s, v9.4s, v0.s[2]\n"
+ "fmla v21.4s, v9.4s, v1.s[2]\n"
+ "fmla v25.4s, v9.4s, v2.s[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ "fmla v18.4s, v10.4s, v0.s[2]\n"
+ "fmla v22.4s, v10.4s, v1.s[2]\n"
+ "fmla v26.4s, v10.4s, v2.s[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ "fmla v19.4s, v11.4s, v0.s[2]\n"
+ "fmla v23.4s, v11.4s, v1.s[2]\n"
+ "fmla v27.4s, v11.4s, v2.s[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ "fmla v16.4s, v12.4s, v0.s[3]\n"
+ "fmla v20.4s, v12.4s, v1.s[3]\n"
+ "fmla v24.4s, v12.4s, v2.s[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ "fmla v17.4s, v13.4s, v0.s[3]\n"
+ "fmla v21.4s, v13.4s, v1.s[3]\n"
+ "fmla v25.4s, v13.4s, v2.s[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ "fmla v18.4s, v14.4s, v0.s[3]\n"
+ "fmla v22.4s, v14.4s, v1.s[3]\n"
+ "fmla v26.4s, v14.4s, v2.s[3]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ "fmla v19.4s, v15.4s, v0.s[3]\n"
+ "ldr q0, [%[a_ptr0], #-0x10]\n"
+ "fmla v23.4s, v15.4s, v1.s[3]\n"
+ "ldr q1, [a_ptr1, #-0x10]\n"
+ "fmla v27.4s, v15.4s, v2.s[3]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ "fmla v16.4s, v8.4s, v4.s[0]\n"
+ "ldr q2, [a_ptr2, #-0x10]\n"
+ "fmla v20.4s, v8.4s, v5.s[0]\n"
+ "fmla v24.4s, v8.4s, v6.s[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "fmla v17.4s, v9.4s, v4.s[0]\n"
+ "fmla v21.4s, v9.4s, v5.s[0]\n"
+ "fmla v25.4s, v9.4s, v6.s[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "fmla v18.4s, v10.4s, v4.s[0]\n"
+ "fmla v22.4s, v10.4s, v5.s[0]\n"
+ "fmla v26.4s, v10.4s, v6.s[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "fmla v19.4s, v11.4s, v4.s[0]\n"
+ "fmla v23.4s, v11.4s, v5.s[0]\n"
+ "fmla v27.4s, v11.4s, v6.s[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "fmla v16.4s, v12.4s, v4.s[1]\n"
+ "fmla v20.4s, v12.4s, v5.s[1]\n"
+ "fmla v24.4s, v12.4s, v6.s[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "fmla v17.4s, v13.4s, v4.s[1]\n"
+ "fmla v21.4s, v13.4s, v5.s[1]\n"
+ "fmla v25.4s, v13.4s, v6.s[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "fmla v18.4s, v14.4s, v4.s[1]\n"
+ "fmla v22.4s, v14.4s, v5.s[1]\n"
+ "fmla v26.4s, v14.4s, v6.s[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "fmla v19.4s, v15.4s, v4.s[1]\n"
+ "fmla v23.4s, v15.4s, v5.s[1]\n"
+ "fmla v27.4s, v15.4s, v6.s[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ "fmla v16.4s, v8.4s, v4.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ "fmla v20.4s, v8.4s, v5.s[2]\n"
+ "fmla v24.4s, v8.4s, v6.s[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ "fmla v17.4s, v9.4s, v4.s[2]\n"
+ "fmla v21.4s, v9.4s, v5.s[2]\n"
+ "fmla v25.4s, v9.4s, v6.s[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ "fmla v18.4s, v10.4s, v4.s[2]\n"
+ "fmla v22.4s, v10.4s, v5.s[2]\n"
+ "fmla v26.4s, v10.4s, v6.s[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ "fmla v19.4s, v11.4s, v4.s[2]\n"
+ "fmla v23.4s, v11.4s, v5.s[2]\n"
+ "fmla v27.4s, v11.4s, v6.s[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ "fmla v16.4s, v12.4s, v4.s[3]\n"
+ "fmla v20.4s, v12.4s, v5.s[3]\n"
+ "fmla v24.4s, v12.4s, v6.s[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ "fmla v17.4s, v13.4s, v4.s[3]\n"
+ "fmla v21.4s, v13.4s, v5.s[3]\n"
+ "fmla v25.4s, v13.4s, v6.s[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ "fmla v18.4s, v14.4s, v4.s[3]\n"
+ "fmla v22.4s, v14.4s, v5.s[3]\n"
+ "fmla v26.4s, v14.4s, v6.s[3]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ "fmla v19.4s, v15.4s, v4.s[3]\n"
+ "fmla v23.4s, v15.4s, v5.s[3]\n"
+ "fmla v27.4s, v15.4s, v6.s[3]\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+ "prfm PSTL1KEEP, [c_ptr1]\n"
+ "prfm PSTL1KEEP, [c_ptr2]\n"
+ "cbz %[regs], 4f\n"
+ "fmla v16.4s, v8.4s, v0.s[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ "fmla v20.4s, v8.4s, v1.s[0]\n"
+ "ldr q5, [a_ptr1]\n"
+ "fmla v24.4s, v8.4s, v2.s[0]\n"
+ "ldr q6, [a_ptr2]\n"
+ "fmla v17.4s, v9.4s, v0.s[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "fmla v21.4s, v9.4s, v1.s[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "fmla v25.4s, v9.4s, v2.s[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "fmla v18.4s, v10.4s, v0.s[0]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "fmla v22.4s, v10.4s, v1.s[0]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "fmla v26.4s, v10.4s, v2.s[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "fmla v19.4s, v11.4s, v0.s[0]\n"
+ "fmla v23.4s, v11.4s, v1.s[0]\n"
+ "fmla v27.4s, v11.4s, v2.s[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "fmla v16.4s, v12.4s, v0.s[1]\n"
+ "fmla v20.4s, v12.4s, v1.s[1]\n"
+ "fmla v24.4s, v12.4s, v2.s[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "fmla v17.4s, v13.4s, v0.s[1]\n"
+ "fmla v21.4s, v13.4s, v1.s[1]\n"
+ "fmla v25.4s, v13.4s, v2.s[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "fmla v18.4s, v14.4s, v0.s[1]\n"
+ "fmla v22.4s, v14.4s, v1.s[1]\n"
+ "fmla v26.4s, v14.4s, v2.s[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "fmla v19.4s, v15.4s, v0.s[1]\n"
+ "fmla v23.4s, v15.4s, v1.s[1]\n"
+ "fmla v27.4s, v15.4s, v2.s[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ "fmla v16.4s, v8.4s, v0.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ "fmla v20.4s, v8.4s, v1.s[2]\n"
+ "fmla v24.4s, v8.4s, v2.s[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ "fmla v17.4s, v9.4s, v0.s[2]\n"
+ "fmla v21.4s, v9.4s, v1.s[2]\n"
+ "fmla v25.4s, v9.4s, v2.s[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ "fmla v18.4s, v10.4s, v0.s[2]\n"
+ "fmla v22.4s, v10.4s, v1.s[2]\n"
+ "fmla v26.4s, v10.4s, v2.s[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ "fmla v19.4s, v11.4s, v0.s[2]\n"
+ "fmla v23.4s, v11.4s, v1.s[2]\n"
+ "fmla v27.4s, v11.4s, v2.s[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ "fmla v16.4s, v12.4s, v0.s[3]\n"
+ "fmla v20.4s, v12.4s, v1.s[3]\n"
+ "fmla v24.4s, v12.4s, v2.s[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ "fmla v17.4s, v13.4s, v0.s[3]\n"
+ "fmla v21.4s, v13.4s, v1.s[3]\n"
+ "fmla v25.4s, v13.4s, v2.s[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ "fmla v18.4s, v14.4s, v0.s[3]\n"
+ "fmla v22.4s, v14.4s, v1.s[3]\n"
+ "fmla v26.4s, v14.4s, v2.s[3]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ "fmla v19.4s, v15.4s, v0.s[3]\n"
+ "fmla v23.4s, v15.4s, v1.s[3]\n"
+ "fmla v27.4s, v15.4s, v2.s[3]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ "fmla v16.4s, v8.4s, v4.s[0]\n"
+ "fmla v20.4s, v8.4s, v5.s[0]\n"
+ "fmla v24.4s, v8.4s, v6.s[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "fmla v17.4s, v9.4s, v4.s[0]\n"
+ "fmla v21.4s, v9.4s, v5.s[0]\n"
+ "fmla v25.4s, v9.4s, v6.s[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "fmla v18.4s, v10.4s, v4.s[0]\n"
+ "fmla v22.4s, v10.4s, v5.s[0]\n"
+ "fmla v26.4s, v10.4s, v6.s[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "fmla v19.4s, v11.4s, v4.s[0]\n"
+ "fmla v23.4s, v11.4s, v5.s[0]\n"
+ "fmla v27.4s, v11.4s, v6.s[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "fmla v16.4s, v12.4s, v4.s[1]\n"
+ "fmla v20.4s, v12.4s, v5.s[1]\n"
+ "fmla v24.4s, v12.4s, v6.s[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "fmla v17.4s, v13.4s, v4.s[1]\n"
+ "fmla v21.4s, v13.4s, v5.s[1]\n"
+ "fmla v25.4s, v13.4s, v6.s[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "fmla v18.4s, v14.4s, v4.s[1]\n"
+ "fmla v22.4s, v14.4s, v5.s[1]\n"
+ "fmla v26.4s, v14.4s, v6.s[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "fmla v19.4s, v15.4s, v4.s[1]\n"
+ "fmla v23.4s, v15.4s, v5.s[1]\n"
+ "fmla v27.4s, v15.4s, v6.s[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ "fmla v16.4s, v8.4s, v4.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "fmla v20.4s, v8.4s, v5.s[2]\n"
+ "fmla v24.4s, v8.4s, v6.s[2]\n"
+ "fmla v17.4s, v9.4s, v4.s[2]\n"
+ "fmla v21.4s, v9.4s, v5.s[2]\n"
+ "fmla v25.4s, v9.4s, v6.s[2]\n"
+ "fmla v18.4s, v10.4s, v4.s[2]\n"
+ "fmla v22.4s, v10.4s, v5.s[2]\n"
+ "fmla v26.4s, v10.4s, v6.s[2]\n"
+ "fmla v19.4s, v11.4s, v4.s[2]\n"
+ "fmla v23.4s, v11.4s, v5.s[2]\n"
+ "fmla v27.4s, v11.4s, v6.s[2]\n"
+ "fmla v16.4s, v12.4s, v4.s[3]\n"
+ "fmla v20.4s, v12.4s, v5.s[3]\n"
+ "fmla v24.4s, v12.4s, v6.s[3]\n"
+ "fmla v17.4s, v13.4s, v4.s[3]\n"
+ "fmla v21.4s, v13.4s, v5.s[3]\n"
+ "fmla v25.4s, v13.4s, v6.s[3]\n"
+ "fmla v18.4s, v14.4s, v4.s[3]\n"
+ "fmla v22.4s, v14.4s, v5.s[3]\n"
+ "fmla v26.4s, v14.4s, v6.s[3]\n"
+ "fmla v19.4s, v15.4s, v4.s[3]\n"
+ "fmla v23.4s, v15.4s, v5.s[3]\n"
+ "fmla v27.4s, v15.4s, v6.s[3]\n"
+ "b 5f\n"
+ "4:\n"
+ "fmla v16.4s, v8.4s, v0.s[0]\n"
+ "fmla v20.4s, v8.4s, v1.s[0]\n"
+ "fmla v24.4s, v8.4s, v2.s[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "fmla v17.4s, v9.4s, v0.s[0]\n"
+ "fmla v21.4s, v9.4s, v1.s[0]\n"
+ "fmla v25.4s, v9.4s, v2.s[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "fmla v18.4s, v10.4s, v0.s[0]\n"
+ "fmla v22.4s, v10.4s, v1.s[0]\n"
+ "fmla v26.4s, v10.4s, v2.s[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "fmla v19.4s, v11.4s, v0.s[0]\n"
+ "fmla v23.4s, v11.4s, v1.s[0]\n"
+ "fmla v27.4s, v11.4s, v2.s[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "fmla v16.4s, v12.4s, v0.s[1]\n"
+ "fmla v20.4s, v12.4s, v1.s[1]\n"
+ "fmla v24.4s, v12.4s, v2.s[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "fmla v17.4s, v13.4s, v0.s[1]\n"
+ "fmla v21.4s, v13.4s, v1.s[1]\n"
+ "fmla v25.4s, v13.4s, v2.s[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "fmla v18.4s, v14.4s, v0.s[1]\n"
+ "fmla v22.4s, v14.4s, v1.s[1]\n"
+ "fmla v26.4s, v14.4s, v2.s[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "fmla v19.4s, v15.4s, v0.s[1]\n"
+ "fmla v23.4s, v15.4s, v1.s[1]\n"
+ "fmla v27.4s, v15.4s, v2.s[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ "fmla v16.4s, v8.4s, v0.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "fmla v20.4s, v8.4s, v1.s[2]\n"
+ "fmla v24.4s, v8.4s, v2.s[2]\n"
+ "fmla v17.4s, v9.4s, v0.s[2]\n"
+ "fmla v21.4s, v9.4s, v1.s[2]\n"
+ "fmla v25.4s, v9.4s, v2.s[2]\n"
+ "fmla v18.4s, v10.4s, v0.s[2]\n"
+ "fmla v22.4s, v10.4s, v1.s[2]\n"
+ "fmla v26.4s, v10.4s, v2.s[2]\n"
+ "fmla v19.4s, v11.4s, v0.s[2]\n"
+ "fmla v23.4s, v11.4s, v1.s[2]\n"
+ "fmla v27.4s, v11.4s, v2.s[2]\n"
+ "fmla v16.4s, v12.4s, v0.s[3]\n"
+ "fmla v20.4s, v12.4s, v1.s[3]\n"
+ "fmla v24.4s, v12.4s, v2.s[3]\n"
+ "fmla v17.4s, v13.4s, v0.s[3]\n"
+ "fmla v21.4s, v13.4s, v1.s[3]\n"
+ "fmla v25.4s, v13.4s, v2.s[3]\n"
+ "fmla v18.4s, v14.4s, v0.s[3]\n"
+ "fmla v22.4s, v14.4s, v1.s[3]\n"
+ "fmla v26.4s, v14.4s, v2.s[3]\n"
+ "fmla v19.4s, v15.4s, v0.s[3]\n"
+ "fmla v23.4s, v15.4s, v1.s[3]\n"
+ "fmla v27.4s, v15.4s, v2.s[3]\n"
+ "5:\n"
+ "cbz %[blocks], 6f\n"
+ "7:\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr s0, [%[a_ptr0]]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x4\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
+ "fmla v16.4s, v8.4s, v0.s[0]\n"
+ "ldr s1, [a_ptr1]\n"
+ "fmla v17.4s, v9.4s, v0.s[0]\n"
+ "add a_ptr1, a_ptr1, #0x4\n"
+ "fmla v18.4s, v10.4s, v0.s[0]\n"
+ "ldr s2, [a_ptr2]\n"
+ "fmla v20.4s, v8.4s, v1.s[0]\n"
+ "add a_ptr2, a_ptr2, #0x4\n"
+ "fmla v21.4s, v9.4s, v1.s[0]\n"
+ "fmla v24.4s, v8.4s, v2.s[0]\n"
+ "fmla v25.4s, v9.4s, v2.s[0]\n"
+ "fmla v22.4s, v10.4s, v1.s[0]\n"
+ "fmla v26.4s, v10.4s, v2.s[0]\n"
+ "fmla v19.4s, v11.4s, v0.s[0]\n"
+ "fmla v23.4s, v11.4s, v1.s[0]\n"
+ "fmla v27.4s, v11.4s, v2.s[0]\n"
+ "b.ne 7b\n"
+ "6:\n"
+ "str q16, [%[c_ptr0]]\n"
+ "str q17, [%[c_ptr0], #0x10]\n"
+ "str q18, [%[c_ptr0], #0x20]\n"
+ "str q19, [%[c_ptr0], #0x30]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x40\n"
+ "str q20, [c_ptr1]\n"
+ "str q21, [c_ptr1, #0x10]\n"
+ "str q22, [c_ptr1, #0x20]\n"
+ "str q23, [c_ptr1, #0x30]\n"
+ "str q24, [c_ptr2]\n"
+ "str q25, [c_ptr2, #0x10]\n"
+ "str q26, [c_ptr2, #0x20]\n"
+ "str q27, [c_ptr2, #0x30]\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory"
+ );
+ break;
+ default:
+ case 4:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "cbz %[beta0], 1f\n"
+ "movi v16.4s, #0\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "movi v17.4s, #0\n"
+ "ldr q1, [a_ptr1]\n"
+ "movi v18.4s, #0\n"
+ "ldr q2, [a_ptr2]\n"
+ "movi v19.4s, #0\n"
+ "ldr q3, [a_ptr3]\n"
+ "movi v20.4s, #0\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "movi v21.4s, #0\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "movi v22.4s, #0\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "movi v23.4s, #0\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "movi v24.4s, #0\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "movi v25.4s, #0\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "movi v26.4s, #0\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "movi v27.4s, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "movi v28.4s, #0\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "movi v29.4s, #0\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "movi v30.4s, #0\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "movi v31.4s, #0\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1r {v15.4s}, [%[betaptr]]\n"
+ "ldr q16, [%[c_ptr0]]\n"
+ "ldr q17, [%[c_ptr0], #0x10]\n"
+ "ldr q18, [%[c_ptr0], #0x20]\n"
+ "ldr q19, [%[c_ptr0], #0x30]\n"
+ "fmul v16.4s, v16.4s, v15.4s\n"
+ "ldr q20, [c_ptr1]\n"
+ "fmul v17.4s, v17.4s, v15.4s\n"
+ "ldr q21, [c_ptr1, #0x10]\n"
+ "fmul v18.4s, v18.4s, v15.4s\n"
+ "ldr q22, [c_ptr1, #0x20]\n"
+ "fmul v19.4s, v19.4s, v15.4s\n"
+ "ldr q23, [c_ptr1, #0x30]\n"
+ "fmul v20.4s, v20.4s, v15.4s\n"
+ "ldr q24, [c_ptr2]\n"
+ "fmul v21.4s, v21.4s, v15.4s\n"
+ "ldr q25, [c_ptr2, #0x10]\n"
+ "fmul v22.4s, v22.4s, v15.4s\n"
+ "ldr q26, [c_ptr2, #0x20]\n"
+ "fmul v23.4s, v23.4s, v15.4s\n"
+ "ldr q27, [c_ptr2, #0x30]\n"
+ "fmul v24.4s, v24.4s, v15.4s\n"
+ "ldr q28, [c_ptr3]\n"
+ "fmul v25.4s, v25.4s, v15.4s\n"
+ "ldr q29, [c_ptr3, #0x10]\n"
+ "fmul v26.4s, v26.4s, v15.4s\n"
+ "ldr q30, [c_ptr3, #0x20]\n"
+ "fmul v27.4s, v27.4s, v15.4s\n"
+ "ldr q31, [c_ptr3, #0x30]\n"
+ "fmul v28.4s, v28.4s, v15.4s\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "fmul v29.4s, v29.4s, v15.4s\n"
+ "ldr q1, [a_ptr1]\n"
+ "fmul v30.4s, v30.4s, v15.4s\n"
+ "ldr q2, [a_ptr2]\n"
+ "fmul v31.4s, v31.4s, v15.4s\n"
+ "ldr q3, [a_ptr3]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ "fmla v16.4s, v8.4s, v0.s[0]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ "fmla v20.4s, v8.4s, v1.s[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ "fmla v24.4s, v8.4s, v2.s[0]\n"
+ "ldr q5, [a_ptr1]\n"
+ "fmla v28.4s, v8.4s, v3.s[0]\n"
+ "ldr q6, [a_ptr2]\n"
+ "fmla v17.4s, v9.4s, v0.s[0]\n"
+ "ldr q7, [a_ptr3]\n"
+ "fmla v21.4s, v9.4s, v1.s[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "fmla v25.4s, v9.4s, v2.s[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "fmla v29.4s, v9.4s, v3.s[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "fmla v18.4s, v10.4s, v0.s[0]\n"
+ "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+ "fmla v22.4s, v10.4s, v1.s[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "fmla v26.4s, v10.4s, v2.s[0]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ "fmla v30.4s, v10.4s, v3.s[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "fmla v19.4s, v11.4s, v0.s[0]\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
+ "fmla v23.4s, v11.4s, v1.s[0]\n"
+ "add a_ptr3, a_ptr3, #0x20\n"
+ "fmla v27.4s, v11.4s, v2.s[0]\n"
+ "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+ "fmla v31.4s, v11.4s, v3.s[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "fmla v16.4s, v12.4s, v0.s[1]\n"
+ "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
+ "fmla v20.4s, v12.4s, v1.s[1]\n"
+ "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
+ "fmla v24.4s, v12.4s, v2.s[1]\n"
+ "fmla v28.4s, v12.4s, v3.s[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "fmla v17.4s, v13.4s, v0.s[1]\n"
+ "fmla v21.4s, v13.4s, v1.s[1]\n"
+ "fmla v25.4s, v13.4s, v2.s[1]\n"
+ "fmla v29.4s, v13.4s, v3.s[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "fmla v18.4s, v14.4s, v0.s[1]\n"
+ "fmla v22.4s, v14.4s, v1.s[1]\n"
+ "fmla v26.4s, v14.4s, v2.s[1]\n"
+ "fmla v30.4s, v14.4s, v3.s[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "fmla v19.4s, v15.4s, v0.s[1]\n"
+ "fmla v23.4s, v15.4s, v1.s[1]\n"
+ "fmla v27.4s, v15.4s, v2.s[1]\n"
+ "fmla v31.4s, v15.4s, v3.s[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ "fmla v16.4s, v8.4s, v0.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ "fmla v20.4s, v8.4s, v1.s[2]\n"
+ "fmla v24.4s, v8.4s, v2.s[2]\n"
+ "fmla v28.4s, v8.4s, v3.s[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ "fmla v17.4s, v9.4s, v0.s[2]\n"
+ "fmla v21.4s, v9.4s, v1.s[2]\n"
+ "fmla v25.4s, v9.4s, v2.s[2]\n"
+ "fmla v29.4s, v9.4s, v3.s[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ "fmla v18.4s, v10.4s, v0.s[2]\n"
+ "fmla v22.4s, v10.4s, v1.s[2]\n"
+ "fmla v26.4s, v10.4s, v2.s[2]\n"
+ "fmla v30.4s, v10.4s, v3.s[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ "fmla v19.4s, v11.4s, v0.s[2]\n"
+ "fmla v23.4s, v11.4s, v1.s[2]\n"
+ "fmla v27.4s, v11.4s, v2.s[2]\n"
+ "fmla v31.4s, v11.4s, v3.s[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ "fmla v16.4s, v12.4s, v0.s[3]\n"
+ "fmla v20.4s, v12.4s, v1.s[3]\n"
+ "fmla v24.4s, v12.4s, v2.s[3]\n"
+ "fmla v28.4s, v12.4s, v3.s[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ "fmla v17.4s, v13.4s, v0.s[3]\n"
+ "fmla v21.4s, v13.4s, v1.s[3]\n"
+ "fmla v25.4s, v13.4s, v2.s[3]\n"
+ "fmla v29.4s, v13.4s, v3.s[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ "fmla v18.4s, v14.4s, v0.s[3]\n"
+ "fmla v22.4s, v14.4s, v1.s[3]\n"
+ "fmla v26.4s, v14.4s, v2.s[3]\n"
+ "fmla v30.4s, v14.4s, v3.s[3]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ "fmla v19.4s, v15.4s, v0.s[3]\n"
+ "ldr q0, [%[a_ptr0], #-0x10]\n"
+ "fmla v23.4s, v15.4s, v1.s[3]\n"
+ "ldr q1, [a_ptr1, #-0x10]\n"
+ "fmla v27.4s, v15.4s, v2.s[3]\n"
+ "ldr q2, [a_ptr2, #-0x10]\n"
+ "fmla v31.4s, v15.4s, v3.s[3]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ "fmla v16.4s, v8.4s, v4.s[0]\n"
+ "ldr q3, [a_ptr3, #-0x10]\n"
+ "fmla v20.4s, v8.4s, v5.s[0]\n"
+ "fmla v24.4s, v8.4s, v6.s[0]\n"
+ "fmla v28.4s, v8.4s, v7.s[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "fmla v17.4s, v9.4s, v4.s[0]\n"
+ "fmla v21.4s, v9.4s, v5.s[0]\n"
+ "fmla v25.4s, v9.4s, v6.s[0]\n"
+ "fmla v29.4s, v9.4s, v7.s[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "fmla v18.4s, v10.4s, v4.s[0]\n"
+ "fmla v22.4s, v10.4s, v5.s[0]\n"
+ "fmla v26.4s, v10.4s, v6.s[0]\n"
+ "fmla v30.4s, v10.4s, v7.s[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "fmla v19.4s, v11.4s, v4.s[0]\n"
+ "fmla v23.4s, v11.4s, v5.s[0]\n"
+ "fmla v27.4s, v11.4s, v6.s[0]\n"
+ "fmla v31.4s, v11.4s, v7.s[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "fmla v16.4s, v12.4s, v4.s[1]\n"
+ "fmla v20.4s, v12.4s, v5.s[1]\n"
+ "fmla v24.4s, v12.4s, v6.s[1]\n"
+ "fmla v28.4s, v12.4s, v7.s[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "fmla v17.4s, v13.4s, v4.s[1]\n"
+ "fmla v21.4s, v13.4s, v5.s[1]\n"
+ "fmla v25.4s, v13.4s, v6.s[1]\n"
+ "fmla v29.4s, v13.4s, v7.s[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "fmla v18.4s, v14.4s, v4.s[1]\n"
+ "fmla v22.4s, v14.4s, v5.s[1]\n"
+ "fmla v26.4s, v14.4s, v6.s[1]\n"
+ "fmla v30.4s, v14.4s, v7.s[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "fmla v19.4s, v15.4s, v4.s[1]\n"
+ "fmla v23.4s, v15.4s, v5.s[1]\n"
+ "fmla v27.4s, v15.4s, v6.s[1]\n"
+ "fmla v31.4s, v15.4s, v7.s[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ "fmla v16.4s, v8.4s, v4.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ "fmla v20.4s, v8.4s, v5.s[2]\n"
+ "fmla v24.4s, v8.4s, v6.s[2]\n"
+ "fmla v28.4s, v8.4s, v7.s[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ "fmla v17.4s, v9.4s, v4.s[2]\n"
+ "fmla v21.4s, v9.4s, v5.s[2]\n"
+ "fmla v25.4s, v9.4s, v6.s[2]\n"
+ "fmla v29.4s, v9.4s, v7.s[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ "fmla v18.4s, v10.4s, v4.s[2]\n"
+ "fmla v22.4s, v10.4s, v5.s[2]\n"
+ "fmla v26.4s, v10.4s, v6.s[2]\n"
+ "fmla v30.4s, v10.4s, v7.s[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ "fmla v19.4s, v11.4s, v4.s[2]\n"
+ "fmla v23.4s, v11.4s, v5.s[2]\n"
+ "fmla v27.4s, v11.4s, v6.s[2]\n"
+ "fmla v31.4s, v11.4s, v7.s[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ "fmla v16.4s, v12.4s, v4.s[3]\n"
+ "fmla v20.4s, v12.4s, v5.s[3]\n"
+ "fmla v24.4s, v12.4s, v6.s[3]\n"
+ "fmla v28.4s, v12.4s, v7.s[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ "fmla v17.4s, v13.4s, v4.s[3]\n"
+ "fmla v21.4s, v13.4s, v5.s[3]\n"
+ "fmla v25.4s, v13.4s, v6.s[3]\n"
+ "fmla v29.4s, v13.4s, v7.s[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ "fmla v18.4s, v14.4s, v4.s[3]\n"
+ "fmla v22.4s, v14.4s, v5.s[3]\n"
+ "fmla v26.4s, v14.4s, v6.s[3]\n"
+ "fmla v30.4s, v14.4s, v7.s[3]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ "fmla v19.4s, v15.4s, v4.s[3]\n"
+ "fmla v23.4s, v15.4s, v5.s[3]\n"
+ "fmla v27.4s, v15.4s, v6.s[3]\n"
+ "fmla v31.4s, v15.4s, v7.s[3]\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+ "prfm PSTL1KEEP, [c_ptr1]\n"
+ "prfm PSTL1KEEP, [c_ptr2]\n"
+ "prfm PSTL1KEEP, [c_ptr3]\n"
+ "cbz %[regs], 4f\n"
+ "fmla v16.4s, v8.4s, v0.s[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ "fmla v20.4s, v8.4s, v1.s[0]\n"
+ "ldr q5, [a_ptr1]\n"
+ "fmla v24.4s, v8.4s, v2.s[0]\n"
+ "ldr q6, [a_ptr2]\n"
+ "fmla v28.4s, v8.4s, v3.s[0]\n"
+ "ldr q7, [a_ptr3]\n"
+ "fmla v17.4s, v9.4s, v0.s[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "fmla v21.4s, v9.4s, v1.s[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "fmla v25.4s, v9.4s, v2.s[0]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "fmla v29.4s, v9.4s, v3.s[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "fmla v18.4s, v10.4s, v0.s[0]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "fmla v22.4s, v10.4s, v1.s[0]\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "fmla v26.4s, v10.4s, v2.s[0]\n"
+ "fmla v30.4s, v10.4s, v3.s[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "fmla v19.4s, v11.4s, v0.s[0]\n"
+ "fmla v23.4s, v11.4s, v1.s[0]\n"
+ "fmla v27.4s, v11.4s, v2.s[0]\n"
+ "fmla v31.4s, v11.4s, v3.s[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "fmla v16.4s, v12.4s, v0.s[1]\n"
+ "fmla v20.4s, v12.4s, v1.s[1]\n"
+ "fmla v24.4s, v12.4s, v2.s[1]\n"
+ "fmla v28.4s, v12.4s, v3.s[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "fmla v17.4s, v13.4s, v0.s[1]\n"
+ "fmla v21.4s, v13.4s, v1.s[1]\n"
+ "fmla v25.4s, v13.4s, v2.s[1]\n"
+ "fmla v29.4s, v13.4s, v3.s[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "fmla v18.4s, v14.4s, v0.s[1]\n"
+ "fmla v22.4s, v14.4s, v1.s[1]\n"
+ "fmla v26.4s, v14.4s, v2.s[1]\n"
+ "fmla v30.4s, v14.4s, v3.s[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "fmla v19.4s, v15.4s, v0.s[1]\n"
+ "fmla v23.4s, v15.4s, v1.s[1]\n"
+ "fmla v27.4s, v15.4s, v2.s[1]\n"
+ "fmla v31.4s, v15.4s, v3.s[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ "fmla v16.4s, v8.4s, v0.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ "fmla v20.4s, v8.4s, v1.s[2]\n"
+ "fmla v24.4s, v8.4s, v2.s[2]\n"
+ "fmla v28.4s, v8.4s, v3.s[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ "fmla v17.4s, v9.4s, v0.s[2]\n"
+ "fmla v21.4s, v9.4s, v1.s[2]\n"
+ "fmla v25.4s, v9.4s, v2.s[2]\n"
+ "fmla v29.4s, v9.4s, v3.s[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ "fmla v18.4s, v10.4s, v0.s[2]\n"
+ "fmla v22.4s, v10.4s, v1.s[2]\n"
+ "fmla v26.4s, v10.4s, v2.s[2]\n"
+ "fmla v30.4s, v10.4s, v3.s[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ "fmla v19.4s, v11.4s, v0.s[2]\n"
+ "fmla v23.4s, v11.4s, v1.s[2]\n"
+ "fmla v27.4s, v11.4s, v2.s[2]\n"
+ "fmla v31.4s, v11.4s, v3.s[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ "fmla v16.4s, v12.4s, v0.s[3]\n"
+ "fmla v20.4s, v12.4s, v1.s[3]\n"
+ "fmla v24.4s, v12.4s, v2.s[3]\n"
+ "fmla v28.4s, v12.4s, v3.s[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ "fmla v17.4s, v13.4s, v0.s[3]\n"
+ "fmla v21.4s, v13.4s, v1.s[3]\n"
+ "fmla v25.4s, v13.4s, v2.s[3]\n"
+ "fmla v29.4s, v13.4s, v3.s[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ "fmla v18.4s, v14.4s, v0.s[3]\n"
+ "fmla v22.4s, v14.4s, v1.s[3]\n"
+ "fmla v26.4s, v14.4s, v2.s[3]\n"
+ "fmla v30.4s, v14.4s, v3.s[3]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ "fmla v19.4s, v15.4s, v0.s[3]\n"
+ "fmla v23.4s, v15.4s, v1.s[3]\n"
+ "fmla v27.4s, v15.4s, v2.s[3]\n"
+ "fmla v31.4s, v15.4s, v3.s[3]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ "fmla v16.4s, v8.4s, v4.s[0]\n"
+ "fmla v20.4s, v8.4s, v5.s[0]\n"
+ "fmla v24.4s, v8.4s, v6.s[0]\n"
+ "fmla v28.4s, v8.4s, v7.s[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "fmla v17.4s, v9.4s, v4.s[0]\n"
+ "fmla v21.4s, v9.4s, v5.s[0]\n"
+ "fmla v25.4s, v9.4s, v6.s[0]\n"
+ "fmla v29.4s, v9.4s, v7.s[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "fmla v18.4s, v10.4s, v4.s[0]\n"
+ "fmla v22.4s, v10.4s, v5.s[0]\n"
+ "fmla v26.4s, v10.4s, v6.s[0]\n"
+ "fmla v30.4s, v10.4s, v7.s[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "fmla v19.4s, v11.4s, v4.s[0]\n"
+ "fmla v23.4s, v11.4s, v5.s[0]\n"
+ "fmla v27.4s, v11.4s, v6.s[0]\n"
+ "fmla v31.4s, v11.4s, v7.s[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "fmla v16.4s, v12.4s, v4.s[1]\n"
+ "fmla v20.4s, v12.4s, v5.s[1]\n"
+ "fmla v24.4s, v12.4s, v6.s[1]\n"
+ "fmla v28.4s, v12.4s, v7.s[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "fmla v17.4s, v13.4s, v4.s[1]\n"
+ "fmla v21.4s, v13.4s, v5.s[1]\n"
+ "fmla v25.4s, v13.4s, v6.s[1]\n"
+ "fmla v29.4s, v13.4s, v7.s[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "fmla v18.4s, v14.4s, v4.s[1]\n"
+ "fmla v22.4s, v14.4s, v5.s[1]\n"
+ "fmla v26.4s, v14.4s, v6.s[1]\n"
+ "fmla v30.4s, v14.4s, v7.s[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "fmla v19.4s, v15.4s, v4.s[1]\n"
+ "fmla v23.4s, v15.4s, v5.s[1]\n"
+ "fmla v27.4s, v15.4s, v6.s[1]\n"
+ "fmla v31.4s, v15.4s, v7.s[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ "fmla v16.4s, v8.4s, v4.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "fmla v20.4s, v8.4s, v5.s[2]\n"
+ "fmla v24.4s, v8.4s, v6.s[2]\n"
+ "fmla v28.4s, v8.4s, v7.s[2]\n"
+ "fmla v17.4s, v9.4s, v4.s[2]\n"
+ "fmla v21.4s, v9.4s, v5.s[2]\n"
+ "fmla v25.4s, v9.4s, v6.s[2]\n"
+ "fmla v29.4s, v9.4s, v7.s[2]\n"
+ "fmla v18.4s, v10.4s, v4.s[2]\n"
+ "fmla v22.4s, v10.4s, v5.s[2]\n"
+ "fmla v26.4s, v10.4s, v6.s[2]\n"
+ "fmla v30.4s, v10.4s, v7.s[2]\n"
+ "fmla v19.4s, v11.4s, v4.s[2]\n"
+ "fmla v23.4s, v11.4s, v5.s[2]\n"
+ "fmla v27.4s, v11.4s, v6.s[2]\n"
+ "fmla v31.4s, v11.4s, v7.s[2]\n"
+ "fmla v16.4s, v12.4s, v4.s[3]\n"
+ "fmla v20.4s, v12.4s, v5.s[3]\n"
+ "fmla v24.4s, v12.4s, v6.s[3]\n"
+ "fmla v28.4s, v12.4s, v7.s[3]\n"
+ "fmla v17.4s, v13.4s, v4.s[3]\n"
+ "fmla v21.4s, v13.4s, v5.s[3]\n"
+ "fmla v25.4s, v13.4s, v6.s[3]\n"
+ "fmla v29.4s, v13.4s, v7.s[3]\n"
+ "fmla v18.4s, v14.4s, v4.s[3]\n"
+ "fmla v22.4s, v14.4s, v5.s[3]\n"
+ "fmla v26.4s, v14.4s, v6.s[3]\n"
+ "fmla v30.4s, v14.4s, v7.s[3]\n"
+ "fmla v19.4s, v15.4s, v4.s[3]\n"
+ "fmla v23.4s, v15.4s, v5.s[3]\n"
+ "fmla v27.4s, v15.4s, v6.s[3]\n"
+ "fmla v31.4s, v15.4s, v7.s[3]\n"
+ "b 5f\n"
+ "4:\n"
+ "fmla v16.4s, v8.4s, v0.s[0]\n"
+ "fmla v20.4s, v8.4s, v1.s[0]\n"
+ "fmla v24.4s, v8.4s, v2.s[0]\n"
+ "fmla v28.4s, v8.4s, v3.s[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "fmla v17.4s, v9.4s, v0.s[0]\n"
+ "fmla v21.4s, v9.4s, v1.s[0]\n"
+ "fmla v25.4s, v9.4s, v2.s[0]\n"
+ "fmla v29.4s, v9.4s, v3.s[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "fmla v18.4s, v10.4s, v0.s[0]\n"
+ "fmla v22.4s, v10.4s, v1.s[0]\n"
+ "fmla v26.4s, v10.4s, v2.s[0]\n"
+ "fmla v30.4s, v10.4s, v3.s[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "fmla v19.4s, v11.4s, v0.s[0]\n"
+ "fmla v23.4s, v11.4s, v1.s[0]\n"
+ "fmla v27.4s, v11.4s, v2.s[0]\n"
+ "fmla v31.4s, v11.4s, v3.s[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "fmla v16.4s, v12.4s, v0.s[1]\n"
+ "fmla v20.4s, v12.4s, v1.s[1]\n"
+ "fmla v24.4s, v12.4s, v2.s[1]\n"
+ "fmla v28.4s, v12.4s, v3.s[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "fmla v17.4s, v13.4s, v0.s[1]\n"
+ "fmla v21.4s, v13.4s, v1.s[1]\n"
+ "fmla v25.4s, v13.4s, v2.s[1]\n"
+ "fmla v29.4s, v13.4s, v3.s[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "fmla v18.4s, v14.4s, v0.s[1]\n"
+ "fmla v22.4s, v14.4s, v1.s[1]\n"
+ "fmla v26.4s, v14.4s, v2.s[1]\n"
+ "fmla v30.4s, v14.4s, v3.s[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "fmla v19.4s, v15.4s, v0.s[1]\n"
+ "fmla v23.4s, v15.4s, v1.s[1]\n"
+ "fmla v27.4s, v15.4s, v2.s[1]\n"
+ "fmla v31.4s, v15.4s, v3.s[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ "fmla v16.4s, v8.4s, v0.s[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "fmla v20.4s, v8.4s, v1.s[2]\n"
+ "fmla v24.4s, v8.4s, v2.s[2]\n"
+ "fmla v28.4s, v8.4s, v3.s[2]\n"
+ "fmla v17.4s, v9.4s, v0.s[2]\n"
+ "fmla v21.4s, v9.4s, v1.s[2]\n"
+ "fmla v25.4s, v9.4s, v2.s[2]\n"
+ "fmla v29.4s, v9.4s, v3.s[2]\n"
+ "fmla v18.4s, v10.4s, v0.s[2]\n"
+ "fmla v22.4s, v10.4s, v1.s[2]\n"
+ "fmla v26.4s, v10.4s, v2.s[2]\n"
+ "fmla v30.4s, v10.4s, v3.s[2]\n"
+ "fmla v19.4s, v11.4s, v0.s[2]\n"
+ "fmla v23.4s, v11.4s, v1.s[2]\n"
+ "fmla v27.4s, v11.4s, v2.s[2]\n"
+ "fmla v31.4s, v11.4s, v3.s[2]\n"
+ "fmla v16.4s, v12.4s, v0.s[3]\n"
+ "fmla v20.4s, v12.4s, v1.s[3]\n"
+ "fmla v24.4s, v12.4s, v2.s[3]\n"
+ "fmla v28.4s, v12.4s, v3.s[3]\n"
+ "fmla v17.4s, v13.4s, v0.s[3]\n"
+ "fmla v21.4s, v13.4s, v1.s[3]\n"
+ "fmla v25.4s, v13.4s, v2.s[3]\n"
+ "fmla v29.4s, v13.4s, v3.s[3]\n"
+ "fmla v18.4s, v14.4s, v0.s[3]\n"
+ "fmla v22.4s, v14.4s, v1.s[3]\n"
+ "fmla v26.4s, v14.4s, v2.s[3]\n"
+ "fmla v30.4s, v14.4s, v3.s[3]\n"
+ "fmla v19.4s, v15.4s, v0.s[3]\n"
+ "fmla v23.4s, v15.4s, v1.s[3]\n"
+ "fmla v27.4s, v15.4s, v2.s[3]\n"
+ "fmla v31.4s, v15.4s, v3.s[3]\n"
+ "5:\n"
+ "cbz %[blocks], 6f\n"
+ "7:\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr s0, [%[a_ptr0]]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x4\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
+ "fmla v16.4s, v8.4s, v0.s[0]\n"
+ "ldr s1, [a_ptr1]\n"
+ "fmla v17.4s, v9.4s, v0.s[0]\n"
+ "add a_ptr1, a_ptr1, #0x4\n"
+ "fmla v18.4s, v10.4s, v0.s[0]\n"
+ "ldr s2, [a_ptr2]\n"
+ "fmla v20.4s, v8.4s, v1.s[0]\n"
+ "add a_ptr2, a_ptr2, #0x4\n"
+ "fmla v21.4s, v9.4s, v1.s[0]\n"
+ "ldr s3, [a_ptr3]\n"
+ "fmla v24.4s, v8.4s, v2.s[0]\n"
+ "add a_ptr3, a_ptr3, #0x4\n"
+ "fmla v25.4s, v9.4s, v2.s[0]\n"
+ "fmla v28.4s, v8.4s, v3.s[0]\n"
+ "fmla v29.4s, v9.4s, v3.s[0]\n"
+ "fmla v22.4s, v10.4s, v1.s[0]\n"
+ "fmla v26.4s, v10.4s, v2.s[0]\n"
+ "fmla v30.4s, v10.4s, v3.s[0]\n"
+ "fmla v19.4s, v11.4s, v0.s[0]\n"
+ "fmla v23.4s, v11.4s, v1.s[0]\n"
+ "fmla v27.4s, v11.4s, v2.s[0]\n"
+ "fmla v31.4s, v11.4s, v3.s[0]\n"
+ "b.ne 7b\n"
+ "6:\n"
+ "str q16, [%[c_ptr0]]\n"
+ "str q17, [%[c_ptr0], #0x10]\n"
+ "str q18, [%[c_ptr0], #0x20]\n"
+ "str q19, [%[c_ptr0], #0x30]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x40\n"
+ "str q20, [c_ptr1]\n"
+ "str q21, [c_ptr1, #0x10]\n"
+ "str q22, [c_ptr1, #0x20]\n"
+ "str q23, [c_ptr1, #0x30]\n"
+ "str q24, [c_ptr2]\n"
+ "str q25, [c_ptr2, #0x10]\n"
+ "str q26, [c_ptr2, #0x20]\n"
+ "str q27, [c_ptr2, #0x30]\n"
+ "str q28, [c_ptr3]\n"
+ "str q29, [c_ptr3, #0x10]\n"
+ "str q30, [c_ptr3, #0x20]\n"
+ "str q31, [c_ptr3, #0x30]\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
+ );
+ break;
+ }
+ if (use_result_buffer) {
+ for(int cy=0; cy<std::min(M-y, 4); cy++) {
+ for(unsigned int cx=0; cx<width; cx++) {
+ c_ptr_real[cy * ldc + cx] = result_buffer[cy * 16 + cx];
+ }
+ }
+ }
+ }
+ }
+}
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/a55.cpp
index 48bf842ca5..17f6e578f9 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/a55.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/a55.cpp
@@ -37,2235 +37,2432 @@ void a64_hybrid_s8s32_dot_16x4_a55(const int8_t *A, int lda, const int8_t *B, in
const long loops_count = ((K + 16) / 32) - 1;
K -= loops_count * 32;
const long regs_count = (K / 16) - 1;
+ K -= (regs_count + 1) * 16;
+ const long blocks_count = K / 4;
+ const long odds_count = K - (blocks_count * 4);
for (int y=0; y<M; y+=4) {
const int8_t * const a_ptr0_base = A + (y * lda);
const unsigned long ldab = lda * sizeof(int8_t);
int32_t *c_ptr0 = C + (y * ldc);
- const unsigned long ldcb = ldc * sizeof(int32_t);
for (int x0=0; x0<N; x0+=16ul) {
const long width = std::min((unsigned long)N-x0, 16ul);
const int32_t *betaptr = &beta;
long loops = loops_count;
long regs = regs_count;
+ long blocks = blocks_count;
+ long odds = odds_count;
const int8_t *a_ptr0 = a_ptr0_base;
const int8_t *b_ptr0 = B + (K_stride * x0);
+ const bool use_result_buffer = (width < 16);
+ int32_t result_buffer[64];
+ const unsigned long ldcb = (use_result_buffer ? 16 : ldc) * sizeof(int32_t);
+ int32_t *c_ptr_real = c_ptr0;
+ if (use_result_buffer && !beta0) {
+ for(int cy=0; cy<std::min(M-y, 4); cy++) {
+ for(unsigned int cx=0; cx<width; cx++) {
+ result_buffer[cy * 16 + cx] = c_ptr_real[cy * ldc + cx];
+ }
+ }
+ }
+ if (use_result_buffer) {
+ c_ptr0 = result_buffer;
+ }
switch(M-y) {
case 1:
__asm __volatile (
- "temploadreg0 .req X0\n"
- "temploadreg1 .req X1\n"
- "temploadreg2 .req X2\n"
- "temploadreg3 .req X3\n"
- "cbz %[beta0], 1f\n"
- "movi v16.4s, #0\n"
- "ldr q0, [%[a_ptr0]]\n"
- "movi v17.4s, #0\n"
- "ldr q8, [%[b_ptr0]]\n"
- "movi v18.4s, #0\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "movi v19.4s, #0\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1r {v15.4s}, [%[betaptr]]\n"
- "ldr q16, [%[c_ptr0]]\n"
- "ldr q17, [%[c_ptr0], #0x10]\n"
- "ldr q18, [%[c_ptr0], #0x20]\n"
- "ldr q19, [%[c_ptr0], #0x30]\n"
- "mul v16.4s, v16.4s, v15.4s\n"
- "ldr q0, [%[a_ptr0]]\n"
- "mul v17.4s, v17.4s, v15.4s\n"
- "ldr q8, [%[b_ptr0]]\n"
- "mul v18.4s, v18.4s, v15.4s\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "mul v19.4s, v19.4s, v15.4s\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ins v4.d[1], temploadreg0\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "ins v15.d[1], temploadreg3\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- "ins v12.d[1], temploadreg0\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- "ins v13.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- "ldr d0, [%[a_ptr0], #-0x10]\n"
- "ins v12.d[1], temploadreg0\n"
- ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
- "ldr d8, [%[b_ptr0]]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
- "ins v0.d[1], temploadreg0\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "ins v13.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- "ins v12.d[1], temploadreg0\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- "ins v13.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- "ins v9.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- "ins v10.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- "ins v11.d[1], temploadreg3\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- "b.ne 3b\n"
- "2:\n"
- "ins v14.d[1], temploadreg2\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- "ins v15.d[1], temploadreg3\n"
- "cbz %[regs], 4f\n"
- ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- "ins v4.d[1], temploadreg0\n"
- ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- "ins v12.d[1], temploadreg0\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- "ins v13.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- "ldr d0, [%[a_ptr0], #0x10]\n"
- "ins v12.d[1], temploadreg0\n"
- ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x18]\n"
- "ldr d8, [%[b_ptr0]]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
- "ins v0.d[1], temploadreg0\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "ins v13.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
- ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
- ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
- ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
- "b 5f\n"
- "4:\n"
- ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- "ins v4.d[1], temploadreg0\n"
- ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- "5:\n"
- "str q16, [%[c_ptr0]]\n"
- "str q17, [%[c_ptr0], #0x10]\n"
- "str q18, [%[c_ptr0], #0x20]\n"
- "str q19, [%[c_ptr0], #0x30]\n"
- "add %[c_ptr0], %[c_ptr0], #0x40\n"
- ".unreq temploadreg0\n"
- ".unreq temploadreg1\n"
- ".unreq temploadreg2\n"
- ".unreq temploadreg3\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
- : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory"
+ "temploadreg0 .req X0\n"
+ "temploadreg1 .req X1\n"
+ "temploadreg2 .req X2\n"
+ "temploadreg3 .req X3\n"
+ "cbz %[beta0], 1f\n"
+ "movi v16.4s, #0\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "movi v17.4s, #0\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "movi v18.4s, #0\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "movi v19.4s, #0\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1r {v15.4s}, [%[betaptr]]\n"
+ "ldr q16, [%[c_ptr0]]\n"
+ "ldr q17, [%[c_ptr0], #0x10]\n"
+ "ldr q18, [%[c_ptr0], #0x20]\n"
+ "ldr q19, [%[c_ptr0], #0x30]\n"
+ "mul v16.4s, v16.4s, v15.4s\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "mul v17.4s, v17.4s, v15.4s\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "mul v18.4s, v18.4s, v15.4s\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "mul v19.4s, v19.4s, v15.4s\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ins v4.d[1], temploadreg0\n"
+ "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ "ldr d0, [%[a_ptr0], #-0x10]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+ "ins v0.d[1], temploadreg0\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ "ins v9.d[1], temploadreg1\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ "ins v11.d[1], temploadreg3\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "ins v14.d[1], temploadreg2\n"
+ "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "cbz %[regs], 4f\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ "ins v4.d[1], temploadreg0\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+ ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+ ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+ "b 5f\n"
+ "4:\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ "5:\n"
+ "cbz %[blocks], 6f\n"
+ "7:\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr s0, [%[a_ptr0]]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x4\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ "b.ne 7b\n"
+ "6:\n"
+ "cbz %[odds], 8f\n"
+ "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 9f\n"
+ "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 9f\n"
+ "ld1 {v0.b}[2], [%[a_ptr0]]\n"
+ "9:\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ "8:\n"
+ "str q16, [%[c_ptr0]]\n"
+ "str q17, [%[c_ptr0], #0x10]\n"
+ "str q18, [%[c_ptr0], #0x20]\n"
+ "str q19, [%[c_ptr0], #0x30]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x40\n"
+ ".unreq temploadreg0\n"
+ ".unreq temploadreg1\n"
+ ".unreq temploadreg2\n"
+ ".unreq temploadreg3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory"
);
break;
case 2:
__asm __volatile (
- "a_ptr1 .req X0\n"
- "c_ptr1 .req X1\n"
- "temploadreg0 .req X2\n"
- "temploadreg1 .req X3\n"
- "temploadreg2 .req X4\n"
- "temploadreg3 .req X5\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "cbz %[beta0], 1f\n"
- "movi v16.4s, #0\n"
- "ldr q0, [%[a_ptr0]]\n"
- "movi v17.4s, #0\n"
- "ldr q1, [a_ptr1]\n"
- "movi v18.4s, #0\n"
- "ldr q8, [%[b_ptr0]]\n"
- "movi v19.4s, #0\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "movi v20.4s, #0\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "movi v21.4s, #0\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "movi v22.4s, #0\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "movi v23.4s, #0\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1r {v15.4s}, [%[betaptr]]\n"
- "ldr q16, [%[c_ptr0]]\n"
- "ldr q17, [%[c_ptr0], #0x10]\n"
- "ldr q18, [%[c_ptr0], #0x20]\n"
- "ldr q19, [%[c_ptr0], #0x30]\n"
- "mul v16.4s, v16.4s, v15.4s\n"
- "ldr q20, [c_ptr1]\n"
- "mul v17.4s, v17.4s, v15.4s\n"
- "ldr q21, [c_ptr1, #0x10]\n"
- "mul v18.4s, v18.4s, v15.4s\n"
- "ldr q22, [c_ptr1, #0x20]\n"
- "mul v19.4s, v19.4s, v15.4s\n"
- "ldr q23, [c_ptr1, #0x30]\n"
- "mul v20.4s, v20.4s, v15.4s\n"
- "ldr q0, [%[a_ptr0]]\n"
- "mul v21.4s, v21.4s, v15.4s\n"
- "ldr q1, [a_ptr1]\n"
- "mul v22.4s, v22.4s, v15.4s\n"
- "ldr q8, [%[b_ptr0]]\n"
- "mul v23.4s, v23.4s, v15.4s\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- "ldr d5, [a_ptr1]\n"
- ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- "ldr temploadreg1, [a_ptr1, #0x8]\n"
- ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- "ins v4.d[1], temploadreg0\n"
- ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
- "ins v5.d[1], temploadreg1\n"
- ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- "ins v15.d[1], temploadreg3\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
- "ins v8.d[1], temploadreg0\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
- "ins v14.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
- "ldr d0, [%[a_ptr0], #-0x10]\n"
- "ins v12.d[1], temploadreg0\n"
- ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
- ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
- "ldr d1, [a_ptr1, #-0x10]\n"
- "ins v13.d[1], temploadreg1\n"
- ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
- "ldr temploadreg1, [a_ptr1, #-0x8]\n"
- ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
- "ldr d8, [%[b_ptr0]]\n"
- "ins v0.d[1], temploadreg0\n"
- ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- "ins v1.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "ins v14.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
- "ins v14.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- "ins v9.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- "ins v10.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- "ins v11.d[1], temploadreg3\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- "b.ne 3b\n"
- "2:\n"
- "ins v14.d[1], temploadreg2\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [c_ptr1]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- "ins v15.d[1], temploadreg3\n"
- "cbz %[regs], 4f\n"
- ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr d5, [a_ptr1]\n"
- ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr temploadreg1, [a_ptr1, #0x8]\n"
- ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- "ins v4.d[1], temploadreg0\n"
- ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- "ins v5.d[1], temploadreg1\n"
- ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
- "ins v8.d[1], temploadreg0\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
- "ins v14.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
- "ldr d0, [%[a_ptr0], #0x10]\n"
- "ins v12.d[1], temploadreg0\n"
- ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x18]\n"
- ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
- "ldr d1, [a_ptr1, #0x10]\n"
- "ins v13.d[1], temploadreg1\n"
- ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
- "ldr temploadreg1, [a_ptr1, #0x18]\n"
- ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
- "ldr d8, [%[b_ptr0]]\n"
- "ins v0.d[1], temploadreg0\n"
- ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- "ins v1.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "ins v14.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
- ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
- ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
- ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
- ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
- ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
- ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
- ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
- "b 5f\n"
- "4:\n"
- ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr d5, [a_ptr1]\n"
- ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr temploadreg1, [a_ptr1, #0x8]\n"
- ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- "ins v4.d[1], temploadreg0\n"
- ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- "ins v5.d[1], temploadreg1\n"
- ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
- "ins v8.d[1], temploadreg0\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
- ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
- ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
- ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
- "5:\n"
- "str q16, [%[c_ptr0]]\n"
- "str q17, [%[c_ptr0], #0x10]\n"
- "str q18, [%[c_ptr0], #0x20]\n"
- "str q19, [%[c_ptr0], #0x30]\n"
- "add %[c_ptr0], %[c_ptr0], #0x40\n"
- "str q20, [c_ptr1]\n"
- "str q21, [c_ptr1, #0x10]\n"
- "str q22, [c_ptr1, #0x20]\n"
- "str q23, [c_ptr1, #0x30]\n"
- ".unreq a_ptr1\n"
- ".unreq c_ptr1\n"
- ".unreq temploadreg0\n"
- ".unreq temploadreg1\n"
- ".unreq temploadreg2\n"
- ".unreq temploadreg3\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
- : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
+ "a_ptr1 .req X0\n"
+ "c_ptr1 .req X1\n"
+ "temploadreg0 .req X2\n"
+ "temploadreg1 .req X3\n"
+ "temploadreg2 .req X4\n"
+ "temploadreg3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "cbz %[beta0], 1f\n"
+ "movi v16.4s, #0\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "movi v17.4s, #0\n"
+ "ldr q1, [a_ptr1]\n"
+ "movi v18.4s, #0\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "movi v19.4s, #0\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "movi v20.4s, #0\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "movi v21.4s, #0\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "movi v22.4s, #0\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "movi v23.4s, #0\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1r {v15.4s}, [%[betaptr]]\n"
+ "ldr q16, [%[c_ptr0]]\n"
+ "ldr q17, [%[c_ptr0], #0x10]\n"
+ "ldr q18, [%[c_ptr0], #0x20]\n"
+ "ldr q19, [%[c_ptr0], #0x30]\n"
+ "mul v16.4s, v16.4s, v15.4s\n"
+ "ldr q20, [c_ptr1]\n"
+ "mul v17.4s, v17.4s, v15.4s\n"
+ "ldr q21, [c_ptr1, #0x10]\n"
+ "mul v18.4s, v18.4s, v15.4s\n"
+ "ldr q22, [c_ptr1, #0x20]\n"
+ "mul v19.4s, v19.4s, v15.4s\n"
+ "ldr q23, [c_ptr1, #0x30]\n"
+ "mul v20.4s, v20.4s, v15.4s\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "mul v21.4s, v21.4s, v15.4s\n"
+ "ldr q1, [a_ptr1]\n"
+ "mul v22.4s, v22.4s, v15.4s\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "mul v23.4s, v23.4s, v15.4s\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ "ldr d5, [a_ptr1]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ "ldr temploadreg1, [a_ptr1, #0x8]\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ "ins v4.d[1], temploadreg0\n"
+ ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+ "ins v5.d[1], temploadreg1\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
+ "ldr d0, [%[a_ptr0], #-0x10]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
+ ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
+ "ldr d1, [a_ptr1, #-0x10]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+ "ldr temploadreg1, [a_ptr1, #-0x8]\n"
+ ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ "ins v0.d[1], temploadreg0\n"
+ ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ "ins v1.d[1], temploadreg1\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ "ins v9.d[1], temploadreg1\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ "ins v11.d[1], temploadreg3\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "ins v14.d[1], temploadreg2\n"
+ "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "prfm PSTL1KEEP, [c_ptr1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "cbz %[regs], 4f\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr d5, [a_ptr1]\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr temploadreg1, [a_ptr1, #0x8]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ "ins v4.d[1], temploadreg0\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ "ins v5.d[1], temploadreg1\n"
+ ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+ ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
+ ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
+ ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
+ ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
+ "b 5f\n"
+ "4:\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+ "5:\n"
+ "cbz %[blocks], 6f\n"
+ "7:\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr s0, [%[a_ptr0]]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x4\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr s1, [a_ptr1]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "add a_ptr1, a_ptr1, #0x4\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ "b.ne 7b\n"
+ "6:\n"
+ "cbz %[odds], 8f\n"
+ "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
+ "ld1 {v1.b}[0], [a_ptr1], #1\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 9f\n"
+ "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
+ "ld1 {v1.b}[1], [a_ptr1], #1\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 9f\n"
+ "ld1 {v0.b}[2], [%[a_ptr0]]\n"
+ "ld1 {v1.b}[2], [a_ptr1]\n"
+ "9:\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ "8:\n"
+ "str q16, [%[c_ptr0]]\n"
+ "str q17, [%[c_ptr0], #0x10]\n"
+ "str q18, [%[c_ptr0], #0x20]\n"
+ "str q19, [%[c_ptr0], #0x30]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x40\n"
+ "str q20, [c_ptr1]\n"
+ "str q21, [c_ptr1, #0x10]\n"
+ "str q22, [c_ptr1, #0x20]\n"
+ "str q23, [c_ptr1, #0x30]\n"
+ ".unreq a_ptr1\n"
+ ".unreq c_ptr1\n"
+ ".unreq temploadreg0\n"
+ ".unreq temploadreg1\n"
+ ".unreq temploadreg2\n"
+ ".unreq temploadreg3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
);
break;
case 3:
__asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "c_ptr1 .req X2\n"
- "c_ptr2 .req X3\n"
- "temploadreg0 .req X4\n"
- "temploadreg1 .req X5\n"
- "temploadreg2 .req X6\n"
- "temploadreg3 .req X7\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "cbz %[beta0], 1f\n"
- "movi v16.4s, #0\n"
- "ldr q0, [%[a_ptr0]]\n"
- "movi v17.4s, #0\n"
- "ldr q1, [a_ptr1]\n"
- "movi v18.4s, #0\n"
- "ldr q2, [a_ptr2]\n"
- "movi v19.4s, #0\n"
- "ldr q8, [%[b_ptr0]]\n"
- "movi v20.4s, #0\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "movi v21.4s, #0\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "movi v22.4s, #0\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "movi v23.4s, #0\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "movi v24.4s, #0\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "movi v25.4s, #0\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "movi v26.4s, #0\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "movi v27.4s, #0\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ins v14.d[1], temploadreg2\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1r {v15.4s}, [%[betaptr]]\n"
- "ldr q16, [%[c_ptr0]]\n"
- "ldr q17, [%[c_ptr0], #0x10]\n"
- "ldr q18, [%[c_ptr0], #0x20]\n"
- "ldr q19, [%[c_ptr0], #0x30]\n"
- "mul v16.4s, v16.4s, v15.4s\n"
- "ldr q20, [c_ptr1]\n"
- "mul v17.4s, v17.4s, v15.4s\n"
- "ldr q21, [c_ptr1, #0x10]\n"
- "mul v18.4s, v18.4s, v15.4s\n"
- "ldr q22, [c_ptr1, #0x20]\n"
- "mul v19.4s, v19.4s, v15.4s\n"
- "ldr q23, [c_ptr1, #0x30]\n"
- "mul v20.4s, v20.4s, v15.4s\n"
- "ldr q24, [c_ptr2]\n"
- "mul v21.4s, v21.4s, v15.4s\n"
- "ldr q25, [c_ptr2, #0x10]\n"
- "mul v22.4s, v22.4s, v15.4s\n"
- "ldr q26, [c_ptr2, #0x20]\n"
- "mul v23.4s, v23.4s, v15.4s\n"
- "ldr q27, [c_ptr2, #0x30]\n"
- "mul v24.4s, v24.4s, v15.4s\n"
- "ldr q0, [%[a_ptr0]]\n"
- "mul v25.4s, v25.4s, v15.4s\n"
- "ldr q1, [a_ptr1]\n"
- "mul v26.4s, v26.4s, v15.4s\n"
- "ldr q2, [a_ptr2]\n"
- "mul v27.4s, v27.4s, v15.4s\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "ins v14.d[1], temploadreg2\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr d5, [a_ptr1]\n"
- ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
- "ldr temploadreg1, [a_ptr1, #0x8]\n"
- ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr d6, [a_ptr2]\n"
- ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- "ldr temploadreg2, [a_ptr2, #0x8]\n"
- ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- "ins v4.d[1], temploadreg0\n"
- ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- "ins v5.d[1], temploadreg1\n"
- ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- "ins v6.d[1], temploadreg2\n"
- ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- "subs %[loops], %[loops], #0x1\n"
- ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
- "ins v13.d[1], temploadreg1\n"
- ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- "ins v10.d[1], temploadreg2\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
- "ldr d0, [%[a_ptr0], #-0x10]\n"
- ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
- "ins v12.d[1], temploadreg0\n"
- ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
- ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
- "ldr d1, [a_ptr1, #-0x10]\n"
- ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
- "ins v13.d[1], temploadreg1\n"
- ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
- "ldr temploadreg1, [a_ptr1, #-0x8]\n"
- ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
- "ins v0.d[1], temploadreg0\n"
- ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
- "ins v1.d[1], temploadreg1\n"
- ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- "ins v15.d[1], temploadreg3\n"
- "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
- "ldr d2, [a_ptr2, #-0x10]\n"
- "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
- ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
- "ldr temploadreg2, [a_ptr2, #-0x8]\n"
- ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- "ins v2.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
- "ins v13.d[1], temploadreg1\n"
- ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- "ins v9.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- "ins v10.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- "ins v11.d[1], temploadreg3\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- "ins v14.d[1], temploadreg2\n"
- "b.ne 3b\n"
- "2:\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- "prfm PSTL1KEEP, [c_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2]\n"
- "ins v15.d[1], temploadreg3\n"
- "cbz %[regs], 4f\n"
- ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr d5, [a_ptr1]\n"
- ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr temploadreg1, [a_ptr1, #0x8]\n"
- ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr d6, [a_ptr2]\n"
- ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
- "ldr temploadreg2, [a_ptr2, #0x8]\n"
- ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- "ins v4.d[1], temploadreg0\n"
- ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- "ins v5.d[1], temploadreg1\n"
- ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
- "ins v6.d[1], temploadreg2\n"
- ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- "ins v13.d[1], temploadreg1\n"
- ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
- "ldr d0, [%[a_ptr0], #0x10]\n"
- ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
- "ins v12.d[1], temploadreg0\n"
- ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x18]\n"
- ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
- "ldr d1, [a_ptr1, #0x10]\n"
- ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
- "ins v13.d[1], temploadreg1\n"
- ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
- "ldr temploadreg1, [a_ptr1, #0x18]\n"
- ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
- "ldr d2, [a_ptr2, #0x10]\n"
- ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
- "ldr temploadreg2, [a_ptr2, #0x18]\n"
- ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
- "ins v0.d[1], temploadreg0\n"
- ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
- "ins v1.d[1], temploadreg1\n"
- ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "ins v2.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "ins v15.d[1], temploadreg3\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
- "ins v13.d[1], temploadreg1\n"
- ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
- ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
- ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
- ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
- ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
- ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
- ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
- ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
- ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
- ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
- ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
- ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
- ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
- "b 5f\n"
- "4:\n"
- ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr d5, [a_ptr1]\n"
- ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr temploadreg1, [a_ptr1, #0x8]\n"
- ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr d6, [a_ptr2]\n"
- ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
- "ldr temploadreg2, [a_ptr2, #0x8]\n"
- ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- "ins v4.d[1], temploadreg0\n"
- ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- "ins v5.d[1], temploadreg1\n"
- ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
- "ins v6.d[1], temploadreg2\n"
- ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- "ins v13.d[1], temploadreg1\n"
- ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
- ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
- ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
- ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
- ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
- ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
- ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
- ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
- ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
- ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
- "5:\n"
- "str q16, [%[c_ptr0]]\n"
- "str q17, [%[c_ptr0], #0x10]\n"
- "str q18, [%[c_ptr0], #0x20]\n"
- "str q19, [%[c_ptr0], #0x30]\n"
- "add %[c_ptr0], %[c_ptr0], #0x40\n"
- "str q20, [c_ptr1]\n"
- "str q21, [c_ptr1, #0x10]\n"
- "str q22, [c_ptr1, #0x20]\n"
- "str q23, [c_ptr1, #0x30]\n"
- "str q24, [c_ptr2]\n"
- "str q25, [c_ptr2, #0x10]\n"
- "str q26, [c_ptr2, #0x20]\n"
- "str q27, [c_ptr2, #0x30]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq temploadreg0\n"
- ".unreq temploadreg1\n"
- ".unreq temploadreg2\n"
- ".unreq temploadreg3\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
- : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "cc", "memory"
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "c_ptr1 .req X2\n"
+ "c_ptr2 .req X3\n"
+ "temploadreg0 .req X4\n"
+ "temploadreg1 .req X5\n"
+ "temploadreg2 .req X6\n"
+ "temploadreg3 .req X7\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "cbz %[beta0], 1f\n"
+ "movi v16.4s, #0\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "movi v17.4s, #0\n"
+ "ldr q1, [a_ptr1]\n"
+ "movi v18.4s, #0\n"
+ "ldr q2, [a_ptr2]\n"
+ "movi v19.4s, #0\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "movi v20.4s, #0\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "movi v21.4s, #0\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "movi v22.4s, #0\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "movi v23.4s, #0\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "movi v24.4s, #0\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "movi v25.4s, #0\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "movi v26.4s, #0\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "movi v27.4s, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ins v14.d[1], temploadreg2\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1r {v15.4s}, [%[betaptr]]\n"
+ "ldr q16, [%[c_ptr0]]\n"
+ "ldr q17, [%[c_ptr0], #0x10]\n"
+ "ldr q18, [%[c_ptr0], #0x20]\n"
+ "ldr q19, [%[c_ptr0], #0x30]\n"
+ "mul v16.4s, v16.4s, v15.4s\n"
+ "ldr q20, [c_ptr1]\n"
+ "mul v17.4s, v17.4s, v15.4s\n"
+ "ldr q21, [c_ptr1, #0x10]\n"
+ "mul v18.4s, v18.4s, v15.4s\n"
+ "ldr q22, [c_ptr1, #0x20]\n"
+ "mul v19.4s, v19.4s, v15.4s\n"
+ "ldr q23, [c_ptr1, #0x30]\n"
+ "mul v20.4s, v20.4s, v15.4s\n"
+ "ldr q24, [c_ptr2]\n"
+ "mul v21.4s, v21.4s, v15.4s\n"
+ "ldr q25, [c_ptr2, #0x10]\n"
+ "mul v22.4s, v22.4s, v15.4s\n"
+ "ldr q26, [c_ptr2, #0x20]\n"
+ "mul v23.4s, v23.4s, v15.4s\n"
+ "ldr q27, [c_ptr2, #0x30]\n"
+ "mul v24.4s, v24.4s, v15.4s\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "mul v25.4s, v25.4s, v15.4s\n"
+ "ldr q1, [a_ptr1]\n"
+ "mul v26.4s, v26.4s, v15.4s\n"
+ "ldr q2, [a_ptr2]\n"
+ "mul v27.4s, v27.4s, v15.4s\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "ins v14.d[1], temploadreg2\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr d5, [a_ptr1]\n"
+ ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+ "ldr temploadreg1, [a_ptr1, #0x8]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr d6, [a_ptr2]\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ "ldr temploadreg2, [a_ptr2, #0x8]\n"
+ ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ "ins v4.d[1], temploadreg0\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ "ins v5.d[1], temploadreg1\n"
+ ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ "ins v6.d[1], temploadreg2\n"
+ ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
+ "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
+ "ldr d0, [%[a_ptr0], #-0x10]\n"
+ ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
+ ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
+ "ldr d1, [a_ptr1, #-0x10]\n"
+ ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+ "ldr temploadreg1, [a_ptr1, #-0x8]\n"
+ ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+ "ins v0.d[1], temploadreg0\n"
+ ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+ "ins v1.d[1], temploadreg1\n"
+ ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
+ "ins v15.d[1], temploadreg3\n"
+ "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+ "ldr d2, [a_ptr2, #-0x10]\n"
+ "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
+ ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+ "ldr temploadreg2, [a_ptr2, #-0x8]\n"
+ ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "ins v2.d[1], temploadreg2\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ "ins v9.d[1], temploadreg1\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ "ins v11.d[1], temploadreg3\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ "ins v14.d[1], temploadreg2\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ "prfm PSTL1KEEP, [c_ptr1]\n"
+ "prfm PSTL1KEEP, [c_ptr2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "cbz %[regs], 4f\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr d5, [a_ptr1]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr temploadreg1, [a_ptr1, #0x8]\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr d6, [a_ptr2]\n"
+ ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+ "ldr temploadreg2, [a_ptr2, #0x8]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ "ins v4.d[1], temploadreg0\n"
+ ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ "ins v5.d[1], temploadreg1\n"
+ ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+ "ins v6.d[1], temploadreg2\n"
+ ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
+ ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
+ ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
+ ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
+ ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
+ ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
+ ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
+ "b 5f\n"
+ "4:\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+ ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+ ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+ ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+ ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+ ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+ ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+ ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+ ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+ ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
+ "5:\n"
+ "cbz %[blocks], 6f\n"
+ "7:\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr s0, [%[a_ptr0]]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x4\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr s1, [a_ptr1]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "add a_ptr1, a_ptr1, #0x4\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr s2, [a_ptr2]\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ "add a_ptr2, a_ptr2, #0x4\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+ ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+ "b.ne 7b\n"
+ "6:\n"
+ "cbz %[odds], 8f\n"
+ "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
+ "ld1 {v1.b}[0], [a_ptr1], #1\n"
+ "ld1 {v2.b}[0], [a_ptr2], #1\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 9f\n"
+ "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
+ "ld1 {v1.b}[1], [a_ptr1], #1\n"
+ "ld1 {v2.b}[1], [a_ptr2], #1\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 9f\n"
+ "ld1 {v0.b}[2], [%[a_ptr0]]\n"
+ "ld1 {v1.b}[2], [a_ptr1]\n"
+ "ld1 {v2.b}[2], [a_ptr2]\n"
+ "9:\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+ "8:\n"
+ "str q16, [%[c_ptr0]]\n"
+ "str q17, [%[c_ptr0], #0x10]\n"
+ "str q18, [%[c_ptr0], #0x20]\n"
+ "str q19, [%[c_ptr0], #0x30]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x40\n"
+ "str q20, [c_ptr1]\n"
+ "str q21, [c_ptr1, #0x10]\n"
+ "str q22, [c_ptr1, #0x20]\n"
+ "str q23, [c_ptr1, #0x30]\n"
+ "str q24, [c_ptr2]\n"
+ "str q25, [c_ptr2, #0x10]\n"
+ "str q26, [c_ptr2, #0x20]\n"
+ "str q27, [c_ptr2, #0x30]\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq temploadreg0\n"
+ ".unreq temploadreg1\n"
+ ".unreq temploadreg2\n"
+ ".unreq temploadreg3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "cc", "memory"
);
break;
default:
case 4:
__asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "c_ptr1 .req X3\n"
- "c_ptr2 .req X4\n"
- "c_ptr3 .req X5\n"
- "temploadreg0 .req X6\n"
- "temploadreg1 .req X7\n"
- "temploadreg2 .req X8\n"
- "temploadreg3 .req X9\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "cbz %[beta0], 1f\n"
- "movi v16.4s, #0\n"
- "ldr q0, [%[a_ptr0]]\n"
- "movi v17.4s, #0\n"
- "ldr q1, [a_ptr1]\n"
- "movi v18.4s, #0\n"
- "ldr q2, [a_ptr2]\n"
- "movi v19.4s, #0\n"
- "ldr q3, [a_ptr3]\n"
- "movi v20.4s, #0\n"
- "ldr q8, [%[b_ptr0]]\n"
- "movi v21.4s, #0\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "movi v22.4s, #0\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "movi v23.4s, #0\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "movi v24.4s, #0\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "movi v25.4s, #0\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "movi v26.4s, #0\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "movi v27.4s, #0\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "movi v28.4s, #0\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "movi v29.4s, #0\n"
- "ins v14.d[1], temploadreg2\n"
- "movi v30.4s, #0\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "movi v31.4s, #0\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1r {v15.4s}, [%[betaptr]]\n"
- "ldr q16, [%[c_ptr0]]\n"
- "ldr q17, [%[c_ptr0], #0x10]\n"
- "ldr q18, [%[c_ptr0], #0x20]\n"
- "ldr q19, [%[c_ptr0], #0x30]\n"
- "mul v16.4s, v16.4s, v15.4s\n"
- "ldr q20, [c_ptr1]\n"
- "mul v17.4s, v17.4s, v15.4s\n"
- "ldr q21, [c_ptr1, #0x10]\n"
- "mul v18.4s, v18.4s, v15.4s\n"
- "ldr q22, [c_ptr1, #0x20]\n"
- "mul v19.4s, v19.4s, v15.4s\n"
- "ldr q23, [c_ptr1, #0x30]\n"
- "mul v20.4s, v20.4s, v15.4s\n"
- "ldr q24, [c_ptr2]\n"
- "mul v21.4s, v21.4s, v15.4s\n"
- "ldr q25, [c_ptr2, #0x10]\n"
- "mul v22.4s, v22.4s, v15.4s\n"
- "ldr q26, [c_ptr2, #0x20]\n"
- "mul v23.4s, v23.4s, v15.4s\n"
- "ldr q27, [c_ptr2, #0x30]\n"
- "mul v24.4s, v24.4s, v15.4s\n"
- "ldr q28, [c_ptr3]\n"
- "mul v25.4s, v25.4s, v15.4s\n"
- "ldr q29, [c_ptr3, #0x10]\n"
- "mul v26.4s, v26.4s, v15.4s\n"
- "ldr q30, [c_ptr3, #0x20]\n"
- "mul v27.4s, v27.4s, v15.4s\n"
- "ldr q31, [c_ptr3, #0x30]\n"
- "mul v28.4s, v28.4s, v15.4s\n"
- "ldr q0, [%[a_ptr0]]\n"
- "mul v29.4s, v29.4s, v15.4s\n"
- "ldr q1, [a_ptr1]\n"
- "mul v30.4s, v30.4s, v15.4s\n"
- "ldr q2, [a_ptr2]\n"
- "mul v31.4s, v31.4s, v15.4s\n"
- "ldr q3, [a_ptr3]\n"
- "ldr q8, [%[b_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "ins v14.d[1], temploadreg2\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- ".word 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr d5, [a_ptr1]\n"
- ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr temploadreg1, [a_ptr1, #0x8]\n"
- ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
- "ldr d6, [a_ptr2]\n"
- ".word 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
- "ldr temploadreg2, [a_ptr2, #0x8]\n"
- ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr d7, [a_ptr3]\n"
- ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
- "ldr temploadreg3, [a_ptr3, #0x8]\n"
- ".word 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- "ins v4.d[1], temploadreg0\n"
- ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".word 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
- "ins v5.d[1], temploadreg1\n"
- ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
- "ins v6.d[1], temploadreg2\n"
- ".word 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
- "ins v7.d[1], temploadreg3\n"
- ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".word 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".word 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".word 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- "ins v13.d[1], temploadreg1\n"
- ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n"
- "subs %[loops], %[loops], #0x1\n"
- ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- ".word 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- ".word 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- ".word 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
- "ins v12.d[1], temploadreg0\n"
- ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
- "ins v13.d[1], temploadreg1\n"
- ".word 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
- "ldr d0, [%[a_ptr0], #-0x10]\n"
- ".word 0x4f87e11c // sdot v28.4s, v8.16b, v7.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
- ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
- "ins v0.d[1], temploadreg0\n"
- ".word 0x4f87e13d // sdot v29.4s, v9.16b, v7.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
- "ldr d1, [a_ptr1, #-0x10]\n"
- ".word 0x4f87e15e // sdot v30.4s, v10.16b, v7.4b[0]\n"
- "ldr temploadreg1, [a_ptr1, #-0x8]\n"
- ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
- "ins v1.d[1], temploadreg1\n"
- ".word 0x4f87e17f // sdot v31.4s, v11.16b, v7.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x4fa7e19c // sdot v28.4s, v12.16b, v7.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
- "ldr d2, [a_ptr2, #-0x10]\n"
- ".word 0x4fa7e1bd // sdot v29.4s, v13.16b, v7.4b[1]\n"
- "ldr temploadreg2, [a_ptr2, #-0x8]\n"
- ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
- "ins v12.d[1], temploadreg0\n"
- ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
- "ins v2.d[1], temploadreg2\n"
- ".word 0x4fa7e1de // sdot v30.4s, v14.16b, v7.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
- "ins v13.d[1], temploadreg1\n"
- ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x4fa7e1ff // sdot v31.4s, v15.16b, v7.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
- "add a_ptr3, a_ptr3, #0x20\n"
- ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
- "ldr d3, [a_ptr3, #-0x10]\n"
- ".word 0x4f87e91c // sdot v28.4s, v8.16b, v7.4b[2]\n"
- "ldr temploadreg3, [a_ptr3, #-0x8]\n"
- ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
- "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
- ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
- "ins v3.d[1], temploadreg3\n"
- ".word 0x4f87e93d // sdot v29.4s, v9.16b, v7.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
- "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
- ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".word 0x4f87e95e // sdot v30.4s, v10.16b, v7.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- ".word 0x4f87e97f // sdot v31.4s, v11.16b, v7.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- ".word 0x4fa7e99c // sdot v28.4s, v12.16b, v7.4b[3]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x4fa7e9bd // sdot v29.4s, v13.16b, v7.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- ".word 0x4fa7e9de // sdot v30.4s, v14.16b, v7.4b[3]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x4fa7e9ff // sdot v31.4s, v15.16b, v7.4b[3]\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
- "ins v14.d[1], temploadreg2\n"
- "b.ne 3b\n"
- "2:\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- "prfm PSTL1KEEP, [c_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2]\n"
- "prfm PSTL1KEEP, [c_ptr3]\n"
- "ins v15.d[1], temploadreg3\n"
- "cbz %[regs], 4f\n"
- ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr d5, [a_ptr1]\n"
- ".word 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
- "ldr temploadreg1, [a_ptr1, #0x8]\n"
- ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr d6, [a_ptr2]\n"
- ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr temploadreg2, [a_ptr2, #0x8]\n"
- ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
- "ldr d7, [a_ptr3]\n"
- ".word 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
- "ldr temploadreg3, [a_ptr3, #0x8]\n"
- ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- "ins v4.d[1], temploadreg0\n"
- ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".word 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- "ins v5.d[1], temploadreg1\n"
- ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- ".word 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
- "ins v6.d[1], temploadreg2\n"
- ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
- "ins v7.d[1], temploadreg3\n"
- ".word 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".word 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".word 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
- "ins v13.d[1], temploadreg1\n"
- ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".word 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- ".word 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- ".word 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
- "ins v12.d[1], temploadreg0\n"
- ".word 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x18]\n"
- ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
- ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
- "ins v13.d[1], temploadreg1\n"
- ".word 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr d0, [%[a_ptr0], #0x10]\n"
- ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
- "ldr d1, [a_ptr1, #0x10]\n"
- ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
- "ldr temploadreg1, [a_ptr1, #0x18]\n"
- ".word 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr d2, [a_ptr2, #0x10]\n"
- ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
- "ldr temploadreg2, [a_ptr2, #0x18]\n"
- ".word 0x4f87e11c // sdot v28.4s, v8.16b, v7.4b[0]\n"
- "ldr d3, [a_ptr3, #0x10]\n"
- ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
- "ldr temploadreg3, [a_ptr3, #0x18]\n"
- ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".word 0x4f87e13d // sdot v29.4s, v9.16b, v7.4b[0]\n"
- "ins v0.d[1], temploadreg0\n"
- ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
- "ins v1.d[1], temploadreg1\n"
- ".word 0x4f87e15e // sdot v30.4s, v10.16b, v7.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
- "ins v2.d[1], temploadreg2\n"
- ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".word 0x4f87e17f // sdot v31.4s, v11.16b, v7.4b[0]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
- "ins v3.d[1], temploadreg3\n"
- ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x4fa7e19c // sdot v28.4s, v12.16b, v7.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".word 0x4fa7e1bd // sdot v29.4s, v13.16b, v7.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x4fa7e1de // sdot v30.4s, v14.16b, v7.4b[1]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
- "ins v12.d[1], temploadreg0\n"
- ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
- "ins v13.d[1], temploadreg1\n"
- ".word 0x4fa7e1ff // sdot v31.4s, v15.16b, v7.4b[1]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
- ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x4f87e91c // sdot v28.4s, v8.16b, v7.4b[2]\n"
- ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
- ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
- ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
- ".word 0x4f87e93d // sdot v29.4s, v9.16b, v7.4b[2]\n"
- ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
- ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
- ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
- ".word 0x4f87e95e // sdot v30.4s, v10.16b, v7.4b[2]\n"
- ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
- ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
- ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
- ".word 0x4f87e97f // sdot v31.4s, v11.16b, v7.4b[2]\n"
- ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
- ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
- ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
- ".word 0x4fa7e99c // sdot v28.4s, v12.16b, v7.4b[3]\n"
- ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
- ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
- ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
- ".word 0x4fa7e9bd // sdot v29.4s, v13.16b, v7.4b[3]\n"
- ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
- ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
- ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
- ".word 0x4fa7e9de // sdot v30.4s, v14.16b, v7.4b[3]\n"
- ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
- ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
- ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
- ".word 0x4fa7e9ff // sdot v31.4s, v15.16b, v7.4b[3]\n"
- "b 5f\n"
- "4:\n"
- ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr d5, [a_ptr1]\n"
- ".word 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
- "ldr temploadreg1, [a_ptr1, #0x8]\n"
- ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr d6, [a_ptr2]\n"
- ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr temploadreg2, [a_ptr2, #0x8]\n"
- ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
- "ldr d7, [a_ptr3]\n"
- ".word 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
- "ldr temploadreg3, [a_ptr3, #0x8]\n"
- ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- "ins v4.d[1], temploadreg0\n"
- ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".word 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- "ins v5.d[1], temploadreg1\n"
- ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- ".word 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
- "ins v6.d[1], temploadreg2\n"
- ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
- "ins v7.d[1], temploadreg3\n"
- ".word 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".word 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".word 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
- "ins v13.d[1], temploadreg1\n"
- ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
- ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
- ".word 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n"
- ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
- ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
- ".word 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n"
- ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
- ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
- ".word 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n"
- ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
- ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
- ".word 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n"
- ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
- ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
- ".word 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n"
- ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
- ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
- ".word 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n"
- ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
- ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
- ".word 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n"
- "5:\n"
- "str q16, [%[c_ptr0]]\n"
- "str q17, [%[c_ptr0], #0x10]\n"
- "str q18, [%[c_ptr0], #0x20]\n"
- "str q19, [%[c_ptr0], #0x30]\n"
- "add %[c_ptr0], %[c_ptr0], #0x40\n"
- "str q20, [c_ptr1]\n"
- "str q21, [c_ptr1, #0x10]\n"
- "str q22, [c_ptr1, #0x20]\n"
- "str q23, [c_ptr1, #0x30]\n"
- "str q24, [c_ptr2]\n"
- "str q25, [c_ptr2, #0x10]\n"
- "str q26, [c_ptr2, #0x20]\n"
- "str q27, [c_ptr2, #0x30]\n"
- "str q28, [c_ptr3]\n"
- "str q29, [c_ptr3, #0x10]\n"
- "str q30, [c_ptr3, #0x20]\n"
- "str q31, [c_ptr3, #0x30]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- ".unreq temploadreg0\n"
- ".unreq temploadreg1\n"
- ".unreq temploadreg2\n"
- ".unreq temploadreg3\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
- : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "cc", "memory"
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "temploadreg0 .req X6\n"
+ "temploadreg1 .req X7\n"
+ "temploadreg2 .req X8\n"
+ "temploadreg3 .req X9\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "cbz %[beta0], 1f\n"
+ "movi v16.4s, #0\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "movi v17.4s, #0\n"
+ "ldr q1, [a_ptr1]\n"
+ "movi v18.4s, #0\n"
+ "ldr q2, [a_ptr2]\n"
+ "movi v19.4s, #0\n"
+ "ldr q3, [a_ptr3]\n"
+ "movi v20.4s, #0\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "movi v21.4s, #0\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "movi v22.4s, #0\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "movi v23.4s, #0\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "movi v24.4s, #0\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "movi v25.4s, #0\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "movi v26.4s, #0\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "movi v27.4s, #0\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "movi v28.4s, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "movi v29.4s, #0\n"
+ "ins v14.d[1], temploadreg2\n"
+ "movi v30.4s, #0\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "movi v31.4s, #0\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1r {v15.4s}, [%[betaptr]]\n"
+ "ldr q16, [%[c_ptr0]]\n"
+ "ldr q17, [%[c_ptr0], #0x10]\n"
+ "ldr q18, [%[c_ptr0], #0x20]\n"
+ "ldr q19, [%[c_ptr0], #0x30]\n"
+ "mul v16.4s, v16.4s, v15.4s\n"
+ "ldr q20, [c_ptr1]\n"
+ "mul v17.4s, v17.4s, v15.4s\n"
+ "ldr q21, [c_ptr1, #0x10]\n"
+ "mul v18.4s, v18.4s, v15.4s\n"
+ "ldr q22, [c_ptr1, #0x20]\n"
+ "mul v19.4s, v19.4s, v15.4s\n"
+ "ldr q23, [c_ptr1, #0x30]\n"
+ "mul v20.4s, v20.4s, v15.4s\n"
+ "ldr q24, [c_ptr2]\n"
+ "mul v21.4s, v21.4s, v15.4s\n"
+ "ldr q25, [c_ptr2, #0x10]\n"
+ "mul v22.4s, v22.4s, v15.4s\n"
+ "ldr q26, [c_ptr2, #0x20]\n"
+ "mul v23.4s, v23.4s, v15.4s\n"
+ "ldr q27, [c_ptr2, #0x30]\n"
+ "mul v24.4s, v24.4s, v15.4s\n"
+ "ldr q28, [c_ptr3]\n"
+ "mul v25.4s, v25.4s, v15.4s\n"
+ "ldr q29, [c_ptr3, #0x10]\n"
+ "mul v26.4s, v26.4s, v15.4s\n"
+ "ldr q30, [c_ptr3, #0x20]\n"
+ "mul v27.4s, v27.4s, v15.4s\n"
+ "ldr q31, [c_ptr3, #0x30]\n"
+ "mul v28.4s, v28.4s, v15.4s\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "mul v29.4s, v29.4s, v15.4s\n"
+ "ldr q1, [a_ptr1]\n"
+ "mul v30.4s, v30.4s, v15.4s\n"
+ "ldr q2, [a_ptr2]\n"
+ "mul v31.4s, v31.4s, v15.4s\n"
+ "ldr q3, [a_ptr3]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "ins v14.d[1], temploadreg2\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ ".word 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr d5, [a_ptr1]\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr temploadreg1, [a_ptr1, #0x8]\n"
+ ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+ "ldr d6, [a_ptr2]\n"
+ ".word 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
+ "ldr temploadreg2, [a_ptr2, #0x8]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr d7, [a_ptr3]\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+ "ldr temploadreg3, [a_ptr3, #0x8]\n"
+ ".word 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ "ins v4.d[1], temploadreg0\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
+ "ins v5.d[1], temploadreg1\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
+ "ins v6.d[1], temploadreg2\n"
+ ".word 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+ "ins v7.d[1], temploadreg3\n"
+ ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+ ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ ".word 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
+ "ldr d0, [%[a_ptr0], #-0x10]\n"
+ ".word 0x4f87e11c // sdot v28.4s, v8.16b, v7.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
+ ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
+ "ins v0.d[1], temploadreg0\n"
+ ".word 0x4f87e13d // sdot v29.4s, v9.16b, v7.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
+ "ldr d1, [a_ptr1, #-0x10]\n"
+ ".word 0x4f87e15e // sdot v30.4s, v10.16b, v7.4b[0]\n"
+ "ldr temploadreg1, [a_ptr1, #-0x8]\n"
+ ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
+ "ins v1.d[1], temploadreg1\n"
+ ".word 0x4f87e17f // sdot v31.4s, v11.16b, v7.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4fa7e19c // sdot v28.4s, v12.16b, v7.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
+ ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
+ "ldr d2, [a_ptr2, #-0x10]\n"
+ ".word 0x4fa7e1bd // sdot v29.4s, v13.16b, v7.4b[1]\n"
+ "ldr temploadreg2, [a_ptr2, #-0x8]\n"
+ ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
+ "ins v2.d[1], temploadreg2\n"
+ ".word 0x4fa7e1de // sdot v30.4s, v14.16b, v7.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4fa7e1ff // sdot v31.4s, v15.16b, v7.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
+ "add a_ptr3, a_ptr3, #0x20\n"
+ ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
+ "ldr d3, [a_ptr3, #-0x10]\n"
+ ".word 0x4f87e91c // sdot v28.4s, v8.16b, v7.4b[2]\n"
+ "ldr temploadreg3, [a_ptr3, #-0x8]\n"
+ ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
+ "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+ ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
+ "ins v3.d[1], temploadreg3\n"
+ ".word 0x4f87e93d // sdot v29.4s, v9.16b, v7.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+ "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
+ ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x4f87e95e // sdot v30.4s, v10.16b, v7.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x4f87e97f // sdot v31.4s, v11.16b, v7.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x4fa7e99c // sdot v28.4s, v12.16b, v7.4b[3]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4fa7e9bd // sdot v29.4s, v13.16b, v7.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ ".word 0x4fa7e9de // sdot v30.4s, v14.16b, v7.4b[3]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4fa7e9ff // sdot v31.4s, v15.16b, v7.4b[3]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ "prfm PSTL1KEEP, [c_ptr1]\n"
+ "prfm PSTL1KEEP, [c_ptr2]\n"
+ "prfm PSTL1KEEP, [c_ptr3]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "cbz %[regs], 4f\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr d5, [a_ptr1]\n"
+ ".word 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
+ "ldr temploadreg1, [a_ptr1, #0x8]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr d6, [a_ptr2]\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr temploadreg2, [a_ptr2, #0x8]\n"
+ ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+ "ldr d7, [a_ptr3]\n"
+ ".word 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
+ "ldr temploadreg3, [a_ptr3, #0x8]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ "ins v4.d[1], temploadreg0\n"
+ ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ "ins v5.d[1], temploadreg1\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
+ "ins v6.d[1], temploadreg2\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
+ "ins v7.d[1], temploadreg3\n"
+ ".word 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ ".word 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ ".word 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4f87e11c // sdot v28.4s, v8.16b, v7.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4f87e13d // sdot v29.4s, v9.16b, v7.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
+ ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4f87e15e // sdot v30.4s, v10.16b, v7.4b[0]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
+ ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4f87e17f // sdot v31.4s, v11.16b, v7.4b[0]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
+ ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4fa7e19c // sdot v28.4s, v12.16b, v7.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
+ ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x4fa7e1bd // sdot v29.4s, v13.16b, v7.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+ ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
+ ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x4fa7e1de // sdot v30.4s, v14.16b, v7.4b[1]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+ ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
+ ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4fa7e1ff // sdot v31.4s, v15.16b, v7.4b[1]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
+ ".word 0x4f87e91c // sdot v28.4s, v8.16b, v7.4b[2]\n"
+ ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+ ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
+ ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
+ ".word 0x4f87e93d // sdot v29.4s, v9.16b, v7.4b[2]\n"
+ ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+ ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
+ ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
+ ".word 0x4f87e95e // sdot v30.4s, v10.16b, v7.4b[2]\n"
+ ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+ ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
+ ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
+ ".word 0x4f87e97f // sdot v31.4s, v11.16b, v7.4b[2]\n"
+ ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+ ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
+ ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
+ ".word 0x4fa7e99c // sdot v28.4s, v12.16b, v7.4b[3]\n"
+ ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
+ ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
+ ".word 0x4fa7e9bd // sdot v29.4s, v13.16b, v7.4b[3]\n"
+ ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
+ ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
+ ".word 0x4fa7e9de // sdot v30.4s, v14.16b, v7.4b[3]\n"
+ ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
+ ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
+ ".word 0x4fa7e9ff // sdot v31.4s, v15.16b, v7.4b[3]\n"
+ "b 5f\n"
+ "4:\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+ ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+ ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+ ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+ ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
+ ".word 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+ ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
+ ".word 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+ ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
+ ".word 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+ ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
+ ".word 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+ ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
+ ".word 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+ ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
+ ".word 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+ ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
+ ".word 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+ ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
+ ".word 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n"
+ "5:\n"
+ "cbz %[blocks], 6f\n"
+ "7:\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr s0, [%[a_ptr0]]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x4\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr s1, [a_ptr1]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "add a_ptr1, a_ptr1, #0x4\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr s2, [a_ptr2]\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ "add a_ptr2, a_ptr2, #0x4\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr s3, [a_ptr3]\n"
+ ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+ "add a_ptr3, a_ptr3, #0x4\n"
+ ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+ ".word 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
+ ".word 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+ ".word 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+ ".word 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
+ "b.ne 7b\n"
+ "6:\n"
+ "cbz %[odds], 8f\n"
+ "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
+ "ld1 {v1.b}[0], [a_ptr1], #1\n"
+ "ld1 {v2.b}[0], [a_ptr2], #1\n"
+ "ld1 {v3.b}[0], [a_ptr3], #1\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 9f\n"
+ "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
+ "ld1 {v1.b}[1], [a_ptr1], #1\n"
+ "ld1 {v2.b}[1], [a_ptr2], #1\n"
+ "ld1 {v3.b}[1], [a_ptr3], #1\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 9f\n"
+ "ld1 {v0.b}[2], [%[a_ptr0]]\n"
+ "ld1 {v1.b}[2], [a_ptr1]\n"
+ "ld1 {v2.b}[2], [a_ptr2]\n"
+ "ld1 {v3.b}[2], [a_ptr3]\n"
+ "9:\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+ ".word 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+ ".word 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+ ".word 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+ ".word 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
+ "8:\n"
+ "str q16, [%[c_ptr0]]\n"
+ "str q17, [%[c_ptr0], #0x10]\n"
+ "str q18, [%[c_ptr0], #0x20]\n"
+ "str q19, [%[c_ptr0], #0x30]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x40\n"
+ "str q20, [c_ptr1]\n"
+ "str q21, [c_ptr1, #0x10]\n"
+ "str q22, [c_ptr1, #0x20]\n"
+ "str q23, [c_ptr1, #0x30]\n"
+ "str q24, [c_ptr2]\n"
+ "str q25, [c_ptr2, #0x10]\n"
+ "str q26, [c_ptr2, #0x20]\n"
+ "str q27, [c_ptr2, #0x30]\n"
+ "str q28, [c_ptr3]\n"
+ "str q29, [c_ptr3, #0x10]\n"
+ "str q30, [c_ptr3, #0x20]\n"
+ "str q31, [c_ptr3, #0x30]\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ ".unreq temploadreg0\n"
+ ".unreq temploadreg1\n"
+ ".unreq temploadreg2\n"
+ ".unreq temploadreg3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "cc", "memory"
);
break;
}
+ if (use_result_buffer) {
+ for(int cy=0; cy<std::min(M-y, 4); cy++) {
+ for(unsigned int cx=0; cx<width; cx++) {
+ c_ptr_real[cy * ldc + cx] = result_buffer[cy * 16 + cx];
+ }
+ }
+ }
}
}
}
} // namespace arm_gemm
-#endif // __aarch64__
+#endif // __aarch64__
\ No newline at end of file
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/generic.cpp
index 01791391c8..fdd45a03cf 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/generic.cpp
@@ -37,1569 +37,1806 @@ void a64_hybrid_s8s32_dot_16x4(const int8_t *A, int lda, const int8_t *B, int32_
const long loops_count = ((K + 16) / 32) - 1;
K -= loops_count * 32;
const long regs_count = (K / 16) - 1;
+ K -= (regs_count + 1) * 16;
+ const long blocks_count = K / 4;
+ const long odds_count = K - (blocks_count * 4);
for (int y=0; y<M; y+=4) {
const int8_t * const a_ptr0_base = A + (y * lda);
const unsigned long ldab = lda * sizeof(int8_t);
int32_t *c_ptr0 = C + (y * ldc);
- const unsigned long ldcb = ldc * sizeof(int32_t);
for (int x0=0; x0<N; x0+=16ul) {
const long width = std::min((unsigned long)N-x0, 16ul);
const int32_t *betaptr = &beta;
long loops = loops_count;
long regs = regs_count;
+ long blocks = blocks_count;
+ long odds = odds_count;
const int8_t *a_ptr0 = a_ptr0_base;
const int8_t *b_ptr0 = B + (K_stride * x0);
+ const bool use_result_buffer = (width < 16);
+ int32_t result_buffer[64];
+ const unsigned long ldcb = (use_result_buffer ? 16 : ldc) * sizeof(int32_t);
+ int32_t *c_ptr_real = c_ptr0;
+ if (use_result_buffer && !beta0) {
+ for(int cy=0; cy<std::min(M-y, 4); cy++) {
+ for(unsigned int cx=0; cx<width; cx++) {
+ result_buffer[cy * 16 + cx] = c_ptr_real[cy * ldc + cx];
+ }
+ }
+ }
+ if (use_result_buffer) {
+ c_ptr0 = result_buffer;
+ }
switch(M-y) {
case 1:
__asm __volatile (
- "cbz %[beta0], 1f\n"
- "movi v16.4s, #0\n"
- "ldr q0, [%[a_ptr0]]\n"
- "movi v17.4s, #0\n"
- "ldr q8, [%[b_ptr0]]\n"
- "movi v18.4s, #0\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "movi v19.4s, #0\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1r {v15.4s}, [%[betaptr]]\n"
- "ldr q16, [%[c_ptr0]]\n"
- "ldr q17, [%[c_ptr0], #0x10]\n"
- "ldr q18, [%[c_ptr0], #0x20]\n"
- "ldr q19, [%[c_ptr0], #0x30]\n"
- "mul v16.4s, v16.4s, v15.4s\n"
- "ldr q0, [%[a_ptr0]]\n"
- "mul v17.4s, v17.4s, v15.4s\n"
- "ldr q8, [%[b_ptr0]]\n"
- "mul v18.4s, v18.4s, v15.4s\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "mul v19.4s, v19.4s, v15.4s\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- "subs %[loops], %[loops], #0x1\n"
- ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
- "ldr q0, [%[a_ptr0], #-0x10]\n"
- ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- "b.ne 3b\n"
- "2:\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "cbz %[regs], 4f\n"
- ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
- "ldr q0, [%[a_ptr0], #0x10]\n"
- ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
- ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
- ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
- ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
- ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
- "b 5f\n"
- "4:\n"
- ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- "5:\n"
- "str q16, [%[c_ptr0]]\n"
- "str q17, [%[c_ptr0], #0x10]\n"
- "str q18, [%[c_ptr0], #0x20]\n"
- "str q19, [%[c_ptr0], #0x30]\n"
- "add %[c_ptr0], %[c_ptr0], #0x40\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
- : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
+ "cbz %[beta0], 1f\n"
+ "movi v16.4s, #0\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "movi v17.4s, #0\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "movi v18.4s, #0\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "movi v19.4s, #0\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1r {v15.4s}, [%[betaptr]]\n"
+ "ldr q16, [%[c_ptr0]]\n"
+ "ldr q17, [%[c_ptr0], #0x10]\n"
+ "ldr q18, [%[c_ptr0], #0x20]\n"
+ "ldr q19, [%[c_ptr0], #0x30]\n"
+ "mul v16.4s, v16.4s, v15.4s\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "mul v17.4s, v17.4s, v15.4s\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "mul v18.4s, v18.4s, v15.4s\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "mul v19.4s, v19.4s, v15.4s\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+ "ldr q0, [%[a_ptr0], #-0x10]\n"
+ ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+ "cbz %[regs], 4f\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+ ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+ ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+ ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+ "b 5f\n"
+ "4:\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ "5:\n"
+ "cbz %[blocks], 6f\n"
+ "7:\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr s0, [%[a_ptr0]]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x4\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ "b.ne 7b\n"
+ "6:\n"
+ "cbz %[odds], 8f\n"
+ "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 9f\n"
+ "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 9f\n"
+ "ld1 {v0.b}[2], [%[a_ptr0]]\n"
+ "9:\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ "8:\n"
+ "str q16, [%[c_ptr0]]\n"
+ "str q17, [%[c_ptr0], #0x10]\n"
+ "str q18, [%[c_ptr0], #0x20]\n"
+ "str q19, [%[c_ptr0], #0x30]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x40\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
);
break;
case 2:
__asm __volatile (
- "a_ptr1 .req X0\n"
- "c_ptr1 .req X1\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "cbz %[beta0], 1f\n"
- "movi v16.4s, #0\n"
- "ldr q0, [%[a_ptr0]]\n"
- "movi v17.4s, #0\n"
- "ldr q1, [a_ptr1]\n"
- "movi v18.4s, #0\n"
- "ldr q8, [%[b_ptr0]]\n"
- "movi v19.4s, #0\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "movi v20.4s, #0\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "movi v21.4s, #0\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "movi v22.4s, #0\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "movi v23.4s, #0\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1r {v15.4s}, [%[betaptr]]\n"
- "ldr q16, [%[c_ptr0]]\n"
- "ldr q17, [%[c_ptr0], #0x10]\n"
- "ldr q18, [%[c_ptr0], #0x20]\n"
- "ldr q19, [%[c_ptr0], #0x30]\n"
- "mul v16.4s, v16.4s, v15.4s\n"
- "ldr q20, [c_ptr1]\n"
- "mul v17.4s, v17.4s, v15.4s\n"
- "ldr q21, [c_ptr1, #0x10]\n"
- "mul v18.4s, v18.4s, v15.4s\n"
- "ldr q22, [c_ptr1, #0x20]\n"
- "mul v19.4s, v19.4s, v15.4s\n"
- "ldr q23, [c_ptr1, #0x30]\n"
- "mul v20.4s, v20.4s, v15.4s\n"
- "ldr q0, [%[a_ptr0]]\n"
- "mul v21.4s, v21.4s, v15.4s\n"
- "ldr q1, [a_ptr1]\n"
- "mul v22.4s, v22.4s, v15.4s\n"
- "ldr q8, [%[b_ptr0]]\n"
- "mul v23.4s, v23.4s, v15.4s\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr q5, [a_ptr1]\n"
- ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
- ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr q0, [%[a_ptr0], #-0x10]\n"
- ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr q1, [a_ptr1, #-0x10]\n"
- ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
- ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
- ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
- ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
- ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
- ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
- ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
- ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
- ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
- ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
- ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
- ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
- ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
- ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
- ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
- "b.ne 3b\n"
- "2:\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "prfm PSTL1KEEP, [c_ptr1]\n"
- "cbz %[regs], 4f\n"
- ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr q5, [a_ptr1]\n"
- ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr q0, [%[a_ptr0], #0x10]\n"
- ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr q1, [a_ptr1, #0x10]\n"
- ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
- ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
- ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
- ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
- ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
- ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
- ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
- ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
- ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
- ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
- ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
- ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
- ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
- ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
- ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
- ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
- ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
- ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
- ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
- ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
- ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
- ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
- ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
- "b 5f\n"
- "4:\n"
- ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr q5, [a_ptr1]\n"
- ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
- ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
- ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
- ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
- ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
- ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
- ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
- ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
- "5:\n"
- "str q16, [%[c_ptr0]]\n"
- "str q17, [%[c_ptr0], #0x10]\n"
- "str q18, [%[c_ptr0], #0x20]\n"
- "str q19, [%[c_ptr0], #0x30]\n"
- "add %[c_ptr0], %[c_ptr0], #0x40\n"
- "str q20, [c_ptr1]\n"
- "str q21, [c_ptr1, #0x10]\n"
- "str q22, [c_ptr1, #0x20]\n"
- "str q23, [c_ptr1, #0x30]\n"
- ".unreq a_ptr1\n"
- ".unreq c_ptr1\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
- : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "cc", "memory"
+ "a_ptr1 .req X0\n"
+ "c_ptr1 .req X1\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "cbz %[beta0], 1f\n"
+ "movi v16.4s, #0\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "movi v17.4s, #0\n"
+ "ldr q1, [a_ptr1]\n"
+ "movi v18.4s, #0\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "movi v19.4s, #0\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "movi v20.4s, #0\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "movi v21.4s, #0\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "movi v22.4s, #0\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "movi v23.4s, #0\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1r {v15.4s}, [%[betaptr]]\n"
+ "ldr q16, [%[c_ptr0]]\n"
+ "ldr q17, [%[c_ptr0], #0x10]\n"
+ "ldr q18, [%[c_ptr0], #0x20]\n"
+ "ldr q19, [%[c_ptr0], #0x30]\n"
+ "mul v16.4s, v16.4s, v15.4s\n"
+ "ldr q20, [c_ptr1]\n"
+ "mul v17.4s, v17.4s, v15.4s\n"
+ "ldr q21, [c_ptr1, #0x10]\n"
+ "mul v18.4s, v18.4s, v15.4s\n"
+ "ldr q22, [c_ptr1, #0x20]\n"
+ "mul v19.4s, v19.4s, v15.4s\n"
+ "ldr q23, [c_ptr1, #0x30]\n"
+ "mul v20.4s, v20.4s, v15.4s\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "mul v21.4s, v21.4s, v15.4s\n"
+ "ldr q1, [a_ptr1]\n"
+ "mul v22.4s, v22.4s, v15.4s\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "mul v23.4s, v23.4s, v15.4s\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr q5, [a_ptr1]\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+ ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+ ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr q0, [%[a_ptr0], #-0x10]\n"
+ ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr q1, [a_ptr1, #-0x10]\n"
+ ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+ ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+ ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+ ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+ ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+ ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+ ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+ ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+ ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+ ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+ ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+ ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+ "prfm PSTL1KEEP, [c_ptr1]\n"
+ "cbz %[regs], 4f\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr q5, [a_ptr1]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+ ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+ ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+ ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+ ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+ ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+ ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+ ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+ ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
+ ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+ ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
+ ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+ ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
+ ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+ ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
+ ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+ ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
+ ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
+ ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
+ ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
+ "b 5f\n"
+ "4:\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+ "5:\n"
+ "cbz %[blocks], 6f\n"
+ "7:\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr s0, [%[a_ptr0]]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x4\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr s1, [a_ptr1]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "add a_ptr1, a_ptr1, #0x4\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ "b.ne 7b\n"
+ "6:\n"
+ "cbz %[odds], 8f\n"
+ "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
+ "ld1 {v1.b}[0], [a_ptr1], #1\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 9f\n"
+ "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
+ "ld1 {v1.b}[1], [a_ptr1], #1\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 9f\n"
+ "ld1 {v0.b}[2], [%[a_ptr0]]\n"
+ "ld1 {v1.b}[2], [a_ptr1]\n"
+ "9:\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ "8:\n"
+ "str q16, [%[c_ptr0]]\n"
+ "str q17, [%[c_ptr0], #0x10]\n"
+ "str q18, [%[c_ptr0], #0x20]\n"
+ "str q19, [%[c_ptr0], #0x30]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x40\n"
+ "str q20, [c_ptr1]\n"
+ "str q21, [c_ptr1, #0x10]\n"
+ "str q22, [c_ptr1, #0x20]\n"
+ "str q23, [c_ptr1, #0x30]\n"
+ ".unreq a_ptr1\n"
+ ".unreq c_ptr1\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "cc", "memory"
);
break;
case 3:
__asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "c_ptr1 .req X2\n"
- "c_ptr2 .req X3\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "cbz %[beta0], 1f\n"
- "movi v16.4s, #0\n"
- "ldr q0, [%[a_ptr0]]\n"
- "movi v17.4s, #0\n"
- "ldr q1, [a_ptr1]\n"
- "movi v18.4s, #0\n"
- "ldr q2, [a_ptr2]\n"
- "movi v19.4s, #0\n"
- "ldr q8, [%[b_ptr0]]\n"
- "movi v20.4s, #0\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "movi v21.4s, #0\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "movi v22.4s, #0\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "movi v23.4s, #0\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "movi v24.4s, #0\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "movi v25.4s, #0\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "movi v26.4s, #0\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "movi v27.4s, #0\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1r {v15.4s}, [%[betaptr]]\n"
- "ldr q16, [%[c_ptr0]]\n"
- "ldr q17, [%[c_ptr0], #0x10]\n"
- "ldr q18, [%[c_ptr0], #0x20]\n"
- "ldr q19, [%[c_ptr0], #0x30]\n"
- "mul v16.4s, v16.4s, v15.4s\n"
- "ldr q20, [c_ptr1]\n"
- "mul v17.4s, v17.4s, v15.4s\n"
- "ldr q21, [c_ptr1, #0x10]\n"
- "mul v18.4s, v18.4s, v15.4s\n"
- "ldr q22, [c_ptr1, #0x20]\n"
- "mul v19.4s, v19.4s, v15.4s\n"
- "ldr q23, [c_ptr1, #0x30]\n"
- "mul v20.4s, v20.4s, v15.4s\n"
- "ldr q24, [c_ptr2]\n"
- "mul v21.4s, v21.4s, v15.4s\n"
- "ldr q25, [c_ptr2, #0x10]\n"
- "mul v22.4s, v22.4s, v15.4s\n"
- "ldr q26, [c_ptr2, #0x20]\n"
- "mul v23.4s, v23.4s, v15.4s\n"
- "ldr q27, [c_ptr2, #0x30]\n"
- "mul v24.4s, v24.4s, v15.4s\n"
- "ldr q0, [%[a_ptr0]]\n"
- "mul v25.4s, v25.4s, v15.4s\n"
- "ldr q1, [a_ptr1]\n"
- "mul v26.4s, v26.4s, v15.4s\n"
- "ldr q2, [a_ptr2]\n"
- "mul v27.4s, v27.4s, v15.4s\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr q5, [a_ptr1]\n"
- ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr q6, [a_ptr2]\n"
- ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
- ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
- ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
- ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
- ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
- ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
- ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
- ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
- ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
- ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
- ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
- ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
- ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr q0, [%[a_ptr0], #-0x10]\n"
- ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
- "ldr q1, [a_ptr1, #-0x10]\n"
- ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr q2, [a_ptr2, #-0x10]\n"
- ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
- ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
- ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
- ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
- ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
- ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
- ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
- ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
- ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
- ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
- ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
- ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
- ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
- ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
- ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
- ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
- ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
- ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
- ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
- ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
- ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
- ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
- ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
- ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
- ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
- ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
- ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
- ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
- ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
- ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
- ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
- "b.ne 3b\n"
- "2:\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "prfm PSTL1KEEP, [c_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2]\n"
- "cbz %[regs], 4f\n"
- ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr q5, [a_ptr1]\n"
- ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr q6, [a_ptr2]\n"
- ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
- ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
- ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
- ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
- ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
- ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
- ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
- ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
- ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
- ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
- ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
- ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr q0, [%[a_ptr0], #0x10]\n"
- ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
- "ldr q1, [a_ptr1, #0x10]\n"
- ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr q2, [a_ptr2, #0x10]\n"
- ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
- ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
- ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
- ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
- ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
- ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
- ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
- ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
- ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
- ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
- ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
- ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
- ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
- ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
- ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
- ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
- ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
- ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
- ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
- ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
- ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
- ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
- ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
- ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
- ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
- ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
- ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
- ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
- ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
- ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
- ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
- ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
- ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
- ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
- ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
- ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
- ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
- ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
- ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
- "b 5f\n"
- "4:\n"
- ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr q5, [a_ptr1]\n"
- ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr q6, [a_ptr2]\n"
- ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
- ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
- ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
- ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
- ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
- ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
- ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
- ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
- ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
- ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
- ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
- ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
- ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
- ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
- ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
- ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
- ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
- ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
- ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
- ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
- "5:\n"
- "str q16, [%[c_ptr0]]\n"
- "str q17, [%[c_ptr0], #0x10]\n"
- "str q18, [%[c_ptr0], #0x20]\n"
- "str q19, [%[c_ptr0], #0x30]\n"
- "add %[c_ptr0], %[c_ptr0], #0x40\n"
- "str q20, [c_ptr1]\n"
- "str q21, [c_ptr1, #0x10]\n"
- "str q22, [c_ptr1, #0x20]\n"
- "str q23, [c_ptr1, #0x30]\n"
- "str q24, [c_ptr2]\n"
- "str q25, [c_ptr2, #0x10]\n"
- "str q26, [c_ptr2, #0x20]\n"
- "str q27, [c_ptr2, #0x30]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
- : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory"
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "c_ptr1 .req X2\n"
+ "c_ptr2 .req X3\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "cbz %[beta0], 1f\n"
+ "movi v16.4s, #0\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "movi v17.4s, #0\n"
+ "ldr q1, [a_ptr1]\n"
+ "movi v18.4s, #0\n"
+ "ldr q2, [a_ptr2]\n"
+ "movi v19.4s, #0\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "movi v20.4s, #0\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "movi v21.4s, #0\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "movi v22.4s, #0\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "movi v23.4s, #0\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "movi v24.4s, #0\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "movi v25.4s, #0\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "movi v26.4s, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "movi v27.4s, #0\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1r {v15.4s}, [%[betaptr]]\n"
+ "ldr q16, [%[c_ptr0]]\n"
+ "ldr q17, [%[c_ptr0], #0x10]\n"
+ "ldr q18, [%[c_ptr0], #0x20]\n"
+ "ldr q19, [%[c_ptr0], #0x30]\n"
+ "mul v16.4s, v16.4s, v15.4s\n"
+ "ldr q20, [c_ptr1]\n"
+ "mul v17.4s, v17.4s, v15.4s\n"
+ "ldr q21, [c_ptr1, #0x10]\n"
+ "mul v18.4s, v18.4s, v15.4s\n"
+ "ldr q22, [c_ptr1, #0x20]\n"
+ "mul v19.4s, v19.4s, v15.4s\n"
+ "ldr q23, [c_ptr1, #0x30]\n"
+ "mul v20.4s, v20.4s, v15.4s\n"
+ "ldr q24, [c_ptr2]\n"
+ "mul v21.4s, v21.4s, v15.4s\n"
+ "ldr q25, [c_ptr2, #0x10]\n"
+ "mul v22.4s, v22.4s, v15.4s\n"
+ "ldr q26, [c_ptr2, #0x20]\n"
+ "mul v23.4s, v23.4s, v15.4s\n"
+ "ldr q27, [c_ptr2, #0x30]\n"
+ "mul v24.4s, v24.4s, v15.4s\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "mul v25.4s, v25.4s, v15.4s\n"
+ "ldr q1, [a_ptr1]\n"
+ "mul v26.4s, v26.4s, v15.4s\n"
+ "ldr q2, [a_ptr2]\n"
+ "mul v27.4s, v27.4s, v15.4s\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr q5, [a_ptr1]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr q6, [a_ptr2]\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+ ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
+ ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+ "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+ ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
+ ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+ ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+ ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+ ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+ ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+ ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+ ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+ ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+ ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+ ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+ ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr q0, [%[a_ptr0], #-0x10]\n"
+ ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+ "ldr q1, [a_ptr1, #-0x10]\n"
+ ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr q2, [a_ptr2, #-0x10]\n"
+ ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
+ ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+ ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
+ ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+ ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
+ ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+ ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
+ ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+ ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
+ ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+ ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
+ ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+ ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
+ ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+ ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
+ ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
+ ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+ ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
+ ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+ ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
+ ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+ ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
+ ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+ ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
+ ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
+ ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
+ ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
+ ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+ "prfm PSTL1KEEP, [c_ptr1]\n"
+ "prfm PSTL1KEEP, [c_ptr2]\n"
+ "cbz %[regs], 4f\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr q5, [a_ptr1]\n"
+ ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr q6, [a_ptr2]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+ ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+ ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+ ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+ ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+ ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+ ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+ ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+ ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+ ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+ ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+ ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+ ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+ ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
+ ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+ ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
+ ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+ ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
+ ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+ ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
+ ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+ ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
+ ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+ ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
+ ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+ ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
+ ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+ ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
+ ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
+ ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
+ ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+ ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
+ ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
+ ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+ ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
+ ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
+ ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+ ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
+ ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
+ ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+ ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
+ ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
+ ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
+ ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
+ ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
+ ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
+ ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
+ ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
+ "b 5f\n"
+ "4:\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+ ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+ ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+ ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+ ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+ ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+ ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+ ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+ ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+ ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+ ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+ ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+ ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
+ "5:\n"
+ "cbz %[blocks], 6f\n"
+ "7:\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr s0, [%[a_ptr0]]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x4\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr s1, [a_ptr1]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "add a_ptr1, a_ptr1, #0x4\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr s2, [a_ptr2]\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ "add a_ptr2, a_ptr2, #0x4\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+ ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+ "b.ne 7b\n"
+ "6:\n"
+ "cbz %[odds], 8f\n"
+ "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
+ "ld1 {v1.b}[0], [a_ptr1], #1\n"
+ "ld1 {v2.b}[0], [a_ptr2], #1\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 9f\n"
+ "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
+ "ld1 {v1.b}[1], [a_ptr1], #1\n"
+ "ld1 {v2.b}[1], [a_ptr2], #1\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 9f\n"
+ "ld1 {v0.b}[2], [%[a_ptr0]]\n"
+ "ld1 {v1.b}[2], [a_ptr1]\n"
+ "ld1 {v2.b}[2], [a_ptr2]\n"
+ "9:\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+ "8:\n"
+ "str q16, [%[c_ptr0]]\n"
+ "str q17, [%[c_ptr0], #0x10]\n"
+ "str q18, [%[c_ptr0], #0x20]\n"
+ "str q19, [%[c_ptr0], #0x30]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x40\n"
+ "str q20, [c_ptr1]\n"
+ "str q21, [c_ptr1, #0x10]\n"
+ "str q22, [c_ptr1, #0x20]\n"
+ "str q23, [c_ptr1, #0x30]\n"
+ "str q24, [c_ptr2]\n"
+ "str q25, [c_ptr2, #0x10]\n"
+ "str q26, [c_ptr2, #0x20]\n"
+ "str q27, [c_ptr2, #0x30]\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory"
);
break;
default:
case 4:
__asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "c_ptr1 .req X3\n"
- "c_ptr2 .req X4\n"
- "c_ptr3 .req X5\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "cbz %[beta0], 1f\n"
- "movi v16.4s, #0\n"
- "ldr q0, [%[a_ptr0]]\n"
- "movi v17.4s, #0\n"
- "ldr q1, [a_ptr1]\n"
- "movi v18.4s, #0\n"
- "ldr q2, [a_ptr2]\n"
- "movi v19.4s, #0\n"
- "ldr q3, [a_ptr3]\n"
- "movi v20.4s, #0\n"
- "ldr q8, [%[b_ptr0]]\n"
- "movi v21.4s, #0\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "movi v22.4s, #0\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "movi v23.4s, #0\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "movi v24.4s, #0\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "movi v25.4s, #0\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "movi v26.4s, #0\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "movi v27.4s, #0\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "movi v28.4s, #0\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "movi v29.4s, #0\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "movi v30.4s, #0\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "movi v31.4s, #0\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1r {v15.4s}, [%[betaptr]]\n"
- "ldr q16, [%[c_ptr0]]\n"
- "ldr q17, [%[c_ptr0], #0x10]\n"
- "ldr q18, [%[c_ptr0], #0x20]\n"
- "ldr q19, [%[c_ptr0], #0x30]\n"
- "mul v16.4s, v16.4s, v15.4s\n"
- "ldr q20, [c_ptr1]\n"
- "mul v17.4s, v17.4s, v15.4s\n"
- "ldr q21, [c_ptr1, #0x10]\n"
- "mul v18.4s, v18.4s, v15.4s\n"
- "ldr q22, [c_ptr1, #0x20]\n"
- "mul v19.4s, v19.4s, v15.4s\n"
- "ldr q23, [c_ptr1, #0x30]\n"
- "mul v20.4s, v20.4s, v15.4s\n"
- "ldr q24, [c_ptr2]\n"
- "mul v21.4s, v21.4s, v15.4s\n"
- "ldr q25, [c_ptr2, #0x10]\n"
- "mul v22.4s, v22.4s, v15.4s\n"
- "ldr q26, [c_ptr2, #0x20]\n"
- "mul v23.4s, v23.4s, v15.4s\n"
- "ldr q27, [c_ptr2, #0x30]\n"
- "mul v24.4s, v24.4s, v15.4s\n"
- "ldr q28, [c_ptr3]\n"
- "mul v25.4s, v25.4s, v15.4s\n"
- "ldr q29, [c_ptr3, #0x10]\n"
- "mul v26.4s, v26.4s, v15.4s\n"
- "ldr q30, [c_ptr3, #0x20]\n"
- "mul v27.4s, v27.4s, v15.4s\n"
- "ldr q31, [c_ptr3, #0x30]\n"
- "mul v28.4s, v28.4s, v15.4s\n"
- "ldr q0, [%[a_ptr0]]\n"
- "mul v29.4s, v29.4s, v15.4s\n"
- "ldr q1, [a_ptr1]\n"
- "mul v30.4s, v30.4s, v15.4s\n"
- "ldr q2, [a_ptr2]\n"
- "mul v31.4s, v31.4s, v15.4s\n"
- "ldr q3, [a_ptr3]\n"
- "ldr q8, [%[b_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr q5, [a_ptr1]\n"
- ".word 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
- "ldr q6, [a_ptr2]\n"
- ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr q7, [a_ptr3]\n"
- ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- ".word 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- ".word 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- "add a_ptr3, a_ptr3, #0x20\n"
- ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
- ".word 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
- ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
- ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
- ".word 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
- ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
- ".word 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
- ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
- ".word 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
- ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
- ".word 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
- ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
- ".word 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
- ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
- ".word 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
- ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
- ".word 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
- ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
- ".word 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
- ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
- ".word 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
- ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
- ".word 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
- ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
- ".word 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr q0, [%[a_ptr0], #-0x10]\n"
- ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
- "ldr q1, [a_ptr1, #-0x10]\n"
- ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
- "ldr q2, [a_ptr2, #-0x10]\n"
- ".word 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr q3, [a_ptr3, #-0x10]\n"
- ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
- ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
- ".word 0x4f87e11c // sdot v28.4s, v8.16b, v7.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
- ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
- ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
- ".word 0x4f87e13d // sdot v29.4s, v9.16b, v7.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
- ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
- ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
- ".word 0x4f87e15e // sdot v30.4s, v10.16b, v7.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
- ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
- ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
- ".word 0x4f87e17f // sdot v31.4s, v11.16b, v7.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
- ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
- ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
- ".word 0x4fa7e19c // sdot v28.4s, v12.16b, v7.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
- ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
- ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
- ".word 0x4fa7e1bd // sdot v29.4s, v13.16b, v7.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
- ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
- ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
- ".word 0x4fa7e1de // sdot v30.4s, v14.16b, v7.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
- ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
- ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
- ".word 0x4fa7e1ff // sdot v31.4s, v15.16b, v7.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
- ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
- ".word 0x4f87e91c // sdot v28.4s, v8.16b, v7.4b[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
- ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
- ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
- ".word 0x4f87e93d // sdot v29.4s, v9.16b, v7.4b[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
- ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
- ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
- ".word 0x4f87e95e // sdot v30.4s, v10.16b, v7.4b[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
- ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
- ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
- ".word 0x4f87e97f // sdot v31.4s, v11.16b, v7.4b[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
- ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
- ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
- ".word 0x4fa7e99c // sdot v28.4s, v12.16b, v7.4b[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
- ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
- ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
- ".word 0x4fa7e9bd // sdot v29.4s, v13.16b, v7.4b[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
- ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
- ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
- ".word 0x4fa7e9de // sdot v30.4s, v14.16b, v7.4b[3]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
- ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
- ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
- ".word 0x4fa7e9ff // sdot v31.4s, v15.16b, v7.4b[3]\n"
- "b.ne 3b\n"
- "2:\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "prfm PSTL1KEEP, [c_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2]\n"
- "prfm PSTL1KEEP, [c_ptr3]\n"
- "cbz %[regs], 4f\n"
- ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr q5, [a_ptr1]\n"
- ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr q6, [a_ptr2]\n"
- ".word 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
- "ldr q7, [a_ptr3]\n"
- ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
- ".word 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
- ".word 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
- ".word 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
- ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
- ".word 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
- ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
- ".word 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
- ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
- ".word 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
- ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
- ".word 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
- ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
- ".word 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
- ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
- ".word 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
- ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
- ".word 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
- ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
- ".word 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
- ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
- ".word 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
- ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
- ".word 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
- ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
- ".word 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr q0, [%[a_ptr0], #0x10]\n"
- ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
- "ldr q1, [a_ptr1, #0x10]\n"
- ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
- "ldr q2, [a_ptr2, #0x10]\n"
- ".word 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr q3, [a_ptr3, #0x10]\n"
- ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
- ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
- ".word 0x4f87e11c // sdot v28.4s, v8.16b, v7.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
- ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
- ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
- ".word 0x4f87e13d // sdot v29.4s, v9.16b, v7.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
- ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
- ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
- ".word 0x4f87e15e // sdot v30.4s, v10.16b, v7.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
- ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
- ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
- ".word 0x4f87e17f // sdot v31.4s, v11.16b, v7.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
- ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
- ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
- ".word 0x4fa7e19c // sdot v28.4s, v12.16b, v7.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
- ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
- ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
- ".word 0x4fa7e1bd // sdot v29.4s, v13.16b, v7.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
- ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
- ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
- ".word 0x4fa7e1de // sdot v30.4s, v14.16b, v7.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
- ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
- ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
- ".word 0x4fa7e1ff // sdot v31.4s, v15.16b, v7.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
- ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
- ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
- ".word 0x4f87e91c // sdot v28.4s, v8.16b, v7.4b[2]\n"
- ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
- ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
- ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
- ".word 0x4f87e93d // sdot v29.4s, v9.16b, v7.4b[2]\n"
- ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
- ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
- ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
- ".word 0x4f87e95e // sdot v30.4s, v10.16b, v7.4b[2]\n"
- ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
- ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
- ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
- ".word 0x4f87e97f // sdot v31.4s, v11.16b, v7.4b[2]\n"
- ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
- ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
- ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
- ".word 0x4fa7e99c // sdot v28.4s, v12.16b, v7.4b[3]\n"
- ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
- ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
- ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
- ".word 0x4fa7e9bd // sdot v29.4s, v13.16b, v7.4b[3]\n"
- ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
- ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
- ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
- ".word 0x4fa7e9de // sdot v30.4s, v14.16b, v7.4b[3]\n"
- ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
- ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
- ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
- ".word 0x4fa7e9ff // sdot v31.4s, v15.16b, v7.4b[3]\n"
- "b 5f\n"
- "4:\n"
- ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr q5, [a_ptr1]\n"
- ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr q6, [a_ptr2]\n"
- ".word 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
- "ldr q7, [a_ptr3]\n"
- ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
- ".word 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
- ".word 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
- ".word 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
- ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
- ".word 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
- ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
- ".word 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
- ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
- ".word 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
- ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
- ".word 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
- ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
- ".word 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n"
- ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
- ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
- ".word 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n"
- ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
- ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
- ".word 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n"
- ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
- ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
- ".word 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n"
- ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
- ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
- ".word 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n"
- ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
- ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
- ".word 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n"
- ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
- ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
- ".word 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n"
- ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
- ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
- ".word 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n"
- "5:\n"
- "str q16, [%[c_ptr0]]\n"
- "str q17, [%[c_ptr0], #0x10]\n"
- "str q18, [%[c_ptr0], #0x20]\n"
- "str q19, [%[c_ptr0], #0x30]\n"
- "add %[c_ptr0], %[c_ptr0], #0x40\n"
- "str q20, [c_ptr1]\n"
- "str q21, [c_ptr1, #0x10]\n"
- "str q22, [c_ptr1, #0x20]\n"
- "str q23, [c_ptr1, #0x30]\n"
- "str q24, [c_ptr2]\n"
- "str q25, [c_ptr2, #0x10]\n"
- "str q26, [c_ptr2, #0x20]\n"
- "str q27, [c_ptr2, #0x30]\n"
- "str q28, [c_ptr3]\n"
- "str q29, [c_ptr3, #0x10]\n"
- "str q30, [c_ptr3, #0x20]\n"
- "str q31, [c_ptr3, #0x30]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
- : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "cbz %[beta0], 1f\n"
+ "movi v16.4s, #0\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "movi v17.4s, #0\n"
+ "ldr q1, [a_ptr1]\n"
+ "movi v18.4s, #0\n"
+ "ldr q2, [a_ptr2]\n"
+ "movi v19.4s, #0\n"
+ "ldr q3, [a_ptr3]\n"
+ "movi v20.4s, #0\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "movi v21.4s, #0\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "movi v22.4s, #0\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "movi v23.4s, #0\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "movi v24.4s, #0\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "movi v25.4s, #0\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "movi v26.4s, #0\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "movi v27.4s, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "movi v28.4s, #0\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "movi v29.4s, #0\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "movi v30.4s, #0\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "movi v31.4s, #0\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1r {v15.4s}, [%[betaptr]]\n"
+ "ldr q16, [%[c_ptr0]]\n"
+ "ldr q17, [%[c_ptr0], #0x10]\n"
+ "ldr q18, [%[c_ptr0], #0x20]\n"
+ "ldr q19, [%[c_ptr0], #0x30]\n"
+ "mul v16.4s, v16.4s, v15.4s\n"
+ "ldr q20, [c_ptr1]\n"
+ "mul v17.4s, v17.4s, v15.4s\n"
+ "ldr q21, [c_ptr1, #0x10]\n"
+ "mul v18.4s, v18.4s, v15.4s\n"
+ "ldr q22, [c_ptr1, #0x20]\n"
+ "mul v19.4s, v19.4s, v15.4s\n"
+ "ldr q23, [c_ptr1, #0x30]\n"
+ "mul v20.4s, v20.4s, v15.4s\n"
+ "ldr q24, [c_ptr2]\n"
+ "mul v21.4s, v21.4s, v15.4s\n"
+ "ldr q25, [c_ptr2, #0x10]\n"
+ "mul v22.4s, v22.4s, v15.4s\n"
+ "ldr q26, [c_ptr2, #0x20]\n"
+ "mul v23.4s, v23.4s, v15.4s\n"
+ "ldr q27, [c_ptr2, #0x30]\n"
+ "mul v24.4s, v24.4s, v15.4s\n"
+ "ldr q28, [c_ptr3]\n"
+ "mul v25.4s, v25.4s, v15.4s\n"
+ "ldr q29, [c_ptr3, #0x10]\n"
+ "mul v26.4s, v26.4s, v15.4s\n"
+ "ldr q30, [c_ptr3, #0x20]\n"
+ "mul v27.4s, v27.4s, v15.4s\n"
+ "ldr q31, [c_ptr3, #0x30]\n"
+ "mul v28.4s, v28.4s, v15.4s\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "mul v29.4s, v29.4s, v15.4s\n"
+ "ldr q1, [a_ptr1]\n"
+ "mul v30.4s, v30.4s, v15.4s\n"
+ "ldr q2, [a_ptr2]\n"
+ "mul v31.4s, v31.4s, v15.4s\n"
+ "ldr q3, [a_ptr3]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr q5, [a_ptr1]\n"
+ ".word 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
+ "ldr q6, [a_ptr2]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr q7, [a_ptr3]\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ ".word 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ ".word 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ "add a_ptr3, a_ptr3, #0x20\n"
+ ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+ "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+ ".word 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
+ ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+ "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
+ ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
+ ".word 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+ ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
+ ".word 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+ ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
+ ".word 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+ ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
+ ".word 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+ ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
+ ".word 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+ ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
+ ".word 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+ ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
+ ".word 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+ ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
+ ".word 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+ ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
+ ".word 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+ ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
+ ".word 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+ ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
+ ".word 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr q0, [%[a_ptr0], #-0x10]\n"
+ ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+ "ldr q1, [a_ptr1, #-0x10]\n"
+ ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
+ "ldr q2, [a_ptr2, #-0x10]\n"
+ ".word 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr q3, [a_ptr3, #-0x10]\n"
+ ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
+ ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
+ ".word 0x4f87e11c // sdot v28.4s, v8.16b, v7.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+ ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
+ ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
+ ".word 0x4f87e13d // sdot v29.4s, v9.16b, v7.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+ ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
+ ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
+ ".word 0x4f87e15e // sdot v30.4s, v10.16b, v7.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+ ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
+ ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
+ ".word 0x4f87e17f // sdot v31.4s, v11.16b, v7.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+ ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
+ ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
+ ".word 0x4fa7e19c // sdot v28.4s, v12.16b, v7.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+ ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
+ ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
+ ".word 0x4fa7e1bd // sdot v29.4s, v13.16b, v7.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+ ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
+ ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
+ ".word 0x4fa7e1de // sdot v30.4s, v14.16b, v7.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+ ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
+ ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
+ ".word 0x4fa7e1ff // sdot v31.4s, v15.16b, v7.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
+ ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
+ ".word 0x4f87e91c // sdot v28.4s, v8.16b, v7.4b[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+ ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
+ ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
+ ".word 0x4f87e93d // sdot v29.4s, v9.16b, v7.4b[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+ ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
+ ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
+ ".word 0x4f87e95e // sdot v30.4s, v10.16b, v7.4b[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+ ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
+ ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
+ ".word 0x4f87e97f // sdot v31.4s, v11.16b, v7.4b[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+ ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
+ ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
+ ".word 0x4fa7e99c // sdot v28.4s, v12.16b, v7.4b[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
+ ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
+ ".word 0x4fa7e9bd // sdot v29.4s, v13.16b, v7.4b[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
+ ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
+ ".word 0x4fa7e9de // sdot v30.4s, v14.16b, v7.4b[3]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
+ ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
+ ".word 0x4fa7e9ff // sdot v31.4s, v15.16b, v7.4b[3]\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+ "prfm PSTL1KEEP, [c_ptr1]\n"
+ "prfm PSTL1KEEP, [c_ptr2]\n"
+ "prfm PSTL1KEEP, [c_ptr3]\n"
+ "cbz %[regs], 4f\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr q5, [a_ptr1]\n"
+ ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr q6, [a_ptr2]\n"
+ ".word 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
+ "ldr q7, [a_ptr3]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ ".word 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+ ".word 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+ ".word 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+ ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
+ ".word 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+ ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
+ ".word 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+ ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
+ ".word 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+ ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
+ ".word 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+ ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
+ ".word 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+ ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
+ ".word 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+ ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
+ ".word 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+ ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
+ ".word 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+ ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
+ ".word 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+ ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
+ ".word 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+ ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
+ ".word 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+ ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
+ ".word 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+ ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
+ ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
+ ".word 0x4f87e11c // sdot v28.4s, v8.16b, v7.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+ ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
+ ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
+ ".word 0x4f87e13d // sdot v29.4s, v9.16b, v7.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+ ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
+ ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
+ ".word 0x4f87e15e // sdot v30.4s, v10.16b, v7.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+ ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
+ ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
+ ".word 0x4f87e17f // sdot v31.4s, v11.16b, v7.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+ ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
+ ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
+ ".word 0x4fa7e19c // sdot v28.4s, v12.16b, v7.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+ ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
+ ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
+ ".word 0x4fa7e1bd // sdot v29.4s, v13.16b, v7.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+ ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
+ ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
+ ".word 0x4fa7e1de // sdot v30.4s, v14.16b, v7.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+ ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
+ ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
+ ".word 0x4fa7e1ff // sdot v31.4s, v15.16b, v7.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
+ ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
+ ".word 0x4f87e91c // sdot v28.4s, v8.16b, v7.4b[2]\n"
+ ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+ ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
+ ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
+ ".word 0x4f87e93d // sdot v29.4s, v9.16b, v7.4b[2]\n"
+ ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+ ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
+ ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
+ ".word 0x4f87e95e // sdot v30.4s, v10.16b, v7.4b[2]\n"
+ ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+ ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
+ ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
+ ".word 0x4f87e97f // sdot v31.4s, v11.16b, v7.4b[2]\n"
+ ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+ ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
+ ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
+ ".word 0x4fa7e99c // sdot v28.4s, v12.16b, v7.4b[3]\n"
+ ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
+ ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
+ ".word 0x4fa7e9bd // sdot v29.4s, v13.16b, v7.4b[3]\n"
+ ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
+ ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
+ ".word 0x4fa7e9de // sdot v30.4s, v14.16b, v7.4b[3]\n"
+ ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
+ ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
+ ".word 0x4fa7e9ff // sdot v31.4s, v15.16b, v7.4b[3]\n"
+ "b 5f\n"
+ "4:\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+ ".word 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+ ".word 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+ ".word 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+ ".word 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+ ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
+ ".word 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+ ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
+ ".word 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+ ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
+ ".word 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+ ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
+ ".word 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+ ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
+ ".word 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+ ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
+ ".word 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+ ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
+ ".word 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+ ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
+ ".word 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+ ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
+ ".word 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+ ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
+ ".word 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+ ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
+ ".word 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+ ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
+ ".word 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n"
+ "5:\n"
+ "cbz %[blocks], 6f\n"
+ "7:\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr s0, [%[a_ptr0]]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x4\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr s1, [a_ptr1]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "add a_ptr1, a_ptr1, #0x4\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr s2, [a_ptr2]\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ "add a_ptr2, a_ptr2, #0x4\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr s3, [a_ptr3]\n"
+ ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+ "add a_ptr3, a_ptr3, #0x4\n"
+ ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+ ".word 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
+ ".word 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+ ".word 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+ ".word 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
+ "b.ne 7b\n"
+ "6:\n"
+ "cbz %[odds], 8f\n"
+ "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
+ "ld1 {v1.b}[0], [a_ptr1], #1\n"
+ "ld1 {v2.b}[0], [a_ptr2], #1\n"
+ "ld1 {v3.b}[0], [a_ptr3], #1\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 9f\n"
+ "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
+ "ld1 {v1.b}[1], [a_ptr1], #1\n"
+ "ld1 {v2.b}[1], [a_ptr2], #1\n"
+ "ld1 {v3.b}[1], [a_ptr3], #1\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 9f\n"
+ "ld1 {v0.b}[2], [%[a_ptr0]]\n"
+ "ld1 {v1.b}[2], [a_ptr1]\n"
+ "ld1 {v2.b}[2], [a_ptr2]\n"
+ "ld1 {v3.b}[2], [a_ptr3]\n"
+ "9:\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+ ".word 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+ ".word 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+ ".word 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+ ".word 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
+ "8:\n"
+ "str q16, [%[c_ptr0]]\n"
+ "str q17, [%[c_ptr0], #0x10]\n"
+ "str q18, [%[c_ptr0], #0x20]\n"
+ "str q19, [%[c_ptr0], #0x30]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x40\n"
+ "str q20, [c_ptr1]\n"
+ "str q21, [c_ptr1, #0x10]\n"
+ "str q22, [c_ptr1, #0x20]\n"
+ "str q23, [c_ptr1, #0x30]\n"
+ "str q24, [c_ptr2]\n"
+ "str q25, [c_ptr2, #0x10]\n"
+ "str q26, [c_ptr2, #0x20]\n"
+ "str q27, [c_ptr2, #0x30]\n"
+ "str q28, [c_ptr3]\n"
+ "str q29, [c_ptr3, #0x10]\n"
+ "str q30, [c_ptr3, #0x20]\n"
+ "str q31, [c_ptr3, #0x30]\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
);
break;
}
+ if (use_result_buffer) {
+ for(int cy=0; cy<std::min(M-y, 4); cy++) {
+ for(unsigned int cx=0; cx<width; cx++) {
+ c_ptr_real[cy * ldc + cx] = result_buffer[cy * 16 + cx];
+ }
+ }
+ }
}
}
}
} // namespace arm_gemm
-#endif // __aarch64__
+#endif // __aarch64__
\ No newline at end of file
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/a55.cpp
index 230ecdce2d..487cfa08e3 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/a55.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/a55.cpp
@@ -37,2235 +37,2432 @@ void a64_hybrid_u8u32_dot_16x4_a55(const uint8_t *A, int lda, const uint8_t *B,
const long loops_count = ((K + 16) / 32) - 1;
K -= loops_count * 32;
const long regs_count = (K / 16) - 1;
+ K -= (regs_count + 1) * 16;
+ const long blocks_count = K / 4;
+ const long odds_count = K - (blocks_count * 4);
for (int y=0; y<M; y+=4) {
const uint8_t * const a_ptr0_base = A + (y * lda);
const unsigned long ldab = lda * sizeof(uint8_t);
uint32_t *c_ptr0 = C + (y * ldc);
- const unsigned long ldcb = ldc * sizeof(uint32_t);
for (int x0=0; x0<N; x0+=16ul) {
const long width = std::min((unsigned long)N-x0, 16ul);
const uint32_t *betaptr = &beta;
long loops = loops_count;
long regs = regs_count;
+ long blocks = blocks_count;
+ long odds = odds_count;
const uint8_t *a_ptr0 = a_ptr0_base;
const uint8_t *b_ptr0 = B + (K_stride * x0);
+ const bool use_result_buffer = (width < 16);
+ uint32_t result_buffer[64];
+ const unsigned long ldcb = (use_result_buffer ? 16 : ldc) * sizeof(uint32_t);
+ uint32_t *c_ptr_real = c_ptr0;
+ if (use_result_buffer && !beta0) {
+ for(int cy=0; cy<std::min(M-y, 4); cy++) {
+ for(unsigned int cx=0; cx<width; cx++) {
+ result_buffer[cy * 16 + cx] = c_ptr_real[cy * ldc + cx];
+ }
+ }
+ }
+ if (use_result_buffer) {
+ c_ptr0 = result_buffer;
+ }
switch(M-y) {
case 1:
__asm __volatile (
- "temploadreg0 .req X0\n"
- "temploadreg1 .req X1\n"
- "temploadreg2 .req X2\n"
- "temploadreg3 .req X3\n"
- "cbz %[beta0], 1f\n"
- "movi v16.4s, #0\n"
- "ldr q0, [%[a_ptr0]]\n"
- "movi v17.4s, #0\n"
- "ldr q8, [%[b_ptr0]]\n"
- "movi v18.4s, #0\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "movi v19.4s, #0\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1r {v15.4s}, [%[betaptr]]\n"
- "ldr q16, [%[c_ptr0]]\n"
- "ldr q17, [%[c_ptr0], #0x10]\n"
- "ldr q18, [%[c_ptr0], #0x20]\n"
- "ldr q19, [%[c_ptr0], #0x30]\n"
- "mul v16.4s, v16.4s, v15.4s\n"
- "ldr q0, [%[a_ptr0]]\n"
- "mul v17.4s, v17.4s, v15.4s\n"
- "ldr q8, [%[b_ptr0]]\n"
- "mul v18.4s, v18.4s, v15.4s\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "mul v19.4s, v19.4s, v15.4s\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ins v4.d[1], temploadreg0\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "ins v15.d[1], temploadreg3\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- "ins v12.d[1], temploadreg0\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- "ins v13.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- "ldr d0, [%[a_ptr0], #-0x10]\n"
- "ins v12.d[1], temploadreg0\n"
- ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
- "ldr d8, [%[b_ptr0]]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
- "ins v0.d[1], temploadreg0\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "ins v13.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- "ins v12.d[1], temploadreg0\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- "ins v13.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- "ins v9.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- "ins v10.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- "ins v11.d[1], temploadreg3\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- "b.ne 3b\n"
- "2:\n"
- "ins v14.d[1], temploadreg2\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- "ins v15.d[1], temploadreg3\n"
- "cbz %[regs], 4f\n"
- ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- "ins v4.d[1], temploadreg0\n"
- ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- "ins v12.d[1], temploadreg0\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- "ins v13.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- "ldr d0, [%[a_ptr0], #0x10]\n"
- "ins v12.d[1], temploadreg0\n"
- ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x18]\n"
- "ldr d8, [%[b_ptr0]]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
- "ins v0.d[1], temploadreg0\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "ins v13.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
- ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
- ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
- ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
- "b 5f\n"
- "4:\n"
- ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- "ins v4.d[1], temploadreg0\n"
- ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- "5:\n"
- "str q16, [%[c_ptr0]]\n"
- "str q17, [%[c_ptr0], #0x10]\n"
- "str q18, [%[c_ptr0], #0x20]\n"
- "str q19, [%[c_ptr0], #0x30]\n"
- "add %[c_ptr0], %[c_ptr0], #0x40\n"
- ".unreq temploadreg0\n"
- ".unreq temploadreg1\n"
- ".unreq temploadreg2\n"
- ".unreq temploadreg3\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
- : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory"
+ "temploadreg0 .req X0\n"
+ "temploadreg1 .req X1\n"
+ "temploadreg2 .req X2\n"
+ "temploadreg3 .req X3\n"
+ "cbz %[beta0], 1f\n"
+ "movi v16.4s, #0\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "movi v17.4s, #0\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "movi v18.4s, #0\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "movi v19.4s, #0\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1r {v15.4s}, [%[betaptr]]\n"
+ "ldr q16, [%[c_ptr0]]\n"
+ "ldr q17, [%[c_ptr0], #0x10]\n"
+ "ldr q18, [%[c_ptr0], #0x20]\n"
+ "ldr q19, [%[c_ptr0], #0x30]\n"
+ "mul v16.4s, v16.4s, v15.4s\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "mul v17.4s, v17.4s, v15.4s\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "mul v18.4s, v18.4s, v15.4s\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "mul v19.4s, v19.4s, v15.4s\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ins v4.d[1], temploadreg0\n"
+ "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ "ldr d0, [%[a_ptr0], #-0x10]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+ "ins v0.d[1], temploadreg0\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ "ins v9.d[1], temploadreg1\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ "ins v11.d[1], temploadreg3\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "ins v14.d[1], temploadreg2\n"
+ "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "cbz %[regs], 4f\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ "ins v4.d[1], temploadreg0\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+ ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+ ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+ "b 5f\n"
+ "4:\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ "5:\n"
+ "cbz %[blocks], 6f\n"
+ "7:\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr s0, [%[a_ptr0]]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x4\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ "b.ne 7b\n"
+ "6:\n"
+ "cbz %[odds], 8f\n"
+ "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 9f\n"
+ "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 9f\n"
+ "ld1 {v0.b}[2], [%[a_ptr0]]\n"
+ "9:\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ "8:\n"
+ "str q16, [%[c_ptr0]]\n"
+ "str q17, [%[c_ptr0], #0x10]\n"
+ "str q18, [%[c_ptr0], #0x20]\n"
+ "str q19, [%[c_ptr0], #0x30]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x40\n"
+ ".unreq temploadreg0\n"
+ ".unreq temploadreg1\n"
+ ".unreq temploadreg2\n"
+ ".unreq temploadreg3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory"
);
break;
case 2:
__asm __volatile (
- "a_ptr1 .req X0\n"
- "c_ptr1 .req X1\n"
- "temploadreg0 .req X2\n"
- "temploadreg1 .req X3\n"
- "temploadreg2 .req X4\n"
- "temploadreg3 .req X5\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "cbz %[beta0], 1f\n"
- "movi v16.4s, #0\n"
- "ldr q0, [%[a_ptr0]]\n"
- "movi v17.4s, #0\n"
- "ldr q1, [a_ptr1]\n"
- "movi v18.4s, #0\n"
- "ldr q8, [%[b_ptr0]]\n"
- "movi v19.4s, #0\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "movi v20.4s, #0\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "movi v21.4s, #0\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "movi v22.4s, #0\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "movi v23.4s, #0\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1r {v15.4s}, [%[betaptr]]\n"
- "ldr q16, [%[c_ptr0]]\n"
- "ldr q17, [%[c_ptr0], #0x10]\n"
- "ldr q18, [%[c_ptr0], #0x20]\n"
- "ldr q19, [%[c_ptr0], #0x30]\n"
- "mul v16.4s, v16.4s, v15.4s\n"
- "ldr q20, [c_ptr1]\n"
- "mul v17.4s, v17.4s, v15.4s\n"
- "ldr q21, [c_ptr1, #0x10]\n"
- "mul v18.4s, v18.4s, v15.4s\n"
- "ldr q22, [c_ptr1, #0x20]\n"
- "mul v19.4s, v19.4s, v15.4s\n"
- "ldr q23, [c_ptr1, #0x30]\n"
- "mul v20.4s, v20.4s, v15.4s\n"
- "ldr q0, [%[a_ptr0]]\n"
- "mul v21.4s, v21.4s, v15.4s\n"
- "ldr q1, [a_ptr1]\n"
- "mul v22.4s, v22.4s, v15.4s\n"
- "ldr q8, [%[b_ptr0]]\n"
- "mul v23.4s, v23.4s, v15.4s\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- "ldr d5, [a_ptr1]\n"
- ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- "ldr temploadreg1, [a_ptr1, #0x8]\n"
- ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- "ins v4.d[1], temploadreg0\n"
- ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
- "ins v5.d[1], temploadreg1\n"
- ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- "ins v15.d[1], temploadreg3\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
- "ins v8.d[1], temploadreg0\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
- "ins v14.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
- "ldr d0, [%[a_ptr0], #-0x10]\n"
- "ins v12.d[1], temploadreg0\n"
- ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
- ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
- "ldr d1, [a_ptr1, #-0x10]\n"
- "ins v13.d[1], temploadreg1\n"
- ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
- "ldr temploadreg1, [a_ptr1, #-0x8]\n"
- ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
- "ldr d8, [%[b_ptr0]]\n"
- "ins v0.d[1], temploadreg0\n"
- ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- "ins v1.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "ins v14.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
- "ins v14.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- "ins v9.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- "ins v10.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- "ins v11.d[1], temploadreg3\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- "b.ne 3b\n"
- "2:\n"
- "ins v14.d[1], temploadreg2\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [c_ptr1]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- "ins v15.d[1], temploadreg3\n"
- "cbz %[regs], 4f\n"
- ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr d5, [a_ptr1]\n"
- ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr temploadreg1, [a_ptr1, #0x8]\n"
- ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- "ins v4.d[1], temploadreg0\n"
- ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- "ins v5.d[1], temploadreg1\n"
- ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
- "ins v8.d[1], temploadreg0\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
- "ins v14.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
- "ldr d0, [%[a_ptr0], #0x10]\n"
- "ins v12.d[1], temploadreg0\n"
- ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x18]\n"
- ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
- "ldr d1, [a_ptr1, #0x10]\n"
- "ins v13.d[1], temploadreg1\n"
- ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
- "ldr temploadreg1, [a_ptr1, #0x18]\n"
- ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
- "ldr d8, [%[b_ptr0]]\n"
- "ins v0.d[1], temploadreg0\n"
- ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- "ins v1.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "ins v14.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
- ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
- ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
- ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
- ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
- ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
- ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
- ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
- "b 5f\n"
- "4:\n"
- ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr d5, [a_ptr1]\n"
- ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr temploadreg1, [a_ptr1, #0x8]\n"
- ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- "ins v4.d[1], temploadreg0\n"
- ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- "ins v5.d[1], temploadreg1\n"
- ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
- "ins v8.d[1], temploadreg0\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
- ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
- ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
- ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
- "5:\n"
- "str q16, [%[c_ptr0]]\n"
- "str q17, [%[c_ptr0], #0x10]\n"
- "str q18, [%[c_ptr0], #0x20]\n"
- "str q19, [%[c_ptr0], #0x30]\n"
- "add %[c_ptr0], %[c_ptr0], #0x40\n"
- "str q20, [c_ptr1]\n"
- "str q21, [c_ptr1, #0x10]\n"
- "str q22, [c_ptr1, #0x20]\n"
- "str q23, [c_ptr1, #0x30]\n"
- ".unreq a_ptr1\n"
- ".unreq c_ptr1\n"
- ".unreq temploadreg0\n"
- ".unreq temploadreg1\n"
- ".unreq temploadreg2\n"
- ".unreq temploadreg3\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
- : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
+ "a_ptr1 .req X0\n"
+ "c_ptr1 .req X1\n"
+ "temploadreg0 .req X2\n"
+ "temploadreg1 .req X3\n"
+ "temploadreg2 .req X4\n"
+ "temploadreg3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "cbz %[beta0], 1f\n"
+ "movi v16.4s, #0\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "movi v17.4s, #0\n"
+ "ldr q1, [a_ptr1]\n"
+ "movi v18.4s, #0\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "movi v19.4s, #0\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "movi v20.4s, #0\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "movi v21.4s, #0\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "movi v22.4s, #0\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "movi v23.4s, #0\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1r {v15.4s}, [%[betaptr]]\n"
+ "ldr q16, [%[c_ptr0]]\n"
+ "ldr q17, [%[c_ptr0], #0x10]\n"
+ "ldr q18, [%[c_ptr0], #0x20]\n"
+ "ldr q19, [%[c_ptr0], #0x30]\n"
+ "mul v16.4s, v16.4s, v15.4s\n"
+ "ldr q20, [c_ptr1]\n"
+ "mul v17.4s, v17.4s, v15.4s\n"
+ "ldr q21, [c_ptr1, #0x10]\n"
+ "mul v18.4s, v18.4s, v15.4s\n"
+ "ldr q22, [c_ptr1, #0x20]\n"
+ "mul v19.4s, v19.4s, v15.4s\n"
+ "ldr q23, [c_ptr1, #0x30]\n"
+ "mul v20.4s, v20.4s, v15.4s\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "mul v21.4s, v21.4s, v15.4s\n"
+ "ldr q1, [a_ptr1]\n"
+ "mul v22.4s, v22.4s, v15.4s\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "mul v23.4s, v23.4s, v15.4s\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ "ldr d5, [a_ptr1]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ "ldr temploadreg1, [a_ptr1, #0x8]\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ "ins v4.d[1], temploadreg0\n"
+ ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+ "ins v5.d[1], temploadreg1\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
+ "ldr d0, [%[a_ptr0], #-0x10]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
+ ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
+ "ldr d1, [a_ptr1, #-0x10]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+ "ldr temploadreg1, [a_ptr1, #-0x8]\n"
+ ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ "ins v0.d[1], temploadreg0\n"
+ ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ "ins v1.d[1], temploadreg1\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ "ins v9.d[1], temploadreg1\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ "ins v11.d[1], temploadreg3\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "ins v14.d[1], temploadreg2\n"
+ "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "prfm PSTL1KEEP, [c_ptr1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "cbz %[regs], 4f\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr d5, [a_ptr1]\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr temploadreg1, [a_ptr1, #0x8]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ "ins v4.d[1], temploadreg0\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ "ins v5.d[1], temploadreg1\n"
+ ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+ ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
+ ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
+ ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
+ ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
+ "b 5f\n"
+ "4:\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+ "5:\n"
+ "cbz %[blocks], 6f\n"
+ "7:\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr s0, [%[a_ptr0]]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x4\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr s1, [a_ptr1]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "add a_ptr1, a_ptr1, #0x4\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ "b.ne 7b\n"
+ "6:\n"
+ "cbz %[odds], 8f\n"
+ "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
+ "ld1 {v1.b}[0], [a_ptr1], #1\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 9f\n"
+ "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
+ "ld1 {v1.b}[1], [a_ptr1], #1\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 9f\n"
+ "ld1 {v0.b}[2], [%[a_ptr0]]\n"
+ "ld1 {v1.b}[2], [a_ptr1]\n"
+ "9:\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ "8:\n"
+ "str q16, [%[c_ptr0]]\n"
+ "str q17, [%[c_ptr0], #0x10]\n"
+ "str q18, [%[c_ptr0], #0x20]\n"
+ "str q19, [%[c_ptr0], #0x30]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x40\n"
+ "str q20, [c_ptr1]\n"
+ "str q21, [c_ptr1, #0x10]\n"
+ "str q22, [c_ptr1, #0x20]\n"
+ "str q23, [c_ptr1, #0x30]\n"
+ ".unreq a_ptr1\n"
+ ".unreq c_ptr1\n"
+ ".unreq temploadreg0\n"
+ ".unreq temploadreg1\n"
+ ".unreq temploadreg2\n"
+ ".unreq temploadreg3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
);
break;
case 3:
__asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "c_ptr1 .req X2\n"
- "c_ptr2 .req X3\n"
- "temploadreg0 .req X4\n"
- "temploadreg1 .req X5\n"
- "temploadreg2 .req X6\n"
- "temploadreg3 .req X7\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "cbz %[beta0], 1f\n"
- "movi v16.4s, #0\n"
- "ldr q0, [%[a_ptr0]]\n"
- "movi v17.4s, #0\n"
- "ldr q1, [a_ptr1]\n"
- "movi v18.4s, #0\n"
- "ldr q2, [a_ptr2]\n"
- "movi v19.4s, #0\n"
- "ldr q8, [%[b_ptr0]]\n"
- "movi v20.4s, #0\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "movi v21.4s, #0\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "movi v22.4s, #0\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "movi v23.4s, #0\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "movi v24.4s, #0\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "movi v25.4s, #0\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "movi v26.4s, #0\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "movi v27.4s, #0\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ins v14.d[1], temploadreg2\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1r {v15.4s}, [%[betaptr]]\n"
- "ldr q16, [%[c_ptr0]]\n"
- "ldr q17, [%[c_ptr0], #0x10]\n"
- "ldr q18, [%[c_ptr0], #0x20]\n"
- "ldr q19, [%[c_ptr0], #0x30]\n"
- "mul v16.4s, v16.4s, v15.4s\n"
- "ldr q20, [c_ptr1]\n"
- "mul v17.4s, v17.4s, v15.4s\n"
- "ldr q21, [c_ptr1, #0x10]\n"
- "mul v18.4s, v18.4s, v15.4s\n"
- "ldr q22, [c_ptr1, #0x20]\n"
- "mul v19.4s, v19.4s, v15.4s\n"
- "ldr q23, [c_ptr1, #0x30]\n"
- "mul v20.4s, v20.4s, v15.4s\n"
- "ldr q24, [c_ptr2]\n"
- "mul v21.4s, v21.4s, v15.4s\n"
- "ldr q25, [c_ptr2, #0x10]\n"
- "mul v22.4s, v22.4s, v15.4s\n"
- "ldr q26, [c_ptr2, #0x20]\n"
- "mul v23.4s, v23.4s, v15.4s\n"
- "ldr q27, [c_ptr2, #0x30]\n"
- "mul v24.4s, v24.4s, v15.4s\n"
- "ldr q0, [%[a_ptr0]]\n"
- "mul v25.4s, v25.4s, v15.4s\n"
- "ldr q1, [a_ptr1]\n"
- "mul v26.4s, v26.4s, v15.4s\n"
- "ldr q2, [a_ptr2]\n"
- "mul v27.4s, v27.4s, v15.4s\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "ins v14.d[1], temploadreg2\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr d5, [a_ptr1]\n"
- ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
- "ldr temploadreg1, [a_ptr1, #0x8]\n"
- ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr d6, [a_ptr2]\n"
- ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- "ldr temploadreg2, [a_ptr2, #0x8]\n"
- ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- "ins v4.d[1], temploadreg0\n"
- ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- "ins v5.d[1], temploadreg1\n"
- ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- "ins v6.d[1], temploadreg2\n"
- ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- "subs %[loops], %[loops], #0x1\n"
- ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
- "ins v13.d[1], temploadreg1\n"
- ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- "ins v10.d[1], temploadreg2\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
- "ldr d0, [%[a_ptr0], #-0x10]\n"
- ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
- "ins v12.d[1], temploadreg0\n"
- ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
- ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
- "ldr d1, [a_ptr1, #-0x10]\n"
- ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
- "ins v13.d[1], temploadreg1\n"
- ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
- "ldr temploadreg1, [a_ptr1, #-0x8]\n"
- ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
- "ins v0.d[1], temploadreg0\n"
- ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
- "ins v1.d[1], temploadreg1\n"
- ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- "ins v15.d[1], temploadreg3\n"
- "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
- "ldr d2, [a_ptr2, #-0x10]\n"
- "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
- ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
- "ldr temploadreg2, [a_ptr2, #-0x8]\n"
- ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- "ins v2.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
- "ins v13.d[1], temploadreg1\n"
- ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- "ins v9.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- "ins v10.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- "ins v11.d[1], temploadreg3\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- "ins v14.d[1], temploadreg2\n"
- "b.ne 3b\n"
- "2:\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- "prfm PSTL1KEEP, [c_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2]\n"
- "ins v15.d[1], temploadreg3\n"
- "cbz %[regs], 4f\n"
- ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr d5, [a_ptr1]\n"
- ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr temploadreg1, [a_ptr1, #0x8]\n"
- ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr d6, [a_ptr2]\n"
- ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
- "ldr temploadreg2, [a_ptr2, #0x8]\n"
- ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- "ins v4.d[1], temploadreg0\n"
- ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- "ins v5.d[1], temploadreg1\n"
- ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
- "ins v6.d[1], temploadreg2\n"
- ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- "ins v13.d[1], temploadreg1\n"
- ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
- "ldr d0, [%[a_ptr0], #0x10]\n"
- ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
- "ins v12.d[1], temploadreg0\n"
- ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x18]\n"
- ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
- "ldr d1, [a_ptr1, #0x10]\n"
- ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
- "ins v13.d[1], temploadreg1\n"
- ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
- "ldr temploadreg1, [a_ptr1, #0x18]\n"
- ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
- "ldr d2, [a_ptr2, #0x10]\n"
- ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
- "ldr temploadreg2, [a_ptr2, #0x18]\n"
- ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
- "ins v0.d[1], temploadreg0\n"
- ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
- "ins v1.d[1], temploadreg1\n"
- ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "ins v2.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "ins v15.d[1], temploadreg3\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
- "ins v13.d[1], temploadreg1\n"
- ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
- ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
- ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
- ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
- ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
- ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
- ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
- ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
- ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
- ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
- ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
- ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
- ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
- "b 5f\n"
- "4:\n"
- ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr d5, [a_ptr1]\n"
- ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr temploadreg1, [a_ptr1, #0x8]\n"
- ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr d6, [a_ptr2]\n"
- ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
- "ldr temploadreg2, [a_ptr2, #0x8]\n"
- ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- "ins v4.d[1], temploadreg0\n"
- ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- "ins v5.d[1], temploadreg1\n"
- ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
- "ins v6.d[1], temploadreg2\n"
- ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- "ins v13.d[1], temploadreg1\n"
- ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
- ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
- ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
- ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
- ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
- ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
- ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
- ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
- ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
- ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
- "5:\n"
- "str q16, [%[c_ptr0]]\n"
- "str q17, [%[c_ptr0], #0x10]\n"
- "str q18, [%[c_ptr0], #0x20]\n"
- "str q19, [%[c_ptr0], #0x30]\n"
- "add %[c_ptr0], %[c_ptr0], #0x40\n"
- "str q20, [c_ptr1]\n"
- "str q21, [c_ptr1, #0x10]\n"
- "str q22, [c_ptr1, #0x20]\n"
- "str q23, [c_ptr1, #0x30]\n"
- "str q24, [c_ptr2]\n"
- "str q25, [c_ptr2, #0x10]\n"
- "str q26, [c_ptr2, #0x20]\n"
- "str q27, [c_ptr2, #0x30]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq temploadreg0\n"
- ".unreq temploadreg1\n"
- ".unreq temploadreg2\n"
- ".unreq temploadreg3\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
- : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "cc", "memory"
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "c_ptr1 .req X2\n"
+ "c_ptr2 .req X3\n"
+ "temploadreg0 .req X4\n"
+ "temploadreg1 .req X5\n"
+ "temploadreg2 .req X6\n"
+ "temploadreg3 .req X7\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "cbz %[beta0], 1f\n"
+ "movi v16.4s, #0\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "movi v17.4s, #0\n"
+ "ldr q1, [a_ptr1]\n"
+ "movi v18.4s, #0\n"
+ "ldr q2, [a_ptr2]\n"
+ "movi v19.4s, #0\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "movi v20.4s, #0\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "movi v21.4s, #0\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "movi v22.4s, #0\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "movi v23.4s, #0\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "movi v24.4s, #0\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "movi v25.4s, #0\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "movi v26.4s, #0\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "movi v27.4s, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ins v14.d[1], temploadreg2\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1r {v15.4s}, [%[betaptr]]\n"
+ "ldr q16, [%[c_ptr0]]\n"
+ "ldr q17, [%[c_ptr0], #0x10]\n"
+ "ldr q18, [%[c_ptr0], #0x20]\n"
+ "ldr q19, [%[c_ptr0], #0x30]\n"
+ "mul v16.4s, v16.4s, v15.4s\n"
+ "ldr q20, [c_ptr1]\n"
+ "mul v17.4s, v17.4s, v15.4s\n"
+ "ldr q21, [c_ptr1, #0x10]\n"
+ "mul v18.4s, v18.4s, v15.4s\n"
+ "ldr q22, [c_ptr1, #0x20]\n"
+ "mul v19.4s, v19.4s, v15.4s\n"
+ "ldr q23, [c_ptr1, #0x30]\n"
+ "mul v20.4s, v20.4s, v15.4s\n"
+ "ldr q24, [c_ptr2]\n"
+ "mul v21.4s, v21.4s, v15.4s\n"
+ "ldr q25, [c_ptr2, #0x10]\n"
+ "mul v22.4s, v22.4s, v15.4s\n"
+ "ldr q26, [c_ptr2, #0x20]\n"
+ "mul v23.4s, v23.4s, v15.4s\n"
+ "ldr q27, [c_ptr2, #0x30]\n"
+ "mul v24.4s, v24.4s, v15.4s\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "mul v25.4s, v25.4s, v15.4s\n"
+ "ldr q1, [a_ptr1]\n"
+ "mul v26.4s, v26.4s, v15.4s\n"
+ "ldr q2, [a_ptr2]\n"
+ "mul v27.4s, v27.4s, v15.4s\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "ins v14.d[1], temploadreg2\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr d5, [a_ptr1]\n"
+ ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+ "ldr temploadreg1, [a_ptr1, #0x8]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr d6, [a_ptr2]\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ "ldr temploadreg2, [a_ptr2, #0x8]\n"
+ ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ "ins v4.d[1], temploadreg0\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ "ins v5.d[1], temploadreg1\n"
+ ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ "ins v6.d[1], temploadreg2\n"
+ ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
+ "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
+ "ldr d0, [%[a_ptr0], #-0x10]\n"
+ ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
+ ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
+ "ldr d1, [a_ptr1, #-0x10]\n"
+ ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+ "ldr temploadreg1, [a_ptr1, #-0x8]\n"
+ ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+ "ins v0.d[1], temploadreg0\n"
+ ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+ "ins v1.d[1], temploadreg1\n"
+ ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
+ "ins v15.d[1], temploadreg3\n"
+ "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+ "ldr d2, [a_ptr2, #-0x10]\n"
+ "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
+ ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+ "ldr temploadreg2, [a_ptr2, #-0x8]\n"
+ ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "ins v2.d[1], temploadreg2\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ "ins v9.d[1], temploadreg1\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ "ins v11.d[1], temploadreg3\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ "ins v14.d[1], temploadreg2\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ "prfm PSTL1KEEP, [c_ptr1]\n"
+ "prfm PSTL1KEEP, [c_ptr2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "cbz %[regs], 4f\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr d5, [a_ptr1]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr temploadreg1, [a_ptr1, #0x8]\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr d6, [a_ptr2]\n"
+ ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+ "ldr temploadreg2, [a_ptr2, #0x8]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ "ins v4.d[1], temploadreg0\n"
+ ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ "ins v5.d[1], temploadreg1\n"
+ ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+ "ins v6.d[1], temploadreg2\n"
+ ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
+ ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
+ ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
+ ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
+ ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
+ ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
+ ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
+ "b 5f\n"
+ "4:\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+ ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+ ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+ ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+ ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+ ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+ ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+ ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+ ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+ ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
+ "5:\n"
+ "cbz %[blocks], 6f\n"
+ "7:\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr s0, [%[a_ptr0]]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x4\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr s1, [a_ptr1]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "add a_ptr1, a_ptr1, #0x4\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr s2, [a_ptr2]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ "add a_ptr2, a_ptr2, #0x4\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+ ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+ "b.ne 7b\n"
+ "6:\n"
+ "cbz %[odds], 8f\n"
+ "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
+ "ld1 {v1.b}[0], [a_ptr1], #1\n"
+ "ld1 {v2.b}[0], [a_ptr2], #1\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 9f\n"
+ "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
+ "ld1 {v1.b}[1], [a_ptr1], #1\n"
+ "ld1 {v2.b}[1], [a_ptr2], #1\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 9f\n"
+ "ld1 {v0.b}[2], [%[a_ptr0]]\n"
+ "ld1 {v1.b}[2], [a_ptr1]\n"
+ "ld1 {v2.b}[2], [a_ptr2]\n"
+ "9:\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+ "8:\n"
+ "str q16, [%[c_ptr0]]\n"
+ "str q17, [%[c_ptr0], #0x10]\n"
+ "str q18, [%[c_ptr0], #0x20]\n"
+ "str q19, [%[c_ptr0], #0x30]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x40\n"
+ "str q20, [c_ptr1]\n"
+ "str q21, [c_ptr1, #0x10]\n"
+ "str q22, [c_ptr1, #0x20]\n"
+ "str q23, [c_ptr1, #0x30]\n"
+ "str q24, [c_ptr2]\n"
+ "str q25, [c_ptr2, #0x10]\n"
+ "str q26, [c_ptr2, #0x20]\n"
+ "str q27, [c_ptr2, #0x30]\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq temploadreg0\n"
+ ".unreq temploadreg1\n"
+ ".unreq temploadreg2\n"
+ ".unreq temploadreg3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "cc", "memory"
);
break;
default:
case 4:
__asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "c_ptr1 .req X3\n"
- "c_ptr2 .req X4\n"
- "c_ptr3 .req X5\n"
- "temploadreg0 .req X6\n"
- "temploadreg1 .req X7\n"
- "temploadreg2 .req X8\n"
- "temploadreg3 .req X9\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "cbz %[beta0], 1f\n"
- "movi v16.4s, #0\n"
- "ldr q0, [%[a_ptr0]]\n"
- "movi v17.4s, #0\n"
- "ldr q1, [a_ptr1]\n"
- "movi v18.4s, #0\n"
- "ldr q2, [a_ptr2]\n"
- "movi v19.4s, #0\n"
- "ldr q3, [a_ptr3]\n"
- "movi v20.4s, #0\n"
- "ldr q8, [%[b_ptr0]]\n"
- "movi v21.4s, #0\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "movi v22.4s, #0\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "movi v23.4s, #0\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "movi v24.4s, #0\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "movi v25.4s, #0\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "movi v26.4s, #0\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "movi v27.4s, #0\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "movi v28.4s, #0\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "movi v29.4s, #0\n"
- "ins v14.d[1], temploadreg2\n"
- "movi v30.4s, #0\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "movi v31.4s, #0\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1r {v15.4s}, [%[betaptr]]\n"
- "ldr q16, [%[c_ptr0]]\n"
- "ldr q17, [%[c_ptr0], #0x10]\n"
- "ldr q18, [%[c_ptr0], #0x20]\n"
- "ldr q19, [%[c_ptr0], #0x30]\n"
- "mul v16.4s, v16.4s, v15.4s\n"
- "ldr q20, [c_ptr1]\n"
- "mul v17.4s, v17.4s, v15.4s\n"
- "ldr q21, [c_ptr1, #0x10]\n"
- "mul v18.4s, v18.4s, v15.4s\n"
- "ldr q22, [c_ptr1, #0x20]\n"
- "mul v19.4s, v19.4s, v15.4s\n"
- "ldr q23, [c_ptr1, #0x30]\n"
- "mul v20.4s, v20.4s, v15.4s\n"
- "ldr q24, [c_ptr2]\n"
- "mul v21.4s, v21.4s, v15.4s\n"
- "ldr q25, [c_ptr2, #0x10]\n"
- "mul v22.4s, v22.4s, v15.4s\n"
- "ldr q26, [c_ptr2, #0x20]\n"
- "mul v23.4s, v23.4s, v15.4s\n"
- "ldr q27, [c_ptr2, #0x30]\n"
- "mul v24.4s, v24.4s, v15.4s\n"
- "ldr q28, [c_ptr3]\n"
- "mul v25.4s, v25.4s, v15.4s\n"
- "ldr q29, [c_ptr3, #0x10]\n"
- "mul v26.4s, v26.4s, v15.4s\n"
- "ldr q30, [c_ptr3, #0x20]\n"
- "mul v27.4s, v27.4s, v15.4s\n"
- "ldr q31, [c_ptr3, #0x30]\n"
- "mul v28.4s, v28.4s, v15.4s\n"
- "ldr q0, [%[a_ptr0]]\n"
- "mul v29.4s, v29.4s, v15.4s\n"
- "ldr q1, [a_ptr1]\n"
- "mul v30.4s, v30.4s, v15.4s\n"
- "ldr q2, [a_ptr2]\n"
- "mul v31.4s, v31.4s, v15.4s\n"
- "ldr q3, [a_ptr3]\n"
- "ldr q8, [%[b_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "ins v14.d[1], temploadreg2\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- ".word 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr d5, [a_ptr1]\n"
- ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr temploadreg1, [a_ptr1, #0x8]\n"
- ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
- "ldr d6, [a_ptr2]\n"
- ".word 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
- "ldr temploadreg2, [a_ptr2, #0x8]\n"
- ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr d7, [a_ptr3]\n"
- ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
- "ldr temploadreg3, [a_ptr3, #0x8]\n"
- ".word 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- "ins v4.d[1], temploadreg0\n"
- ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".word 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
- "ins v5.d[1], temploadreg1\n"
- ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
- "ins v6.d[1], temploadreg2\n"
- ".word 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
- "ins v7.d[1], temploadreg3\n"
- ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".word 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".word 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".word 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- "ins v13.d[1], temploadreg1\n"
- ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n"
- "subs %[loops], %[loops], #0x1\n"
- ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- ".word 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- ".word 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- ".word 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
- "ins v12.d[1], temploadreg0\n"
- ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
- "ins v13.d[1], temploadreg1\n"
- ".word 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
- "ldr d0, [%[a_ptr0], #-0x10]\n"
- ".word 0x6f87e11c // udot v28.4s, v8.16b, v7.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
- ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
- "ins v0.d[1], temploadreg0\n"
- ".word 0x6f87e13d // udot v29.4s, v9.16b, v7.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
- "ldr d1, [a_ptr1, #-0x10]\n"
- ".word 0x6f87e15e // udot v30.4s, v10.16b, v7.4b[0]\n"
- "ldr temploadreg1, [a_ptr1, #-0x8]\n"
- ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
- "ins v1.d[1], temploadreg1\n"
- ".word 0x6f87e17f // udot v31.4s, v11.16b, v7.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x6fa7e19c // udot v28.4s, v12.16b, v7.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
- "ldr d2, [a_ptr2, #-0x10]\n"
- ".word 0x6fa7e1bd // udot v29.4s, v13.16b, v7.4b[1]\n"
- "ldr temploadreg2, [a_ptr2, #-0x8]\n"
- ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
- "ins v12.d[1], temploadreg0\n"
- ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
- "ins v2.d[1], temploadreg2\n"
- ".word 0x6fa7e1de // udot v30.4s, v14.16b, v7.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
- "ins v13.d[1], temploadreg1\n"
- ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x6fa7e1ff // udot v31.4s, v15.16b, v7.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
- "add a_ptr3, a_ptr3, #0x20\n"
- ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
- "ldr d3, [a_ptr3, #-0x10]\n"
- ".word 0x6f87e91c // udot v28.4s, v8.16b, v7.4b[2]\n"
- "ldr temploadreg3, [a_ptr3, #-0x8]\n"
- ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
- "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
- ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
- "ins v3.d[1], temploadreg3\n"
- ".word 0x6f87e93d // udot v29.4s, v9.16b, v7.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
- "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
- ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".word 0x6f87e95e // udot v30.4s, v10.16b, v7.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- ".word 0x6f87e97f // udot v31.4s, v11.16b, v7.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- ".word 0x6fa7e99c // udot v28.4s, v12.16b, v7.4b[3]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x6fa7e9bd // udot v29.4s, v13.16b, v7.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- ".word 0x6fa7e9de // udot v30.4s, v14.16b, v7.4b[3]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x6fa7e9ff // udot v31.4s, v15.16b, v7.4b[3]\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
- "ins v14.d[1], temploadreg2\n"
- "b.ne 3b\n"
- "2:\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- "prfm PSTL1KEEP, [c_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2]\n"
- "prfm PSTL1KEEP, [c_ptr3]\n"
- "ins v15.d[1], temploadreg3\n"
- "cbz %[regs], 4f\n"
- ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr d5, [a_ptr1]\n"
- ".word 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
- "ldr temploadreg1, [a_ptr1, #0x8]\n"
- ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr d6, [a_ptr2]\n"
- ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr temploadreg2, [a_ptr2, #0x8]\n"
- ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
- "ldr d7, [a_ptr3]\n"
- ".word 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
- "ldr temploadreg3, [a_ptr3, #0x8]\n"
- ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- "ins v4.d[1], temploadreg0\n"
- ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".word 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- "ins v5.d[1], temploadreg1\n"
- ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- ".word 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
- "ins v6.d[1], temploadreg2\n"
- ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
- "ins v7.d[1], temploadreg3\n"
- ".word 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".word 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".word 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
- "ins v13.d[1], temploadreg1\n"
- ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".word 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- ".word 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- ".word 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
- "ins v12.d[1], temploadreg0\n"
- ".word 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x18]\n"
- ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
- ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
- "ins v13.d[1], temploadreg1\n"
- ".word 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr d0, [%[a_ptr0], #0x10]\n"
- ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
- "ldr d1, [a_ptr1, #0x10]\n"
- ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
- "ldr temploadreg1, [a_ptr1, #0x18]\n"
- ".word 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr d2, [a_ptr2, #0x10]\n"
- ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
- "ldr temploadreg2, [a_ptr2, #0x18]\n"
- ".word 0x6f87e11c // udot v28.4s, v8.16b, v7.4b[0]\n"
- "ldr d3, [a_ptr3, #0x10]\n"
- ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
- "ldr temploadreg3, [a_ptr3, #0x18]\n"
- ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".word 0x6f87e13d // udot v29.4s, v9.16b, v7.4b[0]\n"
- "ins v0.d[1], temploadreg0\n"
- ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
- "ins v1.d[1], temploadreg1\n"
- ".word 0x6f87e15e // udot v30.4s, v10.16b, v7.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
- "ins v2.d[1], temploadreg2\n"
- ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".word 0x6f87e17f // udot v31.4s, v11.16b, v7.4b[0]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
- "ins v3.d[1], temploadreg3\n"
- ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x6fa7e19c // udot v28.4s, v12.16b, v7.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".word 0x6fa7e1bd // udot v29.4s, v13.16b, v7.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x6fa7e1de // udot v30.4s, v14.16b, v7.4b[1]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
- "ins v12.d[1], temploadreg0\n"
- ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
- "ins v13.d[1], temploadreg1\n"
- ".word 0x6fa7e1ff // udot v31.4s, v15.16b, v7.4b[1]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
- ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x6f87e91c // udot v28.4s, v8.16b, v7.4b[2]\n"
- ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
- ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
- ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
- ".word 0x6f87e93d // udot v29.4s, v9.16b, v7.4b[2]\n"
- ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
- ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
- ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
- ".word 0x6f87e95e // udot v30.4s, v10.16b, v7.4b[2]\n"
- ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
- ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
- ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
- ".word 0x6f87e97f // udot v31.4s, v11.16b, v7.4b[2]\n"
- ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
- ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
- ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
- ".word 0x6fa7e99c // udot v28.4s, v12.16b, v7.4b[3]\n"
- ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
- ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
- ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
- ".word 0x6fa7e9bd // udot v29.4s, v13.16b, v7.4b[3]\n"
- ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
- ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
- ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
- ".word 0x6fa7e9de // udot v30.4s, v14.16b, v7.4b[3]\n"
- ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
- ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
- ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
- ".word 0x6fa7e9ff // udot v31.4s, v15.16b, v7.4b[3]\n"
- "b 5f\n"
- "4:\n"
- ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr d5, [a_ptr1]\n"
- ".word 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
- "ldr temploadreg1, [a_ptr1, #0x8]\n"
- ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr d6, [a_ptr2]\n"
- ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr temploadreg2, [a_ptr2, #0x8]\n"
- ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
- "ldr d7, [a_ptr3]\n"
- ".word 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
- "ldr temploadreg3, [a_ptr3, #0x8]\n"
- ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- "ins v4.d[1], temploadreg0\n"
- ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".word 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- "ins v5.d[1], temploadreg1\n"
- ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- ".word 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
- "ins v6.d[1], temploadreg2\n"
- ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
- "ins v7.d[1], temploadreg3\n"
- ".word 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
- "ins v8.d[1], temploadreg0\n"
- ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".word 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- "ins v9.d[1], temploadreg1\n"
- ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
- "ins v10.d[1], temploadreg2\n"
- ".word 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
- "ins v11.d[1], temploadreg3\n"
- ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".word 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
- "ins v13.d[1], temploadreg1\n"
- ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- ".word 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n"
- "ins v15.d[1], temploadreg3\n"
- ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
- ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
- ".word 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n"
- ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
- ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
- ".word 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n"
- ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
- ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
- ".word 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n"
- ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
- ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
- ".word 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n"
- ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
- ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
- ".word 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n"
- ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
- ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
- ".word 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n"
- ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
- ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
- ".word 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n"
- "5:\n"
- "str q16, [%[c_ptr0]]\n"
- "str q17, [%[c_ptr0], #0x10]\n"
- "str q18, [%[c_ptr0], #0x20]\n"
- "str q19, [%[c_ptr0], #0x30]\n"
- "add %[c_ptr0], %[c_ptr0], #0x40\n"
- "str q20, [c_ptr1]\n"
- "str q21, [c_ptr1, #0x10]\n"
- "str q22, [c_ptr1, #0x20]\n"
- "str q23, [c_ptr1, #0x30]\n"
- "str q24, [c_ptr2]\n"
- "str q25, [c_ptr2, #0x10]\n"
- "str q26, [c_ptr2, #0x20]\n"
- "str q27, [c_ptr2, #0x30]\n"
- "str q28, [c_ptr3]\n"
- "str q29, [c_ptr3, #0x10]\n"
- "str q30, [c_ptr3, #0x20]\n"
- "str q31, [c_ptr3, #0x30]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- ".unreq temploadreg0\n"
- ".unreq temploadreg1\n"
- ".unreq temploadreg2\n"
- ".unreq temploadreg3\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
- : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "cc", "memory"
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "temploadreg0 .req X6\n"
+ "temploadreg1 .req X7\n"
+ "temploadreg2 .req X8\n"
+ "temploadreg3 .req X9\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "cbz %[beta0], 1f\n"
+ "movi v16.4s, #0\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "movi v17.4s, #0\n"
+ "ldr q1, [a_ptr1]\n"
+ "movi v18.4s, #0\n"
+ "ldr q2, [a_ptr2]\n"
+ "movi v19.4s, #0\n"
+ "ldr q3, [a_ptr3]\n"
+ "movi v20.4s, #0\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "movi v21.4s, #0\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "movi v22.4s, #0\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "movi v23.4s, #0\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "movi v24.4s, #0\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "movi v25.4s, #0\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "movi v26.4s, #0\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "movi v27.4s, #0\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "movi v28.4s, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "movi v29.4s, #0\n"
+ "ins v14.d[1], temploadreg2\n"
+ "movi v30.4s, #0\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "movi v31.4s, #0\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1r {v15.4s}, [%[betaptr]]\n"
+ "ldr q16, [%[c_ptr0]]\n"
+ "ldr q17, [%[c_ptr0], #0x10]\n"
+ "ldr q18, [%[c_ptr0], #0x20]\n"
+ "ldr q19, [%[c_ptr0], #0x30]\n"
+ "mul v16.4s, v16.4s, v15.4s\n"
+ "ldr q20, [c_ptr1]\n"
+ "mul v17.4s, v17.4s, v15.4s\n"
+ "ldr q21, [c_ptr1, #0x10]\n"
+ "mul v18.4s, v18.4s, v15.4s\n"
+ "ldr q22, [c_ptr1, #0x20]\n"
+ "mul v19.4s, v19.4s, v15.4s\n"
+ "ldr q23, [c_ptr1, #0x30]\n"
+ "mul v20.4s, v20.4s, v15.4s\n"
+ "ldr q24, [c_ptr2]\n"
+ "mul v21.4s, v21.4s, v15.4s\n"
+ "ldr q25, [c_ptr2, #0x10]\n"
+ "mul v22.4s, v22.4s, v15.4s\n"
+ "ldr q26, [c_ptr2, #0x20]\n"
+ "mul v23.4s, v23.4s, v15.4s\n"
+ "ldr q27, [c_ptr2, #0x30]\n"
+ "mul v24.4s, v24.4s, v15.4s\n"
+ "ldr q28, [c_ptr3]\n"
+ "mul v25.4s, v25.4s, v15.4s\n"
+ "ldr q29, [c_ptr3, #0x10]\n"
+ "mul v26.4s, v26.4s, v15.4s\n"
+ "ldr q30, [c_ptr3, #0x20]\n"
+ "mul v27.4s, v27.4s, v15.4s\n"
+ "ldr q31, [c_ptr3, #0x30]\n"
+ "mul v28.4s, v28.4s, v15.4s\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "mul v29.4s, v29.4s, v15.4s\n"
+ "ldr q1, [a_ptr1]\n"
+ "mul v30.4s, v30.4s, v15.4s\n"
+ "ldr q2, [a_ptr2]\n"
+ "mul v31.4s, v31.4s, v15.4s\n"
+ "ldr q3, [a_ptr3]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "ins v14.d[1], temploadreg2\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ ".word 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr d5, [a_ptr1]\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr temploadreg1, [a_ptr1, #0x8]\n"
+ ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+ "ldr d6, [a_ptr2]\n"
+ ".word 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
+ "ldr temploadreg2, [a_ptr2, #0x8]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr d7, [a_ptr3]\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+ "ldr temploadreg3, [a_ptr3, #0x8]\n"
+ ".word 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ "ins v4.d[1], temploadreg0\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
+ "ins v5.d[1], temploadreg1\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
+ "ins v6.d[1], temploadreg2\n"
+ ".word 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+ "ins v7.d[1], temploadreg3\n"
+ ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+ ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ ".word 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
+ "ldr d0, [%[a_ptr0], #-0x10]\n"
+ ".word 0x6f87e11c // udot v28.4s, v8.16b, v7.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
+ ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
+ "ins v0.d[1], temploadreg0\n"
+ ".word 0x6f87e13d // udot v29.4s, v9.16b, v7.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
+ "ldr d1, [a_ptr1, #-0x10]\n"
+ ".word 0x6f87e15e // udot v30.4s, v10.16b, v7.4b[0]\n"
+ "ldr temploadreg1, [a_ptr1, #-0x8]\n"
+ ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
+ "ins v1.d[1], temploadreg1\n"
+ ".word 0x6f87e17f // udot v31.4s, v11.16b, v7.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6fa7e19c // udot v28.4s, v12.16b, v7.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
+ ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
+ "ldr d2, [a_ptr2, #-0x10]\n"
+ ".word 0x6fa7e1bd // udot v29.4s, v13.16b, v7.4b[1]\n"
+ "ldr temploadreg2, [a_ptr2, #-0x8]\n"
+ ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
+ "ins v2.d[1], temploadreg2\n"
+ ".word 0x6fa7e1de // udot v30.4s, v14.16b, v7.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6fa7e1ff // udot v31.4s, v15.16b, v7.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
+ "add a_ptr3, a_ptr3, #0x20\n"
+ ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
+ "ldr d3, [a_ptr3, #-0x10]\n"
+ ".word 0x6f87e91c // udot v28.4s, v8.16b, v7.4b[2]\n"
+ "ldr temploadreg3, [a_ptr3, #-0x8]\n"
+ ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
+ "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+ ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
+ "ins v3.d[1], temploadreg3\n"
+ ".word 0x6f87e93d // udot v29.4s, v9.16b, v7.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+ "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
+ ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x6f87e95e // udot v30.4s, v10.16b, v7.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x6f87e97f // udot v31.4s, v11.16b, v7.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x6fa7e99c // udot v28.4s, v12.16b, v7.4b[3]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6fa7e9bd // udot v29.4s, v13.16b, v7.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ ".word 0x6fa7e9de // udot v30.4s, v14.16b, v7.4b[3]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6fa7e9ff // udot v31.4s, v15.16b, v7.4b[3]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ "prfm PSTL1KEEP, [c_ptr1]\n"
+ "prfm PSTL1KEEP, [c_ptr2]\n"
+ "prfm PSTL1KEEP, [c_ptr3]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "cbz %[regs], 4f\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr d5, [a_ptr1]\n"
+ ".word 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
+ "ldr temploadreg1, [a_ptr1, #0x8]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr d6, [a_ptr2]\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr temploadreg2, [a_ptr2, #0x8]\n"
+ ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+ "ldr d7, [a_ptr3]\n"
+ ".word 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
+ "ldr temploadreg3, [a_ptr3, #0x8]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ "ins v4.d[1], temploadreg0\n"
+ ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ "ins v5.d[1], temploadreg1\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
+ "ins v6.d[1], temploadreg2\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
+ "ins v7.d[1], temploadreg3\n"
+ ".word 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ ".word 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ ".word 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6f87e11c // udot v28.4s, v8.16b, v7.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6f87e13d // udot v29.4s, v9.16b, v7.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
+ ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6f87e15e // udot v30.4s, v10.16b, v7.4b[0]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
+ ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6f87e17f // udot v31.4s, v11.16b, v7.4b[0]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
+ ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6fa7e19c // udot v28.4s, v12.16b, v7.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
+ ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x6fa7e1bd // udot v29.4s, v13.16b, v7.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+ ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
+ ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x6fa7e1de // udot v30.4s, v14.16b, v7.4b[1]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+ ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
+ ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6fa7e1ff // udot v31.4s, v15.16b, v7.4b[1]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
+ ".word 0x6f87e91c // udot v28.4s, v8.16b, v7.4b[2]\n"
+ ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+ ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
+ ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
+ ".word 0x6f87e93d // udot v29.4s, v9.16b, v7.4b[2]\n"
+ ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+ ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
+ ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
+ ".word 0x6f87e95e // udot v30.4s, v10.16b, v7.4b[2]\n"
+ ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+ ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
+ ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
+ ".word 0x6f87e97f // udot v31.4s, v11.16b, v7.4b[2]\n"
+ ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+ ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
+ ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
+ ".word 0x6fa7e99c // udot v28.4s, v12.16b, v7.4b[3]\n"
+ ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
+ ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
+ ".word 0x6fa7e9bd // udot v29.4s, v13.16b, v7.4b[3]\n"
+ ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
+ ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
+ ".word 0x6fa7e9de // udot v30.4s, v14.16b, v7.4b[3]\n"
+ ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
+ ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
+ ".word 0x6fa7e9ff // udot v31.4s, v15.16b, v7.4b[3]\n"
+ "b 5f\n"
+ "4:\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+ ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+ ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+ ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+ ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
+ ".word 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+ ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
+ ".word 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+ ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
+ ".word 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+ ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
+ ".word 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+ ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
+ ".word 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+ ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
+ ".word 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+ ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
+ ".word 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+ ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
+ ".word 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n"
+ "5:\n"
+ "cbz %[blocks], 6f\n"
+ "7:\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr s0, [%[a_ptr0]]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x4\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr s1, [a_ptr1]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "add a_ptr1, a_ptr1, #0x4\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr s2, [a_ptr2]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ "add a_ptr2, a_ptr2, #0x4\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr s3, [a_ptr3]\n"
+ ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+ "add a_ptr3, a_ptr3, #0x4\n"
+ ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+ ".word 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
+ ".word 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+ ".word 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+ ".word 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
+ "b.ne 7b\n"
+ "6:\n"
+ "cbz %[odds], 8f\n"
+ "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
+ "ld1 {v1.b}[0], [a_ptr1], #1\n"
+ "ld1 {v2.b}[0], [a_ptr2], #1\n"
+ "ld1 {v3.b}[0], [a_ptr3], #1\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 9f\n"
+ "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
+ "ld1 {v1.b}[1], [a_ptr1], #1\n"
+ "ld1 {v2.b}[1], [a_ptr2], #1\n"
+ "ld1 {v3.b}[1], [a_ptr3], #1\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 9f\n"
+ "ld1 {v0.b}[2], [%[a_ptr0]]\n"
+ "ld1 {v1.b}[2], [a_ptr1]\n"
+ "ld1 {v2.b}[2], [a_ptr2]\n"
+ "ld1 {v3.b}[2], [a_ptr3]\n"
+ "9:\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+ ".word 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+ ".word 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+ ".word 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+ ".word 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
+ "8:\n"
+ "str q16, [%[c_ptr0]]\n"
+ "str q17, [%[c_ptr0], #0x10]\n"
+ "str q18, [%[c_ptr0], #0x20]\n"
+ "str q19, [%[c_ptr0], #0x30]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x40\n"
+ "str q20, [c_ptr1]\n"
+ "str q21, [c_ptr1, #0x10]\n"
+ "str q22, [c_ptr1, #0x20]\n"
+ "str q23, [c_ptr1, #0x30]\n"
+ "str q24, [c_ptr2]\n"
+ "str q25, [c_ptr2, #0x10]\n"
+ "str q26, [c_ptr2, #0x20]\n"
+ "str q27, [c_ptr2, #0x30]\n"
+ "str q28, [c_ptr3]\n"
+ "str q29, [c_ptr3, #0x10]\n"
+ "str q30, [c_ptr3, #0x20]\n"
+ "str q31, [c_ptr3, #0x30]\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ ".unreq temploadreg0\n"
+ ".unreq temploadreg1\n"
+ ".unreq temploadreg2\n"
+ ".unreq temploadreg3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "cc", "memory"
);
break;
}
+ if (use_result_buffer) {
+ for(int cy=0; cy<std::min(M-y, 4); cy++) {
+ for(unsigned int cx=0; cx<width; cx++) {
+ c_ptr_real[cy * ldc + cx] = result_buffer[cy * 16 + cx];
+ }
+ }
+ }
}
}
}
} // namespace arm_gemm
-#endif // __aarch64__
+#endif // __aarch64__ \ No newline at end of file
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/generic.cpp
index dbef02985f..87f46bb261 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/generic.cpp
@@ -37,1569 +37,1806 @@ void a64_hybrid_u8u32_dot_16x4(const uint8_t *A, int lda, const uint8_t *B, uint
const long loops_count = ((K + 16) / 32) - 1;
K -= loops_count * 32;
const long regs_count = (K / 16) - 1;
+ K -= (regs_count + 1) * 16;
+ const long blocks_count = K / 4;
+ const long odds_count = K - (blocks_count * 4);
for (int y=0; y<M; y+=4) {
const uint8_t * const a_ptr0_base = A + (y * lda);
const unsigned long ldab = lda * sizeof(uint8_t);
uint32_t *c_ptr0 = C + (y * ldc);
- const unsigned long ldcb = ldc * sizeof(uint32_t);
for (int x0=0; x0<N; x0+=16ul) {
const long width = std::min((unsigned long)N-x0, 16ul);
const uint32_t *betaptr = &beta;
long loops = loops_count;
long regs = regs_count;
+ long blocks = blocks_count;
+ long odds = odds_count;
const uint8_t *a_ptr0 = a_ptr0_base;
const uint8_t *b_ptr0 = B + (K_stride * x0);
+ const bool use_result_buffer = (width < 16);
+ uint32_t result_buffer[64];
+ const unsigned long ldcb = (use_result_buffer ? 16 : ldc) * sizeof(uint32_t);
+ uint32_t *c_ptr_real = c_ptr0;
+ if (use_result_buffer && !beta0) {
+ for(int cy=0; cy<std::min(M-y, 4); cy++) {
+ for(unsigned int cx=0; cx<width; cx++) {
+ result_buffer[cy * 16 + cx] = c_ptr_real[cy * ldc + cx];
+ }
+ }
+ }
+ if (use_result_buffer) {
+ c_ptr0 = result_buffer;
+ }
switch(M-y) {
case 1:
__asm __volatile (
- "cbz %[beta0], 1f\n"
- "movi v16.4s, #0\n"
- "ldr q0, [%[a_ptr0]]\n"
- "movi v17.4s, #0\n"
- "ldr q8, [%[b_ptr0]]\n"
- "movi v18.4s, #0\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "movi v19.4s, #0\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1r {v15.4s}, [%[betaptr]]\n"
- "ldr q16, [%[c_ptr0]]\n"
- "ldr q17, [%[c_ptr0], #0x10]\n"
- "ldr q18, [%[c_ptr0], #0x20]\n"
- "ldr q19, [%[c_ptr0], #0x30]\n"
- "mul v16.4s, v16.4s, v15.4s\n"
- "ldr q0, [%[a_ptr0]]\n"
- "mul v17.4s, v17.4s, v15.4s\n"
- "ldr q8, [%[b_ptr0]]\n"
- "mul v18.4s, v18.4s, v15.4s\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "mul v19.4s, v19.4s, v15.4s\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- "subs %[loops], %[loops], #0x1\n"
- ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
- "ldr q0, [%[a_ptr0], #-0x10]\n"
- ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- "b.ne 3b\n"
- "2:\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "cbz %[regs], 4f\n"
- ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
- "ldr q0, [%[a_ptr0], #0x10]\n"
- ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
- ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
- ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
- ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
- ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
- "b 5f\n"
- "4:\n"
- ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- "5:\n"
- "str q16, [%[c_ptr0]]\n"
- "str q17, [%[c_ptr0], #0x10]\n"
- "str q18, [%[c_ptr0], #0x20]\n"
- "str q19, [%[c_ptr0], #0x30]\n"
- "add %[c_ptr0], %[c_ptr0], #0x40\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
- : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
+ "cbz %[beta0], 1f\n"
+ "movi v16.4s, #0\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "movi v17.4s, #0\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "movi v18.4s, #0\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "movi v19.4s, #0\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1r {v15.4s}, [%[betaptr]]\n"
+ "ldr q16, [%[c_ptr0]]\n"
+ "ldr q17, [%[c_ptr0], #0x10]\n"
+ "ldr q18, [%[c_ptr0], #0x20]\n"
+ "ldr q19, [%[c_ptr0], #0x30]\n"
+ "mul v16.4s, v16.4s, v15.4s\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "mul v17.4s, v17.4s, v15.4s\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "mul v18.4s, v18.4s, v15.4s\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "mul v19.4s, v19.4s, v15.4s\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+ "ldr q0, [%[a_ptr0], #-0x10]\n"
+ ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+ "cbz %[regs], 4f\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+ ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+ ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+ ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+ "b 5f\n"
+ "4:\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ "5:\n"
+ "cbz %[blocks], 6f\n"
+ "7:\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr s0, [%[a_ptr0]]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x4\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ "b.ne 7b\n"
+ "6:\n"
+ "cbz %[odds], 8f\n"
+ "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 9f\n"
+ "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 9f\n"
+ "ld1 {v0.b}[2], [%[a_ptr0]]\n"
+ "9:\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ "8:\n"
+ "str q16, [%[c_ptr0]]\n"
+ "str q17, [%[c_ptr0], #0x10]\n"
+ "str q18, [%[c_ptr0], #0x20]\n"
+ "str q19, [%[c_ptr0], #0x30]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x40\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
);
break;
case 2:
__asm __volatile (
- "a_ptr1 .req X0\n"
- "c_ptr1 .req X1\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "cbz %[beta0], 1f\n"
- "movi v16.4s, #0\n"
- "ldr q0, [%[a_ptr0]]\n"
- "movi v17.4s, #0\n"
- "ldr q1, [a_ptr1]\n"
- "movi v18.4s, #0\n"
- "ldr q8, [%[b_ptr0]]\n"
- "movi v19.4s, #0\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "movi v20.4s, #0\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "movi v21.4s, #0\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "movi v22.4s, #0\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "movi v23.4s, #0\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1r {v15.4s}, [%[betaptr]]\n"
- "ldr q16, [%[c_ptr0]]\n"
- "ldr q17, [%[c_ptr0], #0x10]\n"
- "ldr q18, [%[c_ptr0], #0x20]\n"
- "ldr q19, [%[c_ptr0], #0x30]\n"
- "mul v16.4s, v16.4s, v15.4s\n"
- "ldr q20, [c_ptr1]\n"
- "mul v17.4s, v17.4s, v15.4s\n"
- "ldr q21, [c_ptr1, #0x10]\n"
- "mul v18.4s, v18.4s, v15.4s\n"
- "ldr q22, [c_ptr1, #0x20]\n"
- "mul v19.4s, v19.4s, v15.4s\n"
- "ldr q23, [c_ptr1, #0x30]\n"
- "mul v20.4s, v20.4s, v15.4s\n"
- "ldr q0, [%[a_ptr0]]\n"
- "mul v21.4s, v21.4s, v15.4s\n"
- "ldr q1, [a_ptr1]\n"
- "mul v22.4s, v22.4s, v15.4s\n"
- "ldr q8, [%[b_ptr0]]\n"
- "mul v23.4s, v23.4s, v15.4s\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr q5, [a_ptr1]\n"
- ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
- ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr q0, [%[a_ptr0], #-0x10]\n"
- ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr q1, [a_ptr1, #-0x10]\n"
- ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
- ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
- ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
- ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
- ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
- ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
- ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
- ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
- ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
- ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
- ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
- ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
- ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
- ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
- ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
- "b.ne 3b\n"
- "2:\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "prfm PSTL1KEEP, [c_ptr1]\n"
- "cbz %[regs], 4f\n"
- ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr q5, [a_ptr1]\n"
- ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr q0, [%[a_ptr0], #0x10]\n"
- ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr q1, [a_ptr1, #0x10]\n"
- ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
- ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
- ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
- ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
- ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
- ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
- ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
- ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
- ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
- ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
- ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
- ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
- ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
- ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
- ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
- ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
- ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
- ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
- ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
- ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
- ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
- ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
- ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
- "b 5f\n"
- "4:\n"
- ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr q5, [a_ptr1]\n"
- ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
- ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
- ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
- ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
- ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
- ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
- ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
- ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
- "5:\n"
- "str q16, [%[c_ptr0]]\n"
- "str q17, [%[c_ptr0], #0x10]\n"
- "str q18, [%[c_ptr0], #0x20]\n"
- "str q19, [%[c_ptr0], #0x30]\n"
- "add %[c_ptr0], %[c_ptr0], #0x40\n"
- "str q20, [c_ptr1]\n"
- "str q21, [c_ptr1, #0x10]\n"
- "str q22, [c_ptr1, #0x20]\n"
- "str q23, [c_ptr1, #0x30]\n"
- ".unreq a_ptr1\n"
- ".unreq c_ptr1\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
- : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "cc", "memory"
+ "a_ptr1 .req X0\n"
+ "c_ptr1 .req X1\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "cbz %[beta0], 1f\n"
+ "movi v16.4s, #0\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "movi v17.4s, #0\n"
+ "ldr q1, [a_ptr1]\n"
+ "movi v18.4s, #0\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "movi v19.4s, #0\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "movi v20.4s, #0\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "movi v21.4s, #0\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "movi v22.4s, #0\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "movi v23.4s, #0\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1r {v15.4s}, [%[betaptr]]\n"
+ "ldr q16, [%[c_ptr0]]\n"
+ "ldr q17, [%[c_ptr0], #0x10]\n"
+ "ldr q18, [%[c_ptr0], #0x20]\n"
+ "ldr q19, [%[c_ptr0], #0x30]\n"
+ "mul v16.4s, v16.4s, v15.4s\n"
+ "ldr q20, [c_ptr1]\n"
+ "mul v17.4s, v17.4s, v15.4s\n"
+ "ldr q21, [c_ptr1, #0x10]\n"
+ "mul v18.4s, v18.4s, v15.4s\n"
+ "ldr q22, [c_ptr1, #0x20]\n"
+ "mul v19.4s, v19.4s, v15.4s\n"
+ "ldr q23, [c_ptr1, #0x30]\n"
+ "mul v20.4s, v20.4s, v15.4s\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "mul v21.4s, v21.4s, v15.4s\n"
+ "ldr q1, [a_ptr1]\n"
+ "mul v22.4s, v22.4s, v15.4s\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "mul v23.4s, v23.4s, v15.4s\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr q5, [a_ptr1]\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+ ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+ ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr q0, [%[a_ptr0], #-0x10]\n"
+ ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr q1, [a_ptr1, #-0x10]\n"
+ ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+ ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+ ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+ ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+ ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+ ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+ ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+ ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+ ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+ ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+ ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+ ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+ "prfm PSTL1KEEP, [c_ptr1]\n"
+ "cbz %[regs], 4f\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr q5, [a_ptr1]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+ ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+ ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+ ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+ ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+ ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+ ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+ ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+ ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
+ ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+ ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
+ ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+ ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
+ ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+ ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
+ ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+ ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
+ ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
+ ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
+ ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
+ "b 5f\n"
+ "4:\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+ "5:\n"
+ "cbz %[blocks], 6f\n"
+ "7:\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr s0, [%[a_ptr0]]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x4\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr s1, [a_ptr1]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "add a_ptr1, a_ptr1, #0x4\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ "b.ne 7b\n"
+ "6:\n"
+ "cbz %[odds], 8f\n"
+ "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
+ "ld1 {v1.b}[0], [a_ptr1], #1\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 9f\n"
+ "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
+ "ld1 {v1.b}[1], [a_ptr1], #1\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 9f\n"
+ "ld1 {v0.b}[2], [%[a_ptr0]]\n"
+ "ld1 {v1.b}[2], [a_ptr1]\n"
+ "9:\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ "8:\n"
+ "str q16, [%[c_ptr0]]\n"
+ "str q17, [%[c_ptr0], #0x10]\n"
+ "str q18, [%[c_ptr0], #0x20]\n"
+ "str q19, [%[c_ptr0], #0x30]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x40\n"
+ "str q20, [c_ptr1]\n"
+ "str q21, [c_ptr1, #0x10]\n"
+ "str q22, [c_ptr1, #0x20]\n"
+ "str q23, [c_ptr1, #0x30]\n"
+ ".unreq a_ptr1\n"
+ ".unreq c_ptr1\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "cc", "memory"
);
break;
case 3:
__asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "c_ptr1 .req X2\n"
- "c_ptr2 .req X3\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "cbz %[beta0], 1f\n"
- "movi v16.4s, #0\n"
- "ldr q0, [%[a_ptr0]]\n"
- "movi v17.4s, #0\n"
- "ldr q1, [a_ptr1]\n"
- "movi v18.4s, #0\n"
- "ldr q2, [a_ptr2]\n"
- "movi v19.4s, #0\n"
- "ldr q8, [%[b_ptr0]]\n"
- "movi v20.4s, #0\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "movi v21.4s, #0\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "movi v22.4s, #0\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "movi v23.4s, #0\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "movi v24.4s, #0\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "movi v25.4s, #0\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "movi v26.4s, #0\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "movi v27.4s, #0\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1r {v15.4s}, [%[betaptr]]\n"
- "ldr q16, [%[c_ptr0]]\n"
- "ldr q17, [%[c_ptr0], #0x10]\n"
- "ldr q18, [%[c_ptr0], #0x20]\n"
- "ldr q19, [%[c_ptr0], #0x30]\n"
- "mul v16.4s, v16.4s, v15.4s\n"
- "ldr q20, [c_ptr1]\n"
- "mul v17.4s, v17.4s, v15.4s\n"
- "ldr q21, [c_ptr1, #0x10]\n"
- "mul v18.4s, v18.4s, v15.4s\n"
- "ldr q22, [c_ptr1, #0x20]\n"
- "mul v19.4s, v19.4s, v15.4s\n"
- "ldr q23, [c_ptr1, #0x30]\n"
- "mul v20.4s, v20.4s, v15.4s\n"
- "ldr q24, [c_ptr2]\n"
- "mul v21.4s, v21.4s, v15.4s\n"
- "ldr q25, [c_ptr2, #0x10]\n"
- "mul v22.4s, v22.4s, v15.4s\n"
- "ldr q26, [c_ptr2, #0x20]\n"
- "mul v23.4s, v23.4s, v15.4s\n"
- "ldr q27, [c_ptr2, #0x30]\n"
- "mul v24.4s, v24.4s, v15.4s\n"
- "ldr q0, [%[a_ptr0]]\n"
- "mul v25.4s, v25.4s, v15.4s\n"
- "ldr q1, [a_ptr1]\n"
- "mul v26.4s, v26.4s, v15.4s\n"
- "ldr q2, [a_ptr2]\n"
- "mul v27.4s, v27.4s, v15.4s\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr q5, [a_ptr1]\n"
- ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr q6, [a_ptr2]\n"
- ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
- ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
- ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
- ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
- ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
- ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
- ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
- ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
- ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
- ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
- ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
- ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
- ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr q0, [%[a_ptr0], #-0x10]\n"
- ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
- "ldr q1, [a_ptr1, #-0x10]\n"
- ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr q2, [a_ptr2, #-0x10]\n"
- ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
- ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
- ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
- ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
- ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
- ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
- ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
- ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
- ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
- ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
- ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
- ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
- ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
- ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
- ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
- ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
- ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
- ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
- ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
- ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
- ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
- ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
- ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
- ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
- ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
- ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
- ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
- ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
- ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
- ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
- ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
- "b.ne 3b\n"
- "2:\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "prfm PSTL1KEEP, [c_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2]\n"
- "cbz %[regs], 4f\n"
- ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr q5, [a_ptr1]\n"
- ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr q6, [a_ptr2]\n"
- ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
- ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
- ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
- ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
- ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
- ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
- ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
- ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
- ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
- ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
- ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
- ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr q0, [%[a_ptr0], #0x10]\n"
- ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
- "ldr q1, [a_ptr1, #0x10]\n"
- ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr q2, [a_ptr2, #0x10]\n"
- ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
- ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
- ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
- ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
- ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
- ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
- ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
- ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
- ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
- ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
- ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
- ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
- ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
- ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
- ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
- ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
- ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
- ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
- ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
- ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
- ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
- ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
- ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
- ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
- ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
- ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
- ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
- ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
- ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
- ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
- ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
- ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
- ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
- ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
- ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
- ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
- ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
- ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
- ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
- "b 5f\n"
- "4:\n"
- ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr q5, [a_ptr1]\n"
- ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr q6, [a_ptr2]\n"
- ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
- ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
- ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
- ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
- ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
- ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
- ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
- ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
- ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
- ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
- ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
- ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
- ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
- ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
- ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
- ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
- ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
- ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
- ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
- ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
- "5:\n"
- "str q16, [%[c_ptr0]]\n"
- "str q17, [%[c_ptr0], #0x10]\n"
- "str q18, [%[c_ptr0], #0x20]\n"
- "str q19, [%[c_ptr0], #0x30]\n"
- "add %[c_ptr0], %[c_ptr0], #0x40\n"
- "str q20, [c_ptr1]\n"
- "str q21, [c_ptr1, #0x10]\n"
- "str q22, [c_ptr1, #0x20]\n"
- "str q23, [c_ptr1, #0x30]\n"
- "str q24, [c_ptr2]\n"
- "str q25, [c_ptr2, #0x10]\n"
- "str q26, [c_ptr2, #0x20]\n"
- "str q27, [c_ptr2, #0x30]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
- : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory"
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "c_ptr1 .req X2\n"
+ "c_ptr2 .req X3\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "cbz %[beta0], 1f\n"
+ "movi v16.4s, #0\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "movi v17.4s, #0\n"
+ "ldr q1, [a_ptr1]\n"
+ "movi v18.4s, #0\n"
+ "ldr q2, [a_ptr2]\n"
+ "movi v19.4s, #0\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "movi v20.4s, #0\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "movi v21.4s, #0\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "movi v22.4s, #0\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "movi v23.4s, #0\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "movi v24.4s, #0\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "movi v25.4s, #0\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "movi v26.4s, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "movi v27.4s, #0\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1r {v15.4s}, [%[betaptr]]\n"
+ "ldr q16, [%[c_ptr0]]\n"
+ "ldr q17, [%[c_ptr0], #0x10]\n"
+ "ldr q18, [%[c_ptr0], #0x20]\n"
+ "ldr q19, [%[c_ptr0], #0x30]\n"
+ "mul v16.4s, v16.4s, v15.4s\n"
+ "ldr q20, [c_ptr1]\n"
+ "mul v17.4s, v17.4s, v15.4s\n"
+ "ldr q21, [c_ptr1, #0x10]\n"
+ "mul v18.4s, v18.4s, v15.4s\n"
+ "ldr q22, [c_ptr1, #0x20]\n"
+ "mul v19.4s, v19.4s, v15.4s\n"
+ "ldr q23, [c_ptr1, #0x30]\n"
+ "mul v20.4s, v20.4s, v15.4s\n"
+ "ldr q24, [c_ptr2]\n"
+ "mul v21.4s, v21.4s, v15.4s\n"
+ "ldr q25, [c_ptr2, #0x10]\n"
+ "mul v22.4s, v22.4s, v15.4s\n"
+ "ldr q26, [c_ptr2, #0x20]\n"
+ "mul v23.4s, v23.4s, v15.4s\n"
+ "ldr q27, [c_ptr2, #0x30]\n"
+ "mul v24.4s, v24.4s, v15.4s\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "mul v25.4s, v25.4s, v15.4s\n"
+ "ldr q1, [a_ptr1]\n"
+ "mul v26.4s, v26.4s, v15.4s\n"
+ "ldr q2, [a_ptr2]\n"
+ "mul v27.4s, v27.4s, v15.4s\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr q5, [a_ptr1]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr q6, [a_ptr2]\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+ ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
+ ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+ "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+ ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
+ ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+ ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+ ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+ ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+ ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+ ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+ ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+ ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+ ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+ ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+ ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr q0, [%[a_ptr0], #-0x10]\n"
+ ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+ "ldr q1, [a_ptr1, #-0x10]\n"
+ ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr q2, [a_ptr2, #-0x10]\n"
+ ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
+ ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+ ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
+ ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+ ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
+ ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+ ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
+ ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+ ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
+ ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+ ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
+ ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+ ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
+ ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+ ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
+ ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
+ ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+ ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
+ ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+ ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
+ ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+ ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
+ ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+ ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
+ ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
+ ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
+ ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
+ ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+ "prfm PSTL1KEEP, [c_ptr1]\n"
+ "prfm PSTL1KEEP, [c_ptr2]\n"
+ "cbz %[regs], 4f\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr q5, [a_ptr1]\n"
+ ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr q6, [a_ptr2]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+ ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+ ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+ ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+ ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+ ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+ ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+ ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+ ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+ ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+ ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+ ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+ ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+ ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
+ ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+ ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
+ ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+ ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
+ ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+ ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
+ ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+ ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
+ ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+ ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
+ ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+ ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
+ ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+ ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
+ ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
+ ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
+ ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+ ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
+ ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
+ ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+ ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
+ ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
+ ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+ ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
+ ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
+ ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+ ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
+ ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
+ ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
+ ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
+ ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
+ ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
+ ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
+ ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
+ "b 5f\n"
+ "4:\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+ ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+ ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+ ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+ ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+ ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+ ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+ ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+ ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+ ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+ ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+ ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+ ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
+ "5:\n"
+ "cbz %[blocks], 6f\n"
+ "7:\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr s0, [%[a_ptr0]]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x4\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr s1, [a_ptr1]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "add a_ptr1, a_ptr1, #0x4\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr s2, [a_ptr2]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ "add a_ptr2, a_ptr2, #0x4\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+ ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+ "b.ne 7b\n"
+ "6:\n"
+ "cbz %[odds], 8f\n"
+ "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
+ "ld1 {v1.b}[0], [a_ptr1], #1\n"
+ "ld1 {v2.b}[0], [a_ptr2], #1\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 9f\n"
+ "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
+ "ld1 {v1.b}[1], [a_ptr1], #1\n"
+ "ld1 {v2.b}[1], [a_ptr2], #1\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 9f\n"
+ "ld1 {v0.b}[2], [%[a_ptr0]]\n"
+ "ld1 {v1.b}[2], [a_ptr1]\n"
+ "ld1 {v2.b}[2], [a_ptr2]\n"
+ "9:\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+ "8:\n"
+ "str q16, [%[c_ptr0]]\n"
+ "str q17, [%[c_ptr0], #0x10]\n"
+ "str q18, [%[c_ptr0], #0x20]\n"
+ "str q19, [%[c_ptr0], #0x30]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x40\n"
+ "str q20, [c_ptr1]\n"
+ "str q21, [c_ptr1, #0x10]\n"
+ "str q22, [c_ptr1, #0x20]\n"
+ "str q23, [c_ptr1, #0x30]\n"
+ "str q24, [c_ptr2]\n"
+ "str q25, [c_ptr2, #0x10]\n"
+ "str q26, [c_ptr2, #0x20]\n"
+ "str q27, [c_ptr2, #0x30]\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory"
);
break;
default:
case 4:
__asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "c_ptr1 .req X3\n"
- "c_ptr2 .req X4\n"
- "c_ptr3 .req X5\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "cbz %[beta0], 1f\n"
- "movi v16.4s, #0\n"
- "ldr q0, [%[a_ptr0]]\n"
- "movi v17.4s, #0\n"
- "ldr q1, [a_ptr1]\n"
- "movi v18.4s, #0\n"
- "ldr q2, [a_ptr2]\n"
- "movi v19.4s, #0\n"
- "ldr q3, [a_ptr3]\n"
- "movi v20.4s, #0\n"
- "ldr q8, [%[b_ptr0]]\n"
- "movi v21.4s, #0\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "movi v22.4s, #0\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "movi v23.4s, #0\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "movi v24.4s, #0\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "movi v25.4s, #0\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "movi v26.4s, #0\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "movi v27.4s, #0\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "movi v28.4s, #0\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "movi v29.4s, #0\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "movi v30.4s, #0\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "movi v31.4s, #0\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1r {v15.4s}, [%[betaptr]]\n"
- "ldr q16, [%[c_ptr0]]\n"
- "ldr q17, [%[c_ptr0], #0x10]\n"
- "ldr q18, [%[c_ptr0], #0x20]\n"
- "ldr q19, [%[c_ptr0], #0x30]\n"
- "mul v16.4s, v16.4s, v15.4s\n"
- "ldr q20, [c_ptr1]\n"
- "mul v17.4s, v17.4s, v15.4s\n"
- "ldr q21, [c_ptr1, #0x10]\n"
- "mul v18.4s, v18.4s, v15.4s\n"
- "ldr q22, [c_ptr1, #0x20]\n"
- "mul v19.4s, v19.4s, v15.4s\n"
- "ldr q23, [c_ptr1, #0x30]\n"
- "mul v20.4s, v20.4s, v15.4s\n"
- "ldr q24, [c_ptr2]\n"
- "mul v21.4s, v21.4s, v15.4s\n"
- "ldr q25, [c_ptr2, #0x10]\n"
- "mul v22.4s, v22.4s, v15.4s\n"
- "ldr q26, [c_ptr2, #0x20]\n"
- "mul v23.4s, v23.4s, v15.4s\n"
- "ldr q27, [c_ptr2, #0x30]\n"
- "mul v24.4s, v24.4s, v15.4s\n"
- "ldr q28, [c_ptr3]\n"
- "mul v25.4s, v25.4s, v15.4s\n"
- "ldr q29, [c_ptr3, #0x10]\n"
- "mul v26.4s, v26.4s, v15.4s\n"
- "ldr q30, [c_ptr3, #0x20]\n"
- "mul v27.4s, v27.4s, v15.4s\n"
- "ldr q31, [c_ptr3, #0x30]\n"
- "mul v28.4s, v28.4s, v15.4s\n"
- "ldr q0, [%[a_ptr0]]\n"
- "mul v29.4s, v29.4s, v15.4s\n"
- "ldr q1, [a_ptr1]\n"
- "mul v30.4s, v30.4s, v15.4s\n"
- "ldr q2, [a_ptr2]\n"
- "mul v31.4s, v31.4s, v15.4s\n"
- "ldr q3, [a_ptr3]\n"
- "ldr q8, [%[b_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr q5, [a_ptr1]\n"
- ".word 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
- "ldr q6, [a_ptr2]\n"
- ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr q7, [a_ptr3]\n"
- ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- ".word 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- ".word 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- "add a_ptr3, a_ptr3, #0x20\n"
- ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
- ".word 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
- ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
- ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
- ".word 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
- ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
- ".word 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
- ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
- ".word 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
- ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
- ".word 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
- ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
- ".word 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
- ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
- ".word 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
- ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
- ".word 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
- ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
- ".word 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
- ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
- ".word 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
- ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
- ".word 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
- ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
- ".word 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr q0, [%[a_ptr0], #-0x10]\n"
- ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
- "ldr q1, [a_ptr1, #-0x10]\n"
- ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
- "ldr q2, [a_ptr2, #-0x10]\n"
- ".word 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr q3, [a_ptr3, #-0x10]\n"
- ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
- ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
- ".word 0x6f87e11c // udot v28.4s, v8.16b, v7.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
- ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
- ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
- ".word 0x6f87e13d // udot v29.4s, v9.16b, v7.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
- ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
- ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
- ".word 0x6f87e15e // udot v30.4s, v10.16b, v7.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
- ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
- ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
- ".word 0x6f87e17f // udot v31.4s, v11.16b, v7.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
- ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
- ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
- ".word 0x6fa7e19c // udot v28.4s, v12.16b, v7.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
- ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
- ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
- ".word 0x6fa7e1bd // udot v29.4s, v13.16b, v7.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
- ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
- ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
- ".word 0x6fa7e1de // udot v30.4s, v14.16b, v7.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
- ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
- ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
- ".word 0x6fa7e1ff // udot v31.4s, v15.16b, v7.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
- ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
- ".word 0x6f87e91c // udot v28.4s, v8.16b, v7.4b[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
- ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
- ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
- ".word 0x6f87e93d // udot v29.4s, v9.16b, v7.4b[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
- ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
- ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
- ".word 0x6f87e95e // udot v30.4s, v10.16b, v7.4b[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
- ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
- ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
- ".word 0x6f87e97f // udot v31.4s, v11.16b, v7.4b[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
- ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
- ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
- ".word 0x6fa7e99c // udot v28.4s, v12.16b, v7.4b[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
- ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
- ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
- ".word 0x6fa7e9bd // udot v29.4s, v13.16b, v7.4b[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
- ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
- ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
- ".word 0x6fa7e9de // udot v30.4s, v14.16b, v7.4b[3]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
- ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
- ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
- ".word 0x6fa7e9ff // udot v31.4s, v15.16b, v7.4b[3]\n"
- "b.ne 3b\n"
- "2:\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "prfm PSTL1KEEP, [c_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2]\n"
- "prfm PSTL1KEEP, [c_ptr3]\n"
- "cbz %[regs], 4f\n"
- ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr q5, [a_ptr1]\n"
- ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr q6, [a_ptr2]\n"
- ".word 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
- "ldr q7, [a_ptr3]\n"
- ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
- ".word 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
- ".word 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
- ".word 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
- ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
- ".word 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
- ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
- ".word 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
- ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
- ".word 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
- ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
- ".word 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
- ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
- ".word 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
- ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
- ".word 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
- ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
- ".word 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
- ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
- ".word 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
- ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
- ".word 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
- ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
- ".word 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
- ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
- ".word 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr q0, [%[a_ptr0], #0x10]\n"
- ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
- "ldr q1, [a_ptr1, #0x10]\n"
- ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
- "ldr q2, [a_ptr2, #0x10]\n"
- ".word 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr q3, [a_ptr3, #0x10]\n"
- ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
- ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
- ".word 0x6f87e11c // udot v28.4s, v8.16b, v7.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
- ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
- ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
- ".word 0x6f87e13d // udot v29.4s, v9.16b, v7.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
- ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
- ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
- ".word 0x6f87e15e // udot v30.4s, v10.16b, v7.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
- ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
- ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
- ".word 0x6f87e17f // udot v31.4s, v11.16b, v7.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
- ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
- ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
- ".word 0x6fa7e19c // udot v28.4s, v12.16b, v7.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
- ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
- ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
- ".word 0x6fa7e1bd // udot v29.4s, v13.16b, v7.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
- ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
- ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
- ".word 0x6fa7e1de // udot v30.4s, v14.16b, v7.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
- ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
- ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
- ".word 0x6fa7e1ff // udot v31.4s, v15.16b, v7.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
- ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
- ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
- ".word 0x6f87e91c // udot v28.4s, v8.16b, v7.4b[2]\n"
- ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
- ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
- ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
- ".word 0x6f87e93d // udot v29.4s, v9.16b, v7.4b[2]\n"
- ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
- ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
- ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
- ".word 0x6f87e95e // udot v30.4s, v10.16b, v7.4b[2]\n"
- ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
- ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
- ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
- ".word 0x6f87e97f // udot v31.4s, v11.16b, v7.4b[2]\n"
- ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
- ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
- ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
- ".word 0x6fa7e99c // udot v28.4s, v12.16b, v7.4b[3]\n"
- ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
- ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
- ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
- ".word 0x6fa7e9bd // udot v29.4s, v13.16b, v7.4b[3]\n"
- ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
- ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
- ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
- ".word 0x6fa7e9de // udot v30.4s, v14.16b, v7.4b[3]\n"
- ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
- ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
- ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
- ".word 0x6fa7e9ff // udot v31.4s, v15.16b, v7.4b[3]\n"
- "b 5f\n"
- "4:\n"
- ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr q5, [a_ptr1]\n"
- ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr q6, [a_ptr2]\n"
- ".word 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
- "ldr q7, [a_ptr3]\n"
- ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
- ".word 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
- ".word 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
- ".word 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
- ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
- ".word 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
- ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
- ".word 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
- ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
- ".word 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
- ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
- ".word 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
- ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
- ".word 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n"
- ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
- ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
- ".word 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n"
- ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
- ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
- ".word 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n"
- ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
- ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
- ".word 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n"
- ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
- ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
- ".word 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n"
- ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
- ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
- ".word 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n"
- ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
- ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
- ".word 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n"
- ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
- ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
- ".word 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n"
- "5:\n"
- "str q16, [%[c_ptr0]]\n"
- "str q17, [%[c_ptr0], #0x10]\n"
- "str q18, [%[c_ptr0], #0x20]\n"
- "str q19, [%[c_ptr0], #0x30]\n"
- "add %[c_ptr0], %[c_ptr0], #0x40\n"
- "str q20, [c_ptr1]\n"
- "str q21, [c_ptr1, #0x10]\n"
- "str q22, [c_ptr1, #0x20]\n"
- "str q23, [c_ptr1, #0x30]\n"
- "str q24, [c_ptr2]\n"
- "str q25, [c_ptr2, #0x10]\n"
- "str q26, [c_ptr2, #0x20]\n"
- "str q27, [c_ptr2, #0x30]\n"
- "str q28, [c_ptr3]\n"
- "str q29, [c_ptr3, #0x10]\n"
- "str q30, [c_ptr3, #0x20]\n"
- "str q31, [c_ptr3, #0x30]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
- : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "cbz %[beta0], 1f\n"
+ "movi v16.4s, #0\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "movi v17.4s, #0\n"
+ "ldr q1, [a_ptr1]\n"
+ "movi v18.4s, #0\n"
+ "ldr q2, [a_ptr2]\n"
+ "movi v19.4s, #0\n"
+ "ldr q3, [a_ptr3]\n"
+ "movi v20.4s, #0\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "movi v21.4s, #0\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "movi v22.4s, #0\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "movi v23.4s, #0\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "movi v24.4s, #0\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "movi v25.4s, #0\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "movi v26.4s, #0\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "movi v27.4s, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "movi v28.4s, #0\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "movi v29.4s, #0\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "movi v30.4s, #0\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "movi v31.4s, #0\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1r {v15.4s}, [%[betaptr]]\n"
+ "ldr q16, [%[c_ptr0]]\n"
+ "ldr q17, [%[c_ptr0], #0x10]\n"
+ "ldr q18, [%[c_ptr0], #0x20]\n"
+ "ldr q19, [%[c_ptr0], #0x30]\n"
+ "mul v16.4s, v16.4s, v15.4s\n"
+ "ldr q20, [c_ptr1]\n"
+ "mul v17.4s, v17.4s, v15.4s\n"
+ "ldr q21, [c_ptr1, #0x10]\n"
+ "mul v18.4s, v18.4s, v15.4s\n"
+ "ldr q22, [c_ptr1, #0x20]\n"
+ "mul v19.4s, v19.4s, v15.4s\n"
+ "ldr q23, [c_ptr1, #0x30]\n"
+ "mul v20.4s, v20.4s, v15.4s\n"
+ "ldr q24, [c_ptr2]\n"
+ "mul v21.4s, v21.4s, v15.4s\n"
+ "ldr q25, [c_ptr2, #0x10]\n"
+ "mul v22.4s, v22.4s, v15.4s\n"
+ "ldr q26, [c_ptr2, #0x20]\n"
+ "mul v23.4s, v23.4s, v15.4s\n"
+ "ldr q27, [c_ptr2, #0x30]\n"
+ "mul v24.4s, v24.4s, v15.4s\n"
+ "ldr q28, [c_ptr3]\n"
+ "mul v25.4s, v25.4s, v15.4s\n"
+ "ldr q29, [c_ptr3, #0x10]\n"
+ "mul v26.4s, v26.4s, v15.4s\n"
+ "ldr q30, [c_ptr3, #0x20]\n"
+ "mul v27.4s, v27.4s, v15.4s\n"
+ "ldr q31, [c_ptr3, #0x30]\n"
+ "mul v28.4s, v28.4s, v15.4s\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "mul v29.4s, v29.4s, v15.4s\n"
+ "ldr q1, [a_ptr1]\n"
+ "mul v30.4s, v30.4s, v15.4s\n"
+ "ldr q2, [a_ptr2]\n"
+ "mul v31.4s, v31.4s, v15.4s\n"
+ "ldr q3, [a_ptr3]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr q5, [a_ptr1]\n"
+ ".word 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
+ "ldr q6, [a_ptr2]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr q7, [a_ptr3]\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ ".word 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ ".word 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ "add a_ptr3, a_ptr3, #0x20\n"
+ ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+ "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+ ".word 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
+ ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+ "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
+ ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
+ ".word 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+ ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
+ ".word 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+ ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
+ ".word 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+ ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
+ ".word 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+ ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
+ ".word 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+ ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
+ ".word 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+ ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
+ ".word 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+ ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
+ ".word 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+ ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
+ ".word 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+ ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
+ ".word 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+ ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
+ ".word 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr q0, [%[a_ptr0], #-0x10]\n"
+ ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+ "ldr q1, [a_ptr1, #-0x10]\n"
+ ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
+ "ldr q2, [a_ptr2, #-0x10]\n"
+ ".word 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr q3, [a_ptr3, #-0x10]\n"
+ ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
+ ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
+ ".word 0x6f87e11c // udot v28.4s, v8.16b, v7.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+ ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
+ ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
+ ".word 0x6f87e13d // udot v29.4s, v9.16b, v7.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+ ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
+ ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
+ ".word 0x6f87e15e // udot v30.4s, v10.16b, v7.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+ ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
+ ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
+ ".word 0x6f87e17f // udot v31.4s, v11.16b, v7.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+ ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
+ ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
+ ".word 0x6fa7e19c // udot v28.4s, v12.16b, v7.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+ ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
+ ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
+ ".word 0x6fa7e1bd // udot v29.4s, v13.16b, v7.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+ ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
+ ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
+ ".word 0x6fa7e1de // udot v30.4s, v14.16b, v7.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+ ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
+ ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
+ ".word 0x6fa7e1ff // udot v31.4s, v15.16b, v7.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
+ ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
+ ".word 0x6f87e91c // udot v28.4s, v8.16b, v7.4b[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+ ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
+ ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
+ ".word 0x6f87e93d // udot v29.4s, v9.16b, v7.4b[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+ ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
+ ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
+ ".word 0x6f87e95e // udot v30.4s, v10.16b, v7.4b[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+ ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
+ ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
+ ".word 0x6f87e97f // udot v31.4s, v11.16b, v7.4b[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+ ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
+ ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
+ ".word 0x6fa7e99c // udot v28.4s, v12.16b, v7.4b[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
+ ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
+ ".word 0x6fa7e9bd // udot v29.4s, v13.16b, v7.4b[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
+ ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
+ ".word 0x6fa7e9de // udot v30.4s, v14.16b, v7.4b[3]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
+ ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
+ ".word 0x6fa7e9ff // udot v31.4s, v15.16b, v7.4b[3]\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+ "prfm PSTL1KEEP, [c_ptr1]\n"
+ "prfm PSTL1KEEP, [c_ptr2]\n"
+ "prfm PSTL1KEEP, [c_ptr3]\n"
+ "cbz %[regs], 4f\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr q5, [a_ptr1]\n"
+ ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr q6, [a_ptr2]\n"
+ ".word 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
+ "ldr q7, [a_ptr3]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ ".word 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+ ".word 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+ ".word 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+ ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
+ ".word 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+ ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
+ ".word 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+ ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
+ ".word 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+ ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
+ ".word 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+ ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
+ ".word 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+ ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
+ ".word 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+ ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
+ ".word 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+ ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
+ ".word 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+ ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
+ ".word 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+ ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
+ ".word 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+ ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
+ ".word 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+ ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
+ ".word 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+ ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
+ ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
+ ".word 0x6f87e11c // udot v28.4s, v8.16b, v7.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+ ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
+ ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
+ ".word 0x6f87e13d // udot v29.4s, v9.16b, v7.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+ ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
+ ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
+ ".word 0x6f87e15e // udot v30.4s, v10.16b, v7.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+ ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
+ ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
+ ".word 0x6f87e17f // udot v31.4s, v11.16b, v7.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+ ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
+ ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
+ ".word 0x6fa7e19c // udot v28.4s, v12.16b, v7.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+ ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
+ ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
+ ".word 0x6fa7e1bd // udot v29.4s, v13.16b, v7.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+ ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
+ ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
+ ".word 0x6fa7e1de // udot v30.4s, v14.16b, v7.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+ ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
+ ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
+ ".word 0x6fa7e1ff // udot v31.4s, v15.16b, v7.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
+ ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
+ ".word 0x6f87e91c // udot v28.4s, v8.16b, v7.4b[2]\n"
+ ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+ ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
+ ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
+ ".word 0x6f87e93d // udot v29.4s, v9.16b, v7.4b[2]\n"
+ ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+ ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
+ ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
+ ".word 0x6f87e95e // udot v30.4s, v10.16b, v7.4b[2]\n"
+ ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+ ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
+ ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
+ ".word 0x6f87e97f // udot v31.4s, v11.16b, v7.4b[2]\n"
+ ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+ ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
+ ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
+ ".word 0x6fa7e99c // udot v28.4s, v12.16b, v7.4b[3]\n"
+ ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
+ ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
+ ".word 0x6fa7e9bd // udot v29.4s, v13.16b, v7.4b[3]\n"
+ ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
+ ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
+ ".word 0x6fa7e9de // udot v30.4s, v14.16b, v7.4b[3]\n"
+ ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
+ ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
+ ".word 0x6fa7e9ff // udot v31.4s, v15.16b, v7.4b[3]\n"
+ "b 5f\n"
+ "4:\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+ ".word 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+ ".word 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+ ".word 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+ ".word 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+ ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
+ ".word 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+ ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
+ ".word 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+ ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
+ ".word 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+ ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
+ ".word 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+ ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
+ ".word 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+ ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
+ ".word 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+ ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
+ ".word 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+ ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
+ ".word 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+ ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
+ ".word 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+ ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
+ ".word 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+ ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
+ ".word 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+ ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
+ ".word 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n"
+ "5:\n"
+ "cbz %[blocks], 6f\n"
+ "7:\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr s0, [%[a_ptr0]]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x4\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr s1, [a_ptr1]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "add a_ptr1, a_ptr1, #0x4\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr s2, [a_ptr2]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ "add a_ptr2, a_ptr2, #0x4\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr s3, [a_ptr3]\n"
+ ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+ "add a_ptr3, a_ptr3, #0x4\n"
+ ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+ ".word 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
+ ".word 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+ ".word 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+ ".word 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
+ "b.ne 7b\n"
+ "6:\n"
+ "cbz %[odds], 8f\n"
+ "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
+ "ld1 {v1.b}[0], [a_ptr1], #1\n"
+ "ld1 {v2.b}[0], [a_ptr2], #1\n"
+ "ld1 {v3.b}[0], [a_ptr3], #1\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 9f\n"
+ "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
+ "ld1 {v1.b}[1], [a_ptr1], #1\n"
+ "ld1 {v2.b}[1], [a_ptr2], #1\n"
+ "ld1 {v3.b}[1], [a_ptr3], #1\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "b.eq 9f\n"
+ "ld1 {v0.b}[2], [%[a_ptr0]]\n"
+ "ld1 {v1.b}[2], [a_ptr1]\n"
+ "ld1 {v2.b}[2], [a_ptr2]\n"
+ "ld1 {v3.b}[2], [a_ptr3]\n"
+ "9:\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+ ".word 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+ ".word 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+ ".word 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+ ".word 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
+ "8:\n"
+ "str q16, [%[c_ptr0]]\n"
+ "str q17, [%[c_ptr0], #0x10]\n"
+ "str q18, [%[c_ptr0], #0x20]\n"
+ "str q19, [%[c_ptr0], #0x30]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x40\n"
+ "str q20, [c_ptr1]\n"
+ "str q21, [c_ptr1, #0x10]\n"
+ "str q22, [c_ptr1, #0x20]\n"
+ "str q23, [c_ptr1, #0x30]\n"
+ "str q24, [c_ptr2]\n"
+ "str q25, [c_ptr2, #0x10]\n"
+ "str q26, [c_ptr2, #0x20]\n"
+ "str q27, [c_ptr2, #0x30]\n"
+ "str q28, [c_ptr3]\n"
+ "str q29, [c_ptr3, #0x10]\n"
+ "str q30, [c_ptr3, #0x20]\n"
+ "str q31, [c_ptr3, #0x30]\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
);
break;
}
+ if (use_result_buffer) {
+ for(int cy=0; cy<std::min(M-y, 4); cy++) {
+ for(unsigned int cx=0; cx<width; cx++) {
+ c_ptr_real[cy * ldc + cx] = result_buffer[cy * 16 + cx];
+ }
+ }
+ }
}
}
}
} // namespace arm_gemm
-#endif // __aarch64__
+#endif // __aarch64__ \ No newline at end of file
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4.hpp
new file mode 100644
index 0000000000..c6895a6a0e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4.hpp
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __ARM_FEATURE_SVE
+
+
+#include "../std_transforms_sve.hpp"
+
+namespace arm_gemm
+{
+
+// Actual kernel implementation; the definition lives in sve_hybrid_fp16_mla_4VLx4/generic.cpp.
+void sve_hybrid_fp16_mla_4VLx4(const __fp16 *, int, const __fp16 *, __fp16 *, int, __fp16, int, int, int); // Argument order in the definition: (A, lda, B, C, ldc, beta, M, N, K).
+
+class hybrid_fp16_mla_4VLx4 // Descriptor for the SVE fp16 hybrid MLA kernel: exposes its element types, blocking parameters and kernel entry point.
+{
+public:
+ typedef __fp16 operand_type; // Element type of the input operands.
+ typedef __fp16 result_type; // Element type of the output.
+
+ typedef void (*kern_type)(const __fp16 *, int, const __fp16 *, __fp16 *, int, __fp16, int, int, int); // Function-pointer type with the same signature as sve_hybrid_fp16_mla_4VLx4.
+
+ /* Kernel blocking parameters */
+ static unsigned int out_height() // Rows per output block; the kernel's row loop in generic.cpp advances by the same 4.
+ {
+ return 4;
+ }
+
+ static unsigned int out_width() // Columns per output block; depends on get_vector_length, so it is not a compile-time constant.
+ {
+ return get_vector_length<__fp16>() * 4; // Presumably four SVE vectors of __fp16 (the "4VL" in the name) - confirm get_vector_length semantics.
+ }
+
+ static unsigned int k_unroll() // Unroll factor reported for the K dimension; 1 here.
+ {
+ return 1;
+ }
+
+ StdTransformsSVE<operand_type, result_type, 4, 4, 1> transforms = {}; // NOTE(review): the 4, 4, 1 arguments presumably mirror out_height, the out_width vector multiple and k_unroll - confirm against std_transforms_sve.hpp.
+
+ // Default to the generic kernel
+ kern_type kernel=sve_hybrid_fp16_mla_4VLx4;
+
+ hybrid_fp16_mla_4VLx4(const CPUInfo *ci) // ci is not read: the body is empty, so kernel keeps the default assigned above.
+ {
+
+ }
+};
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4/generic.cpp
new file mode 100644
index 0000000000..ab41fb3743
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4/generic.cpp
@@ -0,0 +1,3681 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+
+#include <algorithm>
+
+
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void sve_hybrid_fp16_mla_4VLx4(const __fp16 *A, int lda, const __fp16 *B, __fp16 *C, int ldc, __fp16 beta, int M, int N, int K) {
+ const long beta0 = (beta == 0.0f);
+ const int K_stride = K;
+ const long loops_count = ((K + 8) / 16) - 1;
+ K -= loops_count * 16;
+ const long regs_count = (K / 8) - 1;
+ K -= (regs_count + 1) * 8;
+ const long leftovers = K;
+
+ for (int y=0; y<M; y+=4) {
+ const __fp16 * const a_ptr0_base = A + (y * lda);
+ const unsigned long ldab = lda * sizeof(__fp16);
+
+ __fp16 *c_ptr0 = C + (y * ldc);
+ const unsigned long ldcb = ldc * sizeof(__fp16);
+
+ for (int x0=0; x0<N; x0+=(4 * get_vector_length<__fp16>())) {
+ const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<__fp16>()));
+ const __fp16 *betaptr = &beta;
+ long loops = loops_count;
+ long regs = regs_count;
+ long temp = 0;
+ long blocks = leftovers;
+ const __fp16 *a_ptr0 = a_ptr0_base;
+ const __fp16 *b_ptr0 = B + (K_stride * x0);
+
+ switch(M-y) {
+ case 1:
+ __asm __volatile (
+ "whilelt p6.h, %[temp], %[leftovers]\n"
+ "whilelt p0.h, %[temp], %[width]\n"
+ "inch %[temp], all, mul #1\n"
+ "ptrue p7.h\n"
+ "whilelt p1.h, %[temp], %[width]\n"
+ "inch %[temp], all, mul #1\n"
+ "whilelt p2.h, %[temp], %[width]\n"
+ "inch %[temp], all, mul #1\n"
+ "whilelt p3.h, %[temp], %[width]\n"
+ "cbz %[beta0], 1f\n"
+ "mov z16.h, #0\n"
+ "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
+ "mov z17.h, #0\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "mov z18.h, #0\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "mov z19.h, #0\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1rh z15.h, p7/z, [%[betaptr]]\n"
+ "ld1h z16.h, p0/z, [%[c_ptr0]]\n"
+ "ld1h z17.h, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+ "ld1h z18.h, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+ "ld1h z19.h, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+ "fmul z16.h, p7/m, z16.h, z15.h\n"
+ "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
+ "fmul z17.h, p7/m, z17.h, z15.h\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "fmul z18.h, p7/m, z18.h, z15.h\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmul z19.h, p7/m, z19.h, z15.h\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ "fmla z16.h, z8.h, z0.h[0]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z17.h, z9.h, z0.h[0]\n"
+ "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
+ "fmla z18.h, z10.h, z0.h[0]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "fmla z19.h, z11.h, z0.h[0]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[1]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z17.h, z13.h, z0.h[1]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[1]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[1]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[2]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z17.h, z9.h, z0.h[2]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[2]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "fmla z19.h, z11.h, z0.h[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z16.h, z12.h, z0.h[3]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "fmla z17.h, z13.h, z0.h[3]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[3]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[3]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[4]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z17.h, z9.h, z0.h[4]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[4]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[4]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[5]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z0.h[5]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[5]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[5]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[6]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.h, z9.h, z0.h[6]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[6]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[6]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[7]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z17.h, z13.h, z0.h[7]\n"
+ "fmla z18.h, z14.h, z0.h[7]\n"
+ "fmla z19.h, z15.h, z0.h[7]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[0]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z17.h, z9.h, z4.h[0]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[0]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[0]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[1]\n"
+ "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
+ "fmla z17.h, z13.h, z4.h[1]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "fmla z18.h, z14.h, z4.h[1]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[1]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[2]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.h, z9.h, z4.h[2]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[2]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[2]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[3]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z17.h, z13.h, z4.h[3]\n"
+ "fmla z18.h, z14.h, z4.h[3]\n"
+ "fmla z19.h, z15.h, z4.h[3]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[4]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z17.h, z9.h, z4.h[4]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[4]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[4]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[5]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z4.h[5]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[5]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[5]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[6]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.h, z9.h, z4.h[6]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[6]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[6]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[7]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z17.h, z13.h, z4.h[7]\n"
+ "fmla z18.h, z14.h, z4.h[7]\n"
+ "fmla z19.h, z15.h, z4.h[7]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "cbz %[regs], 4f\n"
+ "fmla z16.h, z8.h, z0.h[0]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z17.h, z9.h, z0.h[0]\n"
+ "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
+ "fmla z18.h, z10.h, z0.h[0]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "fmla z19.h, z11.h, z0.h[0]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[1]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z17.h, z13.h, z0.h[1]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[1]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[1]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[2]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z17.h, z9.h, z0.h[2]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z19.h, z11.h, z0.h[2]\n"
+ "fmla z16.h, z12.h, z0.h[3]\n"
+ "fmla z17.h, z13.h, z0.h[3]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[3]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[3]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[4]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z17.h, z9.h, z0.h[4]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[4]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[4]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[5]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z0.h[5]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[5]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[5]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[6]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.h, z9.h, z0.h[6]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[6]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[6]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[7]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z17.h, z13.h, z0.h[7]\n"
+ "fmla z18.h, z14.h, z0.h[7]\n"
+ "fmla z19.h, z15.h, z0.h[7]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[0]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z17.h, z9.h, z4.h[0]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[0]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[0]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[1]\n"
+ "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
+ "fmla z17.h, z13.h, z4.h[1]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "fmla z18.h, z14.h, z4.h[1]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[1]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[2]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.h, z9.h, z4.h[2]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[2]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[2]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[3]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z17.h, z13.h, z4.h[3]\n"
+ "fmla z18.h, z14.h, z4.h[3]\n"
+ "fmla z19.h, z15.h, z4.h[3]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[4]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z17.h, z9.h, z4.h[4]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[4]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[4]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[5]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z4.h[5]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[5]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[5]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[6]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.h, z9.h, z4.h[6]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[6]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[6]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[7]\n"
+ "fmla z17.h, z13.h, z4.h[7]\n"
+ "fmla z18.h, z14.h, z4.h[7]\n"
+ "fmla z19.h, z15.h, z4.h[7]\n"
+ "cbz %[blocks], 5f\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[0]\n"
+ "fmla z17.h, z9.h, z0.h[0]\n"
+ "fmla z18.h, z10.h, z0.h[0]\n"
+ "fmla z19.h, z11.h, z0.h[0]\n"
+ "b.eq 5f\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[1]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z17.h, z13.h, z0.h[1]\n"
+ "fmla z18.h, z14.h, z0.h[1]\n"
+ "fmla z19.h, z15.h, z0.h[1]\n"
+ "b.eq 5f\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[2]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z17.h, z9.h, z0.h[2]\n"
+ "fmla z18.h, z10.h, z0.h[2]\n"
+ "fmla z19.h, z11.h, z0.h[2]\n"
+ "b.eq 5f\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[3]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z17.h, z13.h, z0.h[3]\n"
+ "fmla z18.h, z14.h, z0.h[3]\n"
+ "fmla z19.h, z15.h, z0.h[3]\n"
+ "b.eq 5f\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[4]\n"
+ "fmla z17.h, z9.h, z0.h[4]\n"
+ "fmla z18.h, z10.h, z0.h[4]\n"
+ "fmla z19.h, z11.h, z0.h[4]\n"
+ "b.eq 5f\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[5]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z17.h, z13.h, z0.h[5]\n"
+ "fmla z18.h, z14.h, z0.h[5]\n"
+ "fmla z19.h, z15.h, z0.h[5]\n"
+ "b.eq 5f\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[6]\n"
+ "fmla z17.h, z9.h, z0.h[6]\n"
+ "fmla z18.h, z10.h, z0.h[6]\n"
+ "fmla z19.h, z11.h, z0.h[6]\n"
+ "b 5f\n"
+ "4:\n"
+ "fmla z16.h, z8.h, z0.h[0]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z17.h, z9.h, z0.h[0]\n"
+ "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
+ "fmla z18.h, z10.h, z0.h[0]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "fmla z19.h, z11.h, z0.h[0]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[1]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z17.h, z13.h, z0.h[1]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[1]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[1]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[2]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z17.h, z9.h, z0.h[2]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z19.h, z11.h, z0.h[2]\n"
+ "fmla z16.h, z12.h, z0.h[3]\n"
+ "fmla z17.h, z13.h, z0.h[3]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[3]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[3]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[4]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z17.h, z9.h, z0.h[4]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[4]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[4]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[5]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z0.h[5]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[5]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[5]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[6]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.h, z9.h, z0.h[6]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[6]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[6]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[7]\n"
+ "fmla z17.h, z13.h, z0.h[7]\n"
+ "fmla z18.h, z14.h, z0.h[7]\n"
+ "fmla z19.h, z15.h, z0.h[7]\n"
+ "cbz %[blocks], 5f\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[0]\n"
+ "fmla z17.h, z9.h, z4.h[0]\n"
+ "fmla z18.h, z10.h, z4.h[0]\n"
+ "fmla z19.h, z11.h, z4.h[0]\n"
+ "b.eq 5f\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[1]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z17.h, z13.h, z4.h[1]\n"
+ "fmla z18.h, z14.h, z4.h[1]\n"
+ "fmla z19.h, z15.h, z4.h[1]\n"
+ "b.eq 5f\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[2]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z17.h, z9.h, z4.h[2]\n"
+ "fmla z18.h, z10.h, z4.h[2]\n"
+ "fmla z19.h, z11.h, z4.h[2]\n"
+ "b.eq 5f\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[3]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z17.h, z13.h, z4.h[3]\n"
+ "fmla z18.h, z14.h, z4.h[3]\n"
+ "fmla z19.h, z15.h, z4.h[3]\n"
+ "b.eq 5f\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[4]\n"
+ "fmla z17.h, z9.h, z4.h[4]\n"
+ "fmla z18.h, z10.h, z4.h[4]\n"
+ "fmla z19.h, z11.h, z4.h[4]\n"
+ "b.eq 5f\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[5]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z17.h, z13.h, z4.h[5]\n"
+ "fmla z18.h, z14.h, z4.h[5]\n"
+ "fmla z19.h, z15.h, z4.h[5]\n"
+ "b.eq 5f\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[6]\n"
+ "fmla z17.h, z9.h, z4.h[6]\n"
+ "fmla z18.h, z10.h, z4.h[6]\n"
+ "fmla z19.h, z11.h, z4.h[6]\n"
+ "5:\n"
+ "st1h z16.h, p0, [%[c_ptr0]]\n"
+ "st1h z17.h, p1, [%[c_ptr0], #1, MUL VL]\n"
+ "st1h z18.h, p2, [%[c_ptr0], #2, MUL VL]\n"
+ "st1h z19.h, p3, [%[c_ptr0], #3, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #4\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers)
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 2:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "c_ptr1 .req X1\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.h, %[temp], %[leftovers]\n"
+ "whilelt p0.h, %[temp], %[width]\n"
+ "inch %[temp], all, mul #1\n"
+ "ptrue p7.h\n"
+ "whilelt p1.h, %[temp], %[width]\n"
+ "inch %[temp], all, mul #1\n"
+ "whilelt p2.h, %[temp], %[width]\n"
+ "inch %[temp], all, mul #1\n"
+ "whilelt p3.h, %[temp], %[width]\n"
+ "cbz %[beta0], 1f\n"
+ "mov z16.h, #0\n"
+ "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
+ "mov z17.h, #0\n"
+ "ld1rqh z1.h, p7/z, [a_ptr1]\n"
+ "mov z18.h, #0\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "mov z19.h, #0\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "mov z20.h, #0\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "mov z21.h, #0\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "mov z22.h, #0\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "mov z23.h, #0\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1rh z15.h, p7/z, [%[betaptr]]\n"
+ "ld1h z16.h, p0/z, [%[c_ptr0]]\n"
+ "ld1h z17.h, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+ "ld1h z18.h, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+ "ld1h z19.h, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+ "fmul z16.h, p7/m, z16.h, z15.h\n"
+ "ld1h z20.h, p0/z, [c_ptr1]\n"
+ "fmul z17.h, p7/m, z17.h, z15.h\n"
+ "ld1h z21.h, p1/z, [c_ptr1, #1, MUL VL]\n"
+ "fmul z18.h, p7/m, z18.h, z15.h\n"
+ "ld1h z22.h, p2/z, [c_ptr1, #2, MUL VL]\n"
+ "fmul z19.h, p7/m, z19.h, z15.h\n"
+ "ld1h z23.h, p3/z, [c_ptr1, #3, MUL VL]\n"
+ "fmul z20.h, p7/m, z20.h, z15.h\n"
+ "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
+ "fmul z21.h, p7/m, z21.h, z15.h\n"
+ "ld1rqh z1.h, p7/z, [a_ptr1]\n"
+ "fmul z22.h, p7/m, z22.h, z15.h\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "fmul z23.h, p7/m, z23.h, z15.h\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ "fmla z16.h, z8.h, z0.h[0]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z20.h, z8.h, z1.h[0]\n"
+ "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[0]\n"
+ "ld1rqh z5.h, p7/z, [a_ptr1]\n"
+ "fmla z21.h, z9.h, z1.h[0]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "fmla z18.h, z10.h, z0.h[0]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z22.h, z10.h, z1.h[0]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "fmla z23.h, z11.h, z1.h[0]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[1]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "fmla z20.h, z12.h, z1.h[1]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.h, z13.h, z0.h[1]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ "fmla z21.h, z13.h, z1.h[1]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[1]\n"
+ "fmla z22.h, z14.h, z1.h[1]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[1]\n"
+ "fmla z23.h, z15.h, z1.h[1]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z20.h, z8.h, z1.h[2]\n"
+ "fmla z17.h, z9.h, z0.h[2]\n"
+ "fmla z21.h, z9.h, z1.h[2]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[2]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "fmla z22.h, z10.h, z1.h[2]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[2]\n"
+ "fmla z23.h, z11.h, z1.h[2]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[3]\n"
+ "fmla z20.h, z12.h, z1.h[3]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z17.h, z13.h, z0.h[3]\n"
+ "fmla z21.h, z13.h, z1.h[3]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[3]\n"
+ "fmla z22.h, z14.h, z1.h[3]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[3]\n"
+ "fmla z23.h, z15.h, z1.h[3]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[4]\n"
+ "fmla z20.h, z8.h, z1.h[4]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[4]\n"
+ "fmla z21.h, z9.h, z1.h[4]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[4]\n"
+ "fmla z22.h, z10.h, z1.h[4]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[4]\n"
+ "fmla z23.h, z11.h, z1.h[4]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[5]\n"
+ "fmla z20.h, z12.h, z1.h[5]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.h, z13.h, z0.h[5]\n"
+ "fmla z21.h, z13.h, z1.h[5]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[5]\n"
+ "fmla z22.h, z14.h, z1.h[5]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[5]\n"
+ "fmla z23.h, z15.h, z1.h[5]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[6]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z20.h, z8.h, z1.h[6]\n"
+ "fmla z17.h, z9.h, z0.h[6]\n"
+ "fmla z21.h, z9.h, z1.h[6]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[6]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "fmla z22.h, z10.h, z1.h[6]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[6]\n"
+ "fmla z23.h, z11.h, z1.h[6]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[7]\n"
+ "fmla z20.h, z12.h, z1.h[7]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z17.h, z13.h, z0.h[7]\n"
+ "fmla z21.h, z13.h, z1.h[7]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[7]\n"
+ "fmla z22.h, z14.h, z1.h[7]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[7]\n"
+ "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
+ "fmla z23.h, z15.h, z1.h[7]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[0]\n"
+ "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n"
+ "fmla z20.h, z8.h, z5.h[0]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z4.h[0]\n"
+ "fmla z21.h, z9.h, z5.h[0]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[0]\n"
+ "fmla z22.h, z10.h, z5.h[0]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[0]\n"
+ "fmla z23.h, z11.h, z5.h[0]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[1]\n"
+ "fmla z20.h, z12.h, z5.h[1]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.h, z13.h, z4.h[1]\n"
+ "fmla z21.h, z13.h, z5.h[1]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[1]\n"
+ "fmla z22.h, z14.h, z5.h[1]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[1]\n"
+ "fmla z23.h, z15.h, z5.h[1]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z20.h, z8.h, z5.h[2]\n"
+ "fmla z17.h, z9.h, z4.h[2]\n"
+ "fmla z21.h, z9.h, z5.h[2]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[2]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "fmla z22.h, z10.h, z5.h[2]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[2]\n"
+ "fmla z23.h, z11.h, z5.h[2]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[3]\n"
+ "fmla z20.h, z12.h, z5.h[3]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z17.h, z13.h, z4.h[3]\n"
+ "fmla z21.h, z13.h, z5.h[3]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[3]\n"
+ "fmla z22.h, z14.h, z5.h[3]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[3]\n"
+ "fmla z23.h, z15.h, z5.h[3]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[4]\n"
+ "fmla z20.h, z8.h, z5.h[4]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z4.h[4]\n"
+ "fmla z21.h, z9.h, z5.h[4]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[4]\n"
+ "fmla z22.h, z10.h, z5.h[4]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[4]\n"
+ "fmla z23.h, z11.h, z5.h[4]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[5]\n"
+ "fmla z20.h, z12.h, z5.h[5]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.h, z13.h, z4.h[5]\n"
+ "fmla z21.h, z13.h, z5.h[5]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[5]\n"
+ "fmla z22.h, z14.h, z5.h[5]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[5]\n"
+ "fmla z23.h, z15.h, z5.h[5]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[6]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z20.h, z8.h, z5.h[6]\n"
+ "fmla z17.h, z9.h, z4.h[6]\n"
+ "fmla z21.h, z9.h, z5.h[6]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[6]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "fmla z22.h, z10.h, z5.h[6]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[6]\n"
+ "fmla z23.h, z11.h, z5.h[6]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[7]\n"
+ "fmla z20.h, z12.h, z5.h[7]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z17.h, z13.h, z4.h[7]\n"
+ "fmla z21.h, z13.h, z5.h[7]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[7]\n"
+ "fmla z22.h, z14.h, z5.h[7]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[7]\n"
+ "fmla z23.h, z15.h, z5.h[7]\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "cbz %[regs], 4f\n"
+ "fmla z16.h, z8.h, z0.h[0]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z20.h, z8.h, z1.h[0]\n"
+ "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[0]\n"
+ "ld1rqh z5.h, p7/z, [a_ptr1]\n"
+ "fmla z21.h, z9.h, z1.h[0]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "fmla z18.h, z10.h, z0.h[0]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z22.h, z10.h, z1.h[0]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[0]\n"
+ "fmla z23.h, z11.h, z1.h[0]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[1]\n"
+ "fmla z20.h, z12.h, z1.h[1]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.h, z13.h, z0.h[1]\n"
+ "fmla z21.h, z13.h, z1.h[1]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[1]\n"
+ "fmla z22.h, z14.h, z1.h[1]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[1]\n"
+ "fmla z23.h, z15.h, z1.h[1]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z20.h, z8.h, z1.h[2]\n"
+ "fmla z17.h, z9.h, z0.h[2]\n"
+ "fmla z21.h, z9.h, z1.h[2]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[2]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "fmla z22.h, z10.h, z1.h[2]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[2]\n"
+ "fmla z23.h, z11.h, z1.h[2]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[3]\n"
+ "fmla z20.h, z12.h, z1.h[3]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z17.h, z13.h, z0.h[3]\n"
+ "fmla z21.h, z13.h, z1.h[3]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[3]\n"
+ "fmla z22.h, z14.h, z1.h[3]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[3]\n"
+ "fmla z23.h, z15.h, z1.h[3]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[4]\n"
+ "fmla z20.h, z8.h, z1.h[4]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[4]\n"
+ "fmla z21.h, z9.h, z1.h[4]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[4]\n"
+ "fmla z22.h, z10.h, z1.h[4]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[4]\n"
+ "fmla z23.h, z11.h, z1.h[4]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[5]\n"
+ "fmla z20.h, z12.h, z1.h[5]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.h, z13.h, z0.h[5]\n"
+ "fmla z21.h, z13.h, z1.h[5]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[5]\n"
+ "fmla z22.h, z14.h, z1.h[5]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[5]\n"
+ "fmla z23.h, z15.h, z1.h[5]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[6]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z20.h, z8.h, z1.h[6]\n"
+ "fmla z17.h, z9.h, z0.h[6]\n"
+ "fmla z21.h, z9.h, z1.h[6]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[6]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "fmla z22.h, z10.h, z1.h[6]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[6]\n"
+ "fmla z23.h, z11.h, z1.h[6]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[7]\n"
+ "fmla z20.h, z12.h, z1.h[7]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z17.h, z13.h, z0.h[7]\n"
+ "fmla z21.h, z13.h, z1.h[7]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[7]\n"
+ "fmla z22.h, z14.h, z1.h[7]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[7]\n"
+ "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
+ "fmla z23.h, z15.h, z1.h[7]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[0]\n"
+ "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n"
+ "fmla z20.h, z8.h, z5.h[0]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z4.h[0]\n"
+ "fmla z21.h, z9.h, z5.h[0]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[0]\n"
+ "fmla z22.h, z10.h, z5.h[0]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[0]\n"
+ "fmla z23.h, z11.h, z5.h[0]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[1]\n"
+ "fmla z20.h, z12.h, z5.h[1]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.h, z13.h, z4.h[1]\n"
+ "fmla z21.h, z13.h, z5.h[1]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[1]\n"
+ "fmla z22.h, z14.h, z5.h[1]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[1]\n"
+ "fmla z23.h, z15.h, z5.h[1]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z20.h, z8.h, z5.h[2]\n"
+ "fmla z17.h, z9.h, z4.h[2]\n"
+ "fmla z21.h, z9.h, z5.h[2]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[2]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "fmla z22.h, z10.h, z5.h[2]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[2]\n"
+ "fmla z23.h, z11.h, z5.h[2]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[3]\n"
+ "fmla z20.h, z12.h, z5.h[3]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z17.h, z13.h, z4.h[3]\n"
+ "fmla z21.h, z13.h, z5.h[3]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[3]\n"
+ "fmla z22.h, z14.h, z5.h[3]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[3]\n"
+ "fmla z23.h, z15.h, z5.h[3]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[4]\n"
+ "fmla z20.h, z8.h, z5.h[4]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z4.h[4]\n"
+ "fmla z21.h, z9.h, z5.h[4]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[4]\n"
+ "fmla z22.h, z10.h, z5.h[4]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[4]\n"
+ "fmla z23.h, z11.h, z5.h[4]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[5]\n"
+ "fmla z20.h, z12.h, z5.h[5]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.h, z13.h, z4.h[5]\n"
+ "fmla z21.h, z13.h, z5.h[5]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[5]\n"
+ "fmla z22.h, z14.h, z5.h[5]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[5]\n"
+ "fmla z23.h, z15.h, z5.h[5]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[6]\n"
+ "fmla z20.h, z8.h, z5.h[6]\n"
+ "fmla z17.h, z9.h, z4.h[6]\n"
+ "fmla z21.h, z9.h, z5.h[6]\n"
+ "fmla z18.h, z10.h, z4.h[6]\n"
+ "fmla z22.h, z10.h, z5.h[6]\n"
+ "fmla z19.h, z11.h, z4.h[6]\n"
+ "fmla z23.h, z11.h, z5.h[6]\n"
+ "fmla z16.h, z12.h, z4.h[7]\n"
+ "fmla z20.h, z12.h, z5.h[7]\n"
+ "fmla z17.h, z13.h, z4.h[7]\n"
+ "fmla z21.h, z13.h, z5.h[7]\n"
+ "fmla z18.h, z14.h, z4.h[7]\n"
+ "fmla z22.h, z14.h, z5.h[7]\n"
+ "fmla z19.h, z15.h, z4.h[7]\n"
+ "fmla z23.h, z15.h, z5.h[7]\n"
+ "cbz %[blocks], 5f\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[0]\n"
+ "fmla z20.h, z8.h, z1.h[0]\n"
+ "fmla z17.h, z9.h, z0.h[0]\n"
+ "fmla z21.h, z9.h, z1.h[0]\n"
+ "fmla z18.h, z10.h, z0.h[0]\n"
+ "fmla z22.h, z10.h, z1.h[0]\n"
+ "fmla z19.h, z11.h, z0.h[0]\n"
+ "fmla z23.h, z11.h, z1.h[0]\n"
+ "b.eq 5f\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[1]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z20.h, z12.h, z1.h[1]\n"
+ "fmla z17.h, z13.h, z0.h[1]\n"
+ "fmla z21.h, z13.h, z1.h[1]\n"
+ "fmla z18.h, z14.h, z0.h[1]\n"
+ "fmla z22.h, z14.h, z1.h[1]\n"
+ "fmla z19.h, z15.h, z0.h[1]\n"
+ "fmla z23.h, z15.h, z1.h[1]\n"
+ "b.eq 5f\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[2]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z20.h, z8.h, z1.h[2]\n"
+ "fmla z17.h, z9.h, z0.h[2]\n"
+ "fmla z21.h, z9.h, z1.h[2]\n"
+ "fmla z18.h, z10.h, z0.h[2]\n"
+ "fmla z22.h, z10.h, z1.h[2]\n"
+ "fmla z19.h, z11.h, z0.h[2]\n"
+ "fmla z23.h, z11.h, z1.h[2]\n"
+ "b.eq 5f\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[3]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z20.h, z12.h, z1.h[3]\n"
+ "fmla z17.h, z13.h, z0.h[3]\n"
+ "fmla z21.h, z13.h, z1.h[3]\n"
+ "fmla z18.h, z14.h, z0.h[3]\n"
+ "fmla z22.h, z14.h, z1.h[3]\n"
+ "fmla z19.h, z15.h, z0.h[3]\n"
+ "fmla z23.h, z15.h, z1.h[3]\n"
+ "b.eq 5f\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[4]\n"
+ "fmla z20.h, z8.h, z1.h[4]\n"
+ "fmla z17.h, z9.h, z0.h[4]\n"
+ "fmla z21.h, z9.h, z1.h[4]\n"
+ "fmla z18.h, z10.h, z0.h[4]\n"
+ "fmla z22.h, z10.h, z1.h[4]\n"
+ "fmla z19.h, z11.h, z0.h[4]\n"
+ "fmla z23.h, z11.h, z1.h[4]\n"
+ "b.eq 5f\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[5]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z20.h, z12.h, z1.h[5]\n"
+ "fmla z17.h, z13.h, z0.h[5]\n"
+ "fmla z21.h, z13.h, z1.h[5]\n"
+ "fmla z18.h, z14.h, z0.h[5]\n"
+ "fmla z22.h, z14.h, z1.h[5]\n"
+ "fmla z19.h, z15.h, z0.h[5]\n"
+ "fmla z23.h, z15.h, z1.h[5]\n"
+ "b.eq 5f\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[6]\n"
+ "fmla z20.h, z8.h, z1.h[6]\n"
+ "fmla z17.h, z9.h, z0.h[6]\n"
+ "fmla z21.h, z9.h, z1.h[6]\n"
+ "fmla z18.h, z10.h, z0.h[6]\n"
+ "fmla z22.h, z10.h, z1.h[6]\n"
+ "fmla z19.h, z11.h, z0.h[6]\n"
+ "fmla z23.h, z11.h, z1.h[6]\n"
+ "b 5f\n"
+ "4:\n"
+ "fmla z16.h, z8.h, z0.h[0]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z20.h, z8.h, z1.h[0]\n"
+ "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[0]\n"
+ "ld1rqh z5.h, p6/z, [a_ptr1]\n"
+ "fmla z21.h, z9.h, z1.h[0]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "fmla z18.h, z10.h, z0.h[0]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z22.h, z10.h, z1.h[0]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[0]\n"
+ "fmla z23.h, z11.h, z1.h[0]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[1]\n"
+ "fmla z20.h, z12.h, z1.h[1]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.h, z13.h, z0.h[1]\n"
+ "fmla z21.h, z13.h, z1.h[1]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[1]\n"
+ "fmla z22.h, z14.h, z1.h[1]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[1]\n"
+ "fmla z23.h, z15.h, z1.h[1]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z20.h, z8.h, z1.h[2]\n"
+ "fmla z17.h, z9.h, z0.h[2]\n"
+ "fmla z21.h, z9.h, z1.h[2]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[2]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "fmla z22.h, z10.h, z1.h[2]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[2]\n"
+ "fmla z23.h, z11.h, z1.h[2]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[3]\n"
+ "fmla z20.h, z12.h, z1.h[3]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z17.h, z13.h, z0.h[3]\n"
+ "fmla z21.h, z13.h, z1.h[3]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[3]\n"
+ "fmla z22.h, z14.h, z1.h[3]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[3]\n"
+ "fmla z23.h, z15.h, z1.h[3]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[4]\n"
+ "fmla z20.h, z8.h, z1.h[4]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[4]\n"
+ "fmla z21.h, z9.h, z1.h[4]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[4]\n"
+ "fmla z22.h, z10.h, z1.h[4]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[4]\n"
+ "fmla z23.h, z11.h, z1.h[4]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[5]\n"
+ "fmla z20.h, z12.h, z1.h[5]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.h, z13.h, z0.h[5]\n"
+ "fmla z21.h, z13.h, z1.h[5]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[5]\n"
+ "fmla z22.h, z14.h, z1.h[5]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[5]\n"
+ "fmla z23.h, z15.h, z1.h[5]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[6]\n"
+ "fmla z20.h, z8.h, z1.h[6]\n"
+ "fmla z17.h, z9.h, z0.h[6]\n"
+ "fmla z21.h, z9.h, z1.h[6]\n"
+ "fmla z18.h, z10.h, z0.h[6]\n"
+ "fmla z22.h, z10.h, z1.h[6]\n"
+ "fmla z19.h, z11.h, z0.h[6]\n"
+ "fmla z23.h, z11.h, z1.h[6]\n"
+ "fmla z16.h, z12.h, z0.h[7]\n"
+ "fmla z20.h, z12.h, z1.h[7]\n"
+ "fmla z17.h, z13.h, z0.h[7]\n"
+ "fmla z21.h, z13.h, z1.h[7]\n"
+ "fmla z18.h, z14.h, z0.h[7]\n"
+ "fmla z22.h, z14.h, z1.h[7]\n"
+ "fmla z19.h, z15.h, z0.h[7]\n"
+ "fmla z23.h, z15.h, z1.h[7]\n"
+ "cbz %[blocks], 5f\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[0]\n"
+ "fmla z20.h, z8.h, z5.h[0]\n"
+ "fmla z17.h, z9.h, z4.h[0]\n"
+ "fmla z21.h, z9.h, z5.h[0]\n"
+ "fmla z18.h, z10.h, z4.h[0]\n"
+ "fmla z22.h, z10.h, z5.h[0]\n"
+ "fmla z19.h, z11.h, z4.h[0]\n"
+ "fmla z23.h, z11.h, z5.h[0]\n"
+ "b.eq 5f\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[1]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z20.h, z12.h, z5.h[1]\n"
+ "fmla z17.h, z13.h, z4.h[1]\n"
+ "fmla z21.h, z13.h, z5.h[1]\n"
+ "fmla z18.h, z14.h, z4.h[1]\n"
+ "fmla z22.h, z14.h, z5.h[1]\n"
+ "fmla z19.h, z15.h, z4.h[1]\n"
+ "fmla z23.h, z15.h, z5.h[1]\n"
+ "b.eq 5f\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[2]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z20.h, z8.h, z5.h[2]\n"
+ "fmla z17.h, z9.h, z4.h[2]\n"
+ "fmla z21.h, z9.h, z5.h[2]\n"
+ "fmla z18.h, z10.h, z4.h[2]\n"
+ "fmla z22.h, z10.h, z5.h[2]\n"
+ "fmla z19.h, z11.h, z4.h[2]\n"
+ "fmla z23.h, z11.h, z5.h[2]\n"
+ "b.eq 5f\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[3]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z20.h, z12.h, z5.h[3]\n"
+ "fmla z17.h, z13.h, z4.h[3]\n"
+ "fmla z21.h, z13.h, z5.h[3]\n"
+ "fmla z18.h, z14.h, z4.h[3]\n"
+ "fmla z22.h, z14.h, z5.h[3]\n"
+ "fmla z19.h, z15.h, z4.h[3]\n"
+ "fmla z23.h, z15.h, z5.h[3]\n"
+ "b.eq 5f\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[4]\n"
+ "fmla z20.h, z8.h, z5.h[4]\n"
+ "fmla z17.h, z9.h, z4.h[4]\n"
+ "fmla z21.h, z9.h, z5.h[4]\n"
+ "fmla z18.h, z10.h, z4.h[4]\n"
+ "fmla z22.h, z10.h, z5.h[4]\n"
+ "fmla z19.h, z11.h, z4.h[4]\n"
+ "fmla z23.h, z11.h, z5.h[4]\n"
+ "b.eq 5f\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[5]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z20.h, z12.h, z5.h[5]\n"
+ "fmla z17.h, z13.h, z4.h[5]\n"
+ "fmla z21.h, z13.h, z5.h[5]\n"
+ "fmla z18.h, z14.h, z4.h[5]\n"
+ "fmla z22.h, z14.h, z5.h[5]\n"
+ "fmla z19.h, z15.h, z4.h[5]\n"
+ "fmla z23.h, z15.h, z5.h[5]\n"
+ "b.eq 5f\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[6]\n"
+ "fmla z20.h, z8.h, z5.h[6]\n"
+ "fmla z17.h, z9.h, z4.h[6]\n"
+ "fmla z21.h, z9.h, z5.h[6]\n"
+ "fmla z18.h, z10.h, z4.h[6]\n"
+ "fmla z22.h, z10.h, z5.h[6]\n"
+ "fmla z19.h, z11.h, z4.h[6]\n"
+ "fmla z23.h, z11.h, z5.h[6]\n"
+ "5:\n"
+ "st1h z16.h, p0, [%[c_ptr0]]\n"
+ "st1h z17.h, p1, [%[c_ptr0], #1, MUL VL]\n"
+ "st1h z18.h, p2, [%[c_ptr0], #2, MUL VL]\n"
+ "st1h z19.h, p3, [%[c_ptr0], #3, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #4\n"
+ "st1h z20.h, p0, [c_ptr1]\n"
+ "st1h z21.h, p1, [c_ptr1, #1, MUL VL]\n"
+ "st1h z22.h, p2, [c_ptr1, #2, MUL VL]\n"
+ "st1h z23.h, p3, [c_ptr1, #3, MUL VL]\n"
+ ".unreq a_ptr1\n"
+ ".unreq c_ptr1\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers)
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory"
+ );
+ break;
+ case 3:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "c_ptr1 .req X2\n"
+ "c_ptr2 .req X3\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "whilelt p6.h, %[temp], %[leftovers]\n"
+ "whilelt p0.h, %[temp], %[width]\n"
+ "inch %[temp], all, mul #1\n"
+ "ptrue p7.h\n"
+ "whilelt p1.h, %[temp], %[width]\n"
+ "inch %[temp], all, mul #1\n"
+ "whilelt p2.h, %[temp], %[width]\n"
+ "inch %[temp], all, mul #1\n"
+ "whilelt p3.h, %[temp], %[width]\n"
+ "cbz %[beta0], 1f\n"
+ "mov z16.h, #0\n"
+ "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
+ "mov z17.h, #0\n"
+ "ld1rqh z1.h, p7/z, [a_ptr1]\n"
+ "mov z18.h, #0\n"
+ "ld1rqh z2.h, p7/z, [a_ptr2]\n"
+ "mov z19.h, #0\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "mov z20.h, #0\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "mov z21.h, #0\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "mov z22.h, #0\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "mov z23.h, #0\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "mov z24.h, #0\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "mov z25.h, #0\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "mov z26.h, #0\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "mov z27.h, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1rh z15.h, p7/z, [%[betaptr]]\n"
+ "ld1h z16.h, p0/z, [%[c_ptr0]]\n"
+ "ld1h z17.h, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+ "ld1h z18.h, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+ "ld1h z19.h, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+ "fmul z16.h, p7/m, z16.h, z15.h\n"
+ "ld1h z20.h, p0/z, [c_ptr1]\n"
+ "fmul z17.h, p7/m, z17.h, z15.h\n"
+ "ld1h z21.h, p1/z, [c_ptr1, #1, MUL VL]\n"
+ "fmul z18.h, p7/m, z18.h, z15.h\n"
+ "ld1h z22.h, p2/z, [c_ptr1, #2, MUL VL]\n"
+ "fmul z19.h, p7/m, z19.h, z15.h\n"
+ "ld1h z23.h, p3/z, [c_ptr1, #3, MUL VL]\n"
+ "fmul z20.h, p7/m, z20.h, z15.h\n"
+ "ld1h z24.h, p0/z, [c_ptr2]\n"
+ "fmul z21.h, p7/m, z21.h, z15.h\n"
+ "ld1h z25.h, p1/z, [c_ptr2, #1, MUL VL]\n"
+ "fmul z22.h, p7/m, z22.h, z15.h\n"
+ "ld1h z26.h, p2/z, [c_ptr2, #2, MUL VL]\n"
+ "fmul z23.h, p7/m, z23.h, z15.h\n"
+ "ld1h z27.h, p3/z, [c_ptr2, #3, MUL VL]\n"
+ "fmul z24.h, p7/m, z24.h, z15.h\n"
+ "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
+ "fmul z25.h, p7/m, z25.h, z15.h\n"
+ "ld1rqh z1.h, p7/z, [a_ptr1]\n"
+ "fmul z26.h, p7/m, z26.h, z15.h\n"
+ "ld1rqh z2.h, p7/z, [a_ptr2]\n"
+ "fmul z27.h, p7/m, z27.h, z15.h\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ "fmla z16.h, z8.h, z0.h[0]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z20.h, z8.h, z1.h[0]\n"
+ "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
+ "fmla z24.h, z8.h, z2.h[0]\n"
+ "ld1rqh z5.h, p7/z, [a_ptr1]\n"
+ "fmla z17.h, z9.h, z0.h[0]\n"
+ "ld1rqh z6.h, p7/z, [a_ptr2]\n"
+ "fmla z21.h, z9.h, z1.h[0]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "fmla z25.h, z9.h, z2.h[0]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "fmla z22.h, z10.h, z1.h[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "fmla z26.h, z10.h, z2.h[0]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[0]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ "fmla z23.h, z11.h, z1.h[0]\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
+ "fmla z27.h, z11.h, z2.h[0]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[1]\n"
+ "fmla z20.h, z12.h, z1.h[1]\n"
+ "fmla z24.h, z12.h, z2.h[1]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.h, z13.h, z0.h[1]\n"
+ "fmla z21.h, z13.h, z1.h[1]\n"
+ "fmla z25.h, z13.h, z2.h[1]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[1]\n"
+ "fmla z22.h, z14.h, z1.h[1]\n"
+ "fmla z26.h, z14.h, z2.h[1]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[1]\n"
+ "fmla z23.h, z15.h, z1.h[1]\n"
+ "fmla z27.h, z15.h, z2.h[1]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z20.h, z8.h, z1.h[2]\n"
+ "fmla z24.h, z8.h, z2.h[2]\n"
+ "fmla z17.h, z9.h, z0.h[2]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "fmla z21.h, z9.h, z1.h[2]\n"
+ "fmla z25.h, z9.h, z2.h[2]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[2]\n"
+ "fmla z22.h, z10.h, z1.h[2]\n"
+ "fmla z26.h, z10.h, z2.h[2]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[2]\n"
+ "fmla z23.h, z11.h, z1.h[2]\n"
+ "fmla z27.h, z11.h, z2.h[2]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[3]\n"
+ "fmla z20.h, z12.h, z1.h[3]\n"
+ "fmla z24.h, z12.h, z2.h[3]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z17.h, z13.h, z0.h[3]\n"
+ "fmla z21.h, z13.h, z1.h[3]\n"
+ "fmla z25.h, z13.h, z2.h[3]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[3]\n"
+ "fmla z22.h, z14.h, z1.h[3]\n"
+ "fmla z26.h, z14.h, z2.h[3]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[3]\n"
+ "fmla z23.h, z15.h, z1.h[3]\n"
+ "fmla z27.h, z15.h, z2.h[3]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[4]\n"
+ "fmla z20.h, z8.h, z1.h[4]\n"
+ "fmla z24.h, z8.h, z2.h[4]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[4]\n"
+ "fmla z21.h, z9.h, z1.h[4]\n"
+ "fmla z25.h, z9.h, z2.h[4]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[4]\n"
+ "fmla z22.h, z10.h, z1.h[4]\n"
+ "fmla z26.h, z10.h, z2.h[4]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[4]\n"
+ "fmla z23.h, z11.h, z1.h[4]\n"
+ "fmla z27.h, z11.h, z2.h[4]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[5]\n"
+ "fmla z20.h, z12.h, z1.h[5]\n"
+ "fmla z24.h, z12.h, z2.h[5]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.h, z13.h, z0.h[5]\n"
+ "fmla z21.h, z13.h, z1.h[5]\n"
+ "fmla z25.h, z13.h, z2.h[5]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[5]\n"
+ "fmla z22.h, z14.h, z1.h[5]\n"
+ "fmla z26.h, z14.h, z2.h[5]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[5]\n"
+ "fmla z23.h, z15.h, z1.h[5]\n"
+ "fmla z27.h, z15.h, z2.h[5]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[6]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z20.h, z8.h, z1.h[6]\n"
+ "fmla z24.h, z8.h, z2.h[6]\n"
+ "fmla z17.h, z9.h, z0.h[6]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "fmla z21.h, z9.h, z1.h[6]\n"
+ "fmla z25.h, z9.h, z2.h[6]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[6]\n"
+ "fmla z22.h, z10.h, z1.h[6]\n"
+ "fmla z26.h, z10.h, z2.h[6]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[6]\n"
+ "fmla z23.h, z11.h, z1.h[6]\n"
+ "fmla z27.h, z11.h, z2.h[6]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[7]\n"
+ "fmla z20.h, z12.h, z1.h[7]\n"
+ "fmla z24.h, z12.h, z2.h[7]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z17.h, z13.h, z0.h[7]\n"
+ "fmla z21.h, z13.h, z1.h[7]\n"
+ "fmla z25.h, z13.h, z2.h[7]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[7]\n"
+ "fmla z22.h, z14.h, z1.h[7]\n"
+ "fmla z26.h, z14.h, z2.h[7]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[7]\n"
+ "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
+ "fmla z23.h, z15.h, z1.h[7]\n"
+ "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n"
+ "fmla z27.h, z15.h, z2.h[7]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[0]\n"
+ "ld1rqh z2.h, p7/z, [a_ptr2, #-0x10]\n"
+ "fmla z20.h, z8.h, z5.h[0]\n"
+ "fmla z24.h, z8.h, z6.h[0]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z4.h[0]\n"
+ "fmla z21.h, z9.h, z5.h[0]\n"
+ "fmla z25.h, z9.h, z6.h[0]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[0]\n"
+ "fmla z22.h, z10.h, z5.h[0]\n"
+ "fmla z26.h, z10.h, z6.h[0]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[0]\n"
+ "fmla z23.h, z11.h, z5.h[0]\n"
+ "fmla z27.h, z11.h, z6.h[0]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[1]\n"
+ "fmla z20.h, z12.h, z5.h[1]\n"
+ "fmla z24.h, z12.h, z6.h[1]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.h, z13.h, z4.h[1]\n"
+ "fmla z21.h, z13.h, z5.h[1]\n"
+ "fmla z25.h, z13.h, z6.h[1]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[1]\n"
+ "fmla z22.h, z14.h, z5.h[1]\n"
+ "fmla z26.h, z14.h, z6.h[1]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[1]\n"
+ "fmla z23.h, z15.h, z5.h[1]\n"
+ "fmla z27.h, z15.h, z6.h[1]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z20.h, z8.h, z5.h[2]\n"
+ "fmla z24.h, z8.h, z6.h[2]\n"
+ "fmla z17.h, z9.h, z4.h[2]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "fmla z21.h, z9.h, z5.h[2]\n"
+ "fmla z25.h, z9.h, z6.h[2]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[2]\n"
+ "fmla z22.h, z10.h, z5.h[2]\n"
+ "fmla z26.h, z10.h, z6.h[2]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[2]\n"
+ "fmla z23.h, z11.h, z5.h[2]\n"
+ "fmla z27.h, z11.h, z6.h[2]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[3]\n"
+ "fmla z20.h, z12.h, z5.h[3]\n"
+ "fmla z24.h, z12.h, z6.h[3]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z17.h, z13.h, z4.h[3]\n"
+ "fmla z21.h, z13.h, z5.h[3]\n"
+ "fmla z25.h, z13.h, z6.h[3]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[3]\n"
+ "fmla z22.h, z14.h, z5.h[3]\n"
+ "fmla z26.h, z14.h, z6.h[3]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[3]\n"
+ "fmla z23.h, z15.h, z5.h[3]\n"
+ "fmla z27.h, z15.h, z6.h[3]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[4]\n"
+ "fmla z20.h, z8.h, z5.h[4]\n"
+ "fmla z24.h, z8.h, z6.h[4]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z4.h[4]\n"
+ "fmla z21.h, z9.h, z5.h[4]\n"
+ "fmla z25.h, z9.h, z6.h[4]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[4]\n"
+ "fmla z22.h, z10.h, z5.h[4]\n"
+ "fmla z26.h, z10.h, z6.h[4]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[4]\n"
+ "fmla z23.h, z11.h, z5.h[4]\n"
+ "fmla z27.h, z11.h, z6.h[4]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[5]\n"
+ "fmla z20.h, z12.h, z5.h[5]\n"
+ "fmla z24.h, z12.h, z6.h[5]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.h, z13.h, z4.h[5]\n"
+ "fmla z21.h, z13.h, z5.h[5]\n"
+ "fmla z25.h, z13.h, z6.h[5]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[5]\n"
+ "fmla z22.h, z14.h, z5.h[5]\n"
+ "fmla z26.h, z14.h, z6.h[5]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[5]\n"
+ "fmla z23.h, z15.h, z5.h[5]\n"
+ "fmla z27.h, z15.h, z6.h[5]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[6]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z20.h, z8.h, z5.h[6]\n"
+ "fmla z24.h, z8.h, z6.h[6]\n"
+ "fmla z17.h, z9.h, z4.h[6]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "fmla z21.h, z9.h, z5.h[6]\n"
+ "fmla z25.h, z9.h, z6.h[6]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[6]\n"
+ "fmla z22.h, z10.h, z5.h[6]\n"
+ "fmla z26.h, z10.h, z6.h[6]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[6]\n"
+ "fmla z23.h, z11.h, z5.h[6]\n"
+ "fmla z27.h, z11.h, z6.h[6]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[7]\n"
+ "fmla z20.h, z12.h, z5.h[7]\n"
+ "fmla z24.h, z12.h, z6.h[7]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z17.h, z13.h, z4.h[7]\n"
+ "fmla z21.h, z13.h, z5.h[7]\n"
+ "fmla z25.h, z13.h, z6.h[7]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[7]\n"
+ "fmla z22.h, z14.h, z5.h[7]\n"
+ "fmla z26.h, z14.h, z6.h[7]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[7]\n"
+ "fmla z23.h, z15.h, z5.h[7]\n"
+ "fmla z27.h, z15.h, z6.h[7]\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "cbz %[regs], 4f\n"
+ "fmla z16.h, z8.h, z0.h[0]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z20.h, z8.h, z1.h[0]\n"
+ "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
+ "fmla z24.h, z8.h, z2.h[0]\n"
+ "ld1rqh z5.h, p7/z, [a_ptr1]\n"
+ "fmla z17.h, z9.h, z0.h[0]\n"
+ "ld1rqh z6.h, p7/z, [a_ptr2]\n"
+ "fmla z21.h, z9.h, z1.h[0]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "fmla z25.h, z9.h, z2.h[0]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[0]\n"
+ "fmla z22.h, z10.h, z1.h[0]\n"
+ "fmla z26.h, z10.h, z2.h[0]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[0]\n"
+ "fmla z23.h, z11.h, z1.h[0]\n"
+ "fmla z27.h, z11.h, z2.h[0]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[1]\n"
+ "fmla z20.h, z12.h, z1.h[1]\n"
+ "fmla z24.h, z12.h, z2.h[1]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.h, z13.h, z0.h[1]\n"
+ "fmla z21.h, z13.h, z1.h[1]\n"
+ "fmla z25.h, z13.h, z2.h[1]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[1]\n"
+ "fmla z22.h, z14.h, z1.h[1]\n"
+ "fmla z26.h, z14.h, z2.h[1]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[1]\n"
+ "fmla z23.h, z15.h, z1.h[1]\n"
+ "fmla z27.h, z15.h, z2.h[1]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z20.h, z8.h, z1.h[2]\n"
+ "fmla z24.h, z8.h, z2.h[2]\n"
+ "fmla z17.h, z9.h, z0.h[2]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "fmla z21.h, z9.h, z1.h[2]\n"
+ "fmla z25.h, z9.h, z2.h[2]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[2]\n"
+ "fmla z22.h, z10.h, z1.h[2]\n"
+ "fmla z26.h, z10.h, z2.h[2]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[2]\n"
+ "fmla z23.h, z11.h, z1.h[2]\n"
+ "fmla z27.h, z11.h, z2.h[2]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[3]\n"
+ "fmla z20.h, z12.h, z1.h[3]\n"
+ "fmla z24.h, z12.h, z2.h[3]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z17.h, z13.h, z0.h[3]\n"
+ "fmla z21.h, z13.h, z1.h[3]\n"
+ "fmla z25.h, z13.h, z2.h[3]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[3]\n"
+ "fmla z22.h, z14.h, z1.h[3]\n"
+ "fmla z26.h, z14.h, z2.h[3]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[3]\n"
+ "fmla z23.h, z15.h, z1.h[3]\n"
+ "fmla z27.h, z15.h, z2.h[3]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[4]\n"
+ "fmla z20.h, z8.h, z1.h[4]\n"
+ "fmla z24.h, z8.h, z2.h[4]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[4]\n"
+ "fmla z21.h, z9.h, z1.h[4]\n"
+ "fmla z25.h, z9.h, z2.h[4]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[4]\n"
+ "fmla z22.h, z10.h, z1.h[4]\n"
+ "fmla z26.h, z10.h, z2.h[4]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[4]\n"
+ "fmla z23.h, z11.h, z1.h[4]\n"
+ "fmla z27.h, z11.h, z2.h[4]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[5]\n"
+ "fmla z20.h, z12.h, z1.h[5]\n"
+ "fmla z24.h, z12.h, z2.h[5]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.h, z13.h, z0.h[5]\n"
+ "fmla z21.h, z13.h, z1.h[5]\n"
+ "fmla z25.h, z13.h, z2.h[5]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[5]\n"
+ "fmla z22.h, z14.h, z1.h[5]\n"
+ "fmla z26.h, z14.h, z2.h[5]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[5]\n"
+ "fmla z23.h, z15.h, z1.h[5]\n"
+ "fmla z27.h, z15.h, z2.h[5]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[6]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z20.h, z8.h, z1.h[6]\n"
+ "fmla z24.h, z8.h, z2.h[6]\n"
+ "fmla z17.h, z9.h, z0.h[6]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "fmla z21.h, z9.h, z1.h[6]\n"
+ "fmla z25.h, z9.h, z2.h[6]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[6]\n"
+ "fmla z22.h, z10.h, z1.h[6]\n"
+ "fmla z26.h, z10.h, z2.h[6]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[6]\n"
+ "fmla z23.h, z11.h, z1.h[6]\n"
+ "fmla z27.h, z11.h, z2.h[6]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[7]\n"
+ "fmla z20.h, z12.h, z1.h[7]\n"
+ "fmla z24.h, z12.h, z2.h[7]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z17.h, z13.h, z0.h[7]\n"
+ "fmla z21.h, z13.h, z1.h[7]\n"
+ "fmla z25.h, z13.h, z2.h[7]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[7]\n"
+ "fmla z22.h, z14.h, z1.h[7]\n"
+ "fmla z26.h, z14.h, z2.h[7]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[7]\n"
+ "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
+ "fmla z23.h, z15.h, z1.h[7]\n"
+ "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n"
+ "fmla z27.h, z15.h, z2.h[7]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[0]\n"
+ "ld1rqh z2.h, p6/z, [a_ptr2, #0x10]\n"
+ "fmla z20.h, z8.h, z5.h[0]\n"
+ "fmla z24.h, z8.h, z6.h[0]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z4.h[0]\n"
+ "fmla z21.h, z9.h, z5.h[0]\n"
+ "fmla z25.h, z9.h, z6.h[0]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[0]\n"
+ "fmla z22.h, z10.h, z5.h[0]\n"
+ "fmla z26.h, z10.h, z6.h[0]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[0]\n"
+ "fmla z23.h, z11.h, z5.h[0]\n"
+ "fmla z27.h, z11.h, z6.h[0]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[1]\n"
+ "fmla z20.h, z12.h, z5.h[1]\n"
+ "fmla z24.h, z12.h, z6.h[1]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.h, z13.h, z4.h[1]\n"
+ "fmla z21.h, z13.h, z5.h[1]\n"
+ "fmla z25.h, z13.h, z6.h[1]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[1]\n"
+ "fmla z22.h, z14.h, z5.h[1]\n"
+ "fmla z26.h, z14.h, z6.h[1]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[1]\n"
+ "fmla z23.h, z15.h, z5.h[1]\n"
+ "fmla z27.h, z15.h, z6.h[1]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z20.h, z8.h, z5.h[2]\n"
+ "fmla z24.h, z8.h, z6.h[2]\n"
+ "fmla z17.h, z9.h, z4.h[2]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "fmla z21.h, z9.h, z5.h[2]\n"
+ "fmla z25.h, z9.h, z6.h[2]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[2]\n"
+ "fmla z22.h, z10.h, z5.h[2]\n"
+ "fmla z26.h, z10.h, z6.h[2]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[2]\n"
+ "fmla z23.h, z11.h, z5.h[2]\n"
+ "fmla z27.h, z11.h, z6.h[2]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[3]\n"
+ "fmla z20.h, z12.h, z5.h[3]\n"
+ "fmla z24.h, z12.h, z6.h[3]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z17.h, z13.h, z4.h[3]\n"
+ "fmla z21.h, z13.h, z5.h[3]\n"
+ "fmla z25.h, z13.h, z6.h[3]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[3]\n"
+ "fmla z22.h, z14.h, z5.h[3]\n"
+ "fmla z26.h, z14.h, z6.h[3]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[3]\n"
+ "fmla z23.h, z15.h, z5.h[3]\n"
+ "fmla z27.h, z15.h, z6.h[3]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[4]\n"
+ "fmla z20.h, z8.h, z5.h[4]\n"
+ "fmla z24.h, z8.h, z6.h[4]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z4.h[4]\n"
+ "fmla z21.h, z9.h, z5.h[4]\n"
+ "fmla z25.h, z9.h, z6.h[4]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[4]\n"
+ "fmla z22.h, z10.h, z5.h[4]\n"
+ "fmla z26.h, z10.h, z6.h[4]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[4]\n"
+ "fmla z23.h, z11.h, z5.h[4]\n"
+ "fmla z27.h, z11.h, z6.h[4]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[5]\n"
+ "fmla z20.h, z12.h, z5.h[5]\n"
+ "fmla z24.h, z12.h, z6.h[5]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.h, z13.h, z4.h[5]\n"
+ "fmla z21.h, z13.h, z5.h[5]\n"
+ "fmla z25.h, z13.h, z6.h[5]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[5]\n"
+ "fmla z22.h, z14.h, z5.h[5]\n"
+ "fmla z26.h, z14.h, z6.h[5]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[5]\n"
+ "fmla z23.h, z15.h, z5.h[5]\n"
+ "fmla z27.h, z15.h, z6.h[5]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[6]\n"
+ "fmla z20.h, z8.h, z5.h[6]\n"
+ "fmla z24.h, z8.h, z6.h[6]\n"
+ "fmla z17.h, z9.h, z4.h[6]\n"
+ "fmla z21.h, z9.h, z5.h[6]\n"
+ "fmla z25.h, z9.h, z6.h[6]\n"
+ "fmla z18.h, z10.h, z4.h[6]\n"
+ "fmla z22.h, z10.h, z5.h[6]\n"
+ "fmla z26.h, z10.h, z6.h[6]\n"
+ "fmla z19.h, z11.h, z4.h[6]\n"
+ "fmla z23.h, z11.h, z5.h[6]\n"
+ "fmla z27.h, z11.h, z6.h[6]\n"
+ "fmla z16.h, z12.h, z4.h[7]\n"
+ "fmla z20.h, z12.h, z5.h[7]\n"
+ "fmla z24.h, z12.h, z6.h[7]\n"
+ "fmla z17.h, z13.h, z4.h[7]\n"
+ "fmla z21.h, z13.h, z5.h[7]\n"
+ "fmla z25.h, z13.h, z6.h[7]\n"
+ "fmla z18.h, z14.h, z4.h[7]\n"
+ "fmla z22.h, z14.h, z5.h[7]\n"
+ "fmla z26.h, z14.h, z6.h[7]\n"
+ "fmla z19.h, z15.h, z4.h[7]\n"
+ "fmla z23.h, z15.h, z5.h[7]\n"
+ "fmla z27.h, z15.h, z6.h[7]\n"
+ "cbz %[blocks], 5f\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[0]\n"
+ "fmla z20.h, z8.h, z1.h[0]\n"
+ "fmla z24.h, z8.h, z2.h[0]\n"
+ "fmla z17.h, z9.h, z0.h[0]\n"
+ "fmla z21.h, z9.h, z1.h[0]\n"
+ "fmla z25.h, z9.h, z2.h[0]\n"
+ "fmla z18.h, z10.h, z0.h[0]\n"
+ "fmla z22.h, z10.h, z1.h[0]\n"
+ "fmla z26.h, z10.h, z2.h[0]\n"
+ "fmla z19.h, z11.h, z0.h[0]\n"
+ "fmla z23.h, z11.h, z1.h[0]\n"
+ "fmla z27.h, z11.h, z2.h[0]\n"
+ "b.eq 5f\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[1]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z20.h, z12.h, z1.h[1]\n"
+ "fmla z24.h, z12.h, z2.h[1]\n"
+ "fmla z17.h, z13.h, z0.h[1]\n"
+ "fmla z21.h, z13.h, z1.h[1]\n"
+ "fmla z25.h, z13.h, z2.h[1]\n"
+ "fmla z18.h, z14.h, z0.h[1]\n"
+ "fmla z22.h, z14.h, z1.h[1]\n"
+ "fmla z26.h, z14.h, z2.h[1]\n"
+ "fmla z19.h, z15.h, z0.h[1]\n"
+ "fmla z23.h, z15.h, z1.h[1]\n"
+ "fmla z27.h, z15.h, z2.h[1]\n"
+ "b.eq 5f\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[2]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z20.h, z8.h, z1.h[2]\n"
+ "fmla z24.h, z8.h, z2.h[2]\n"
+ "fmla z17.h, z9.h, z0.h[2]\n"
+ "fmla z21.h, z9.h, z1.h[2]\n"
+ "fmla z25.h, z9.h, z2.h[2]\n"
+ "fmla z18.h, z10.h, z0.h[2]\n"
+ "fmla z22.h, z10.h, z1.h[2]\n"
+ "fmla z26.h, z10.h, z2.h[2]\n"
+ "fmla z19.h, z11.h, z0.h[2]\n"
+ "fmla z23.h, z11.h, z1.h[2]\n"
+ "fmla z27.h, z11.h, z2.h[2]\n"
+ "b.eq 5f\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[3]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z20.h, z12.h, z1.h[3]\n"
+ "fmla z24.h, z12.h, z2.h[3]\n"
+ "fmla z17.h, z13.h, z0.h[3]\n"
+ "fmla z21.h, z13.h, z1.h[3]\n"
+ "fmla z25.h, z13.h, z2.h[3]\n"
+ "fmla z18.h, z14.h, z0.h[3]\n"
+ "fmla z22.h, z14.h, z1.h[3]\n"
+ "fmla z26.h, z14.h, z2.h[3]\n"
+ "fmla z19.h, z15.h, z0.h[3]\n"
+ "fmla z23.h, z15.h, z1.h[3]\n"
+ "fmla z27.h, z15.h, z2.h[3]\n"
+ "b.eq 5f\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[4]\n"
+ "fmla z20.h, z8.h, z1.h[4]\n"
+ "fmla z24.h, z8.h, z2.h[4]\n"
+ "fmla z17.h, z9.h, z0.h[4]\n"
+ "fmla z21.h, z9.h, z1.h[4]\n"
+ "fmla z25.h, z9.h, z2.h[4]\n"
+ "fmla z18.h, z10.h, z0.h[4]\n"
+ "fmla z22.h, z10.h, z1.h[4]\n"
+ "fmla z26.h, z10.h, z2.h[4]\n"
+ "fmla z19.h, z11.h, z0.h[4]\n"
+ "fmla z23.h, z11.h, z1.h[4]\n"
+ "fmla z27.h, z11.h, z2.h[4]\n"
+ "b.eq 5f\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[5]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z20.h, z12.h, z1.h[5]\n"
+ "fmla z24.h, z12.h, z2.h[5]\n"
+ "fmla z17.h, z13.h, z0.h[5]\n"
+ "fmla z21.h, z13.h, z1.h[5]\n"
+ "fmla z25.h, z13.h, z2.h[5]\n"
+ "fmla z18.h, z14.h, z0.h[5]\n"
+ "fmla z22.h, z14.h, z1.h[5]\n"
+ "fmla z26.h, z14.h, z2.h[5]\n"
+ "fmla z19.h, z15.h, z0.h[5]\n"
+ "fmla z23.h, z15.h, z1.h[5]\n"
+ "fmla z27.h, z15.h, z2.h[5]\n"
+ "b.eq 5f\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[6]\n"
+ "fmla z20.h, z8.h, z1.h[6]\n"
+ "fmla z24.h, z8.h, z2.h[6]\n"
+ "fmla z17.h, z9.h, z0.h[6]\n"
+ "fmla z21.h, z9.h, z1.h[6]\n"
+ "fmla z25.h, z9.h, z2.h[6]\n"
+ "fmla z18.h, z10.h, z0.h[6]\n"
+ "fmla z22.h, z10.h, z1.h[6]\n"
+ "fmla z26.h, z10.h, z2.h[6]\n"
+ "fmla z19.h, z11.h, z0.h[6]\n"
+ "fmla z23.h, z11.h, z1.h[6]\n"
+ "fmla z27.h, z11.h, z2.h[6]\n"
+ "b 5f\n"
+ "4:\n"
+ "fmla z16.h, z8.h, z0.h[0]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z20.h, z8.h, z1.h[0]\n"
+ "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
+ "fmla z24.h, z8.h, z2.h[0]\n"
+ "ld1rqh z5.h, p6/z, [a_ptr1]\n"
+ "fmla z17.h, z9.h, z0.h[0]\n"
+ "ld1rqh z6.h, p6/z, [a_ptr2]\n"
+ "fmla z21.h, z9.h, z1.h[0]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "fmla z25.h, z9.h, z2.h[0]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[0]\n"
+ "fmla z22.h, z10.h, z1.h[0]\n"
+ "fmla z26.h, z10.h, z2.h[0]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[0]\n"
+ "fmla z23.h, z11.h, z1.h[0]\n"
+ "fmla z27.h, z11.h, z2.h[0]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[1]\n"
+ "fmla z20.h, z12.h, z1.h[1]\n"
+ "fmla z24.h, z12.h, z2.h[1]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.h, z13.h, z0.h[1]\n"
+ "fmla z21.h, z13.h, z1.h[1]\n"
+ "fmla z25.h, z13.h, z2.h[1]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[1]\n"
+ "fmla z22.h, z14.h, z1.h[1]\n"
+ "fmla z26.h, z14.h, z2.h[1]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[1]\n"
+ "fmla z23.h, z15.h, z1.h[1]\n"
+ "fmla z27.h, z15.h, z2.h[1]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z20.h, z8.h, z1.h[2]\n"
+ "fmla z24.h, z8.h, z2.h[2]\n"
+ "fmla z17.h, z9.h, z0.h[2]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "fmla z21.h, z9.h, z1.h[2]\n"
+ "fmla z25.h, z9.h, z2.h[2]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[2]\n"
+ "fmla z22.h, z10.h, z1.h[2]\n"
+ "fmla z26.h, z10.h, z2.h[2]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[2]\n"
+ "fmla z23.h, z11.h, z1.h[2]\n"
+ "fmla z27.h, z11.h, z2.h[2]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[3]\n"
+ "fmla z20.h, z12.h, z1.h[3]\n"
+ "fmla z24.h, z12.h, z2.h[3]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z17.h, z13.h, z0.h[3]\n"
+ "fmla z21.h, z13.h, z1.h[3]\n"
+ "fmla z25.h, z13.h, z2.h[3]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[3]\n"
+ "fmla z22.h, z14.h, z1.h[3]\n"
+ "fmla z26.h, z14.h, z2.h[3]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[3]\n"
+ "fmla z23.h, z15.h, z1.h[3]\n"
+ "fmla z27.h, z15.h, z2.h[3]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[4]\n"
+ "fmla z20.h, z8.h, z1.h[4]\n"
+ "fmla z24.h, z8.h, z2.h[4]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[4]\n"
+ "fmla z21.h, z9.h, z1.h[4]\n"
+ "fmla z25.h, z9.h, z2.h[4]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[4]\n"
+ "fmla z22.h, z10.h, z1.h[4]\n"
+ "fmla z26.h, z10.h, z2.h[4]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[4]\n"
+ "fmla z23.h, z11.h, z1.h[4]\n"
+ "fmla z27.h, z11.h, z2.h[4]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[5]\n"
+ "fmla z20.h, z12.h, z1.h[5]\n"
+ "fmla z24.h, z12.h, z2.h[5]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.h, z13.h, z0.h[5]\n"
+ "fmla z21.h, z13.h, z1.h[5]\n"
+ "fmla z25.h, z13.h, z2.h[5]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[5]\n"
+ "fmla z22.h, z14.h, z1.h[5]\n"
+ "fmla z26.h, z14.h, z2.h[5]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[5]\n"
+ "fmla z23.h, z15.h, z1.h[5]\n"
+ "fmla z27.h, z15.h, z2.h[5]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[6]\n"
+ "fmla z20.h, z8.h, z1.h[6]\n"
+ "fmla z24.h, z8.h, z2.h[6]\n"
+ "fmla z17.h, z9.h, z0.h[6]\n"
+ "fmla z21.h, z9.h, z1.h[6]\n"
+ "fmla z25.h, z9.h, z2.h[6]\n"
+ "fmla z18.h, z10.h, z0.h[6]\n"
+ "fmla z22.h, z10.h, z1.h[6]\n"
+ "fmla z26.h, z10.h, z2.h[6]\n"
+ "fmla z19.h, z11.h, z0.h[6]\n"
+ "fmla z23.h, z11.h, z1.h[6]\n"
+ "fmla z27.h, z11.h, z2.h[6]\n"
+ "fmla z16.h, z12.h, z0.h[7]\n"
+ "fmla z20.h, z12.h, z1.h[7]\n"
+ "fmla z24.h, z12.h, z2.h[7]\n"
+ "fmla z17.h, z13.h, z0.h[7]\n"
+ "fmla z21.h, z13.h, z1.h[7]\n"
+ "fmla z25.h, z13.h, z2.h[7]\n"
+ "fmla z18.h, z14.h, z0.h[7]\n"
+ "fmla z22.h, z14.h, z1.h[7]\n"
+ "fmla z26.h, z14.h, z2.h[7]\n"
+ "fmla z19.h, z15.h, z0.h[7]\n"
+ "fmla z23.h, z15.h, z1.h[7]\n"
+ "fmla z27.h, z15.h, z2.h[7]\n"
+ "cbz %[blocks], 5f\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[0]\n"
+ "fmla z20.h, z8.h, z5.h[0]\n"
+ "fmla z24.h, z8.h, z6.h[0]\n"
+ "fmla z17.h, z9.h, z4.h[0]\n"
+ "fmla z21.h, z9.h, z5.h[0]\n"
+ "fmla z25.h, z9.h, z6.h[0]\n"
+ "fmla z18.h, z10.h, z4.h[0]\n"
+ "fmla z22.h, z10.h, z5.h[0]\n"
+ "fmla z26.h, z10.h, z6.h[0]\n"
+ "fmla z19.h, z11.h, z4.h[0]\n"
+ "fmla z23.h, z11.h, z5.h[0]\n"
+ "fmla z27.h, z11.h, z6.h[0]\n"
+ "b.eq 5f\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[1]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z20.h, z12.h, z5.h[1]\n"
+ "fmla z24.h, z12.h, z6.h[1]\n"
+ "fmla z17.h, z13.h, z4.h[1]\n"
+ "fmla z21.h, z13.h, z5.h[1]\n"
+ "fmla z25.h, z13.h, z6.h[1]\n"
+ "fmla z18.h, z14.h, z4.h[1]\n"
+ "fmla z22.h, z14.h, z5.h[1]\n"
+ "fmla z26.h, z14.h, z6.h[1]\n"
+ "fmla z19.h, z15.h, z4.h[1]\n"
+ "fmla z23.h, z15.h, z5.h[1]\n"
+ "fmla z27.h, z15.h, z6.h[1]\n"
+ "b.eq 5f\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[2]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z20.h, z8.h, z5.h[2]\n"
+ "fmla z24.h, z8.h, z6.h[2]\n"
+ "fmla z17.h, z9.h, z4.h[2]\n"
+ "fmla z21.h, z9.h, z5.h[2]\n"
+ "fmla z25.h, z9.h, z6.h[2]\n"
+ "fmla z18.h, z10.h, z4.h[2]\n"
+ "fmla z22.h, z10.h, z5.h[2]\n"
+ "fmla z26.h, z10.h, z6.h[2]\n"
+ "fmla z19.h, z11.h, z4.h[2]\n"
+ "fmla z23.h, z11.h, z5.h[2]\n"
+ "fmla z27.h, z11.h, z6.h[2]\n"
+ "b.eq 5f\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[3]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z20.h, z12.h, z5.h[3]\n"
+ "fmla z24.h, z12.h, z6.h[3]\n"
+ "fmla z17.h, z13.h, z4.h[3]\n"
+ "fmla z21.h, z13.h, z5.h[3]\n"
+ "fmla z25.h, z13.h, z6.h[3]\n"
+ "fmla z18.h, z14.h, z4.h[3]\n"
+ "fmla z22.h, z14.h, z5.h[3]\n"
+ "fmla z26.h, z14.h, z6.h[3]\n"
+ "fmla z19.h, z15.h, z4.h[3]\n"
+ "fmla z23.h, z15.h, z5.h[3]\n"
+ "fmla z27.h, z15.h, z6.h[3]\n"
+ "b.eq 5f\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[4]\n"
+ "fmla z20.h, z8.h, z5.h[4]\n"
+ "fmla z24.h, z8.h, z6.h[4]\n"
+ "fmla z17.h, z9.h, z4.h[4]\n"
+ "fmla z21.h, z9.h, z5.h[4]\n"
+ "fmla z25.h, z9.h, z6.h[4]\n"
+ "fmla z18.h, z10.h, z4.h[4]\n"
+ "fmla z22.h, z10.h, z5.h[4]\n"
+ "fmla z26.h, z10.h, z6.h[4]\n"
+ "fmla z19.h, z11.h, z4.h[4]\n"
+ "fmla z23.h, z11.h, z5.h[4]\n"
+ "fmla z27.h, z11.h, z6.h[4]\n"
+ "b.eq 5f\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[5]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z20.h, z12.h, z5.h[5]\n"
+ "fmla z24.h, z12.h, z6.h[5]\n"
+ "fmla z17.h, z13.h, z4.h[5]\n"
+ "fmla z21.h, z13.h, z5.h[5]\n"
+ "fmla z25.h, z13.h, z6.h[5]\n"
+ "fmla z18.h, z14.h, z4.h[5]\n"
+ "fmla z22.h, z14.h, z5.h[5]\n"
+ "fmla z26.h, z14.h, z6.h[5]\n"
+ "fmla z19.h, z15.h, z4.h[5]\n"
+ "fmla z23.h, z15.h, z5.h[5]\n"
+ "fmla z27.h, z15.h, z6.h[5]\n"
+ "b.eq 5f\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[6]\n"
+ "fmla z20.h, z8.h, z5.h[6]\n"
+ "fmla z24.h, z8.h, z6.h[6]\n"
+ "fmla z17.h, z9.h, z4.h[6]\n"
+ "fmla z21.h, z9.h, z5.h[6]\n"
+ "fmla z25.h, z9.h, z6.h[6]\n"
+ "fmla z18.h, z10.h, z4.h[6]\n"
+ "fmla z22.h, z10.h, z5.h[6]\n"
+ "fmla z26.h, z10.h, z6.h[6]\n"
+ "fmla z19.h, z11.h, z4.h[6]\n"
+ "fmla z23.h, z11.h, z5.h[6]\n"
+ "fmla z27.h, z11.h, z6.h[6]\n"
+ "5:\n"
+ "st1h z16.h, p0, [%[c_ptr0]]\n"
+ "st1h z17.h, p1, [%[c_ptr0], #1, MUL VL]\n"
+ "st1h z18.h, p2, [%[c_ptr0], #2, MUL VL]\n"
+ "st1h z19.h, p3, [%[c_ptr0], #3, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #4\n"
+ "st1h z20.h, p0, [c_ptr1]\n"
+ "st1h z21.h, p1, [c_ptr1, #1, MUL VL]\n"
+ "st1h z22.h, p2, [c_ptr1, #2, MUL VL]\n"
+ "st1h z23.h, p3, [c_ptr1, #3, MUL VL]\n"
+ "st1h z24.h, p0, [c_ptr2]\n"
+ "st1h z25.h, p1, [c_ptr2, #1, MUL VL]\n"
+ "st1h z26.h, p2, [c_ptr2, #2, MUL VL]\n"
+ "st1h z27.h, p3, [c_ptr2, #3, MUL VL]\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers)
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory"
+ );
+ break;
+ default:
+ case 4:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "whilelt p6.h, %[temp], %[leftovers]\n"
+ "whilelt p0.h, %[temp], %[width]\n"
+ "inch %[temp], all, mul #1\n"
+ "ptrue p7.h\n"
+ "whilelt p1.h, %[temp], %[width]\n"
+ "inch %[temp], all, mul #1\n"
+ "whilelt p2.h, %[temp], %[width]\n"
+ "inch %[temp], all, mul #1\n"
+ "whilelt p3.h, %[temp], %[width]\n"
+ "cbz %[beta0], 1f\n"
+ "mov z16.h, #0\n"
+ "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
+ "mov z17.h, #0\n"
+ "ld1rqh z1.h, p7/z, [a_ptr1]\n"
+ "mov z18.h, #0\n"
+ "ld1rqh z2.h, p7/z, [a_ptr2]\n"
+ "mov z19.h, #0\n"
+ "ld1rqh z3.h, p7/z, [a_ptr3]\n"
+ "mov z20.h, #0\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "mov z21.h, #0\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "mov z22.h, #0\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "mov z23.h, #0\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "mov z24.h, #0\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "mov z25.h, #0\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "mov z26.h, #0\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "mov z27.h, #0\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "mov z28.h, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "mov z29.h, #0\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "mov z30.h, #0\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "mov z31.h, #0\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1rh z15.h, p7/z, [%[betaptr]]\n"
+ "ld1h z16.h, p0/z, [%[c_ptr0]]\n"
+ "ld1h z17.h, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+ "ld1h z18.h, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+ "ld1h z19.h, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+ "fmul z16.h, p7/m, z16.h, z15.h\n"
+ "ld1h z20.h, p0/z, [c_ptr1]\n"
+ "fmul z17.h, p7/m, z17.h, z15.h\n"
+ "ld1h z21.h, p1/z, [c_ptr1, #1, MUL VL]\n"
+ "fmul z18.h, p7/m, z18.h, z15.h\n"
+ "ld1h z22.h, p2/z, [c_ptr1, #2, MUL VL]\n"
+ "fmul z19.h, p7/m, z19.h, z15.h\n"
+ "ld1h z23.h, p3/z, [c_ptr1, #3, MUL VL]\n"
+ "fmul z20.h, p7/m, z20.h, z15.h\n"
+ "ld1h z24.h, p0/z, [c_ptr2]\n"
+ "fmul z21.h, p7/m, z21.h, z15.h\n"
+ "ld1h z25.h, p1/z, [c_ptr2, #1, MUL VL]\n"
+ "fmul z22.h, p7/m, z22.h, z15.h\n"
+ "ld1h z26.h, p2/z, [c_ptr2, #2, MUL VL]\n"
+ "fmul z23.h, p7/m, z23.h, z15.h\n"
+ "ld1h z27.h, p3/z, [c_ptr2, #3, MUL VL]\n"
+ "fmul z24.h, p7/m, z24.h, z15.h\n"
+ "ld1h z28.h, p0/z, [c_ptr3]\n"
+ "fmul z25.h, p7/m, z25.h, z15.h\n"
+ "ld1h z29.h, p1/z, [c_ptr3, #1, MUL VL]\n"
+ "fmul z26.h, p7/m, z26.h, z15.h\n"
+ "ld1h z30.h, p2/z, [c_ptr3, #2, MUL VL]\n"
+ "fmul z27.h, p7/m, z27.h, z15.h\n"
+ "ld1h z31.h, p3/z, [c_ptr3, #3, MUL VL]\n"
+ "fmul z28.h, p7/m, z28.h, z15.h\n"
+ "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
+ "fmul z29.h, p7/m, z29.h, z15.h\n"
+ "ld1rqh z1.h, p7/z, [a_ptr1]\n"
+ "fmul z30.h, p7/m, z30.h, z15.h\n"
+ "ld1rqh z2.h, p7/z, [a_ptr2]\n"
+ "fmul z31.h, p7/m, z31.h, z15.h\n"
+ "ld1rqh z3.h, p7/z, [a_ptr3]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ "fmla z16.h, z8.h, z0.h[0]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z20.h, z8.h, z1.h[0]\n"
+ "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
+ "fmla z24.h, z8.h, z2.h[0]\n"
+ "ld1rqh z5.h, p7/z, [a_ptr1]\n"
+ "fmla z28.h, z8.h, z3.h[0]\n"
+ "ld1rqh z6.h, p7/z, [a_ptr2]\n"
+ "fmla z17.h, z9.h, z0.h[0]\n"
+ "ld1rqh z7.h, p7/z, [a_ptr3]\n"
+ "fmla z21.h, z9.h, z1.h[0]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "fmla z25.h, z9.h, z2.h[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "fmla z29.h, z9.h, z3.h[0]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "fmla z22.h, z10.h, z1.h[0]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ "fmla z26.h, z10.h, z2.h[0]\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
+ "fmla z30.h, z10.h, z3.h[0]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[0]\n"
+ "add a_ptr3, a_ptr3, #0x20\n"
+ "fmla z23.h, z11.h, z1.h[0]\n"
+ "fmla z27.h, z11.h, z2.h[0]\n"
+ "fmla z31.h, z11.h, z3.h[0]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[1]\n"
+ "fmla z20.h, z12.h, z1.h[1]\n"
+ "fmla z24.h, z12.h, z2.h[1]\n"
+ "fmla z28.h, z12.h, z3.h[1]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.h, z13.h, z0.h[1]\n"
+ "fmla z21.h, z13.h, z1.h[1]\n"
+ "fmla z25.h, z13.h, z2.h[1]\n"
+ "fmla z29.h, z13.h, z3.h[1]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[1]\n"
+ "fmla z22.h, z14.h, z1.h[1]\n"
+ "fmla z26.h, z14.h, z2.h[1]\n"
+ "fmla z30.h, z14.h, z3.h[1]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[1]\n"
+ "fmla z23.h, z15.h, z1.h[1]\n"
+ "fmla z27.h, z15.h, z2.h[1]\n"
+ "fmla z31.h, z15.h, z3.h[1]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z20.h, z8.h, z1.h[2]\n"
+ "fmla z24.h, z8.h, z2.h[2]\n"
+ "fmla z28.h, z8.h, z3.h[2]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "fmla z17.h, z9.h, z0.h[2]\n"
+ "fmla z21.h, z9.h, z1.h[2]\n"
+ "fmla z25.h, z9.h, z2.h[2]\n"
+ "fmla z29.h, z9.h, z3.h[2]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[2]\n"
+ "fmla z22.h, z10.h, z1.h[2]\n"
+ "fmla z26.h, z10.h, z2.h[2]\n"
+ "fmla z30.h, z10.h, z3.h[2]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[2]\n"
+ "fmla z23.h, z11.h, z1.h[2]\n"
+ "fmla z27.h, z11.h, z2.h[2]\n"
+ "fmla z31.h, z11.h, z3.h[2]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[3]\n"
+ "fmla z20.h, z12.h, z1.h[3]\n"
+ "fmla z24.h, z12.h, z2.h[3]\n"
+ "fmla z28.h, z12.h, z3.h[3]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z17.h, z13.h, z0.h[3]\n"
+ "fmla z21.h, z13.h, z1.h[3]\n"
+ "fmla z25.h, z13.h, z2.h[3]\n"
+ "fmla z29.h, z13.h, z3.h[3]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[3]\n"
+ "fmla z22.h, z14.h, z1.h[3]\n"
+ "fmla z26.h, z14.h, z2.h[3]\n"
+ "fmla z30.h, z14.h, z3.h[3]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[3]\n"
+ "fmla z23.h, z15.h, z1.h[3]\n"
+ "fmla z27.h, z15.h, z2.h[3]\n"
+ "fmla z31.h, z15.h, z3.h[3]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[4]\n"
+ "fmla z20.h, z8.h, z1.h[4]\n"
+ "fmla z24.h, z8.h, z2.h[4]\n"
+ "fmla z28.h, z8.h, z3.h[4]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[4]\n"
+ "fmla z21.h, z9.h, z1.h[4]\n"
+ "fmla z25.h, z9.h, z2.h[4]\n"
+ "fmla z29.h, z9.h, z3.h[4]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[4]\n"
+ "fmla z22.h, z10.h, z1.h[4]\n"
+ "fmla z26.h, z10.h, z2.h[4]\n"
+ "fmla z30.h, z10.h, z3.h[4]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[4]\n"
+ "fmla z23.h, z11.h, z1.h[4]\n"
+ "fmla z27.h, z11.h, z2.h[4]\n"
+ "fmla z31.h, z11.h, z3.h[4]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[5]\n"
+ "fmla z20.h, z12.h, z1.h[5]\n"
+ "fmla z24.h, z12.h, z2.h[5]\n"
+ "fmla z28.h, z12.h, z3.h[5]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.h, z13.h, z0.h[5]\n"
+ "fmla z21.h, z13.h, z1.h[5]\n"
+ "fmla z25.h, z13.h, z2.h[5]\n"
+ "fmla z29.h, z13.h, z3.h[5]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[5]\n"
+ "fmla z22.h, z14.h, z1.h[5]\n"
+ "fmla z26.h, z14.h, z2.h[5]\n"
+ "fmla z30.h, z14.h, z3.h[5]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[5]\n"
+ "fmla z23.h, z15.h, z1.h[5]\n"
+ "fmla z27.h, z15.h, z2.h[5]\n"
+ "fmla z31.h, z15.h, z3.h[5]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[6]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z20.h, z8.h, z1.h[6]\n"
+ "fmla z24.h, z8.h, z2.h[6]\n"
+ "fmla z28.h, z8.h, z3.h[6]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "fmla z17.h, z9.h, z0.h[6]\n"
+ "fmla z21.h, z9.h, z1.h[6]\n"
+ "fmla z25.h, z9.h, z2.h[6]\n"
+ "fmla z29.h, z9.h, z3.h[6]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[6]\n"
+ "fmla z22.h, z10.h, z1.h[6]\n"
+ "fmla z26.h, z10.h, z2.h[6]\n"
+ "fmla z30.h, z10.h, z3.h[6]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[6]\n"
+ "fmla z23.h, z11.h, z1.h[6]\n"
+ "fmla z27.h, z11.h, z2.h[6]\n"
+ "fmla z31.h, z11.h, z3.h[6]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[7]\n"
+ "fmla z20.h, z12.h, z1.h[7]\n"
+ "fmla z24.h, z12.h, z2.h[7]\n"
+ "fmla z28.h, z12.h, z3.h[7]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z17.h, z13.h, z0.h[7]\n"
+ "fmla z21.h, z13.h, z1.h[7]\n"
+ "fmla z25.h, z13.h, z2.h[7]\n"
+ "fmla z29.h, z13.h, z3.h[7]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[7]\n"
+ "fmla z22.h, z14.h, z1.h[7]\n"
+ "fmla z26.h, z14.h, z2.h[7]\n"
+ "fmla z30.h, z14.h, z3.h[7]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[7]\n"
+ "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
+ "fmla z23.h, z15.h, z1.h[7]\n"
+ "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n"
+ "fmla z27.h, z15.h, z2.h[7]\n"
+ "ld1rqh z2.h, p7/z, [a_ptr2, #-0x10]\n"
+ "fmla z31.h, z15.h, z3.h[7]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[0]\n"
+ "ld1rqh z3.h, p7/z, [a_ptr3, #-0x10]\n"
+ "fmla z20.h, z8.h, z5.h[0]\n"
+ "fmla z24.h, z8.h, z6.h[0]\n"
+ "fmla z28.h, z8.h, z7.h[0]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z4.h[0]\n"
+ "fmla z21.h, z9.h, z5.h[0]\n"
+ "fmla z25.h, z9.h, z6.h[0]\n"
+ "fmla z29.h, z9.h, z7.h[0]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[0]\n"
+ "fmla z22.h, z10.h, z5.h[0]\n"
+ "fmla z26.h, z10.h, z6.h[0]\n"
+ "fmla z30.h, z10.h, z7.h[0]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[0]\n"
+ "fmla z23.h, z11.h, z5.h[0]\n"
+ "fmla z27.h, z11.h, z6.h[0]\n"
+ "fmla z31.h, z11.h, z7.h[0]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[1]\n"
+ "fmla z20.h, z12.h, z5.h[1]\n"
+ "fmla z24.h, z12.h, z6.h[1]\n"
+ "fmla z28.h, z12.h, z7.h[1]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.h, z13.h, z4.h[1]\n"
+ "fmla z21.h, z13.h, z5.h[1]\n"
+ "fmla z25.h, z13.h, z6.h[1]\n"
+ "fmla z29.h, z13.h, z7.h[1]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[1]\n"
+ "fmla z22.h, z14.h, z5.h[1]\n"
+ "fmla z26.h, z14.h, z6.h[1]\n"
+ "fmla z30.h, z14.h, z7.h[1]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[1]\n"
+ "fmla z23.h, z15.h, z5.h[1]\n"
+ "fmla z27.h, z15.h, z6.h[1]\n"
+ "fmla z31.h, z15.h, z7.h[1]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z20.h, z8.h, z5.h[2]\n"
+ "fmla z24.h, z8.h, z6.h[2]\n"
+ "fmla z28.h, z8.h, z7.h[2]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "fmla z17.h, z9.h, z4.h[2]\n"
+ "fmla z21.h, z9.h, z5.h[2]\n"
+ "fmla z25.h, z9.h, z6.h[2]\n"
+ "fmla z29.h, z9.h, z7.h[2]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[2]\n"
+ "fmla z22.h, z10.h, z5.h[2]\n"
+ "fmla z26.h, z10.h, z6.h[2]\n"
+ "fmla z30.h, z10.h, z7.h[2]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[2]\n"
+ "fmla z23.h, z11.h, z5.h[2]\n"
+ "fmla z27.h, z11.h, z6.h[2]\n"
+ "fmla z31.h, z11.h, z7.h[2]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[3]\n"
+ "fmla z20.h, z12.h, z5.h[3]\n"
+ "fmla z24.h, z12.h, z6.h[3]\n"
+ "fmla z28.h, z12.h, z7.h[3]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z17.h, z13.h, z4.h[3]\n"
+ "fmla z21.h, z13.h, z5.h[3]\n"
+ "fmla z25.h, z13.h, z6.h[3]\n"
+ "fmla z29.h, z13.h, z7.h[3]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[3]\n"
+ "fmla z22.h, z14.h, z5.h[3]\n"
+ "fmla z26.h, z14.h, z6.h[3]\n"
+ "fmla z30.h, z14.h, z7.h[3]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[3]\n"
+ "fmla z23.h, z15.h, z5.h[3]\n"
+ "fmla z27.h, z15.h, z6.h[3]\n"
+ "fmla z31.h, z15.h, z7.h[3]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[4]\n"
+ "fmla z20.h, z8.h, z5.h[4]\n"
+ "fmla z24.h, z8.h, z6.h[4]\n"
+ "fmla z28.h, z8.h, z7.h[4]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z4.h[4]\n"
+ "fmla z21.h, z9.h, z5.h[4]\n"
+ "fmla z25.h, z9.h, z6.h[4]\n"
+ "fmla z29.h, z9.h, z7.h[4]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[4]\n"
+ "fmla z22.h, z10.h, z5.h[4]\n"
+ "fmla z26.h, z10.h, z6.h[4]\n"
+ "fmla z30.h, z10.h, z7.h[4]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[4]\n"
+ "fmla z23.h, z11.h, z5.h[4]\n"
+ "fmla z27.h, z11.h, z6.h[4]\n"
+ "fmla z31.h, z11.h, z7.h[4]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[5]\n"
+ "fmla z20.h, z12.h, z5.h[5]\n"
+ "fmla z24.h, z12.h, z6.h[5]\n"
+ "fmla z28.h, z12.h, z7.h[5]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.h, z13.h, z4.h[5]\n"
+ "fmla z21.h, z13.h, z5.h[5]\n"
+ "fmla z25.h, z13.h, z6.h[5]\n"
+ "fmla z29.h, z13.h, z7.h[5]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[5]\n"
+ "fmla z22.h, z14.h, z5.h[5]\n"
+ "fmla z26.h, z14.h, z6.h[5]\n"
+ "fmla z30.h, z14.h, z7.h[5]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[5]\n"
+ "fmla z23.h, z15.h, z5.h[5]\n"
+ "fmla z27.h, z15.h, z6.h[5]\n"
+ "fmla z31.h, z15.h, z7.h[5]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[6]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z20.h, z8.h, z5.h[6]\n"
+ "fmla z24.h, z8.h, z6.h[6]\n"
+ "fmla z28.h, z8.h, z7.h[6]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "fmla z17.h, z9.h, z4.h[6]\n"
+ "fmla z21.h, z9.h, z5.h[6]\n"
+ "fmla z25.h, z9.h, z6.h[6]\n"
+ "fmla z29.h, z9.h, z7.h[6]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[6]\n"
+ "fmla z22.h, z10.h, z5.h[6]\n"
+ "fmla z26.h, z10.h, z6.h[6]\n"
+ "fmla z30.h, z10.h, z7.h[6]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[6]\n"
+ "fmla z23.h, z11.h, z5.h[6]\n"
+ "fmla z27.h, z11.h, z6.h[6]\n"
+ "fmla z31.h, z11.h, z7.h[6]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[7]\n"
+ "fmla z20.h, z12.h, z5.h[7]\n"
+ "fmla z24.h, z12.h, z6.h[7]\n"
+ "fmla z28.h, z12.h, z7.h[7]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z17.h, z13.h, z4.h[7]\n"
+ "fmla z21.h, z13.h, z5.h[7]\n"
+ "fmla z25.h, z13.h, z6.h[7]\n"
+ "fmla z29.h, z13.h, z7.h[7]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[7]\n"
+ "fmla z22.h, z14.h, z5.h[7]\n"
+ "fmla z26.h, z14.h, z6.h[7]\n"
+ "fmla z30.h, z14.h, z7.h[7]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[7]\n"
+ "fmla z23.h, z15.h, z5.h[7]\n"
+ "fmla z27.h, z15.h, z6.h[7]\n"
+ "fmla z31.h, z15.h, z7.h[7]\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "cbz %[regs], 4f\n"
+ "fmla z16.h, z8.h, z0.h[0]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z20.h, z8.h, z1.h[0]\n"
+ "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
+ "fmla z24.h, z8.h, z2.h[0]\n"
+ "ld1rqh z5.h, p7/z, [a_ptr1]\n"
+ "fmla z28.h, z8.h, z3.h[0]\n"
+ "ld1rqh z6.h, p7/z, [a_ptr2]\n"
+ "fmla z17.h, z9.h, z0.h[0]\n"
+ "ld1rqh z7.h, p7/z, [a_ptr3]\n"
+ "fmla z21.h, z9.h, z1.h[0]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "fmla z25.h, z9.h, z2.h[0]\n"
+ "fmla z29.h, z9.h, z3.h[0]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[0]\n"
+ "fmla z22.h, z10.h, z1.h[0]\n"
+ "fmla z26.h, z10.h, z2.h[0]\n"
+ "fmla z30.h, z10.h, z3.h[0]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[0]\n"
+ "fmla z23.h, z11.h, z1.h[0]\n"
+ "fmla z27.h, z11.h, z2.h[0]\n"
+ "fmla z31.h, z11.h, z3.h[0]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[1]\n"
+ "fmla z20.h, z12.h, z1.h[1]\n"
+ "fmla z24.h, z12.h, z2.h[1]\n"
+ "fmla z28.h, z12.h, z3.h[1]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.h, z13.h, z0.h[1]\n"
+ "fmla z21.h, z13.h, z1.h[1]\n"
+ "fmla z25.h, z13.h, z2.h[1]\n"
+ "fmla z29.h, z13.h, z3.h[1]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[1]\n"
+ "fmla z22.h, z14.h, z1.h[1]\n"
+ "fmla z26.h, z14.h, z2.h[1]\n"
+ "fmla z30.h, z14.h, z3.h[1]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[1]\n"
+ "fmla z23.h, z15.h, z1.h[1]\n"
+ "fmla z27.h, z15.h, z2.h[1]\n"
+ "fmla z31.h, z15.h, z3.h[1]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z20.h, z8.h, z1.h[2]\n"
+ "fmla z24.h, z8.h, z2.h[2]\n"
+ "fmla z28.h, z8.h, z3.h[2]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "fmla z17.h, z9.h, z0.h[2]\n"
+ "fmla z21.h, z9.h, z1.h[2]\n"
+ "fmla z25.h, z9.h, z2.h[2]\n"
+ "fmla z29.h, z9.h, z3.h[2]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[2]\n"
+ "fmla z22.h, z10.h, z1.h[2]\n"
+ "fmla z26.h, z10.h, z2.h[2]\n"
+ "fmla z30.h, z10.h, z3.h[2]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[2]\n"
+ "fmla z23.h, z11.h, z1.h[2]\n"
+ "fmla z27.h, z11.h, z2.h[2]\n"
+ "fmla z31.h, z11.h, z3.h[2]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[3]\n"
+ "fmla z20.h, z12.h, z1.h[3]\n"
+ "fmla z24.h, z12.h, z2.h[3]\n"
+ "fmla z28.h, z12.h, z3.h[3]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z17.h, z13.h, z0.h[3]\n"
+ "fmla z21.h, z13.h, z1.h[3]\n"
+ "fmla z25.h, z13.h, z2.h[3]\n"
+ "fmla z29.h, z13.h, z3.h[3]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[3]\n"
+ "fmla z22.h, z14.h, z1.h[3]\n"
+ "fmla z26.h, z14.h, z2.h[3]\n"
+ "fmla z30.h, z14.h, z3.h[3]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[3]\n"
+ "fmla z23.h, z15.h, z1.h[3]\n"
+ "fmla z27.h, z15.h, z2.h[3]\n"
+ "fmla z31.h, z15.h, z3.h[3]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[4]\n"
+ "fmla z20.h, z8.h, z1.h[4]\n"
+ "fmla z24.h, z8.h, z2.h[4]\n"
+ "fmla z28.h, z8.h, z3.h[4]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[4]\n"
+ "fmla z21.h, z9.h, z1.h[4]\n"
+ "fmla z25.h, z9.h, z2.h[4]\n"
+ "fmla z29.h, z9.h, z3.h[4]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[4]\n"
+ "fmla z22.h, z10.h, z1.h[4]\n"
+ "fmla z26.h, z10.h, z2.h[4]\n"
+ "fmla z30.h, z10.h, z3.h[4]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[4]\n"
+ "fmla z23.h, z11.h, z1.h[4]\n"
+ "fmla z27.h, z11.h, z2.h[4]\n"
+ "fmla z31.h, z11.h, z3.h[4]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[5]\n"
+ "fmla z20.h, z12.h, z1.h[5]\n"
+ "fmla z24.h, z12.h, z2.h[5]\n"
+ "fmla z28.h, z12.h, z3.h[5]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.h, z13.h, z0.h[5]\n"
+ "fmla z21.h, z13.h, z1.h[5]\n"
+ "fmla z25.h, z13.h, z2.h[5]\n"
+ "fmla z29.h, z13.h, z3.h[5]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[5]\n"
+ "fmla z22.h, z14.h, z1.h[5]\n"
+ "fmla z26.h, z14.h, z2.h[5]\n"
+ "fmla z30.h, z14.h, z3.h[5]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[5]\n"
+ "fmla z23.h, z15.h, z1.h[5]\n"
+ "fmla z27.h, z15.h, z2.h[5]\n"
+ "fmla z31.h, z15.h, z3.h[5]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[6]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z20.h, z8.h, z1.h[6]\n"
+ "fmla z24.h, z8.h, z2.h[6]\n"
+ "fmla z28.h, z8.h, z3.h[6]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "fmla z17.h, z9.h, z0.h[6]\n"
+ "fmla z21.h, z9.h, z1.h[6]\n"
+ "fmla z25.h, z9.h, z2.h[6]\n"
+ "fmla z29.h, z9.h, z3.h[6]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[6]\n"
+ "fmla z22.h, z10.h, z1.h[6]\n"
+ "fmla z26.h, z10.h, z2.h[6]\n"
+ "fmla z30.h, z10.h, z3.h[6]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[6]\n"
+ "fmla z23.h, z11.h, z1.h[6]\n"
+ "fmla z27.h, z11.h, z2.h[6]\n"
+ "fmla z31.h, z11.h, z3.h[6]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[7]\n"
+ "fmla z20.h, z12.h, z1.h[7]\n"
+ "fmla z24.h, z12.h, z2.h[7]\n"
+ "fmla z28.h, z12.h, z3.h[7]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z17.h, z13.h, z0.h[7]\n"
+ "fmla z21.h, z13.h, z1.h[7]\n"
+ "fmla z25.h, z13.h, z2.h[7]\n"
+ "fmla z29.h, z13.h, z3.h[7]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[7]\n"
+ "fmla z22.h, z14.h, z1.h[7]\n"
+ "fmla z26.h, z14.h, z2.h[7]\n"
+ "fmla z30.h, z14.h, z3.h[7]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[7]\n"
+ "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
+ "fmla z23.h, z15.h, z1.h[7]\n"
+ "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n"
+ "fmla z27.h, z15.h, z2.h[7]\n"
+ "ld1rqh z2.h, p6/z, [a_ptr2, #0x10]\n"
+ "fmla z31.h, z15.h, z3.h[7]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[0]\n"
+ "ld1rqh z3.h, p6/z, [a_ptr3, #0x10]\n"
+ "fmla z20.h, z8.h, z5.h[0]\n"
+ "fmla z24.h, z8.h, z6.h[0]\n"
+ "fmla z28.h, z8.h, z7.h[0]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z4.h[0]\n"
+ "fmla z21.h, z9.h, z5.h[0]\n"
+ "fmla z25.h, z9.h, z6.h[0]\n"
+ "fmla z29.h, z9.h, z7.h[0]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[0]\n"
+ "fmla z22.h, z10.h, z5.h[0]\n"
+ "fmla z26.h, z10.h, z6.h[0]\n"
+ "fmla z30.h, z10.h, z7.h[0]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[0]\n"
+ "fmla z23.h, z11.h, z5.h[0]\n"
+ "fmla z27.h, z11.h, z6.h[0]\n"
+ "fmla z31.h, z11.h, z7.h[0]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[1]\n"
+ "fmla z20.h, z12.h, z5.h[1]\n"
+ "fmla z24.h, z12.h, z6.h[1]\n"
+ "fmla z28.h, z12.h, z7.h[1]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.h, z13.h, z4.h[1]\n"
+ "fmla z21.h, z13.h, z5.h[1]\n"
+ "fmla z25.h, z13.h, z6.h[1]\n"
+ "fmla z29.h, z13.h, z7.h[1]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[1]\n"
+ "fmla z22.h, z14.h, z5.h[1]\n"
+ "fmla z26.h, z14.h, z6.h[1]\n"
+ "fmla z30.h, z14.h, z7.h[1]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[1]\n"
+ "fmla z23.h, z15.h, z5.h[1]\n"
+ "fmla z27.h, z15.h, z6.h[1]\n"
+ "fmla z31.h, z15.h, z7.h[1]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z20.h, z8.h, z5.h[2]\n"
+ "fmla z24.h, z8.h, z6.h[2]\n"
+ "fmla z28.h, z8.h, z7.h[2]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "fmla z17.h, z9.h, z4.h[2]\n"
+ "fmla z21.h, z9.h, z5.h[2]\n"
+ "fmla z25.h, z9.h, z6.h[2]\n"
+ "fmla z29.h, z9.h, z7.h[2]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[2]\n"
+ "fmla z22.h, z10.h, z5.h[2]\n"
+ "fmla z26.h, z10.h, z6.h[2]\n"
+ "fmla z30.h, z10.h, z7.h[2]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[2]\n"
+ "fmla z23.h, z11.h, z5.h[2]\n"
+ "fmla z27.h, z11.h, z6.h[2]\n"
+ "fmla z31.h, z11.h, z7.h[2]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[3]\n"
+ "fmla z20.h, z12.h, z5.h[3]\n"
+ "fmla z24.h, z12.h, z6.h[3]\n"
+ "fmla z28.h, z12.h, z7.h[3]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z17.h, z13.h, z4.h[3]\n"
+ "fmla z21.h, z13.h, z5.h[3]\n"
+ "fmla z25.h, z13.h, z6.h[3]\n"
+ "fmla z29.h, z13.h, z7.h[3]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[3]\n"
+ "fmla z22.h, z14.h, z5.h[3]\n"
+ "fmla z26.h, z14.h, z6.h[3]\n"
+ "fmla z30.h, z14.h, z7.h[3]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[3]\n"
+ "fmla z23.h, z15.h, z5.h[3]\n"
+ "fmla z27.h, z15.h, z6.h[3]\n"
+ "fmla z31.h, z15.h, z7.h[3]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[4]\n"
+ "fmla z20.h, z8.h, z5.h[4]\n"
+ "fmla z24.h, z8.h, z6.h[4]\n"
+ "fmla z28.h, z8.h, z7.h[4]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z4.h[4]\n"
+ "fmla z21.h, z9.h, z5.h[4]\n"
+ "fmla z25.h, z9.h, z6.h[4]\n"
+ "fmla z29.h, z9.h, z7.h[4]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[4]\n"
+ "fmla z22.h, z10.h, z5.h[4]\n"
+ "fmla z26.h, z10.h, z6.h[4]\n"
+ "fmla z30.h, z10.h, z7.h[4]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[4]\n"
+ "fmla z23.h, z11.h, z5.h[4]\n"
+ "fmla z27.h, z11.h, z6.h[4]\n"
+ "fmla z31.h, z11.h, z7.h[4]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[5]\n"
+ "fmla z20.h, z12.h, z5.h[5]\n"
+ "fmla z24.h, z12.h, z6.h[5]\n"
+ "fmla z28.h, z12.h, z7.h[5]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.h, z13.h, z4.h[5]\n"
+ "fmla z21.h, z13.h, z5.h[5]\n"
+ "fmla z25.h, z13.h, z6.h[5]\n"
+ "fmla z29.h, z13.h, z7.h[5]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[5]\n"
+ "fmla z22.h, z14.h, z5.h[5]\n"
+ "fmla z26.h, z14.h, z6.h[5]\n"
+ "fmla z30.h, z14.h, z7.h[5]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[5]\n"
+ "fmla z23.h, z15.h, z5.h[5]\n"
+ "fmla z27.h, z15.h, z6.h[5]\n"
+ "fmla z31.h, z15.h, z7.h[5]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[6]\n"
+ "fmla z20.h, z8.h, z5.h[6]\n"
+ "fmla z24.h, z8.h, z6.h[6]\n"
+ "fmla z28.h, z8.h, z7.h[6]\n"
+ "fmla z17.h, z9.h, z4.h[6]\n"
+ "fmla z21.h, z9.h, z5.h[6]\n"
+ "fmla z25.h, z9.h, z6.h[6]\n"
+ "fmla z29.h, z9.h, z7.h[6]\n"
+ "fmla z18.h, z10.h, z4.h[6]\n"
+ "fmla z22.h, z10.h, z5.h[6]\n"
+ "fmla z26.h, z10.h, z6.h[6]\n"
+ "fmla z30.h, z10.h, z7.h[6]\n"
+ "fmla z19.h, z11.h, z4.h[6]\n"
+ "fmla z23.h, z11.h, z5.h[6]\n"
+ "fmla z27.h, z11.h, z6.h[6]\n"
+ "fmla z31.h, z11.h, z7.h[6]\n"
+ "fmla z16.h, z12.h, z4.h[7]\n"
+ "fmla z20.h, z12.h, z5.h[7]\n"
+ "fmla z24.h, z12.h, z6.h[7]\n"
+ "fmla z28.h, z12.h, z7.h[7]\n"
+ "fmla z17.h, z13.h, z4.h[7]\n"
+ "fmla z21.h, z13.h, z5.h[7]\n"
+ "fmla z25.h, z13.h, z6.h[7]\n"
+ "fmla z29.h, z13.h, z7.h[7]\n"
+ "fmla z18.h, z14.h, z4.h[7]\n"
+ "fmla z22.h, z14.h, z5.h[7]\n"
+ "fmla z26.h, z14.h, z6.h[7]\n"
+ "fmla z30.h, z14.h, z7.h[7]\n"
+ "fmla z19.h, z15.h, z4.h[7]\n"
+ "fmla z23.h, z15.h, z5.h[7]\n"
+ "fmla z27.h, z15.h, z6.h[7]\n"
+ "fmla z31.h, z15.h, z7.h[7]\n"
+ "cbz %[blocks], 5f\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[0]\n"
+ "fmla z20.h, z8.h, z1.h[0]\n"
+ "fmla z24.h, z8.h, z2.h[0]\n"
+ "fmla z28.h, z8.h, z3.h[0]\n"
+ "fmla z17.h, z9.h, z0.h[0]\n"
+ "fmla z21.h, z9.h, z1.h[0]\n"
+ "fmla z25.h, z9.h, z2.h[0]\n"
+ "fmla z29.h, z9.h, z3.h[0]\n"
+ "fmla z18.h, z10.h, z0.h[0]\n"
+ "fmla z22.h, z10.h, z1.h[0]\n"
+ "fmla z26.h, z10.h, z2.h[0]\n"
+ "fmla z30.h, z10.h, z3.h[0]\n"
+ "fmla z19.h, z11.h, z0.h[0]\n"
+ "fmla z23.h, z11.h, z1.h[0]\n"
+ "fmla z27.h, z11.h, z2.h[0]\n"
+ "fmla z31.h, z11.h, z3.h[0]\n"
+ "b.eq 5f\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[1]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z20.h, z12.h, z1.h[1]\n"
+ "fmla z24.h, z12.h, z2.h[1]\n"
+ "fmla z28.h, z12.h, z3.h[1]\n"
+ "fmla z17.h, z13.h, z0.h[1]\n"
+ "fmla z21.h, z13.h, z1.h[1]\n"
+ "fmla z25.h, z13.h, z2.h[1]\n"
+ "fmla z29.h, z13.h, z3.h[1]\n"
+ "fmla z18.h, z14.h, z0.h[1]\n"
+ "fmla z22.h, z14.h, z1.h[1]\n"
+ "fmla z26.h, z14.h, z2.h[1]\n"
+ "fmla z30.h, z14.h, z3.h[1]\n"
+ "fmla z19.h, z15.h, z0.h[1]\n"
+ "fmla z23.h, z15.h, z1.h[1]\n"
+ "fmla z27.h, z15.h, z2.h[1]\n"
+ "fmla z31.h, z15.h, z3.h[1]\n"
+ "b.eq 5f\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[2]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z20.h, z8.h, z1.h[2]\n"
+ "fmla z24.h, z8.h, z2.h[2]\n"
+ "fmla z28.h, z8.h, z3.h[2]\n"
+ "fmla z17.h, z9.h, z0.h[2]\n"
+ "fmla z21.h, z9.h, z1.h[2]\n"
+ "fmla z25.h, z9.h, z2.h[2]\n"
+ "fmla z29.h, z9.h, z3.h[2]\n"
+ "fmla z18.h, z10.h, z0.h[2]\n"
+ "fmla z22.h, z10.h, z1.h[2]\n"
+ "fmla z26.h, z10.h, z2.h[2]\n"
+ "fmla z30.h, z10.h, z3.h[2]\n"
+ "fmla z19.h, z11.h, z0.h[2]\n"
+ "fmla z23.h, z11.h, z1.h[2]\n"
+ "fmla z27.h, z11.h, z2.h[2]\n"
+ "fmla z31.h, z11.h, z3.h[2]\n"
+ "b.eq 5f\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[3]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z20.h, z12.h, z1.h[3]\n"
+ "fmla z24.h, z12.h, z2.h[3]\n"
+ "fmla z28.h, z12.h, z3.h[3]\n"
+ "fmla z17.h, z13.h, z0.h[3]\n"
+ "fmla z21.h, z13.h, z1.h[3]\n"
+ "fmla z25.h, z13.h, z2.h[3]\n"
+ "fmla z29.h, z13.h, z3.h[3]\n"
+ "fmla z18.h, z14.h, z0.h[3]\n"
+ "fmla z22.h, z14.h, z1.h[3]\n"
+ "fmla z26.h, z14.h, z2.h[3]\n"
+ "fmla z30.h, z14.h, z3.h[3]\n"
+ "fmla z19.h, z15.h, z0.h[3]\n"
+ "fmla z23.h, z15.h, z1.h[3]\n"
+ "fmla z27.h, z15.h, z2.h[3]\n"
+ "fmla z31.h, z15.h, z3.h[3]\n"
+ "b.eq 5f\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[4]\n"
+ "fmla z20.h, z8.h, z1.h[4]\n"
+ "fmla z24.h, z8.h, z2.h[4]\n"
+ "fmla z28.h, z8.h, z3.h[4]\n"
+ "fmla z17.h, z9.h, z0.h[4]\n"
+ "fmla z21.h, z9.h, z1.h[4]\n"
+ "fmla z25.h, z9.h, z2.h[4]\n"
+ "fmla z29.h, z9.h, z3.h[4]\n"
+ "fmla z18.h, z10.h, z0.h[4]\n"
+ "fmla z22.h, z10.h, z1.h[4]\n"
+ "fmla z26.h, z10.h, z2.h[4]\n"
+ "fmla z30.h, z10.h, z3.h[4]\n"
+ "fmla z19.h, z11.h, z0.h[4]\n"
+ "fmla z23.h, z11.h, z1.h[4]\n"
+ "fmla z27.h, z11.h, z2.h[4]\n"
+ "fmla z31.h, z11.h, z3.h[4]\n"
+ "b.eq 5f\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[5]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z20.h, z12.h, z1.h[5]\n"
+ "fmla z24.h, z12.h, z2.h[5]\n"
+ "fmla z28.h, z12.h, z3.h[5]\n"
+ "fmla z17.h, z13.h, z0.h[5]\n"
+ "fmla z21.h, z13.h, z1.h[5]\n"
+ "fmla z25.h, z13.h, z2.h[5]\n"
+ "fmla z29.h, z13.h, z3.h[5]\n"
+ "fmla z18.h, z14.h, z0.h[5]\n"
+ "fmla z22.h, z14.h, z1.h[5]\n"
+ "fmla z26.h, z14.h, z2.h[5]\n"
+ "fmla z30.h, z14.h, z3.h[5]\n"
+ "fmla z19.h, z15.h, z0.h[5]\n"
+ "fmla z23.h, z15.h, z1.h[5]\n"
+ "fmla z27.h, z15.h, z2.h[5]\n"
+ "fmla z31.h, z15.h, z3.h[5]\n"
+ "b.eq 5f\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[6]\n"
+ "fmla z20.h, z8.h, z1.h[6]\n"
+ "fmla z24.h, z8.h, z2.h[6]\n"
+ "fmla z28.h, z8.h, z3.h[6]\n"
+ "fmla z17.h, z9.h, z0.h[6]\n"
+ "fmla z21.h, z9.h, z1.h[6]\n"
+ "fmla z25.h, z9.h, z2.h[6]\n"
+ "fmla z29.h, z9.h, z3.h[6]\n"
+ "fmla z18.h, z10.h, z0.h[6]\n"
+ "fmla z22.h, z10.h, z1.h[6]\n"
+ "fmla z26.h, z10.h, z2.h[6]\n"
+ "fmla z30.h, z10.h, z3.h[6]\n"
+ "fmla z19.h, z11.h, z0.h[6]\n"
+ "fmla z23.h, z11.h, z1.h[6]\n"
+ "fmla z27.h, z11.h, z2.h[6]\n"
+ "fmla z31.h, z11.h, z3.h[6]\n"
+ "b 5f\n"
+ "4:\n"
+ "fmla z16.h, z8.h, z0.h[0]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z20.h, z8.h, z1.h[0]\n"
+ "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
+ "fmla z24.h, z8.h, z2.h[0]\n"
+ "ld1rqh z5.h, p6/z, [a_ptr1]\n"
+ "fmla z28.h, z8.h, z3.h[0]\n"
+ "ld1rqh z6.h, p6/z, [a_ptr2]\n"
+ "fmla z17.h, z9.h, z0.h[0]\n"
+ "ld1rqh z7.h, p6/z, [a_ptr3]\n"
+ "fmla z21.h, z9.h, z1.h[0]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "fmla z25.h, z9.h, z2.h[0]\n"
+ "fmla z29.h, z9.h, z3.h[0]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[0]\n"
+ "fmla z22.h, z10.h, z1.h[0]\n"
+ "fmla z26.h, z10.h, z2.h[0]\n"
+ "fmla z30.h, z10.h, z3.h[0]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[0]\n"
+ "fmla z23.h, z11.h, z1.h[0]\n"
+ "fmla z27.h, z11.h, z2.h[0]\n"
+ "fmla z31.h, z11.h, z3.h[0]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[1]\n"
+ "fmla z20.h, z12.h, z1.h[1]\n"
+ "fmla z24.h, z12.h, z2.h[1]\n"
+ "fmla z28.h, z12.h, z3.h[1]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.h, z13.h, z0.h[1]\n"
+ "fmla z21.h, z13.h, z1.h[1]\n"
+ "fmla z25.h, z13.h, z2.h[1]\n"
+ "fmla z29.h, z13.h, z3.h[1]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[1]\n"
+ "fmla z22.h, z14.h, z1.h[1]\n"
+ "fmla z26.h, z14.h, z2.h[1]\n"
+ "fmla z30.h, z14.h, z3.h[1]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[1]\n"
+ "fmla z23.h, z15.h, z1.h[1]\n"
+ "fmla z27.h, z15.h, z2.h[1]\n"
+ "fmla z31.h, z15.h, z3.h[1]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "fmla z20.h, z8.h, z1.h[2]\n"
+ "fmla z24.h, z8.h, z2.h[2]\n"
+ "fmla z28.h, z8.h, z3.h[2]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "fmla z17.h, z9.h, z0.h[2]\n"
+ "fmla z21.h, z9.h, z1.h[2]\n"
+ "fmla z25.h, z9.h, z2.h[2]\n"
+ "fmla z29.h, z9.h, z3.h[2]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[2]\n"
+ "fmla z22.h, z10.h, z1.h[2]\n"
+ "fmla z26.h, z10.h, z2.h[2]\n"
+ "fmla z30.h, z10.h, z3.h[2]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[2]\n"
+ "fmla z23.h, z11.h, z1.h[2]\n"
+ "fmla z27.h, z11.h, z2.h[2]\n"
+ "fmla z31.h, z11.h, z3.h[2]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[3]\n"
+ "fmla z20.h, z12.h, z1.h[3]\n"
+ "fmla z24.h, z12.h, z2.h[3]\n"
+ "fmla z28.h, z12.h, z3.h[3]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "fmla z17.h, z13.h, z0.h[3]\n"
+ "fmla z21.h, z13.h, z1.h[3]\n"
+ "fmla z25.h, z13.h, z2.h[3]\n"
+ "fmla z29.h, z13.h, z3.h[3]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[3]\n"
+ "fmla z22.h, z14.h, z1.h[3]\n"
+ "fmla z26.h, z14.h, z2.h[3]\n"
+ "fmla z30.h, z14.h, z3.h[3]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[3]\n"
+ "fmla z23.h, z15.h, z1.h[3]\n"
+ "fmla z27.h, z15.h, z2.h[3]\n"
+ "fmla z31.h, z15.h, z3.h[3]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[4]\n"
+ "fmla z20.h, z8.h, z1.h[4]\n"
+ "fmla z24.h, z8.h, z2.h[4]\n"
+ "fmla z28.h, z8.h, z3.h[4]\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[4]\n"
+ "fmla z21.h, z9.h, z1.h[4]\n"
+ "fmla z25.h, z9.h, z2.h[4]\n"
+ "fmla z29.h, z9.h, z3.h[4]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[4]\n"
+ "fmla z22.h, z10.h, z1.h[4]\n"
+ "fmla z26.h, z10.h, z2.h[4]\n"
+ "fmla z30.h, z10.h, z3.h[4]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[4]\n"
+ "fmla z23.h, z11.h, z1.h[4]\n"
+ "fmla z27.h, z11.h, z2.h[4]\n"
+ "fmla z31.h, z11.h, z3.h[4]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[5]\n"
+ "fmla z20.h, z12.h, z1.h[5]\n"
+ "fmla z24.h, z12.h, z2.h[5]\n"
+ "fmla z28.h, z12.h, z3.h[5]\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "fmla z17.h, z13.h, z0.h[5]\n"
+ "fmla z21.h, z13.h, z1.h[5]\n"
+ "fmla z25.h, z13.h, z2.h[5]\n"
+ "fmla z29.h, z13.h, z3.h[5]\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[5]\n"
+ "fmla z22.h, z14.h, z1.h[5]\n"
+ "fmla z26.h, z14.h, z2.h[5]\n"
+ "fmla z30.h, z14.h, z3.h[5]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[5]\n"
+ "fmla z23.h, z15.h, z1.h[5]\n"
+ "fmla z27.h, z15.h, z2.h[5]\n"
+ "fmla z31.h, z15.h, z3.h[5]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[6]\n"
+ "fmla z20.h, z8.h, z1.h[6]\n"
+ "fmla z24.h, z8.h, z2.h[6]\n"
+ "fmla z28.h, z8.h, z3.h[6]\n"
+ "fmla z17.h, z9.h, z0.h[6]\n"
+ "fmla z21.h, z9.h, z1.h[6]\n"
+ "fmla z25.h, z9.h, z2.h[6]\n"
+ "fmla z29.h, z9.h, z3.h[6]\n"
+ "fmla z18.h, z10.h, z0.h[6]\n"
+ "fmla z22.h, z10.h, z1.h[6]\n"
+ "fmla z26.h, z10.h, z2.h[6]\n"
+ "fmla z30.h, z10.h, z3.h[6]\n"
+ "fmla z19.h, z11.h, z0.h[6]\n"
+ "fmla z23.h, z11.h, z1.h[6]\n"
+ "fmla z27.h, z11.h, z2.h[6]\n"
+ "fmla z31.h, z11.h, z3.h[6]\n"
+ "fmla z16.h, z12.h, z0.h[7]\n"
+ "fmla z20.h, z12.h, z1.h[7]\n"
+ "fmla z24.h, z12.h, z2.h[7]\n"
+ "fmla z28.h, z12.h, z3.h[7]\n"
+ "fmla z17.h, z13.h, z0.h[7]\n"
+ "fmla z21.h, z13.h, z1.h[7]\n"
+ "fmla z25.h, z13.h, z2.h[7]\n"
+ "fmla z29.h, z13.h, z3.h[7]\n"
+ "fmla z18.h, z14.h, z0.h[7]\n"
+ "fmla z22.h, z14.h, z1.h[7]\n"
+ "fmla z26.h, z14.h, z2.h[7]\n"
+ "fmla z30.h, z14.h, z3.h[7]\n"
+ "fmla z19.h, z15.h, z0.h[7]\n"
+ "fmla z23.h, z15.h, z1.h[7]\n"
+ "fmla z27.h, z15.h, z2.h[7]\n"
+ "fmla z31.h, z15.h, z3.h[7]\n"
+ "cbz %[blocks], 5f\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[0]\n"
+ "fmla z20.h, z8.h, z5.h[0]\n"
+ "fmla z24.h, z8.h, z6.h[0]\n"
+ "fmla z28.h, z8.h, z7.h[0]\n"
+ "fmla z17.h, z9.h, z4.h[0]\n"
+ "fmla z21.h, z9.h, z5.h[0]\n"
+ "fmla z25.h, z9.h, z6.h[0]\n"
+ "fmla z29.h, z9.h, z7.h[0]\n"
+ "fmla z18.h, z10.h, z4.h[0]\n"
+ "fmla z22.h, z10.h, z5.h[0]\n"
+ "fmla z26.h, z10.h, z6.h[0]\n"
+ "fmla z30.h, z10.h, z7.h[0]\n"
+ "fmla z19.h, z11.h, z4.h[0]\n"
+ "fmla z23.h, z11.h, z5.h[0]\n"
+ "fmla z27.h, z11.h, z6.h[0]\n"
+ "fmla z31.h, z11.h, z7.h[0]\n"
+ "b.eq 5f\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[1]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z20.h, z12.h, z5.h[1]\n"
+ "fmla z24.h, z12.h, z6.h[1]\n"
+ "fmla z28.h, z12.h, z7.h[1]\n"
+ "fmla z17.h, z13.h, z4.h[1]\n"
+ "fmla z21.h, z13.h, z5.h[1]\n"
+ "fmla z25.h, z13.h, z6.h[1]\n"
+ "fmla z29.h, z13.h, z7.h[1]\n"
+ "fmla z18.h, z14.h, z4.h[1]\n"
+ "fmla z22.h, z14.h, z5.h[1]\n"
+ "fmla z26.h, z14.h, z6.h[1]\n"
+ "fmla z30.h, z14.h, z7.h[1]\n"
+ "fmla z19.h, z15.h, z4.h[1]\n"
+ "fmla z23.h, z15.h, z5.h[1]\n"
+ "fmla z27.h, z15.h, z6.h[1]\n"
+ "fmla z31.h, z15.h, z7.h[1]\n"
+ "b.eq 5f\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[2]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z20.h, z8.h, z5.h[2]\n"
+ "fmla z24.h, z8.h, z6.h[2]\n"
+ "fmla z28.h, z8.h, z7.h[2]\n"
+ "fmla z17.h, z9.h, z4.h[2]\n"
+ "fmla z21.h, z9.h, z5.h[2]\n"
+ "fmla z25.h, z9.h, z6.h[2]\n"
+ "fmla z29.h, z9.h, z7.h[2]\n"
+ "fmla z18.h, z10.h, z4.h[2]\n"
+ "fmla z22.h, z10.h, z5.h[2]\n"
+ "fmla z26.h, z10.h, z6.h[2]\n"
+ "fmla z30.h, z10.h, z7.h[2]\n"
+ "fmla z19.h, z11.h, z4.h[2]\n"
+ "fmla z23.h, z11.h, z5.h[2]\n"
+ "fmla z27.h, z11.h, z6.h[2]\n"
+ "fmla z31.h, z11.h, z7.h[2]\n"
+ "b.eq 5f\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[3]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "fmla z20.h, z12.h, z5.h[3]\n"
+ "fmla z24.h, z12.h, z6.h[3]\n"
+ "fmla z28.h, z12.h, z7.h[3]\n"
+ "fmla z17.h, z13.h, z4.h[3]\n"
+ "fmla z21.h, z13.h, z5.h[3]\n"
+ "fmla z25.h, z13.h, z6.h[3]\n"
+ "fmla z29.h, z13.h, z7.h[3]\n"
+ "fmla z18.h, z14.h, z4.h[3]\n"
+ "fmla z22.h, z14.h, z5.h[3]\n"
+ "fmla z26.h, z14.h, z6.h[3]\n"
+ "fmla z30.h, z14.h, z7.h[3]\n"
+ "fmla z19.h, z15.h, z4.h[3]\n"
+ "fmla z23.h, z15.h, z5.h[3]\n"
+ "fmla z27.h, z15.h, z6.h[3]\n"
+ "fmla z31.h, z15.h, z7.h[3]\n"
+ "b.eq 5f\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[4]\n"
+ "fmla z20.h, z8.h, z5.h[4]\n"
+ "fmla z24.h, z8.h, z6.h[4]\n"
+ "fmla z28.h, z8.h, z7.h[4]\n"
+ "fmla z17.h, z9.h, z4.h[4]\n"
+ "fmla z21.h, z9.h, z5.h[4]\n"
+ "fmla z25.h, z9.h, z6.h[4]\n"
+ "fmla z29.h, z9.h, z7.h[4]\n"
+ "fmla z18.h, z10.h, z4.h[4]\n"
+ "fmla z22.h, z10.h, z5.h[4]\n"
+ "fmla z26.h, z10.h, z6.h[4]\n"
+ "fmla z30.h, z10.h, z7.h[4]\n"
+ "fmla z19.h, z11.h, z4.h[4]\n"
+ "fmla z23.h, z11.h, z5.h[4]\n"
+ "fmla z27.h, z11.h, z6.h[4]\n"
+ "fmla z31.h, z11.h, z7.h[4]\n"
+ "b.eq 5f\n"
+ "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[5]\n"
+ "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "fmla z20.h, z12.h, z5.h[5]\n"
+ "fmla z24.h, z12.h, z6.h[5]\n"
+ "fmla z28.h, z12.h, z7.h[5]\n"
+ "fmla z17.h, z13.h, z4.h[5]\n"
+ "fmla z21.h, z13.h, z5.h[5]\n"
+ "fmla z25.h, z13.h, z6.h[5]\n"
+ "fmla z29.h, z13.h, z7.h[5]\n"
+ "fmla z18.h, z14.h, z4.h[5]\n"
+ "fmla z22.h, z14.h, z5.h[5]\n"
+ "fmla z26.h, z14.h, z6.h[5]\n"
+ "fmla z30.h, z14.h, z7.h[5]\n"
+ "fmla z19.h, z15.h, z4.h[5]\n"
+ "fmla z23.h, z15.h, z5.h[5]\n"
+ "fmla z27.h, z15.h, z6.h[5]\n"
+ "fmla z31.h, z15.h, z7.h[5]\n"
+ "b.eq 5f\n"
+ "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+ "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[6]\n"
+ "fmla z20.h, z8.h, z5.h[6]\n"
+ "fmla z24.h, z8.h, z6.h[6]\n"
+ "fmla z28.h, z8.h, z7.h[6]\n"
+ "fmla z17.h, z9.h, z4.h[6]\n"
+ "fmla z21.h, z9.h, z5.h[6]\n"
+ "fmla z25.h, z9.h, z6.h[6]\n"
+ "fmla z29.h, z9.h, z7.h[6]\n"
+ "fmla z18.h, z10.h, z4.h[6]\n"
+ "fmla z22.h, z10.h, z5.h[6]\n"
+ "fmla z26.h, z10.h, z6.h[6]\n"
+ "fmla z30.h, z10.h, z7.h[6]\n"
+ "fmla z19.h, z11.h, z4.h[6]\n"
+ "fmla z23.h, z11.h, z5.h[6]\n"
+ "fmla z27.h, z11.h, z6.h[6]\n"
+ "fmla z31.h, z11.h, z7.h[6]\n"
+ "5:\n"
+ "st1h z16.h, p0, [%[c_ptr0]]\n"
+ "st1h z17.h, p1, [%[c_ptr0], #1, MUL VL]\n"
+ "st1h z18.h, p2, [%[c_ptr0], #2, MUL VL]\n"
+ "st1h z19.h, p3, [%[c_ptr0], #3, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #4\n"
+ "st1h z20.h, p0, [c_ptr1]\n"
+ "st1h z21.h, p1, [c_ptr1, #1, MUL VL]\n"
+ "st1h z22.h, p2, [c_ptr1, #2, MUL VL]\n"
+ "st1h z23.h, p3, [c_ptr1, #3, MUL VL]\n"
+ "st1h z24.h, p0, [c_ptr2]\n"
+ "st1h z25.h, p1, [c_ptr2, #1, MUL VL]\n"
+ "st1h z26.h, p2, [c_ptr2, #2, MUL VL]\n"
+ "st1h z27.h, p3, [c_ptr2, #3, MUL VL]\n"
+ "st1h z28.h, p0, [c_ptr3]\n"
+ "st1h z29.h, p1, [c_ptr3, #1, MUL VL]\n"
+ "st1h z30.h, p2, [c_ptr3, #2, MUL VL]\n"
+ "st1h z31.h, p3, [c_ptr3, #3, MUL VL]\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers)
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
+ );
+ break;
+ }
+ }
+ }
+}
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4.hpp
new file mode 100644
index 0000000000..ffd7918b7a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4.hpp
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __ARM_FEATURE_SVE
+
+#include <cstdint>
+#include "../std_transforms_sve.hpp"
+
+namespace arm_gemm
+{
+
+// Kernel entry point, defined in sve_hybrid_s8s32_dot_4VLx4/generic.cpp; arguments in order: A, lda, B, C, ldc, beta, M, N, K.
+void sve_hybrid_s8s32_dot_4VLx4(const int8_t *, int, const int8_t *, int32_t *, int, int32_t, int, int, int);
+// Kernel descriptor for sve_hybrid_s8s32_dot_4VLx4: int8_t operands, int32_t results, blocking of 4 rows by 4 * get_vector_length<int32_t>() columns.
+class hybrid_s8s32_dot_4VLx4
+{
+public:
+ typedef int8_t operand_type;
+ typedef int32_t result_type;
+ // Function-pointer type with the same parameter list as the sve_hybrid_s8s32_dot_4VLx4 declaration above.
+ typedef void (*kern_type)(const int8_t *, int, const int8_t *, int32_t *, int, int32_t, int, int, int);
+
+ /* Kernel blocking parameters */
+ static unsigned int out_height()
+ {
+ return 4;
+ }
+ // Not a compile-time constant: scales with get_vector_length<int32_t>() (presumably int32_t lanes per SVE vector - TODO confirm in utils.hpp).
+ static unsigned int out_width()
+ {
+ return get_vector_length<int32_t>() * 4;
+ }
+ // NOTE(review): presumably pairs with generic.cpp rounding K up to a multiple of 4 (K_stride) - confirm.
+ static unsigned int k_unroll()
+ {
+ return 4;
+ }
+ // NOTE(review): the 4, 4, 4 template arguments presumably mirror the blocking values above - confirm against std_transforms_sve.hpp.
+ StdTransformsSVE<operand_type, result_type, 4, 4, 4> transforms = {};
+
+ // Default to the generic kernel
+ kern_type kernel=sve_hybrid_s8s32_dot_4VLx4;
+ // ci is never read: the constructor body is empty, so kernel keeps the default initializer given above.
+ hybrid_s8s32_dot_4VLx4(const CPUInfo *ci)
+ {
+
+ }
+};
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4/generic.cpp
new file mode 100644
index 0000000000..673f186524
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4/generic.cpp
@@ -0,0 +1,2150 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+
+#include <algorithm>
+
+#include <cstdint>
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void sve_hybrid_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int32_t beta, int M, int N, int K) {
+ const long beta0 = (beta == 0);
+ const int K_stride = ((K + 3) / 4) * 4;
+ const long loops_count = ((K + 16) / 32) - 1;
+ K -= loops_count * 32;
+ const long regs_count = (K / 16) - 1;
+ K -= (regs_count + 1) * 16;
+ const long leftovers = K;
+ const long blocks_count = (K + 3) / 4;
+
+ for (int y=0; y<M; y+=4) {
+ const int8_t * const a_ptr0_base = A + (y * lda);
+ const unsigned long ldab = lda * sizeof(int8_t);
+
+ int32_t *c_ptr0 = C + (y * ldc);
+ const unsigned long ldcb = ldc * sizeof(int32_t);
+
+ for (int x0=0; x0<N; x0+=(4 * get_vector_length<int32_t>())) {
+ const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<int32_t>()));
+ const int32_t *betaptr = &beta;
+ long loops = loops_count;
+ long regs = regs_count;
+ long temp = 0;
+ long blocks = blocks_count;
+ const int8_t *a_ptr0 = a_ptr0_base;
+ const int8_t *b_ptr0 = B + (K_stride * x0);
+
+ switch(M-y) {
+ case 1:
+ __asm __volatile (
+ "whilelt p6.b, %[temp], %[leftovers]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "ptrue p7.b\n"
+ "whilelt p1.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p2.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p3.s, %[temp], %[width]\n"
+ "cbz %[beta0], 1f\n"
+ "mov z16.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mov z17.s, #0\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "mov z18.s, #0\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "mov z19.s, #0\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+ "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+ "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+ "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+ "mul z16.s, p7/m, z16.s, z15.s\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mul z17.s, p7/m, z17.s, z15.s\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "mul z18.s, p7/m, z18.s, z15.s\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "mul z19.s, p7/m, z19.s, z15.s\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "sdot z16.s, z12.b, z0.b[3]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "sdot z17.s, z13.b, z0.b[3]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "sdot z18.s, z14.b, z0.b[3]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "sdot z19.s, z15.b, z0.b[3]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "sdot z16.s, z8.b, z4.b[0]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "sdot z17.s, z9.b, z4.b[0]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "sdot z18.s, z10.b, z4.b[0]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "sdot z19.s, z11.b, z4.b[0]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "sdot z16.s, z12.b, z4.b[1]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
+ "sdot z17.s, z13.b, z4.b[1]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z18.s, z14.b, z4.b[1]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "sdot z19.s, z15.b, z4.b[1]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "sdot z16.s, z8.b, z4.b[2]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "sdot z17.s, z9.b, z4.b[2]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "sdot z18.s, z10.b, z4.b[2]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "sdot z19.s, z11.b, z4.b[2]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "sdot z16.s, z12.b, z4.b[3]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "sdot z17.s, z13.b, z4.b[3]\n"
+ "sdot z18.s, z14.b, z4.b[3]\n"
+ "sdot z19.s, z15.b, z4.b[3]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "cbz %[regs], 4f\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "sdot z16.s, z12.b, z0.b[3]\n"
+ "sdot z17.s, z13.b, z0.b[3]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "sdot z18.s, z14.b, z0.b[3]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "sdot z19.s, z15.b, z0.b[3]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "sdot z16.s, z8.b, z4.b[0]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "sdot z17.s, z9.b, z4.b[0]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "sdot z18.s, z10.b, z4.b[0]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "sdot z19.s, z11.b, z4.b[0]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "sdot z16.s, z12.b, z4.b[1]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
+ "sdot z17.s, z13.b, z4.b[1]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z18.s, z14.b, z4.b[1]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "sdot z19.s, z15.b, z4.b[1]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "sdot z16.s, z8.b, z4.b[2]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "sdot z17.s, z9.b, z4.b[2]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "sdot z18.s, z10.b, z4.b[2]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "sdot z19.s, z11.b, z4.b[2]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "sdot z16.s, z12.b, z4.b[3]\n"
+ "sdot z17.s, z13.b, z4.b[3]\n"
+ "sdot z18.s, z14.b, z4.b[3]\n"
+ "sdot z19.s, z15.b, z4.b[3]\n"
+ "cbz %[blocks], 5f\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "b.eq 5f\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "b.eq 5f\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "b.eq 5f\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "sdot z16.s, z12.b, z0.b[3]\n"
+ "sdot z17.s, z13.b, z0.b[3]\n"
+ "sdot z18.s, z14.b, z0.b[3]\n"
+ "sdot z19.s, z15.b, z0.b[3]\n"
+ "b 5f\n"
+ "4:\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "sdot z16.s, z12.b, z0.b[3]\n"
+ "sdot z17.s, z13.b, z0.b[3]\n"
+ "sdot z18.s, z14.b, z0.b[3]\n"
+ "sdot z19.s, z15.b, z0.b[3]\n"
+ "cbz %[blocks], 5f\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "sdot z16.s, z8.b, z4.b[0]\n"
+ "sdot z17.s, z9.b, z4.b[0]\n"
+ "sdot z18.s, z10.b, z4.b[0]\n"
+ "sdot z19.s, z11.b, z4.b[0]\n"
+ "b.eq 5f\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "sdot z16.s, z12.b, z4.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "sdot z17.s, z13.b, z4.b[1]\n"
+ "sdot z18.s, z14.b, z4.b[1]\n"
+ "sdot z19.s, z15.b, z4.b[1]\n"
+ "b.eq 5f\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "sdot z16.s, z8.b, z4.b[2]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "sdot z17.s, z9.b, z4.b[2]\n"
+ "sdot z18.s, z10.b, z4.b[2]\n"
+ "sdot z19.s, z11.b, z4.b[2]\n"
+ "b.eq 5f\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "sdot z16.s, z12.b, z4.b[3]\n"
+ "sdot z17.s, z13.b, z4.b[3]\n"
+ "sdot z18.s, z14.b, z4.b[3]\n"
+ "sdot z19.s, z15.b, z4.b[3]\n"
+ "5:\n"
+ "st1w z16.s, p0, [%[c_ptr0]]\n"
+ "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+ "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+ "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #4\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers)
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 2:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "c_ptr1 .req X1\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.b, %[temp], %[leftovers]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "ptrue p7.b\n"
+ "whilelt p1.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p2.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p3.s, %[temp], %[width]\n"
+ "cbz %[beta0], 1f\n"
+ "mov z16.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mov z17.s, #0\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mov z18.s, #0\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "mov z19.s, #0\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "mov z20.s, #0\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "mov z21.s, #0\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "mov z22.s, #0\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "mov z23.s, #0\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+ "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+ "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+ "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+ "mul z16.s, p7/m, z16.s, z15.s\n"
+ "ld1w z20.s, p0/z, [c_ptr1]\n"
+ "mul z17.s, p7/m, z17.s, z15.s\n"
+ "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+ "mul z18.s, p7/m, z18.s, z15.s\n"
+ "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+ "mul z19.s, p7/m, z19.s, z15.s\n"
+ "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+ "mul z20.s, p7/m, z20.s, z15.s\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mul z21.s, p7/m, z21.s, z15.s\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mul z22.s, p7/m, z22.s, z15.s\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "mul z23.s, p7/m, z23.s, z15.s\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "sdot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+ "sdot z21.s, z9.b, z1.b[0]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "sdot z22.s, z10.b, z1.b[0]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "sdot z23.s, z11.b, z1.b[0]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "sdot z20.s, z12.b, z1.b[1]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ "sdot z21.s, z13.b, z1.b[1]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "sdot z22.s, z14.b, z1.b[1]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "sdot z23.s, z15.b, z1.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "sdot z20.s, z8.b, z1.b[2]\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "sdot z21.s, z9.b, z1.b[2]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "sdot z22.s, z10.b, z1.b[2]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "sdot z23.s, z11.b, z1.b[2]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "sdot z16.s, z12.b, z0.b[3]\n"
+ "sdot z20.s, z12.b, z1.b[3]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "sdot z17.s, z13.b, z0.b[3]\n"
+ "sdot z21.s, z13.b, z1.b[3]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "sdot z18.s, z14.b, z0.b[3]\n"
+ "sdot z22.s, z14.b, z1.b[3]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "sdot z19.s, z15.b, z0.b[3]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
+ "sdot z23.s, z15.b, z1.b[3]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "sdot z16.s, z8.b, z4.b[0]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n"
+ "sdot z20.s, z8.b, z5.b[0]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z17.s, z9.b, z4.b[0]\n"
+ "sdot z21.s, z9.b, z5.b[0]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "sdot z18.s, z10.b, z4.b[0]\n"
+ "sdot z22.s, z10.b, z5.b[0]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "sdot z19.s, z11.b, z4.b[0]\n"
+ "sdot z23.s, z11.b, z5.b[0]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "sdot z16.s, z12.b, z4.b[1]\n"
+ "sdot z20.s, z12.b, z5.b[1]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "sdot z17.s, z13.b, z4.b[1]\n"
+ "sdot z21.s, z13.b, z5.b[1]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "sdot z18.s, z14.b, z4.b[1]\n"
+ "sdot z22.s, z14.b, z5.b[1]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "sdot z19.s, z15.b, z4.b[1]\n"
+ "sdot z23.s, z15.b, z5.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "sdot z16.s, z8.b, z4.b[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "sdot z20.s, z8.b, z5.b[2]\n"
+ "sdot z17.s, z9.b, z4.b[2]\n"
+ "sdot z21.s, z9.b, z5.b[2]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "sdot z18.s, z10.b, z4.b[2]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "sdot z22.s, z10.b, z5.b[2]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "sdot z19.s, z11.b, z4.b[2]\n"
+ "sdot z23.s, z11.b, z5.b[2]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "sdot z16.s, z12.b, z4.b[3]\n"
+ "sdot z20.s, z12.b, z5.b[3]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "sdot z17.s, z13.b, z4.b[3]\n"
+ "sdot z21.s, z13.b, z5.b[3]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "sdot z18.s, z14.b, z4.b[3]\n"
+ "sdot z22.s, z14.b, z5.b[3]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "sdot z19.s, z15.b, z4.b[3]\n"
+ "sdot z23.s, z15.b, z5.b[3]\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "cbz %[regs], 4f\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "sdot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+ "sdot z21.s, z9.b, z1.b[0]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "sdot z22.s, z10.b, z1.b[0]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "sdot z23.s, z11.b, z1.b[0]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "sdot z20.s, z12.b, z1.b[1]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "sdot z21.s, z13.b, z1.b[1]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "sdot z22.s, z14.b, z1.b[1]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "sdot z23.s, z15.b, z1.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "sdot z20.s, z8.b, z1.b[2]\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "sdot z21.s, z9.b, z1.b[2]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "sdot z22.s, z10.b, z1.b[2]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "sdot z23.s, z11.b, z1.b[2]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "sdot z16.s, z12.b, z0.b[3]\n"
+ "sdot z20.s, z12.b, z1.b[3]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "sdot z17.s, z13.b, z0.b[3]\n"
+ "sdot z21.s, z13.b, z1.b[3]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "sdot z18.s, z14.b, z0.b[3]\n"
+ "sdot z22.s, z14.b, z1.b[3]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "sdot z19.s, z15.b, z0.b[3]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
+ "sdot z23.s, z15.b, z1.b[3]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "sdot z16.s, z8.b, z4.b[0]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
+ "sdot z20.s, z8.b, z5.b[0]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z17.s, z9.b, z4.b[0]\n"
+ "sdot z21.s, z9.b, z5.b[0]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "sdot z18.s, z10.b, z4.b[0]\n"
+ "sdot z22.s, z10.b, z5.b[0]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "sdot z19.s, z11.b, z4.b[0]\n"
+ "sdot z23.s, z11.b, z5.b[0]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "sdot z16.s, z12.b, z4.b[1]\n"
+ "sdot z20.s, z12.b, z5.b[1]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "sdot z17.s, z13.b, z4.b[1]\n"
+ "sdot z21.s, z13.b, z5.b[1]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "sdot z18.s, z14.b, z4.b[1]\n"
+ "sdot z22.s, z14.b, z5.b[1]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "sdot z19.s, z15.b, z4.b[1]\n"
+ "sdot z23.s, z15.b, z5.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "sdot z16.s, z8.b, z4.b[2]\n"
+ "sdot z20.s, z8.b, z5.b[2]\n"
+ "sdot z17.s, z9.b, z4.b[2]\n"
+ "sdot z21.s, z9.b, z5.b[2]\n"
+ "sdot z18.s, z10.b, z4.b[2]\n"
+ "sdot z22.s, z10.b, z5.b[2]\n"
+ "sdot z19.s, z11.b, z4.b[2]\n"
+ "sdot z23.s, z11.b, z5.b[2]\n"
+ "sdot z16.s, z12.b, z4.b[3]\n"
+ "sdot z20.s, z12.b, z5.b[3]\n"
+ "sdot z17.s, z13.b, z4.b[3]\n"
+ "sdot z21.s, z13.b, z5.b[3]\n"
+ "sdot z18.s, z14.b, z4.b[3]\n"
+ "sdot z22.s, z14.b, z5.b[3]\n"
+ "sdot z19.s, z15.b, z4.b[3]\n"
+ "sdot z23.s, z15.b, z5.b[3]\n"
+ "cbz %[blocks], 5f\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "sdot z20.s, z8.b, z1.b[0]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "sdot z21.s, z9.b, z1.b[0]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "sdot z22.s, z10.b, z1.b[0]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "sdot z23.s, z11.b, z1.b[0]\n"
+ "b.eq 5f\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "sdot z20.s, z12.b, z1.b[1]\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "sdot z21.s, z13.b, z1.b[1]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "sdot z22.s, z14.b, z1.b[1]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "sdot z23.s, z15.b, z1.b[1]\n"
+ "b.eq 5f\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "sdot z20.s, z8.b, z1.b[2]\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "sdot z21.s, z9.b, z1.b[2]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "sdot z22.s, z10.b, z1.b[2]\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "sdot z23.s, z11.b, z1.b[2]\n"
+ "b.eq 5f\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "sdot z16.s, z12.b, z0.b[3]\n"
+ "sdot z20.s, z12.b, z1.b[3]\n"
+ "sdot z17.s, z13.b, z0.b[3]\n"
+ "sdot z21.s, z13.b, z1.b[3]\n"
+ "sdot z18.s, z14.b, z0.b[3]\n"
+ "sdot z22.s, z14.b, z1.b[3]\n"
+ "sdot z19.s, z15.b, z0.b[3]\n"
+ "sdot z23.s, z15.b, z1.b[3]\n"
+ "b 5f\n"
+ "4:\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "sdot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr1]\n"
+ "sdot z21.s, z9.b, z1.b[0]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "sdot z22.s, z10.b, z1.b[0]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "sdot z23.s, z11.b, z1.b[0]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "sdot z20.s, z12.b, z1.b[1]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "sdot z21.s, z13.b, z1.b[1]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "sdot z22.s, z14.b, z1.b[1]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "sdot z23.s, z15.b, z1.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "sdot z20.s, z8.b, z1.b[2]\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "sdot z21.s, z9.b, z1.b[2]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "sdot z22.s, z10.b, z1.b[2]\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "sdot z23.s, z11.b, z1.b[2]\n"
+ "sdot z16.s, z12.b, z0.b[3]\n"
+ "sdot z20.s, z12.b, z1.b[3]\n"
+ "sdot z17.s, z13.b, z0.b[3]\n"
+ "sdot z21.s, z13.b, z1.b[3]\n"
+ "sdot z18.s, z14.b, z0.b[3]\n"
+ "sdot z22.s, z14.b, z1.b[3]\n"
+ "sdot z19.s, z15.b, z0.b[3]\n"
+ "sdot z23.s, z15.b, z1.b[3]\n"
+ "cbz %[blocks], 5f\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "sdot z16.s, z8.b, z4.b[0]\n"
+ "sdot z20.s, z8.b, z5.b[0]\n"
+ "sdot z17.s, z9.b, z4.b[0]\n"
+ "sdot z21.s, z9.b, z5.b[0]\n"
+ "sdot z18.s, z10.b, z4.b[0]\n"
+ "sdot z22.s, z10.b, z5.b[0]\n"
+ "sdot z19.s, z11.b, z4.b[0]\n"
+ "sdot z23.s, z11.b, z5.b[0]\n"
+ "b.eq 5f\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "sdot z16.s, z12.b, z4.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "sdot z20.s, z12.b, z5.b[1]\n"
+ "sdot z17.s, z13.b, z4.b[1]\n"
+ "sdot z21.s, z13.b, z5.b[1]\n"
+ "sdot z18.s, z14.b, z4.b[1]\n"
+ "sdot z22.s, z14.b, z5.b[1]\n"
+ "sdot z19.s, z15.b, z4.b[1]\n"
+ "sdot z23.s, z15.b, z5.b[1]\n"
+ "b.eq 5f\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "sdot z16.s, z8.b, z4.b[2]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "sdot z20.s, z8.b, z5.b[2]\n"
+ "sdot z17.s, z9.b, z4.b[2]\n"
+ "sdot z21.s, z9.b, z5.b[2]\n"
+ "sdot z18.s, z10.b, z4.b[2]\n"
+ "sdot z22.s, z10.b, z5.b[2]\n"
+ "sdot z19.s, z11.b, z4.b[2]\n"
+ "sdot z23.s, z11.b, z5.b[2]\n"
+ "b.eq 5f\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "sdot z16.s, z12.b, z4.b[3]\n"
+ "sdot z20.s, z12.b, z5.b[3]\n"
+ "sdot z17.s, z13.b, z4.b[3]\n"
+ "sdot z21.s, z13.b, z5.b[3]\n"
+ "sdot z18.s, z14.b, z4.b[3]\n"
+ "sdot z22.s, z14.b, z5.b[3]\n"
+ "sdot z19.s, z15.b, z4.b[3]\n"
+ "sdot z23.s, z15.b, z5.b[3]\n"
+ "5:\n"
+ "st1w z16.s, p0, [%[c_ptr0]]\n"
+ "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+ "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+ "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #4\n"
+ "st1w z20.s, p0, [c_ptr1]\n"
+ "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
+ "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
+ "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
+ ".unreq a_ptr1\n"
+ ".unreq c_ptr1\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers)
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory"
+ );
+ break;
+ case 3: // Three-pointer variant: A is read via a_ptr0/a_ptr1/a_ptr2, results accumulate in z16-z27 and go to c_ptr0/c_ptr1/c_ptr2.
+ __asm __volatile (
+ "a_ptr1 .req X0\n" // Symbolic names for the scratch registers X0-X3 (all four are listed in the clobbers below).
+ "a_ptr2 .req X1\n"
+ "c_ptr1 .req X2\n"
+ "c_ptr2 .req X3\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n" // a_ptr1 = a_ptr0 + lda
+ "add c_ptr1, %[c_ptr0], %[ldc]\n" // c_ptr1 = c_ptr0 + ldc
+ "add a_ptr2, a_ptr1, %[lda]\n" // a_ptr2 = a_ptr0 + 2*lda
+ "add c_ptr2, c_ptr1, %[ldc]\n" // c_ptr2 = c_ptr0 + 2*ldc
+ "whilelt p6.b, %[temp], %[leftovers]\n" // p6: byte predicate (temp < leftovers); governs the final partial A quadword loads in the tails.
+ "whilelt p0.s, %[temp], %[width]\n" // p0-p3: 32-bit lane predicates for four consecutive output vectors, bounded by width.
+ "incw %[temp], all, mul #1\n" // Step temp by one vector's worth of 32-bit lanes.
+ "ptrue p7.b\n" // p7: all-true predicate for full-width loads and the beta multiply.
+ "whilelt p1.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p2.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p3.s, %[temp], %[width]\n"
+ "cbz %[beta0], 1f\n" // beta0 == 0: go to 1: and start from C scaled by *betaptr; otherwise start from zeroed accumulators.
+ "mov z16.s, #0\n" // Zero the 12 accumulators (z16-z27), interleaved with the first A and B loads.
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" // z0/z1/z2: first 16-byte quadword from each A pointer, replicated across the vector.
+ "mov z17.s, #0\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mov z18.s, #0\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z19.s, #0\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n" // z8-z14: first seven B vectors.
+ "mov z20.s, #0\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "mov z21.s, #0\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "mov z22.s, #0\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "mov z23.s, #0\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "mov z24.s, #0\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "mov z25.s, #0\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "mov z26.s, #0\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" // NOTE(review): b_ptr0 is not advanced yet, so this reads one vector below b_ptr0; z15 is reloaded after the addvl before any use.
+ "mov z27.s, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n" // Step each A pointer past the quadword just loaded.
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n" // Advance b_ptr0 by 8 vectors.
+ "cbz %[loops], 2f\n" // No main-loop iterations requested: go straight to the tail.
+ "b 3f\n"
+ "1:\n" // Alternative prologue: load existing C values and multiply them by the word at betaptr.
+ "ld1rw z15.s, p7/z, [%[betaptr]]\n" // Broadcast the 32-bit value at betaptr into every lane of z15.
+ "ld1w z16.s, p0/z, [%[c_ptr0]]\n" // C loads use p0-p3, so lanes at or beyond width are read as zero.
+ "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+ "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+ "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+ "mul z16.s, p7/m, z16.s, z15.s\n"
+ "ld1w z20.s, p0/z, [c_ptr1]\n"
+ "mul z17.s, p7/m, z17.s, z15.s\n"
+ "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+ "mul z18.s, p7/m, z18.s, z15.s\n"
+ "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+ "mul z19.s, p7/m, z19.s, z15.s\n"
+ "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+ "mul z20.s, p7/m, z20.s, z15.s\n"
+ "ld1w z24.s, p0/z, [c_ptr2]\n"
+ "mul z21.s, p7/m, z21.s, z15.s\n"
+ "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
+ "mul z22.s, p7/m, z22.s, z15.s\n"
+ "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
+ "mul z23.s, p7/m, z23.s, z15.s\n"
+ "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
+ "mul z24.s, p7/m, z24.s, z15.s\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" // Same first A/B loads as the zero-initialised prologue above.
+ "mul z25.s, p7/m, z25.s, z15.s\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mul z26.s, p7/m, z26.s, z15.s\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mul z27.s, p7/m, z27.s, z15.s\n" // Last use of z15 as the beta multiplier; it is reused for B data from here on.
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" // NOTE(review): as in the other prologue, this reads one vector below b_ptr0 and the value is reloaded before use.
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 2f\n"
+ "3:\n" // Main loop: each iteration consumes 0x20 bytes from every A pointer and 32 B vectors.
+ "sdot z16.s, z8.b, z0.b[0]\n" // z16-z19 accumulate against z0 (a_ptr0), z20-z23 against z1 (a_ptr1), z24-z27 against z2 (a_ptr2).
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" // Eighth B vector of the current group (b_ptr0 already points past it).
+ "sdot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" // Prefetch the next A quadwords into z4-z6 while z0-z2 are being consumed.
+ "sdot z24.s, z8.b, z2.b[0]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr2]\n"
+ "sdot z21.s, z9.b, z1.b[0]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n" // Each B register is reloaded right after its last use in the current lane group.
+ "sdot z25.s, z9.b, z2.b[0]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "subs %[loops], %[loops], #0x1\n" // Sets the flags tested by the b.ne at the bottom of the loop; nothing in between writes flags.
+ "sdot z22.s, z10.b, z1.b[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "sdot z26.s, z10.b, z2.b[0]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ "sdot z23.s, z11.b, z1.b[0]\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
+ "sdot z27.s, z11.b, z2.b[0]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "sdot z20.s, z12.b, z1.b[1]\n"
+ "sdot z24.s, z12.b, z2.b[1]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "sdot z21.s, z13.b, z1.b[1]\n"
+ "sdot z25.s, z13.b, z2.b[1]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "sdot z22.s, z14.b, z1.b[1]\n"
+ "sdot z26.s, z14.b, z2.b[1]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "sdot z23.s, z15.b, z1.b[1]\n"
+ "sdot z27.s, z15.b, z2.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n" // Advance B by 16 vectors; the following loads use negative offsets from the new position.
+ "sdot z20.s, z8.b, z1.b[2]\n"
+ "sdot z24.s, z8.b, z2.b[2]\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "sdot z21.s, z9.b, z1.b[2]\n"
+ "sdot z25.s, z9.b, z2.b[2]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "sdot z22.s, z10.b, z1.b[2]\n"
+ "sdot z26.s, z10.b, z2.b[2]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "sdot z23.s, z11.b, z1.b[2]\n"
+ "sdot z27.s, z11.b, z2.b[2]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "sdot z16.s, z12.b, z0.b[3]\n"
+ "sdot z20.s, z12.b, z1.b[3]\n"
+ "sdot z24.s, z12.b, z2.b[3]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "sdot z17.s, z13.b, z0.b[3]\n"
+ "sdot z21.s, z13.b, z1.b[3]\n"
+ "sdot z25.s, z13.b, z2.b[3]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "sdot z18.s, z14.b, z0.b[3]\n"
+ "sdot z22.s, z14.b, z1.b[3]\n"
+ "sdot z26.s, z14.b, z2.b[3]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "sdot z19.s, z15.b, z0.b[3]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n" // z0-z2 are done for this half; reload them with the quadword for the next iteration (pointer was already bumped by 0x20).
+ "sdot z23.s, z15.b, z1.b[3]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n"
+ "sdot z27.s, z15.b, z2.b[3]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "sdot z16.s, z8.b, z4.b[0]\n" // Second half of the iteration: same pattern using the A quadwords held in z4-z6.
+ "ld1rqb z2.b, p7/z, [a_ptr2, #-0x10]\n"
+ "sdot z20.s, z8.b, z5.b[0]\n"
+ "sdot z24.s, z8.b, z6.b[0]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z17.s, z9.b, z4.b[0]\n"
+ "sdot z21.s, z9.b, z5.b[0]\n"
+ "sdot z25.s, z9.b, z6.b[0]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "sdot z18.s, z10.b, z4.b[0]\n"
+ "sdot z22.s, z10.b, z5.b[0]\n"
+ "sdot z26.s, z10.b, z6.b[0]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "sdot z19.s, z11.b, z4.b[0]\n"
+ "sdot z23.s, z11.b, z5.b[0]\n"
+ "sdot z27.s, z11.b, z6.b[0]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "sdot z16.s, z12.b, z4.b[1]\n"
+ "sdot z20.s, z12.b, z5.b[1]\n"
+ "sdot z24.s, z12.b, z6.b[1]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "sdot z17.s, z13.b, z4.b[1]\n"
+ "sdot z21.s, z13.b, z5.b[1]\n"
+ "sdot z25.s, z13.b, z6.b[1]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "sdot z18.s, z14.b, z4.b[1]\n"
+ "sdot z22.s, z14.b, z5.b[1]\n"
+ "sdot z26.s, z14.b, z6.b[1]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "sdot z19.s, z15.b, z4.b[1]\n"
+ "sdot z23.s, z15.b, z5.b[1]\n"
+ "sdot z27.s, z15.b, z6.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "sdot z16.s, z8.b, z4.b[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "sdot z20.s, z8.b, z5.b[2]\n"
+ "sdot z24.s, z8.b, z6.b[2]\n"
+ "sdot z17.s, z9.b, z4.b[2]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "sdot z21.s, z9.b, z5.b[2]\n"
+ "sdot z25.s, z9.b, z6.b[2]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "sdot z18.s, z10.b, z4.b[2]\n"
+ "sdot z22.s, z10.b, z5.b[2]\n"
+ "sdot z26.s, z10.b, z6.b[2]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "sdot z19.s, z11.b, z4.b[2]\n"
+ "sdot z23.s, z11.b, z5.b[2]\n"
+ "sdot z27.s, z11.b, z6.b[2]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "sdot z16.s, z12.b, z4.b[3]\n"
+ "sdot z20.s, z12.b, z5.b[3]\n"
+ "sdot z24.s, z12.b, z6.b[3]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "sdot z17.s, z13.b, z4.b[3]\n"
+ "sdot z21.s, z13.b, z5.b[3]\n"
+ "sdot z25.s, z13.b, z6.b[3]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "sdot z18.s, z14.b, z4.b[3]\n"
+ "sdot z22.s, z14.b, z5.b[3]\n"
+ "sdot z26.s, z14.b, z6.b[3]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "sdot z19.s, z15.b, z4.b[3]\n"
+ "sdot z23.s, z15.b, z5.b[3]\n"
+ "sdot z27.s, z15.b, z6.b[3]\n"
+ "b.ne 3b\n" // Repeat while the loops counter (decremented by the subs above) is non-zero.
+ "2:\n" // Tail: two variants selected by regs.
+ "cbz %[regs], 4f\n" // regs == 0: short tail at 4:. Otherwise: one more full A quadword (z4-z6) plus a final partial one.
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "sdot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" // Full (p7) load of the extra A quadword.
+ "sdot z24.s, z8.b, z2.b[0]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr2]\n"
+ "sdot z21.s, z9.b, z1.b[0]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z25.s, z9.b, z2.b[0]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "sdot z22.s, z10.b, z1.b[0]\n"
+ "sdot z26.s, z10.b, z2.b[0]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "sdot z23.s, z11.b, z1.b[0]\n"
+ "sdot z27.s, z11.b, z2.b[0]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "sdot z20.s, z12.b, z1.b[1]\n"
+ "sdot z24.s, z12.b, z2.b[1]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "sdot z21.s, z13.b, z1.b[1]\n"
+ "sdot z25.s, z13.b, z2.b[1]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "sdot z22.s, z14.b, z1.b[1]\n"
+ "sdot z26.s, z14.b, z2.b[1]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "sdot z23.s, z15.b, z1.b[1]\n"
+ "sdot z27.s, z15.b, z2.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "sdot z20.s, z8.b, z1.b[2]\n"
+ "sdot z24.s, z8.b, z2.b[2]\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "sdot z21.s, z9.b, z1.b[2]\n"
+ "sdot z25.s, z9.b, z2.b[2]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "sdot z22.s, z10.b, z1.b[2]\n"
+ "sdot z26.s, z10.b, z2.b[2]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "sdot z23.s, z11.b, z1.b[2]\n"
+ "sdot z27.s, z11.b, z2.b[2]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "sdot z16.s, z12.b, z0.b[3]\n"
+ "sdot z20.s, z12.b, z1.b[3]\n"
+ "sdot z24.s, z12.b, z2.b[3]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "sdot z17.s, z13.b, z0.b[3]\n"
+ "sdot z21.s, z13.b, z1.b[3]\n"
+ "sdot z25.s, z13.b, z2.b[3]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "sdot z18.s, z14.b, z0.b[3]\n"
+ "sdot z22.s, z14.b, z1.b[3]\n"
+ "sdot z26.s, z14.b, z2.b[3]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "sdot z19.s, z15.b, z0.b[3]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n" // Final, possibly partial A quadword: p6 zeroes the bytes beyond leftovers.
+ "sdot z23.s, z15.b, z1.b[3]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
+ "sdot z27.s, z15.b, z2.b[3]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "sdot z16.s, z8.b, z4.b[0]\n"
+ "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
+ "sdot z20.s, z8.b, z5.b[0]\n"
+ "sdot z24.s, z8.b, z6.b[0]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z17.s, z9.b, z4.b[0]\n"
+ "sdot z21.s, z9.b, z5.b[0]\n"
+ "sdot z25.s, z9.b, z6.b[0]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "sdot z18.s, z10.b, z4.b[0]\n"
+ "sdot z22.s, z10.b, z5.b[0]\n"
+ "sdot z26.s, z10.b, z6.b[0]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "sdot z19.s, z11.b, z4.b[0]\n"
+ "sdot z23.s, z11.b, z5.b[0]\n"
+ "sdot z27.s, z11.b, z6.b[0]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "sdot z16.s, z12.b, z4.b[1]\n"
+ "sdot z20.s, z12.b, z5.b[1]\n"
+ "sdot z24.s, z12.b, z6.b[1]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "sdot z17.s, z13.b, z4.b[1]\n"
+ "sdot z21.s, z13.b, z5.b[1]\n"
+ "sdot z25.s, z13.b, z6.b[1]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "sdot z18.s, z14.b, z4.b[1]\n"
+ "sdot z22.s, z14.b, z5.b[1]\n"
+ "sdot z26.s, z14.b, z6.b[1]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "sdot z19.s, z15.b, z4.b[1]\n"
+ "sdot z23.s, z15.b, z5.b[1]\n"
+ "sdot z27.s, z15.b, z6.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "sdot z16.s, z8.b, z4.b[2]\n"
+ "sdot z20.s, z8.b, z5.b[2]\n"
+ "sdot z24.s, z8.b, z6.b[2]\n"
+ "sdot z17.s, z9.b, z4.b[2]\n"
+ "sdot z21.s, z9.b, z5.b[2]\n"
+ "sdot z25.s, z9.b, z6.b[2]\n"
+ "sdot z18.s, z10.b, z4.b[2]\n"
+ "sdot z22.s, z10.b, z5.b[2]\n"
+ "sdot z26.s, z10.b, z6.b[2]\n"
+ "sdot z19.s, z11.b, z4.b[2]\n"
+ "sdot z23.s, z11.b, z5.b[2]\n"
+ "sdot z27.s, z11.b, z6.b[2]\n"
+ "sdot z16.s, z12.b, z4.b[3]\n"
+ "sdot z20.s, z12.b, z5.b[3]\n"
+ "sdot z24.s, z12.b, z6.b[3]\n"
+ "sdot z17.s, z13.b, z4.b[3]\n"
+ "sdot z21.s, z13.b, z5.b[3]\n"
+ "sdot z25.s, z13.b, z6.b[3]\n"
+ "sdot z18.s, z14.b, z4.b[3]\n"
+ "sdot z22.s, z14.b, z5.b[3]\n"
+ "sdot z26.s, z14.b, z6.b[3]\n"
+ "sdot z19.s, z15.b, z4.b[3]\n"
+ "sdot z23.s, z15.b, z5.b[3]\n"
+ "sdot z27.s, z15.b, z6.b[3]\n"
+ "cbz %[blocks], 5f\n" // blocks == 0: nothing left over, go store. Otherwise handle up to four lane groups of the partial quadword in z0-z2.
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "subs %[blocks], %[blocks], #0x1\n" // Flags from this subs are tested by the b.eq after the block's sdots.
+ "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "sdot z20.s, z8.b, z1.b[0]\n"
+ "sdot z24.s, z8.b, z2.b[0]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "sdot z21.s, z9.b, z1.b[0]\n"
+ "sdot z25.s, z9.b, z2.b[0]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "sdot z22.s, z10.b, z1.b[0]\n"
+ "sdot z26.s, z10.b, z2.b[0]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "sdot z23.s, z11.b, z1.b[0]\n"
+ "sdot z27.s, z11.b, z2.b[0]\n"
+ "b.eq 5f\n" // That was the last block.
+ "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "sdot z20.s, z12.b, z1.b[1]\n"
+ "sdot z24.s, z12.b, z2.b[1]\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "sdot z21.s, z13.b, z1.b[1]\n"
+ "sdot z25.s, z13.b, z2.b[1]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "sdot z22.s, z14.b, z1.b[1]\n"
+ "sdot z26.s, z14.b, z2.b[1]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "sdot z23.s, z15.b, z1.b[1]\n"
+ "sdot z27.s, z15.b, z2.b[1]\n"
+ "b.eq 5f\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "sdot z20.s, z8.b, z1.b[2]\n"
+ "sdot z24.s, z8.b, z2.b[2]\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "sdot z21.s, z9.b, z1.b[2]\n"
+ "sdot z25.s, z9.b, z2.b[2]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "sdot z22.s, z10.b, z1.b[2]\n"
+ "sdot z26.s, z10.b, z2.b[2]\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "sdot z23.s, z11.b, z1.b[2]\n"
+ "sdot z27.s, z11.b, z2.b[2]\n"
+ "b.eq 5f\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" // Fourth and last block: no further counter check, falls through to the branch to 5:.
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "sdot z16.s, z12.b, z0.b[3]\n"
+ "sdot z20.s, z12.b, z1.b[3]\n"
+ "sdot z24.s, z12.b, z2.b[3]\n"
+ "sdot z17.s, z13.b, z0.b[3]\n"
+ "sdot z21.s, z13.b, z1.b[3]\n"
+ "sdot z25.s, z13.b, z2.b[3]\n"
+ "sdot z18.s, z14.b, z0.b[3]\n"
+ "sdot z22.s, z14.b, z1.b[3]\n"
+ "sdot z26.s, z14.b, z2.b[3]\n"
+ "sdot z19.s, z15.b, z0.b[3]\n"
+ "sdot z23.s, z15.b, z1.b[3]\n"
+ "sdot z27.s, z15.b, z2.b[3]\n"
+ "b 5f\n"
+ "4:\n" // Short tail (regs == 0): finish the quadword already in z0-z2, then use a partial quadword loaded into z4-z6.
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "sdot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n" // Possibly partial A quadword, masked by p6.
+ "sdot z24.s, z8.b, z2.b[0]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr1]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z6.b, p6/z, [a_ptr2]\n"
+ "sdot z21.s, z9.b, z1.b[0]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z25.s, z9.b, z2.b[0]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "sdot z22.s, z10.b, z1.b[0]\n"
+ "sdot z26.s, z10.b, z2.b[0]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "sdot z23.s, z11.b, z1.b[0]\n"
+ "sdot z27.s, z11.b, z2.b[0]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "sdot z20.s, z12.b, z1.b[1]\n"
+ "sdot z24.s, z12.b, z2.b[1]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "sdot z21.s, z13.b, z1.b[1]\n"
+ "sdot z25.s, z13.b, z2.b[1]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "sdot z22.s, z14.b, z1.b[1]\n"
+ "sdot z26.s, z14.b, z2.b[1]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "sdot z23.s, z15.b, z1.b[1]\n"
+ "sdot z27.s, z15.b, z2.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "sdot z20.s, z8.b, z1.b[2]\n"
+ "sdot z24.s, z8.b, z2.b[2]\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "sdot z21.s, z9.b, z1.b[2]\n"
+ "sdot z25.s, z9.b, z2.b[2]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "sdot z22.s, z10.b, z1.b[2]\n"
+ "sdot z26.s, z10.b, z2.b[2]\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "sdot z23.s, z11.b, z1.b[2]\n"
+ "sdot z27.s, z11.b, z2.b[2]\n"
+ "sdot z16.s, z12.b, z0.b[3]\n"
+ "sdot z20.s, z12.b, z1.b[3]\n"
+ "sdot z24.s, z12.b, z2.b[3]\n"
+ "sdot z17.s, z13.b, z0.b[3]\n"
+ "sdot z21.s, z13.b, z1.b[3]\n"
+ "sdot z25.s, z13.b, z2.b[3]\n"
+ "sdot z18.s, z14.b, z0.b[3]\n"
+ "sdot z22.s, z14.b, z1.b[3]\n"
+ "sdot z26.s, z14.b, z2.b[3]\n"
+ "sdot z19.s, z15.b, z0.b[3]\n"
+ "sdot z23.s, z15.b, z1.b[3]\n"
+ "sdot z27.s, z15.b, z2.b[3]\n"
+ "cbz %[blocks], 5f\n" // Same leftover-block ladder as the other tail, but reading the partial quadword from z4-z6.
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "sdot z16.s, z8.b, z4.b[0]\n"
+ "sdot z20.s, z8.b, z5.b[0]\n"
+ "sdot z24.s, z8.b, z6.b[0]\n"
+ "sdot z17.s, z9.b, z4.b[0]\n"
+ "sdot z21.s, z9.b, z5.b[0]\n"
+ "sdot z25.s, z9.b, z6.b[0]\n"
+ "sdot z18.s, z10.b, z4.b[0]\n"
+ "sdot z22.s, z10.b, z5.b[0]\n"
+ "sdot z26.s, z10.b, z6.b[0]\n"
+ "sdot z19.s, z11.b, z4.b[0]\n"
+ "sdot z23.s, z11.b, z5.b[0]\n"
+ "sdot z27.s, z11.b, z6.b[0]\n"
+ "b.eq 5f\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "sdot z16.s, z12.b, z4.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "sdot z20.s, z12.b, z5.b[1]\n"
+ "sdot z24.s, z12.b, z6.b[1]\n"
+ "sdot z17.s, z13.b, z4.b[1]\n"
+ "sdot z21.s, z13.b, z5.b[1]\n"
+ "sdot z25.s, z13.b, z6.b[1]\n"
+ "sdot z18.s, z14.b, z4.b[1]\n"
+ "sdot z22.s, z14.b, z5.b[1]\n"
+ "sdot z26.s, z14.b, z6.b[1]\n"
+ "sdot z19.s, z15.b, z4.b[1]\n"
+ "sdot z23.s, z15.b, z5.b[1]\n"
+ "sdot z27.s, z15.b, z6.b[1]\n"
+ "b.eq 5f\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "sdot z16.s, z8.b, z4.b[2]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "sdot z20.s, z8.b, z5.b[2]\n"
+ "sdot z24.s, z8.b, z6.b[2]\n"
+ "sdot z17.s, z9.b, z4.b[2]\n"
+ "sdot z21.s, z9.b, z5.b[2]\n"
+ "sdot z25.s, z9.b, z6.b[2]\n"
+ "sdot z18.s, z10.b, z4.b[2]\n"
+ "sdot z22.s, z10.b, z5.b[2]\n"
+ "sdot z26.s, z10.b, z6.b[2]\n"
+ "sdot z19.s, z11.b, z4.b[2]\n"
+ "sdot z23.s, z11.b, z5.b[2]\n"
+ "sdot z27.s, z11.b, z6.b[2]\n"
+ "b.eq 5f\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "sdot z16.s, z12.b, z4.b[3]\n"
+ "sdot z20.s, z12.b, z5.b[3]\n"
+ "sdot z24.s, z12.b, z6.b[3]\n"
+ "sdot z17.s, z13.b, z4.b[3]\n"
+ "sdot z21.s, z13.b, z5.b[3]\n"
+ "sdot z25.s, z13.b, z6.b[3]\n"
+ "sdot z18.s, z14.b, z4.b[3]\n"
+ "sdot z22.s, z14.b, z5.b[3]\n"
+ "sdot z26.s, z14.b, z6.b[3]\n"
+ "sdot z19.s, z15.b, z4.b[3]\n"
+ "sdot z23.s, z15.b, z5.b[3]\n"
+ "sdot z27.s, z15.b, z6.b[3]\n"
+ "5:\n" // Write-back: stores are predicated by p0-p3, so lanes at or beyond width are not written.
+ "st1w z16.s, p0, [%[c_ptr0]]\n"
+ "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+ "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+ "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #4\n" // c_ptr0 is an in/out operand: leave it advanced by the four vectors just written.
+ "st1w z20.s, p0, [c_ptr1]\n"
+ "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
+ "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
+ "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
+ "st1w z24.s, p0, [c_ptr2]\n"
+ "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n"
+ "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n"
+ "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n"
+ ".unreq a_ptr1\n" // Drop the register aliases so they do not leak into later asm statements.
+ ".unreq a_ptr2\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) // Read-write operands: the asm modifies these.
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers) // Read-only inputs; lda/ldc are bound to ldab/ldcb and added directly to pointers.
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory" // Clobbers: every Z register, the four aliased X registers, flags and memory.
+ );
+ break;
+ default:
+ case 4:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "whilelt p6.b, %[temp], %[leftovers]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "ptrue p7.b\n"
+ "whilelt p1.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p2.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p3.s, %[temp], %[width]\n"
+ "cbz %[beta0], 1f\n"
+ "mov z16.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mov z17.s, #0\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mov z18.s, #0\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z19.s, #0\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "mov z20.s, #0\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "mov z21.s, #0\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "mov z22.s, #0\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "mov z23.s, #0\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "mov z24.s, #0\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "mov z25.s, #0\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "mov z26.s, #0\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "mov z27.s, #0\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "mov z28.s, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "mov z29.s, #0\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "mov z30.s, #0\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "mov z31.s, #0\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+ "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+ "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+ "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+ "mul z16.s, p7/m, z16.s, z15.s\n"
+ "ld1w z20.s, p0/z, [c_ptr1]\n"
+ "mul z17.s, p7/m, z17.s, z15.s\n"
+ "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+ "mul z18.s, p7/m, z18.s, z15.s\n"
+ "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+ "mul z19.s, p7/m, z19.s, z15.s\n"
+ "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+ "mul z20.s, p7/m, z20.s, z15.s\n"
+ "ld1w z24.s, p0/z, [c_ptr2]\n"
+ "mul z21.s, p7/m, z21.s, z15.s\n"
+ "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
+ "mul z22.s, p7/m, z22.s, z15.s\n"
+ "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
+ "mul z23.s, p7/m, z23.s, z15.s\n"
+ "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
+ "mul z24.s, p7/m, z24.s, z15.s\n"
+ "ld1w z28.s, p0/z, [c_ptr3]\n"
+ "mul z25.s, p7/m, z25.s, z15.s\n"
+ "ld1w z29.s, p1/z, [c_ptr3, #1, MUL VL]\n"
+ "mul z26.s, p7/m, z26.s, z15.s\n"
+ "ld1w z30.s, p2/z, [c_ptr3, #2, MUL VL]\n"
+ "mul z27.s, p7/m, z27.s, z15.s\n"
+ "ld1w z31.s, p3/z, [c_ptr3, #3, MUL VL]\n"
+ "mul z28.s, p7/m, z28.s, z15.s\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mul z29.s, p7/m, z29.s, z15.s\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mul z30.s, p7/m, z30.s, z15.s\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mul z31.s, p7/m, z31.s, z15.s\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "sdot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "sdot z24.s, z8.b, z2.b[0]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+ "sdot z28.s, z8.b, z3.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr2]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr3]\n"
+ "sdot z21.s, z9.b, z1.b[0]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z25.s, z9.b, z2.b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "sdot z29.s, z9.b, z3.b[0]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "sdot z22.s, z10.b, z1.b[0]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ "sdot z26.s, z10.b, z2.b[0]\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
+ "sdot z30.s, z10.b, z3.b[0]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "add a_ptr3, a_ptr3, #0x20\n"
+ "sdot z23.s, z11.b, z1.b[0]\n"
+ "sdot z27.s, z11.b, z2.b[0]\n"
+ "sdot z31.s, z11.b, z3.b[0]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "sdot z20.s, z12.b, z1.b[1]\n"
+ "sdot z24.s, z12.b, z2.b[1]\n"
+ "sdot z28.s, z12.b, z3.b[1]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "sdot z21.s, z13.b, z1.b[1]\n"
+ "sdot z25.s, z13.b, z2.b[1]\n"
+ "sdot z29.s, z13.b, z3.b[1]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "sdot z22.s, z14.b, z1.b[1]\n"
+ "sdot z26.s, z14.b, z2.b[1]\n"
+ "sdot z30.s, z14.b, z3.b[1]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "sdot z23.s, z15.b, z1.b[1]\n"
+ "sdot z27.s, z15.b, z2.b[1]\n"
+ "sdot z31.s, z15.b, z3.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "sdot z20.s, z8.b, z1.b[2]\n"
+ "sdot z24.s, z8.b, z2.b[2]\n"
+ "sdot z28.s, z8.b, z3.b[2]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "sdot z21.s, z9.b, z1.b[2]\n"
+ "sdot z25.s, z9.b, z2.b[2]\n"
+ "sdot z29.s, z9.b, z3.b[2]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "sdot z22.s, z10.b, z1.b[2]\n"
+ "sdot z26.s, z10.b, z2.b[2]\n"
+ "sdot z30.s, z10.b, z3.b[2]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "sdot z23.s, z11.b, z1.b[2]\n"
+ "sdot z27.s, z11.b, z2.b[2]\n"
+ "sdot z31.s, z11.b, z3.b[2]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "sdot z16.s, z12.b, z0.b[3]\n"
+ "sdot z20.s, z12.b, z1.b[3]\n"
+ "sdot z24.s, z12.b, z2.b[3]\n"
+ "sdot z28.s, z12.b, z3.b[3]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "sdot z17.s, z13.b, z0.b[3]\n"
+ "sdot z21.s, z13.b, z1.b[3]\n"
+ "sdot z25.s, z13.b, z2.b[3]\n"
+ "sdot z29.s, z13.b, z3.b[3]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "sdot z18.s, z14.b, z0.b[3]\n"
+ "sdot z22.s, z14.b, z1.b[3]\n"
+ "sdot z26.s, z14.b, z2.b[3]\n"
+ "sdot z30.s, z14.b, z3.b[3]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "sdot z19.s, z15.b, z0.b[3]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
+ "sdot z23.s, z15.b, z1.b[3]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n"
+ "sdot z27.s, z15.b, z2.b[3]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2, #-0x10]\n"
+ "sdot z31.s, z15.b, z3.b[3]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "sdot z16.s, z8.b, z4.b[0]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3, #-0x10]\n"
+ "sdot z20.s, z8.b, z5.b[0]\n"
+ "sdot z24.s, z8.b, z6.b[0]\n"
+ "sdot z28.s, z8.b, z7.b[0]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z17.s, z9.b, z4.b[0]\n"
+ "sdot z21.s, z9.b, z5.b[0]\n"
+ "sdot z25.s, z9.b, z6.b[0]\n"
+ "sdot z29.s, z9.b, z7.b[0]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "sdot z18.s, z10.b, z4.b[0]\n"
+ "sdot z22.s, z10.b, z5.b[0]\n"
+ "sdot z26.s, z10.b, z6.b[0]\n"
+ "sdot z30.s, z10.b, z7.b[0]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "sdot z19.s, z11.b, z4.b[0]\n"
+ "sdot z23.s, z11.b, z5.b[0]\n"
+ "sdot z27.s, z11.b, z6.b[0]\n"
+ "sdot z31.s, z11.b, z7.b[0]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "sdot z16.s, z12.b, z4.b[1]\n"
+ "sdot z20.s, z12.b, z5.b[1]\n"
+ "sdot z24.s, z12.b, z6.b[1]\n"
+ "sdot z28.s, z12.b, z7.b[1]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "sdot z17.s, z13.b, z4.b[1]\n"
+ "sdot z21.s, z13.b, z5.b[1]\n"
+ "sdot z25.s, z13.b, z6.b[1]\n"
+ "sdot z29.s, z13.b, z7.b[1]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "sdot z18.s, z14.b, z4.b[1]\n"
+ "sdot z22.s, z14.b, z5.b[1]\n"
+ "sdot z26.s, z14.b, z6.b[1]\n"
+ "sdot z30.s, z14.b, z7.b[1]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "sdot z19.s, z15.b, z4.b[1]\n"
+ "sdot z23.s, z15.b, z5.b[1]\n"
+ "sdot z27.s, z15.b, z6.b[1]\n"
+ "sdot z31.s, z15.b, z7.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "sdot z16.s, z8.b, z4.b[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "sdot z20.s, z8.b, z5.b[2]\n"
+ "sdot z24.s, z8.b, z6.b[2]\n"
+ "sdot z28.s, z8.b, z7.b[2]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "sdot z17.s, z9.b, z4.b[2]\n"
+ "sdot z21.s, z9.b, z5.b[2]\n"
+ "sdot z25.s, z9.b, z6.b[2]\n"
+ "sdot z29.s, z9.b, z7.b[2]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "sdot z18.s, z10.b, z4.b[2]\n"
+ "sdot z22.s, z10.b, z5.b[2]\n"
+ "sdot z26.s, z10.b, z6.b[2]\n"
+ "sdot z30.s, z10.b, z7.b[2]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "sdot z19.s, z11.b, z4.b[2]\n"
+ "sdot z23.s, z11.b, z5.b[2]\n"
+ "sdot z27.s, z11.b, z6.b[2]\n"
+ "sdot z31.s, z11.b, z7.b[2]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "sdot z16.s, z12.b, z4.b[3]\n"
+ "sdot z20.s, z12.b, z5.b[3]\n"
+ "sdot z24.s, z12.b, z6.b[3]\n"
+ "sdot z28.s, z12.b, z7.b[3]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "sdot z17.s, z13.b, z4.b[3]\n"
+ "sdot z21.s, z13.b, z5.b[3]\n"
+ "sdot z25.s, z13.b, z6.b[3]\n"
+ "sdot z29.s, z13.b, z7.b[3]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "sdot z18.s, z14.b, z4.b[3]\n"
+ "sdot z22.s, z14.b, z5.b[3]\n"
+ "sdot z26.s, z14.b, z6.b[3]\n"
+ "sdot z30.s, z14.b, z7.b[3]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "sdot z19.s, z15.b, z4.b[3]\n"
+ "sdot z23.s, z15.b, z5.b[3]\n"
+ "sdot z27.s, z15.b, z6.b[3]\n"
+ "sdot z31.s, z15.b, z7.b[3]\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "cbz %[regs], 4f\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "sdot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "sdot z24.s, z8.b, z2.b[0]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+ "sdot z28.s, z8.b, z3.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr2]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr3]\n"
+ "sdot z21.s, z9.b, z1.b[0]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z25.s, z9.b, z2.b[0]\n"
+ "sdot z29.s, z9.b, z3.b[0]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "sdot z22.s, z10.b, z1.b[0]\n"
+ "sdot z26.s, z10.b, z2.b[0]\n"
+ "sdot z30.s, z10.b, z3.b[0]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "sdot z23.s, z11.b, z1.b[0]\n"
+ "sdot z27.s, z11.b, z2.b[0]\n"
+ "sdot z31.s, z11.b, z3.b[0]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "sdot z20.s, z12.b, z1.b[1]\n"
+ "sdot z24.s, z12.b, z2.b[1]\n"
+ "sdot z28.s, z12.b, z3.b[1]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "sdot z21.s, z13.b, z1.b[1]\n"
+ "sdot z25.s, z13.b, z2.b[1]\n"
+ "sdot z29.s, z13.b, z3.b[1]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "sdot z22.s, z14.b, z1.b[1]\n"
+ "sdot z26.s, z14.b, z2.b[1]\n"
+ "sdot z30.s, z14.b, z3.b[1]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "sdot z23.s, z15.b, z1.b[1]\n"
+ "sdot z27.s, z15.b, z2.b[1]\n"
+ "sdot z31.s, z15.b, z3.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "sdot z20.s, z8.b, z1.b[2]\n"
+ "sdot z24.s, z8.b, z2.b[2]\n"
+ "sdot z28.s, z8.b, z3.b[2]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "sdot z21.s, z9.b, z1.b[2]\n"
+ "sdot z25.s, z9.b, z2.b[2]\n"
+ "sdot z29.s, z9.b, z3.b[2]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "sdot z22.s, z10.b, z1.b[2]\n"
+ "sdot z26.s, z10.b, z2.b[2]\n"
+ "sdot z30.s, z10.b, z3.b[2]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "sdot z23.s, z11.b, z1.b[2]\n"
+ "sdot z27.s, z11.b, z2.b[2]\n"
+ "sdot z31.s, z11.b, z3.b[2]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "sdot z16.s, z12.b, z0.b[3]\n"
+ "sdot z20.s, z12.b, z1.b[3]\n"
+ "sdot z24.s, z12.b, z2.b[3]\n"
+ "sdot z28.s, z12.b, z3.b[3]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "sdot z17.s, z13.b, z0.b[3]\n"
+ "sdot z21.s, z13.b, z1.b[3]\n"
+ "sdot z25.s, z13.b, z2.b[3]\n"
+ "sdot z29.s, z13.b, z3.b[3]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "sdot z18.s, z14.b, z0.b[3]\n"
+ "sdot z22.s, z14.b, z1.b[3]\n"
+ "sdot z26.s, z14.b, z2.b[3]\n"
+ "sdot z30.s, z14.b, z3.b[3]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "sdot z19.s, z15.b, z0.b[3]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
+ "sdot z23.s, z15.b, z1.b[3]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
+ "sdot z27.s, z15.b, z2.b[3]\n"
+ "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
+ "sdot z31.s, z15.b, z3.b[3]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "sdot z16.s, z8.b, z4.b[0]\n"
+ "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n"
+ "sdot z20.s, z8.b, z5.b[0]\n"
+ "sdot z24.s, z8.b, z6.b[0]\n"
+ "sdot z28.s, z8.b, z7.b[0]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z17.s, z9.b, z4.b[0]\n"
+ "sdot z21.s, z9.b, z5.b[0]\n"
+ "sdot z25.s, z9.b, z6.b[0]\n"
+ "sdot z29.s, z9.b, z7.b[0]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "sdot z18.s, z10.b, z4.b[0]\n"
+ "sdot z22.s, z10.b, z5.b[0]\n"
+ "sdot z26.s, z10.b, z6.b[0]\n"
+ "sdot z30.s, z10.b, z7.b[0]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "sdot z19.s, z11.b, z4.b[0]\n"
+ "sdot z23.s, z11.b, z5.b[0]\n"
+ "sdot z27.s, z11.b, z6.b[0]\n"
+ "sdot z31.s, z11.b, z7.b[0]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "sdot z16.s, z12.b, z4.b[1]\n"
+ "sdot z20.s, z12.b, z5.b[1]\n"
+ "sdot z24.s, z12.b, z6.b[1]\n"
+ "sdot z28.s, z12.b, z7.b[1]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "sdot z17.s, z13.b, z4.b[1]\n"
+ "sdot z21.s, z13.b, z5.b[1]\n"
+ "sdot z25.s, z13.b, z6.b[1]\n"
+ "sdot z29.s, z13.b, z7.b[1]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "sdot z18.s, z14.b, z4.b[1]\n"
+ "sdot z22.s, z14.b, z5.b[1]\n"
+ "sdot z26.s, z14.b, z6.b[1]\n"
+ "sdot z30.s, z14.b, z7.b[1]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "sdot z19.s, z15.b, z4.b[1]\n"
+ "sdot z23.s, z15.b, z5.b[1]\n"
+ "sdot z27.s, z15.b, z6.b[1]\n"
+ "sdot z31.s, z15.b, z7.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "sdot z16.s, z8.b, z4.b[2]\n"
+ "sdot z20.s, z8.b, z5.b[2]\n"
+ "sdot z24.s, z8.b, z6.b[2]\n"
+ "sdot z28.s, z8.b, z7.b[2]\n"
+ "sdot z17.s, z9.b, z4.b[2]\n"
+ "sdot z21.s, z9.b, z5.b[2]\n"
+ "sdot z25.s, z9.b, z6.b[2]\n"
+ "sdot z29.s, z9.b, z7.b[2]\n"
+ "sdot z18.s, z10.b, z4.b[2]\n"
+ "sdot z22.s, z10.b, z5.b[2]\n"
+ "sdot z26.s, z10.b, z6.b[2]\n"
+ "sdot z30.s, z10.b, z7.b[2]\n"
+ "sdot z19.s, z11.b, z4.b[2]\n"
+ "sdot z23.s, z11.b, z5.b[2]\n"
+ "sdot z27.s, z11.b, z6.b[2]\n"
+ "sdot z31.s, z11.b, z7.b[2]\n"
+ "sdot z16.s, z12.b, z4.b[3]\n"
+ "sdot z20.s, z12.b, z5.b[3]\n"
+ "sdot z24.s, z12.b, z6.b[3]\n"
+ "sdot z28.s, z12.b, z7.b[3]\n"
+ "sdot z17.s, z13.b, z4.b[3]\n"
+ "sdot z21.s, z13.b, z5.b[3]\n"
+ "sdot z25.s, z13.b, z6.b[3]\n"
+ "sdot z29.s, z13.b, z7.b[3]\n"
+ "sdot z18.s, z14.b, z4.b[3]\n"
+ "sdot z22.s, z14.b, z5.b[3]\n"
+ "sdot z26.s, z14.b, z6.b[3]\n"
+ "sdot z30.s, z14.b, z7.b[3]\n"
+ "sdot z19.s, z15.b, z4.b[3]\n"
+ "sdot z23.s, z15.b, z5.b[3]\n"
+ "sdot z27.s, z15.b, z6.b[3]\n"
+ "sdot z31.s, z15.b, z7.b[3]\n"
+ "cbz %[blocks], 5f\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "sdot z20.s, z8.b, z1.b[0]\n"
+ "sdot z24.s, z8.b, z2.b[0]\n"
+ "sdot z28.s, z8.b, z3.b[0]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "sdot z21.s, z9.b, z1.b[0]\n"
+ "sdot z25.s, z9.b, z2.b[0]\n"
+ "sdot z29.s, z9.b, z3.b[0]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "sdot z22.s, z10.b, z1.b[0]\n"
+ "sdot z26.s, z10.b, z2.b[0]\n"
+ "sdot z30.s, z10.b, z3.b[0]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "sdot z23.s, z11.b, z1.b[0]\n"
+ "sdot z27.s, z11.b, z2.b[0]\n"
+ "sdot z31.s, z11.b, z3.b[0]\n"
+ "b.eq 5f\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "sdot z20.s, z12.b, z1.b[1]\n"
+ "sdot z24.s, z12.b, z2.b[1]\n"
+ "sdot z28.s, z12.b, z3.b[1]\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "sdot z21.s, z13.b, z1.b[1]\n"
+ "sdot z25.s, z13.b, z2.b[1]\n"
+ "sdot z29.s, z13.b, z3.b[1]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "sdot z22.s, z14.b, z1.b[1]\n"
+ "sdot z26.s, z14.b, z2.b[1]\n"
+ "sdot z30.s, z14.b, z3.b[1]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "sdot z23.s, z15.b, z1.b[1]\n"
+ "sdot z27.s, z15.b, z2.b[1]\n"
+ "sdot z31.s, z15.b, z3.b[1]\n"
+ "b.eq 5f\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "sdot z20.s, z8.b, z1.b[2]\n"
+ "sdot z24.s, z8.b, z2.b[2]\n"
+ "sdot z28.s, z8.b, z3.b[2]\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "sdot z21.s, z9.b, z1.b[2]\n"
+ "sdot z25.s, z9.b, z2.b[2]\n"
+ "sdot z29.s, z9.b, z3.b[2]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "sdot z22.s, z10.b, z1.b[2]\n"
+ "sdot z26.s, z10.b, z2.b[2]\n"
+ "sdot z30.s, z10.b, z3.b[2]\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "sdot z23.s, z11.b, z1.b[2]\n"
+ "sdot z27.s, z11.b, z2.b[2]\n"
+ "sdot z31.s, z11.b, z3.b[2]\n"
+ "b.eq 5f\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "sdot z16.s, z12.b, z0.b[3]\n"
+ "sdot z20.s, z12.b, z1.b[3]\n"
+ "sdot z24.s, z12.b, z2.b[3]\n"
+ "sdot z28.s, z12.b, z3.b[3]\n"
+ "sdot z17.s, z13.b, z0.b[3]\n"
+ "sdot z21.s, z13.b, z1.b[3]\n"
+ "sdot z25.s, z13.b, z2.b[3]\n"
+ "sdot z29.s, z13.b, z3.b[3]\n"
+ "sdot z18.s, z14.b, z0.b[3]\n"
+ "sdot z22.s, z14.b, z1.b[3]\n"
+ "sdot z26.s, z14.b, z2.b[3]\n"
+ "sdot z30.s, z14.b, z3.b[3]\n"
+ "sdot z19.s, z15.b, z0.b[3]\n"
+ "sdot z23.s, z15.b, z1.b[3]\n"
+ "sdot z27.s, z15.b, z2.b[3]\n"
+ "sdot z31.s, z15.b, z3.b[3]\n"
+ "b 5f\n"
+ "4:\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "sdot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
+ "sdot z24.s, z8.b, z2.b[0]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr1]\n"
+ "sdot z28.s, z8.b, z3.b[0]\n"
+ "ld1rqb z6.b, p6/z, [a_ptr2]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z7.b, p6/z, [a_ptr3]\n"
+ "sdot z21.s, z9.b, z1.b[0]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z25.s, z9.b, z2.b[0]\n"
+ "sdot z29.s, z9.b, z3.b[0]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "sdot z22.s, z10.b, z1.b[0]\n"
+ "sdot z26.s, z10.b, z2.b[0]\n"
+ "sdot z30.s, z10.b, z3.b[0]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
+ "sdot z23.s, z11.b, z1.b[0]\n"
+ "sdot z27.s, z11.b, z2.b[0]\n"
+ "sdot z31.s, z11.b, z3.b[0]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
+ "sdot z20.s, z12.b, z1.b[1]\n"
+ "sdot z24.s, z12.b, z2.b[1]\n"
+ "sdot z28.s, z12.b, z3.b[1]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "sdot z17.s, z13.b, z0.b[1]\n"
+ "sdot z21.s, z13.b, z1.b[1]\n"
+ "sdot z25.s, z13.b, z2.b[1]\n"
+ "sdot z29.s, z13.b, z3.b[1]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "sdot z22.s, z14.b, z1.b[1]\n"
+ "sdot z26.s, z14.b, z2.b[1]\n"
+ "sdot z30.s, z14.b, z3.b[1]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "sdot z23.s, z15.b, z1.b[1]\n"
+ "sdot z27.s, z15.b, z2.b[1]\n"
+ "sdot z31.s, z15.b, z3.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "sdot z20.s, z8.b, z1.b[2]\n"
+ "sdot z24.s, z8.b, z2.b[2]\n"
+ "sdot z28.s, z8.b, z3.b[2]\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "sdot z21.s, z9.b, z1.b[2]\n"
+ "sdot z25.s, z9.b, z2.b[2]\n"
+ "sdot z29.s, z9.b, z3.b[2]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "sdot z22.s, z10.b, z1.b[2]\n"
+ "sdot z26.s, z10.b, z2.b[2]\n"
+ "sdot z30.s, z10.b, z3.b[2]\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "sdot z23.s, z11.b, z1.b[2]\n"
+ "sdot z27.s, z11.b, z2.b[2]\n"
+ "sdot z31.s, z11.b, z3.b[2]\n"
+ "sdot z16.s, z12.b, z0.b[3]\n"
+ "sdot z20.s, z12.b, z1.b[3]\n"
+ "sdot z24.s, z12.b, z2.b[3]\n"
+ "sdot z28.s, z12.b, z3.b[3]\n"
+ "sdot z17.s, z13.b, z0.b[3]\n"
+ "sdot z21.s, z13.b, z1.b[3]\n"
+ "sdot z25.s, z13.b, z2.b[3]\n"
+ "sdot z29.s, z13.b, z3.b[3]\n"
+ "sdot z18.s, z14.b, z0.b[3]\n"
+ "sdot z22.s, z14.b, z1.b[3]\n"
+ "sdot z26.s, z14.b, z2.b[3]\n"
+ "sdot z30.s, z14.b, z3.b[3]\n"
+ "sdot z19.s, z15.b, z0.b[3]\n"
+ "sdot z23.s, z15.b, z1.b[3]\n"
+ "sdot z27.s, z15.b, z2.b[3]\n"
+ "sdot z31.s, z15.b, z3.b[3]\n"
+ "cbz %[blocks], 5f\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "sdot z16.s, z8.b, z4.b[0]\n"
+ "sdot z20.s, z8.b, z5.b[0]\n"
+ "sdot z24.s, z8.b, z6.b[0]\n"
+ "sdot z28.s, z8.b, z7.b[0]\n"
+ "sdot z17.s, z9.b, z4.b[0]\n"
+ "sdot z21.s, z9.b, z5.b[0]\n"
+ "sdot z25.s, z9.b, z6.b[0]\n"
+ "sdot z29.s, z9.b, z7.b[0]\n"
+ "sdot z18.s, z10.b, z4.b[0]\n"
+ "sdot z22.s, z10.b, z5.b[0]\n"
+ "sdot z26.s, z10.b, z6.b[0]\n"
+ "sdot z30.s, z10.b, z7.b[0]\n"
+ "sdot z19.s, z11.b, z4.b[0]\n"
+ "sdot z23.s, z11.b, z5.b[0]\n"
+ "sdot z27.s, z11.b, z6.b[0]\n"
+ "sdot z31.s, z11.b, z7.b[0]\n"
+ "b.eq 5f\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "sdot z16.s, z12.b, z4.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "sdot z20.s, z12.b, z5.b[1]\n"
+ "sdot z24.s, z12.b, z6.b[1]\n"
+ "sdot z28.s, z12.b, z7.b[1]\n"
+ "sdot z17.s, z13.b, z4.b[1]\n"
+ "sdot z21.s, z13.b, z5.b[1]\n"
+ "sdot z25.s, z13.b, z6.b[1]\n"
+ "sdot z29.s, z13.b, z7.b[1]\n"
+ "sdot z18.s, z14.b, z4.b[1]\n"
+ "sdot z22.s, z14.b, z5.b[1]\n"
+ "sdot z26.s, z14.b, z6.b[1]\n"
+ "sdot z30.s, z14.b, z7.b[1]\n"
+ "sdot z19.s, z15.b, z4.b[1]\n"
+ "sdot z23.s, z15.b, z5.b[1]\n"
+ "sdot z27.s, z15.b, z6.b[1]\n"
+ "sdot z31.s, z15.b, z7.b[1]\n"
+ "b.eq 5f\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "sdot z16.s, z8.b, z4.b[2]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "sdot z20.s, z8.b, z5.b[2]\n"
+ "sdot z24.s, z8.b, z6.b[2]\n"
+ "sdot z28.s, z8.b, z7.b[2]\n"
+ "sdot z17.s, z9.b, z4.b[2]\n"
+ "sdot z21.s, z9.b, z5.b[2]\n"
+ "sdot z25.s, z9.b, z6.b[2]\n"
+ "sdot z29.s, z9.b, z7.b[2]\n"
+ "sdot z18.s, z10.b, z4.b[2]\n"
+ "sdot z22.s, z10.b, z5.b[2]\n"
+ "sdot z26.s, z10.b, z6.b[2]\n"
+ "sdot z30.s, z10.b, z7.b[2]\n"
+ "sdot z19.s, z11.b, z4.b[2]\n"
+ "sdot z23.s, z11.b, z5.b[2]\n"
+ "sdot z27.s, z11.b, z6.b[2]\n"
+ "sdot z31.s, z11.b, z7.b[2]\n"
+ "b.eq 5f\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "sdot z16.s, z12.b, z4.b[3]\n"
+ "sdot z20.s, z12.b, z5.b[3]\n"
+ "sdot z24.s, z12.b, z6.b[3]\n"
+ "sdot z28.s, z12.b, z7.b[3]\n"
+ "sdot z17.s, z13.b, z4.b[3]\n"
+ "sdot z21.s, z13.b, z5.b[3]\n"
+ "sdot z25.s, z13.b, z6.b[3]\n"
+ "sdot z29.s, z13.b, z7.b[3]\n"
+ "sdot z18.s, z14.b, z4.b[3]\n"
+ "sdot z22.s, z14.b, z5.b[3]\n"
+ "sdot z26.s, z14.b, z6.b[3]\n"
+ "sdot z30.s, z14.b, z7.b[3]\n"
+ "sdot z19.s, z15.b, z4.b[3]\n"
+ "sdot z23.s, z15.b, z5.b[3]\n"
+ "sdot z27.s, z15.b, z6.b[3]\n"
+ "sdot z31.s, z15.b, z7.b[3]\n"
+ "5:\n"
+ "st1w z16.s, p0, [%[c_ptr0]]\n"
+ "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+ "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+ "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #4\n"
+ "st1w z20.s, p0, [c_ptr1]\n"
+ "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
+ "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
+ "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
+ "st1w z24.s, p0, [c_ptr2]\n"
+ "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n"
+ "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n"
+ "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n"
+ "st1w z28.s, p0, [c_ptr3]\n"
+ "st1w z29.s, p1, [c_ptr3, #1, MUL VL]\n"
+ "st1w z30.s, p2, [c_ptr3, #2, MUL VL]\n"
+ "st1w z31.s, p3, [c_ptr3, #3, MUL VL]\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers)
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
+ );
+ break;
+ }
+ }
+ }
+}
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4.hpp
new file mode 100644
index 0000000000..2701a9e939
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4.hpp
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __ARM_FEATURE_SVE
+
+#include <cstdint>
+#include "../std_transforms_sve.hpp"
+
+namespace arm_gemm
+{
+
+// Generic kernel (sve_hybrid_u8u32_dot_4VLx4/generic.cpp); its arguments are A, lda, B, C, ldc, beta, M, N, K.
+void sve_hybrid_u8u32_dot_4VLx4(const uint8_t *, int, const uint8_t *, uint32_t *, int, uint32_t, int, int, int);
+// Bundles the types, blocking parameters, transforms member and default entry point for that kernel.
+class hybrid_u8u32_dot_4VLx4
+{
+public:
+ typedef uint8_t operand_type; // matches the const uint8_t * inputs of kern_type
+ typedef uint32_t result_type; // matches the uint32_t * output of kern_type
+ // Pointer type with the same signature as sve_hybrid_u8u32_dot_4VLx4.
+ typedef void (*kern_type)(const uint8_t *, int, const uint8_t *, uint32_t *, int, uint32_t, int, int, int);
+
+ /* Kernel blocking parameters; out_height() matches the y step of 4 in generic.cpp */
+ static unsigned int out_height()
+ {
+ return 4;
+ }
+ // Four times get_vector_length<uint32_t>(), the same quantity generic.cpp uses as its x0 step.
+ static unsigned int out_width()
+ {
+ return get_vector_length<uint32_t>() * 4;
+ }
+ // generic.cpp rounds K up to a multiple of 4 when computing its B stride.
+ static unsigned int k_unroll()
+ {
+ return 4;
+ }
+ // NOTE(review): the 4, 4, 4 arguments presumably mirror the blocking values above - confirm in std_transforms_sve.hpp.
+ StdTransformsSVE<operand_type, result_type, 4, 4, 4> transforms = {};
+
+ // Default to the generic kernel
+ kern_type kernel=sve_hybrid_u8u32_dot_4VLx4;
+ // ci is accepted but not used; the members keep their default initialisers.
+ hybrid_u8u32_dot_4VLx4(const CPUInfo *ci)
+ {
+
+ }
+};
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4/generic.cpp
new file mode 100644
index 0000000000..d34d0e5fc7
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4/generic.cpp
@@ -0,0 +1,2150 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+
+#include <algorithm>
+
+#include <cstdint>
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void sve_hybrid_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, uint32_t beta, int M, int N, int K) {
+ const long beta0 = (beta == 0u);
+ const int K_stride = ((K + 3) / 4) * 4;
+ const long loops_count = ((K + 16) / 32) - 1;
+ K -= loops_count * 32;
+ const long regs_count = (K / 16) - 1;
+ K -= (regs_count + 1) * 16;
+ const long leftovers = K;
+ const long blocks_count = (K + 3) / 4;
+
+ for (int y=0; y<M; y+=4) {
+ const uint8_t * const a_ptr0_base = A + (y * lda);
+ const unsigned long ldab = lda * sizeof(uint8_t);
+
+ uint32_t *c_ptr0 = C + (y * ldc);
+ const unsigned long ldcb = ldc * sizeof(uint32_t);
+
+ for (int x0=0; x0<N; x0+=(4 * get_vector_length<uint32_t>())) {
+ const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<uint32_t>()));
+ const uint32_t *betaptr = &beta;
+ long loops = loops_count;
+ long regs = regs_count;
+ long temp = 0;
+ long blocks = blocks_count;
+ const uint8_t *a_ptr0 = a_ptr0_base;
+ const uint8_t *b_ptr0 = B + (K_stride * x0);
+
+ switch(M-y) {
+ case 1:
+ __asm __volatile (
+ "whilelt p6.b, %[temp], %[leftovers]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "ptrue p7.b\n"
+ "whilelt p1.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p2.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p3.s, %[temp], %[width]\n"
+ "cbz %[beta0], 1f\n"
+ "mov z16.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mov z17.s, #0\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "mov z18.s, #0\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "mov z19.s, #0\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+ "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+ "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+ "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+ "mul z16.s, p7/m, z16.s, z15.s\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mul z17.s, p7/m, z17.s, z15.s\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "mul z18.s, p7/m, z18.s, z15.s\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "mul z19.s, p7/m, z19.s, z15.s\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "udot z16.s, z12.b, z0.b[3]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "udot z17.s, z13.b, z0.b[3]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "udot z18.s, z14.b, z0.b[3]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "udot z19.s, z15.b, z0.b[3]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "udot z16.s, z8.b, z4.b[0]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "udot z17.s, z9.b, z4.b[0]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "udot z18.s, z10.b, z4.b[0]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "udot z19.s, z11.b, z4.b[0]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "udot z16.s, z12.b, z4.b[1]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
+ "udot z17.s, z13.b, z4.b[1]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "udot z18.s, z14.b, z4.b[1]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "udot z19.s, z15.b, z4.b[1]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "udot z16.s, z8.b, z4.b[2]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "udot z17.s, z9.b, z4.b[2]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "udot z18.s, z10.b, z4.b[2]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "udot z19.s, z11.b, z4.b[2]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "udot z16.s, z12.b, z4.b[3]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "udot z17.s, z13.b, z4.b[3]\n"
+ "udot z18.s, z14.b, z4.b[3]\n"
+ "udot z19.s, z15.b, z4.b[3]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "cbz %[regs], 4f\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "udot z16.s, z12.b, z0.b[3]\n"
+ "udot z17.s, z13.b, z0.b[3]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "udot z18.s, z14.b, z0.b[3]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "udot z19.s, z15.b, z0.b[3]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "udot z16.s, z8.b, z4.b[0]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "udot z17.s, z9.b, z4.b[0]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "udot z18.s, z10.b, z4.b[0]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "udot z19.s, z11.b, z4.b[0]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "udot z16.s, z12.b, z4.b[1]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
+ "udot z17.s, z13.b, z4.b[1]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "udot z18.s, z14.b, z4.b[1]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "udot z19.s, z15.b, z4.b[1]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "udot z16.s, z8.b, z4.b[2]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "udot z17.s, z9.b, z4.b[2]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "udot z18.s, z10.b, z4.b[2]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "udot z19.s, z11.b, z4.b[2]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "udot z16.s, z12.b, z4.b[3]\n"
+ "udot z17.s, z13.b, z4.b[3]\n"
+ "udot z18.s, z14.b, z4.b[3]\n"
+ "udot z19.s, z15.b, z4.b[3]\n"
+ "cbz %[blocks], 5f\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "b.eq 5f\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "b.eq 5f\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "b.eq 5f\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "udot z16.s, z12.b, z0.b[3]\n"
+ "udot z17.s, z13.b, z0.b[3]\n"
+ "udot z18.s, z14.b, z0.b[3]\n"
+ "udot z19.s, z15.b, z0.b[3]\n"
+ "b 5f\n"
+ "4:\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "udot z16.s, z12.b, z0.b[3]\n"
+ "udot z17.s, z13.b, z0.b[3]\n"
+ "udot z18.s, z14.b, z0.b[3]\n"
+ "udot z19.s, z15.b, z0.b[3]\n"
+ "cbz %[blocks], 5f\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "udot z16.s, z8.b, z4.b[0]\n"
+ "udot z17.s, z9.b, z4.b[0]\n"
+ "udot z18.s, z10.b, z4.b[0]\n"
+ "udot z19.s, z11.b, z4.b[0]\n"
+ "b.eq 5f\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "udot z16.s, z12.b, z4.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "udot z17.s, z13.b, z4.b[1]\n"
+ "udot z18.s, z14.b, z4.b[1]\n"
+ "udot z19.s, z15.b, z4.b[1]\n"
+ "b.eq 5f\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "udot z16.s, z8.b, z4.b[2]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "udot z17.s, z9.b, z4.b[2]\n"
+ "udot z18.s, z10.b, z4.b[2]\n"
+ "udot z19.s, z11.b, z4.b[2]\n"
+ "b.eq 5f\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "udot z16.s, z12.b, z4.b[3]\n"
+ "udot z17.s, z13.b, z4.b[3]\n"
+ "udot z18.s, z14.b, z4.b[3]\n"
+ "udot z19.s, z15.b, z4.b[3]\n"
+ "5:\n"
+ "st1w z16.s, p0, [%[c_ptr0]]\n"
+ "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+ "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+ "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #4\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers)
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 2: // Two-row variant: A rows at a_ptr0 and a_ptr0+lda accumulate into z16-z19 and z20-z23, stored at c_ptr0 and c_ptr0+ldc.
+ __asm __volatile (
+ "a_ptr1 .req X0\n" // second-row pointers are assembler aliases for x0/x1 (both named in the clobber list)
+ "c_ptr1 .req X1\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n" // second A row = first row + lda
+ "add c_ptr1, %[c_ptr0], %[ldc]\n" // second C row = first row + ldc
+ "whilelt p6.b, %[temp], %[leftovers]\n" // p6: byte predicate built from leftovers; only used for the final A loads in the tail
+ "whilelt p0.s, %[temp], %[width]\n" // p0-p3: 32-bit lane predicates against width, one per C vector (temp stepped by incw between them)
+ "incw %[temp], all, mul #1\n"
+ "ptrue p7.b\n" // p7: all-true predicate for the unmasked loads
+ "whilelt p1.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p2.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p3.s, %[temp], %[width]\n"
+ "cbz %[beta0], 1f\n" // beta0 == 0: go to 1: (load C and scale it); otherwise fall through and start from zeroed accumulators
+ "mov z16.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" // first 16 bytes of A row 0, replicated to every 128-bit segment
+ "mov z17.s, #0\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n" // first 16 bytes of A row 1
+ "mov z18.s, #0\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "mov z19.s, #0\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "mov z20.s, #0\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "mov z21.s, #0\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "mov z22.s, #0\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "mov z23.s, #0\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" // NOTE(review): b_ptr0 is not advanced yet, so this reads one vector below it; z15 is reloaded at 3:, 2: and 4: before any use, so the value looks dead - confirm the address is always readable
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n" // advance B by 8 vectors
+ "cbz %[loops], 2f\n" // no main-loop iterations requested: go straight to the tail
+ "b 3f\n"
+ "1:\n" // accumulate path: initial accumulators = existing C values multiplied by the value at betaptr
+ "ld1rw z15.s, p7/z, [%[betaptr]]\n" // broadcast the 32-bit word at betaptr to all lanes
+ "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+ "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+ "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+ "mul z16.s, p7/m, z16.s, z15.s\n"
+ "ld1w z20.s, p0/z, [c_ptr1]\n"
+ "mul z17.s, p7/m, z17.s, z15.s\n"
+ "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+ "mul z18.s, p7/m, z18.s, z15.s\n"
+ "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+ "mul z19.s, p7/m, z19.s, z15.s\n"
+ "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+ "mul z20.s, p7/m, z20.s, z15.s\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mul z21.s, p7/m, z21.s, z15.s\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mul z22.s, p7/m, z22.s, z15.s\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "mul z23.s, p7/m, z23.s, z15.s\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" // NOTE(review): same pre-advance load of z15 as on the zero-init path; overwritten before use
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 2f\n"
+ "3:\n" // main loop: each iteration consumes 0x20 bytes from each A row and advances B by 32 vectors
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" // z15 is (re)loaded here, after b_ptr0 has been advanced
+ "udot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" // z4/z5: next 16 bytes of each A row, used in the second half of the iteration
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+ "udot z21.s, z9.b, z1.b[0]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "udot z22.s, z10.b, z1.b[0]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "subs %[loops], %[loops], #0x1\n" // sets the flags tested by the b.ne at the bottom of the loop
+ "udot z23.s, z11.b, z1.b[0]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "udot z20.s, z12.b, z1.b[1]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ "udot z21.s, z13.b, z1.b[1]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "udot z22.s, z14.b, z1.b[1]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "udot z23.s, z15.b, z1.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "udot z20.s, z8.b, z1.b[2]\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "udot z21.s, z9.b, z1.b[2]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "udot z22.s, z10.b, z1.b[2]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "udot z23.s, z11.b, z1.b[2]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "udot z16.s, z12.b, z0.b[3]\n"
+ "udot z20.s, z12.b, z1.b[3]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "udot z17.s, z13.b, z0.b[3]\n"
+ "udot z21.s, z13.b, z1.b[3]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "udot z18.s, z14.b, z0.b[3]\n"
+ "udot z22.s, z14.b, z1.b[3]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "udot z19.s, z15.b, z0.b[3]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n" // refill z0 for the next pass; a_ptr0 was already advanced by 0x20 above
+ "udot z23.s, z15.b, z1.b[3]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "udot z16.s, z8.b, z4.b[0]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n" // refill z1 likewise
+ "udot z20.s, z8.b, z5.b[0]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "udot z17.s, z9.b, z4.b[0]\n"
+ "udot z21.s, z9.b, z5.b[0]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "udot z18.s, z10.b, z4.b[0]\n"
+ "udot z22.s, z10.b, z5.b[0]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "udot z19.s, z11.b, z4.b[0]\n"
+ "udot z23.s, z11.b, z5.b[0]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "udot z16.s, z12.b, z4.b[1]\n"
+ "udot z20.s, z12.b, z5.b[1]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "udot z17.s, z13.b, z4.b[1]\n"
+ "udot z21.s, z13.b, z5.b[1]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "udot z18.s, z14.b, z4.b[1]\n"
+ "udot z22.s, z14.b, z5.b[1]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "udot z19.s, z15.b, z4.b[1]\n"
+ "udot z23.s, z15.b, z5.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "udot z16.s, z8.b, z4.b[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "udot z20.s, z8.b, z5.b[2]\n"
+ "udot z17.s, z9.b, z4.b[2]\n"
+ "udot z21.s, z9.b, z5.b[2]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "udot z18.s, z10.b, z4.b[2]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "udot z22.s, z10.b, z5.b[2]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "udot z19.s, z11.b, z4.b[2]\n"
+ "udot z23.s, z11.b, z5.b[2]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "udot z16.s, z12.b, z4.b[3]\n"
+ "udot z20.s, z12.b, z5.b[3]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "udot z17.s, z13.b, z4.b[3]\n"
+ "udot z21.s, z13.b, z5.b[3]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "udot z18.s, z14.b, z4.b[3]\n"
+ "udot z22.s, z14.b, z5.b[3]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "udot z19.s, z15.b, z4.b[3]\n"
+ "udot z23.s, z15.b, z5.b[3]\n"
+ "b.ne 3b\n" // repeat until loops reaches zero (flags from the subs near the top of the loop)
+ "2:\n" // tail: drain the preloaded registers, then handle the leftover blocks
+ "cbz %[regs], 4f\n" // regs == 0: only z0/z1 are consumed in full (label 4); otherwise both z0/z1 and z4/z5 are
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "udot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+ "udot z21.s, z9.b, z1.b[0]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "udot z22.s, z10.b, z1.b[0]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "udot z23.s, z11.b, z1.b[0]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "udot z20.s, z12.b, z1.b[1]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "udot z21.s, z13.b, z1.b[1]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "udot z22.s, z14.b, z1.b[1]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "udot z23.s, z15.b, z1.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "udot z20.s, z8.b, z1.b[2]\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "udot z21.s, z9.b, z1.b[2]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "udot z22.s, z10.b, z1.b[2]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "udot z23.s, z11.b, z1.b[2]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "udot z16.s, z12.b, z0.b[3]\n"
+ "udot z20.s, z12.b, z1.b[3]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "udot z17.s, z13.b, z0.b[3]\n"
+ "udot z21.s, z13.b, z1.b[3]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "udot z18.s, z14.b, z0.b[3]\n"
+ "udot z22.s, z14.b, z1.b[3]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "udot z19.s, z15.b, z0.b[3]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n" // final A bytes for row 0, masked by the leftovers predicate (inactive bytes read as zero)
+ "udot z23.s, z15.b, z1.b[3]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "udot z16.s, z8.b, z4.b[0]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n" // final A bytes for row 1, same predicate
+ "udot z20.s, z8.b, z5.b[0]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "udot z17.s, z9.b, z4.b[0]\n"
+ "udot z21.s, z9.b, z5.b[0]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "udot z18.s, z10.b, z4.b[0]\n"
+ "udot z22.s, z10.b, z5.b[0]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "udot z19.s, z11.b, z4.b[0]\n"
+ "udot z23.s, z11.b, z5.b[0]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "udot z16.s, z12.b, z4.b[1]\n"
+ "udot z20.s, z12.b, z5.b[1]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "udot z17.s, z13.b, z4.b[1]\n"
+ "udot z21.s, z13.b, z5.b[1]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "udot z18.s, z14.b, z4.b[1]\n"
+ "udot z22.s, z14.b, z5.b[1]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "udot z19.s, z15.b, z4.b[1]\n"
+ "udot z23.s, z15.b, z5.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "udot z16.s, z8.b, z4.b[2]\n"
+ "udot z20.s, z8.b, z5.b[2]\n"
+ "udot z17.s, z9.b, z4.b[2]\n"
+ "udot z21.s, z9.b, z5.b[2]\n"
+ "udot z18.s, z10.b, z4.b[2]\n"
+ "udot z22.s, z10.b, z5.b[2]\n"
+ "udot z19.s, z11.b, z4.b[2]\n"
+ "udot z23.s, z11.b, z5.b[2]\n"
+ "udot z16.s, z12.b, z4.b[3]\n"
+ "udot z20.s, z12.b, z5.b[3]\n"
+ "udot z17.s, z13.b, z4.b[3]\n"
+ "udot z21.s, z13.b, z5.b[3]\n"
+ "udot z18.s, z14.b, z4.b[3]\n"
+ "udot z22.s, z14.b, z5.b[3]\n"
+ "udot z19.s, z15.b, z4.b[3]\n"
+ "udot z23.s, z15.b, z5.b[3]\n"
+ "cbz %[blocks], 5f\n" // leftover blocks (at most four are handled): each uses one 4-byte group index of z0/z1 and four B vectors
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "subs %[blocks], %[blocks], #0x1\n" // flags tested by the next b.eq; nothing in between modifies them
+ "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "udot z20.s, z8.b, z1.b[0]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "udot z21.s, z9.b, z1.b[0]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "udot z22.s, z10.b, z1.b[0]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "udot z23.s, z11.b, z1.b[0]\n"
+ "b.eq 5f\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "udot z20.s, z12.b, z1.b[1]\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "udot z21.s, z13.b, z1.b[1]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "udot z22.s, z14.b, z1.b[1]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "udot z23.s, z15.b, z1.b[1]\n"
+ "b.eq 5f\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "udot z20.s, z8.b, z1.b[2]\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "udot z21.s, z9.b, z1.b[2]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "udot z22.s, z10.b, z1.b[2]\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "udot z23.s, z11.b, z1.b[2]\n"
+ "b.eq 5f\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "udot z16.s, z12.b, z0.b[3]\n"
+ "udot z20.s, z12.b, z1.b[3]\n"
+ "udot z17.s, z13.b, z0.b[3]\n"
+ "udot z21.s, z13.b, z1.b[3]\n"
+ "udot z18.s, z14.b, z0.b[3]\n"
+ "udot z22.s, z14.b, z1.b[3]\n"
+ "udot z19.s, z15.b, z0.b[3]\n"
+ "udot z23.s, z15.b, z1.b[3]\n"
+ "b 5f\n"
+ "4:\n" // tail for regs == 0: consume z0/z1 in full, then take the leftover blocks from z4/z5
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "udot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n" // final A bytes for row 0, masked by the leftovers predicate
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr1]\n" // final A bytes for row 1, same predicate
+ "udot z21.s, z9.b, z1.b[0]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "udot z22.s, z10.b, z1.b[0]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "udot z23.s, z11.b, z1.b[0]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "udot z20.s, z12.b, z1.b[1]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "udot z21.s, z13.b, z1.b[1]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "udot z22.s, z14.b, z1.b[1]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "udot z23.s, z15.b, z1.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "udot z20.s, z8.b, z1.b[2]\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "udot z21.s, z9.b, z1.b[2]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "udot z22.s, z10.b, z1.b[2]\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "udot z23.s, z11.b, z1.b[2]\n"
+ "udot z16.s, z12.b, z0.b[3]\n"
+ "udot z20.s, z12.b, z1.b[3]\n"
+ "udot z17.s, z13.b, z0.b[3]\n"
+ "udot z21.s, z13.b, z1.b[3]\n"
+ "udot z18.s, z14.b, z0.b[3]\n"
+ "udot z22.s, z14.b, z1.b[3]\n"
+ "udot z19.s, z15.b, z0.b[3]\n"
+ "udot z23.s, z15.b, z1.b[3]\n"
+ "cbz %[blocks], 5f\n" // leftover blocks, same ladder as above but indexing z4/z5
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "udot z16.s, z8.b, z4.b[0]\n"
+ "udot z20.s, z8.b, z5.b[0]\n"
+ "udot z17.s, z9.b, z4.b[0]\n"
+ "udot z21.s, z9.b, z5.b[0]\n"
+ "udot z18.s, z10.b, z4.b[0]\n"
+ "udot z22.s, z10.b, z5.b[0]\n"
+ "udot z19.s, z11.b, z4.b[0]\n"
+ "udot z23.s, z11.b, z5.b[0]\n"
+ "b.eq 5f\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "udot z16.s, z12.b, z4.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "udot z20.s, z12.b, z5.b[1]\n"
+ "udot z17.s, z13.b, z4.b[1]\n"
+ "udot z21.s, z13.b, z5.b[1]\n"
+ "udot z18.s, z14.b, z4.b[1]\n"
+ "udot z22.s, z14.b, z5.b[1]\n"
+ "udot z19.s, z15.b, z4.b[1]\n"
+ "udot z23.s, z15.b, z5.b[1]\n"
+ "b.eq 5f\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "udot z16.s, z8.b, z4.b[2]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "udot z20.s, z8.b, z5.b[2]\n"
+ "udot z17.s, z9.b, z4.b[2]\n"
+ "udot z21.s, z9.b, z5.b[2]\n"
+ "udot z18.s, z10.b, z4.b[2]\n"
+ "udot z22.s, z10.b, z5.b[2]\n"
+ "udot z19.s, z11.b, z4.b[2]\n"
+ "udot z23.s, z11.b, z5.b[2]\n"
+ "b.eq 5f\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "udot z16.s, z12.b, z4.b[3]\n"
+ "udot z20.s, z12.b, z5.b[3]\n"
+ "udot z17.s, z13.b, z4.b[3]\n"
+ "udot z21.s, z13.b, z5.b[3]\n"
+ "udot z18.s, z14.b, z4.b[3]\n"
+ "udot z22.s, z14.b, z5.b[3]\n"
+ "udot z19.s, z15.b, z4.b[3]\n"
+ "udot z23.s, z15.b, z5.b[3]\n"
+ "5:\n" // write-back: store both rows of accumulators under the column predicates p0-p3
+ "st1w z16.s, p0, [%[c_ptr0]]\n"
+ "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+ "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+ "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #4\n" // c_ptr0 is an in/out operand: it leaves the asm advanced by four vectors
+ "st1w z20.s, p0, [c_ptr1]\n"
+ "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
+ "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
+ "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
+ ".unreq a_ptr1\n" // drop the assembler aliases so later asm blocks can redefine them
+ ".unreq c_ptr1\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) // read-write operands
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers) // read-only operands; lda/ldc are added directly to the row pointers
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory" // NOTE(review): p0-p3, p6 and p7 are written above but are not in this list - confirm this is intentional for the targeted compilers
+ );
+ break;
+ case 3:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "c_ptr1 .req X2\n"
+ "c_ptr2 .req X3\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "whilelt p6.b, %[temp], %[leftovers]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "ptrue p7.b\n"
+ "whilelt p1.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p2.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p3.s, %[temp], %[width]\n"
+ "cbz %[beta0], 1f\n"
+ "mov z16.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mov z17.s, #0\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mov z18.s, #0\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z19.s, #0\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "mov z20.s, #0\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "mov z21.s, #0\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "mov z22.s, #0\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "mov z23.s, #0\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "mov z24.s, #0\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "mov z25.s, #0\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "mov z26.s, #0\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "mov z27.s, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+ "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+ "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+ "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+ "mul z16.s, p7/m, z16.s, z15.s\n"
+ "ld1w z20.s, p0/z, [c_ptr1]\n"
+ "mul z17.s, p7/m, z17.s, z15.s\n"
+ "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+ "mul z18.s, p7/m, z18.s, z15.s\n"
+ "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+ "mul z19.s, p7/m, z19.s, z15.s\n"
+ "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+ "mul z20.s, p7/m, z20.s, z15.s\n"
+ "ld1w z24.s, p0/z, [c_ptr2]\n"
+ "mul z21.s, p7/m, z21.s, z15.s\n"
+ "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
+ "mul z22.s, p7/m, z22.s, z15.s\n"
+ "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
+ "mul z23.s, p7/m, z23.s, z15.s\n"
+ "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
+ "mul z24.s, p7/m, z24.s, z15.s\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mul z25.s, p7/m, z25.s, z15.s\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mul z26.s, p7/m, z26.s, z15.s\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mul z27.s, p7/m, z27.s, z15.s\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "udot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "udot z24.s, z8.b, z2.b[0]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr2]\n"
+ "udot z21.s, z9.b, z1.b[0]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "udot z25.s, z9.b, z2.b[0]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "udot z22.s, z10.b, z1.b[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "udot z26.s, z10.b, z2.b[0]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ "udot z23.s, z11.b, z1.b[0]\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
+ "udot z27.s, z11.b, z2.b[0]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "udot z20.s, z12.b, z1.b[1]\n"
+ "udot z24.s, z12.b, z2.b[1]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "udot z21.s, z13.b, z1.b[1]\n"
+ "udot z25.s, z13.b, z2.b[1]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "udot z22.s, z14.b, z1.b[1]\n"
+ "udot z26.s, z14.b, z2.b[1]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "udot z23.s, z15.b, z1.b[1]\n"
+ "udot z27.s, z15.b, z2.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "udot z20.s, z8.b, z1.b[2]\n"
+ "udot z24.s, z8.b, z2.b[2]\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "udot z21.s, z9.b, z1.b[2]\n"
+ "udot z25.s, z9.b, z2.b[2]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "udot z22.s, z10.b, z1.b[2]\n"
+ "udot z26.s, z10.b, z2.b[2]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "udot z23.s, z11.b, z1.b[2]\n"
+ "udot z27.s, z11.b, z2.b[2]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "udot z16.s, z12.b, z0.b[3]\n"
+ "udot z20.s, z12.b, z1.b[3]\n"
+ "udot z24.s, z12.b, z2.b[3]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "udot z17.s, z13.b, z0.b[3]\n"
+ "udot z21.s, z13.b, z1.b[3]\n"
+ "udot z25.s, z13.b, z2.b[3]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "udot z18.s, z14.b, z0.b[3]\n"
+ "udot z22.s, z14.b, z1.b[3]\n"
+ "udot z26.s, z14.b, z2.b[3]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "udot z19.s, z15.b, z0.b[3]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
+ "udot z23.s, z15.b, z1.b[3]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n"
+ "udot z27.s, z15.b, z2.b[3]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "udot z16.s, z8.b, z4.b[0]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2, #-0x10]\n"
+ "udot z20.s, z8.b, z5.b[0]\n"
+ "udot z24.s, z8.b, z6.b[0]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "udot z17.s, z9.b, z4.b[0]\n"
+ "udot z21.s, z9.b, z5.b[0]\n"
+ "udot z25.s, z9.b, z6.b[0]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "udot z18.s, z10.b, z4.b[0]\n"
+ "udot z22.s, z10.b, z5.b[0]\n"
+ "udot z26.s, z10.b, z6.b[0]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "udot z19.s, z11.b, z4.b[0]\n"
+ "udot z23.s, z11.b, z5.b[0]\n"
+ "udot z27.s, z11.b, z6.b[0]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "udot z16.s, z12.b, z4.b[1]\n"
+ "udot z20.s, z12.b, z5.b[1]\n"
+ "udot z24.s, z12.b, z6.b[1]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "udot z17.s, z13.b, z4.b[1]\n"
+ "udot z21.s, z13.b, z5.b[1]\n"
+ "udot z25.s, z13.b, z6.b[1]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "udot z18.s, z14.b, z4.b[1]\n"
+ "udot z22.s, z14.b, z5.b[1]\n"
+ "udot z26.s, z14.b, z6.b[1]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "udot z19.s, z15.b, z4.b[1]\n"
+ "udot z23.s, z15.b, z5.b[1]\n"
+ "udot z27.s, z15.b, z6.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "udot z16.s, z8.b, z4.b[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "udot z20.s, z8.b, z5.b[2]\n"
+ "udot z24.s, z8.b, z6.b[2]\n"
+ "udot z17.s, z9.b, z4.b[2]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "udot z21.s, z9.b, z5.b[2]\n"
+ "udot z25.s, z9.b, z6.b[2]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "udot z18.s, z10.b, z4.b[2]\n"
+ "udot z22.s, z10.b, z5.b[2]\n"
+ "udot z26.s, z10.b, z6.b[2]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "udot z19.s, z11.b, z4.b[2]\n"
+ "udot z23.s, z11.b, z5.b[2]\n"
+ "udot z27.s, z11.b, z6.b[2]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "udot z16.s, z12.b, z4.b[3]\n"
+ "udot z20.s, z12.b, z5.b[3]\n"
+ "udot z24.s, z12.b, z6.b[3]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "udot z17.s, z13.b, z4.b[3]\n"
+ "udot z21.s, z13.b, z5.b[3]\n"
+ "udot z25.s, z13.b, z6.b[3]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "udot z18.s, z14.b, z4.b[3]\n"
+ "udot z22.s, z14.b, z5.b[3]\n"
+ "udot z26.s, z14.b, z6.b[3]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "udot z19.s, z15.b, z4.b[3]\n"
+ "udot z23.s, z15.b, z5.b[3]\n"
+ "udot z27.s, z15.b, z6.b[3]\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "cbz %[regs], 4f\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "udot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "udot z24.s, z8.b, z2.b[0]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr2]\n"
+ "udot z21.s, z9.b, z1.b[0]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "udot z25.s, z9.b, z2.b[0]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "udot z22.s, z10.b, z1.b[0]\n"
+ "udot z26.s, z10.b, z2.b[0]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "udot z23.s, z11.b, z1.b[0]\n"
+ "udot z27.s, z11.b, z2.b[0]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "udot z20.s, z12.b, z1.b[1]\n"
+ "udot z24.s, z12.b, z2.b[1]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "udot z21.s, z13.b, z1.b[1]\n"
+ "udot z25.s, z13.b, z2.b[1]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "udot z22.s, z14.b, z1.b[1]\n"
+ "udot z26.s, z14.b, z2.b[1]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "udot z23.s, z15.b, z1.b[1]\n"
+ "udot z27.s, z15.b, z2.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "udot z20.s, z8.b, z1.b[2]\n"
+ "udot z24.s, z8.b, z2.b[2]\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "udot z21.s, z9.b, z1.b[2]\n"
+ "udot z25.s, z9.b, z2.b[2]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "udot z22.s, z10.b, z1.b[2]\n"
+ "udot z26.s, z10.b, z2.b[2]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "udot z23.s, z11.b, z1.b[2]\n"
+ "udot z27.s, z11.b, z2.b[2]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "udot z16.s, z12.b, z0.b[3]\n"
+ "udot z20.s, z12.b, z1.b[3]\n"
+ "udot z24.s, z12.b, z2.b[3]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "udot z17.s, z13.b, z0.b[3]\n"
+ "udot z21.s, z13.b, z1.b[3]\n"
+ "udot z25.s, z13.b, z2.b[3]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "udot z18.s, z14.b, z0.b[3]\n"
+ "udot z22.s, z14.b, z1.b[3]\n"
+ "udot z26.s, z14.b, z2.b[3]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "udot z19.s, z15.b, z0.b[3]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
+ "udot z23.s, z15.b, z1.b[3]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
+ "udot z27.s, z15.b, z2.b[3]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "udot z16.s, z8.b, z4.b[0]\n"
+ "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
+ "udot z20.s, z8.b, z5.b[0]\n"
+ "udot z24.s, z8.b, z6.b[0]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "udot z17.s, z9.b, z4.b[0]\n"
+ "udot z21.s, z9.b, z5.b[0]\n"
+ "udot z25.s, z9.b, z6.b[0]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "udot z18.s, z10.b, z4.b[0]\n"
+ "udot z22.s, z10.b, z5.b[0]\n"
+ "udot z26.s, z10.b, z6.b[0]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "udot z19.s, z11.b, z4.b[0]\n"
+ "udot z23.s, z11.b, z5.b[0]\n"
+ "udot z27.s, z11.b, z6.b[0]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "udot z16.s, z12.b, z4.b[1]\n"
+ "udot z20.s, z12.b, z5.b[1]\n"
+ "udot z24.s, z12.b, z6.b[1]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "udot z17.s, z13.b, z4.b[1]\n"
+ "udot z21.s, z13.b, z5.b[1]\n"
+ "udot z25.s, z13.b, z6.b[1]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "udot z18.s, z14.b, z4.b[1]\n"
+ "udot z22.s, z14.b, z5.b[1]\n"
+ "udot z26.s, z14.b, z6.b[1]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "udot z19.s, z15.b, z4.b[1]\n"
+ "udot z23.s, z15.b, z5.b[1]\n"
+ "udot z27.s, z15.b, z6.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "udot z16.s, z8.b, z4.b[2]\n"
+ "udot z20.s, z8.b, z5.b[2]\n"
+ "udot z24.s, z8.b, z6.b[2]\n"
+ "udot z17.s, z9.b, z4.b[2]\n"
+ "udot z21.s, z9.b, z5.b[2]\n"
+ "udot z25.s, z9.b, z6.b[2]\n"
+ "udot z18.s, z10.b, z4.b[2]\n"
+ "udot z22.s, z10.b, z5.b[2]\n"
+ "udot z26.s, z10.b, z6.b[2]\n"
+ "udot z19.s, z11.b, z4.b[2]\n"
+ "udot z23.s, z11.b, z5.b[2]\n"
+ "udot z27.s, z11.b, z6.b[2]\n"
+ "udot z16.s, z12.b, z4.b[3]\n"
+ "udot z20.s, z12.b, z5.b[3]\n"
+ "udot z24.s, z12.b, z6.b[3]\n"
+ "udot z17.s, z13.b, z4.b[3]\n"
+ "udot z21.s, z13.b, z5.b[3]\n"
+ "udot z25.s, z13.b, z6.b[3]\n"
+ "udot z18.s, z14.b, z4.b[3]\n"
+ "udot z22.s, z14.b, z5.b[3]\n"
+ "udot z26.s, z14.b, z6.b[3]\n"
+ "udot z19.s, z15.b, z4.b[3]\n"
+ "udot z23.s, z15.b, z5.b[3]\n"
+ "udot z27.s, z15.b, z6.b[3]\n"
+ "cbz %[blocks], 5f\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "udot z20.s, z8.b, z1.b[0]\n"
+ "udot z24.s, z8.b, z2.b[0]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "udot z21.s, z9.b, z1.b[0]\n"
+ "udot z25.s, z9.b, z2.b[0]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "udot z22.s, z10.b, z1.b[0]\n"
+ "udot z26.s, z10.b, z2.b[0]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "udot z23.s, z11.b, z1.b[0]\n"
+ "udot z27.s, z11.b, z2.b[0]\n"
+ "b.eq 5f\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "udot z20.s, z12.b, z1.b[1]\n"
+ "udot z24.s, z12.b, z2.b[1]\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "udot z21.s, z13.b, z1.b[1]\n"
+ "udot z25.s, z13.b, z2.b[1]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "udot z22.s, z14.b, z1.b[1]\n"
+ "udot z26.s, z14.b, z2.b[1]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "udot z23.s, z15.b, z1.b[1]\n"
+ "udot z27.s, z15.b, z2.b[1]\n"
+ "b.eq 5f\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "udot z20.s, z8.b, z1.b[2]\n"
+ "udot z24.s, z8.b, z2.b[2]\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "udot z21.s, z9.b, z1.b[2]\n"
+ "udot z25.s, z9.b, z2.b[2]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "udot z22.s, z10.b, z1.b[2]\n"
+ "udot z26.s, z10.b, z2.b[2]\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "udot z23.s, z11.b, z1.b[2]\n"
+ "udot z27.s, z11.b, z2.b[2]\n"
+ "b.eq 5f\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "udot z16.s, z12.b, z0.b[3]\n"
+ "udot z20.s, z12.b, z1.b[3]\n"
+ "udot z24.s, z12.b, z2.b[3]\n"
+ "udot z17.s, z13.b, z0.b[3]\n"
+ "udot z21.s, z13.b, z1.b[3]\n"
+ "udot z25.s, z13.b, z2.b[3]\n"
+ "udot z18.s, z14.b, z0.b[3]\n"
+ "udot z22.s, z14.b, z1.b[3]\n"
+ "udot z26.s, z14.b, z2.b[3]\n"
+ "udot z19.s, z15.b, z0.b[3]\n"
+ "udot z23.s, z15.b, z1.b[3]\n"
+ "udot z27.s, z15.b, z2.b[3]\n"
+ "b 5f\n"
+ "4:\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "udot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
+ "udot z24.s, z8.b, z2.b[0]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr1]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z6.b, p6/z, [a_ptr2]\n"
+ "udot z21.s, z9.b, z1.b[0]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "udot z25.s, z9.b, z2.b[0]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "udot z22.s, z10.b, z1.b[0]\n"
+ "udot z26.s, z10.b, z2.b[0]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "udot z23.s, z11.b, z1.b[0]\n"
+ "udot z27.s, z11.b, z2.b[0]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "udot z20.s, z12.b, z1.b[1]\n"
+ "udot z24.s, z12.b, z2.b[1]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "udot z21.s, z13.b, z1.b[1]\n"
+ "udot z25.s, z13.b, z2.b[1]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "udot z22.s, z14.b, z1.b[1]\n"
+ "udot z26.s, z14.b, z2.b[1]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "udot z23.s, z15.b, z1.b[1]\n"
+ "udot z27.s, z15.b, z2.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "udot z20.s, z8.b, z1.b[2]\n"
+ "udot z24.s, z8.b, z2.b[2]\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "udot z21.s, z9.b, z1.b[2]\n"
+ "udot z25.s, z9.b, z2.b[2]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "udot z22.s, z10.b, z1.b[2]\n"
+ "udot z26.s, z10.b, z2.b[2]\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "udot z23.s, z11.b, z1.b[2]\n"
+ "udot z27.s, z11.b, z2.b[2]\n"
+ "udot z16.s, z12.b, z0.b[3]\n"
+ "udot z20.s, z12.b, z1.b[3]\n"
+ "udot z24.s, z12.b, z2.b[3]\n"
+ "udot z17.s, z13.b, z0.b[3]\n"
+ "udot z21.s, z13.b, z1.b[3]\n"
+ "udot z25.s, z13.b, z2.b[3]\n"
+ "udot z18.s, z14.b, z0.b[3]\n"
+ "udot z22.s, z14.b, z1.b[3]\n"
+ "udot z26.s, z14.b, z2.b[3]\n"
+ "udot z19.s, z15.b, z0.b[3]\n"
+ "udot z23.s, z15.b, z1.b[3]\n"
+ "udot z27.s, z15.b, z2.b[3]\n"
+ "cbz %[blocks], 5f\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "udot z16.s, z8.b, z4.b[0]\n"
+ "udot z20.s, z8.b, z5.b[0]\n"
+ "udot z24.s, z8.b, z6.b[0]\n"
+ "udot z17.s, z9.b, z4.b[0]\n"
+ "udot z21.s, z9.b, z5.b[0]\n"
+ "udot z25.s, z9.b, z6.b[0]\n"
+ "udot z18.s, z10.b, z4.b[0]\n"
+ "udot z22.s, z10.b, z5.b[0]\n"
+ "udot z26.s, z10.b, z6.b[0]\n"
+ "udot z19.s, z11.b, z4.b[0]\n"
+ "udot z23.s, z11.b, z5.b[0]\n"
+ "udot z27.s, z11.b, z6.b[0]\n"
+ "b.eq 5f\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "udot z16.s, z12.b, z4.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "udot z20.s, z12.b, z5.b[1]\n"
+ "udot z24.s, z12.b, z6.b[1]\n"
+ "udot z17.s, z13.b, z4.b[1]\n"
+ "udot z21.s, z13.b, z5.b[1]\n"
+ "udot z25.s, z13.b, z6.b[1]\n"
+ "udot z18.s, z14.b, z4.b[1]\n"
+ "udot z22.s, z14.b, z5.b[1]\n"
+ "udot z26.s, z14.b, z6.b[1]\n"
+ "udot z19.s, z15.b, z4.b[1]\n"
+ "udot z23.s, z15.b, z5.b[1]\n"
+ "udot z27.s, z15.b, z6.b[1]\n"
+ "b.eq 5f\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "udot z16.s, z8.b, z4.b[2]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "udot z20.s, z8.b, z5.b[2]\n"
+ "udot z24.s, z8.b, z6.b[2]\n"
+ "udot z17.s, z9.b, z4.b[2]\n"
+ "udot z21.s, z9.b, z5.b[2]\n"
+ "udot z25.s, z9.b, z6.b[2]\n"
+ "udot z18.s, z10.b, z4.b[2]\n"
+ "udot z22.s, z10.b, z5.b[2]\n"
+ "udot z26.s, z10.b, z6.b[2]\n"
+ "udot z19.s, z11.b, z4.b[2]\n"
+ "udot z23.s, z11.b, z5.b[2]\n"
+ "udot z27.s, z11.b, z6.b[2]\n"
+ "b.eq 5f\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "udot z16.s, z12.b, z4.b[3]\n"
+ "udot z20.s, z12.b, z5.b[3]\n"
+ "udot z24.s, z12.b, z6.b[3]\n"
+ "udot z17.s, z13.b, z4.b[3]\n"
+ "udot z21.s, z13.b, z5.b[3]\n"
+ "udot z25.s, z13.b, z6.b[3]\n"
+ "udot z18.s, z14.b, z4.b[3]\n"
+ "udot z22.s, z14.b, z5.b[3]\n"
+ "udot z26.s, z14.b, z6.b[3]\n"
+ "udot z19.s, z15.b, z4.b[3]\n"
+ "udot z23.s, z15.b, z5.b[3]\n"
+ "udot z27.s, z15.b, z6.b[3]\n"
+ "5:\n"
+ "st1w z16.s, p0, [%[c_ptr0]]\n"
+ "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+ "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+ "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #4\n"
+ "st1w z20.s, p0, [c_ptr1]\n"
+ "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
+ "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
+ "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
+ "st1w z24.s, p0, [c_ptr2]\n"
+ "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n"
+ "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n"
+ "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers)
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory"
+ );
+ break;
+ default:
+ case 4:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "whilelt p6.b, %[temp], %[leftovers]\n"
+ "whilelt p0.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "ptrue p7.b\n"
+ "whilelt p1.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p2.s, %[temp], %[width]\n"
+ "incw %[temp], all, mul #1\n"
+ "whilelt p3.s, %[temp], %[width]\n"
+ "cbz %[beta0], 1f\n"
+ "mov z16.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mov z17.s, #0\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mov z18.s, #0\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z19.s, #0\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "mov z20.s, #0\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "mov z21.s, #0\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "mov z22.s, #0\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "mov z23.s, #0\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "mov z24.s, #0\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "mov z25.s, #0\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "mov z26.s, #0\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "mov z27.s, #0\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "mov z28.s, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "mov z29.s, #0\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "mov z30.s, #0\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "mov z31.s, #0\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+ "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+ "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+ "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+ "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+ "mul z16.s, p7/m, z16.s, z15.s\n"
+ "ld1w z20.s, p0/z, [c_ptr1]\n"
+ "mul z17.s, p7/m, z17.s, z15.s\n"
+ "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+ "mul z18.s, p7/m, z18.s, z15.s\n"
+ "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+ "mul z19.s, p7/m, z19.s, z15.s\n"
+ "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+ "mul z20.s, p7/m, z20.s, z15.s\n"
+ "ld1w z24.s, p0/z, [c_ptr2]\n"
+ "mul z21.s, p7/m, z21.s, z15.s\n"
+ "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
+ "mul z22.s, p7/m, z22.s, z15.s\n"
+ "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
+ "mul z23.s, p7/m, z23.s, z15.s\n"
+ "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
+ "mul z24.s, p7/m, z24.s, z15.s\n"
+ "ld1w z28.s, p0/z, [c_ptr3]\n"
+ "mul z25.s, p7/m, z25.s, z15.s\n"
+ "ld1w z29.s, p1/z, [c_ptr3, #1, MUL VL]\n"
+ "mul z26.s, p7/m, z26.s, z15.s\n"
+ "ld1w z30.s, p2/z, [c_ptr3, #2, MUL VL]\n"
+ "mul z27.s, p7/m, z27.s, z15.s\n"
+ "ld1w z31.s, p3/z, [c_ptr3, #3, MUL VL]\n"
+ "mul z28.s, p7/m, z28.s, z15.s\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mul z29.s, p7/m, z29.s, z15.s\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mul z30.s, p7/m, z30.s, z15.s\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mul z31.s, p7/m, z31.s, z15.s\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "udot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "udot z24.s, z8.b, z2.b[0]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+ "udot z28.s, z8.b, z3.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr2]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr3]\n"
+ "udot z21.s, z9.b, z1.b[0]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "udot z25.s, z9.b, z2.b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "udot z29.s, z9.b, z3.b[0]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "udot z22.s, z10.b, z1.b[0]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ "udot z26.s, z10.b, z2.b[0]\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
+ "udot z30.s, z10.b, z3.b[0]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "add a_ptr3, a_ptr3, #0x20\n"
+ "udot z23.s, z11.b, z1.b[0]\n"
+ "udot z27.s, z11.b, z2.b[0]\n"
+ "udot z31.s, z11.b, z3.b[0]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "udot z20.s, z12.b, z1.b[1]\n"
+ "udot z24.s, z12.b, z2.b[1]\n"
+ "udot z28.s, z12.b, z3.b[1]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "udot z21.s, z13.b, z1.b[1]\n"
+ "udot z25.s, z13.b, z2.b[1]\n"
+ "udot z29.s, z13.b, z3.b[1]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "udot z22.s, z14.b, z1.b[1]\n"
+ "udot z26.s, z14.b, z2.b[1]\n"
+ "udot z30.s, z14.b, z3.b[1]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "udot z23.s, z15.b, z1.b[1]\n"
+ "udot z27.s, z15.b, z2.b[1]\n"
+ "udot z31.s, z15.b, z3.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "udot z20.s, z8.b, z1.b[2]\n"
+ "udot z24.s, z8.b, z2.b[2]\n"
+ "udot z28.s, z8.b, z3.b[2]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "udot z21.s, z9.b, z1.b[2]\n"
+ "udot z25.s, z9.b, z2.b[2]\n"
+ "udot z29.s, z9.b, z3.b[2]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "udot z22.s, z10.b, z1.b[2]\n"
+ "udot z26.s, z10.b, z2.b[2]\n"
+ "udot z30.s, z10.b, z3.b[2]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "udot z23.s, z11.b, z1.b[2]\n"
+ "udot z27.s, z11.b, z2.b[2]\n"
+ "udot z31.s, z11.b, z3.b[2]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "udot z16.s, z12.b, z0.b[3]\n"
+ "udot z20.s, z12.b, z1.b[3]\n"
+ "udot z24.s, z12.b, z2.b[3]\n"
+ "udot z28.s, z12.b, z3.b[3]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "udot z17.s, z13.b, z0.b[3]\n"
+ "udot z21.s, z13.b, z1.b[3]\n"
+ "udot z25.s, z13.b, z2.b[3]\n"
+ "udot z29.s, z13.b, z3.b[3]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "udot z18.s, z14.b, z0.b[3]\n"
+ "udot z22.s, z14.b, z1.b[3]\n"
+ "udot z26.s, z14.b, z2.b[3]\n"
+ "udot z30.s, z14.b, z3.b[3]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "udot z19.s, z15.b, z0.b[3]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
+ "udot z23.s, z15.b, z1.b[3]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n"
+ "udot z27.s, z15.b, z2.b[3]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2, #-0x10]\n"
+ "udot z31.s, z15.b, z3.b[3]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "udot z16.s, z8.b, z4.b[0]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3, #-0x10]\n"
+ "udot z20.s, z8.b, z5.b[0]\n"
+ "udot z24.s, z8.b, z6.b[0]\n"
+ "udot z28.s, z8.b, z7.b[0]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "udot z17.s, z9.b, z4.b[0]\n"
+ "udot z21.s, z9.b, z5.b[0]\n"
+ "udot z25.s, z9.b, z6.b[0]\n"
+ "udot z29.s, z9.b, z7.b[0]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "udot z18.s, z10.b, z4.b[0]\n"
+ "udot z22.s, z10.b, z5.b[0]\n"
+ "udot z26.s, z10.b, z6.b[0]\n"
+ "udot z30.s, z10.b, z7.b[0]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "udot z19.s, z11.b, z4.b[0]\n"
+ "udot z23.s, z11.b, z5.b[0]\n"
+ "udot z27.s, z11.b, z6.b[0]\n"
+ "udot z31.s, z11.b, z7.b[0]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "udot z16.s, z12.b, z4.b[1]\n"
+ "udot z20.s, z12.b, z5.b[1]\n"
+ "udot z24.s, z12.b, z6.b[1]\n"
+ "udot z28.s, z12.b, z7.b[1]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "udot z17.s, z13.b, z4.b[1]\n"
+ "udot z21.s, z13.b, z5.b[1]\n"
+ "udot z25.s, z13.b, z6.b[1]\n"
+ "udot z29.s, z13.b, z7.b[1]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "udot z18.s, z14.b, z4.b[1]\n"
+ "udot z22.s, z14.b, z5.b[1]\n"
+ "udot z26.s, z14.b, z6.b[1]\n"
+ "udot z30.s, z14.b, z7.b[1]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "udot z19.s, z15.b, z4.b[1]\n"
+ "udot z23.s, z15.b, z5.b[1]\n"
+ "udot z27.s, z15.b, z6.b[1]\n"
+ "udot z31.s, z15.b, z7.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "udot z16.s, z8.b, z4.b[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "udot z20.s, z8.b, z5.b[2]\n"
+ "udot z24.s, z8.b, z6.b[2]\n"
+ "udot z28.s, z8.b, z7.b[2]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "udot z17.s, z9.b, z4.b[2]\n"
+ "udot z21.s, z9.b, z5.b[2]\n"
+ "udot z25.s, z9.b, z6.b[2]\n"
+ "udot z29.s, z9.b, z7.b[2]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "udot z18.s, z10.b, z4.b[2]\n"
+ "udot z22.s, z10.b, z5.b[2]\n"
+ "udot z26.s, z10.b, z6.b[2]\n"
+ "udot z30.s, z10.b, z7.b[2]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "udot z19.s, z11.b, z4.b[2]\n"
+ "udot z23.s, z11.b, z5.b[2]\n"
+ "udot z27.s, z11.b, z6.b[2]\n"
+ "udot z31.s, z11.b, z7.b[2]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "udot z16.s, z12.b, z4.b[3]\n"
+ "udot z20.s, z12.b, z5.b[3]\n"
+ "udot z24.s, z12.b, z6.b[3]\n"
+ "udot z28.s, z12.b, z7.b[3]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "udot z17.s, z13.b, z4.b[3]\n"
+ "udot z21.s, z13.b, z5.b[3]\n"
+ "udot z25.s, z13.b, z6.b[3]\n"
+ "udot z29.s, z13.b, z7.b[3]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "udot z18.s, z14.b, z4.b[3]\n"
+ "udot z22.s, z14.b, z5.b[3]\n"
+ "udot z26.s, z14.b, z6.b[3]\n"
+ "udot z30.s, z14.b, z7.b[3]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "udot z19.s, z15.b, z4.b[3]\n"
+ "udot z23.s, z15.b, z5.b[3]\n"
+ "udot z27.s, z15.b, z6.b[3]\n"
+ "udot z31.s, z15.b, z7.b[3]\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "cbz %[regs], 4f\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "udot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "udot z24.s, z8.b, z2.b[0]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+ "udot z28.s, z8.b, z3.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr2]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr3]\n"
+ "udot z21.s, z9.b, z1.b[0]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "udot z25.s, z9.b, z2.b[0]\n"
+ "udot z29.s, z9.b, z3.b[0]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "udot z22.s, z10.b, z1.b[0]\n"
+ "udot z26.s, z10.b, z2.b[0]\n"
+ "udot z30.s, z10.b, z3.b[0]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "udot z23.s, z11.b, z1.b[0]\n"
+ "udot z27.s, z11.b, z2.b[0]\n"
+ "udot z31.s, z11.b, z3.b[0]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "udot z20.s, z12.b, z1.b[1]\n"
+ "udot z24.s, z12.b, z2.b[1]\n"
+ "udot z28.s, z12.b, z3.b[1]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "udot z21.s, z13.b, z1.b[1]\n"
+ "udot z25.s, z13.b, z2.b[1]\n"
+ "udot z29.s, z13.b, z3.b[1]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "udot z22.s, z14.b, z1.b[1]\n"
+ "udot z26.s, z14.b, z2.b[1]\n"
+ "udot z30.s, z14.b, z3.b[1]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "udot z23.s, z15.b, z1.b[1]\n"
+ "udot z27.s, z15.b, z2.b[1]\n"
+ "udot z31.s, z15.b, z3.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "udot z20.s, z8.b, z1.b[2]\n"
+ "udot z24.s, z8.b, z2.b[2]\n"
+ "udot z28.s, z8.b, z3.b[2]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "udot z21.s, z9.b, z1.b[2]\n"
+ "udot z25.s, z9.b, z2.b[2]\n"
+ "udot z29.s, z9.b, z3.b[2]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "udot z22.s, z10.b, z1.b[2]\n"
+ "udot z26.s, z10.b, z2.b[2]\n"
+ "udot z30.s, z10.b, z3.b[2]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "udot z23.s, z11.b, z1.b[2]\n"
+ "udot z27.s, z11.b, z2.b[2]\n"
+ "udot z31.s, z11.b, z3.b[2]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "udot z16.s, z12.b, z0.b[3]\n"
+ "udot z20.s, z12.b, z1.b[3]\n"
+ "udot z24.s, z12.b, z2.b[3]\n"
+ "udot z28.s, z12.b, z3.b[3]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "udot z17.s, z13.b, z0.b[3]\n"
+ "udot z21.s, z13.b, z1.b[3]\n"
+ "udot z25.s, z13.b, z2.b[3]\n"
+ "udot z29.s, z13.b, z3.b[3]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "udot z18.s, z14.b, z0.b[3]\n"
+ "udot z22.s, z14.b, z1.b[3]\n"
+ "udot z26.s, z14.b, z2.b[3]\n"
+ "udot z30.s, z14.b, z3.b[3]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "udot z19.s, z15.b, z0.b[3]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
+ "udot z23.s, z15.b, z1.b[3]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
+ "udot z27.s, z15.b, z2.b[3]\n"
+ "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
+ "udot z31.s, z15.b, z3.b[3]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "udot z16.s, z8.b, z4.b[0]\n"
+ "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n"
+ "udot z20.s, z8.b, z5.b[0]\n"
+ "udot z24.s, z8.b, z6.b[0]\n"
+ "udot z28.s, z8.b, z7.b[0]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "udot z17.s, z9.b, z4.b[0]\n"
+ "udot z21.s, z9.b, z5.b[0]\n"
+ "udot z25.s, z9.b, z6.b[0]\n"
+ "udot z29.s, z9.b, z7.b[0]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "udot z18.s, z10.b, z4.b[0]\n"
+ "udot z22.s, z10.b, z5.b[0]\n"
+ "udot z26.s, z10.b, z6.b[0]\n"
+ "udot z30.s, z10.b, z7.b[0]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "udot z19.s, z11.b, z4.b[0]\n"
+ "udot z23.s, z11.b, z5.b[0]\n"
+ "udot z27.s, z11.b, z6.b[0]\n"
+ "udot z31.s, z11.b, z7.b[0]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "udot z16.s, z12.b, z4.b[1]\n"
+ "udot z20.s, z12.b, z5.b[1]\n"
+ "udot z24.s, z12.b, z6.b[1]\n"
+ "udot z28.s, z12.b, z7.b[1]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "udot z17.s, z13.b, z4.b[1]\n"
+ "udot z21.s, z13.b, z5.b[1]\n"
+ "udot z25.s, z13.b, z6.b[1]\n"
+ "udot z29.s, z13.b, z7.b[1]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "udot z18.s, z14.b, z4.b[1]\n"
+ "udot z22.s, z14.b, z5.b[1]\n"
+ "udot z26.s, z14.b, z6.b[1]\n"
+ "udot z30.s, z14.b, z7.b[1]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "udot z19.s, z15.b, z4.b[1]\n"
+ "udot z23.s, z15.b, z5.b[1]\n"
+ "udot z27.s, z15.b, z6.b[1]\n"
+ "udot z31.s, z15.b, z7.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "udot z16.s, z8.b, z4.b[2]\n"
+ "udot z20.s, z8.b, z5.b[2]\n"
+ "udot z24.s, z8.b, z6.b[2]\n"
+ "udot z28.s, z8.b, z7.b[2]\n"
+ "udot z17.s, z9.b, z4.b[2]\n"
+ "udot z21.s, z9.b, z5.b[2]\n"
+ "udot z25.s, z9.b, z6.b[2]\n"
+ "udot z29.s, z9.b, z7.b[2]\n"
+ "udot z18.s, z10.b, z4.b[2]\n"
+ "udot z22.s, z10.b, z5.b[2]\n"
+ "udot z26.s, z10.b, z6.b[2]\n"
+ "udot z30.s, z10.b, z7.b[2]\n"
+ "udot z19.s, z11.b, z4.b[2]\n"
+ "udot z23.s, z11.b, z5.b[2]\n"
+ "udot z27.s, z11.b, z6.b[2]\n"
+ "udot z31.s, z11.b, z7.b[2]\n"
+ "udot z16.s, z12.b, z4.b[3]\n"
+ "udot z20.s, z12.b, z5.b[3]\n"
+ "udot z24.s, z12.b, z6.b[3]\n"
+ "udot z28.s, z12.b, z7.b[3]\n"
+ "udot z17.s, z13.b, z4.b[3]\n"
+ "udot z21.s, z13.b, z5.b[3]\n"
+ "udot z25.s, z13.b, z6.b[3]\n"
+ "udot z29.s, z13.b, z7.b[3]\n"
+ "udot z18.s, z14.b, z4.b[3]\n"
+ "udot z22.s, z14.b, z5.b[3]\n"
+ "udot z26.s, z14.b, z6.b[3]\n"
+ "udot z30.s, z14.b, z7.b[3]\n"
+ "udot z19.s, z15.b, z4.b[3]\n"
+ "udot z23.s, z15.b, z5.b[3]\n"
+ "udot z27.s, z15.b, z6.b[3]\n"
+ "udot z31.s, z15.b, z7.b[3]\n"
+ "cbz %[blocks], 5f\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "udot z20.s, z8.b, z1.b[0]\n"
+ "udot z24.s, z8.b, z2.b[0]\n"
+ "udot z28.s, z8.b, z3.b[0]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "udot z21.s, z9.b, z1.b[0]\n"
+ "udot z25.s, z9.b, z2.b[0]\n"
+ "udot z29.s, z9.b, z3.b[0]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "udot z22.s, z10.b, z1.b[0]\n"
+ "udot z26.s, z10.b, z2.b[0]\n"
+ "udot z30.s, z10.b, z3.b[0]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "udot z23.s, z11.b, z1.b[0]\n"
+ "udot z27.s, z11.b, z2.b[0]\n"
+ "udot z31.s, z11.b, z3.b[0]\n"
+ "b.eq 5f\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "udot z20.s, z12.b, z1.b[1]\n"
+ "udot z24.s, z12.b, z2.b[1]\n"
+ "udot z28.s, z12.b, z3.b[1]\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "udot z21.s, z13.b, z1.b[1]\n"
+ "udot z25.s, z13.b, z2.b[1]\n"
+ "udot z29.s, z13.b, z3.b[1]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "udot z22.s, z14.b, z1.b[1]\n"
+ "udot z26.s, z14.b, z2.b[1]\n"
+ "udot z30.s, z14.b, z3.b[1]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "udot z23.s, z15.b, z1.b[1]\n"
+ "udot z27.s, z15.b, z2.b[1]\n"
+ "udot z31.s, z15.b, z3.b[1]\n"
+ "b.eq 5f\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "udot z20.s, z8.b, z1.b[2]\n"
+ "udot z24.s, z8.b, z2.b[2]\n"
+ "udot z28.s, z8.b, z3.b[2]\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "udot z21.s, z9.b, z1.b[2]\n"
+ "udot z25.s, z9.b, z2.b[2]\n"
+ "udot z29.s, z9.b, z3.b[2]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "udot z22.s, z10.b, z1.b[2]\n"
+ "udot z26.s, z10.b, z2.b[2]\n"
+ "udot z30.s, z10.b, z3.b[2]\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "udot z23.s, z11.b, z1.b[2]\n"
+ "udot z27.s, z11.b, z2.b[2]\n"
+ "udot z31.s, z11.b, z3.b[2]\n"
+ "b.eq 5f\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "udot z16.s, z12.b, z0.b[3]\n"
+ "udot z20.s, z12.b, z1.b[3]\n"
+ "udot z24.s, z12.b, z2.b[3]\n"
+ "udot z28.s, z12.b, z3.b[3]\n"
+ "udot z17.s, z13.b, z0.b[3]\n"
+ "udot z21.s, z13.b, z1.b[3]\n"
+ "udot z25.s, z13.b, z2.b[3]\n"
+ "udot z29.s, z13.b, z3.b[3]\n"
+ "udot z18.s, z14.b, z0.b[3]\n"
+ "udot z22.s, z14.b, z1.b[3]\n"
+ "udot z26.s, z14.b, z2.b[3]\n"
+ "udot z30.s, z14.b, z3.b[3]\n"
+ "udot z19.s, z15.b, z0.b[3]\n"
+ "udot z23.s, z15.b, z1.b[3]\n"
+ "udot z27.s, z15.b, z2.b[3]\n"
+ "udot z31.s, z15.b, z3.b[3]\n"
+ "b 5f\n"
+ "4:\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "udot z20.s, z8.b, z1.b[0]\n"
+ "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
+ "udot z24.s, z8.b, z2.b[0]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr1]\n"
+ "udot z28.s, z8.b, z3.b[0]\n"
+ "ld1rqb z6.b, p6/z, [a_ptr2]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "ld1rqb z7.b, p6/z, [a_ptr3]\n"
+ "udot z21.s, z9.b, z1.b[0]\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "udot z25.s, z9.b, z2.b[0]\n"
+ "udot z29.s, z9.b, z3.b[0]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "udot z22.s, z10.b, z1.b[0]\n"
+ "udot z26.s, z10.b, z2.b[0]\n"
+ "udot z30.s, z10.b, z3.b[0]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
+ "udot z23.s, z11.b, z1.b[0]\n"
+ "udot z27.s, z11.b, z2.b[0]\n"
+ "udot z31.s, z11.b, z3.b[0]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
+ "udot z20.s, z12.b, z1.b[1]\n"
+ "udot z24.s, z12.b, z2.b[1]\n"
+ "udot z28.s, z12.b, z3.b[1]\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "udot z17.s, z13.b, z0.b[1]\n"
+ "udot z21.s, z13.b, z1.b[1]\n"
+ "udot z25.s, z13.b, z2.b[1]\n"
+ "udot z29.s, z13.b, z3.b[1]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "udot z22.s, z14.b, z1.b[1]\n"
+ "udot z26.s, z14.b, z2.b[1]\n"
+ "udot z30.s, z14.b, z3.b[1]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "udot z23.s, z15.b, z1.b[1]\n"
+ "udot z27.s, z15.b, z2.b[1]\n"
+ "udot z31.s, z15.b, z3.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "udot z20.s, z8.b, z1.b[2]\n"
+ "udot z24.s, z8.b, z2.b[2]\n"
+ "udot z28.s, z8.b, z3.b[2]\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "udot z21.s, z9.b, z1.b[2]\n"
+ "udot z25.s, z9.b, z2.b[2]\n"
+ "udot z29.s, z9.b, z3.b[2]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "udot z22.s, z10.b, z1.b[2]\n"
+ "udot z26.s, z10.b, z2.b[2]\n"
+ "udot z30.s, z10.b, z3.b[2]\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "udot z23.s, z11.b, z1.b[2]\n"
+ "udot z27.s, z11.b, z2.b[2]\n"
+ "udot z31.s, z11.b, z3.b[2]\n"
+ "udot z16.s, z12.b, z0.b[3]\n"
+ "udot z20.s, z12.b, z1.b[3]\n"
+ "udot z24.s, z12.b, z2.b[3]\n"
+ "udot z28.s, z12.b, z3.b[3]\n"
+ "udot z17.s, z13.b, z0.b[3]\n"
+ "udot z21.s, z13.b, z1.b[3]\n"
+ "udot z25.s, z13.b, z2.b[3]\n"
+ "udot z29.s, z13.b, z3.b[3]\n"
+ "udot z18.s, z14.b, z0.b[3]\n"
+ "udot z22.s, z14.b, z1.b[3]\n"
+ "udot z26.s, z14.b, z2.b[3]\n"
+ "udot z30.s, z14.b, z3.b[3]\n"
+ "udot z19.s, z15.b, z0.b[3]\n"
+ "udot z23.s, z15.b, z1.b[3]\n"
+ "udot z27.s, z15.b, z2.b[3]\n"
+ "udot z31.s, z15.b, z3.b[3]\n"
+ "cbz %[blocks], 5f\n"
+ "addvl %[b_ptr0], %[b_ptr0], #16\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+ "udot z16.s, z8.b, z4.b[0]\n"
+ "udot z20.s, z8.b, z5.b[0]\n"
+ "udot z24.s, z8.b, z6.b[0]\n"
+ "udot z28.s, z8.b, z7.b[0]\n"
+ "udot z17.s, z9.b, z4.b[0]\n"
+ "udot z21.s, z9.b, z5.b[0]\n"
+ "udot z25.s, z9.b, z6.b[0]\n"
+ "udot z29.s, z9.b, z7.b[0]\n"
+ "udot z18.s, z10.b, z4.b[0]\n"
+ "udot z22.s, z10.b, z5.b[0]\n"
+ "udot z26.s, z10.b, z6.b[0]\n"
+ "udot z30.s, z10.b, z7.b[0]\n"
+ "udot z19.s, z11.b, z4.b[0]\n"
+ "udot z23.s, z11.b, z5.b[0]\n"
+ "udot z27.s, z11.b, z6.b[0]\n"
+ "udot z31.s, z11.b, z7.b[0]\n"
+ "b.eq 5f\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+ "udot z16.s, z12.b, z4.b[1]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+ "udot z20.s, z12.b, z5.b[1]\n"
+ "udot z24.s, z12.b, z6.b[1]\n"
+ "udot z28.s, z12.b, z7.b[1]\n"
+ "udot z17.s, z13.b, z4.b[1]\n"
+ "udot z21.s, z13.b, z5.b[1]\n"
+ "udot z25.s, z13.b, z6.b[1]\n"
+ "udot z29.s, z13.b, z7.b[1]\n"
+ "udot z18.s, z14.b, z4.b[1]\n"
+ "udot z22.s, z14.b, z5.b[1]\n"
+ "udot z26.s, z14.b, z6.b[1]\n"
+ "udot z30.s, z14.b, z7.b[1]\n"
+ "udot z19.s, z15.b, z4.b[1]\n"
+ "udot z23.s, z15.b, z5.b[1]\n"
+ "udot z27.s, z15.b, z6.b[1]\n"
+ "udot z31.s, z15.b, z7.b[1]\n"
+ "b.eq 5f\n"
+ "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "udot z16.s, z8.b, z4.b[2]\n"
+ "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "udot z20.s, z8.b, z5.b[2]\n"
+ "udot z24.s, z8.b, z6.b[2]\n"
+ "udot z28.s, z8.b, z7.b[2]\n"
+ "udot z17.s, z9.b, z4.b[2]\n"
+ "udot z21.s, z9.b, z5.b[2]\n"
+ "udot z25.s, z9.b, z6.b[2]\n"
+ "udot z29.s, z9.b, z7.b[2]\n"
+ "udot z18.s, z10.b, z4.b[2]\n"
+ "udot z22.s, z10.b, z5.b[2]\n"
+ "udot z26.s, z10.b, z6.b[2]\n"
+ "udot z30.s, z10.b, z7.b[2]\n"
+ "udot z19.s, z11.b, z4.b[2]\n"
+ "udot z23.s, z11.b, z5.b[2]\n"
+ "udot z27.s, z11.b, z6.b[2]\n"
+ "udot z31.s, z11.b, z7.b[2]\n"
+ "b.eq 5f\n"
+ "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "udot z16.s, z12.b, z4.b[3]\n"
+ "udot z20.s, z12.b, z5.b[3]\n"
+ "udot z24.s, z12.b, z6.b[3]\n"
+ "udot z28.s, z12.b, z7.b[3]\n"
+ "udot z17.s, z13.b, z4.b[3]\n"
+ "udot z21.s, z13.b, z5.b[3]\n"
+ "udot z25.s, z13.b, z6.b[3]\n"
+ "udot z29.s, z13.b, z7.b[3]\n"
+ "udot z18.s, z14.b, z4.b[3]\n"
+ "udot z22.s, z14.b, z5.b[3]\n"
+ "udot z26.s, z14.b, z6.b[3]\n"
+ "udot z30.s, z14.b, z7.b[3]\n"
+ "udot z19.s, z15.b, z4.b[3]\n"
+ "udot z23.s, z15.b, z5.b[3]\n"
+ "udot z27.s, z15.b, z6.b[3]\n"
+ "udot z31.s, z15.b, z7.b[3]\n"
+ "5:\n"
+ "st1w z16.s, p0, [%[c_ptr0]]\n"
+ "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+ "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+ "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #4\n"
+ "st1w z20.s, p0, [c_ptr1]\n"
+ "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
+ "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
+ "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
+ "st1w z24.s, p0, [c_ptr2]\n"
+ "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n"
+ "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n"
+ "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n"
+ "st1w z28.s, p0, [c_ptr3]\n"
+ "st1w z29.s, p1, [c_ptr3, #1, MUL VL]\n"
+ "st1w z30.s, p2, [c_ptr3, #2, MUL VL]\n"
+ "st1w z31.s, p3, [c_ptr3, #3, MUL VL]\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers)
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
+ );
+ break;
+ }
+ }
+ }
+}
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8/generic.cpp
index f4d33a9efa..8228df4a0f 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 Arm Limited.
+ * Copyright (c) 2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp16_mla_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp16_mla_4VLx4.hpp
new file mode 100644
index 0000000000..6cce601dcc
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp16_mla_4VLx4.hpp
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __ARM_FEATURE_SVE
+
+
+
+
+namespace arm_gemm
+{
+
+// Actual kernel implementation (defined in sve_native_fp16_mla_4VLx4/generic.cpp). Arguments in order: A, lda, B, ldb, C, ldc, beta, M, N, K.
+void sve_native_fp16_mla_4VLx4(const __fp16 *, int, const __fp16 *, int ldb, __fp16 *, int, __fp16, int, int, int);
+// Describes the SVE native FP16 MLA kernel: operand/result element types, kernel function pointer type and blocking parameters.
+class native_fp16_mla_4VLx4
+{
+public:
+ typedef __fp16 operand_type;
+ typedef __fp16 result_type;
+ // Function pointer type with the same signature as sve_native_fp16_mla_4VLx4 declared above.
+ typedef void (*kern_type)(const __fp16 *, int, const __fp16 *, int ldb, __fp16 *, int, __fp16, int, int, int);
+
+ /* Kernel blocking parameters */
+ static unsigned int out_height()
+ {
+ return 4; // same value as the row step (y+=4) of the outer loop in generic.cpp
+ }
+ // Four times get_vector_length<__fp16>(); generic.cpp advances its column loop by the same amount. NOTE(review): presumably elements per SVE vector - confirm in utils.hpp.
+ static unsigned int out_width()
+ {
+ return get_vector_length<__fp16>() * 4;
+ }
+ // Reports an unroll factor of 1 along K. NOTE(review): presumably means callers need not pad K - confirm against the users of k_unroll().
+ static unsigned int k_unroll()
+ {
+ return 1;
+ }
+
+
+
+ // Default to the generic kernel
+ kern_type kernel=sve_native_fp16_mla_4VLx4;
+ // ci is not used: the constructor body is empty, so 'kernel' keeps the generic implementation assigned above.
+ native_fp16_mla_4VLx4(const CPUInfo *ci)
+ {
+
+ }
+};
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp16_mla_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp16_mla_4VLx4/generic.cpp
new file mode 100644
index 0000000000..f1aaeb13ee
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp16_mla_4VLx4/generic.cpp
@@ -0,0 +1,3821 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+
+#include <algorithm>
+
+
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void sve_native_fp16_mla_4VLx4(const __fp16 *A, int lda, const __fp16 *B, int ldb, __fp16 *C, int ldc, __fp16 beta, int M, int N, int K) {
+ const long beta0 = (beta == 0.0f);
+ const long loops_count = ((K + 8) / 16) - 1;
+ K -= loops_count * 16;
+ const long regs_count = (K / 8) - 1;
+ K -= (regs_count + 1) * 8;
+ const long leftovers = K;
+
+ for (int y=0; y<M; y+=4) {
+ const __fp16 * const a_ptr0_base = A + (y * lda);
+ const unsigned long ldab = lda * sizeof(__fp16);
+
+ __fp16 *c_ptr0 = C + (y * ldc);
+ const unsigned long ldcb = ldc * sizeof(__fp16);
+
+ for (int x0=0; x0<N; x0+=(4 * get_vector_length<__fp16>())) {
+ const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<__fp16>()));
+ const __fp16 *betaptr = &beta;
+ long loops = loops_count;
+ long regs = regs_count;
+ long temp = 0;
+ long blocks = leftovers;
+ const __fp16 *a_ptr0 = a_ptr0_base;
+ const __fp16 *b_ptr0 = B + x0;
+ long ldbb = ldb * sizeof(__fp16);
+
+ switch(M-y) {
+ case 1:
+ __asm __volatile (
+ "whilelt p6.h, %[temp], %[leftovers]\n"
+ "whilelt p0.h, %[temp], %[width]\n"
+ "inch %[temp], all, mul #1\n"
+ "ptrue p7.h\n"
+ "whilelt p1.h, %[temp], %[width]\n"
+ "inch %[temp], all, mul #1\n"
+ "whilelt p2.h, %[temp], %[width]\n"
+ "inch %[temp], all, mul #1\n"
+ "whilelt p3.h, %[temp], %[width]\n"
+ "cbz %[beta0], 1f\n"
+ "mov z16.h, #0\n"
+ "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
+ "mov z17.h, #0\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "mov z18.h, #0\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "mov z19.h, #0\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1rh z15.h, p7/z, [%[betaptr]]\n"
+ "ld1h z16.h, p0/z, [%[c_ptr0]]\n"
+ "ld1h z17.h, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+ "ld1h z18.h, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+ "ld1h z19.h, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+ "fmul z16.h, p7/m, z16.h, z15.h\n"
+ "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
+ "fmul z17.h, p7/m, z17.h, z15.h\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmul z18.h, p7/m, z18.h, z15.h\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmul z19.h, p7/m, z19.h, z15.h\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ "fmla z16.h, z8.h, z0.h[0]\n"
+ "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z18.h, z10.h, z0.h[0]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z19.h, z11.h, z0.h[0]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "fmla z16.h, z12.h, z0.h[1]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z17.h, z13.h, z0.h[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z18.h, z14.h, z0.h[1]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z19.h, z15.h, z0.h[1]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "fmla z16.h, z8.h, z0.h[2]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z17.h, z9.h, z0.h[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z18.h, z10.h, z0.h[2]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z19.h, z11.h, z0.h[2]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z16.h, z12.h, z0.h[3]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z0.h[3]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[3]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[3]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[4]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z17.h, z9.h, z0.h[4]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z18.h, z10.h, z0.h[4]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[4]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z16.h, z12.h, z0.h[5]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z0.h[5]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[5]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[5]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[6]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z17.h, z9.h, z0.h[6]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z18.h, z10.h, z0.h[6]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[6]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z16.h, z12.h, z0.h[7]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z0.h[7]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[7]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[7]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[0]\n"
+ "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
+ "fmla z17.h, z9.h, z4.h[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z18.h, z10.h, z4.h[0]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z19.h, z11.h, z4.h[0]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z16.h, z12.h, z4.h[1]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z4.h[1]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[1]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[1]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z17.h, z9.h, z4.h[2]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z18.h, z10.h, z4.h[2]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[2]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z16.h, z12.h, z4.h[3]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z4.h[3]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[3]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[3]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[4]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z17.h, z9.h, z4.h[4]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z18.h, z10.h, z4.h[4]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[4]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z16.h, z12.h, z4.h[5]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z4.h[5]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[5]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[5]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[6]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z17.h, z9.h, z4.h[6]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z18.h, z10.h, z4.h[6]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[6]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z16.h, z12.h, z4.h[7]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z4.h[7]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[7]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[7]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "cbz %[regs], 4f\n"
+ "fmla z16.h, z8.h, z0.h[0]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z17.h, z9.h, z0.h[0]\n"
+ "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
+ "fmla z18.h, z10.h, z0.h[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z19.h, z11.h, z0.h[0]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z16.h, z12.h, z0.h[1]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z17.h, z13.h, z0.h[1]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[1]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z16.h, z8.h, z0.h[2]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[2]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[2]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[2]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z17.h, z13.h, z0.h[3]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z18.h, z14.h, z0.h[3]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[3]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z16.h, z8.h, z0.h[4]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[4]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[4]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[4]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[5]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z17.h, z13.h, z0.h[5]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z18.h, z14.h, z0.h[5]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[5]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z16.h, z8.h, z0.h[6]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[6]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[6]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[6]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[7]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z17.h, z13.h, z0.h[7]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z18.h, z14.h, z0.h[7]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[7]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z16.h, z8.h, z4.h[0]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z4.h[0]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[0]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[0]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[1]\n"
+ "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
+ "fmla z17.h, z13.h, z4.h[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z18.h, z14.h, z4.h[1]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z19.h, z15.h, z4.h[1]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z16.h, z8.h, z4.h[2]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z4.h[2]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[2]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[2]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z17.h, z13.h, z4.h[3]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z18.h, z14.h, z4.h[3]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[3]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z16.h, z8.h, z4.h[4]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z4.h[4]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[4]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[4]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[5]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z17.h, z13.h, z4.h[5]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z18.h, z14.h, z4.h[5]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[5]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z16.h, z8.h, z4.h[6]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z4.h[6]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[6]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[6]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[7]\n"
+ "fmla z17.h, z13.h, z4.h[7]\n"
+ "fmla z18.h, z14.h, z4.h[7]\n"
+ "fmla z19.h, z15.h, z4.h[7]\n"
+ "cbz %[blocks], 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[0]\n"
+ "fmla z17.h, z9.h, z0.h[0]\n"
+ "fmla z18.h, z10.h, z0.h[0]\n"
+ "fmla z19.h, z11.h, z0.h[0]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[1]\n"
+ "fmla z17.h, z13.h, z0.h[1]\n"
+ "fmla z18.h, z14.h, z0.h[1]\n"
+ "fmla z19.h, z15.h, z0.h[1]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[2]\n"
+ "fmla z17.h, z9.h, z0.h[2]\n"
+ "fmla z18.h, z10.h, z0.h[2]\n"
+ "fmla z19.h, z11.h, z0.h[2]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[3]\n"
+ "fmla z17.h, z13.h, z0.h[3]\n"
+ "fmla z18.h, z14.h, z0.h[3]\n"
+ "fmla z19.h, z15.h, z0.h[3]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[4]\n"
+ "fmla z17.h, z9.h, z0.h[4]\n"
+ "fmla z18.h, z10.h, z0.h[4]\n"
+ "fmla z19.h, z11.h, z0.h[4]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[5]\n"
+ "fmla z17.h, z13.h, z0.h[5]\n"
+ "fmla z18.h, z14.h, z0.h[5]\n"
+ "fmla z19.h, z15.h, z0.h[5]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[6]\n"
+ "fmla z17.h, z9.h, z0.h[6]\n"
+ "fmla z18.h, z10.h, z0.h[6]\n"
+ "fmla z19.h, z11.h, z0.h[6]\n"
+ "b 5f\n"
+ "4:\n"
+ "fmla z16.h, z8.h, z0.h[0]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z17.h, z9.h, z0.h[0]\n"
+ "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
+ "fmla z18.h, z10.h, z0.h[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z19.h, z11.h, z0.h[0]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z16.h, z12.h, z0.h[1]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z17.h, z13.h, z0.h[1]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[1]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z16.h, z8.h, z0.h[2]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[2]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[2]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[2]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z17.h, z13.h, z0.h[3]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z18.h, z14.h, z0.h[3]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[3]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z16.h, z8.h, z0.h[4]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[4]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[4]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[4]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[5]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z17.h, z13.h, z0.h[5]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z18.h, z14.h, z0.h[5]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[5]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z16.h, z8.h, z0.h[6]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[6]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[6]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[6]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[7]\n"
+ "fmla z17.h, z13.h, z0.h[7]\n"
+ "fmla z18.h, z14.h, z0.h[7]\n"
+ "fmla z19.h, z15.h, z0.h[7]\n"
+ "cbz %[blocks], 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[0]\n"
+ "fmla z17.h, z9.h, z4.h[0]\n"
+ "fmla z18.h, z10.h, z4.h[0]\n"
+ "fmla z19.h, z11.h, z4.h[0]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[1]\n"
+ "fmla z17.h, z13.h, z4.h[1]\n"
+ "fmla z18.h, z14.h, z4.h[1]\n"
+ "fmla z19.h, z15.h, z4.h[1]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[2]\n"
+ "fmla z17.h, z9.h, z4.h[2]\n"
+ "fmla z18.h, z10.h, z4.h[2]\n"
+ "fmla z19.h, z11.h, z4.h[2]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[3]\n"
+ "fmla z17.h, z13.h, z4.h[3]\n"
+ "fmla z18.h, z14.h, z4.h[3]\n"
+ "fmla z19.h, z15.h, z4.h[3]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[4]\n"
+ "fmla z17.h, z9.h, z4.h[4]\n"
+ "fmla z18.h, z10.h, z4.h[4]\n"
+ "fmla z19.h, z11.h, z4.h[4]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[5]\n"
+ "fmla z17.h, z13.h, z4.h[5]\n"
+ "fmla z18.h, z14.h, z4.h[5]\n"
+ "fmla z19.h, z15.h, z4.h[5]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[6]\n"
+ "fmla z17.h, z9.h, z4.h[6]\n"
+ "fmla z18.h, z10.h, z4.h[6]\n"
+ "fmla z19.h, z11.h, z4.h[6]\n"
+ "5:\n"
+ "st1h z16.h, p0, [%[c_ptr0]]\n"
+ "st1h z17.h, p1, [%[c_ptr0], #1, MUL VL]\n"
+ "st1h z18.h, p2, [%[c_ptr0], #2, MUL VL]\n"
+ "st1h z19.h, p3, [%[c_ptr0], #3, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #4\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers), [ldb] "r" (ldbb)
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+ );
+ break;
+ case 2:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "c_ptr1 .req X1\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "whilelt p6.h, %[temp], %[leftovers]\n"
+ "whilelt p0.h, %[temp], %[width]\n"
+ "inch %[temp], all, mul #1\n"
+ "ptrue p7.h\n"
+ "whilelt p1.h, %[temp], %[width]\n"
+ "inch %[temp], all, mul #1\n"
+ "whilelt p2.h, %[temp], %[width]\n"
+ "inch %[temp], all, mul #1\n"
+ "whilelt p3.h, %[temp], %[width]\n"
+ "cbz %[beta0], 1f\n"
+ "mov z16.h, #0\n"
+ "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
+ "mov z17.h, #0\n"
+ "ld1rqh z1.h, p7/z, [a_ptr1]\n"
+ "mov z18.h, #0\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "mov z19.h, #0\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "mov z20.h, #0\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "mov z21.h, #0\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "mov z22.h, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "mov z23.h, #0\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1rh z15.h, p7/z, [%[betaptr]]\n"
+ "ld1h z16.h, p0/z, [%[c_ptr0]]\n"
+ "ld1h z17.h, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+ "ld1h z18.h, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+ "ld1h z19.h, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+ "fmul z16.h, p7/m, z16.h, z15.h\n"
+ "ld1h z20.h, p0/z, [c_ptr1]\n"
+ "fmul z17.h, p7/m, z17.h, z15.h\n"
+ "ld1h z21.h, p1/z, [c_ptr1, #1, MUL VL]\n"
+ "fmul z18.h, p7/m, z18.h, z15.h\n"
+ "ld1h z22.h, p2/z, [c_ptr1, #2, MUL VL]\n"
+ "fmul z19.h, p7/m, z19.h, z15.h\n"
+ "ld1h z23.h, p3/z, [c_ptr1, #3, MUL VL]\n"
+ "fmul z20.h, p7/m, z20.h, z15.h\n"
+ "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
+ "fmul z21.h, p7/m, z21.h, z15.h\n"
+ "ld1rqh z1.h, p7/z, [a_ptr1]\n"
+ "fmul z22.h, p7/m, z22.h, z15.h\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmul z23.h, p7/m, z23.h, z15.h\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ "fmla z16.h, z8.h, z0.h[0]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z20.h, z8.h, z1.h[0]\n"
+ "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[0]\n"
+ "ld1rqh z5.h, p7/z, [a_ptr1]\n"
+ "fmla z21.h, z9.h, z1.h[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "fmla z18.h, z10.h, z0.h[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z22.h, z10.h, z1.h[0]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z19.h, z11.h, z0.h[0]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z23.h, z11.h, z1.h[0]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[1]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z20.h, z12.h, z1.h[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z17.h, z13.h, z0.h[1]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z21.h, z13.h, z1.h[1]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[1]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "fmla z22.h, z14.h, z1.h[1]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[1]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ "fmla z23.h, z15.h, z1.h[1]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z8.h, z1.h[2]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[2]\n"
+ "fmla z21.h, z9.h, z1.h[2]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[2]\n"
+ "fmla z22.h, z10.h, z1.h[2]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[2]\n"
+ "fmla z23.h, z11.h, z1.h[2]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z1.h[3]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z0.h[3]\n"
+ "fmla z21.h, z13.h, z1.h[3]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[3]\n"
+ "fmla z22.h, z14.h, z1.h[3]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[3]\n"
+ "fmla z23.h, z15.h, z1.h[3]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[4]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z8.h, z1.h[4]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[4]\n"
+ "fmla z21.h, z9.h, z1.h[4]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[4]\n"
+ "fmla z22.h, z10.h, z1.h[4]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[4]\n"
+ "fmla z23.h, z11.h, z1.h[4]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[5]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z1.h[5]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z0.h[5]\n"
+ "fmla z21.h, z13.h, z1.h[5]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[5]\n"
+ "fmla z22.h, z14.h, z1.h[5]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[5]\n"
+ "fmla z23.h, z15.h, z1.h[5]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[6]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z8.h, z1.h[6]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[6]\n"
+ "fmla z21.h, z9.h, z1.h[6]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[6]\n"
+ "fmla z22.h, z10.h, z1.h[6]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[6]\n"
+ "fmla z23.h, z11.h, z1.h[6]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[7]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z1.h[7]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z0.h[7]\n"
+ "fmla z21.h, z13.h, z1.h[7]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[7]\n"
+ "fmla z22.h, z14.h, z1.h[7]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[7]\n"
+ "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
+ "fmla z23.h, z15.h, z1.h[7]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[0]\n"
+ "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n"
+ "fmla z20.h, z8.h, z5.h[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z17.h, z9.h, z4.h[0]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z21.h, z9.h, z5.h[0]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[0]\n"
+ "fmla z22.h, z10.h, z5.h[0]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[0]\n"
+ "fmla z23.h, z11.h, z5.h[0]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z5.h[1]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z4.h[1]\n"
+ "fmla z21.h, z13.h, z5.h[1]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[1]\n"
+ "fmla z22.h, z14.h, z5.h[1]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[1]\n"
+ "fmla z23.h, z15.h, z5.h[1]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z8.h, z5.h[2]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z4.h[2]\n"
+ "fmla z21.h, z9.h, z5.h[2]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[2]\n"
+ "fmla z22.h, z10.h, z5.h[2]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[2]\n"
+ "fmla z23.h, z11.h, z5.h[2]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z5.h[3]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z4.h[3]\n"
+ "fmla z21.h, z13.h, z5.h[3]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[3]\n"
+ "fmla z22.h, z14.h, z5.h[3]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[3]\n"
+ "fmla z23.h, z15.h, z5.h[3]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[4]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z8.h, z5.h[4]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z4.h[4]\n"
+ "fmla z21.h, z9.h, z5.h[4]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[4]\n"
+ "fmla z22.h, z10.h, z5.h[4]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[4]\n"
+ "fmla z23.h, z11.h, z5.h[4]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[5]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z5.h[5]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z4.h[5]\n"
+ "fmla z21.h, z13.h, z5.h[5]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[5]\n"
+ "fmla z22.h, z14.h, z5.h[5]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[5]\n"
+ "fmla z23.h, z15.h, z5.h[5]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[6]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z8.h, z5.h[6]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z4.h[6]\n"
+ "fmla z21.h, z9.h, z5.h[6]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[6]\n"
+ "fmla z22.h, z10.h, z5.h[6]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[6]\n"
+ "fmla z23.h, z11.h, z5.h[6]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[7]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z5.h[7]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z4.h[7]\n"
+ "fmla z21.h, z13.h, z5.h[7]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[7]\n"
+ "fmla z22.h, z14.h, z5.h[7]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[7]\n"
+ "fmla z23.h, z15.h, z5.h[7]\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "cbz %[regs], 4f\n"
+ "fmla z16.h, z8.h, z0.h[0]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z20.h, z8.h, z1.h[0]\n"
+ "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[0]\n"
+ "ld1rqh z5.h, p7/z, [a_ptr1]\n"
+ "fmla z21.h, z9.h, z1.h[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z18.h, z10.h, z0.h[0]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z22.h, z10.h, z1.h[0]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[0]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z23.h, z11.h, z1.h[0]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z1.h[1]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z0.h[1]\n"
+ "fmla z21.h, z13.h, z1.h[1]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[1]\n"
+ "fmla z22.h, z14.h, z1.h[1]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[1]\n"
+ "fmla z23.h, z15.h, z1.h[1]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z8.h, z1.h[2]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[2]\n"
+ "fmla z21.h, z9.h, z1.h[2]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[2]\n"
+ "fmla z22.h, z10.h, z1.h[2]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[2]\n"
+ "fmla z23.h, z11.h, z1.h[2]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z1.h[3]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z0.h[3]\n"
+ "fmla z21.h, z13.h, z1.h[3]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[3]\n"
+ "fmla z22.h, z14.h, z1.h[3]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[3]\n"
+ "fmla z23.h, z15.h, z1.h[3]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[4]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z8.h, z1.h[4]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[4]\n"
+ "fmla z21.h, z9.h, z1.h[4]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[4]\n"
+ "fmla z22.h, z10.h, z1.h[4]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[4]\n"
+ "fmla z23.h, z11.h, z1.h[4]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[5]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z1.h[5]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z0.h[5]\n"
+ "fmla z21.h, z13.h, z1.h[5]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[5]\n"
+ "fmla z22.h, z14.h, z1.h[5]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[5]\n"
+ "fmla z23.h, z15.h, z1.h[5]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[6]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z8.h, z1.h[6]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[6]\n"
+ "fmla z21.h, z9.h, z1.h[6]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[6]\n"
+ "fmla z22.h, z10.h, z1.h[6]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[6]\n"
+ "fmla z23.h, z11.h, z1.h[6]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[7]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z1.h[7]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z0.h[7]\n"
+ "fmla z21.h, z13.h, z1.h[7]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[7]\n"
+ "fmla z22.h, z14.h, z1.h[7]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[7]\n"
+ "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
+ "fmla z23.h, z15.h, z1.h[7]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[0]\n"
+ "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n"
+ "fmla z20.h, z8.h, z5.h[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z17.h, z9.h, z4.h[0]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z21.h, z9.h, z5.h[0]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[0]\n"
+ "fmla z22.h, z10.h, z5.h[0]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[0]\n"
+ "fmla z23.h, z11.h, z5.h[0]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z5.h[1]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z4.h[1]\n"
+ "fmla z21.h, z13.h, z5.h[1]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[1]\n"
+ "fmla z22.h, z14.h, z5.h[1]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[1]\n"
+ "fmla z23.h, z15.h, z5.h[1]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z8.h, z5.h[2]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z4.h[2]\n"
+ "fmla z21.h, z9.h, z5.h[2]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[2]\n"
+ "fmla z22.h, z10.h, z5.h[2]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[2]\n"
+ "fmla z23.h, z11.h, z5.h[2]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z5.h[3]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z4.h[3]\n"
+ "fmla z21.h, z13.h, z5.h[3]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[3]\n"
+ "fmla z22.h, z14.h, z5.h[3]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[3]\n"
+ "fmla z23.h, z15.h, z5.h[3]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[4]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z8.h, z5.h[4]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z4.h[4]\n"
+ "fmla z21.h, z9.h, z5.h[4]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[4]\n"
+ "fmla z22.h, z10.h, z5.h[4]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[4]\n"
+ "fmla z23.h, z11.h, z5.h[4]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[5]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z5.h[5]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z4.h[5]\n"
+ "fmla z21.h, z13.h, z5.h[5]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[5]\n"
+ "fmla z22.h, z14.h, z5.h[5]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[5]\n"
+ "fmla z23.h, z15.h, z5.h[5]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[6]\n"
+ "fmla z20.h, z8.h, z5.h[6]\n"
+ "fmla z17.h, z9.h, z4.h[6]\n"
+ "fmla z21.h, z9.h, z5.h[6]\n"
+ "fmla z18.h, z10.h, z4.h[6]\n"
+ "fmla z22.h, z10.h, z5.h[6]\n"
+ "fmla z19.h, z11.h, z4.h[6]\n"
+ "fmla z23.h, z11.h, z5.h[6]\n"
+ "fmla z16.h, z12.h, z4.h[7]\n"
+ "fmla z20.h, z12.h, z5.h[7]\n"
+ "fmla z17.h, z13.h, z4.h[7]\n"
+ "fmla z21.h, z13.h, z5.h[7]\n"
+ "fmla z18.h, z14.h, z4.h[7]\n"
+ "fmla z22.h, z14.h, z5.h[7]\n"
+ "fmla z19.h, z15.h, z4.h[7]\n"
+ "fmla z23.h, z15.h, z5.h[7]\n"
+ "cbz %[blocks], 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[0]\n"
+ "fmla z20.h, z8.h, z1.h[0]\n"
+ "fmla z17.h, z9.h, z0.h[0]\n"
+ "fmla z21.h, z9.h, z1.h[0]\n"
+ "fmla z18.h, z10.h, z0.h[0]\n"
+ "fmla z22.h, z10.h, z1.h[0]\n"
+ "fmla z19.h, z11.h, z0.h[0]\n"
+ "fmla z23.h, z11.h, z1.h[0]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[1]\n"
+ "fmla z20.h, z12.h, z1.h[1]\n"
+ "fmla z17.h, z13.h, z0.h[1]\n"
+ "fmla z21.h, z13.h, z1.h[1]\n"
+ "fmla z18.h, z14.h, z0.h[1]\n"
+ "fmla z22.h, z14.h, z1.h[1]\n"
+ "fmla z19.h, z15.h, z0.h[1]\n"
+ "fmla z23.h, z15.h, z1.h[1]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[2]\n"
+ "fmla z20.h, z8.h, z1.h[2]\n"
+ "fmla z17.h, z9.h, z0.h[2]\n"
+ "fmla z21.h, z9.h, z1.h[2]\n"
+ "fmla z18.h, z10.h, z0.h[2]\n"
+ "fmla z22.h, z10.h, z1.h[2]\n"
+ "fmla z19.h, z11.h, z0.h[2]\n"
+ "fmla z23.h, z11.h, z1.h[2]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[3]\n"
+ "fmla z20.h, z12.h, z1.h[3]\n"
+ "fmla z17.h, z13.h, z0.h[3]\n"
+ "fmla z21.h, z13.h, z1.h[3]\n"
+ "fmla z18.h, z14.h, z0.h[3]\n"
+ "fmla z22.h, z14.h, z1.h[3]\n"
+ "fmla z19.h, z15.h, z0.h[3]\n"
+ "fmla z23.h, z15.h, z1.h[3]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[4]\n"
+ "fmla z20.h, z8.h, z1.h[4]\n"
+ "fmla z17.h, z9.h, z0.h[4]\n"
+ "fmla z21.h, z9.h, z1.h[4]\n"
+ "fmla z18.h, z10.h, z0.h[4]\n"
+ "fmla z22.h, z10.h, z1.h[4]\n"
+ "fmla z19.h, z11.h, z0.h[4]\n"
+ "fmla z23.h, z11.h, z1.h[4]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[5]\n"
+ "fmla z20.h, z12.h, z1.h[5]\n"
+ "fmla z17.h, z13.h, z0.h[5]\n"
+ "fmla z21.h, z13.h, z1.h[5]\n"
+ "fmla z18.h, z14.h, z0.h[5]\n"
+ "fmla z22.h, z14.h, z1.h[5]\n"
+ "fmla z19.h, z15.h, z0.h[5]\n"
+ "fmla z23.h, z15.h, z1.h[5]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[6]\n"
+ "fmla z20.h, z8.h, z1.h[6]\n"
+ "fmla z17.h, z9.h, z0.h[6]\n"
+ "fmla z21.h, z9.h, z1.h[6]\n"
+ "fmla z18.h, z10.h, z0.h[6]\n"
+ "fmla z22.h, z10.h, z1.h[6]\n"
+ "fmla z19.h, z11.h, z0.h[6]\n"
+ "fmla z23.h, z11.h, z1.h[6]\n"
+ "b 5f\n"
+ "4:\n"
+ "fmla z16.h, z8.h, z0.h[0]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z20.h, z8.h, z1.h[0]\n"
+ "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[0]\n"
+ "ld1rqh z5.h, p6/z, [a_ptr1]\n"
+ "fmla z21.h, z9.h, z1.h[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z18.h, z10.h, z0.h[0]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z22.h, z10.h, z1.h[0]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[0]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z23.h, z11.h, z1.h[0]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z1.h[1]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z0.h[1]\n"
+ "fmla z21.h, z13.h, z1.h[1]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[1]\n"
+ "fmla z22.h, z14.h, z1.h[1]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[1]\n"
+ "fmla z23.h, z15.h, z1.h[1]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z8.h, z1.h[2]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[2]\n"
+ "fmla z21.h, z9.h, z1.h[2]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[2]\n"
+ "fmla z22.h, z10.h, z1.h[2]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[2]\n"
+ "fmla z23.h, z11.h, z1.h[2]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z1.h[3]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z0.h[3]\n"
+ "fmla z21.h, z13.h, z1.h[3]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[3]\n"
+ "fmla z22.h, z14.h, z1.h[3]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[3]\n"
+ "fmla z23.h, z15.h, z1.h[3]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[4]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z8.h, z1.h[4]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[4]\n"
+ "fmla z21.h, z9.h, z1.h[4]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[4]\n"
+ "fmla z22.h, z10.h, z1.h[4]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[4]\n"
+ "fmla z23.h, z11.h, z1.h[4]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[5]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z1.h[5]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z0.h[5]\n"
+ "fmla z21.h, z13.h, z1.h[5]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[5]\n"
+ "fmla z22.h, z14.h, z1.h[5]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[5]\n"
+ "fmla z23.h, z15.h, z1.h[5]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[6]\n"
+ "fmla z20.h, z8.h, z1.h[6]\n"
+ "fmla z17.h, z9.h, z0.h[6]\n"
+ "fmla z21.h, z9.h, z1.h[6]\n"
+ "fmla z18.h, z10.h, z0.h[6]\n"
+ "fmla z22.h, z10.h, z1.h[6]\n"
+ "fmla z19.h, z11.h, z0.h[6]\n"
+ "fmla z23.h, z11.h, z1.h[6]\n"
+ "fmla z16.h, z12.h, z0.h[7]\n"
+ "fmla z20.h, z12.h, z1.h[7]\n"
+ "fmla z17.h, z13.h, z0.h[7]\n"
+ "fmla z21.h, z13.h, z1.h[7]\n"
+ "fmla z18.h, z14.h, z0.h[7]\n"
+ "fmla z22.h, z14.h, z1.h[7]\n"
+ "fmla z19.h, z15.h, z0.h[7]\n"
+ "fmla z23.h, z15.h, z1.h[7]\n"
+ "cbz %[blocks], 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[0]\n"
+ "fmla z20.h, z8.h, z5.h[0]\n"
+ "fmla z17.h, z9.h, z4.h[0]\n"
+ "fmla z21.h, z9.h, z5.h[0]\n"
+ "fmla z18.h, z10.h, z4.h[0]\n"
+ "fmla z22.h, z10.h, z5.h[0]\n"
+ "fmla z19.h, z11.h, z4.h[0]\n"
+ "fmla z23.h, z11.h, z5.h[0]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[1]\n"
+ "fmla z20.h, z12.h, z5.h[1]\n"
+ "fmla z17.h, z13.h, z4.h[1]\n"
+ "fmla z21.h, z13.h, z5.h[1]\n"
+ "fmla z18.h, z14.h, z4.h[1]\n"
+ "fmla z22.h, z14.h, z5.h[1]\n"
+ "fmla z19.h, z15.h, z4.h[1]\n"
+ "fmla z23.h, z15.h, z5.h[1]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[2]\n"
+ "fmla z20.h, z8.h, z5.h[2]\n"
+ "fmla z17.h, z9.h, z4.h[2]\n"
+ "fmla z21.h, z9.h, z5.h[2]\n"
+ "fmla z18.h, z10.h, z4.h[2]\n"
+ "fmla z22.h, z10.h, z5.h[2]\n"
+ "fmla z19.h, z11.h, z4.h[2]\n"
+ "fmla z23.h, z11.h, z5.h[2]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[3]\n"
+ "fmla z20.h, z12.h, z5.h[3]\n"
+ "fmla z17.h, z13.h, z4.h[3]\n"
+ "fmla z21.h, z13.h, z5.h[3]\n"
+ "fmla z18.h, z14.h, z4.h[3]\n"
+ "fmla z22.h, z14.h, z5.h[3]\n"
+ "fmla z19.h, z15.h, z4.h[3]\n"
+ "fmla z23.h, z15.h, z5.h[3]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[4]\n"
+ "fmla z20.h, z8.h, z5.h[4]\n"
+ "fmla z17.h, z9.h, z4.h[4]\n"
+ "fmla z21.h, z9.h, z5.h[4]\n"
+ "fmla z18.h, z10.h, z4.h[4]\n"
+ "fmla z22.h, z10.h, z5.h[4]\n"
+ "fmla z19.h, z11.h, z4.h[4]\n"
+ "fmla z23.h, z11.h, z5.h[4]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[5]\n"
+ "fmla z20.h, z12.h, z5.h[5]\n"
+ "fmla z17.h, z13.h, z4.h[5]\n"
+ "fmla z21.h, z13.h, z5.h[5]\n"
+ "fmla z18.h, z14.h, z4.h[5]\n"
+ "fmla z22.h, z14.h, z5.h[5]\n"
+ "fmla z19.h, z15.h, z4.h[5]\n"
+ "fmla z23.h, z15.h, z5.h[5]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[6]\n"
+ "fmla z20.h, z8.h, z5.h[6]\n"
+ "fmla z17.h, z9.h, z4.h[6]\n"
+ "fmla z21.h, z9.h, z5.h[6]\n"
+ "fmla z18.h, z10.h, z4.h[6]\n"
+ "fmla z22.h, z10.h, z5.h[6]\n"
+ "fmla z19.h, z11.h, z4.h[6]\n"
+ "fmla z23.h, z11.h, z5.h[6]\n"
+ "5:\n"
+ "st1h z16.h, p0, [%[c_ptr0]]\n"
+ "st1h z17.h, p1, [%[c_ptr0], #1, MUL VL]\n"
+ "st1h z18.h, p2, [%[c_ptr0], #2, MUL VL]\n"
+ "st1h z19.h, p3, [%[c_ptr0], #3, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #4\n"
+ "st1h z20.h, p0, [c_ptr1]\n"
+ "st1h z21.h, p1, [c_ptr1, #1, MUL VL]\n"
+ "st1h z22.h, p2, [c_ptr1, #2, MUL VL]\n"
+ "st1h z23.h, p3, [c_ptr1, #3, MUL VL]\n"
+ ".unreq a_ptr1\n"
+ ".unreq c_ptr1\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers), [ldb] "r" (ldbb)
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory"
+ );
+ break;
+ case 3:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "c_ptr1 .req X2\n"
+ "c_ptr2 .req X3\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "whilelt p6.h, %[temp], %[leftovers]\n"
+ "whilelt p0.h, %[temp], %[width]\n"
+ "inch %[temp], all, mul #1\n"
+ "ptrue p7.h\n"
+ "whilelt p1.h, %[temp], %[width]\n"
+ "inch %[temp], all, mul #1\n"
+ "whilelt p2.h, %[temp], %[width]\n"
+ "inch %[temp], all, mul #1\n"
+ "whilelt p3.h, %[temp], %[width]\n"
+ "cbz %[beta0], 1f\n"
+ "mov z16.h, #0\n"
+ "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
+ "mov z17.h, #0\n"
+ "ld1rqh z1.h, p7/z, [a_ptr1]\n"
+ "mov z18.h, #0\n"
+ "ld1rqh z2.h, p7/z, [a_ptr2]\n"
+ "mov z19.h, #0\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "mov z20.h, #0\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "mov z21.h, #0\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "mov z22.h, #0\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "mov z23.h, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "mov z24.h, #0\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "mov z25.h, #0\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "mov z26.h, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z27.h, #0\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1rh z15.h, p7/z, [%[betaptr]]\n"
+ "ld1h z16.h, p0/z, [%[c_ptr0]]\n"
+ "ld1h z17.h, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+ "ld1h z18.h, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+ "ld1h z19.h, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+ "fmul z16.h, p7/m, z16.h, z15.h\n"
+ "ld1h z20.h, p0/z, [c_ptr1]\n"
+ "fmul z17.h, p7/m, z17.h, z15.h\n"
+ "ld1h z21.h, p1/z, [c_ptr1, #1, MUL VL]\n"
+ "fmul z18.h, p7/m, z18.h, z15.h\n"
+ "ld1h z22.h, p2/z, [c_ptr1, #2, MUL VL]\n"
+ "fmul z19.h, p7/m, z19.h, z15.h\n"
+ "ld1h z23.h, p3/z, [c_ptr1, #3, MUL VL]\n"
+ "fmul z20.h, p7/m, z20.h, z15.h\n"
+ "ld1h z24.h, p0/z, [c_ptr2]\n"
+ "fmul z21.h, p7/m, z21.h, z15.h\n"
+ "ld1h z25.h, p1/z, [c_ptr2, #1, MUL VL]\n"
+ "fmul z22.h, p7/m, z22.h, z15.h\n"
+ "ld1h z26.h, p2/z, [c_ptr2, #2, MUL VL]\n"
+ "fmul z23.h, p7/m, z23.h, z15.h\n"
+ "ld1h z27.h, p3/z, [c_ptr2, #3, MUL VL]\n"
+ "fmul z24.h, p7/m, z24.h, z15.h\n"
+ "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
+ "fmul z25.h, p7/m, z25.h, z15.h\n"
+ "ld1rqh z1.h, p7/z, [a_ptr1]\n"
+ "fmul z26.h, p7/m, z26.h, z15.h\n"
+ "ld1rqh z2.h, p7/z, [a_ptr2]\n"
+ "fmul z27.h, p7/m, z27.h, z15.h\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ "fmla z16.h, z8.h, z0.h[0]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z20.h, z8.h, z1.h[0]\n"
+ "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
+ "fmla z24.h, z8.h, z2.h[0]\n"
+ "ld1rqh z5.h, p7/z, [a_ptr1]\n"
+ "fmla z17.h, z9.h, z0.h[0]\n"
+ "ld1rqh z6.h, p7/z, [a_ptr2]\n"
+ "fmla z21.h, z9.h, z1.h[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "fmla z25.h, z9.h, z2.h[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z18.h, z10.h, z0.h[0]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z22.h, z10.h, z1.h[0]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z26.h, z10.h, z2.h[0]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "fmla z23.h, z11.h, z1.h[0]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ "fmla z27.h, z11.h, z2.h[0]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z1.h[1]\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
+ "fmla z24.h, z12.h, z2.h[1]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z0.h[1]\n"
+ "fmla z21.h, z13.h, z1.h[1]\n"
+ "fmla z25.h, z13.h, z2.h[1]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[1]\n"
+ "fmla z22.h, z14.h, z1.h[1]\n"
+ "fmla z26.h, z14.h, z2.h[1]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[1]\n"
+ "fmla z23.h, z15.h, z1.h[1]\n"
+ "fmla z27.h, z15.h, z2.h[1]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z8.h, z1.h[2]\n"
+ "fmla z24.h, z8.h, z2.h[2]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[2]\n"
+ "fmla z21.h, z9.h, z1.h[2]\n"
+ "fmla z25.h, z9.h, z2.h[2]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[2]\n"
+ "fmla z22.h, z10.h, z1.h[2]\n"
+ "fmla z26.h, z10.h, z2.h[2]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[2]\n"
+ "fmla z23.h, z11.h, z1.h[2]\n"
+ "fmla z27.h, z11.h, z2.h[2]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z1.h[3]\n"
+ "fmla z24.h, z12.h, z2.h[3]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z0.h[3]\n"
+ "fmla z21.h, z13.h, z1.h[3]\n"
+ "fmla z25.h, z13.h, z2.h[3]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[3]\n"
+ "fmla z22.h, z14.h, z1.h[3]\n"
+ "fmla z26.h, z14.h, z2.h[3]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[3]\n"
+ "fmla z23.h, z15.h, z1.h[3]\n"
+ "fmla z27.h, z15.h, z2.h[3]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[4]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z8.h, z1.h[4]\n"
+ "fmla z24.h, z8.h, z2.h[4]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[4]\n"
+ "fmla z21.h, z9.h, z1.h[4]\n"
+ "fmla z25.h, z9.h, z2.h[4]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[4]\n"
+ "fmla z22.h, z10.h, z1.h[4]\n"
+ "fmla z26.h, z10.h, z2.h[4]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[4]\n"
+ "fmla z23.h, z11.h, z1.h[4]\n"
+ "fmla z27.h, z11.h, z2.h[4]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[5]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z1.h[5]\n"
+ "fmla z24.h, z12.h, z2.h[5]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z0.h[5]\n"
+ "fmla z21.h, z13.h, z1.h[5]\n"
+ "fmla z25.h, z13.h, z2.h[5]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[5]\n"
+ "fmla z22.h, z14.h, z1.h[5]\n"
+ "fmla z26.h, z14.h, z2.h[5]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[5]\n"
+ "fmla z23.h, z15.h, z1.h[5]\n"
+ "fmla z27.h, z15.h, z2.h[5]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[6]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z8.h, z1.h[6]\n"
+ "fmla z24.h, z8.h, z2.h[6]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[6]\n"
+ "fmla z21.h, z9.h, z1.h[6]\n"
+ "fmla z25.h, z9.h, z2.h[6]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[6]\n"
+ "fmla z22.h, z10.h, z1.h[6]\n"
+ "fmla z26.h, z10.h, z2.h[6]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[6]\n"
+ "fmla z23.h, z11.h, z1.h[6]\n"
+ "fmla z27.h, z11.h, z2.h[6]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[7]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z1.h[7]\n"
+ "fmla z24.h, z12.h, z2.h[7]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z0.h[7]\n"
+ "fmla z21.h, z13.h, z1.h[7]\n"
+ "fmla z25.h, z13.h, z2.h[7]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[7]\n"
+ "fmla z22.h, z14.h, z1.h[7]\n"
+ "fmla z26.h, z14.h, z2.h[7]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[7]\n"
+ "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
+ "fmla z23.h, z15.h, z1.h[7]\n"
+ "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n"
+ "fmla z27.h, z15.h, z2.h[7]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[0]\n"
+ "ld1rqh z2.h, p7/z, [a_ptr2, #-0x10]\n"
+ "fmla z20.h, z8.h, z5.h[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z24.h, z8.h, z6.h[0]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z4.h[0]\n"
+ "fmla z21.h, z9.h, z5.h[0]\n"
+ "fmla z25.h, z9.h, z6.h[0]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[0]\n"
+ "fmla z22.h, z10.h, z5.h[0]\n"
+ "fmla z26.h, z10.h, z6.h[0]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[0]\n"
+ "fmla z23.h, z11.h, z5.h[0]\n"
+ "fmla z27.h, z11.h, z6.h[0]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z5.h[1]\n"
+ "fmla z24.h, z12.h, z6.h[1]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z4.h[1]\n"
+ "fmla z21.h, z13.h, z5.h[1]\n"
+ "fmla z25.h, z13.h, z6.h[1]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[1]\n"
+ "fmla z22.h, z14.h, z5.h[1]\n"
+ "fmla z26.h, z14.h, z6.h[1]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[1]\n"
+ "fmla z23.h, z15.h, z5.h[1]\n"
+ "fmla z27.h, z15.h, z6.h[1]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z8.h, z5.h[2]\n"
+ "fmla z24.h, z8.h, z6.h[2]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z4.h[2]\n"
+ "fmla z21.h, z9.h, z5.h[2]\n"
+ "fmla z25.h, z9.h, z6.h[2]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[2]\n"
+ "fmla z22.h, z10.h, z5.h[2]\n"
+ "fmla z26.h, z10.h, z6.h[2]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[2]\n"
+ "fmla z23.h, z11.h, z5.h[2]\n"
+ "fmla z27.h, z11.h, z6.h[2]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z5.h[3]\n"
+ "fmla z24.h, z12.h, z6.h[3]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z4.h[3]\n"
+ "fmla z21.h, z13.h, z5.h[3]\n"
+ "fmla z25.h, z13.h, z6.h[3]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[3]\n"
+ "fmla z22.h, z14.h, z5.h[3]\n"
+ "fmla z26.h, z14.h, z6.h[3]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[3]\n"
+ "fmla z23.h, z15.h, z5.h[3]\n"
+ "fmla z27.h, z15.h, z6.h[3]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[4]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z8.h, z5.h[4]\n"
+ "fmla z24.h, z8.h, z6.h[4]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z4.h[4]\n"
+ "fmla z21.h, z9.h, z5.h[4]\n"
+ "fmla z25.h, z9.h, z6.h[4]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[4]\n"
+ "fmla z22.h, z10.h, z5.h[4]\n"
+ "fmla z26.h, z10.h, z6.h[4]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[4]\n"
+ "fmla z23.h, z11.h, z5.h[4]\n"
+ "fmla z27.h, z11.h, z6.h[4]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[5]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z5.h[5]\n"
+ "fmla z24.h, z12.h, z6.h[5]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z4.h[5]\n"
+ "fmla z21.h, z13.h, z5.h[5]\n"
+ "fmla z25.h, z13.h, z6.h[5]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[5]\n"
+ "fmla z22.h, z14.h, z5.h[5]\n"
+ "fmla z26.h, z14.h, z6.h[5]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[5]\n"
+ "fmla z23.h, z15.h, z5.h[5]\n"
+ "fmla z27.h, z15.h, z6.h[5]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[6]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z8.h, z5.h[6]\n"
+ "fmla z24.h, z8.h, z6.h[6]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z4.h[6]\n"
+ "fmla z21.h, z9.h, z5.h[6]\n"
+ "fmla z25.h, z9.h, z6.h[6]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[6]\n"
+ "fmla z22.h, z10.h, z5.h[6]\n"
+ "fmla z26.h, z10.h, z6.h[6]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[6]\n"
+ "fmla z23.h, z11.h, z5.h[6]\n"
+ "fmla z27.h, z11.h, z6.h[6]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[7]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z5.h[7]\n"
+ "fmla z24.h, z12.h, z6.h[7]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z4.h[7]\n"
+ "fmla z21.h, z13.h, z5.h[7]\n"
+ "fmla z25.h, z13.h, z6.h[7]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[7]\n"
+ "fmla z22.h, z14.h, z5.h[7]\n"
+ "fmla z26.h, z14.h, z6.h[7]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[7]\n"
+ "fmla z23.h, z15.h, z5.h[7]\n"
+ "fmla z27.h, z15.h, z6.h[7]\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "cbz %[regs], 4f\n"
+ "fmla z16.h, z8.h, z0.h[0]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z20.h, z8.h, z1.h[0]\n"
+ "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
+ "fmla z24.h, z8.h, z2.h[0]\n"
+ "ld1rqh z5.h, p7/z, [a_ptr1]\n"
+ "fmla z17.h, z9.h, z0.h[0]\n"
+ "ld1rqh z6.h, p7/z, [a_ptr2]\n"
+ "fmla z21.h, z9.h, z1.h[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z25.h, z9.h, z2.h[0]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z18.h, z10.h, z0.h[0]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z22.h, z10.h, z1.h[0]\n"
+ "fmla z26.h, z10.h, z2.h[0]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[0]\n"
+ "fmla z23.h, z11.h, z1.h[0]\n"
+ "fmla z27.h, z11.h, z2.h[0]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z1.h[1]\n"
+ "fmla z24.h, z12.h, z2.h[1]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z0.h[1]\n"
+ "fmla z21.h, z13.h, z1.h[1]\n"
+ "fmla z25.h, z13.h, z2.h[1]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[1]\n"
+ "fmla z22.h, z14.h, z1.h[1]\n"
+ "fmla z26.h, z14.h, z2.h[1]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[1]\n"
+ "fmla z23.h, z15.h, z1.h[1]\n"
+ "fmla z27.h, z15.h, z2.h[1]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z8.h, z1.h[2]\n"
+ "fmla z24.h, z8.h, z2.h[2]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[2]\n"
+ "fmla z21.h, z9.h, z1.h[2]\n"
+ "fmla z25.h, z9.h, z2.h[2]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[2]\n"
+ "fmla z22.h, z10.h, z1.h[2]\n"
+ "fmla z26.h, z10.h, z2.h[2]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[2]\n"
+ "fmla z23.h, z11.h, z1.h[2]\n"
+ "fmla z27.h, z11.h, z2.h[2]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z1.h[3]\n"
+ "fmla z24.h, z12.h, z2.h[3]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z0.h[3]\n"
+ "fmla z21.h, z13.h, z1.h[3]\n"
+ "fmla z25.h, z13.h, z2.h[3]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[3]\n"
+ "fmla z22.h, z14.h, z1.h[3]\n"
+ "fmla z26.h, z14.h, z2.h[3]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[3]\n"
+ "fmla z23.h, z15.h, z1.h[3]\n"
+ "fmla z27.h, z15.h, z2.h[3]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[4]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z8.h, z1.h[4]\n"
+ "fmla z24.h, z8.h, z2.h[4]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[4]\n"
+ "fmla z21.h, z9.h, z1.h[4]\n"
+ "fmla z25.h, z9.h, z2.h[4]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[4]\n"
+ "fmla z22.h, z10.h, z1.h[4]\n"
+ "fmla z26.h, z10.h, z2.h[4]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[4]\n"
+ "fmla z23.h, z11.h, z1.h[4]\n"
+ "fmla z27.h, z11.h, z2.h[4]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[5]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z1.h[5]\n"
+ "fmla z24.h, z12.h, z2.h[5]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z0.h[5]\n"
+ "fmla z21.h, z13.h, z1.h[5]\n"
+ "fmla z25.h, z13.h, z2.h[5]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[5]\n"
+ "fmla z22.h, z14.h, z1.h[5]\n"
+ "fmla z26.h, z14.h, z2.h[5]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[5]\n"
+ "fmla z23.h, z15.h, z1.h[5]\n"
+ "fmla z27.h, z15.h, z2.h[5]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[6]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z8.h, z1.h[6]\n"
+ "fmla z24.h, z8.h, z2.h[6]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[6]\n"
+ "fmla z21.h, z9.h, z1.h[6]\n"
+ "fmla z25.h, z9.h, z2.h[6]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[6]\n"
+ "fmla z22.h, z10.h, z1.h[6]\n"
+ "fmla z26.h, z10.h, z2.h[6]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[6]\n"
+ "fmla z23.h, z11.h, z1.h[6]\n"
+ "fmla z27.h, z11.h, z2.h[6]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[7]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z1.h[7]\n"
+ "fmla z24.h, z12.h, z2.h[7]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z0.h[7]\n"
+ "fmla z21.h, z13.h, z1.h[7]\n"
+ "fmla z25.h, z13.h, z2.h[7]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[7]\n"
+ "fmla z22.h, z14.h, z1.h[7]\n"
+ "fmla z26.h, z14.h, z2.h[7]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[7]\n"
+ "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
+ "fmla z23.h, z15.h, z1.h[7]\n"
+ "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n"
+ "fmla z27.h, z15.h, z2.h[7]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[0]\n"
+ "ld1rqh z2.h, p6/z, [a_ptr2, #0x10]\n"
+ "fmla z20.h, z8.h, z5.h[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z24.h, z8.h, z6.h[0]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z4.h[0]\n"
+ "fmla z21.h, z9.h, z5.h[0]\n"
+ "fmla z25.h, z9.h, z6.h[0]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[0]\n"
+ "fmla z22.h, z10.h, z5.h[0]\n"
+ "fmla z26.h, z10.h, z6.h[0]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[0]\n"
+ "fmla z23.h, z11.h, z5.h[0]\n"
+ "fmla z27.h, z11.h, z6.h[0]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z5.h[1]\n"
+ "fmla z24.h, z12.h, z6.h[1]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z4.h[1]\n"
+ "fmla z21.h, z13.h, z5.h[1]\n"
+ "fmla z25.h, z13.h, z6.h[1]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[1]\n"
+ "fmla z22.h, z14.h, z5.h[1]\n"
+ "fmla z26.h, z14.h, z6.h[1]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[1]\n"
+ "fmla z23.h, z15.h, z5.h[1]\n"
+ "fmla z27.h, z15.h, z6.h[1]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z8.h, z5.h[2]\n"
+ "fmla z24.h, z8.h, z6.h[2]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z4.h[2]\n"
+ "fmla z21.h, z9.h, z5.h[2]\n"
+ "fmla z25.h, z9.h, z6.h[2]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[2]\n"
+ "fmla z22.h, z10.h, z5.h[2]\n"
+ "fmla z26.h, z10.h, z6.h[2]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[2]\n"
+ "fmla z23.h, z11.h, z5.h[2]\n"
+ "fmla z27.h, z11.h, z6.h[2]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z5.h[3]\n"
+ "fmla z24.h, z12.h, z6.h[3]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z4.h[3]\n"
+ "fmla z21.h, z13.h, z5.h[3]\n"
+ "fmla z25.h, z13.h, z6.h[3]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[3]\n"
+ "fmla z22.h, z14.h, z5.h[3]\n"
+ "fmla z26.h, z14.h, z6.h[3]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[3]\n"
+ "fmla z23.h, z15.h, z5.h[3]\n"
+ "fmla z27.h, z15.h, z6.h[3]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[4]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z8.h, z5.h[4]\n"
+ "fmla z24.h, z8.h, z6.h[4]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z4.h[4]\n"
+ "fmla z21.h, z9.h, z5.h[4]\n"
+ "fmla z25.h, z9.h, z6.h[4]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[4]\n"
+ "fmla z22.h, z10.h, z5.h[4]\n"
+ "fmla z26.h, z10.h, z6.h[4]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[4]\n"
+ "fmla z23.h, z11.h, z5.h[4]\n"
+ "fmla z27.h, z11.h, z6.h[4]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[5]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z5.h[5]\n"
+ "fmla z24.h, z12.h, z6.h[5]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z4.h[5]\n"
+ "fmla z21.h, z13.h, z5.h[5]\n"
+ "fmla z25.h, z13.h, z6.h[5]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[5]\n"
+ "fmla z22.h, z14.h, z5.h[5]\n"
+ "fmla z26.h, z14.h, z6.h[5]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[5]\n"
+ "fmla z23.h, z15.h, z5.h[5]\n"
+ "fmla z27.h, z15.h, z6.h[5]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[6]\n"
+ "fmla z20.h, z8.h, z5.h[6]\n"
+ "fmla z24.h, z8.h, z6.h[6]\n"
+ "fmla z17.h, z9.h, z4.h[6]\n"
+ "fmla z21.h, z9.h, z5.h[6]\n"
+ "fmla z25.h, z9.h, z6.h[6]\n"
+ "fmla z18.h, z10.h, z4.h[6]\n"
+ "fmla z22.h, z10.h, z5.h[6]\n"
+ "fmla z26.h, z10.h, z6.h[6]\n"
+ "fmla z19.h, z11.h, z4.h[6]\n"
+ "fmla z23.h, z11.h, z5.h[6]\n"
+ "fmla z27.h, z11.h, z6.h[6]\n"
+ "fmla z16.h, z12.h, z4.h[7]\n"
+ "fmla z20.h, z12.h, z5.h[7]\n"
+ "fmla z24.h, z12.h, z6.h[7]\n"
+ "fmla z17.h, z13.h, z4.h[7]\n"
+ "fmla z21.h, z13.h, z5.h[7]\n"
+ "fmla z25.h, z13.h, z6.h[7]\n"
+ "fmla z18.h, z14.h, z4.h[7]\n"
+ "fmla z22.h, z14.h, z5.h[7]\n"
+ "fmla z26.h, z14.h, z6.h[7]\n"
+ "fmla z19.h, z15.h, z4.h[7]\n"
+ "fmla z23.h, z15.h, z5.h[7]\n"
+ "fmla z27.h, z15.h, z6.h[7]\n"
+ "cbz %[blocks], 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[0]\n"
+ "fmla z20.h, z8.h, z1.h[0]\n"
+ "fmla z24.h, z8.h, z2.h[0]\n"
+ "fmla z17.h, z9.h, z0.h[0]\n"
+ "fmla z21.h, z9.h, z1.h[0]\n"
+ "fmla z25.h, z9.h, z2.h[0]\n"
+ "fmla z18.h, z10.h, z0.h[0]\n"
+ "fmla z22.h, z10.h, z1.h[0]\n"
+ "fmla z26.h, z10.h, z2.h[0]\n"
+ "fmla z19.h, z11.h, z0.h[0]\n"
+ "fmla z23.h, z11.h, z1.h[0]\n"
+ "fmla z27.h, z11.h, z2.h[0]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[1]\n"
+ "fmla z20.h, z12.h, z1.h[1]\n"
+ "fmla z24.h, z12.h, z2.h[1]\n"
+ "fmla z17.h, z13.h, z0.h[1]\n"
+ "fmla z21.h, z13.h, z1.h[1]\n"
+ "fmla z25.h, z13.h, z2.h[1]\n"
+ "fmla z18.h, z14.h, z0.h[1]\n"
+ "fmla z22.h, z14.h, z1.h[1]\n"
+ "fmla z26.h, z14.h, z2.h[1]\n"
+ "fmla z19.h, z15.h, z0.h[1]\n"
+ "fmla z23.h, z15.h, z1.h[1]\n"
+ "fmla z27.h, z15.h, z2.h[1]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[2]\n"
+ "fmla z20.h, z8.h, z1.h[2]\n"
+ "fmla z24.h, z8.h, z2.h[2]\n"
+ "fmla z17.h, z9.h, z0.h[2]\n"
+ "fmla z21.h, z9.h, z1.h[2]\n"
+ "fmla z25.h, z9.h, z2.h[2]\n"
+ "fmla z18.h, z10.h, z0.h[2]\n"
+ "fmla z22.h, z10.h, z1.h[2]\n"
+ "fmla z26.h, z10.h, z2.h[2]\n"
+ "fmla z19.h, z11.h, z0.h[2]\n"
+ "fmla z23.h, z11.h, z1.h[2]\n"
+ "fmla z27.h, z11.h, z2.h[2]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[3]\n"
+ "fmla z20.h, z12.h, z1.h[3]\n"
+ "fmla z24.h, z12.h, z2.h[3]\n"
+ "fmla z17.h, z13.h, z0.h[3]\n"
+ "fmla z21.h, z13.h, z1.h[3]\n"
+ "fmla z25.h, z13.h, z2.h[3]\n"
+ "fmla z18.h, z14.h, z0.h[3]\n"
+ "fmla z22.h, z14.h, z1.h[3]\n"
+ "fmla z26.h, z14.h, z2.h[3]\n"
+ "fmla z19.h, z15.h, z0.h[3]\n"
+ "fmla z23.h, z15.h, z1.h[3]\n"
+ "fmla z27.h, z15.h, z2.h[3]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[4]\n"
+ "fmla z20.h, z8.h, z1.h[4]\n"
+ "fmla z24.h, z8.h, z2.h[4]\n"
+ "fmla z17.h, z9.h, z0.h[4]\n"
+ "fmla z21.h, z9.h, z1.h[4]\n"
+ "fmla z25.h, z9.h, z2.h[4]\n"
+ "fmla z18.h, z10.h, z0.h[4]\n"
+ "fmla z22.h, z10.h, z1.h[4]\n"
+ "fmla z26.h, z10.h, z2.h[4]\n"
+ "fmla z19.h, z11.h, z0.h[4]\n"
+ "fmla z23.h, z11.h, z1.h[4]\n"
+ "fmla z27.h, z11.h, z2.h[4]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[5]\n"
+ "fmla z20.h, z12.h, z1.h[5]\n"
+ "fmla z24.h, z12.h, z2.h[5]\n"
+ "fmla z17.h, z13.h, z0.h[5]\n"
+ "fmla z21.h, z13.h, z1.h[5]\n"
+ "fmla z25.h, z13.h, z2.h[5]\n"
+ "fmla z18.h, z14.h, z0.h[5]\n"
+ "fmla z22.h, z14.h, z1.h[5]\n"
+ "fmla z26.h, z14.h, z2.h[5]\n"
+ "fmla z19.h, z15.h, z0.h[5]\n"
+ "fmla z23.h, z15.h, z1.h[5]\n"
+ "fmla z27.h, z15.h, z2.h[5]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[6]\n"
+ "fmla z20.h, z8.h, z1.h[6]\n"
+ "fmla z24.h, z8.h, z2.h[6]\n"
+ "fmla z17.h, z9.h, z0.h[6]\n"
+ "fmla z21.h, z9.h, z1.h[6]\n"
+ "fmla z25.h, z9.h, z2.h[6]\n"
+ "fmla z18.h, z10.h, z0.h[6]\n"
+ "fmla z22.h, z10.h, z1.h[6]\n"
+ "fmla z26.h, z10.h, z2.h[6]\n"
+ "fmla z19.h, z11.h, z0.h[6]\n"
+ "fmla z23.h, z11.h, z1.h[6]\n"
+ "fmla z27.h, z11.h, z2.h[6]\n"
+ "b 5f\n"
+ "4:\n"
+ "fmla z16.h, z8.h, z0.h[0]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z20.h, z8.h, z1.h[0]\n"
+ "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
+ "fmla z24.h, z8.h, z2.h[0]\n"
+ "ld1rqh z5.h, p6/z, [a_ptr1]\n"
+ "fmla z17.h, z9.h, z0.h[0]\n"
+ "ld1rqh z6.h, p6/z, [a_ptr2]\n"
+ "fmla z21.h, z9.h, z1.h[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z25.h, z9.h, z2.h[0]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z18.h, z10.h, z0.h[0]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z22.h, z10.h, z1.h[0]\n"
+ "fmla z26.h, z10.h, z2.h[0]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[0]\n"
+ "fmla z23.h, z11.h, z1.h[0]\n"
+ "fmla z27.h, z11.h, z2.h[0]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z1.h[1]\n"
+ "fmla z24.h, z12.h, z2.h[1]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z0.h[1]\n"
+ "fmla z21.h, z13.h, z1.h[1]\n"
+ "fmla z25.h, z13.h, z2.h[1]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[1]\n"
+ "fmla z22.h, z14.h, z1.h[1]\n"
+ "fmla z26.h, z14.h, z2.h[1]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[1]\n"
+ "fmla z23.h, z15.h, z1.h[1]\n"
+ "fmla z27.h, z15.h, z2.h[1]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z8.h, z1.h[2]\n"
+ "fmla z24.h, z8.h, z2.h[2]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[2]\n"
+ "fmla z21.h, z9.h, z1.h[2]\n"
+ "fmla z25.h, z9.h, z2.h[2]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[2]\n"
+ "fmla z22.h, z10.h, z1.h[2]\n"
+ "fmla z26.h, z10.h, z2.h[2]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[2]\n"
+ "fmla z23.h, z11.h, z1.h[2]\n"
+ "fmla z27.h, z11.h, z2.h[2]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z1.h[3]\n"
+ "fmla z24.h, z12.h, z2.h[3]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z0.h[3]\n"
+ "fmla z21.h, z13.h, z1.h[3]\n"
+ "fmla z25.h, z13.h, z2.h[3]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[3]\n"
+ "fmla z22.h, z14.h, z1.h[3]\n"
+ "fmla z26.h, z14.h, z2.h[3]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[3]\n"
+ "fmla z23.h, z15.h, z1.h[3]\n"
+ "fmla z27.h, z15.h, z2.h[3]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[4]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z8.h, z1.h[4]\n"
+ "fmla z24.h, z8.h, z2.h[4]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[4]\n"
+ "fmla z21.h, z9.h, z1.h[4]\n"
+ "fmla z25.h, z9.h, z2.h[4]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[4]\n"
+ "fmla z22.h, z10.h, z1.h[4]\n"
+ "fmla z26.h, z10.h, z2.h[4]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[4]\n"
+ "fmla z23.h, z11.h, z1.h[4]\n"
+ "fmla z27.h, z11.h, z2.h[4]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[5]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z1.h[5]\n"
+ "fmla z24.h, z12.h, z2.h[5]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z0.h[5]\n"
+ "fmla z21.h, z13.h, z1.h[5]\n"
+ "fmla z25.h, z13.h, z2.h[5]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[5]\n"
+ "fmla z22.h, z14.h, z1.h[5]\n"
+ "fmla z26.h, z14.h, z2.h[5]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[5]\n"
+ "fmla z23.h, z15.h, z1.h[5]\n"
+ "fmla z27.h, z15.h, z2.h[5]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[6]\n"
+ "fmla z20.h, z8.h, z1.h[6]\n"
+ "fmla z24.h, z8.h, z2.h[6]\n"
+ "fmla z17.h, z9.h, z0.h[6]\n"
+ "fmla z21.h, z9.h, z1.h[6]\n"
+ "fmla z25.h, z9.h, z2.h[6]\n"
+ "fmla z18.h, z10.h, z0.h[6]\n"
+ "fmla z22.h, z10.h, z1.h[6]\n"
+ "fmla z26.h, z10.h, z2.h[6]\n"
+ "fmla z19.h, z11.h, z0.h[6]\n"
+ "fmla z23.h, z11.h, z1.h[6]\n"
+ "fmla z27.h, z11.h, z2.h[6]\n"
+ "fmla z16.h, z12.h, z0.h[7]\n"
+ "fmla z20.h, z12.h, z1.h[7]\n"
+ "fmla z24.h, z12.h, z2.h[7]\n"
+ "fmla z17.h, z13.h, z0.h[7]\n"
+ "fmla z21.h, z13.h, z1.h[7]\n"
+ "fmla z25.h, z13.h, z2.h[7]\n"
+ "fmla z18.h, z14.h, z0.h[7]\n"
+ "fmla z22.h, z14.h, z1.h[7]\n"
+ "fmla z26.h, z14.h, z2.h[7]\n"
+ "fmla z19.h, z15.h, z0.h[7]\n"
+ "fmla z23.h, z15.h, z1.h[7]\n"
+ "fmla z27.h, z15.h, z2.h[7]\n"
+ "cbz %[blocks], 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[0]\n"
+ "fmla z20.h, z8.h, z5.h[0]\n"
+ "fmla z24.h, z8.h, z6.h[0]\n"
+ "fmla z17.h, z9.h, z4.h[0]\n"
+ "fmla z21.h, z9.h, z5.h[0]\n"
+ "fmla z25.h, z9.h, z6.h[0]\n"
+ "fmla z18.h, z10.h, z4.h[0]\n"
+ "fmla z22.h, z10.h, z5.h[0]\n"
+ "fmla z26.h, z10.h, z6.h[0]\n"
+ "fmla z19.h, z11.h, z4.h[0]\n"
+ "fmla z23.h, z11.h, z5.h[0]\n"
+ "fmla z27.h, z11.h, z6.h[0]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[1]\n"
+ "fmla z20.h, z12.h, z5.h[1]\n"
+ "fmla z24.h, z12.h, z6.h[1]\n"
+ "fmla z17.h, z13.h, z4.h[1]\n"
+ "fmla z21.h, z13.h, z5.h[1]\n"
+ "fmla z25.h, z13.h, z6.h[1]\n"
+ "fmla z18.h, z14.h, z4.h[1]\n"
+ "fmla z22.h, z14.h, z5.h[1]\n"
+ "fmla z26.h, z14.h, z6.h[1]\n"
+ "fmla z19.h, z15.h, z4.h[1]\n"
+ "fmla z23.h, z15.h, z5.h[1]\n"
+ "fmla z27.h, z15.h, z6.h[1]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[2]\n"
+ "fmla z20.h, z8.h, z5.h[2]\n"
+ "fmla z24.h, z8.h, z6.h[2]\n"
+ "fmla z17.h, z9.h, z4.h[2]\n"
+ "fmla z21.h, z9.h, z5.h[2]\n"
+ "fmla z25.h, z9.h, z6.h[2]\n"
+ "fmla z18.h, z10.h, z4.h[2]\n"
+ "fmla z22.h, z10.h, z5.h[2]\n"
+ "fmla z26.h, z10.h, z6.h[2]\n"
+ "fmla z19.h, z11.h, z4.h[2]\n"
+ "fmla z23.h, z11.h, z5.h[2]\n"
+ "fmla z27.h, z11.h, z6.h[2]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[3]\n"
+ "fmla z20.h, z12.h, z5.h[3]\n"
+ "fmla z24.h, z12.h, z6.h[3]\n"
+ "fmla z17.h, z13.h, z4.h[3]\n"
+ "fmla z21.h, z13.h, z5.h[3]\n"
+ "fmla z25.h, z13.h, z6.h[3]\n"
+ "fmla z18.h, z14.h, z4.h[3]\n"
+ "fmla z22.h, z14.h, z5.h[3]\n"
+ "fmla z26.h, z14.h, z6.h[3]\n"
+ "fmla z19.h, z15.h, z4.h[3]\n"
+ "fmla z23.h, z15.h, z5.h[3]\n"
+ "fmla z27.h, z15.h, z6.h[3]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[4]\n"
+ "fmla z20.h, z8.h, z5.h[4]\n"
+ "fmla z24.h, z8.h, z6.h[4]\n"
+ "fmla z17.h, z9.h, z4.h[4]\n"
+ "fmla z21.h, z9.h, z5.h[4]\n"
+ "fmla z25.h, z9.h, z6.h[4]\n"
+ "fmla z18.h, z10.h, z4.h[4]\n"
+ "fmla z22.h, z10.h, z5.h[4]\n"
+ "fmla z26.h, z10.h, z6.h[4]\n"
+ "fmla z19.h, z11.h, z4.h[4]\n"
+ "fmla z23.h, z11.h, z5.h[4]\n"
+ "fmla z27.h, z11.h, z6.h[4]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[5]\n"
+ "fmla z20.h, z12.h, z5.h[5]\n"
+ "fmla z24.h, z12.h, z6.h[5]\n"
+ "fmla z17.h, z13.h, z4.h[5]\n"
+ "fmla z21.h, z13.h, z5.h[5]\n"
+ "fmla z25.h, z13.h, z6.h[5]\n"
+ "fmla z18.h, z14.h, z4.h[5]\n"
+ "fmla z22.h, z14.h, z5.h[5]\n"
+ "fmla z26.h, z14.h, z6.h[5]\n"
+ "fmla z19.h, z15.h, z4.h[5]\n"
+ "fmla z23.h, z15.h, z5.h[5]\n"
+ "fmla z27.h, z15.h, z6.h[5]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[6]\n"
+ "fmla z20.h, z8.h, z5.h[6]\n"
+ "fmla z24.h, z8.h, z6.h[6]\n"
+ "fmla z17.h, z9.h, z4.h[6]\n"
+ "fmla z21.h, z9.h, z5.h[6]\n"
+ "fmla z25.h, z9.h, z6.h[6]\n"
+ "fmla z18.h, z10.h, z4.h[6]\n"
+ "fmla z22.h, z10.h, z5.h[6]\n"
+ "fmla z26.h, z10.h, z6.h[6]\n"
+ "fmla z19.h, z11.h, z4.h[6]\n"
+ "fmla z23.h, z11.h, z5.h[6]\n"
+ "fmla z27.h, z11.h, z6.h[6]\n"
+ "5:\n"
+ "st1h z16.h, p0, [%[c_ptr0]]\n"
+ "st1h z17.h, p1, [%[c_ptr0], #1, MUL VL]\n"
+ "st1h z18.h, p2, [%[c_ptr0], #2, MUL VL]\n"
+ "st1h z19.h, p3, [%[c_ptr0], #3, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #4\n"
+ "st1h z20.h, p0, [c_ptr1]\n"
+ "st1h z21.h, p1, [c_ptr1, #1, MUL VL]\n"
+ "st1h z22.h, p2, [c_ptr1, #2, MUL VL]\n"
+ "st1h z23.h, p3, [c_ptr1, #3, MUL VL]\n"
+ "st1h z24.h, p0, [c_ptr2]\n"
+ "st1h z25.h, p1, [c_ptr2, #1, MUL VL]\n"
+ "st1h z26.h, p2, [c_ptr2, #2, MUL VL]\n"
+ "st1h z27.h, p3, [c_ptr2, #3, MUL VL]\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers), [ldb] "r" (ldbb)
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory"
+ );
+ break;
+ default:
+ case 4:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "whilelt p6.h, %[temp], %[leftovers]\n"
+ "whilelt p0.h, %[temp], %[width]\n"
+ "inch %[temp], all, mul #1\n"
+ "ptrue p7.h\n"
+ "whilelt p1.h, %[temp], %[width]\n"
+ "inch %[temp], all, mul #1\n"
+ "whilelt p2.h, %[temp], %[width]\n"
+ "inch %[temp], all, mul #1\n"
+ "whilelt p3.h, %[temp], %[width]\n"
+ "cbz %[beta0], 1f\n"
+ "mov z16.h, #0\n"
+ "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
+ "mov z17.h, #0\n"
+ "ld1rqh z1.h, p7/z, [a_ptr1]\n"
+ "mov z18.h, #0\n"
+ "ld1rqh z2.h, p7/z, [a_ptr2]\n"
+ "mov z19.h, #0\n"
+ "ld1rqh z3.h, p7/z, [a_ptr3]\n"
+ "mov z20.h, #0\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "mov z21.h, #0\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "mov z22.h, #0\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "mov z23.h, #0\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "mov z24.h, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "mov z25.h, #0\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "mov z26.h, #0\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "mov z27.h, #0\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "mov z28.h, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "mov z29.h, #0\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "mov z30.h, #0\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "mov z31.h, #0\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1rh z15.h, p7/z, [%[betaptr]]\n"
+ "ld1h z16.h, p0/z, [%[c_ptr0]]\n"
+ "ld1h z17.h, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+ "ld1h z18.h, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+ "ld1h z19.h, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+ "fmul z16.h, p7/m, z16.h, z15.h\n"
+ "ld1h z20.h, p0/z, [c_ptr1]\n"
+ "fmul z17.h, p7/m, z17.h, z15.h\n"
+ "ld1h z21.h, p1/z, [c_ptr1, #1, MUL VL]\n"
+ "fmul z18.h, p7/m, z18.h, z15.h\n"
+ "ld1h z22.h, p2/z, [c_ptr1, #2, MUL VL]\n"
+ "fmul z19.h, p7/m, z19.h, z15.h\n"
+ "ld1h z23.h, p3/z, [c_ptr1, #3, MUL VL]\n"
+ "fmul z20.h, p7/m, z20.h, z15.h\n"
+ "ld1h z24.h, p0/z, [c_ptr2]\n"
+ "fmul z21.h, p7/m, z21.h, z15.h\n"
+ "ld1h z25.h, p1/z, [c_ptr2, #1, MUL VL]\n"
+ "fmul z22.h, p7/m, z22.h, z15.h\n"
+ "ld1h z26.h, p2/z, [c_ptr2, #2, MUL VL]\n"
+ "fmul z23.h, p7/m, z23.h, z15.h\n"
+ "ld1h z27.h, p3/z, [c_ptr2, #3, MUL VL]\n"
+ "fmul z24.h, p7/m, z24.h, z15.h\n"
+ "ld1h z28.h, p0/z, [c_ptr3]\n"
+ "fmul z25.h, p7/m, z25.h, z15.h\n"
+ "ld1h z29.h, p1/z, [c_ptr3, #1, MUL VL]\n"
+ "fmul z26.h, p7/m, z26.h, z15.h\n"
+ "ld1h z30.h, p2/z, [c_ptr3, #2, MUL VL]\n"
+ "fmul z27.h, p7/m, z27.h, z15.h\n"
+ "ld1h z31.h, p3/z, [c_ptr3, #3, MUL VL]\n"
+ "fmul z28.h, p7/m, z28.h, z15.h\n"
+ "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
+ "fmul z29.h, p7/m, z29.h, z15.h\n"
+ "ld1rqh z1.h, p7/z, [a_ptr1]\n"
+ "fmul z30.h, p7/m, z30.h, z15.h\n"
+ "ld1rqh z2.h, p7/z, [a_ptr2]\n"
+ "fmul z31.h, p7/m, z31.h, z15.h\n"
+ "ld1rqh z3.h, p7/z, [a_ptr3]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ "fmla z16.h, z8.h, z0.h[0]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z20.h, z8.h, z1.h[0]\n"
+ "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
+ "fmla z24.h, z8.h, z2.h[0]\n"
+ "ld1rqh z5.h, p7/z, [a_ptr1]\n"
+ "fmla z28.h, z8.h, z3.h[0]\n"
+ "ld1rqh z6.h, p7/z, [a_ptr2]\n"
+ "fmla z17.h, z9.h, z0.h[0]\n"
+ "ld1rqh z7.h, p7/z, [a_ptr3]\n"
+ "fmla z21.h, z9.h, z1.h[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "fmla z25.h, z9.h, z2.h[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z29.h, z9.h, z3.h[0]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z18.h, z10.h, z0.h[0]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z22.h, z10.h, z1.h[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "fmla z26.h, z10.h, z2.h[0]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ "fmla z30.h, z10.h, z3.h[0]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[0]\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
+ "fmla z23.h, z11.h, z1.h[0]\n"
+ "add a_ptr3, a_ptr3, #0x20\n"
+ "fmla z27.h, z11.h, z2.h[0]\n"
+ "fmla z31.h, z11.h, z3.h[0]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z1.h[1]\n"
+ "fmla z24.h, z12.h, z2.h[1]\n"
+ "fmla z28.h, z12.h, z3.h[1]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z0.h[1]\n"
+ "fmla z21.h, z13.h, z1.h[1]\n"
+ "fmla z25.h, z13.h, z2.h[1]\n"
+ "fmla z29.h, z13.h, z3.h[1]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[1]\n"
+ "fmla z22.h, z14.h, z1.h[1]\n"
+ "fmla z26.h, z14.h, z2.h[1]\n"
+ "fmla z30.h, z14.h, z3.h[1]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[1]\n"
+ "fmla z23.h, z15.h, z1.h[1]\n"
+ "fmla z27.h, z15.h, z2.h[1]\n"
+ "fmla z31.h, z15.h, z3.h[1]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z8.h, z1.h[2]\n"
+ "fmla z24.h, z8.h, z2.h[2]\n"
+ "fmla z28.h, z8.h, z3.h[2]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[2]\n"
+ "fmla z21.h, z9.h, z1.h[2]\n"
+ "fmla z25.h, z9.h, z2.h[2]\n"
+ "fmla z29.h, z9.h, z3.h[2]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[2]\n"
+ "fmla z22.h, z10.h, z1.h[2]\n"
+ "fmla z26.h, z10.h, z2.h[2]\n"
+ "fmla z30.h, z10.h, z3.h[2]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[2]\n"
+ "fmla z23.h, z11.h, z1.h[2]\n"
+ "fmla z27.h, z11.h, z2.h[2]\n"
+ "fmla z31.h, z11.h, z3.h[2]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z1.h[3]\n"
+ "fmla z24.h, z12.h, z2.h[3]\n"
+ "fmla z28.h, z12.h, z3.h[3]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z0.h[3]\n"
+ "fmla z21.h, z13.h, z1.h[3]\n"
+ "fmla z25.h, z13.h, z2.h[3]\n"
+ "fmla z29.h, z13.h, z3.h[3]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[3]\n"
+ "fmla z22.h, z14.h, z1.h[3]\n"
+ "fmla z26.h, z14.h, z2.h[3]\n"
+ "fmla z30.h, z14.h, z3.h[3]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[3]\n"
+ "fmla z23.h, z15.h, z1.h[3]\n"
+ "fmla z27.h, z15.h, z2.h[3]\n"
+ "fmla z31.h, z15.h, z3.h[3]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[4]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z8.h, z1.h[4]\n"
+ "fmla z24.h, z8.h, z2.h[4]\n"
+ "fmla z28.h, z8.h, z3.h[4]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[4]\n"
+ "fmla z21.h, z9.h, z1.h[4]\n"
+ "fmla z25.h, z9.h, z2.h[4]\n"
+ "fmla z29.h, z9.h, z3.h[4]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[4]\n"
+ "fmla z22.h, z10.h, z1.h[4]\n"
+ "fmla z26.h, z10.h, z2.h[4]\n"
+ "fmla z30.h, z10.h, z3.h[4]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[4]\n"
+ "fmla z23.h, z11.h, z1.h[4]\n"
+ "fmla z27.h, z11.h, z2.h[4]\n"
+ "fmla z31.h, z11.h, z3.h[4]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[5]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z1.h[5]\n"
+ "fmla z24.h, z12.h, z2.h[5]\n"
+ "fmla z28.h, z12.h, z3.h[5]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z0.h[5]\n"
+ "fmla z21.h, z13.h, z1.h[5]\n"
+ "fmla z25.h, z13.h, z2.h[5]\n"
+ "fmla z29.h, z13.h, z3.h[5]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[5]\n"
+ "fmla z22.h, z14.h, z1.h[5]\n"
+ "fmla z26.h, z14.h, z2.h[5]\n"
+ "fmla z30.h, z14.h, z3.h[5]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[5]\n"
+ "fmla z23.h, z15.h, z1.h[5]\n"
+ "fmla z27.h, z15.h, z2.h[5]\n"
+ "fmla z31.h, z15.h, z3.h[5]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[6]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z8.h, z1.h[6]\n"
+ "fmla z24.h, z8.h, z2.h[6]\n"
+ "fmla z28.h, z8.h, z3.h[6]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[6]\n"
+ "fmla z21.h, z9.h, z1.h[6]\n"
+ "fmla z25.h, z9.h, z2.h[6]\n"
+ "fmla z29.h, z9.h, z3.h[6]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[6]\n"
+ "fmla z22.h, z10.h, z1.h[6]\n"
+ "fmla z26.h, z10.h, z2.h[6]\n"
+ "fmla z30.h, z10.h, z3.h[6]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[6]\n"
+ "fmla z23.h, z11.h, z1.h[6]\n"
+ "fmla z27.h, z11.h, z2.h[6]\n"
+ "fmla z31.h, z11.h, z3.h[6]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[7]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z1.h[7]\n"
+ "fmla z24.h, z12.h, z2.h[7]\n"
+ "fmla z28.h, z12.h, z3.h[7]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z0.h[7]\n"
+ "fmla z21.h, z13.h, z1.h[7]\n"
+ "fmla z25.h, z13.h, z2.h[7]\n"
+ "fmla z29.h, z13.h, z3.h[7]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[7]\n"
+ "fmla z22.h, z14.h, z1.h[7]\n"
+ "fmla z26.h, z14.h, z2.h[7]\n"
+ "fmla z30.h, z14.h, z3.h[7]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[7]\n"
+ "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
+ "fmla z23.h, z15.h, z1.h[7]\n"
+ "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n"
+ "fmla z27.h, z15.h, z2.h[7]\n"
+ "ld1rqh z2.h, p7/z, [a_ptr2, #-0x10]\n"
+ "fmla z31.h, z15.h, z3.h[7]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[0]\n"
+ "ld1rqh z3.h, p7/z, [a_ptr3, #-0x10]\n"
+ "fmla z20.h, z8.h, z5.h[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z24.h, z8.h, z6.h[0]\n"
+ "fmla z28.h, z8.h, z7.h[0]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z4.h[0]\n"
+ "fmla z21.h, z9.h, z5.h[0]\n"
+ "fmla z25.h, z9.h, z6.h[0]\n"
+ "fmla z29.h, z9.h, z7.h[0]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[0]\n"
+ "fmla z22.h, z10.h, z5.h[0]\n"
+ "fmla z26.h, z10.h, z6.h[0]\n"
+ "fmla z30.h, z10.h, z7.h[0]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[0]\n"
+ "fmla z23.h, z11.h, z5.h[0]\n"
+ "fmla z27.h, z11.h, z6.h[0]\n"
+ "fmla z31.h, z11.h, z7.h[0]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z5.h[1]\n"
+ "fmla z24.h, z12.h, z6.h[1]\n"
+ "fmla z28.h, z12.h, z7.h[1]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z4.h[1]\n"
+ "fmla z21.h, z13.h, z5.h[1]\n"
+ "fmla z25.h, z13.h, z6.h[1]\n"
+ "fmla z29.h, z13.h, z7.h[1]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[1]\n"
+ "fmla z22.h, z14.h, z5.h[1]\n"
+ "fmla z26.h, z14.h, z6.h[1]\n"
+ "fmla z30.h, z14.h, z7.h[1]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[1]\n"
+ "fmla z23.h, z15.h, z5.h[1]\n"
+ "fmla z27.h, z15.h, z6.h[1]\n"
+ "fmla z31.h, z15.h, z7.h[1]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z8.h, z5.h[2]\n"
+ "fmla z24.h, z8.h, z6.h[2]\n"
+ "fmla z28.h, z8.h, z7.h[2]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z4.h[2]\n"
+ "fmla z21.h, z9.h, z5.h[2]\n"
+ "fmla z25.h, z9.h, z6.h[2]\n"
+ "fmla z29.h, z9.h, z7.h[2]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[2]\n"
+ "fmla z22.h, z10.h, z5.h[2]\n"
+ "fmla z26.h, z10.h, z6.h[2]\n"
+ "fmla z30.h, z10.h, z7.h[2]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[2]\n"
+ "fmla z23.h, z11.h, z5.h[2]\n"
+ "fmla z27.h, z11.h, z6.h[2]\n"
+ "fmla z31.h, z11.h, z7.h[2]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z5.h[3]\n"
+ "fmla z24.h, z12.h, z6.h[3]\n"
+ "fmla z28.h, z12.h, z7.h[3]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z4.h[3]\n"
+ "fmla z21.h, z13.h, z5.h[3]\n"
+ "fmla z25.h, z13.h, z6.h[3]\n"
+ "fmla z29.h, z13.h, z7.h[3]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[3]\n"
+ "fmla z22.h, z14.h, z5.h[3]\n"
+ "fmla z26.h, z14.h, z6.h[3]\n"
+ "fmla z30.h, z14.h, z7.h[3]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[3]\n"
+ "fmla z23.h, z15.h, z5.h[3]\n"
+ "fmla z27.h, z15.h, z6.h[3]\n"
+ "fmla z31.h, z15.h, z7.h[3]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[4]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z8.h, z5.h[4]\n"
+ "fmla z24.h, z8.h, z6.h[4]\n"
+ "fmla z28.h, z8.h, z7.h[4]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z4.h[4]\n"
+ "fmla z21.h, z9.h, z5.h[4]\n"
+ "fmla z25.h, z9.h, z6.h[4]\n"
+ "fmla z29.h, z9.h, z7.h[4]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[4]\n"
+ "fmla z22.h, z10.h, z5.h[4]\n"
+ "fmla z26.h, z10.h, z6.h[4]\n"
+ "fmla z30.h, z10.h, z7.h[4]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[4]\n"
+ "fmla z23.h, z11.h, z5.h[4]\n"
+ "fmla z27.h, z11.h, z6.h[4]\n"
+ "fmla z31.h, z11.h, z7.h[4]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[5]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z5.h[5]\n"
+ "fmla z24.h, z12.h, z6.h[5]\n"
+ "fmla z28.h, z12.h, z7.h[5]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z4.h[5]\n"
+ "fmla z21.h, z13.h, z5.h[5]\n"
+ "fmla z25.h, z13.h, z6.h[5]\n"
+ "fmla z29.h, z13.h, z7.h[5]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[5]\n"
+ "fmla z22.h, z14.h, z5.h[5]\n"
+ "fmla z26.h, z14.h, z6.h[5]\n"
+ "fmla z30.h, z14.h, z7.h[5]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[5]\n"
+ "fmla z23.h, z15.h, z5.h[5]\n"
+ "fmla z27.h, z15.h, z6.h[5]\n"
+ "fmla z31.h, z15.h, z7.h[5]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[6]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z8.h, z5.h[6]\n"
+ "fmla z24.h, z8.h, z6.h[6]\n"
+ "fmla z28.h, z8.h, z7.h[6]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z4.h[6]\n"
+ "fmla z21.h, z9.h, z5.h[6]\n"
+ "fmla z25.h, z9.h, z6.h[6]\n"
+ "fmla z29.h, z9.h, z7.h[6]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[6]\n"
+ "fmla z22.h, z10.h, z5.h[6]\n"
+ "fmla z26.h, z10.h, z6.h[6]\n"
+ "fmla z30.h, z10.h, z7.h[6]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[6]\n"
+ "fmla z23.h, z11.h, z5.h[6]\n"
+ "fmla z27.h, z11.h, z6.h[6]\n"
+ "fmla z31.h, z11.h, z7.h[6]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[7]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z5.h[7]\n"
+ "fmla z24.h, z12.h, z6.h[7]\n"
+ "fmla z28.h, z12.h, z7.h[7]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z4.h[7]\n"
+ "fmla z21.h, z13.h, z5.h[7]\n"
+ "fmla z25.h, z13.h, z6.h[7]\n"
+ "fmla z29.h, z13.h, z7.h[7]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[7]\n"
+ "fmla z22.h, z14.h, z5.h[7]\n"
+ "fmla z26.h, z14.h, z6.h[7]\n"
+ "fmla z30.h, z14.h, z7.h[7]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[7]\n"
+ "fmla z23.h, z15.h, z5.h[7]\n"
+ "fmla z27.h, z15.h, z6.h[7]\n"
+ "fmla z31.h, z15.h, z7.h[7]\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "cbz %[regs], 4f\n"
+ "fmla z16.h, z8.h, z0.h[0]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z20.h, z8.h, z1.h[0]\n"
+ "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
+ "fmla z24.h, z8.h, z2.h[0]\n"
+ "ld1rqh z5.h, p7/z, [a_ptr1]\n"
+ "fmla z28.h, z8.h, z3.h[0]\n"
+ "ld1rqh z6.h, p7/z, [a_ptr2]\n"
+ "fmla z17.h, z9.h, z0.h[0]\n"
+ "ld1rqh z7.h, p7/z, [a_ptr3]\n"
+ "fmla z21.h, z9.h, z1.h[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z25.h, z9.h, z2.h[0]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z29.h, z9.h, z3.h[0]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[0]\n"
+ "fmla z22.h, z10.h, z1.h[0]\n"
+ "fmla z26.h, z10.h, z2.h[0]\n"
+ "fmla z30.h, z10.h, z3.h[0]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[0]\n"
+ "fmla z23.h, z11.h, z1.h[0]\n"
+ "fmla z27.h, z11.h, z2.h[0]\n"
+ "fmla z31.h, z11.h, z3.h[0]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z1.h[1]\n"
+ "fmla z24.h, z12.h, z2.h[1]\n"
+ "fmla z28.h, z12.h, z3.h[1]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z0.h[1]\n"
+ "fmla z21.h, z13.h, z1.h[1]\n"
+ "fmla z25.h, z13.h, z2.h[1]\n"
+ "fmla z29.h, z13.h, z3.h[1]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[1]\n"
+ "fmla z22.h, z14.h, z1.h[1]\n"
+ "fmla z26.h, z14.h, z2.h[1]\n"
+ "fmla z30.h, z14.h, z3.h[1]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[1]\n"
+ "fmla z23.h, z15.h, z1.h[1]\n"
+ "fmla z27.h, z15.h, z2.h[1]\n"
+ "fmla z31.h, z15.h, z3.h[1]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z8.h, z1.h[2]\n"
+ "fmla z24.h, z8.h, z2.h[2]\n"
+ "fmla z28.h, z8.h, z3.h[2]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[2]\n"
+ "fmla z21.h, z9.h, z1.h[2]\n"
+ "fmla z25.h, z9.h, z2.h[2]\n"
+ "fmla z29.h, z9.h, z3.h[2]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[2]\n"
+ "fmla z22.h, z10.h, z1.h[2]\n"
+ "fmla z26.h, z10.h, z2.h[2]\n"
+ "fmla z30.h, z10.h, z3.h[2]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[2]\n"
+ "fmla z23.h, z11.h, z1.h[2]\n"
+ "fmla z27.h, z11.h, z2.h[2]\n"
+ "fmla z31.h, z11.h, z3.h[2]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z1.h[3]\n"
+ "fmla z24.h, z12.h, z2.h[3]\n"
+ "fmla z28.h, z12.h, z3.h[3]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z0.h[3]\n"
+ "fmla z21.h, z13.h, z1.h[3]\n"
+ "fmla z25.h, z13.h, z2.h[3]\n"
+ "fmla z29.h, z13.h, z3.h[3]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[3]\n"
+ "fmla z22.h, z14.h, z1.h[3]\n"
+ "fmla z26.h, z14.h, z2.h[3]\n"
+ "fmla z30.h, z14.h, z3.h[3]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[3]\n"
+ "fmla z23.h, z15.h, z1.h[3]\n"
+ "fmla z27.h, z15.h, z2.h[3]\n"
+ "fmla z31.h, z15.h, z3.h[3]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[4]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z8.h, z1.h[4]\n"
+ "fmla z24.h, z8.h, z2.h[4]\n"
+ "fmla z28.h, z8.h, z3.h[4]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[4]\n"
+ "fmla z21.h, z9.h, z1.h[4]\n"
+ "fmla z25.h, z9.h, z2.h[4]\n"
+ "fmla z29.h, z9.h, z3.h[4]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[4]\n"
+ "fmla z22.h, z10.h, z1.h[4]\n"
+ "fmla z26.h, z10.h, z2.h[4]\n"
+ "fmla z30.h, z10.h, z3.h[4]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[4]\n"
+ "fmla z23.h, z11.h, z1.h[4]\n"
+ "fmla z27.h, z11.h, z2.h[4]\n"
+ "fmla z31.h, z11.h, z3.h[4]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[5]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z1.h[5]\n"
+ "fmla z24.h, z12.h, z2.h[5]\n"
+ "fmla z28.h, z12.h, z3.h[5]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z0.h[5]\n"
+ "fmla z21.h, z13.h, z1.h[5]\n"
+ "fmla z25.h, z13.h, z2.h[5]\n"
+ "fmla z29.h, z13.h, z3.h[5]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[5]\n"
+ "fmla z22.h, z14.h, z1.h[5]\n"
+ "fmla z26.h, z14.h, z2.h[5]\n"
+ "fmla z30.h, z14.h, z3.h[5]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[5]\n"
+ "fmla z23.h, z15.h, z1.h[5]\n"
+ "fmla z27.h, z15.h, z2.h[5]\n"
+ "fmla z31.h, z15.h, z3.h[5]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[6]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z8.h, z1.h[6]\n"
+ "fmla z24.h, z8.h, z2.h[6]\n"
+ "fmla z28.h, z8.h, z3.h[6]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[6]\n"
+ "fmla z21.h, z9.h, z1.h[6]\n"
+ "fmla z25.h, z9.h, z2.h[6]\n"
+ "fmla z29.h, z9.h, z3.h[6]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[6]\n"
+ "fmla z22.h, z10.h, z1.h[6]\n"
+ "fmla z26.h, z10.h, z2.h[6]\n"
+ "fmla z30.h, z10.h, z3.h[6]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[6]\n"
+ "fmla z23.h, z11.h, z1.h[6]\n"
+ "fmla z27.h, z11.h, z2.h[6]\n"
+ "fmla z31.h, z11.h, z3.h[6]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[7]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z1.h[7]\n"
+ "fmla z24.h, z12.h, z2.h[7]\n"
+ "fmla z28.h, z12.h, z3.h[7]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z0.h[7]\n"
+ "fmla z21.h, z13.h, z1.h[7]\n"
+ "fmla z25.h, z13.h, z2.h[7]\n"
+ "fmla z29.h, z13.h, z3.h[7]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[7]\n"
+ "fmla z22.h, z14.h, z1.h[7]\n"
+ "fmla z26.h, z14.h, z2.h[7]\n"
+ "fmla z30.h, z14.h, z3.h[7]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[7]\n"
+ "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
+ "fmla z23.h, z15.h, z1.h[7]\n"
+ "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n"
+ "fmla z27.h, z15.h, z2.h[7]\n"
+ "ld1rqh z2.h, p6/z, [a_ptr2, #0x10]\n"
+ "fmla z31.h, z15.h, z3.h[7]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[0]\n"
+ "ld1rqh z3.h, p6/z, [a_ptr3, #0x10]\n"
+ "fmla z20.h, z8.h, z5.h[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z24.h, z8.h, z6.h[0]\n"
+ "fmla z28.h, z8.h, z7.h[0]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z4.h[0]\n"
+ "fmla z21.h, z9.h, z5.h[0]\n"
+ "fmla z25.h, z9.h, z6.h[0]\n"
+ "fmla z29.h, z9.h, z7.h[0]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[0]\n"
+ "fmla z22.h, z10.h, z5.h[0]\n"
+ "fmla z26.h, z10.h, z6.h[0]\n"
+ "fmla z30.h, z10.h, z7.h[0]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[0]\n"
+ "fmla z23.h, z11.h, z5.h[0]\n"
+ "fmla z27.h, z11.h, z6.h[0]\n"
+ "fmla z31.h, z11.h, z7.h[0]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z5.h[1]\n"
+ "fmla z24.h, z12.h, z6.h[1]\n"
+ "fmla z28.h, z12.h, z7.h[1]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z4.h[1]\n"
+ "fmla z21.h, z13.h, z5.h[1]\n"
+ "fmla z25.h, z13.h, z6.h[1]\n"
+ "fmla z29.h, z13.h, z7.h[1]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[1]\n"
+ "fmla z22.h, z14.h, z5.h[1]\n"
+ "fmla z26.h, z14.h, z6.h[1]\n"
+ "fmla z30.h, z14.h, z7.h[1]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[1]\n"
+ "fmla z23.h, z15.h, z5.h[1]\n"
+ "fmla z27.h, z15.h, z6.h[1]\n"
+ "fmla z31.h, z15.h, z7.h[1]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z8.h, z5.h[2]\n"
+ "fmla z24.h, z8.h, z6.h[2]\n"
+ "fmla z28.h, z8.h, z7.h[2]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z4.h[2]\n"
+ "fmla z21.h, z9.h, z5.h[2]\n"
+ "fmla z25.h, z9.h, z6.h[2]\n"
+ "fmla z29.h, z9.h, z7.h[2]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[2]\n"
+ "fmla z22.h, z10.h, z5.h[2]\n"
+ "fmla z26.h, z10.h, z6.h[2]\n"
+ "fmla z30.h, z10.h, z7.h[2]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[2]\n"
+ "fmla z23.h, z11.h, z5.h[2]\n"
+ "fmla z27.h, z11.h, z6.h[2]\n"
+ "fmla z31.h, z11.h, z7.h[2]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z5.h[3]\n"
+ "fmla z24.h, z12.h, z6.h[3]\n"
+ "fmla z28.h, z12.h, z7.h[3]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z4.h[3]\n"
+ "fmla z21.h, z13.h, z5.h[3]\n"
+ "fmla z25.h, z13.h, z6.h[3]\n"
+ "fmla z29.h, z13.h, z7.h[3]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[3]\n"
+ "fmla z22.h, z14.h, z5.h[3]\n"
+ "fmla z26.h, z14.h, z6.h[3]\n"
+ "fmla z30.h, z14.h, z7.h[3]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[3]\n"
+ "fmla z23.h, z15.h, z5.h[3]\n"
+ "fmla z27.h, z15.h, z6.h[3]\n"
+ "fmla z31.h, z15.h, z7.h[3]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[4]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z8.h, z5.h[4]\n"
+ "fmla z24.h, z8.h, z6.h[4]\n"
+ "fmla z28.h, z8.h, z7.h[4]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z4.h[4]\n"
+ "fmla z21.h, z9.h, z5.h[4]\n"
+ "fmla z25.h, z9.h, z6.h[4]\n"
+ "fmla z29.h, z9.h, z7.h[4]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z4.h[4]\n"
+ "fmla z22.h, z10.h, z5.h[4]\n"
+ "fmla z26.h, z10.h, z6.h[4]\n"
+ "fmla z30.h, z10.h, z7.h[4]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z4.h[4]\n"
+ "fmla z23.h, z11.h, z5.h[4]\n"
+ "fmla z27.h, z11.h, z6.h[4]\n"
+ "fmla z31.h, z11.h, z7.h[4]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[5]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z5.h[5]\n"
+ "fmla z24.h, z12.h, z6.h[5]\n"
+ "fmla z28.h, z12.h, z7.h[5]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z4.h[5]\n"
+ "fmla z21.h, z13.h, z5.h[5]\n"
+ "fmla z25.h, z13.h, z6.h[5]\n"
+ "fmla z29.h, z13.h, z7.h[5]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z4.h[5]\n"
+ "fmla z22.h, z14.h, z5.h[5]\n"
+ "fmla z26.h, z14.h, z6.h[5]\n"
+ "fmla z30.h, z14.h, z7.h[5]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z4.h[5]\n"
+ "fmla z23.h, z15.h, z5.h[5]\n"
+ "fmla z27.h, z15.h, z6.h[5]\n"
+ "fmla z31.h, z15.h, z7.h[5]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[6]\n"
+ "fmla z20.h, z8.h, z5.h[6]\n"
+ "fmla z24.h, z8.h, z6.h[6]\n"
+ "fmla z28.h, z8.h, z7.h[6]\n"
+ "fmla z17.h, z9.h, z4.h[6]\n"
+ "fmla z21.h, z9.h, z5.h[6]\n"
+ "fmla z25.h, z9.h, z6.h[6]\n"
+ "fmla z29.h, z9.h, z7.h[6]\n"
+ "fmla z18.h, z10.h, z4.h[6]\n"
+ "fmla z22.h, z10.h, z5.h[6]\n"
+ "fmla z26.h, z10.h, z6.h[6]\n"
+ "fmla z30.h, z10.h, z7.h[6]\n"
+ "fmla z19.h, z11.h, z4.h[6]\n"
+ "fmla z23.h, z11.h, z5.h[6]\n"
+ "fmla z27.h, z11.h, z6.h[6]\n"
+ "fmla z31.h, z11.h, z7.h[6]\n"
+ "fmla z16.h, z12.h, z4.h[7]\n"
+ "fmla z20.h, z12.h, z5.h[7]\n"
+ "fmla z24.h, z12.h, z6.h[7]\n"
+ "fmla z28.h, z12.h, z7.h[7]\n"
+ "fmla z17.h, z13.h, z4.h[7]\n"
+ "fmla z21.h, z13.h, z5.h[7]\n"
+ "fmla z25.h, z13.h, z6.h[7]\n"
+ "fmla z29.h, z13.h, z7.h[7]\n"
+ "fmla z18.h, z14.h, z4.h[7]\n"
+ "fmla z22.h, z14.h, z5.h[7]\n"
+ "fmla z26.h, z14.h, z6.h[7]\n"
+ "fmla z30.h, z14.h, z7.h[7]\n"
+ "fmla z19.h, z15.h, z4.h[7]\n"
+ "fmla z23.h, z15.h, z5.h[7]\n"
+ "fmla z27.h, z15.h, z6.h[7]\n"
+ "fmla z31.h, z15.h, z7.h[7]\n"
+ "cbz %[blocks], 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[0]\n"
+ "fmla z20.h, z8.h, z1.h[0]\n"
+ "fmla z24.h, z8.h, z2.h[0]\n"
+ "fmla z28.h, z8.h, z3.h[0]\n"
+ "fmla z17.h, z9.h, z0.h[0]\n"
+ "fmla z21.h, z9.h, z1.h[0]\n"
+ "fmla z25.h, z9.h, z2.h[0]\n"
+ "fmla z29.h, z9.h, z3.h[0]\n"
+ "fmla z18.h, z10.h, z0.h[0]\n"
+ "fmla z22.h, z10.h, z1.h[0]\n"
+ "fmla z26.h, z10.h, z2.h[0]\n"
+ "fmla z30.h, z10.h, z3.h[0]\n"
+ "fmla z19.h, z11.h, z0.h[0]\n"
+ "fmla z23.h, z11.h, z1.h[0]\n"
+ "fmla z27.h, z11.h, z2.h[0]\n"
+ "fmla z31.h, z11.h, z3.h[0]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[1]\n"
+ "fmla z20.h, z12.h, z1.h[1]\n"
+ "fmla z24.h, z12.h, z2.h[1]\n"
+ "fmla z28.h, z12.h, z3.h[1]\n"
+ "fmla z17.h, z13.h, z0.h[1]\n"
+ "fmla z21.h, z13.h, z1.h[1]\n"
+ "fmla z25.h, z13.h, z2.h[1]\n"
+ "fmla z29.h, z13.h, z3.h[1]\n"
+ "fmla z18.h, z14.h, z0.h[1]\n"
+ "fmla z22.h, z14.h, z1.h[1]\n"
+ "fmla z26.h, z14.h, z2.h[1]\n"
+ "fmla z30.h, z14.h, z3.h[1]\n"
+ "fmla z19.h, z15.h, z0.h[1]\n"
+ "fmla z23.h, z15.h, z1.h[1]\n"
+ "fmla z27.h, z15.h, z2.h[1]\n"
+ "fmla z31.h, z15.h, z3.h[1]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[2]\n"
+ "fmla z20.h, z8.h, z1.h[2]\n"
+ "fmla z24.h, z8.h, z2.h[2]\n"
+ "fmla z28.h, z8.h, z3.h[2]\n"
+ "fmla z17.h, z9.h, z0.h[2]\n"
+ "fmla z21.h, z9.h, z1.h[2]\n"
+ "fmla z25.h, z9.h, z2.h[2]\n"
+ "fmla z29.h, z9.h, z3.h[2]\n"
+ "fmla z18.h, z10.h, z0.h[2]\n"
+ "fmla z22.h, z10.h, z1.h[2]\n"
+ "fmla z26.h, z10.h, z2.h[2]\n"
+ "fmla z30.h, z10.h, z3.h[2]\n"
+ "fmla z19.h, z11.h, z0.h[2]\n"
+ "fmla z23.h, z11.h, z1.h[2]\n"
+ "fmla z27.h, z11.h, z2.h[2]\n"
+ "fmla z31.h, z11.h, z3.h[2]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[3]\n"
+ "fmla z20.h, z12.h, z1.h[3]\n"
+ "fmla z24.h, z12.h, z2.h[3]\n"
+ "fmla z28.h, z12.h, z3.h[3]\n"
+ "fmla z17.h, z13.h, z0.h[3]\n"
+ "fmla z21.h, z13.h, z1.h[3]\n"
+ "fmla z25.h, z13.h, z2.h[3]\n"
+ "fmla z29.h, z13.h, z3.h[3]\n"
+ "fmla z18.h, z14.h, z0.h[3]\n"
+ "fmla z22.h, z14.h, z1.h[3]\n"
+ "fmla z26.h, z14.h, z2.h[3]\n"
+ "fmla z30.h, z14.h, z3.h[3]\n"
+ "fmla z19.h, z15.h, z0.h[3]\n"
+ "fmla z23.h, z15.h, z1.h[3]\n"
+ "fmla z27.h, z15.h, z2.h[3]\n"
+ "fmla z31.h, z15.h, z3.h[3]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[4]\n"
+ "fmla z20.h, z8.h, z1.h[4]\n"
+ "fmla z24.h, z8.h, z2.h[4]\n"
+ "fmla z28.h, z8.h, z3.h[4]\n"
+ "fmla z17.h, z9.h, z0.h[4]\n"
+ "fmla z21.h, z9.h, z1.h[4]\n"
+ "fmla z25.h, z9.h, z2.h[4]\n"
+ "fmla z29.h, z9.h, z3.h[4]\n"
+ "fmla z18.h, z10.h, z0.h[4]\n"
+ "fmla z22.h, z10.h, z1.h[4]\n"
+ "fmla z26.h, z10.h, z2.h[4]\n"
+ "fmla z30.h, z10.h, z3.h[4]\n"
+ "fmla z19.h, z11.h, z0.h[4]\n"
+ "fmla z23.h, z11.h, z1.h[4]\n"
+ "fmla z27.h, z11.h, z2.h[4]\n"
+ "fmla z31.h, z11.h, z3.h[4]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[5]\n"
+ "fmla z20.h, z12.h, z1.h[5]\n"
+ "fmla z24.h, z12.h, z2.h[5]\n"
+ "fmla z28.h, z12.h, z3.h[5]\n"
+ "fmla z17.h, z13.h, z0.h[5]\n"
+ "fmla z21.h, z13.h, z1.h[5]\n"
+ "fmla z25.h, z13.h, z2.h[5]\n"
+ "fmla z29.h, z13.h, z3.h[5]\n"
+ "fmla z18.h, z14.h, z0.h[5]\n"
+ "fmla z22.h, z14.h, z1.h[5]\n"
+ "fmla z26.h, z14.h, z2.h[5]\n"
+ "fmla z30.h, z14.h, z3.h[5]\n"
+ "fmla z19.h, z15.h, z0.h[5]\n"
+ "fmla z23.h, z15.h, z1.h[5]\n"
+ "fmla z27.h, z15.h, z2.h[5]\n"
+ "fmla z31.h, z15.h, z3.h[5]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[6]\n"
+ "fmla z20.h, z8.h, z1.h[6]\n"
+ "fmla z24.h, z8.h, z2.h[6]\n"
+ "fmla z28.h, z8.h, z3.h[6]\n"
+ "fmla z17.h, z9.h, z0.h[6]\n"
+ "fmla z21.h, z9.h, z1.h[6]\n"
+ "fmla z25.h, z9.h, z2.h[6]\n"
+ "fmla z29.h, z9.h, z3.h[6]\n"
+ "fmla z18.h, z10.h, z0.h[6]\n"
+ "fmla z22.h, z10.h, z1.h[6]\n"
+ "fmla z26.h, z10.h, z2.h[6]\n"
+ "fmla z30.h, z10.h, z3.h[6]\n"
+ "fmla z19.h, z11.h, z0.h[6]\n"
+ "fmla z23.h, z11.h, z1.h[6]\n"
+ "fmla z27.h, z11.h, z2.h[6]\n"
+ "fmla z31.h, z11.h, z3.h[6]\n"
+ "b 5f\n"
+ "4:\n"
+ "fmla z16.h, z8.h, z0.h[0]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z20.h, z8.h, z1.h[0]\n"
+ "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
+ "fmla z24.h, z8.h, z2.h[0]\n"
+ "ld1rqh z5.h, p6/z, [a_ptr1]\n"
+ "fmla z28.h, z8.h, z3.h[0]\n"
+ "ld1rqh z6.h, p6/z, [a_ptr2]\n"
+ "fmla z17.h, z9.h, z0.h[0]\n"
+ "ld1rqh z7.h, p6/z, [a_ptr3]\n"
+ "fmla z21.h, z9.h, z1.h[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z25.h, z9.h, z2.h[0]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z29.h, z9.h, z3.h[0]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[0]\n"
+ "fmla z22.h, z10.h, z1.h[0]\n"
+ "fmla z26.h, z10.h, z2.h[0]\n"
+ "fmla z30.h, z10.h, z3.h[0]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[0]\n"
+ "fmla z23.h, z11.h, z1.h[0]\n"
+ "fmla z27.h, z11.h, z2.h[0]\n"
+ "fmla z31.h, z11.h, z3.h[0]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z1.h[1]\n"
+ "fmla z24.h, z12.h, z2.h[1]\n"
+ "fmla z28.h, z12.h, z3.h[1]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z0.h[1]\n"
+ "fmla z21.h, z13.h, z1.h[1]\n"
+ "fmla z25.h, z13.h, z2.h[1]\n"
+ "fmla z29.h, z13.h, z3.h[1]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[1]\n"
+ "fmla z22.h, z14.h, z1.h[1]\n"
+ "fmla z26.h, z14.h, z2.h[1]\n"
+ "fmla z30.h, z14.h, z3.h[1]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[1]\n"
+ "fmla z23.h, z15.h, z1.h[1]\n"
+ "fmla z27.h, z15.h, z2.h[1]\n"
+ "fmla z31.h, z15.h, z3.h[1]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z8.h, z1.h[2]\n"
+ "fmla z24.h, z8.h, z2.h[2]\n"
+ "fmla z28.h, z8.h, z3.h[2]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[2]\n"
+ "fmla z21.h, z9.h, z1.h[2]\n"
+ "fmla z25.h, z9.h, z2.h[2]\n"
+ "fmla z29.h, z9.h, z3.h[2]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[2]\n"
+ "fmla z22.h, z10.h, z1.h[2]\n"
+ "fmla z26.h, z10.h, z2.h[2]\n"
+ "fmla z30.h, z10.h, z3.h[2]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[2]\n"
+ "fmla z23.h, z11.h, z1.h[2]\n"
+ "fmla z27.h, z11.h, z2.h[2]\n"
+ "fmla z31.h, z11.h, z3.h[2]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z1.h[3]\n"
+ "fmla z24.h, z12.h, z2.h[3]\n"
+ "fmla z28.h, z12.h, z3.h[3]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z0.h[3]\n"
+ "fmla z21.h, z13.h, z1.h[3]\n"
+ "fmla z25.h, z13.h, z2.h[3]\n"
+ "fmla z29.h, z13.h, z3.h[3]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[3]\n"
+ "fmla z22.h, z14.h, z1.h[3]\n"
+ "fmla z26.h, z14.h, z2.h[3]\n"
+ "fmla z30.h, z14.h, z3.h[3]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[3]\n"
+ "fmla z23.h, z15.h, z1.h[3]\n"
+ "fmla z27.h, z15.h, z2.h[3]\n"
+ "fmla z31.h, z15.h, z3.h[3]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[4]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z8.h, z1.h[4]\n"
+ "fmla z24.h, z8.h, z2.h[4]\n"
+ "fmla z28.h, z8.h, z3.h[4]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z9.h, z0.h[4]\n"
+ "fmla z21.h, z9.h, z1.h[4]\n"
+ "fmla z25.h, z9.h, z2.h[4]\n"
+ "fmla z29.h, z9.h, z3.h[4]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z10.h, z0.h[4]\n"
+ "fmla z22.h, z10.h, z1.h[4]\n"
+ "fmla z26.h, z10.h, z2.h[4]\n"
+ "fmla z30.h, z10.h, z3.h[4]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z11.h, z0.h[4]\n"
+ "fmla z23.h, z11.h, z1.h[4]\n"
+ "fmla z27.h, z11.h, z2.h[4]\n"
+ "fmla z31.h, z11.h, z3.h[4]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z0.h[5]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "fmla z20.h, z12.h, z1.h[5]\n"
+ "fmla z24.h, z12.h, z2.h[5]\n"
+ "fmla z28.h, z12.h, z3.h[5]\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "fmla z17.h, z13.h, z0.h[5]\n"
+ "fmla z21.h, z13.h, z1.h[5]\n"
+ "fmla z25.h, z13.h, z2.h[5]\n"
+ "fmla z29.h, z13.h, z3.h[5]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "fmla z18.h, z14.h, z0.h[5]\n"
+ "fmla z22.h, z14.h, z1.h[5]\n"
+ "fmla z26.h, z14.h, z2.h[5]\n"
+ "fmla z30.h, z14.h, z3.h[5]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "fmla z19.h, z15.h, z0.h[5]\n"
+ "fmla z23.h, z15.h, z1.h[5]\n"
+ "fmla z27.h, z15.h, z2.h[5]\n"
+ "fmla z31.h, z15.h, z3.h[5]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z0.h[6]\n"
+ "fmla z20.h, z8.h, z1.h[6]\n"
+ "fmla z24.h, z8.h, z2.h[6]\n"
+ "fmla z28.h, z8.h, z3.h[6]\n"
+ "fmla z17.h, z9.h, z0.h[6]\n"
+ "fmla z21.h, z9.h, z1.h[6]\n"
+ "fmla z25.h, z9.h, z2.h[6]\n"
+ "fmla z29.h, z9.h, z3.h[6]\n"
+ "fmla z18.h, z10.h, z0.h[6]\n"
+ "fmla z22.h, z10.h, z1.h[6]\n"
+ "fmla z26.h, z10.h, z2.h[6]\n"
+ "fmla z30.h, z10.h, z3.h[6]\n"
+ "fmla z19.h, z11.h, z0.h[6]\n"
+ "fmla z23.h, z11.h, z1.h[6]\n"
+ "fmla z27.h, z11.h, z2.h[6]\n"
+ "fmla z31.h, z11.h, z3.h[6]\n"
+ "fmla z16.h, z12.h, z0.h[7]\n"
+ "fmla z20.h, z12.h, z1.h[7]\n"
+ "fmla z24.h, z12.h, z2.h[7]\n"
+ "fmla z28.h, z12.h, z3.h[7]\n"
+ "fmla z17.h, z13.h, z0.h[7]\n"
+ "fmla z21.h, z13.h, z1.h[7]\n"
+ "fmla z25.h, z13.h, z2.h[7]\n"
+ "fmla z29.h, z13.h, z3.h[7]\n"
+ "fmla z18.h, z14.h, z0.h[7]\n"
+ "fmla z22.h, z14.h, z1.h[7]\n"
+ "fmla z26.h, z14.h, z2.h[7]\n"
+ "fmla z30.h, z14.h, z3.h[7]\n"
+ "fmla z19.h, z15.h, z0.h[7]\n"
+ "fmla z23.h, z15.h, z1.h[7]\n"
+ "fmla z27.h, z15.h, z2.h[7]\n"
+ "fmla z31.h, z15.h, z3.h[7]\n"
+ "cbz %[blocks], 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[0]\n"
+ "fmla z20.h, z8.h, z5.h[0]\n"
+ "fmla z24.h, z8.h, z6.h[0]\n"
+ "fmla z28.h, z8.h, z7.h[0]\n"
+ "fmla z17.h, z9.h, z4.h[0]\n"
+ "fmla z21.h, z9.h, z5.h[0]\n"
+ "fmla z25.h, z9.h, z6.h[0]\n"
+ "fmla z29.h, z9.h, z7.h[0]\n"
+ "fmla z18.h, z10.h, z4.h[0]\n"
+ "fmla z22.h, z10.h, z5.h[0]\n"
+ "fmla z26.h, z10.h, z6.h[0]\n"
+ "fmla z30.h, z10.h, z7.h[0]\n"
+ "fmla z19.h, z11.h, z4.h[0]\n"
+ "fmla z23.h, z11.h, z5.h[0]\n"
+ "fmla z27.h, z11.h, z6.h[0]\n"
+ "fmla z31.h, z11.h, z7.h[0]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[1]\n"
+ "fmla z20.h, z12.h, z5.h[1]\n"
+ "fmla z24.h, z12.h, z6.h[1]\n"
+ "fmla z28.h, z12.h, z7.h[1]\n"
+ "fmla z17.h, z13.h, z4.h[1]\n"
+ "fmla z21.h, z13.h, z5.h[1]\n"
+ "fmla z25.h, z13.h, z6.h[1]\n"
+ "fmla z29.h, z13.h, z7.h[1]\n"
+ "fmla z18.h, z14.h, z4.h[1]\n"
+ "fmla z22.h, z14.h, z5.h[1]\n"
+ "fmla z26.h, z14.h, z6.h[1]\n"
+ "fmla z30.h, z14.h, z7.h[1]\n"
+ "fmla z19.h, z15.h, z4.h[1]\n"
+ "fmla z23.h, z15.h, z5.h[1]\n"
+ "fmla z27.h, z15.h, z6.h[1]\n"
+ "fmla z31.h, z15.h, z7.h[1]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[2]\n"
+ "fmla z20.h, z8.h, z5.h[2]\n"
+ "fmla z24.h, z8.h, z6.h[2]\n"
+ "fmla z28.h, z8.h, z7.h[2]\n"
+ "fmla z17.h, z9.h, z4.h[2]\n"
+ "fmla z21.h, z9.h, z5.h[2]\n"
+ "fmla z25.h, z9.h, z6.h[2]\n"
+ "fmla z29.h, z9.h, z7.h[2]\n"
+ "fmla z18.h, z10.h, z4.h[2]\n"
+ "fmla z22.h, z10.h, z5.h[2]\n"
+ "fmla z26.h, z10.h, z6.h[2]\n"
+ "fmla z30.h, z10.h, z7.h[2]\n"
+ "fmla z19.h, z11.h, z4.h[2]\n"
+ "fmla z23.h, z11.h, z5.h[2]\n"
+ "fmla z27.h, z11.h, z6.h[2]\n"
+ "fmla z31.h, z11.h, z7.h[2]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[3]\n"
+ "fmla z20.h, z12.h, z5.h[3]\n"
+ "fmla z24.h, z12.h, z6.h[3]\n"
+ "fmla z28.h, z12.h, z7.h[3]\n"
+ "fmla z17.h, z13.h, z4.h[3]\n"
+ "fmla z21.h, z13.h, z5.h[3]\n"
+ "fmla z25.h, z13.h, z6.h[3]\n"
+ "fmla z29.h, z13.h, z7.h[3]\n"
+ "fmla z18.h, z14.h, z4.h[3]\n"
+ "fmla z22.h, z14.h, z5.h[3]\n"
+ "fmla z26.h, z14.h, z6.h[3]\n"
+ "fmla z30.h, z14.h, z7.h[3]\n"
+ "fmla z19.h, z15.h, z4.h[3]\n"
+ "fmla z23.h, z15.h, z5.h[3]\n"
+ "fmla z27.h, z15.h, z6.h[3]\n"
+ "fmla z31.h, z15.h, z7.h[3]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[4]\n"
+ "fmla z20.h, z8.h, z5.h[4]\n"
+ "fmla z24.h, z8.h, z6.h[4]\n"
+ "fmla z28.h, z8.h, z7.h[4]\n"
+ "fmla z17.h, z9.h, z4.h[4]\n"
+ "fmla z21.h, z9.h, z5.h[4]\n"
+ "fmla z25.h, z9.h, z6.h[4]\n"
+ "fmla z29.h, z9.h, z7.h[4]\n"
+ "fmla z18.h, z10.h, z4.h[4]\n"
+ "fmla z22.h, z10.h, z5.h[4]\n"
+ "fmla z26.h, z10.h, z6.h[4]\n"
+ "fmla z30.h, z10.h, z7.h[4]\n"
+ "fmla z19.h, z11.h, z4.h[4]\n"
+ "fmla z23.h, z11.h, z5.h[4]\n"
+ "fmla z27.h, z11.h, z6.h[4]\n"
+ "fmla z31.h, z11.h, z7.h[4]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
+ "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z12.h, z4.h[5]\n"
+ "fmla z20.h, z12.h, z5.h[5]\n"
+ "fmla z24.h, z12.h, z6.h[5]\n"
+ "fmla z28.h, z12.h, z7.h[5]\n"
+ "fmla z17.h, z13.h, z4.h[5]\n"
+ "fmla z21.h, z13.h, z5.h[5]\n"
+ "fmla z25.h, z13.h, z6.h[5]\n"
+ "fmla z29.h, z13.h, z7.h[5]\n"
+ "fmla z18.h, z14.h, z4.h[5]\n"
+ "fmla z22.h, z14.h, z5.h[5]\n"
+ "fmla z26.h, z14.h, z6.h[5]\n"
+ "fmla z30.h, z14.h, z7.h[5]\n"
+ "fmla z19.h, z15.h, z4.h[5]\n"
+ "fmla z23.h, z15.h, z5.h[5]\n"
+ "fmla z27.h, z15.h, z6.h[5]\n"
+ "fmla z31.h, z15.h, z7.h[5]\n"
+ "b.eq 5f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+ "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+ "fmla z16.h, z8.h, z4.h[6]\n"
+ "fmla z20.h, z8.h, z5.h[6]\n"
+ "fmla z24.h, z8.h, z6.h[6]\n"
+ "fmla z28.h, z8.h, z7.h[6]\n"
+ "fmla z17.h, z9.h, z4.h[6]\n"
+ "fmla z21.h, z9.h, z5.h[6]\n"
+ "fmla z25.h, z9.h, z6.h[6]\n"
+ "fmla z29.h, z9.h, z7.h[6]\n"
+ "fmla z18.h, z10.h, z4.h[6]\n"
+ "fmla z22.h, z10.h, z5.h[6]\n"
+ "fmla z26.h, z10.h, z6.h[6]\n"
+ "fmla z30.h, z10.h, z7.h[6]\n"
+ "fmla z19.h, z11.h, z4.h[6]\n"
+ "fmla z23.h, z11.h, z5.h[6]\n"
+ "fmla z27.h, z11.h, z6.h[6]\n"
+ "fmla z31.h, z11.h, z7.h[6]\n"
+ "5:\n"
+ "st1h z16.h, p0, [%[c_ptr0]]\n"
+ "st1h z17.h, p1, [%[c_ptr0], #1, MUL VL]\n"
+ "st1h z18.h, p2, [%[c_ptr0], #2, MUL VL]\n"
+ "st1h z19.h, p3, [%[c_ptr0], #3, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #4\n"
+ "st1h z20.h, p0, [c_ptr1]\n"
+ "st1h z21.h, p1, [c_ptr1, #1, MUL VL]\n"
+ "st1h z22.h, p2, [c_ptr1, #2, MUL VL]\n"
+ "st1h z23.h, p3, [c_ptr1, #3, MUL VL]\n"
+ "st1h z24.h, p0, [c_ptr2]\n"
+ "st1h z25.h, p1, [c_ptr2, #1, MUL VL]\n"
+ "st1h z26.h, p2, [c_ptr2, #2, MUL VL]\n"
+ "st1h z27.h, p3, [c_ptr2, #3, MUL VL]\n"
+ "st1h z28.h, p0, [c_ptr3]\n"
+ "st1h z29.h, p1, [c_ptr3, #1, MUL VL]\n"
+ "st1h z30.h, p2, [c_ptr3, #2, MUL VL]\n"
+ "st1h z31.h, p3, [c_ptr3, #3, MUL VL]\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers), [ldb] "r" (ldbb)
+ : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
+ );
+ break;
+ }
+ }
+ }
+}
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4/generic.cpp
index 9c02d95044..abee1bbe1f 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -86,63 +86,73 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"mov z19.s, #0\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"add %[a_ptr0], %[a_ptr0], #0x10\n"
- "b 2f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
"1:\n"
"ld1rw z15.s, p7/z, [%[betaptr]]\n"
"ld1w z16.s, p0/z, [%[c_ptr0]]\n"
"ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
"ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
"ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
"mul z16.s, p7/m, z16.s, z15.s\n"
- "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
"mul z17.s, p7/m, z17.s, z15.s\n"
- "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
"mul z18.s, p7/m, z18.s, z15.s\n"
- "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"mul z19.s, p7/m, z19.s, z15.s\n"
- "2:\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z11.b, z8.b, z9.b\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
- "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "cbz %[loops], 3f\n"
- "4:\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip2 z12.b, z10.b, z8.b\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
"zip1 z10.b, z10.b, z8.b\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip1 z8.b, z9.b, z10.b\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"zip2 z15.b, z12.b, z13.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z13.b, z12.b, z13.b\n"
- "sdot z16.s, z8.b, z0.b[0]\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
- "sdot z17.s, z9.b, z0.b[0]\n"
- "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
"sdot z18.s, z10.b, z0.b[0]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"sdot z19.s, z11.b, z0.b[0]\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip2 z8.b, z14.b, z12.b\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip1 z14.b, z14.b, z12.b\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip1 z12.b, z13.b, z14.b\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
"zip2 z13.b, z13.b, z14.b\n"
- "subs %[loops], %[loops], #0x1\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
@@ -150,137 +160,137 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"sdot z17.s, z13.b, z0.b[1]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
- "sdot z18.s, z14.b, z0.b[1]\n"
- "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"zip2 z11.b, z8.b, z9.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"sdot z19.s, z15.b, z0.b[1]\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z12.b, z10.b, z8.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip1 z10.b, z10.b, z8.b\n"
"zip1 z8.b, z9.b, z10.b\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"sdot z16.s, z8.b, z0.b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"sdot z17.s, z9.b, z0.b[2]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
- "sdot z18.s, z10.b, z0.b[2]\n"
- "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"zip2 z15.b, z12.b, z13.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z13.b, z12.b, z13.b\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"sdot z19.s, z11.b, z0.b[2]\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip1 z14.b, z14.b, z12.b\n"
"zip1 z12.b, z13.b, z14.b\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"sdot z16.s, z12.b, z0.b[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"sdot z17.s, z13.b, z0.b[3]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
- "sdot z18.s, z14.b, z0.b[3]\n"
- "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"zip2 z11.b, z8.b, z9.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z18.s, z14.b, z0.b[3]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"sdot z19.s, z15.b, z0.b[3]\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z12.b, z10.b, z8.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip1 z10.b, z10.b, z8.b\n"
"zip1 z8.b, z9.b, z10.b\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"sdot z16.s, z8.b, z4.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"sdot z17.s, z9.b, z4.b[0]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
- "sdot z18.s, z10.b, z4.b[0]\n"
- "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"zip2 z15.b, z12.b, z13.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z13.b, z12.b, z13.b\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z18.s, z10.b, z4.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"sdot z19.s, z11.b, z4.b[0]\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip1 z14.b, z14.b, z12.b\n"
"zip1 z12.b, z13.b, z14.b\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"sdot z16.s, z12.b, z4.b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"sdot z17.s, z13.b, z4.b[1]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
- "sdot z18.s, z14.b, z4.b[1]\n"
- "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"zip2 z11.b, z8.b, z9.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z18.s, z14.b, z4.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"sdot z19.s, z15.b, z4.b[1]\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z12.b, z10.b, z8.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip1 z10.b, z10.b, z8.b\n"
"zip1 z8.b, z9.b, z10.b\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"sdot z16.s, z8.b, z4.b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"sdot z17.s, z9.b, z4.b[2]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
- "sdot z18.s, z10.b, z4.b[2]\n"
- "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"zip2 z15.b, z12.b, z13.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z13.b, z12.b, z13.b\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z18.s, z10.b, z4.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"sdot z19.s, z11.b, z4.b[2]\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip1 z14.b, z14.b, z12.b\n"
"zip1 z12.b, z13.b, z14.b\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"sdot z16.s, z12.b, z4.b[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"sdot z17.s, z13.b, z4.b[3]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
- "sdot z18.s, z14.b, z4.b[3]\n"
- "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z18.s, z14.b, z4.b[3]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"sdot z19.s, z15.b, z4.b[3]\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "b.ne 4b\n"
- "3:\n"
"zip2 z12.b, z10.b, z8.b\n"
+ "b.ne 3b\n"
+ "2:\n"
"zip1 z10.b, z10.b, z8.b\n"
"zip1 z8.b, z9.b, z10.b\n"
"zip2 z9.b, z9.b, z10.b\n"
@@ -289,118 +299,118 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
- "cbz %[regs], 5f\n"
+ "cbz %[regs], 4f\n"
"sdot z16.s, z8.b, z0.b[0]\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
"sdot z17.s, z9.b, z0.b[0]\n"
"ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
"sdot z18.s, z10.b, z0.b[0]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
- "sdot z19.s, z11.b, z0.b[0]\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z8.b, z14.b, z12.b\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z14.b, z14.b, z12.b\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip1 z12.b, z13.b, z14.b\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"zip2 z13.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z14.b, z15.b, z8.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"sdot z16.s, z12.b, z0.b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"sdot z17.s, z13.b, z0.b[1]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
- "sdot z18.s, z14.b, z0.b[1]\n"
- "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"zip2 z11.b, z8.b, z9.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"sdot z19.s, z15.b, z0.b[1]\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z12.b, z10.b, z8.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip1 z10.b, z10.b, z8.b\n"
"zip1 z8.b, z9.b, z10.b\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"sdot z16.s, z8.b, z0.b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"sdot z17.s, z9.b, z0.b[2]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
- "sdot z18.s, z10.b, z0.b[2]\n"
- "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"zip2 z15.b, z12.b, z13.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z13.b, z12.b, z13.b\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"sdot z19.s, z11.b, z0.b[2]\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip1 z14.b, z14.b, z12.b\n"
"zip1 z12.b, z13.b, z14.b\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"sdot z16.s, z12.b, z0.b[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"sdot z17.s, z13.b, z0.b[3]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
- "sdot z18.s, z14.b, z0.b[3]\n"
- "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"zip2 z11.b, z8.b, z9.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z18.s, z14.b, z0.b[3]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"sdot z19.s, z15.b, z0.b[3]\n"
"ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z12.b, z10.b, z8.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip1 z10.b, z10.b, z8.b\n"
"zip1 z8.b, z9.b, z10.b\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"sdot z16.s, z8.b, z4.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"sdot z17.s, z9.b, z4.b[0]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
- "sdot z18.s, z10.b, z4.b[0]\n"
- "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"zip2 z15.b, z12.b, z13.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z13.b, z12.b, z13.b\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z18.s, z10.b, z4.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"sdot z19.s, z11.b, z4.b[0]\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip1 z14.b, z14.b, z12.b\n"
"zip1 z12.b, z13.b, z14.b\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"sdot z16.s, z12.b, z4.b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"sdot z17.s, z13.b, z4.b[1]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
- "sdot z18.s, z14.b, z4.b[1]\n"
- "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z18.s, z14.b, z4.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"sdot z19.s, z15.b, z4.b[1]\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip2 z12.b, z10.b, z8.b\n"
@@ -427,15 +437,15 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z17.s, z13.b, z4.b[3]\n"
"sdot z18.s, z14.b, z4.b[3]\n"
"sdot z19.s, z15.b, z4.b[3]\n"
- "cbz %[blocks], 6f\n"
+ "cbz %[blocks], 5f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "subs %[blocks], %[blocks], #0x1\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
@@ -449,15 +459,15 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z17.s, z9.b, z0.b[0]\n"
"sdot z18.s, z10.b, z0.b[0]\n"
"sdot z19.s, z11.b, z0.b[0]\n"
- "b.eq 7f\n"
+ "b.eq 6f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "subs %[blocks], %[blocks], #0x1\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
@@ -471,13 +481,13 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z17.s, z13.b, z0.b[1]\n"
"sdot z18.s, z14.b, z0.b[1]\n"
"sdot z19.s, z15.b, z0.b[1]\n"
- "b.eq 8f\n"
+ "b.eq 7f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
@@ -492,31 +502,31 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z17.s, z9.b, z0.b[2]\n"
"sdot z18.s, z10.b, z0.b[2]\n"
"sdot z19.s, z11.b, z0.b[2]\n"
- "cbz %[odds], 9f\n"
+ "cbz %[odds], 8f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 10f\n"
+ "b.eq 9f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 11f\n"
+ "b.eq 10f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 12f\n"
- "11:\n"
+ "b 11f\n"
+ "10:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 12f\n"
- "10:\n"
+ "b 11f\n"
+ "9:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z14.b, #0\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "12:\n"
+ "11:\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"mov z12.b, #0\n"
@@ -530,33 +540,33 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z17.s, z13.b, z0.b[3]\n"
"sdot z18.s, z14.b, z0.b[3]\n"
"sdot z19.s, z15.b, z0.b[3]\n"
- "b 9f\n"
- "8:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "7:\n"
+ "cbz %[odds], 8f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 13f\n"
+ "b.eq 12f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 14f\n"
+ "b.eq 13f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 15f\n"
- "14:\n"
+ "b 14f\n"
+ "13:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 15f\n"
- "13:\n"
+ "b 14f\n"
+ "12:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z10.b, #0\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "15:\n"
+ "14:\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"mov z8.b, #0\n"
@@ -570,33 +580,33 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z17.s, z9.b, z0.b[2]\n"
"sdot z18.s, z10.b, z0.b[2]\n"
"sdot z19.s, z11.b, z0.b[2]\n"
- "b 9f\n"
- "7:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "6:\n"
+ "cbz %[odds], 8f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 16f\n"
+ "b.eq 15f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 17f\n"
+ "b.eq 16f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 18f\n"
- "17:\n"
+ "b 17f\n"
+ "16:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 18f\n"
- "16:\n"
+ "b 17f\n"
+ "15:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z14.b, #0\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "18:\n"
+ "17:\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"mov z12.b, #0\n"
@@ -610,33 +620,33 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z17.s, z13.b, z0.b[1]\n"
"sdot z18.s, z14.b, z0.b[1]\n"
"sdot z19.s, z15.b, z0.b[1]\n"
- "b 9f\n"
- "6:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "5:\n"
+ "cbz %[odds], 8f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 19f\n"
+ "b.eq 18f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 20f\n"
+ "b.eq 19f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 21f\n"
- "20:\n"
+ "b 20f\n"
+ "19:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 21f\n"
- "19:\n"
+ "b 20f\n"
+ "18:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z10.b, #0\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "21:\n"
+ "20:\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"mov z8.b, #0\n"
@@ -650,38 +660,38 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z17.s, z9.b, z0.b[0]\n"
"sdot z18.s, z10.b, z0.b[0]\n"
"sdot z19.s, z11.b, z0.b[0]\n"
- "b 9f\n"
- "5:\n"
+ "b 8f\n"
+ "4:\n"
"sdot z16.s, z8.b, z0.b[0]\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
"sdot z17.s, z9.b, z0.b[0]\n"
"ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
"sdot z18.s, z10.b, z0.b[0]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
- "sdot z19.s, z11.b, z0.b[0]\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z8.b, z14.b, z12.b\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z14.b, z14.b, z12.b\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z11.b, z0.b[0]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip1 z12.b, z13.b, z14.b\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"zip2 z13.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z14.b, z15.b, z8.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"sdot z16.s, z12.b, z0.b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"sdot z17.s, z13.b, z0.b[1]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
- "sdot z18.s, z14.b, z0.b[1]\n"
- "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"sdot z19.s, z15.b, z0.b[1]\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip2 z12.b, z10.b, z8.b\n"
@@ -708,15 +718,15 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z17.s, z13.b, z0.b[3]\n"
"sdot z18.s, z14.b, z0.b[3]\n"
"sdot z19.s, z15.b, z0.b[3]\n"
- "cbz %[blocks], 22f\n"
+ "cbz %[blocks], 21f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "subs %[blocks], %[blocks], #0x1\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
@@ -730,15 +740,15 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z17.s, z9.b, z4.b[0]\n"
"sdot z18.s, z10.b, z4.b[0]\n"
"sdot z19.s, z11.b, z4.b[0]\n"
- "b.eq 23f\n"
+ "b.eq 22f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "subs %[blocks], %[blocks], #0x1\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
@@ -752,13 +762,13 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z17.s, z13.b, z4.b[1]\n"
"sdot z18.s, z14.b, z4.b[1]\n"
"sdot z19.s, z15.b, z4.b[1]\n"
- "b.eq 24f\n"
+ "b.eq 23f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
@@ -773,31 +783,31 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z17.s, z9.b, z4.b[2]\n"
"sdot z18.s, z10.b, z4.b[2]\n"
"sdot z19.s, z11.b, z4.b[2]\n"
- "cbz %[odds], 9f\n"
+ "cbz %[odds], 8f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 25f\n"
+ "b.eq 24f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 26f\n"
+ "b.eq 25f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 27f\n"
- "26:\n"
+ "b 26f\n"
+ "25:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 27f\n"
- "25:\n"
+ "b 26f\n"
+ "24:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z14.b, #0\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "27:\n"
+ "26:\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"mov z12.b, #0\n"
@@ -811,33 +821,33 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z17.s, z13.b, z4.b[3]\n"
"sdot z18.s, z14.b, z4.b[3]\n"
"sdot z19.s, z15.b, z4.b[3]\n"
- "b 9f\n"
- "24:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "23:\n"
+ "cbz %[odds], 8f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 28f\n"
+ "b.eq 27f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 29f\n"
+ "b.eq 28f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 30f\n"
- "29:\n"
+ "b 29f\n"
+ "28:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 30f\n"
- "28:\n"
+ "b 29f\n"
+ "27:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z10.b, #0\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "30:\n"
+ "29:\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"mov z8.b, #0\n"
@@ -851,33 +861,33 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z17.s, z9.b, z4.b[2]\n"
"sdot z18.s, z10.b, z4.b[2]\n"
"sdot z19.s, z11.b, z4.b[2]\n"
- "b 9f\n"
- "23:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "22:\n"
+ "cbz %[odds], 8f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 31f\n"
+ "b.eq 30f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 32f\n"
+ "b.eq 31f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 33f\n"
- "32:\n"
+ "b 32f\n"
+ "31:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 33f\n"
- "31:\n"
+ "b 32f\n"
+ "30:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z14.b, #0\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "33:\n"
+ "32:\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"mov z12.b, #0\n"
@@ -891,33 +901,33 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z17.s, z13.b, z4.b[1]\n"
"sdot z18.s, z14.b, z4.b[1]\n"
"sdot z19.s, z15.b, z4.b[1]\n"
- "b 9f\n"
- "22:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "21:\n"
+ "cbz %[odds], 8f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 34f\n"
+ "b.eq 33f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 35f\n"
+ "b.eq 34f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 36f\n"
- "35:\n"
+ "b 35f\n"
+ "34:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 36f\n"
- "34:\n"
+ "b 35f\n"
+ "33:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z10.b, #0\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "36:\n"
+ "35:\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"mov z8.b, #0\n"
@@ -931,14 +941,14 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z17.s, z9.b, z4.b[0]\n"
"sdot z18.s, z10.b, z4.b[0]\n"
"sdot z19.s, z11.b, z4.b[0]\n"
- "9:\n"
+ "8:\n"
"st1w z16.s, p0, [%[c_ptr0]]\n"
"st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
"st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
"st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
"addvl %[c_ptr0], %[c_ptr0], #4\n"
: [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [b_ptr1] "+r" (b_ptr1), [b_ptr2] "+r" (b_ptr2), [b_ptr3] "+r" (b_ptr3), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds)
- : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers), [ldb] "r" (ldbb)
: "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
);
break;
@@ -971,103 +981,108 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"mov z21.s, #0\n"
"add %[a_ptr0], %[a_ptr0], #0x10\n"
- "mov z22.s, #0\n"
- "add a_ptr1, a_ptr1, #0x10\n"
"zip2 z11.b, z8.b, z9.b\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "mov z22.s, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z23.s, #0\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip2 z12.b, z10.b, z8.b\n"
- "zip1 z10.b, z10.b, z8.b\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 2f\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
"1:\n"
"ld1rw z15.s, p7/z, [%[betaptr]]\n"
"ld1w z16.s, p0/z, [%[c_ptr0]]\n"
"ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
"ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
"ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
- "ld1w z20.s, p0/z, [c_ptr1]\n"
"mul z16.s, p7/m, z16.s, z15.s\n"
- "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+ "ld1w z20.s, p0/z, [c_ptr1]\n"
"mul z17.s, p7/m, z17.s, z15.s\n"
- "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+ "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
"mul z18.s, p7/m, z18.s, z15.s\n"
- "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+ "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
"mul z19.s, p7/m, z19.s, z15.s\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
"mul z20.s, p7/m, z20.s, z15.s\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
"mul z21.s, p7/m, z21.s, z15.s\n"
- "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
"mul z22.s, p7/m, z22.s, z15.s\n"
- "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
"mul z23.s, p7/m, z23.s, z15.s\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"add %[a_ptr0], %[a_ptr0], #0x10\n"
"add a_ptr1, a_ptr1, #0x10\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"zip2 z11.b, z8.b, z9.b\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip2 z12.b, z10.b, z8.b\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"zip1 z10.b, z10.b, z8.b\n"
- "2:\n"
- "cbz %[loops], 3f\n"
- "4:\n"
"zip1 z8.b, z9.b, z10.b\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
"zip2 z15.b, z12.b, z13.b\n"
+ "ld1rqb z5.b, p7/z, [a_ptr1]\n"
"zip1 z13.b, z12.b, z13.b\n"
- "sdot z16.s, z8.b, z0.b[0]\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
"sdot z20.s, z8.b, z1.b[0]\n"
- "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
"sdot z17.s, z9.b, z0.b[0]\n"
- "ld1rqb z5.b, p7/z, [a_ptr1]\n"
- "sdot z21.s, z9.b, z1.b[0]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"zip2 z8.b, z14.b, z12.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z14.b, z14.b, z12.b\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z21.s, z9.b, z1.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"sdot z18.s, z10.b, z0.b[0]\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "sdot z22.s, z10.b, z1.b[0]\n"
- "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"zip1 z12.b, z13.b, z14.b\n"
- "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "zip2 z13.b, z13.b, z14.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
"zip1 z14.b, z15.b, z8.b\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z22.s, z10.b, z1.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"sdot z19.s, z11.b, z0.b[0]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"sdot z23.s, z11.b, z1.b[0]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z11.b, z8.b, z9.b\n"
- "add a_ptr1, a_ptr1, #0x20\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
"sdot z16.s, z12.b, z0.b[1]\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"sdot z20.s, z12.b, z1.b[1]\n"
- "subs %[loops], %[loops], #0x1\n"
"zip2 z12.b, z10.b, z8.b\n"
"zip1 z10.b, z10.b, z8.b\n"
"sdot z17.s, z13.b, z0.b[1]\n"
@@ -1092,148 +1107,148 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z16.s, z8.b, z0.b[2]\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"sdot z20.s, z8.b, z1.b[2]\n"
- "sdot z17.s, z9.b, z0.b[2]\n"
"zip2 z8.b, z14.b, z12.b\n"
"zip1 z14.b, z14.b, z12.b\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
"sdot z21.s, z9.b, z1.b[2]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"sdot z18.s, z10.b, z0.b[2]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "sdot z22.s, z10.b, z1.b[2]\n"
- "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"zip1 z12.b, z13.b, z14.b\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z22.s, z10.b, z1.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"sdot z19.s, z11.b, z0.b[2]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"sdot z23.s, z11.b, z1.b[2]\n"
- "sdot z16.s, z12.b, z0.b[3]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
- "sdot z20.s, z12.b, z1.b[3]\n"
+ "sdot z16.s, z12.b, z0.b[3]\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z12.b, z1.b[3]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
"sdot z17.s, z13.b, z0.b[3]\n"
"sdot z21.s, z13.b, z1.b[3]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
- "zip2 z12.b, z10.b, z8.b\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "zip1 z10.b, z10.b, z8.b\n"
"sdot z18.s, z14.b, z0.b[3]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
"sdot z22.s, z14.b, z1.b[3]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"sdot z19.s, z15.b, z0.b[3]\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
"sdot z23.s, z15.b, z1.b[3]\n"
"ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n"
- "zip1 z8.b, z9.b, z10.b\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "zip2 z9.b, z9.b, z10.b\n"
- "zip1 z10.b, z11.b, z12.b\n"
- "zip2 z11.b, z11.b, z12.b\n"
- "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "sdot z16.s, z8.b, z4.b[0]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
- "sdot z20.s, z8.b, z5.b[0]\n"
- "sdot z17.s, z9.b, z4.b[0]\n"
"zip2 z15.b, z12.b, z13.b\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"zip1 z13.b, z12.b, z13.b\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z8.b, z4.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "sdot z20.s, z8.b, z5.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "sdot z17.s, z9.b, z4.b[0]\n"
"sdot z21.s, z9.b, z5.b[0]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"sdot z18.s, z10.b, z4.b[0]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "sdot z22.s, z10.b, z5.b[0]\n"
- "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "zip2 z8.b, z14.b, z12.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "zip1 z14.b, z14.b, z12.b\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "sdot z19.s, z11.b, z4.b[0]\n"
- "sdot z23.s, z11.b, z5.b[0]\n"
"zip1 z12.b, z13.b, z14.b\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z22.s, z10.b, z5.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z11.b, z4.b[0]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
- "sdot z16.s, z12.b, z4.b[1]\n"
- "sdot z20.s, z12.b, z5.b[1]\n"
- "sdot z17.s, z13.b, z4.b[1]\n"
+ "sdot z23.s, z11.b, z5.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z12.b, z4.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z12.b, z5.b[1]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "sdot z17.s, z13.b, z4.b[1]\n"
"sdot z21.s, z13.b, z5.b[1]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"sdot z18.s, z14.b, z4.b[1]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "sdot z22.s, z14.b, z5.b[1]\n"
- "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "zip2 z12.b, z10.b, z8.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "zip1 z10.b, z10.b, z8.b\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "sdot z19.s, z15.b, z4.b[1]\n"
- "sdot z23.s, z15.b, z5.b[1]\n"
"zip1 z8.b, z9.b, z10.b\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z22.s, z14.b, z5.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z15.b, z4.b[1]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
- "sdot z16.s, z8.b, z4.b[2]\n"
- "sdot z20.s, z8.b, z5.b[2]\n"
- "sdot z17.s, z9.b, z4.b[2]\n"
+ "sdot z23.s, z15.b, z5.b[1]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z8.b, z4.b[2]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z8.b, z5.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "sdot z17.s, z9.b, z4.b[2]\n"
"sdot z21.s, z9.b, z5.b[2]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"sdot z18.s, z10.b, z4.b[2]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "sdot z22.s, z10.b, z5.b[2]\n"
- "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "zip2 z8.b, z14.b, z12.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "zip1 z14.b, z14.b, z12.b\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "sdot z19.s, z11.b, z4.b[2]\n"
- "sdot z23.s, z11.b, z5.b[2]\n"
"zip1 z12.b, z13.b, z14.b\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z22.s, z10.b, z5.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z11.b, z4.b[2]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
- "sdot z16.s, z12.b, z4.b[3]\n"
- "sdot z20.s, z12.b, z5.b[3]\n"
- "sdot z17.s, z13.b, z4.b[3]\n"
+ "sdot z23.s, z11.b, z5.b[2]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
- "sdot z21.s, z13.b, z5.b[3]\n"
- "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
- "sdot z18.s, z14.b, z4.b[3]\n"
+ "sdot z16.s, z12.b, z4.b[3]\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "sdot z22.s, z14.b, z5.b[3]\n"
- "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z20.s, z12.b, z5.b[3]\n"
"zip2 z12.b, z10.b, z8.b\n"
"zip1 z10.b, z10.b, z8.b\n"
- "sdot z19.s, z15.b, z4.b[3]\n"
- "sdot z23.s, z15.b, z5.b[3]\n"
- "b.ne 4b\n"
- "3:\n"
+ "sdot z17.s, z13.b, z4.b[3]\n"
+ "sdot z21.s, z13.b, z5.b[3]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z14.b, z4.b[3]\n"
"zip1 z8.b, z9.b, z10.b\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z22.s, z14.b, z5.b[3]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z15.b, z4.b[3]\n"
+ "sdot z23.s, z15.b, z5.b[3]\n"
+ "b.ne 3b\n"
+ "2:\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
- "cbz %[regs], 5f\n"
+ "cbz %[regs], 4f\n"
"sdot z16.s, z8.b, z0.b[0]\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
"sdot z20.s, z8.b, z1.b[0]\n"
@@ -1245,13 +1260,13 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"zip2 z8.b, z14.b, z12.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z14.b, z14.b, z12.b\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"sdot z18.s, z10.b, z0.b[0]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"sdot z22.s, z10.b, z1.b[0]\n"
- "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
- "zip1 z12.b, z13.b, z14.b\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip2 z13.b, z13.b, z14.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z14.b, z15.b, z8.b\n"
@@ -1261,142 +1276,142 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z19.s, z11.b, z0.b[0]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"sdot z23.s, z11.b, z1.b[0]\n"
- "sdot z16.s, z12.b, z0.b[1]\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
- "sdot z20.s, z12.b, z1.b[1]\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z12.b, z1.b[1]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
"sdot z17.s, z13.b, z0.b[1]\n"
"sdot z21.s, z13.b, z1.b[1]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
- "zip2 z12.b, z10.b, z8.b\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "zip1 z10.b, z10.b, z8.b\n"
"sdot z18.s, z14.b, z0.b[1]\n"
- "sdot z22.s, z14.b, z1.b[1]\n"
- "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "sdot z19.s, z15.b, z0.b[1]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z8.b, z9.b, z10.b\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "sdot z23.s, z15.b, z1.b[1]\n"
+ "sdot z22.s, z14.b, z1.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
- "sdot z16.s, z8.b, z0.b[2]\n"
- "sdot z20.s, z8.b, z1.b[2]\n"
+ "sdot z23.s, z15.b, z1.b[1]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
- "sdot z17.s, z9.b, z0.b[2]\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z8.b, z1.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
"sdot z21.s, z9.b, z1.b[2]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"sdot z18.s, z10.b, z0.b[2]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "zip2 z8.b, z14.b, z12.b\n"
- "zip1 z14.b, z14.b, z12.b\n"
- "sdot z22.s, z10.b, z1.b[2]\n"
- "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "sdot z19.s, z11.b, z0.b[2]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "sdot z23.s, z11.b, z1.b[2]\n"
"zip1 z12.b, z13.b, z14.b\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z22.s, z10.b, z1.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
- "sdot z16.s, z12.b, z0.b[3]\n"
- "sdot z20.s, z12.b, z1.b[3]\n"
- "sdot z17.s, z13.b, z0.b[3]\n"
+ "sdot z23.s, z11.b, z1.b[2]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z12.b, z0.b[3]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z12.b, z1.b[3]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "sdot z17.s, z13.b, z0.b[3]\n"
"sdot z21.s, z13.b, z1.b[3]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"sdot z18.s, z14.b, z0.b[3]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "sdot z22.s, z14.b, z1.b[3]\n"
- "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "zip2 z12.b, z10.b, z8.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "zip1 z10.b, z10.b, z8.b\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "sdot z19.s, z15.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
- "sdot z23.s, z15.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
"zip1 z8.b, z9.b, z10.b\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z22.s, z14.b, z1.b[3]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z15.b, z0.b[3]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
+ "sdot z23.s, z15.b, z1.b[3]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
"sdot z16.s, z8.b, z4.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"sdot z20.s, z8.b, z5.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
"sdot z17.s, z9.b, z4.b[0]\n"
- "zip2 z15.b, z12.b, z13.b\n"
- "zip1 z13.b, z12.b, z13.b\n"
- "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
"sdot z21.s, z9.b, z5.b[0]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"sdot z18.s, z10.b, z4.b[0]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "sdot z22.s, z10.b, z5.b[0]\n"
- "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "zip2 z8.b, z14.b, z12.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "zip1 z14.b, z14.b, z12.b\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "sdot z19.s, z11.b, z4.b[0]\n"
- "sdot z23.s, z11.b, z5.b[0]\n"
"zip1 z12.b, z13.b, z14.b\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z22.s, z10.b, z5.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z11.b, z4.b[0]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
- "sdot z16.s, z12.b, z4.b[1]\n"
- "sdot z20.s, z12.b, z5.b[1]\n"
- "sdot z17.s, z13.b, z4.b[1]\n"
+ "sdot z23.s, z11.b, z5.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
- "sdot z21.s, z13.b, z5.b[1]\n"
- "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
- "sdot z18.s, z14.b, z4.b[1]\n"
+ "sdot z16.s, z12.b, z4.b[1]\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "sdot z22.s, z14.b, z5.b[1]\n"
- "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z20.s, z12.b, z5.b[1]\n"
"zip2 z12.b, z10.b, z8.b\n"
"zip1 z10.b, z10.b, z8.b\n"
- "sdot z19.s, z15.b, z4.b[1]\n"
- "sdot z23.s, z15.b, z5.b[1]\n"
+ "sdot z17.s, z13.b, z4.b[1]\n"
+ "sdot z21.s, z13.b, z5.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z14.b, z4.b[1]\n"
"zip1 z8.b, z9.b, z10.b\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "sdot z16.s, z8.b, z4.b[2]\n"
- "sdot z20.s, z8.b, z5.b[2]\n"
- "sdot z17.s, z9.b, z4.b[2]\n"
+ "sdot z22.s, z14.b, z5.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z15.b, z4.b[1]\n"
+ "sdot z23.s, z15.b, z5.b[1]\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z8.b, z4.b[2]\n"
+ "sdot z20.s, z8.b, z5.b[2]\n"
+ "sdot z17.s, z9.b, z4.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
"sdot z21.s, z9.b, z5.b[2]\n"
"sdot z18.s, z10.b, z4.b[2]\n"
"sdot z22.s, z10.b, z5.b[2]\n"
- "sdot z19.s, z11.b, z4.b[2]\n"
- "zip2 z8.b, z14.b, z12.b\n"
- "zip1 z14.b, z14.b, z12.b\n"
- "sdot z23.s, z11.b, z5.b[2]\n"
"zip1 z12.b, z13.b, z14.b\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
+ "sdot z19.s, z11.b, z4.b[2]\n"
+ "sdot z23.s, z11.b, z5.b[2]\n"
"sdot z16.s, z12.b, z4.b[3]\n"
"sdot z20.s, z12.b, z5.b[3]\n"
"sdot z17.s, z13.b, z4.b[3]\n"
@@ -1405,15 +1420,15 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z22.s, z14.b, z5.b[3]\n"
"sdot z19.s, z15.b, z4.b[3]\n"
"sdot z23.s, z15.b, z5.b[3]\n"
- "cbz %[blocks], 6f\n"
+ "cbz %[blocks], 5f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "subs %[blocks], %[blocks], #0x1\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
@@ -1431,15 +1446,15 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z22.s, z10.b, z1.b[0]\n"
"sdot z19.s, z11.b, z0.b[0]\n"
"sdot z23.s, z11.b, z1.b[0]\n"
- "b.eq 7f\n"
+ "b.eq 6f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "subs %[blocks], %[blocks], #0x1\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
@@ -1457,13 +1472,13 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z22.s, z14.b, z1.b[1]\n"
"sdot z19.s, z15.b, z0.b[1]\n"
"sdot z23.s, z15.b, z1.b[1]\n"
- "b.eq 8f\n"
+ "b.eq 7f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
@@ -1482,31 +1497,31 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z22.s, z10.b, z1.b[2]\n"
"sdot z19.s, z11.b, z0.b[2]\n"
"sdot z23.s, z11.b, z1.b[2]\n"
- "cbz %[odds], 9f\n"
+ "cbz %[odds], 8f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 10f\n"
+ "b.eq 9f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 11f\n"
+ "b.eq 10f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 12f\n"
- "11:\n"
+ "b 11f\n"
+ "10:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 12f\n"
- "10:\n"
+ "b 11f\n"
+ "9:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z14.b, #0\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "12:\n"
+ "11:\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"mov z12.b, #0\n"
@@ -1524,33 +1539,33 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z22.s, z14.b, z1.b[3]\n"
"sdot z19.s, z15.b, z0.b[3]\n"
"sdot z23.s, z15.b, z1.b[3]\n"
- "b 9f\n"
- "8:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "7:\n"
+ "cbz %[odds], 8f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 13f\n"
+ "b.eq 12f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 14f\n"
+ "b.eq 13f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 15f\n"
- "14:\n"
+ "b 14f\n"
+ "13:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 15f\n"
- "13:\n"
+ "b 14f\n"
+ "12:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z10.b, #0\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "15:\n"
+ "14:\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"mov z8.b, #0\n"
@@ -1568,33 +1583,33 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z22.s, z10.b, z1.b[2]\n"
"sdot z19.s, z11.b, z0.b[2]\n"
"sdot z23.s, z11.b, z1.b[2]\n"
- "b 9f\n"
- "7:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "6:\n"
+ "cbz %[odds], 8f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 16f\n"
+ "b.eq 15f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 17f\n"
+ "b.eq 16f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 18f\n"
- "17:\n"
+ "b 17f\n"
+ "16:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 18f\n"
- "16:\n"
+ "b 17f\n"
+ "15:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z14.b, #0\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "18:\n"
+ "17:\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"mov z12.b, #0\n"
@@ -1612,33 +1627,33 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z22.s, z14.b, z1.b[1]\n"
"sdot z19.s, z15.b, z0.b[1]\n"
"sdot z23.s, z15.b, z1.b[1]\n"
- "b 9f\n"
- "6:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "5:\n"
+ "cbz %[odds], 8f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 19f\n"
+ "b.eq 18f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 20f\n"
+ "b.eq 19f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 21f\n"
- "20:\n"
+ "b 20f\n"
+ "19:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 21f\n"
- "19:\n"
+ "b 20f\n"
+ "18:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z10.b, #0\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "21:\n"
+ "20:\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"mov z8.b, #0\n"
@@ -1656,8 +1671,8 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z22.s, z10.b, z1.b[0]\n"
"sdot z19.s, z11.b, z0.b[0]\n"
"sdot z23.s, z11.b, z1.b[0]\n"
- "b 9f\n"
- "5:\n"
+ "b 8f\n"
+ "4:\n"
"sdot z16.s, z8.b, z0.b[0]\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
"sdot z20.s, z8.b, z1.b[0]\n"
@@ -1669,13 +1684,13 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"zip2 z8.b, z14.b, z12.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z14.b, z14.b, z12.b\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"sdot z18.s, z10.b, z0.b[0]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"sdot z22.s, z10.b, z1.b[0]\n"
- "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
- "zip1 z12.b, z13.b, z14.b\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip2 z13.b, z13.b, z14.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z14.b, z15.b, z8.b\n"
@@ -1685,44 +1700,44 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z19.s, z11.b, z0.b[0]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"sdot z23.s, z11.b, z1.b[0]\n"
- "sdot z16.s, z12.b, z0.b[1]\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
- "sdot z20.s, z12.b, z1.b[1]\n"
+ "sdot z16.s, z12.b, z0.b[1]\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "sdot z20.s, z12.b, z1.b[1]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
"sdot z17.s, z13.b, z0.b[1]\n"
"sdot z21.s, z13.b, z1.b[1]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
- "zip2 z12.b, z10.b, z8.b\n"
- "zip1 z10.b, z10.b, z8.b\n"
"sdot z18.s, z14.b, z0.b[1]\n"
- "sdot z22.s, z14.b, z1.b[1]\n"
- "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "sdot z19.s, z15.b, z0.b[1]\n"
- "sdot z23.s, z15.b, z1.b[1]\n"
"zip1 z8.b, z9.b, z10.b\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "sdot z16.s, z8.b, z0.b[2]\n"
- "sdot z20.s, z8.b, z1.b[2]\n"
- "sdot z17.s, z9.b, z0.b[2]\n"
+ "sdot z22.s, z14.b, z1.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "sdot z19.s, z15.b, z0.b[1]\n"
+ "sdot z23.s, z15.b, z1.b[1]\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "sdot z16.s, z8.b, z0.b[2]\n"
+ "sdot z20.s, z8.b, z1.b[2]\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
"sdot z21.s, z9.b, z1.b[2]\n"
"sdot z18.s, z10.b, z0.b[2]\n"
"sdot z22.s, z10.b, z1.b[2]\n"
- "sdot z19.s, z11.b, z0.b[2]\n"
- "zip2 z8.b, z14.b, z12.b\n"
- "zip1 z14.b, z14.b, z12.b\n"
- "sdot z23.s, z11.b, z1.b[2]\n"
"zip1 z12.b, z13.b, z14.b\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
+ "sdot z19.s, z11.b, z0.b[2]\n"
+ "sdot z23.s, z11.b, z1.b[2]\n"
"sdot z16.s, z12.b, z0.b[3]\n"
"sdot z20.s, z12.b, z1.b[3]\n"
"sdot z17.s, z13.b, z0.b[3]\n"
@@ -1731,15 +1746,15 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z22.s, z14.b, z1.b[3]\n"
"sdot z19.s, z15.b, z0.b[3]\n"
"sdot z23.s, z15.b, z1.b[3]\n"
- "cbz %[blocks], 22f\n"
+ "cbz %[blocks], 21f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "subs %[blocks], %[blocks], #0x1\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
@@ -1757,15 +1772,15 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z22.s, z10.b, z5.b[0]\n"
"sdot z19.s, z11.b, z4.b[0]\n"
"sdot z23.s, z11.b, z5.b[0]\n"
- "b.eq 23f\n"
+ "b.eq 22f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "subs %[blocks], %[blocks], #0x1\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
@@ -1783,13 +1798,13 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z22.s, z14.b, z5.b[1]\n"
"sdot z19.s, z15.b, z4.b[1]\n"
"sdot z23.s, z15.b, z5.b[1]\n"
- "b.eq 24f\n"
+ "b.eq 23f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
@@ -1808,31 +1823,31 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z22.s, z10.b, z5.b[2]\n"
"sdot z19.s, z11.b, z4.b[2]\n"
"sdot z23.s, z11.b, z5.b[2]\n"
- "cbz %[odds], 9f\n"
+ "cbz %[odds], 8f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 25f\n"
+ "b.eq 24f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 26f\n"
+ "b.eq 25f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 27f\n"
- "26:\n"
+ "b 26f\n"
+ "25:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 27f\n"
- "25:\n"
+ "b 26f\n"
+ "24:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z14.b, #0\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "27:\n"
+ "26:\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"mov z12.b, #0\n"
@@ -1850,33 +1865,33 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z22.s, z14.b, z5.b[3]\n"
"sdot z19.s, z15.b, z4.b[3]\n"
"sdot z23.s, z15.b, z5.b[3]\n"
- "b 9f\n"
- "24:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "23:\n"
+ "cbz %[odds], 8f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 28f\n"
+ "b.eq 27f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 29f\n"
+ "b.eq 28f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 30f\n"
- "29:\n"
+ "b 29f\n"
+ "28:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 30f\n"
- "28:\n"
+ "b 29f\n"
+ "27:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z10.b, #0\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "30:\n"
+ "29:\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"mov z8.b, #0\n"
@@ -1894,33 +1909,33 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z22.s, z10.b, z5.b[2]\n"
"sdot z19.s, z11.b, z4.b[2]\n"
"sdot z23.s, z11.b, z5.b[2]\n"
- "b 9f\n"
- "23:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "22:\n"
+ "cbz %[odds], 8f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 31f\n"
+ "b.eq 30f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 32f\n"
+ "b.eq 31f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 33f\n"
- "32:\n"
+ "b 32f\n"
+ "31:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 33f\n"
- "31:\n"
+ "b 32f\n"
+ "30:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z14.b, #0\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "33:\n"
+ "32:\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"mov z12.b, #0\n"
@@ -1938,33 +1953,33 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z22.s, z14.b, z5.b[1]\n"
"sdot z19.s, z15.b, z4.b[1]\n"
"sdot z23.s, z15.b, z5.b[1]\n"
- "b 9f\n"
- "22:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "21:\n"
+ "cbz %[odds], 8f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 34f\n"
+ "b.eq 33f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 35f\n"
+ "b.eq 34f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 36f\n"
- "35:\n"
+ "b 35f\n"
+ "34:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 36f\n"
- "34:\n"
+ "b 35f\n"
+ "33:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z10.b, #0\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "36:\n"
+ "35:\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"mov z8.b, #0\n"
@@ -1982,7 +1997,7 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z22.s, z10.b, z5.b[0]\n"
"sdot z19.s, z11.b, z4.b[0]\n"
"sdot z23.s, z11.b, z5.b[0]\n"
- "9:\n"
+ "8:\n"
"st1w z16.s, p0, [%[c_ptr0]]\n"
"st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
"st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
@@ -1995,7 +2010,7 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
".unreq a_ptr1\n"
".unreq c_ptr1\n"
: [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [b_ptr1] "+r" (b_ptr1), [b_ptr2] "+r" (b_ptr2), [b_ptr3] "+r" (b_ptr3), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds)
- : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers), [ldb] "r" (ldbb)
: "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory"
);
break;
@@ -2007,11 +2022,11 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"c_ptr2 .req X3\n"
"add a_ptr1, %[a_ptr0], %[lda]\n"
"add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
"whilelt p6.b, %[temp], %[leftovers]\n"
"whilelt p0.s, %[temp], %[width]\n"
"whilelt p4.b, %[temp], %[width]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
"incw %[temp], all, mul #1\n"
"ptrue p7.b\n"
"whilelt p1.s, %[temp], %[width]\n"
@@ -2034,116 +2049,122 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"mov z22.s, #0\n"
"add %[a_ptr0], %[a_ptr0], #0x10\n"
- "mov z23.s, #0\n"
- "add a_ptr1, a_ptr1, #0x10\n"
"zip2 z11.b, z8.b, z9.b\n"
- "add a_ptr2, a_ptr2, #0x10\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "mov z23.s, #0\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
"mov z24.s, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
- "mov z25.s, #0\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z12.b, z10.b, z8.b\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z10.b, z10.b, z8.b\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "mov z26.s, #0\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "mov z25.s, #0\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "mov z26.s, #0\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
"mov z27.s, #0\n"
- "b 2f\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
"1:\n"
"ld1rw z15.s, p7/z, [%[betaptr]]\n"
"ld1w z16.s, p0/z, [%[c_ptr0]]\n"
"ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
"ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
"ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
- "ld1w z20.s, p0/z, [c_ptr1]\n"
"mul z16.s, p7/m, z16.s, z15.s\n"
- "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+ "ld1w z20.s, p0/z, [c_ptr1]\n"
"mul z17.s, p7/m, z17.s, z15.s\n"
- "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+ "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
"mul z18.s, p7/m, z18.s, z15.s\n"
- "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+ "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
"mul z19.s, p7/m, z19.s, z15.s\n"
- "ld1w z24.s, p0/z, [c_ptr2]\n"
+ "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
"mul z20.s, p7/m, z20.s, z15.s\n"
- "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
+ "ld1w z24.s, p0/z, [c_ptr2]\n"
"mul z21.s, p7/m, z21.s, z15.s\n"
- "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
+ "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
"mul z22.s, p7/m, z22.s, z15.s\n"
- "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
+ "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
"mul z23.s, p7/m, z23.s, z15.s\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
"mul z24.s, p7/m, z24.s, z15.s\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
"mul z25.s, p7/m, z25.s, z15.s\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
"mul z26.s, p7/m, z26.s, z15.s\n"
- "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
"mul z27.s, p7/m, z27.s, z15.s\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
- "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"add a_ptr1, a_ptr1, #0x10\n"
- "add a_ptr2, a_ptr2, #0x10\n"
"zip2 z11.b, z8.b, z9.b\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip2 z12.b, z10.b, z8.b\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"zip1 z10.b, z10.b, z8.b\n"
- "2:\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip1 z8.b, z9.b, z10.b\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "cbz %[loops], 3f\n"
- "4:\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ "sdot z16.s, z8.b, z0.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
"zip2 z15.b, z12.b, z13.b\n"
+ "ld1rqb z5.b, p7/z, [a_ptr1]\n"
"zip1 z13.b, z12.b, z13.b\n"
- "sdot z16.s, z8.b, z0.b[0]\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
"sdot z20.s, z8.b, z1.b[0]\n"
- "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
- "sdot z24.s, z8.b, z2.b[0]\n"
- "ld1rqb z5.b, p7/z, [a_ptr1]\n"
- "sdot z17.s, z9.b, z0.b[0]\n"
"ld1rqb z6.b, p7/z, [a_ptr2]\n"
- "sdot z21.s, z9.b, z1.b[0]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z24.s, z8.b, z2.b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
"zip2 z8.b, z14.b, z12.b\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "sdot z25.s, z9.b, z2.b[0]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "sdot z18.s, z10.b, z0.b[0]\n"
- "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"zip1 z12.b, z13.b, z14.b\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip2 z13.b, z13.b, z14.b\n"
"add %[a_ptr0], %[a_ptr0], #0x20\n"
"zip1 z14.b, z15.b, z8.b\n"
"add a_ptr1, a_ptr1, #0x20\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "sdot z22.s, z10.b, z1.b[0]\n"
+ "sdot z21.s, z9.b, z1.b[0]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z25.s, z9.b, z2.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "sdot z22.s, z10.b, z1.b[0]\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
"sdot z26.s, z10.b, z2.b[0]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"sdot z19.s, z11.b, z0.b[0]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"sdot z23.s, z11.b, z1.b[0]\n"
- "add a_ptr2, a_ptr2, #0x20\n"
"sdot z27.s, z11.b, z2.b[0]\n"
- "subs %[loops], %[loops], #0x1\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
@@ -2157,15 +2178,15 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z21.s, z13.b, z1.b[1]\n"
"sdot z25.s, z13.b, z2.b[1]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
- "sdot z18.s, z14.b, z0.b[1]\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z8.b, z9.b, z10.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "sdot z22.s, z14.b, z1.b[1]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z22.s, z14.b, z1.b[1]\n"
"sdot z26.s, z14.b, z2.b[1]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"sdot z19.s, z15.b, z0.b[1]\n"
@@ -2185,15 +2206,15 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z21.s, z9.b, z1.b[2]\n"
"sdot z25.s, z9.b, z2.b[2]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
- "sdot z18.s, z10.b, z0.b[2]\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "sdot z22.s, z10.b, z1.b[2]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z22.s, z10.b, z1.b[2]\n"
"sdot z26.s, z10.b, z2.b[2]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"sdot z19.s, z11.b, z0.b[2]\n"
@@ -2213,15 +2234,15 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z21.s, z13.b, z1.b[3]\n"
"sdot z25.s, z13.b, z2.b[3]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
- "sdot z18.s, z14.b, z0.b[3]\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z8.b, z9.b, z10.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "sdot z22.s, z14.b, z1.b[3]\n"
+ "sdot z18.s, z14.b, z0.b[3]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z22.s, z14.b, z1.b[3]\n"
"sdot z26.s, z14.b, z2.b[3]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"sdot z19.s, z15.b, z0.b[3]\n"
@@ -2244,15 +2265,15 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z21.s, z9.b, z5.b[0]\n"
"sdot z25.s, z9.b, z6.b[0]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
- "sdot z18.s, z10.b, z4.b[0]\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "sdot z22.s, z10.b, z5.b[0]\n"
+ "sdot z18.s, z10.b, z4.b[0]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z22.s, z10.b, z5.b[0]\n"
"sdot z26.s, z10.b, z6.b[0]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"sdot z19.s, z11.b, z4.b[0]\n"
@@ -2272,15 +2293,15 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z21.s, z13.b, z5.b[1]\n"
"sdot z25.s, z13.b, z6.b[1]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
- "sdot z18.s, z14.b, z4.b[1]\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z8.b, z9.b, z10.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "sdot z22.s, z14.b, z5.b[1]\n"
+ "sdot z18.s, z14.b, z4.b[1]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z22.s, z14.b, z5.b[1]\n"
"sdot z26.s, z14.b, z6.b[1]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"sdot z19.s, z15.b, z4.b[1]\n"
@@ -2300,15 +2321,15 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z21.s, z9.b, z5.b[2]\n"
"sdot z25.s, z9.b, z6.b[2]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
- "sdot z18.s, z10.b, z4.b[2]\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "sdot z22.s, z10.b, z5.b[2]\n"
+ "sdot z18.s, z10.b, z4.b[2]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z22.s, z10.b, z5.b[2]\n"
"sdot z26.s, z10.b, z6.b[2]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"sdot z19.s, z11.b, z4.b[2]\n"
@@ -2328,23 +2349,23 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z21.s, z13.b, z5.b[3]\n"
"sdot z25.s, z13.b, z6.b[3]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
- "sdot z18.s, z14.b, z4.b[3]\n"
"zip1 z8.b, z9.b, z10.b\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z18.s, z14.b, z4.b[3]\n"
"sdot z22.s, z14.b, z5.b[3]\n"
"sdot z26.s, z14.b, z6.b[3]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"sdot z19.s, z15.b, z4.b[3]\n"
"sdot z23.s, z15.b, z5.b[3]\n"
"sdot z27.s, z15.b, z6.b[3]\n"
- "b.ne 4b\n"
- "3:\n"
+ "b.ne 3b\n"
+ "2:\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
- "cbz %[regs], 5f\n"
+ "cbz %[regs], 4f\n"
"sdot z16.s, z8.b, z0.b[0]\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
"sdot z20.s, z8.b, z1.b[0]\n"
@@ -2353,24 +2374,24 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"ld1rqb z5.b, p7/z, [a_ptr1]\n"
"sdot z17.s, z9.b, z0.b[0]\n"
"ld1rqb z6.b, p7/z, [a_ptr2]\n"
- "sdot z21.s, z9.b, z1.b[0]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"zip2 z8.b, z14.b, z12.b\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "sdot z21.s, z9.b, z1.b[0]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "sdot z25.s, z9.b, z2.b[0]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "sdot z18.s, z10.b, z0.b[0]\n"
- "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"zip1 z12.b, z13.b, z14.b\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "sdot z22.s, z10.b, z1.b[0]\n"
+ "sdot z25.s, z9.b, z2.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z22.s, z10.b, z1.b[0]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"sdot z26.s, z10.b, z2.b[0]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"sdot z19.s, z11.b, z0.b[0]\n"
@@ -2390,15 +2411,15 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z21.s, z13.b, z1.b[1]\n"
"sdot z25.s, z13.b, z2.b[1]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
- "sdot z18.s, z14.b, z0.b[1]\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z8.b, z9.b, z10.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "sdot z22.s, z14.b, z1.b[1]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z22.s, z14.b, z1.b[1]\n"
"sdot z26.s, z14.b, z2.b[1]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"sdot z19.s, z15.b, z0.b[1]\n"
@@ -2418,15 +2439,15 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z21.s, z9.b, z1.b[2]\n"
"sdot z25.s, z9.b, z2.b[2]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
- "sdot z18.s, z10.b, z0.b[2]\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "sdot z22.s, z10.b, z1.b[2]\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z22.s, z10.b, z1.b[2]\n"
"sdot z26.s, z10.b, z2.b[2]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"sdot z19.s, z11.b, z0.b[2]\n"
@@ -2446,15 +2467,15 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z21.s, z13.b, z1.b[3]\n"
"sdot z25.s, z13.b, z2.b[3]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
- "sdot z18.s, z14.b, z0.b[3]\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z8.b, z9.b, z10.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "sdot z22.s, z14.b, z1.b[3]\n"
+ "sdot z18.s, z14.b, z0.b[3]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z22.s, z14.b, z1.b[3]\n"
"sdot z26.s, z14.b, z2.b[3]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"sdot z19.s, z15.b, z0.b[3]\n"
@@ -2477,15 +2498,15 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z21.s, z9.b, z5.b[0]\n"
"sdot z25.s, z9.b, z6.b[0]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
- "sdot z18.s, z10.b, z4.b[0]\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "sdot z22.s, z10.b, z5.b[0]\n"
+ "sdot z18.s, z10.b, z4.b[0]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z22.s, z10.b, z5.b[0]\n"
"sdot z26.s, z10.b, z6.b[0]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"sdot z19.s, z11.b, z4.b[0]\n"
@@ -2505,12 +2526,12 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z21.s, z13.b, z5.b[1]\n"
"sdot z25.s, z13.b, z6.b[1]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
- "sdot z18.s, z14.b, z4.b[1]\n"
"zip1 z8.b, z9.b, z10.b\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z18.s, z14.b, z4.b[1]\n"
"sdot z22.s, z14.b, z5.b[1]\n"
"sdot z26.s, z14.b, z6.b[1]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
@@ -2523,17 +2544,17 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z16.s, z8.b, z4.b[2]\n"
"sdot z20.s, z8.b, z5.b[2]\n"
"sdot z24.s, z8.b, z6.b[2]\n"
- "sdot z17.s, z9.b, z4.b[2]\n"
"zip2 z8.b, z14.b, z12.b\n"
"zip1 z14.b, z14.b, z12.b\n"
+ "sdot z17.s, z9.b, z4.b[2]\n"
"sdot z21.s, z9.b, z5.b[2]\n"
"sdot z25.s, z9.b, z6.b[2]\n"
- "sdot z18.s, z10.b, z4.b[2]\n"
- "sdot z22.s, z10.b, z5.b[2]\n"
"zip1 z12.b, z13.b, z14.b\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
+ "sdot z18.s, z10.b, z4.b[2]\n"
+ "sdot z22.s, z10.b, z5.b[2]\n"
"sdot z26.s, z10.b, z6.b[2]\n"
"sdot z19.s, z11.b, z4.b[2]\n"
"sdot z23.s, z11.b, z5.b[2]\n"
@@ -2550,15 +2571,15 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z19.s, z15.b, z4.b[3]\n"
"sdot z23.s, z15.b, z5.b[3]\n"
"sdot z27.s, z15.b, z6.b[3]\n"
- "cbz %[blocks], 6f\n"
+ "cbz %[blocks], 5f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "subs %[blocks], %[blocks], #0x1\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
@@ -2580,15 +2601,15 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z19.s, z11.b, z0.b[0]\n"
"sdot z23.s, z11.b, z1.b[0]\n"
"sdot z27.s, z11.b, z2.b[0]\n"
- "b.eq 7f\n"
+ "b.eq 6f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "subs %[blocks], %[blocks], #0x1\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
@@ -2610,13 +2631,13 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z19.s, z15.b, z0.b[1]\n"
"sdot z23.s, z15.b, z1.b[1]\n"
"sdot z27.s, z15.b, z2.b[1]\n"
- "b.eq 8f\n"
+ "b.eq 7f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
@@ -2639,31 +2660,31 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z19.s, z11.b, z0.b[2]\n"
"sdot z23.s, z11.b, z1.b[2]\n"
"sdot z27.s, z11.b, z2.b[2]\n"
- "cbz %[odds], 9f\n"
+ "cbz %[odds], 8f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 10f\n"
+ "b.eq 9f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 11f\n"
+ "b.eq 10f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 12f\n"
- "11:\n"
+ "b 11f\n"
+ "10:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 12f\n"
- "10:\n"
+ "b 11f\n"
+ "9:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z14.b, #0\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "12:\n"
+ "11:\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"mov z12.b, #0\n"
@@ -2685,33 +2706,33 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z19.s, z15.b, z0.b[3]\n"
"sdot z23.s, z15.b, z1.b[3]\n"
"sdot z27.s, z15.b, z2.b[3]\n"
- "b 9f\n"
- "8:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "7:\n"
+ "cbz %[odds], 8f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 13f\n"
+ "b.eq 12f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 14f\n"
+ "b.eq 13f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 15f\n"
- "14:\n"
+ "b 14f\n"
+ "13:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 15f\n"
- "13:\n"
+ "b 14f\n"
+ "12:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z10.b, #0\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "15:\n"
+ "14:\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"mov z8.b, #0\n"
@@ -2733,33 +2754,33 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z19.s, z11.b, z0.b[2]\n"
"sdot z23.s, z11.b, z1.b[2]\n"
"sdot z27.s, z11.b, z2.b[2]\n"
- "b 9f\n"
- "7:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "6:\n"
+ "cbz %[odds], 8f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 16f\n"
+ "b.eq 15f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 17f\n"
+ "b.eq 16f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 18f\n"
- "17:\n"
+ "b 17f\n"
+ "16:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 18f\n"
- "16:\n"
+ "b 17f\n"
+ "15:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z14.b, #0\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "18:\n"
+ "17:\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"mov z12.b, #0\n"
@@ -2781,33 +2802,33 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z19.s, z15.b, z0.b[1]\n"
"sdot z23.s, z15.b, z1.b[1]\n"
"sdot z27.s, z15.b, z2.b[1]\n"
- "b 9f\n"
- "6:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "5:\n"
+ "cbz %[odds], 8f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 19f\n"
+ "b.eq 18f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 20f\n"
+ "b.eq 19f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 21f\n"
- "20:\n"
+ "b 20f\n"
+ "19:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 21f\n"
- "19:\n"
+ "b 20f\n"
+ "18:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z10.b, #0\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "21:\n"
+ "20:\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"mov z8.b, #0\n"
@@ -2829,8 +2850,8 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z19.s, z11.b, z0.b[0]\n"
"sdot z23.s, z11.b, z1.b[0]\n"
"sdot z27.s, z11.b, z2.b[0]\n"
- "b 9f\n"
- "5:\n"
+ "b 8f\n"
+ "4:\n"
"sdot z16.s, z8.b, z0.b[0]\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
"sdot z20.s, z8.b, z1.b[0]\n"
@@ -2839,24 +2860,24 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"ld1rqb z5.b, p6/z, [a_ptr1]\n"
"sdot z17.s, z9.b, z0.b[0]\n"
"ld1rqb z6.b, p6/z, [a_ptr2]\n"
- "sdot z21.s, z9.b, z1.b[0]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"zip2 z8.b, z14.b, z12.b\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "sdot z21.s, z9.b, z1.b[0]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "sdot z25.s, z9.b, z2.b[0]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "sdot z18.s, z10.b, z0.b[0]\n"
- "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"zip1 z12.b, z13.b, z14.b\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "sdot z22.s, z10.b, z1.b[0]\n"
+ "sdot z25.s, z9.b, z2.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "sdot z18.s, z10.b, z0.b[0]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z22.s, z10.b, z1.b[0]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"sdot z26.s, z10.b, z2.b[0]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"sdot z19.s, z11.b, z0.b[0]\n"
@@ -2876,12 +2897,12 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z21.s, z13.b, z1.b[1]\n"
"sdot z25.s, z13.b, z2.b[1]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
- "sdot z18.s, z14.b, z0.b[1]\n"
"zip1 z8.b, z9.b, z10.b\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z18.s, z14.b, z0.b[1]\n"
"sdot z22.s, z14.b, z1.b[1]\n"
"sdot z26.s, z14.b, z2.b[1]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
@@ -2894,17 +2915,17 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z16.s, z8.b, z0.b[2]\n"
"sdot z20.s, z8.b, z1.b[2]\n"
"sdot z24.s, z8.b, z2.b[2]\n"
- "sdot z17.s, z9.b, z0.b[2]\n"
"zip2 z8.b, z14.b, z12.b\n"
"zip1 z14.b, z14.b, z12.b\n"
+ "sdot z17.s, z9.b, z0.b[2]\n"
"sdot z21.s, z9.b, z1.b[2]\n"
"sdot z25.s, z9.b, z2.b[2]\n"
- "sdot z18.s, z10.b, z0.b[2]\n"
- "sdot z22.s, z10.b, z1.b[2]\n"
"zip1 z12.b, z13.b, z14.b\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
+ "sdot z18.s, z10.b, z0.b[2]\n"
+ "sdot z22.s, z10.b, z1.b[2]\n"
"sdot z26.s, z10.b, z2.b[2]\n"
"sdot z19.s, z11.b, z0.b[2]\n"
"sdot z23.s, z11.b, z1.b[2]\n"
@@ -2921,15 +2942,15 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z19.s, z15.b, z0.b[3]\n"
"sdot z23.s, z15.b, z1.b[3]\n"
"sdot z27.s, z15.b, z2.b[3]\n"
- "cbz %[blocks], 22f\n"
+ "cbz %[blocks], 21f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "subs %[blocks], %[blocks], #0x1\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
@@ -2951,15 +2972,15 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z19.s, z11.b, z4.b[0]\n"
"sdot z23.s, z11.b, z5.b[0]\n"
"sdot z27.s, z11.b, z6.b[0]\n"
- "b.eq 23f\n"
+ "b.eq 22f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "subs %[blocks], %[blocks], #0x1\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
@@ -2981,13 +3002,13 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z19.s, z15.b, z4.b[1]\n"
"sdot z23.s, z15.b, z5.b[1]\n"
"sdot z27.s, z15.b, z6.b[1]\n"
- "b.eq 24f\n"
+ "b.eq 23f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
@@ -3010,31 +3031,31 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z19.s, z11.b, z4.b[2]\n"
"sdot z23.s, z11.b, z5.b[2]\n"
"sdot z27.s, z11.b, z6.b[2]\n"
- "cbz %[odds], 9f\n"
+ "cbz %[odds], 8f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 25f\n"
+ "b.eq 24f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 26f\n"
+ "b.eq 25f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 27f\n"
- "26:\n"
+ "b 26f\n"
+ "25:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 27f\n"
- "25:\n"
+ "b 26f\n"
+ "24:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z14.b, #0\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "27:\n"
+ "26:\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"mov z12.b, #0\n"
@@ -3056,33 +3077,33 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z19.s, z15.b, z4.b[3]\n"
"sdot z23.s, z15.b, z5.b[3]\n"
"sdot z27.s, z15.b, z6.b[3]\n"
- "b 9f\n"
- "24:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "23:\n"
+ "cbz %[odds], 8f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 28f\n"
+ "b.eq 27f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 29f\n"
+ "b.eq 28f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 30f\n"
- "29:\n"
+ "b 29f\n"
+ "28:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 30f\n"
- "28:\n"
+ "b 29f\n"
+ "27:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z10.b, #0\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "30:\n"
+ "29:\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"mov z8.b, #0\n"
@@ -3104,33 +3125,33 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z19.s, z11.b, z4.b[2]\n"
"sdot z23.s, z11.b, z5.b[2]\n"
"sdot z27.s, z11.b, z6.b[2]\n"
- "b 9f\n"
- "23:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "22:\n"
+ "cbz %[odds], 8f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 31f\n"
+ "b.eq 30f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 32f\n"
+ "b.eq 31f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 33f\n"
- "32:\n"
+ "b 32f\n"
+ "31:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 33f\n"
- "31:\n"
+ "b 32f\n"
+ "30:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z14.b, #0\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "33:\n"
+ "32:\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"mov z12.b, #0\n"
@@ -3152,33 +3173,33 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z19.s, z15.b, z4.b[1]\n"
"sdot z23.s, z15.b, z5.b[1]\n"
"sdot z27.s, z15.b, z6.b[1]\n"
- "b 9f\n"
- "22:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "21:\n"
+ "cbz %[odds], 8f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 34f\n"
+ "b.eq 33f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 35f\n"
+ "b.eq 34f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 36f\n"
- "35:\n"
+ "b 35f\n"
+ "34:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 36f\n"
- "34:\n"
+ "b 35f\n"
+ "33:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z10.b, #0\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "36:\n"
+ "35:\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"mov z8.b, #0\n"
@@ -3200,7 +3221,7 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z19.s, z11.b, z4.b[0]\n"
"sdot z23.s, z11.b, z5.b[0]\n"
"sdot z27.s, z11.b, z6.b[0]\n"
- "9:\n"
+ "8:\n"
"st1w z16.s, p0, [%[c_ptr0]]\n"
"st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
"st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
@@ -3219,7 +3240,7 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
".unreq c_ptr1\n"
".unreq c_ptr2\n"
: [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [b_ptr1] "+r" (b_ptr1), [b_ptr2] "+r" (b_ptr2), [b_ptr3] "+r" (b_ptr3), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds)
- : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers), [ldb] "r" (ldbb)
: "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory"
);
break;
@@ -3234,15 +3255,15 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"c_ptr3 .req X5\n"
"add a_ptr1, %[a_ptr0], %[lda]\n"
"add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
"whilelt p6.b, %[temp], %[leftovers]\n"
"whilelt p0.s, %[temp], %[width]\n"
"whilelt p4.b, %[temp], %[width]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
"incw %[temp], all, mul #1\n"
"ptrue p7.b\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
"whilelt p1.s, %[temp], %[width]\n"
"incw %[temp], all, mul #1\n"
"whilelt p2.s, %[temp], %[width]\n"
@@ -3265,77 +3286,80 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"mov z23.s, #0\n"
"add %[a_ptr0], %[a_ptr0], #0x10\n"
- "mov z24.s, #0\n"
- "add a_ptr1, a_ptr1, #0x10\n"
"zip2 z11.b, z8.b, z9.b\n"
- "add a_ptr2, a_ptr2, #0x10\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "mov z24.s, #0\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
"mov z25.s, #0\n"
"add a_ptr3, a_ptr3, #0x10\n"
- "mov z26.s, #0\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"zip2 z12.b, z10.b, z8.b\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"zip1 z10.b, z10.b, z8.b\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "mov z27.s, #0\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "mov z28.s, #0\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "mov z26.s, #0\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "mov z27.s, #0\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip1 z8.b, z9.b, z10.b\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"zip2 z9.b, z9.b, z10.b\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "mov z28.s, #0\n"
"mov z29.s, #0\n"
"mov z30.s, #0\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
"mov z31.s, #0\n"
- "b 2f\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
"1:\n"
"ld1rw z15.s, p7/z, [%[betaptr]]\n"
"ld1w z16.s, p0/z, [%[c_ptr0]]\n"
"ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
"ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
"ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
- "ld1w z20.s, p0/z, [c_ptr1]\n"
"mul z16.s, p7/m, z16.s, z15.s\n"
- "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+ "ld1w z20.s, p0/z, [c_ptr1]\n"
"mul z17.s, p7/m, z17.s, z15.s\n"
- "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+ "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
"mul z18.s, p7/m, z18.s, z15.s\n"
- "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+ "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
"mul z19.s, p7/m, z19.s, z15.s\n"
- "ld1w z24.s, p0/z, [c_ptr2]\n"
+ "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
"mul z20.s, p7/m, z20.s, z15.s\n"
- "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
+ "ld1w z24.s, p0/z, [c_ptr2]\n"
"mul z21.s, p7/m, z21.s, z15.s\n"
- "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
+ "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
"mul z22.s, p7/m, z22.s, z15.s\n"
- "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
+ "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
"mul z23.s, p7/m, z23.s, z15.s\n"
- "ld1w z28.s, p0/z, [c_ptr3]\n"
+ "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
"mul z24.s, p7/m, z24.s, z15.s\n"
- "ld1w z29.s, p1/z, [c_ptr3, #1, MUL VL]\n"
+ "ld1w z28.s, p0/z, [c_ptr3]\n"
"mul z25.s, p7/m, z25.s, z15.s\n"
- "ld1w z30.s, p2/z, [c_ptr3, #2, MUL VL]\n"
+ "ld1w z29.s, p1/z, [c_ptr3, #1, MUL VL]\n"
"mul z26.s, p7/m, z26.s, z15.s\n"
- "ld1w z31.s, p3/z, [c_ptr3, #3, MUL VL]\n"
+ "ld1w z30.s, p2/z, [c_ptr3, #2, MUL VL]\n"
"mul z27.s, p7/m, z27.s, z15.s\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "ld1w z31.s, p3/z, [c_ptr3, #3, MUL VL]\n"
"mul z28.s, p7/m, z28.s, z15.s\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
"mul z29.s, p7/m, z29.s, z15.s\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
"mul z30.s, p7/m, z30.s, z15.s\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
"mul z31.s, p7/m, z31.s, z15.s\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"add a_ptr1, a_ptr1, #0x10\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"add a_ptr2, a_ptr2, #0x10\n"
"zip2 z11.b, z8.b, z9.b\n"
"add a_ptr3, a_ptr3, #0x10\n"
@@ -3344,21 +3368,20 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip2 z12.b, z10.b, z8.b\n"
- "zip1 z10.b, z10.b, z8.b\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z10.b, z10.b, z8.b\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip1 z8.b, z9.b, z10.b\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "2:\n"
- "cbz %[loops], 3f\n"
- "4:\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
"sdot z16.s, z8.b, z0.b[0]\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
"sdot z20.s, z8.b, z1.b[0]\n"
@@ -3367,38 +3390,38 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"ld1rqb z5.b, p7/z, [a_ptr1]\n"
"sdot z28.s, z8.b, z3.b[0]\n"
"ld1rqb z6.b, p7/z, [a_ptr2]\n"
- "sdot z17.s, z9.b, z0.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr3]\n"
"zip2 z8.b, z14.b, z12.b\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr3]\n"
"zip1 z14.b, z14.b, z12.b\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "sdot z21.s, z9.b, z1.b[0]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "sdot z25.s, z9.b, z2.b[0]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"zip1 z12.b, z13.b, z14.b\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z13.b, z13.b, z14.b\n"
- "add a_ptr1, a_ptr1, #0x20\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip1 z14.b, z15.b, z8.b\n"
- "add a_ptr2, a_ptr2, #0x20\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z21.s, z9.b, z1.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z25.s, z9.b, z2.b[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
"sdot z29.s, z9.b, z3.b[0]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"sdot z18.s, z10.b, z0.b[0]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
- "sdot z22.s, z10.b, z1.b[0]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "sdot z22.s, z10.b, z1.b[0]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
"sdot z26.s, z10.b, z2.b[0]\n"
- "add a_ptr3, a_ptr3, #0x20\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
"sdot z30.s, z10.b, z3.b[0]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"sdot z19.s, z11.b, z0.b[0]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"sdot z23.s, z11.b, z1.b[0]\n"
- "subs %[loops], %[loops], #0x1\n"
+ "add a_ptr3, a_ptr3, #0x20\n"
"sdot z27.s, z11.b, z2.b[0]\n"
"sdot z31.s, z11.b, z3.b[0]\n"
"zip2 z11.b, z8.b, z9.b\n"
@@ -3414,17 +3437,17 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z17.s, z13.b, z0.b[1]\n"
"sdot z21.s, z13.b, z1.b[1]\n"
"sdot z25.s, z13.b, z2.b[1]\n"
- "sdot z29.s, z13.b, z3.b[1]\n"
- "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"zip1 z8.b, z9.b, z10.b\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z29.s, z13.b, z3.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"sdot z18.s, z14.b, z0.b[1]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"sdot z22.s, z14.b, z1.b[1]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"sdot z26.s, z14.b, z2.b[1]\n"
"sdot z30.s, z14.b, z3.b[1]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
@@ -3446,17 +3469,17 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z17.s, z9.b, z0.b[2]\n"
"sdot z21.s, z9.b, z1.b[2]\n"
"sdot z25.s, z9.b, z2.b[2]\n"
- "sdot z29.s, z9.b, z3.b[2]\n"
- "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"zip1 z12.b, z13.b, z14.b\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z29.s, z9.b, z3.b[2]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"sdot z18.s, z10.b, z0.b[2]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"sdot z22.s, z10.b, z1.b[2]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"sdot z26.s, z10.b, z2.b[2]\n"
"sdot z30.s, z10.b, z3.b[2]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
@@ -3478,17 +3501,17 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z17.s, z13.b, z0.b[3]\n"
"sdot z21.s, z13.b, z1.b[3]\n"
"sdot z25.s, z13.b, z2.b[3]\n"
- "sdot z29.s, z13.b, z3.b[3]\n"
- "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"zip1 z8.b, z9.b, z10.b\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z29.s, z13.b, z3.b[3]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"sdot z18.s, z14.b, z0.b[3]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"sdot z22.s, z14.b, z1.b[3]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"sdot z26.s, z14.b, z2.b[3]\n"
"sdot z30.s, z14.b, z3.b[3]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
@@ -3514,17 +3537,17 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z17.s, z9.b, z4.b[0]\n"
"sdot z21.s, z9.b, z5.b[0]\n"
"sdot z25.s, z9.b, z6.b[0]\n"
- "sdot z29.s, z9.b, z7.b[0]\n"
- "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"zip1 z12.b, z13.b, z14.b\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z29.s, z9.b, z7.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"sdot z18.s, z10.b, z4.b[0]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"sdot z22.s, z10.b, z5.b[0]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"sdot z26.s, z10.b, z6.b[0]\n"
"sdot z30.s, z10.b, z7.b[0]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
@@ -3546,17 +3569,17 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z17.s, z13.b, z4.b[1]\n"
"sdot z21.s, z13.b, z5.b[1]\n"
"sdot z25.s, z13.b, z6.b[1]\n"
- "sdot z29.s, z13.b, z7.b[1]\n"
- "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"zip1 z8.b, z9.b, z10.b\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z29.s, z13.b, z7.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"sdot z18.s, z14.b, z4.b[1]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"sdot z22.s, z14.b, z5.b[1]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"sdot z26.s, z14.b, z6.b[1]\n"
"sdot z30.s, z14.b, z7.b[1]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
@@ -3578,17 +3601,17 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z17.s, z9.b, z4.b[2]\n"
"sdot z21.s, z9.b, z5.b[2]\n"
"sdot z25.s, z9.b, z6.b[2]\n"
- "sdot z29.s, z9.b, z7.b[2]\n"
- "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"zip1 z12.b, z13.b, z14.b\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z29.s, z9.b, z7.b[2]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"sdot z18.s, z10.b, z4.b[2]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"sdot z22.s, z10.b, z5.b[2]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"sdot z26.s, z10.b, z6.b[2]\n"
"sdot z30.s, z10.b, z7.b[2]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
@@ -3610,13 +3633,13 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z17.s, z13.b, z4.b[3]\n"
"sdot z21.s, z13.b, z5.b[3]\n"
"sdot z25.s, z13.b, z6.b[3]\n"
- "sdot z29.s, z13.b, z7.b[3]\n"
- "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"zip1 z8.b, z9.b, z10.b\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z29.s, z13.b, z7.b[3]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"sdot z18.s, z14.b, z4.b[3]\n"
"sdot z22.s, z14.b, z5.b[3]\n"
"sdot z26.s, z14.b, z6.b[3]\n"
@@ -3626,11 +3649,11 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z23.s, z15.b, z5.b[3]\n"
"sdot z27.s, z15.b, z6.b[3]\n"
"sdot z31.s, z15.b, z7.b[3]\n"
- "b.ne 4b\n"
- "3:\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
- "cbz %[regs], 5f\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "cbz %[regs], 4f\n"
"sdot z16.s, z8.b, z0.b[0]\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
"sdot z20.s, z8.b, z1.b[0]\n"
@@ -3639,27 +3662,27 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"ld1rqb z5.b, p7/z, [a_ptr1]\n"
"sdot z28.s, z8.b, z3.b[0]\n"
"ld1rqb z6.b, p7/z, [a_ptr2]\n"
- "sdot z17.s, z9.b, z0.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr3]\n"
"zip2 z8.b, z14.b, z12.b\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr3]\n"
"zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "sdot z21.s, z9.b, z1.b[0]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "sdot z25.s, z9.b, z2.b[0]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z13.b, z13.b, z14.b\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z21.s, z9.b, z1.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z25.s, z9.b, z2.b[0]\n"
"sdot z29.s, z9.b, z3.b[0]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"sdot z18.s, z10.b, z0.b[0]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
- "sdot z22.s, z10.b, z1.b[0]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "sdot z22.s, z10.b, z1.b[0]\n"
"sdot z26.s, z10.b, z2.b[0]\n"
"sdot z30.s, z10.b, z3.b[0]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
@@ -3681,17 +3704,17 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z17.s, z13.b, z0.b[1]\n"
"sdot z21.s, z13.b, z1.b[1]\n"
"sdot z25.s, z13.b, z2.b[1]\n"
- "sdot z29.s, z13.b, z3.b[1]\n"
- "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"zip1 z8.b, z9.b, z10.b\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z29.s, z13.b, z3.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"sdot z18.s, z14.b, z0.b[1]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"sdot z22.s, z14.b, z1.b[1]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"sdot z26.s, z14.b, z2.b[1]\n"
"sdot z30.s, z14.b, z3.b[1]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
@@ -3713,17 +3736,17 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z17.s, z9.b, z0.b[2]\n"
"sdot z21.s, z9.b, z1.b[2]\n"
"sdot z25.s, z9.b, z2.b[2]\n"
- "sdot z29.s, z9.b, z3.b[2]\n"
- "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"zip1 z12.b, z13.b, z14.b\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z29.s, z9.b, z3.b[2]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"sdot z18.s, z10.b, z0.b[2]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"sdot z22.s, z10.b, z1.b[2]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"sdot z26.s, z10.b, z2.b[2]\n"
"sdot z30.s, z10.b, z3.b[2]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
@@ -3745,17 +3768,17 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z17.s, z13.b, z0.b[3]\n"
"sdot z21.s, z13.b, z1.b[3]\n"
"sdot z25.s, z13.b, z2.b[3]\n"
- "sdot z29.s, z13.b, z3.b[3]\n"
- "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"zip1 z8.b, z9.b, z10.b\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z29.s, z13.b, z3.b[3]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"sdot z18.s, z14.b, z0.b[3]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"sdot z22.s, z14.b, z1.b[3]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"sdot z26.s, z14.b, z2.b[3]\n"
"sdot z30.s, z14.b, z3.b[3]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
@@ -3781,17 +3804,17 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z17.s, z9.b, z4.b[0]\n"
"sdot z21.s, z9.b, z5.b[0]\n"
"sdot z25.s, z9.b, z6.b[0]\n"
- "sdot z29.s, z9.b, z7.b[0]\n"
- "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"zip1 z12.b, z13.b, z14.b\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z29.s, z9.b, z7.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"sdot z18.s, z10.b, z4.b[0]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"sdot z22.s, z10.b, z5.b[0]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"sdot z26.s, z10.b, z6.b[0]\n"
"sdot z30.s, z10.b, z7.b[0]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
@@ -3813,13 +3836,13 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z17.s, z13.b, z4.b[1]\n"
"sdot z21.s, z13.b, z5.b[1]\n"
"sdot z25.s, z13.b, z6.b[1]\n"
- "sdot z29.s, z13.b, z7.b[1]\n"
- "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"zip1 z8.b, z9.b, z10.b\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z29.s, z13.b, z7.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"sdot z18.s, z14.b, z4.b[1]\n"
"sdot z22.s, z14.b, z5.b[1]\n"
"sdot z26.s, z14.b, z6.b[1]\n"
@@ -3841,11 +3864,11 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z17.s, z9.b, z4.b[2]\n"
"sdot z21.s, z9.b, z5.b[2]\n"
"sdot z25.s, z9.b, z6.b[2]\n"
- "sdot z29.s, z9.b, z7.b[2]\n"
"zip1 z12.b, z13.b, z14.b\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
+ "sdot z29.s, z9.b, z7.b[2]\n"
"sdot z18.s, z10.b, z4.b[2]\n"
"sdot z22.s, z10.b, z5.b[2]\n"
"sdot z26.s, z10.b, z6.b[2]\n"
@@ -3870,15 +3893,15 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z23.s, z15.b, z5.b[3]\n"
"sdot z27.s, z15.b, z6.b[3]\n"
"sdot z31.s, z15.b, z7.b[3]\n"
- "cbz %[blocks], 6f\n"
+ "cbz %[blocks], 5f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "subs %[blocks], %[blocks], #0x1\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
@@ -3904,15 +3927,15 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z23.s, z11.b, z1.b[0]\n"
"sdot z27.s, z11.b, z2.b[0]\n"
"sdot z31.s, z11.b, z3.b[0]\n"
- "b.eq 7f\n"
+ "b.eq 6f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "subs %[blocks], %[blocks], #0x1\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
@@ -3938,13 +3961,13 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z23.s, z15.b, z1.b[1]\n"
"sdot z27.s, z15.b, z2.b[1]\n"
"sdot z31.s, z15.b, z3.b[1]\n"
- "b.eq 8f\n"
+ "b.eq 7f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
@@ -3971,31 +3994,31 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z23.s, z11.b, z1.b[2]\n"
"sdot z27.s, z11.b, z2.b[2]\n"
"sdot z31.s, z11.b, z3.b[2]\n"
- "cbz %[odds], 9f\n"
+ "cbz %[odds], 8f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 10f\n"
+ "b.eq 9f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 11f\n"
+ "b.eq 10f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 12f\n"
- "11:\n"
+ "b 11f\n"
+ "10:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 12f\n"
- "10:\n"
+ "b 11f\n"
+ "9:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z14.b, #0\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "12:\n"
+ "11:\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"mov z12.b, #0\n"
@@ -4021,33 +4044,33 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z23.s, z15.b, z1.b[3]\n"
"sdot z27.s, z15.b, z2.b[3]\n"
"sdot z31.s, z15.b, z3.b[3]\n"
- "b 9f\n"
- "8:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "7:\n"
+ "cbz %[odds], 8f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 13f\n"
+ "b.eq 12f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 14f\n"
+ "b.eq 13f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 15f\n"
- "14:\n"
+ "b 14f\n"
+ "13:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 15f\n"
- "13:\n"
+ "b 14f\n"
+ "12:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z10.b, #0\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "15:\n"
+ "14:\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"mov z8.b, #0\n"
@@ -4073,33 +4096,33 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z23.s, z11.b, z1.b[2]\n"
"sdot z27.s, z11.b, z2.b[2]\n"
"sdot z31.s, z11.b, z3.b[2]\n"
- "b 9f\n"
- "7:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "6:\n"
+ "cbz %[odds], 8f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 16f\n"
+ "b.eq 15f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 17f\n"
+ "b.eq 16f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 18f\n"
- "17:\n"
+ "b 17f\n"
+ "16:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 18f\n"
- "16:\n"
+ "b 17f\n"
+ "15:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z14.b, #0\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "18:\n"
+ "17:\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"mov z12.b, #0\n"
@@ -4125,33 +4148,33 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z23.s, z15.b, z1.b[1]\n"
"sdot z27.s, z15.b, z2.b[1]\n"
"sdot z31.s, z15.b, z3.b[1]\n"
- "b 9f\n"
- "6:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "5:\n"
+ "cbz %[odds], 8f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 19f\n"
+ "b.eq 18f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 20f\n"
+ "b.eq 19f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 21f\n"
- "20:\n"
+ "b 20f\n"
+ "19:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 21f\n"
- "19:\n"
+ "b 20f\n"
+ "18:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z10.b, #0\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "21:\n"
+ "20:\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"mov z8.b, #0\n"
@@ -4177,8 +4200,8 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z23.s, z11.b, z1.b[0]\n"
"sdot z27.s, z11.b, z2.b[0]\n"
"sdot z31.s, z11.b, z3.b[0]\n"
- "b 9f\n"
- "5:\n"
+ "b 8f\n"
+ "4:\n"
"sdot z16.s, z8.b, z0.b[0]\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
"sdot z20.s, z8.b, z1.b[0]\n"
@@ -4187,27 +4210,27 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"ld1rqb z5.b, p6/z, [a_ptr1]\n"
"sdot z28.s, z8.b, z3.b[0]\n"
"ld1rqb z6.b, p6/z, [a_ptr2]\n"
- "sdot z17.s, z9.b, z0.b[0]\n"
- "ld1rqb z7.b, p6/z, [a_ptr3]\n"
"zip2 z8.b, z14.b, z12.b\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1rqb z7.b, p6/z, [a_ptr3]\n"
"zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z17.s, z9.b, z0.b[0]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "sdot z21.s, z9.b, z1.b[0]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "sdot z25.s, z9.b, z2.b[0]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z13.b, z13.b, z14.b\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z21.s, z9.b, z1.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "sdot z25.s, z9.b, z2.b[0]\n"
"sdot z29.s, z9.b, z3.b[0]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"sdot z18.s, z10.b, z0.b[0]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
- "sdot z22.s, z10.b, z1.b[0]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "sdot z22.s, z10.b, z1.b[0]\n"
"sdot z26.s, z10.b, z2.b[0]\n"
"sdot z30.s, z10.b, z3.b[0]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
@@ -4229,13 +4252,13 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z17.s, z13.b, z0.b[1]\n"
"sdot z21.s, z13.b, z1.b[1]\n"
"sdot z25.s, z13.b, z2.b[1]\n"
- "sdot z29.s, z13.b, z3.b[1]\n"
- "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"zip1 z8.b, z9.b, z10.b\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "sdot z29.s, z13.b, z3.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"sdot z18.s, z14.b, z0.b[1]\n"
"sdot z22.s, z14.b, z1.b[1]\n"
"sdot z26.s, z14.b, z2.b[1]\n"
@@ -4257,11 +4280,11 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z17.s, z9.b, z0.b[2]\n"
"sdot z21.s, z9.b, z1.b[2]\n"
"sdot z25.s, z9.b, z2.b[2]\n"
- "sdot z29.s, z9.b, z3.b[2]\n"
"zip1 z12.b, z13.b, z14.b\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
+ "sdot z29.s, z9.b, z3.b[2]\n"
"sdot z18.s, z10.b, z0.b[2]\n"
"sdot z22.s, z10.b, z1.b[2]\n"
"sdot z26.s, z10.b, z2.b[2]\n"
@@ -4286,15 +4309,15 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z23.s, z15.b, z1.b[3]\n"
"sdot z27.s, z15.b, z2.b[3]\n"
"sdot z31.s, z15.b, z3.b[3]\n"
- "cbz %[blocks], 22f\n"
+ "cbz %[blocks], 21f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "subs %[blocks], %[blocks], #0x1\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
@@ -4320,15 +4343,15 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z23.s, z11.b, z5.b[0]\n"
"sdot z27.s, z11.b, z6.b[0]\n"
"sdot z31.s, z11.b, z7.b[0]\n"
- "b.eq 23f\n"
+ "b.eq 22f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "subs %[blocks], %[blocks], #0x1\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
@@ -4354,13 +4377,13 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z23.s, z15.b, z5.b[1]\n"
"sdot z27.s, z15.b, z6.b[1]\n"
"sdot z31.s, z15.b, z7.b[1]\n"
- "b.eq 24f\n"
+ "b.eq 23f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
@@ -4387,31 +4410,31 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z23.s, z11.b, z5.b[2]\n"
"sdot z27.s, z11.b, z6.b[2]\n"
"sdot z31.s, z11.b, z7.b[2]\n"
- "cbz %[odds], 9f\n"
+ "cbz %[odds], 8f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 25f\n"
+ "b.eq 24f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 26f\n"
+ "b.eq 25f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 27f\n"
- "26:\n"
+ "b 26f\n"
+ "25:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 27f\n"
- "25:\n"
+ "b 26f\n"
+ "24:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z14.b, #0\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "27:\n"
+ "26:\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"mov z12.b, #0\n"
@@ -4437,33 +4460,33 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z23.s, z15.b, z5.b[3]\n"
"sdot z27.s, z15.b, z6.b[3]\n"
"sdot z31.s, z15.b, z7.b[3]\n"
- "b 9f\n"
- "24:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "23:\n"
+ "cbz %[odds], 8f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 28f\n"
+ "b.eq 27f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 29f\n"
+ "b.eq 28f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 30f\n"
- "29:\n"
+ "b 29f\n"
+ "28:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 30f\n"
- "28:\n"
+ "b 29f\n"
+ "27:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z10.b, #0\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "30:\n"
+ "29:\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"mov z8.b, #0\n"
@@ -4489,33 +4512,33 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z23.s, z11.b, z5.b[2]\n"
"sdot z27.s, z11.b, z6.b[2]\n"
"sdot z31.s, z11.b, z7.b[2]\n"
- "b 9f\n"
- "23:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "22:\n"
+ "cbz %[odds], 8f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 31f\n"
+ "b.eq 30f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 32f\n"
+ "b.eq 31f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 33f\n"
- "32:\n"
+ "b 32f\n"
+ "31:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 33f\n"
- "31:\n"
+ "b 32f\n"
+ "30:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z14.b, #0\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "33:\n"
+ "32:\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"mov z12.b, #0\n"
@@ -4541,33 +4564,33 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z23.s, z15.b, z5.b[1]\n"
"sdot z27.s, z15.b, z6.b[1]\n"
"sdot z31.s, z15.b, z7.b[1]\n"
- "b 9f\n"
- "22:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "21:\n"
+ "cbz %[odds], 8f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 34f\n"
+ "b.eq 33f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 35f\n"
+ "b.eq 34f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 36f\n"
- "35:\n"
+ "b 35f\n"
+ "34:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 36f\n"
- "34:\n"
+ "b 35f\n"
+ "33:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z10.b, #0\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "36:\n"
+ "35:\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"mov z8.b, #0\n"
@@ -4593,7 +4616,7 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
"sdot z23.s, z11.b, z5.b[0]\n"
"sdot z27.s, z11.b, z6.b[0]\n"
"sdot z31.s, z11.b, z7.b[0]\n"
- "9:\n"
+ "8:\n"
"st1w z16.s, p0, [%[c_ptr0]]\n"
"st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
"st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
@@ -4618,7 +4641,7 @@ void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int l
".unreq c_ptr2\n"
".unreq c_ptr3\n"
: [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [b_ptr1] "+r" (b_ptr1), [b_ptr2] "+r" (b_ptr2), [b_ptr3] "+r" (b_ptr3), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds)
- : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers), [ldb] "r" (ldbb)
: "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
);
break;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4/generic.cpp
index 7d89948dc1..cdcea59c5e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -86,63 +86,73 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"mov z19.s, #0\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"add %[a_ptr0], %[a_ptr0], #0x10\n"
- "b 2f\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "zip2 z11.b, z8.b, z9.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z9.b, z8.b, z9.b\n"
+ "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
"1:\n"
"ld1rw z15.s, p7/z, [%[betaptr]]\n"
"ld1w z16.s, p0/z, [%[c_ptr0]]\n"
"ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
"ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
"ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
"mul z16.s, p7/m, z16.s, z15.s\n"
- "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
"mul z17.s, p7/m, z17.s, z15.s\n"
- "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
"mul z18.s, p7/m, z18.s, z15.s\n"
- "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"mul z19.s, p7/m, z19.s, z15.s\n"
- "2:\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z11.b, z8.b, z9.b\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
- "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "cbz %[loops], 3f\n"
- "4:\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip2 z12.b, z10.b, z8.b\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
"zip1 z10.b, z10.b, z8.b\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip1 z8.b, z9.b, z10.b\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"zip2 z15.b, z12.b, z13.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z13.b, z12.b, z13.b\n"
- "udot z16.s, z8.b, z0.b[0]\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
- "udot z17.s, z9.b, z0.b[0]\n"
- "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
"udot z18.s, z10.b, z0.b[0]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"udot z19.s, z11.b, z0.b[0]\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip2 z8.b, z14.b, z12.b\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip1 z14.b, z14.b, z12.b\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip1 z12.b, z13.b, z14.b\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
"zip2 z13.b, z13.b, z14.b\n"
- "subs %[loops], %[loops], #0x1\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
@@ -150,137 +160,137 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"udot z17.s, z13.b, z0.b[1]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
- "udot z18.s, z14.b, z0.b[1]\n"
- "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"zip2 z11.b, z8.b, z9.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"udot z19.s, z15.b, z0.b[1]\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z12.b, z10.b, z8.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip1 z10.b, z10.b, z8.b\n"
"zip1 z8.b, z9.b, z10.b\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"udot z16.s, z8.b, z0.b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"udot z17.s, z9.b, z0.b[2]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
- "udot z18.s, z10.b, z0.b[2]\n"
- "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"zip2 z15.b, z12.b, z13.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z13.b, z12.b, z13.b\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"udot z19.s, z11.b, z0.b[2]\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip1 z14.b, z14.b, z12.b\n"
"zip1 z12.b, z13.b, z14.b\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"udot z16.s, z12.b, z0.b[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"udot z17.s, z13.b, z0.b[3]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
- "udot z18.s, z14.b, z0.b[3]\n"
- "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"zip2 z11.b, z8.b, z9.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z18.s, z14.b, z0.b[3]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"udot z19.s, z15.b, z0.b[3]\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z12.b, z10.b, z8.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip1 z10.b, z10.b, z8.b\n"
"zip1 z8.b, z9.b, z10.b\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"udot z16.s, z8.b, z4.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"udot z17.s, z9.b, z4.b[0]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
- "udot z18.s, z10.b, z4.b[0]\n"
- "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"zip2 z15.b, z12.b, z13.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z13.b, z12.b, z13.b\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z18.s, z10.b, z4.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"udot z19.s, z11.b, z4.b[0]\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip1 z14.b, z14.b, z12.b\n"
"zip1 z12.b, z13.b, z14.b\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"udot z16.s, z12.b, z4.b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"udot z17.s, z13.b, z4.b[1]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
- "udot z18.s, z14.b, z4.b[1]\n"
- "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"zip2 z11.b, z8.b, z9.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z18.s, z14.b, z4.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"udot z19.s, z15.b, z4.b[1]\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z12.b, z10.b, z8.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip1 z10.b, z10.b, z8.b\n"
"zip1 z8.b, z9.b, z10.b\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"udot z16.s, z8.b, z4.b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"udot z17.s, z9.b, z4.b[2]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
- "udot z18.s, z10.b, z4.b[2]\n"
- "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"zip2 z15.b, z12.b, z13.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z13.b, z12.b, z13.b\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z18.s, z10.b, z4.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"udot z19.s, z11.b, z4.b[2]\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip1 z14.b, z14.b, z12.b\n"
"zip1 z12.b, z13.b, z14.b\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"udot z16.s, z12.b, z4.b[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"udot z17.s, z13.b, z4.b[3]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
- "udot z18.s, z14.b, z4.b[3]\n"
- "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z18.s, z14.b, z4.b[3]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"udot z19.s, z15.b, z4.b[3]\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "b.ne 4b\n"
- "3:\n"
"zip2 z12.b, z10.b, z8.b\n"
+ "b.ne 3b\n"
+ "2:\n"
"zip1 z10.b, z10.b, z8.b\n"
"zip1 z8.b, z9.b, z10.b\n"
"zip2 z9.b, z9.b, z10.b\n"
@@ -289,118 +299,118 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
- "cbz %[regs], 5f\n"
+ "cbz %[regs], 4f\n"
"udot z16.s, z8.b, z0.b[0]\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
"udot z17.s, z9.b, z0.b[0]\n"
"ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
"udot z18.s, z10.b, z0.b[0]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
- "udot z19.s, z11.b, z0.b[0]\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z8.b, z14.b, z12.b\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z14.b, z14.b, z12.b\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip1 z12.b, z13.b, z14.b\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"zip2 z13.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z14.b, z15.b, z8.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"udot z16.s, z12.b, z0.b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"udot z17.s, z13.b, z0.b[1]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
- "udot z18.s, z14.b, z0.b[1]\n"
- "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"zip2 z11.b, z8.b, z9.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"udot z19.s, z15.b, z0.b[1]\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z12.b, z10.b, z8.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip1 z10.b, z10.b, z8.b\n"
"zip1 z8.b, z9.b, z10.b\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"udot z16.s, z8.b, z0.b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"udot z17.s, z9.b, z0.b[2]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
- "udot z18.s, z10.b, z0.b[2]\n"
- "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"zip2 z15.b, z12.b, z13.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z13.b, z12.b, z13.b\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"udot z19.s, z11.b, z0.b[2]\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip1 z14.b, z14.b, z12.b\n"
"zip1 z12.b, z13.b, z14.b\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"udot z16.s, z12.b, z0.b[3]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"udot z17.s, z13.b, z0.b[3]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
- "udot z18.s, z14.b, z0.b[3]\n"
- "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"zip2 z11.b, z8.b, z9.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z18.s, z14.b, z0.b[3]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"udot z19.s, z15.b, z0.b[3]\n"
"ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z12.b, z10.b, z8.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip1 z10.b, z10.b, z8.b\n"
"zip1 z8.b, z9.b, z10.b\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"udot z16.s, z8.b, z4.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"udot z17.s, z9.b, z4.b[0]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
- "udot z18.s, z10.b, z4.b[0]\n"
- "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"zip2 z15.b, z12.b, z13.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z13.b, z12.b, z13.b\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z18.s, z10.b, z4.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"udot z19.s, z11.b, z4.b[0]\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z8.b, z14.b, z12.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip1 z14.b, z14.b, z12.b\n"
"zip1 z12.b, z13.b, z14.b\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"udot z16.s, z12.b, z4.b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"udot z17.s, z13.b, z4.b[1]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
- "udot z18.s, z14.b, z4.b[1]\n"
- "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z18.s, z14.b, z4.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"udot z19.s, z15.b, z4.b[1]\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip2 z12.b, z10.b, z8.b\n"
@@ -427,15 +437,15 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z17.s, z13.b, z4.b[3]\n"
"udot z18.s, z14.b, z4.b[3]\n"
"udot z19.s, z15.b, z4.b[3]\n"
- "cbz %[blocks], 6f\n"
+ "cbz %[blocks], 5f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "subs %[blocks], %[blocks], #0x1\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
@@ -449,15 +459,15 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z17.s, z9.b, z0.b[0]\n"
"udot z18.s, z10.b, z0.b[0]\n"
"udot z19.s, z11.b, z0.b[0]\n"
- "b.eq 7f\n"
+ "b.eq 6f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "subs %[blocks], %[blocks], #0x1\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
@@ -471,13 +481,13 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z17.s, z13.b, z0.b[1]\n"
"udot z18.s, z14.b, z0.b[1]\n"
"udot z19.s, z15.b, z0.b[1]\n"
- "b.eq 8f\n"
+ "b.eq 7f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
@@ -492,31 +502,31 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z17.s, z9.b, z0.b[2]\n"
"udot z18.s, z10.b, z0.b[2]\n"
"udot z19.s, z11.b, z0.b[2]\n"
- "cbz %[odds], 9f\n"
+ "cbz %[odds], 8f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 10f\n"
+ "b.eq 9f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 11f\n"
+ "b.eq 10f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 12f\n"
- "11:\n"
+ "b 11f\n"
+ "10:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 12f\n"
- "10:\n"
+ "b 11f\n"
+ "9:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z14.b, #0\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "12:\n"
+ "11:\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"mov z12.b, #0\n"
@@ -530,33 +540,33 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z17.s, z13.b, z0.b[3]\n"
"udot z18.s, z14.b, z0.b[3]\n"
"udot z19.s, z15.b, z0.b[3]\n"
- "b 9f\n"
- "8:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "7:\n"
+ "cbz %[odds], 8f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 13f\n"
+ "b.eq 12f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 14f\n"
+ "b.eq 13f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 15f\n"
- "14:\n"
+ "b 14f\n"
+ "13:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 15f\n"
- "13:\n"
+ "b 14f\n"
+ "12:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z10.b, #0\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "15:\n"
+ "14:\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"mov z8.b, #0\n"
@@ -570,33 +580,33 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z17.s, z9.b, z0.b[2]\n"
"udot z18.s, z10.b, z0.b[2]\n"
"udot z19.s, z11.b, z0.b[2]\n"
- "b 9f\n"
- "7:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "6:\n"
+ "cbz %[odds], 8f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 16f\n"
+ "b.eq 15f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 17f\n"
+ "b.eq 16f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 18f\n"
- "17:\n"
+ "b 17f\n"
+ "16:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 18f\n"
- "16:\n"
+ "b 17f\n"
+ "15:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z14.b, #0\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "18:\n"
+ "17:\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"mov z12.b, #0\n"
@@ -610,33 +620,33 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z17.s, z13.b, z0.b[1]\n"
"udot z18.s, z14.b, z0.b[1]\n"
"udot z19.s, z15.b, z0.b[1]\n"
- "b 9f\n"
- "6:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "5:\n"
+ "cbz %[odds], 8f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 19f\n"
+ "b.eq 18f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 20f\n"
+ "b.eq 19f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 21f\n"
- "20:\n"
+ "b 20f\n"
+ "19:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 21f\n"
- "19:\n"
+ "b 20f\n"
+ "18:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z10.b, #0\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "21:\n"
+ "20:\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"mov z8.b, #0\n"
@@ -650,38 +660,38 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z17.s, z9.b, z0.b[0]\n"
"udot z18.s, z10.b, z0.b[0]\n"
"udot z19.s, z11.b, z0.b[0]\n"
- "b 9f\n"
- "5:\n"
+ "b 8f\n"
+ "4:\n"
"udot z16.s, z8.b, z0.b[0]\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
"udot z17.s, z9.b, z0.b[0]\n"
"ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
"udot z18.s, z10.b, z0.b[0]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
- "udot z19.s, z11.b, z0.b[0]\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z8.b, z14.b, z12.b\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z14.b, z14.b, z12.b\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z11.b, z0.b[0]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip1 z12.b, z13.b, z14.b\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"zip2 z13.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z14.b, z15.b, z8.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"udot z16.s, z12.b, z0.b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"udot z17.s, z13.b, z0.b[1]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
- "udot z18.s, z14.b, z0.b[1]\n"
- "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"udot z19.s, z15.b, z0.b[1]\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip2 z12.b, z10.b, z8.b\n"
@@ -708,15 +718,15 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z17.s, z13.b, z0.b[3]\n"
"udot z18.s, z14.b, z0.b[3]\n"
"udot z19.s, z15.b, z0.b[3]\n"
- "cbz %[blocks], 22f\n"
+ "cbz %[blocks], 21f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "subs %[blocks], %[blocks], #0x1\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
@@ -730,15 +740,15 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z17.s, z9.b, z4.b[0]\n"
"udot z18.s, z10.b, z4.b[0]\n"
"udot z19.s, z11.b, z4.b[0]\n"
- "b.eq 23f\n"
+ "b.eq 22f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "subs %[blocks], %[blocks], #0x1\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
@@ -752,13 +762,13 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z17.s, z13.b, z4.b[1]\n"
"udot z18.s, z14.b, z4.b[1]\n"
"udot z19.s, z15.b, z4.b[1]\n"
- "b.eq 24f\n"
+ "b.eq 23f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
@@ -773,31 +783,31 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z17.s, z9.b, z4.b[2]\n"
"udot z18.s, z10.b, z4.b[2]\n"
"udot z19.s, z11.b, z4.b[2]\n"
- "cbz %[odds], 9f\n"
+ "cbz %[odds], 8f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 25f\n"
+ "b.eq 24f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 26f\n"
+ "b.eq 25f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 27f\n"
- "26:\n"
+ "b 26f\n"
+ "25:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 27f\n"
- "25:\n"
+ "b 26f\n"
+ "24:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z14.b, #0\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "27:\n"
+ "26:\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"mov z12.b, #0\n"
@@ -811,33 +821,33 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z17.s, z13.b, z4.b[3]\n"
"udot z18.s, z14.b, z4.b[3]\n"
"udot z19.s, z15.b, z4.b[3]\n"
- "b 9f\n"
- "24:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "23:\n"
+ "cbz %[odds], 8f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 28f\n"
+ "b.eq 27f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 29f\n"
+ "b.eq 28f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 30f\n"
- "29:\n"
+ "b 29f\n"
+ "28:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 30f\n"
- "28:\n"
+ "b 29f\n"
+ "27:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z10.b, #0\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "30:\n"
+ "29:\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"mov z8.b, #0\n"
@@ -851,33 +861,33 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z17.s, z9.b, z4.b[2]\n"
"udot z18.s, z10.b, z4.b[2]\n"
"udot z19.s, z11.b, z4.b[2]\n"
- "b 9f\n"
- "23:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "22:\n"
+ "cbz %[odds], 8f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 31f\n"
+ "b.eq 30f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 32f\n"
+ "b.eq 31f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 33f\n"
- "32:\n"
+ "b 32f\n"
+ "31:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 33f\n"
- "31:\n"
+ "b 32f\n"
+ "30:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z14.b, #0\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "33:\n"
+ "32:\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"mov z12.b, #0\n"
@@ -891,33 +901,33 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z17.s, z13.b, z4.b[1]\n"
"udot z18.s, z14.b, z4.b[1]\n"
"udot z19.s, z15.b, z4.b[1]\n"
- "b 9f\n"
- "22:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "21:\n"
+ "cbz %[odds], 8f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 34f\n"
+ "b.eq 33f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 35f\n"
+ "b.eq 34f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 36f\n"
- "35:\n"
+ "b 35f\n"
+ "34:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 36f\n"
- "34:\n"
+ "b 35f\n"
+ "33:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z10.b, #0\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "36:\n"
+ "35:\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"mov z8.b, #0\n"
@@ -931,14 +941,14 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z17.s, z9.b, z4.b[0]\n"
"udot z18.s, z10.b, z4.b[0]\n"
"udot z19.s, z11.b, z4.b[0]\n"
- "9:\n"
+ "8:\n"
"st1w z16.s, p0, [%[c_ptr0]]\n"
"st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
"st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
"st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
"addvl %[c_ptr0], %[c_ptr0], #4\n"
: [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [b_ptr1] "+r" (b_ptr1), [b_ptr2] "+r" (b_ptr2), [b_ptr3] "+r" (b_ptr3), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds)
- : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers), [ldb] "r" (ldbb)
: "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
);
break;
@@ -971,103 +981,108 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"mov z21.s, #0\n"
"add %[a_ptr0], %[a_ptr0], #0x10\n"
- "mov z22.s, #0\n"
- "add a_ptr1, a_ptr1, #0x10\n"
"zip2 z11.b, z8.b, z9.b\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "mov z22.s, #0\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z23.s, #0\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip2 z12.b, z10.b, z8.b\n"
- "zip1 z10.b, z10.b, z8.b\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 2f\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
"1:\n"
"ld1rw z15.s, p7/z, [%[betaptr]]\n"
"ld1w z16.s, p0/z, [%[c_ptr0]]\n"
"ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
"ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
"ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
- "ld1w z20.s, p0/z, [c_ptr1]\n"
"mul z16.s, p7/m, z16.s, z15.s\n"
- "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+ "ld1w z20.s, p0/z, [c_ptr1]\n"
"mul z17.s, p7/m, z17.s, z15.s\n"
- "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+ "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
"mul z18.s, p7/m, z18.s, z15.s\n"
- "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+ "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
"mul z19.s, p7/m, z19.s, z15.s\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
"mul z20.s, p7/m, z20.s, z15.s\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
"mul z21.s, p7/m, z21.s, z15.s\n"
- "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
"mul z22.s, p7/m, z22.s, z15.s\n"
- "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
"mul z23.s, p7/m, z23.s, z15.s\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"add %[a_ptr0], %[a_ptr0], #0x10\n"
"add a_ptr1, a_ptr1, #0x10\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"zip2 z11.b, z8.b, z9.b\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip2 z12.b, z10.b, z8.b\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"zip1 z10.b, z10.b, z8.b\n"
- "2:\n"
- "cbz %[loops], 3f\n"
- "4:\n"
"zip1 z8.b, z9.b, z10.b\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
"zip2 z15.b, z12.b, z13.b\n"
+ "ld1rqb z5.b, p7/z, [a_ptr1]\n"
"zip1 z13.b, z12.b, z13.b\n"
- "udot z16.s, z8.b, z0.b[0]\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
"udot z20.s, z8.b, z1.b[0]\n"
- "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+ "subs %[loops], %[loops], #0x1\n"
"udot z17.s, z9.b, z0.b[0]\n"
- "ld1rqb z5.b, p7/z, [a_ptr1]\n"
- "udot z21.s, z9.b, z1.b[0]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"zip2 z8.b, z14.b, z12.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z14.b, z14.b, z12.b\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z21.s, z9.b, z1.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"udot z18.s, z10.b, z0.b[0]\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "udot z22.s, z10.b, z1.b[0]\n"
- "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"zip1 z12.b, z13.b, z14.b\n"
- "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "zip2 z13.b, z13.b, z14.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip2 z13.b, z13.b, z14.b\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
"zip1 z14.b, z15.b, z8.b\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z22.s, z10.b, z1.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"udot z19.s, z11.b, z0.b[0]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"udot z23.s, z11.b, z1.b[0]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z11.b, z8.b, z9.b\n"
- "add a_ptr1, a_ptr1, #0x20\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
"udot z16.s, z12.b, z0.b[1]\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"udot z20.s, z12.b, z1.b[1]\n"
- "subs %[loops], %[loops], #0x1\n"
"zip2 z12.b, z10.b, z8.b\n"
"zip1 z10.b, z10.b, z8.b\n"
"udot z17.s, z13.b, z0.b[1]\n"
@@ -1092,148 +1107,148 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z16.s, z8.b, z0.b[2]\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"udot z20.s, z8.b, z1.b[2]\n"
- "udot z17.s, z9.b, z0.b[2]\n"
"zip2 z8.b, z14.b, z12.b\n"
"zip1 z14.b, z14.b, z12.b\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
"udot z21.s, z9.b, z1.b[2]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"udot z18.s, z10.b, z0.b[2]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "udot z22.s, z10.b, z1.b[2]\n"
- "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"zip1 z12.b, z13.b, z14.b\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z22.s, z10.b, z1.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"udot z19.s, z11.b, z0.b[2]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"udot z23.s, z11.b, z1.b[2]\n"
- "udot z16.s, z12.b, z0.b[3]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
- "udot z20.s, z12.b, z1.b[3]\n"
+ "udot z16.s, z12.b, z0.b[3]\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z12.b, z1.b[3]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
"udot z17.s, z13.b, z0.b[3]\n"
"udot z21.s, z13.b, z1.b[3]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
- "zip2 z12.b, z10.b, z8.b\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "zip1 z10.b, z10.b, z8.b\n"
"udot z18.s, z14.b, z0.b[3]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
"udot z22.s, z14.b, z1.b[3]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"udot z19.s, z15.b, z0.b[3]\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
"udot z23.s, z15.b, z1.b[3]\n"
"ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n"
- "zip1 z8.b, z9.b, z10.b\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "zip2 z9.b, z9.b, z10.b\n"
- "zip1 z10.b, z11.b, z12.b\n"
- "zip2 z11.b, z11.b, z12.b\n"
- "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "udot z16.s, z8.b, z4.b[0]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
- "udot z20.s, z8.b, z5.b[0]\n"
- "udot z17.s, z9.b, z4.b[0]\n"
"zip2 z15.b, z12.b, z13.b\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"zip1 z13.b, z12.b, z13.b\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z8.b, z4.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "udot z20.s, z8.b, z5.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "udot z17.s, z9.b, z4.b[0]\n"
"udot z21.s, z9.b, z5.b[0]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"udot z18.s, z10.b, z4.b[0]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "udot z22.s, z10.b, z5.b[0]\n"
- "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "zip2 z8.b, z14.b, z12.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "zip1 z14.b, z14.b, z12.b\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "udot z19.s, z11.b, z4.b[0]\n"
- "udot z23.s, z11.b, z5.b[0]\n"
"zip1 z12.b, z13.b, z14.b\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z22.s, z10.b, z5.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z11.b, z4.b[0]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
- "udot z16.s, z12.b, z4.b[1]\n"
- "udot z20.s, z12.b, z5.b[1]\n"
- "udot z17.s, z13.b, z4.b[1]\n"
+ "udot z23.s, z11.b, z5.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z12.b, z4.b[1]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z12.b, z5.b[1]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "udot z17.s, z13.b, z4.b[1]\n"
"udot z21.s, z13.b, z5.b[1]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"udot z18.s, z14.b, z4.b[1]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "udot z22.s, z14.b, z5.b[1]\n"
- "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "zip2 z12.b, z10.b, z8.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "zip1 z10.b, z10.b, z8.b\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "udot z19.s, z15.b, z4.b[1]\n"
- "udot z23.s, z15.b, z5.b[1]\n"
"zip1 z8.b, z9.b, z10.b\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z22.s, z14.b, z5.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z15.b, z4.b[1]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
- "udot z16.s, z8.b, z4.b[2]\n"
- "udot z20.s, z8.b, z5.b[2]\n"
- "udot z17.s, z9.b, z4.b[2]\n"
+ "udot z23.s, z15.b, z5.b[1]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z8.b, z4.b[2]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z8.b, z5.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "udot z17.s, z9.b, z4.b[2]\n"
"udot z21.s, z9.b, z5.b[2]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"udot z18.s, z10.b, z4.b[2]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "udot z22.s, z10.b, z5.b[2]\n"
- "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "zip2 z8.b, z14.b, z12.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "zip1 z14.b, z14.b, z12.b\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "udot z19.s, z11.b, z4.b[2]\n"
- "udot z23.s, z11.b, z5.b[2]\n"
"zip1 z12.b, z13.b, z14.b\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z22.s, z10.b, z5.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z11.b, z4.b[2]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
- "udot z16.s, z12.b, z4.b[3]\n"
- "udot z20.s, z12.b, z5.b[3]\n"
- "udot z17.s, z13.b, z4.b[3]\n"
+ "udot z23.s, z11.b, z5.b[2]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
- "udot z21.s, z13.b, z5.b[3]\n"
- "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
- "udot z18.s, z14.b, z4.b[3]\n"
+ "udot z16.s, z12.b, z4.b[3]\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "udot z22.s, z14.b, z5.b[3]\n"
- "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "udot z20.s, z12.b, z5.b[3]\n"
"zip2 z12.b, z10.b, z8.b\n"
"zip1 z10.b, z10.b, z8.b\n"
- "udot z19.s, z15.b, z4.b[3]\n"
- "udot z23.s, z15.b, z5.b[3]\n"
- "b.ne 4b\n"
- "3:\n"
+ "udot z17.s, z13.b, z4.b[3]\n"
+ "udot z21.s, z13.b, z5.b[3]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z14.b, z4.b[3]\n"
"zip1 z8.b, z9.b, z10.b\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z22.s, z14.b, z5.b[3]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z15.b, z4.b[3]\n"
+ "udot z23.s, z15.b, z5.b[3]\n"
+ "b.ne 3b\n"
+ "2:\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
- "cbz %[regs], 5f\n"
+ "cbz %[regs], 4f\n"
"udot z16.s, z8.b, z0.b[0]\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
"udot z20.s, z8.b, z1.b[0]\n"
@@ -1245,13 +1260,13 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"zip2 z8.b, z14.b, z12.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z14.b, z14.b, z12.b\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"udot z18.s, z10.b, z0.b[0]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"udot z22.s, z10.b, z1.b[0]\n"
- "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
- "zip1 z12.b, z13.b, z14.b\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip2 z13.b, z13.b, z14.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z14.b, z15.b, z8.b\n"
@@ -1261,142 +1276,142 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z19.s, z11.b, z0.b[0]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"udot z23.s, z11.b, z1.b[0]\n"
- "udot z16.s, z12.b, z0.b[1]\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
- "udot z20.s, z12.b, z1.b[1]\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z12.b, z1.b[1]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
"udot z17.s, z13.b, z0.b[1]\n"
"udot z21.s, z13.b, z1.b[1]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
- "zip2 z12.b, z10.b, z8.b\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "zip1 z10.b, z10.b, z8.b\n"
"udot z18.s, z14.b, z0.b[1]\n"
- "udot z22.s, z14.b, z1.b[1]\n"
- "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "udot z19.s, z15.b, z0.b[1]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z8.b, z9.b, z10.b\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "udot z23.s, z15.b, z1.b[1]\n"
+ "udot z22.s, z14.b, z1.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
- "udot z16.s, z8.b, z0.b[2]\n"
- "udot z20.s, z8.b, z1.b[2]\n"
+ "udot z23.s, z15.b, z1.b[1]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
- "udot z17.s, z9.b, z0.b[2]\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z8.b, z1.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
"udot z21.s, z9.b, z1.b[2]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"udot z18.s, z10.b, z0.b[2]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "zip2 z8.b, z14.b, z12.b\n"
- "zip1 z14.b, z14.b, z12.b\n"
- "udot z22.s, z10.b, z1.b[2]\n"
- "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "udot z19.s, z11.b, z0.b[2]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "udot z23.s, z11.b, z1.b[2]\n"
"zip1 z12.b, z13.b, z14.b\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z22.s, z10.b, z1.b[2]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
- "udot z16.s, z12.b, z0.b[3]\n"
- "udot z20.s, z12.b, z1.b[3]\n"
- "udot z17.s, z13.b, z0.b[3]\n"
+ "udot z23.s, z11.b, z1.b[2]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z12.b, z0.b[3]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z12.b, z1.b[3]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
+ "udot z17.s, z13.b, z0.b[3]\n"
"udot z21.s, z13.b, z1.b[3]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"udot z18.s, z14.b, z0.b[3]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "udot z22.s, z14.b, z1.b[3]\n"
- "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "zip2 z12.b, z10.b, z8.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "zip1 z10.b, z10.b, z8.b\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "udot z19.s, z15.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
- "udot z23.s, z15.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
"zip1 z8.b, z9.b, z10.b\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z22.s, z14.b, z1.b[3]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z15.b, z0.b[3]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
+ "udot z23.s, z15.b, z1.b[3]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
+ "zip2 z15.b, z12.b, z13.b\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "zip1 z13.b, z12.b, z13.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
"udot z16.s, z8.b, z4.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"udot z20.s, z8.b, z5.b[0]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
"udot z17.s, z9.b, z4.b[0]\n"
- "zip2 z15.b, z12.b, z13.b\n"
- "zip1 z13.b, z12.b, z13.b\n"
- "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
"udot z21.s, z9.b, z5.b[0]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"udot z18.s, z10.b, z4.b[0]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "udot z22.s, z10.b, z5.b[0]\n"
- "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "zip2 z8.b, z14.b, z12.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "zip1 z14.b, z14.b, z12.b\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "udot z19.s, z11.b, z4.b[0]\n"
- "udot z23.s, z11.b, z5.b[0]\n"
"zip1 z12.b, z13.b, z14.b\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z22.s, z10.b, z5.b[0]\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z11.b, z4.b[0]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
- "udot z16.s, z12.b, z4.b[1]\n"
- "udot z20.s, z12.b, z5.b[1]\n"
- "udot z17.s, z13.b, z4.b[1]\n"
+ "udot z23.s, z11.b, z5.b[0]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
- "udot z21.s, z13.b, z5.b[1]\n"
- "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
- "udot z18.s, z14.b, z4.b[1]\n"
+ "udot z16.s, z12.b, z4.b[1]\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "udot z22.s, z14.b, z5.b[1]\n"
- "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "udot z20.s, z12.b, z5.b[1]\n"
"zip2 z12.b, z10.b, z8.b\n"
"zip1 z10.b, z10.b, z8.b\n"
- "udot z19.s, z15.b, z4.b[1]\n"
- "udot z23.s, z15.b, z5.b[1]\n"
+ "udot z17.s, z13.b, z4.b[1]\n"
+ "udot z21.s, z13.b, z5.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z14.b, z4.b[1]\n"
"zip1 z8.b, z9.b, z10.b\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "udot z16.s, z8.b, z4.b[2]\n"
- "udot z20.s, z8.b, z5.b[2]\n"
- "udot z17.s, z9.b, z4.b[2]\n"
+ "udot z22.s, z14.b, z5.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z15.b, z4.b[1]\n"
+ "udot z23.s, z15.b, z5.b[1]\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z8.b, z4.b[2]\n"
+ "udot z20.s, z8.b, z5.b[2]\n"
+ "udot z17.s, z9.b, z4.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
"udot z21.s, z9.b, z5.b[2]\n"
"udot z18.s, z10.b, z4.b[2]\n"
"udot z22.s, z10.b, z5.b[2]\n"
- "udot z19.s, z11.b, z4.b[2]\n"
- "zip2 z8.b, z14.b, z12.b\n"
- "zip1 z14.b, z14.b, z12.b\n"
- "udot z23.s, z11.b, z5.b[2]\n"
"zip1 z12.b, z13.b, z14.b\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
+ "udot z19.s, z11.b, z4.b[2]\n"
+ "udot z23.s, z11.b, z5.b[2]\n"
"udot z16.s, z12.b, z4.b[3]\n"
"udot z20.s, z12.b, z5.b[3]\n"
"udot z17.s, z13.b, z4.b[3]\n"
@@ -1405,15 +1420,15 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z22.s, z14.b, z5.b[3]\n"
"udot z19.s, z15.b, z4.b[3]\n"
"udot z23.s, z15.b, z5.b[3]\n"
- "cbz %[blocks], 6f\n"
+ "cbz %[blocks], 5f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "subs %[blocks], %[blocks], #0x1\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
@@ -1431,15 +1446,15 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z22.s, z10.b, z1.b[0]\n"
"udot z19.s, z11.b, z0.b[0]\n"
"udot z23.s, z11.b, z1.b[0]\n"
- "b.eq 7f\n"
+ "b.eq 6f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "subs %[blocks], %[blocks], #0x1\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
@@ -1457,13 +1472,13 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z22.s, z14.b, z1.b[1]\n"
"udot z19.s, z15.b, z0.b[1]\n"
"udot z23.s, z15.b, z1.b[1]\n"
- "b.eq 8f\n"
+ "b.eq 7f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
@@ -1482,31 +1497,31 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z22.s, z10.b, z1.b[2]\n"
"udot z19.s, z11.b, z0.b[2]\n"
"udot z23.s, z11.b, z1.b[2]\n"
- "cbz %[odds], 9f\n"
+ "cbz %[odds], 8f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 10f\n"
+ "b.eq 9f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 11f\n"
+ "b.eq 10f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 12f\n"
- "11:\n"
+ "b 11f\n"
+ "10:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 12f\n"
- "10:\n"
+ "b 11f\n"
+ "9:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z14.b, #0\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "12:\n"
+ "11:\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"mov z12.b, #0\n"
@@ -1524,33 +1539,33 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z22.s, z14.b, z1.b[3]\n"
"udot z19.s, z15.b, z0.b[3]\n"
"udot z23.s, z15.b, z1.b[3]\n"
- "b 9f\n"
- "8:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "7:\n"
+ "cbz %[odds], 8f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 13f\n"
+ "b.eq 12f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 14f\n"
+ "b.eq 13f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 15f\n"
- "14:\n"
+ "b 14f\n"
+ "13:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 15f\n"
- "13:\n"
+ "b 14f\n"
+ "12:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z10.b, #0\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "15:\n"
+ "14:\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"mov z8.b, #0\n"
@@ -1568,33 +1583,33 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z22.s, z10.b, z1.b[2]\n"
"udot z19.s, z11.b, z0.b[2]\n"
"udot z23.s, z11.b, z1.b[2]\n"
- "b 9f\n"
- "7:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "6:\n"
+ "cbz %[odds], 8f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 16f\n"
+ "b.eq 15f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 17f\n"
+ "b.eq 16f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 18f\n"
- "17:\n"
+ "b 17f\n"
+ "16:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 18f\n"
- "16:\n"
+ "b 17f\n"
+ "15:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z14.b, #0\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "18:\n"
+ "17:\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"mov z12.b, #0\n"
@@ -1612,33 +1627,33 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z22.s, z14.b, z1.b[1]\n"
"udot z19.s, z15.b, z0.b[1]\n"
"udot z23.s, z15.b, z1.b[1]\n"
- "b 9f\n"
- "6:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "5:\n"
+ "cbz %[odds], 8f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 19f\n"
+ "b.eq 18f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 20f\n"
+ "b.eq 19f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 21f\n"
- "20:\n"
+ "b 20f\n"
+ "19:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 21f\n"
- "19:\n"
+ "b 20f\n"
+ "18:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z10.b, #0\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "21:\n"
+ "20:\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"mov z8.b, #0\n"
@@ -1656,8 +1671,8 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z22.s, z10.b, z1.b[0]\n"
"udot z19.s, z11.b, z0.b[0]\n"
"udot z23.s, z11.b, z1.b[0]\n"
- "b 9f\n"
- "5:\n"
+ "b 8f\n"
+ "4:\n"
"udot z16.s, z8.b, z0.b[0]\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
"udot z20.s, z8.b, z1.b[0]\n"
@@ -1669,13 +1684,13 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"zip2 z8.b, z14.b, z12.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z14.b, z14.b, z12.b\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"udot z18.s, z10.b, z0.b[0]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"udot z22.s, z10.b, z1.b[0]\n"
- "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
- "zip1 z12.b, z13.b, z14.b\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip2 z13.b, z13.b, z14.b\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z14.b, z15.b, z8.b\n"
@@ -1685,44 +1700,44 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z19.s, z11.b, z0.b[0]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"udot z23.s, z11.b, z1.b[0]\n"
- "udot z16.s, z12.b, z0.b[1]\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
- "udot z20.s, z12.b, z1.b[1]\n"
+ "udot z16.s, z12.b, z0.b[1]\n"
"add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "udot z20.s, z12.b, z1.b[1]\n"
+ "zip2 z12.b, z10.b, z8.b\n"
+ "zip1 z10.b, z10.b, z8.b\n"
"udot z17.s, z13.b, z0.b[1]\n"
"udot z21.s, z13.b, z1.b[1]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
- "zip2 z12.b, z10.b, z8.b\n"
- "zip1 z10.b, z10.b, z8.b\n"
"udot z18.s, z14.b, z0.b[1]\n"
- "udot z22.s, z14.b, z1.b[1]\n"
- "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "udot z19.s, z15.b, z0.b[1]\n"
- "udot z23.s, z15.b, z1.b[1]\n"
"zip1 z8.b, z9.b, z10.b\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "udot z16.s, z8.b, z0.b[2]\n"
- "udot z20.s, z8.b, z1.b[2]\n"
- "udot z17.s, z9.b, z0.b[2]\n"
+ "udot z22.s, z14.b, z1.b[1]\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "udot z19.s, z15.b, z0.b[1]\n"
+ "udot z23.s, z15.b, z1.b[1]\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+ "udot z16.s, z8.b, z0.b[2]\n"
+ "udot z20.s, z8.b, z1.b[2]\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
+ "zip2 z8.b, z14.b, z12.b\n"
+ "zip1 z14.b, z14.b, z12.b\n"
"udot z21.s, z9.b, z1.b[2]\n"
"udot z18.s, z10.b, z0.b[2]\n"
"udot z22.s, z10.b, z1.b[2]\n"
- "udot z19.s, z11.b, z0.b[2]\n"
- "zip2 z8.b, z14.b, z12.b\n"
- "zip1 z14.b, z14.b, z12.b\n"
- "udot z23.s, z11.b, z1.b[2]\n"
"zip1 z12.b, z13.b, z14.b\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
+ "udot z19.s, z11.b, z0.b[2]\n"
+ "udot z23.s, z11.b, z1.b[2]\n"
"udot z16.s, z12.b, z0.b[3]\n"
"udot z20.s, z12.b, z1.b[3]\n"
"udot z17.s, z13.b, z0.b[3]\n"
@@ -1731,15 +1746,15 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z22.s, z14.b, z1.b[3]\n"
"udot z19.s, z15.b, z0.b[3]\n"
"udot z23.s, z15.b, z1.b[3]\n"
- "cbz %[blocks], 22f\n"
+ "cbz %[blocks], 21f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "subs %[blocks], %[blocks], #0x1\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
@@ -1757,15 +1772,15 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z22.s, z10.b, z5.b[0]\n"
"udot z19.s, z11.b, z4.b[0]\n"
"udot z23.s, z11.b, z5.b[0]\n"
- "b.eq 23f\n"
+ "b.eq 22f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "subs %[blocks], %[blocks], #0x1\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
@@ -1783,13 +1798,13 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z22.s, z14.b, z5.b[1]\n"
"udot z19.s, z15.b, z4.b[1]\n"
"udot z23.s, z15.b, z5.b[1]\n"
- "b.eq 24f\n"
+ "b.eq 23f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
@@ -1808,31 +1823,31 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z22.s, z10.b, z5.b[2]\n"
"udot z19.s, z11.b, z4.b[2]\n"
"udot z23.s, z11.b, z5.b[2]\n"
- "cbz %[odds], 9f\n"
+ "cbz %[odds], 8f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 25f\n"
+ "b.eq 24f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 26f\n"
+ "b.eq 25f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 27f\n"
- "26:\n"
+ "b 26f\n"
+ "25:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 27f\n"
- "25:\n"
+ "b 26f\n"
+ "24:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z14.b, #0\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "27:\n"
+ "26:\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"mov z12.b, #0\n"
@@ -1850,33 +1865,33 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z22.s, z14.b, z5.b[3]\n"
"udot z19.s, z15.b, z4.b[3]\n"
"udot z23.s, z15.b, z5.b[3]\n"
- "b 9f\n"
- "24:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "23:\n"
+ "cbz %[odds], 8f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 28f\n"
+ "b.eq 27f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 29f\n"
+ "b.eq 28f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 30f\n"
- "29:\n"
+ "b 29f\n"
+ "28:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 30f\n"
- "28:\n"
+ "b 29f\n"
+ "27:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z10.b, #0\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "30:\n"
+ "29:\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"mov z8.b, #0\n"
@@ -1894,33 +1909,33 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z22.s, z10.b, z5.b[2]\n"
"udot z19.s, z11.b, z4.b[2]\n"
"udot z23.s, z11.b, z5.b[2]\n"
- "b 9f\n"
- "23:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "22:\n"
+ "cbz %[odds], 8f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 31f\n"
+ "b.eq 30f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 32f\n"
+ "b.eq 31f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 33f\n"
- "32:\n"
+ "b 32f\n"
+ "31:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 33f\n"
- "31:\n"
+ "b 32f\n"
+ "30:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z14.b, #0\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "33:\n"
+ "32:\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"mov z12.b, #0\n"
@@ -1938,33 +1953,33 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z22.s, z14.b, z5.b[1]\n"
"udot z19.s, z15.b, z4.b[1]\n"
"udot z23.s, z15.b, z5.b[1]\n"
- "b 9f\n"
- "22:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "21:\n"
+ "cbz %[odds], 8f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 34f\n"
+ "b.eq 33f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 35f\n"
+ "b.eq 34f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 36f\n"
- "35:\n"
+ "b 35f\n"
+ "34:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 36f\n"
- "34:\n"
+ "b 35f\n"
+ "33:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z10.b, #0\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "36:\n"
+ "35:\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"mov z8.b, #0\n"
@@ -1982,7 +1997,7 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z22.s, z10.b, z5.b[0]\n"
"udot z19.s, z11.b, z4.b[0]\n"
"udot z23.s, z11.b, z5.b[0]\n"
- "9:\n"
+ "8:\n"
"st1w z16.s, p0, [%[c_ptr0]]\n"
"st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
"st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
@@ -1995,7 +2010,7 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
".unreq a_ptr1\n"
".unreq c_ptr1\n"
: [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [b_ptr1] "+r" (b_ptr1), [b_ptr2] "+r" (b_ptr2), [b_ptr3] "+r" (b_ptr3), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds)
- : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers), [ldb] "r" (ldbb)
: "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory"
);
break;
@@ -2007,11 +2022,11 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"c_ptr2 .req X3\n"
"add a_ptr1, %[a_ptr0], %[lda]\n"
"add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
"whilelt p6.b, %[temp], %[leftovers]\n"
"whilelt p0.s, %[temp], %[width]\n"
"whilelt p4.b, %[temp], %[width]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
"incw %[temp], all, mul #1\n"
"ptrue p7.b\n"
"whilelt p1.s, %[temp], %[width]\n"
@@ -2034,116 +2049,122 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"mov z22.s, #0\n"
"add %[a_ptr0], %[a_ptr0], #0x10\n"
- "mov z23.s, #0\n"
- "add a_ptr1, a_ptr1, #0x10\n"
"zip2 z11.b, z8.b, z9.b\n"
- "add a_ptr2, a_ptr2, #0x10\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "mov z23.s, #0\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
"mov z24.s, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
- "mov z25.s, #0\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z12.b, z10.b, z8.b\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z10.b, z10.b, z8.b\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "mov z26.s, #0\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "mov z25.s, #0\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+ "mov z26.s, #0\n"
+ "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "zip1 z8.b, z9.b, z10.b\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "zip2 z9.b, z9.b, z10.b\n"
+ "zip1 z10.b, z11.b, z12.b\n"
+ "zip2 z11.b, z11.b, z12.b\n"
+ "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
"mov z27.s, #0\n"
- "b 2f\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
"1:\n"
"ld1rw z15.s, p7/z, [%[betaptr]]\n"
"ld1w z16.s, p0/z, [%[c_ptr0]]\n"
"ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
"ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
"ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
- "ld1w z20.s, p0/z, [c_ptr1]\n"
"mul z16.s, p7/m, z16.s, z15.s\n"
- "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+ "ld1w z20.s, p0/z, [c_ptr1]\n"
"mul z17.s, p7/m, z17.s, z15.s\n"
- "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+ "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
"mul z18.s, p7/m, z18.s, z15.s\n"
- "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+ "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
"mul z19.s, p7/m, z19.s, z15.s\n"
- "ld1w z24.s, p0/z, [c_ptr2]\n"
+ "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
"mul z20.s, p7/m, z20.s, z15.s\n"
- "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
+ "ld1w z24.s, p0/z, [c_ptr2]\n"
"mul z21.s, p7/m, z21.s, z15.s\n"
- "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
+ "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
"mul z22.s, p7/m, z22.s, z15.s\n"
- "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
+ "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
"mul z23.s, p7/m, z23.s, z15.s\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
"mul z24.s, p7/m, z24.s, z15.s\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
"mul z25.s, p7/m, z25.s, z15.s\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
"mul z26.s, p7/m, z26.s, z15.s\n"
- "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
"mul z27.s, p7/m, z27.s, z15.s\n"
+ "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
- "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"add a_ptr1, a_ptr1, #0x10\n"
- "add a_ptr2, a_ptr2, #0x10\n"
"zip2 z11.b, z8.b, z9.b\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip2 z12.b, z10.b, z8.b\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"zip1 z10.b, z10.b, z8.b\n"
- "2:\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip1 z8.b, z9.b, z10.b\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "cbz %[loops], 3f\n"
- "4:\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ "udot z16.s, z8.b, z0.b[0]\n"
+ "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
"zip2 z15.b, z12.b, z13.b\n"
+ "ld1rqb z5.b, p7/z, [a_ptr1]\n"
"zip1 z13.b, z12.b, z13.b\n"
- "udot z16.s, z8.b, z0.b[0]\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
"udot z20.s, z8.b, z1.b[0]\n"
- "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
- "udot z24.s, z8.b, z2.b[0]\n"
- "ld1rqb z5.b, p7/z, [a_ptr1]\n"
- "udot z17.s, z9.b, z0.b[0]\n"
"ld1rqb z6.b, p7/z, [a_ptr2]\n"
- "udot z21.s, z9.b, z1.b[0]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z24.s, z8.b, z2.b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
"zip2 z8.b, z14.b, z12.b\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "udot z25.s, z9.b, z2.b[0]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "udot z18.s, z10.b, z0.b[0]\n"
- "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"zip1 z12.b, z13.b, z14.b\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip2 z13.b, z13.b, z14.b\n"
"add %[a_ptr0], %[a_ptr0], #0x20\n"
"zip1 z14.b, z15.b, z8.b\n"
"add a_ptr1, a_ptr1, #0x20\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "udot z22.s, z10.b, z1.b[0]\n"
+ "udot z21.s, z9.b, z1.b[0]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z25.s, z9.b, z2.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "udot z22.s, z10.b, z1.b[0]\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
"udot z26.s, z10.b, z2.b[0]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"udot z19.s, z11.b, z0.b[0]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"udot z23.s, z11.b, z1.b[0]\n"
- "add a_ptr2, a_ptr2, #0x20\n"
"udot z27.s, z11.b, z2.b[0]\n"
- "subs %[loops], %[loops], #0x1\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
@@ -2157,15 +2178,15 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z21.s, z13.b, z1.b[1]\n"
"udot z25.s, z13.b, z2.b[1]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
- "udot z18.s, z14.b, z0.b[1]\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z8.b, z9.b, z10.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "udot z22.s, z14.b, z1.b[1]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z22.s, z14.b, z1.b[1]\n"
"udot z26.s, z14.b, z2.b[1]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"udot z19.s, z15.b, z0.b[1]\n"
@@ -2185,15 +2206,15 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z21.s, z9.b, z1.b[2]\n"
"udot z25.s, z9.b, z2.b[2]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
- "udot z18.s, z10.b, z0.b[2]\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "udot z22.s, z10.b, z1.b[2]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z22.s, z10.b, z1.b[2]\n"
"udot z26.s, z10.b, z2.b[2]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"udot z19.s, z11.b, z0.b[2]\n"
@@ -2213,15 +2234,15 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z21.s, z13.b, z1.b[3]\n"
"udot z25.s, z13.b, z2.b[3]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
- "udot z18.s, z14.b, z0.b[3]\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z8.b, z9.b, z10.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "udot z22.s, z14.b, z1.b[3]\n"
+ "udot z18.s, z14.b, z0.b[3]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z22.s, z14.b, z1.b[3]\n"
"udot z26.s, z14.b, z2.b[3]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"udot z19.s, z15.b, z0.b[3]\n"
@@ -2244,15 +2265,15 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z21.s, z9.b, z5.b[0]\n"
"udot z25.s, z9.b, z6.b[0]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
- "udot z18.s, z10.b, z4.b[0]\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "udot z22.s, z10.b, z5.b[0]\n"
+ "udot z18.s, z10.b, z4.b[0]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z22.s, z10.b, z5.b[0]\n"
"udot z26.s, z10.b, z6.b[0]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"udot z19.s, z11.b, z4.b[0]\n"
@@ -2272,15 +2293,15 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z21.s, z13.b, z5.b[1]\n"
"udot z25.s, z13.b, z6.b[1]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
- "udot z18.s, z14.b, z4.b[1]\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z8.b, z9.b, z10.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "udot z22.s, z14.b, z5.b[1]\n"
+ "udot z18.s, z14.b, z4.b[1]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z22.s, z14.b, z5.b[1]\n"
"udot z26.s, z14.b, z6.b[1]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"udot z19.s, z15.b, z4.b[1]\n"
@@ -2300,15 +2321,15 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z21.s, z9.b, z5.b[2]\n"
"udot z25.s, z9.b, z6.b[2]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
- "udot z18.s, z10.b, z4.b[2]\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "udot z22.s, z10.b, z5.b[2]\n"
+ "udot z18.s, z10.b, z4.b[2]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z22.s, z10.b, z5.b[2]\n"
"udot z26.s, z10.b, z6.b[2]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"udot z19.s, z11.b, z4.b[2]\n"
@@ -2328,23 +2349,23 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z21.s, z13.b, z5.b[3]\n"
"udot z25.s, z13.b, z6.b[3]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
- "udot z18.s, z14.b, z4.b[3]\n"
"zip1 z8.b, z9.b, z10.b\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z18.s, z14.b, z4.b[3]\n"
"udot z22.s, z14.b, z5.b[3]\n"
"udot z26.s, z14.b, z6.b[3]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"udot z19.s, z15.b, z4.b[3]\n"
"udot z23.s, z15.b, z5.b[3]\n"
"udot z27.s, z15.b, z6.b[3]\n"
- "b.ne 4b\n"
- "3:\n"
+ "b.ne 3b\n"
+ "2:\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
- "cbz %[regs], 5f\n"
+ "cbz %[regs], 4f\n"
"udot z16.s, z8.b, z0.b[0]\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
"udot z20.s, z8.b, z1.b[0]\n"
@@ -2353,24 +2374,24 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"ld1rqb z5.b, p7/z, [a_ptr1]\n"
"udot z17.s, z9.b, z0.b[0]\n"
"ld1rqb z6.b, p7/z, [a_ptr2]\n"
- "udot z21.s, z9.b, z1.b[0]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"zip2 z8.b, z14.b, z12.b\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "udot z21.s, z9.b, z1.b[0]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "udot z25.s, z9.b, z2.b[0]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "udot z18.s, z10.b, z0.b[0]\n"
- "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"zip1 z12.b, z13.b, z14.b\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "udot z22.s, z10.b, z1.b[0]\n"
+ "udot z25.s, z9.b, z2.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z22.s, z10.b, z1.b[0]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"udot z26.s, z10.b, z2.b[0]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"udot z19.s, z11.b, z0.b[0]\n"
@@ -2390,15 +2411,15 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z21.s, z13.b, z1.b[1]\n"
"udot z25.s, z13.b, z2.b[1]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
- "udot z18.s, z14.b, z0.b[1]\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z8.b, z9.b, z10.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "udot z22.s, z14.b, z1.b[1]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z22.s, z14.b, z1.b[1]\n"
"udot z26.s, z14.b, z2.b[1]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"udot z19.s, z15.b, z0.b[1]\n"
@@ -2418,15 +2439,15 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z21.s, z9.b, z1.b[2]\n"
"udot z25.s, z9.b, z2.b[2]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
- "udot z18.s, z10.b, z0.b[2]\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "udot z22.s, z10.b, z1.b[2]\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z22.s, z10.b, z1.b[2]\n"
"udot z26.s, z10.b, z2.b[2]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"udot z19.s, z11.b, z0.b[2]\n"
@@ -2446,15 +2467,15 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z21.s, z13.b, z1.b[3]\n"
"udot z25.s, z13.b, z2.b[3]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
- "udot z18.s, z14.b, z0.b[3]\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z8.b, z9.b, z10.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "udot z22.s, z14.b, z1.b[3]\n"
+ "udot z18.s, z14.b, z0.b[3]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z22.s, z14.b, z1.b[3]\n"
"udot z26.s, z14.b, z2.b[3]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"udot z19.s, z15.b, z0.b[3]\n"
@@ -2477,15 +2498,15 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z21.s, z9.b, z5.b[0]\n"
"udot z25.s, z9.b, z6.b[0]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
- "udot z18.s, z10.b, z4.b[0]\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "udot z22.s, z10.b, z5.b[0]\n"
+ "udot z18.s, z10.b, z4.b[0]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z22.s, z10.b, z5.b[0]\n"
"udot z26.s, z10.b, z6.b[0]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"udot z19.s, z11.b, z4.b[0]\n"
@@ -2505,12 +2526,12 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z21.s, z13.b, z5.b[1]\n"
"udot z25.s, z13.b, z6.b[1]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
- "udot z18.s, z14.b, z4.b[1]\n"
"zip1 z8.b, z9.b, z10.b\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z18.s, z14.b, z4.b[1]\n"
"udot z22.s, z14.b, z5.b[1]\n"
"udot z26.s, z14.b, z6.b[1]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
@@ -2523,17 +2544,17 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z16.s, z8.b, z4.b[2]\n"
"udot z20.s, z8.b, z5.b[2]\n"
"udot z24.s, z8.b, z6.b[2]\n"
- "udot z17.s, z9.b, z4.b[2]\n"
"zip2 z8.b, z14.b, z12.b\n"
"zip1 z14.b, z14.b, z12.b\n"
+ "udot z17.s, z9.b, z4.b[2]\n"
"udot z21.s, z9.b, z5.b[2]\n"
"udot z25.s, z9.b, z6.b[2]\n"
- "udot z18.s, z10.b, z4.b[2]\n"
- "udot z22.s, z10.b, z5.b[2]\n"
"zip1 z12.b, z13.b, z14.b\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
+ "udot z18.s, z10.b, z4.b[2]\n"
+ "udot z22.s, z10.b, z5.b[2]\n"
"udot z26.s, z10.b, z6.b[2]\n"
"udot z19.s, z11.b, z4.b[2]\n"
"udot z23.s, z11.b, z5.b[2]\n"
@@ -2550,15 +2571,15 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z19.s, z15.b, z4.b[3]\n"
"udot z23.s, z15.b, z5.b[3]\n"
"udot z27.s, z15.b, z6.b[3]\n"
- "cbz %[blocks], 6f\n"
+ "cbz %[blocks], 5f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "subs %[blocks], %[blocks], #0x1\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
@@ -2580,15 +2601,15 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z19.s, z11.b, z0.b[0]\n"
"udot z23.s, z11.b, z1.b[0]\n"
"udot z27.s, z11.b, z2.b[0]\n"
- "b.eq 7f\n"
+ "b.eq 6f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "subs %[blocks], %[blocks], #0x1\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
@@ -2610,13 +2631,13 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z19.s, z15.b, z0.b[1]\n"
"udot z23.s, z15.b, z1.b[1]\n"
"udot z27.s, z15.b, z2.b[1]\n"
- "b.eq 8f\n"
+ "b.eq 7f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
@@ -2639,31 +2660,31 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z19.s, z11.b, z0.b[2]\n"
"udot z23.s, z11.b, z1.b[2]\n"
"udot z27.s, z11.b, z2.b[2]\n"
- "cbz %[odds], 9f\n"
+ "cbz %[odds], 8f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 10f\n"
+ "b.eq 9f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 11f\n"
+ "b.eq 10f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 12f\n"
- "11:\n"
+ "b 11f\n"
+ "10:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 12f\n"
- "10:\n"
+ "b 11f\n"
+ "9:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z14.b, #0\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "12:\n"
+ "11:\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"mov z12.b, #0\n"
@@ -2685,33 +2706,33 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z19.s, z15.b, z0.b[3]\n"
"udot z23.s, z15.b, z1.b[3]\n"
"udot z27.s, z15.b, z2.b[3]\n"
- "b 9f\n"
- "8:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "7:\n"
+ "cbz %[odds], 8f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 13f\n"
+ "b.eq 12f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 14f\n"
+ "b.eq 13f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 15f\n"
- "14:\n"
+ "b 14f\n"
+ "13:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 15f\n"
- "13:\n"
+ "b 14f\n"
+ "12:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z10.b, #0\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "15:\n"
+ "14:\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"mov z8.b, #0\n"
@@ -2733,33 +2754,33 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z19.s, z11.b, z0.b[2]\n"
"udot z23.s, z11.b, z1.b[2]\n"
"udot z27.s, z11.b, z2.b[2]\n"
- "b 9f\n"
- "7:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "6:\n"
+ "cbz %[odds], 8f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 16f\n"
+ "b.eq 15f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 17f\n"
+ "b.eq 16f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 18f\n"
- "17:\n"
+ "b 17f\n"
+ "16:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 18f\n"
- "16:\n"
+ "b 17f\n"
+ "15:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z14.b, #0\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "18:\n"
+ "17:\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"mov z12.b, #0\n"
@@ -2781,33 +2802,33 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z19.s, z15.b, z0.b[1]\n"
"udot z23.s, z15.b, z1.b[1]\n"
"udot z27.s, z15.b, z2.b[1]\n"
- "b 9f\n"
- "6:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "5:\n"
+ "cbz %[odds], 8f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 19f\n"
+ "b.eq 18f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 20f\n"
+ "b.eq 19f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 21f\n"
- "20:\n"
+ "b 20f\n"
+ "19:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 21f\n"
- "19:\n"
+ "b 20f\n"
+ "18:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z10.b, #0\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "21:\n"
+ "20:\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"mov z8.b, #0\n"
@@ -2829,8 +2850,8 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z19.s, z11.b, z0.b[0]\n"
"udot z23.s, z11.b, z1.b[0]\n"
"udot z27.s, z11.b, z2.b[0]\n"
- "b 9f\n"
- "5:\n"
+ "b 8f\n"
+ "4:\n"
"udot z16.s, z8.b, z0.b[0]\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
"udot z20.s, z8.b, z1.b[0]\n"
@@ -2839,24 +2860,24 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"ld1rqb z5.b, p6/z, [a_ptr1]\n"
"udot z17.s, z9.b, z0.b[0]\n"
"ld1rqb z6.b, p6/z, [a_ptr2]\n"
- "udot z21.s, z9.b, z1.b[0]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"zip2 z8.b, z14.b, z12.b\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "udot z21.s, z9.b, z1.b[0]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "udot z25.s, z9.b, z2.b[0]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "udot z18.s, z10.b, z0.b[0]\n"
- "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"zip1 z12.b, z13.b, z14.b\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "udot z22.s, z10.b, z1.b[0]\n"
+ "udot z25.s, z9.b, z2.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "udot z18.s, z10.b, z0.b[0]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z22.s, z10.b, z1.b[0]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"udot z26.s, z10.b, z2.b[0]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"udot z19.s, z11.b, z0.b[0]\n"
@@ -2876,12 +2897,12 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z21.s, z13.b, z1.b[1]\n"
"udot z25.s, z13.b, z2.b[1]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
- "udot z18.s, z14.b, z0.b[1]\n"
"zip1 z8.b, z9.b, z10.b\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z18.s, z14.b, z0.b[1]\n"
"udot z22.s, z14.b, z1.b[1]\n"
"udot z26.s, z14.b, z2.b[1]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
@@ -2894,17 +2915,17 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z16.s, z8.b, z0.b[2]\n"
"udot z20.s, z8.b, z1.b[2]\n"
"udot z24.s, z8.b, z2.b[2]\n"
- "udot z17.s, z9.b, z0.b[2]\n"
"zip2 z8.b, z14.b, z12.b\n"
"zip1 z14.b, z14.b, z12.b\n"
+ "udot z17.s, z9.b, z0.b[2]\n"
"udot z21.s, z9.b, z1.b[2]\n"
"udot z25.s, z9.b, z2.b[2]\n"
- "udot z18.s, z10.b, z0.b[2]\n"
- "udot z22.s, z10.b, z1.b[2]\n"
"zip1 z12.b, z13.b, z14.b\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
+ "udot z18.s, z10.b, z0.b[2]\n"
+ "udot z22.s, z10.b, z1.b[2]\n"
"udot z26.s, z10.b, z2.b[2]\n"
"udot z19.s, z11.b, z0.b[2]\n"
"udot z23.s, z11.b, z1.b[2]\n"
@@ -2921,15 +2942,15 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z19.s, z15.b, z0.b[3]\n"
"udot z23.s, z15.b, z1.b[3]\n"
"udot z27.s, z15.b, z2.b[3]\n"
- "cbz %[blocks], 22f\n"
+ "cbz %[blocks], 21f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "subs %[blocks], %[blocks], #0x1\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
@@ -2951,15 +2972,15 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z19.s, z11.b, z4.b[0]\n"
"udot z23.s, z11.b, z5.b[0]\n"
"udot z27.s, z11.b, z6.b[0]\n"
- "b.eq 23f\n"
+ "b.eq 22f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "subs %[blocks], %[blocks], #0x1\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
@@ -2981,13 +3002,13 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z19.s, z15.b, z4.b[1]\n"
"udot z23.s, z15.b, z5.b[1]\n"
"udot z27.s, z15.b, z6.b[1]\n"
- "b.eq 24f\n"
+ "b.eq 23f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
@@ -3010,31 +3031,31 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z19.s, z11.b, z4.b[2]\n"
"udot z23.s, z11.b, z5.b[2]\n"
"udot z27.s, z11.b, z6.b[2]\n"
- "cbz %[odds], 9f\n"
+ "cbz %[odds], 8f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 25f\n"
+ "b.eq 24f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 26f\n"
+ "b.eq 25f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 27f\n"
- "26:\n"
+ "b 26f\n"
+ "25:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 27f\n"
- "25:\n"
+ "b 26f\n"
+ "24:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z14.b, #0\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "27:\n"
+ "26:\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"mov z12.b, #0\n"
@@ -3056,33 +3077,33 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z19.s, z15.b, z4.b[3]\n"
"udot z23.s, z15.b, z5.b[3]\n"
"udot z27.s, z15.b, z6.b[3]\n"
- "b 9f\n"
- "24:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "23:\n"
+ "cbz %[odds], 8f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 28f\n"
+ "b.eq 27f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 29f\n"
+ "b.eq 28f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 30f\n"
- "29:\n"
+ "b 29f\n"
+ "28:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 30f\n"
- "28:\n"
+ "b 29f\n"
+ "27:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z10.b, #0\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "30:\n"
+ "29:\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"mov z8.b, #0\n"
@@ -3104,33 +3125,33 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z19.s, z11.b, z4.b[2]\n"
"udot z23.s, z11.b, z5.b[2]\n"
"udot z27.s, z11.b, z6.b[2]\n"
- "b 9f\n"
- "23:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "22:\n"
+ "cbz %[odds], 8f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 31f\n"
+ "b.eq 30f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 32f\n"
+ "b.eq 31f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 33f\n"
- "32:\n"
+ "b 32f\n"
+ "31:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 33f\n"
- "31:\n"
+ "b 32f\n"
+ "30:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z14.b, #0\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "33:\n"
+ "32:\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"mov z12.b, #0\n"
@@ -3152,33 +3173,33 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z19.s, z15.b, z4.b[1]\n"
"udot z23.s, z15.b, z5.b[1]\n"
"udot z27.s, z15.b, z6.b[1]\n"
- "b 9f\n"
- "22:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "21:\n"
+ "cbz %[odds], 8f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 34f\n"
+ "b.eq 33f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 35f\n"
+ "b.eq 34f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 36f\n"
- "35:\n"
+ "b 35f\n"
+ "34:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 36f\n"
- "34:\n"
+ "b 35f\n"
+ "33:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z10.b, #0\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "36:\n"
+ "35:\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"mov z8.b, #0\n"
@@ -3200,7 +3221,7 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z19.s, z11.b, z4.b[0]\n"
"udot z23.s, z11.b, z5.b[0]\n"
"udot z27.s, z11.b, z6.b[0]\n"
- "9:\n"
+ "8:\n"
"st1w z16.s, p0, [%[c_ptr0]]\n"
"st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
"st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
@@ -3219,7 +3240,7 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
".unreq c_ptr1\n"
".unreq c_ptr2\n"
: [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [b_ptr1] "+r" (b_ptr1), [b_ptr2] "+r" (b_ptr2), [b_ptr3] "+r" (b_ptr3), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds)
- : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers), [ldb] "r" (ldbb)
: "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory"
);
break;
@@ -3234,15 +3255,15 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"c_ptr3 .req X5\n"
"add a_ptr1, %[a_ptr0], %[lda]\n"
"add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
"whilelt p6.b, %[temp], %[leftovers]\n"
"whilelt p0.s, %[temp], %[width]\n"
"whilelt p4.b, %[temp], %[width]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
"incw %[temp], all, mul #1\n"
"ptrue p7.b\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
"whilelt p1.s, %[temp], %[width]\n"
"incw %[temp], all, mul #1\n"
"whilelt p2.s, %[temp], %[width]\n"
@@ -3265,77 +3286,80 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"mov z23.s, #0\n"
"add %[a_ptr0], %[a_ptr0], #0x10\n"
- "mov z24.s, #0\n"
- "add a_ptr1, a_ptr1, #0x10\n"
"zip2 z11.b, z8.b, z9.b\n"
- "add a_ptr2, a_ptr2, #0x10\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+ "mov z24.s, #0\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
"mov z25.s, #0\n"
"add a_ptr3, a_ptr3, #0x10\n"
- "mov z26.s, #0\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"zip2 z12.b, z10.b, z8.b\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"zip1 z10.b, z10.b, z8.b\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "mov z27.s, #0\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "mov z28.s, #0\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "mov z26.s, #0\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "mov z27.s, #0\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip1 z8.b, z9.b, z10.b\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
"zip2 z9.b, z9.b, z10.b\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "mov z28.s, #0\n"
"mov z29.s, #0\n"
"mov z30.s, #0\n"
+ "zip2 z15.b, z12.b, z13.b\n"
+ "zip1 z13.b, z12.b, z13.b\n"
"mov z31.s, #0\n"
- "b 2f\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
"1:\n"
"ld1rw z15.s, p7/z, [%[betaptr]]\n"
"ld1w z16.s, p0/z, [%[c_ptr0]]\n"
"ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
"ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
"ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
- "ld1w z20.s, p0/z, [c_ptr1]\n"
"mul z16.s, p7/m, z16.s, z15.s\n"
- "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+ "ld1w z20.s, p0/z, [c_ptr1]\n"
"mul z17.s, p7/m, z17.s, z15.s\n"
- "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+ "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
"mul z18.s, p7/m, z18.s, z15.s\n"
- "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+ "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
"mul z19.s, p7/m, z19.s, z15.s\n"
- "ld1w z24.s, p0/z, [c_ptr2]\n"
+ "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
"mul z20.s, p7/m, z20.s, z15.s\n"
- "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
+ "ld1w z24.s, p0/z, [c_ptr2]\n"
"mul z21.s, p7/m, z21.s, z15.s\n"
- "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
+ "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
"mul z22.s, p7/m, z22.s, z15.s\n"
- "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
+ "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
"mul z23.s, p7/m, z23.s, z15.s\n"
- "ld1w z28.s, p0/z, [c_ptr3]\n"
+ "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
"mul z24.s, p7/m, z24.s, z15.s\n"
- "ld1w z29.s, p1/z, [c_ptr3, #1, MUL VL]\n"
+ "ld1w z28.s, p0/z, [c_ptr3]\n"
"mul z25.s, p7/m, z25.s, z15.s\n"
- "ld1w z30.s, p2/z, [c_ptr3, #2, MUL VL]\n"
+ "ld1w z29.s, p1/z, [c_ptr3, #1, MUL VL]\n"
"mul z26.s, p7/m, z26.s, z15.s\n"
- "ld1w z31.s, p3/z, [c_ptr3, #3, MUL VL]\n"
+ "ld1w z30.s, p2/z, [c_ptr3, #2, MUL VL]\n"
"mul z27.s, p7/m, z27.s, z15.s\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "ld1w z31.s, p3/z, [c_ptr3, #3, MUL VL]\n"
"mul z28.s, p7/m, z28.s, z15.s\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
"mul z29.s, p7/m, z29.s, z15.s\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
"mul z30.s, p7/m, z30.s, z15.s\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
"mul z31.s, p7/m, z31.s, z15.s\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"add a_ptr1, a_ptr1, #0x10\n"
+ "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"add a_ptr2, a_ptr2, #0x10\n"
"zip2 z11.b, z8.b, z9.b\n"
"add a_ptr3, a_ptr3, #0x10\n"
@@ -3344,21 +3368,20 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip2 z12.b, z10.b, z8.b\n"
- "zip1 z10.b, z10.b, z8.b\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "zip1 z10.b, z10.b, z8.b\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip1 z8.b, z9.b, z10.b\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "2:\n"
- "cbz %[loops], 3f\n"
- "4:\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
"udot z16.s, z8.b, z0.b[0]\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
"udot z20.s, z8.b, z1.b[0]\n"
@@ -3367,38 +3390,38 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"ld1rqb z5.b, p7/z, [a_ptr1]\n"
"udot z28.s, z8.b, z3.b[0]\n"
"ld1rqb z6.b, p7/z, [a_ptr2]\n"
- "udot z17.s, z9.b, z0.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr3]\n"
"zip2 z8.b, z14.b, z12.b\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr3]\n"
"zip1 z14.b, z14.b, z12.b\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "udot z21.s, z9.b, z1.b[0]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "udot z25.s, z9.b, z2.b[0]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"zip1 z12.b, z13.b, z14.b\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z13.b, z13.b, z14.b\n"
- "add a_ptr1, a_ptr1, #0x20\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip1 z14.b, z15.b, z8.b\n"
- "add a_ptr2, a_ptr2, #0x20\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z21.s, z9.b, z1.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z25.s, z9.b, z2.b[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
"udot z29.s, z9.b, z3.b[0]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"udot z18.s, z10.b, z0.b[0]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
- "udot z22.s, z10.b, z1.b[0]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "udot z22.s, z10.b, z1.b[0]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
"udot z26.s, z10.b, z2.b[0]\n"
- "add a_ptr3, a_ptr3, #0x20\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
"udot z30.s, z10.b, z3.b[0]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"udot z19.s, z11.b, z0.b[0]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"udot z23.s, z11.b, z1.b[0]\n"
- "subs %[loops], %[loops], #0x1\n"
+ "add a_ptr3, a_ptr3, #0x20\n"
"udot z27.s, z11.b, z2.b[0]\n"
"udot z31.s, z11.b, z3.b[0]\n"
"zip2 z11.b, z8.b, z9.b\n"
@@ -3414,17 +3437,17 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z17.s, z13.b, z0.b[1]\n"
"udot z21.s, z13.b, z1.b[1]\n"
"udot z25.s, z13.b, z2.b[1]\n"
- "udot z29.s, z13.b, z3.b[1]\n"
- "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"zip1 z8.b, z9.b, z10.b\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z29.s, z13.b, z3.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"udot z18.s, z14.b, z0.b[1]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"udot z22.s, z14.b, z1.b[1]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"udot z26.s, z14.b, z2.b[1]\n"
"udot z30.s, z14.b, z3.b[1]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
@@ -3446,17 +3469,17 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z17.s, z9.b, z0.b[2]\n"
"udot z21.s, z9.b, z1.b[2]\n"
"udot z25.s, z9.b, z2.b[2]\n"
- "udot z29.s, z9.b, z3.b[2]\n"
- "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"zip1 z12.b, z13.b, z14.b\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z29.s, z9.b, z3.b[2]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"udot z18.s, z10.b, z0.b[2]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"udot z22.s, z10.b, z1.b[2]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"udot z26.s, z10.b, z2.b[2]\n"
"udot z30.s, z10.b, z3.b[2]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
@@ -3478,17 +3501,17 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z17.s, z13.b, z0.b[3]\n"
"udot z21.s, z13.b, z1.b[3]\n"
"udot z25.s, z13.b, z2.b[3]\n"
- "udot z29.s, z13.b, z3.b[3]\n"
- "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"zip1 z8.b, z9.b, z10.b\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z29.s, z13.b, z3.b[3]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"udot z18.s, z14.b, z0.b[3]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"udot z22.s, z14.b, z1.b[3]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"udot z26.s, z14.b, z2.b[3]\n"
"udot z30.s, z14.b, z3.b[3]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
@@ -3514,17 +3537,17 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z17.s, z9.b, z4.b[0]\n"
"udot z21.s, z9.b, z5.b[0]\n"
"udot z25.s, z9.b, z6.b[0]\n"
- "udot z29.s, z9.b, z7.b[0]\n"
- "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"zip1 z12.b, z13.b, z14.b\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z29.s, z9.b, z7.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"udot z18.s, z10.b, z4.b[0]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"udot z22.s, z10.b, z5.b[0]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"udot z26.s, z10.b, z6.b[0]\n"
"udot z30.s, z10.b, z7.b[0]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
@@ -3546,17 +3569,17 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z17.s, z13.b, z4.b[1]\n"
"udot z21.s, z13.b, z5.b[1]\n"
"udot z25.s, z13.b, z6.b[1]\n"
- "udot z29.s, z13.b, z7.b[1]\n"
- "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"zip1 z8.b, z9.b, z10.b\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z29.s, z13.b, z7.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"udot z18.s, z14.b, z4.b[1]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"udot z22.s, z14.b, z5.b[1]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"udot z26.s, z14.b, z6.b[1]\n"
"udot z30.s, z14.b, z7.b[1]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
@@ -3578,17 +3601,17 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z17.s, z9.b, z4.b[2]\n"
"udot z21.s, z9.b, z5.b[2]\n"
"udot z25.s, z9.b, z6.b[2]\n"
- "udot z29.s, z9.b, z7.b[2]\n"
- "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"zip1 z12.b, z13.b, z14.b\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z29.s, z9.b, z7.b[2]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"udot z18.s, z10.b, z4.b[2]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"udot z22.s, z10.b, z5.b[2]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"udot z26.s, z10.b, z6.b[2]\n"
"udot z30.s, z10.b, z7.b[2]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
@@ -3610,13 +3633,13 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z17.s, z13.b, z4.b[3]\n"
"udot z21.s, z13.b, z5.b[3]\n"
"udot z25.s, z13.b, z6.b[3]\n"
- "udot z29.s, z13.b, z7.b[3]\n"
- "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"zip1 z8.b, z9.b, z10.b\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z29.s, z13.b, z7.b[3]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"udot z18.s, z14.b, z4.b[3]\n"
"udot z22.s, z14.b, z5.b[3]\n"
"udot z26.s, z14.b, z6.b[3]\n"
@@ -3626,11 +3649,11 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z23.s, z15.b, z5.b[3]\n"
"udot z27.s, z15.b, z6.b[3]\n"
"udot z31.s, z15.b, z7.b[3]\n"
- "b.ne 4b\n"
- "3:\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
- "cbz %[regs], 5f\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "cbz %[regs], 4f\n"
"udot z16.s, z8.b, z0.b[0]\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
"udot z20.s, z8.b, z1.b[0]\n"
@@ -3639,27 +3662,27 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"ld1rqb z5.b, p7/z, [a_ptr1]\n"
"udot z28.s, z8.b, z3.b[0]\n"
"ld1rqb z6.b, p7/z, [a_ptr2]\n"
- "udot z17.s, z9.b, z0.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr3]\n"
"zip2 z8.b, z14.b, z12.b\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr3]\n"
"zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "udot z21.s, z9.b, z1.b[0]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "udot z25.s, z9.b, z2.b[0]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z13.b, z13.b, z14.b\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z21.s, z9.b, z1.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z25.s, z9.b, z2.b[0]\n"
"udot z29.s, z9.b, z3.b[0]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"udot z18.s, z10.b, z0.b[0]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
- "udot z22.s, z10.b, z1.b[0]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "udot z22.s, z10.b, z1.b[0]\n"
"udot z26.s, z10.b, z2.b[0]\n"
"udot z30.s, z10.b, z3.b[0]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
@@ -3681,17 +3704,17 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z17.s, z13.b, z0.b[1]\n"
"udot z21.s, z13.b, z1.b[1]\n"
"udot z25.s, z13.b, z2.b[1]\n"
- "udot z29.s, z13.b, z3.b[1]\n"
- "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"zip1 z8.b, z9.b, z10.b\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z29.s, z13.b, z3.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"udot z18.s, z14.b, z0.b[1]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"udot z22.s, z14.b, z1.b[1]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"udot z26.s, z14.b, z2.b[1]\n"
"udot z30.s, z14.b, z3.b[1]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
@@ -3713,17 +3736,17 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z17.s, z9.b, z0.b[2]\n"
"udot z21.s, z9.b, z1.b[2]\n"
"udot z25.s, z9.b, z2.b[2]\n"
- "udot z29.s, z9.b, z3.b[2]\n"
- "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"zip1 z12.b, z13.b, z14.b\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z29.s, z9.b, z3.b[2]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"udot z18.s, z10.b, z0.b[2]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"udot z22.s, z10.b, z1.b[2]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"udot z26.s, z10.b, z2.b[2]\n"
"udot z30.s, z10.b, z3.b[2]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
@@ -3745,17 +3768,17 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z17.s, z13.b, z0.b[3]\n"
"udot z21.s, z13.b, z1.b[3]\n"
"udot z25.s, z13.b, z2.b[3]\n"
- "udot z29.s, z13.b, z3.b[3]\n"
- "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"zip1 z8.b, z9.b, z10.b\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z29.s, z13.b, z3.b[3]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"udot z18.s, z14.b, z0.b[3]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"udot z22.s, z14.b, z1.b[3]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"udot z26.s, z14.b, z2.b[3]\n"
"udot z30.s, z14.b, z3.b[3]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
@@ -3781,17 +3804,17 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z17.s, z9.b, z4.b[0]\n"
"udot z21.s, z9.b, z5.b[0]\n"
"udot z25.s, z9.b, z6.b[0]\n"
- "udot z29.s, z9.b, z7.b[0]\n"
- "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"zip1 z12.b, z13.b, z14.b\n"
- "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z29.s, z9.b, z7.b[0]\n"
+ "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"udot z18.s, z10.b, z4.b[0]\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"udot z22.s, z10.b, z5.b[0]\n"
+ "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
"udot z26.s, z10.b, z6.b[0]\n"
"udot z30.s, z10.b, z7.b[0]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
@@ -3813,13 +3836,13 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z17.s, z13.b, z4.b[1]\n"
"udot z21.s, z13.b, z5.b[1]\n"
"udot z25.s, z13.b, z6.b[1]\n"
- "udot z29.s, z13.b, z7.b[1]\n"
- "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"zip1 z8.b, z9.b, z10.b\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z29.s, z13.b, z7.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"udot z18.s, z14.b, z4.b[1]\n"
"udot z22.s, z14.b, z5.b[1]\n"
"udot z26.s, z14.b, z6.b[1]\n"
@@ -3841,11 +3864,11 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z17.s, z9.b, z4.b[2]\n"
"udot z21.s, z9.b, z5.b[2]\n"
"udot z25.s, z9.b, z6.b[2]\n"
- "udot z29.s, z9.b, z7.b[2]\n"
"zip1 z12.b, z13.b, z14.b\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
+ "udot z29.s, z9.b, z7.b[2]\n"
"udot z18.s, z10.b, z4.b[2]\n"
"udot z22.s, z10.b, z5.b[2]\n"
"udot z26.s, z10.b, z6.b[2]\n"
@@ -3870,15 +3893,15 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z23.s, z15.b, z5.b[3]\n"
"udot z27.s, z15.b, z6.b[3]\n"
"udot z31.s, z15.b, z7.b[3]\n"
- "cbz %[blocks], 6f\n"
+ "cbz %[blocks], 5f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "subs %[blocks], %[blocks], #0x1\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
@@ -3904,15 +3927,15 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z23.s, z11.b, z1.b[0]\n"
"udot z27.s, z11.b, z2.b[0]\n"
"udot z31.s, z11.b, z3.b[0]\n"
- "b.eq 7f\n"
+ "b.eq 6f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "subs %[blocks], %[blocks], #0x1\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
@@ -3938,13 +3961,13 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z23.s, z15.b, z1.b[1]\n"
"udot z27.s, z15.b, z2.b[1]\n"
"udot z31.s, z15.b, z3.b[1]\n"
- "b.eq 8f\n"
+ "b.eq 7f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
@@ -3971,31 +3994,31 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z23.s, z11.b, z1.b[2]\n"
"udot z27.s, z11.b, z2.b[2]\n"
"udot z31.s, z11.b, z3.b[2]\n"
- "cbz %[odds], 9f\n"
+ "cbz %[odds], 8f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 10f\n"
+ "b.eq 9f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 11f\n"
+ "b.eq 10f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 12f\n"
- "11:\n"
+ "b 11f\n"
+ "10:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 12f\n"
- "10:\n"
+ "b 11f\n"
+ "9:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z14.b, #0\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "12:\n"
+ "11:\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"mov z12.b, #0\n"
@@ -4021,33 +4044,33 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z23.s, z15.b, z1.b[3]\n"
"udot z27.s, z15.b, z2.b[3]\n"
"udot z31.s, z15.b, z3.b[3]\n"
- "b 9f\n"
- "8:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "7:\n"
+ "cbz %[odds], 8f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 13f\n"
+ "b.eq 12f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 14f\n"
+ "b.eq 13f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 15f\n"
- "14:\n"
+ "b 14f\n"
+ "13:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 15f\n"
- "13:\n"
+ "b 14f\n"
+ "12:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z10.b, #0\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "15:\n"
+ "14:\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"mov z8.b, #0\n"
@@ -4073,33 +4096,33 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z23.s, z11.b, z1.b[2]\n"
"udot z27.s, z11.b, z2.b[2]\n"
"udot z31.s, z11.b, z3.b[2]\n"
- "b 9f\n"
- "7:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "6:\n"
+ "cbz %[odds], 8f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 16f\n"
+ "b.eq 15f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 17f\n"
+ "b.eq 16f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 18f\n"
- "17:\n"
+ "b 17f\n"
+ "16:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 18f\n"
- "16:\n"
+ "b 17f\n"
+ "15:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z14.b, #0\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "18:\n"
+ "17:\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"mov z12.b, #0\n"
@@ -4125,33 +4148,33 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z23.s, z15.b, z1.b[1]\n"
"udot z27.s, z15.b, z2.b[1]\n"
"udot z31.s, z15.b, z3.b[1]\n"
- "b 9f\n"
- "6:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "5:\n"
+ "cbz %[odds], 8f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 19f\n"
+ "b.eq 18f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 20f\n"
+ "b.eq 19f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 21f\n"
- "20:\n"
+ "b 20f\n"
+ "19:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 21f\n"
- "19:\n"
+ "b 20f\n"
+ "18:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z10.b, #0\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "21:\n"
+ "20:\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"mov z8.b, #0\n"
@@ -4177,8 +4200,8 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z23.s, z11.b, z1.b[0]\n"
"udot z27.s, z11.b, z2.b[0]\n"
"udot z31.s, z11.b, z3.b[0]\n"
- "b 9f\n"
- "5:\n"
+ "b 8f\n"
+ "4:\n"
"udot z16.s, z8.b, z0.b[0]\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
"udot z20.s, z8.b, z1.b[0]\n"
@@ -4187,27 +4210,27 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"ld1rqb z5.b, p6/z, [a_ptr1]\n"
"udot z28.s, z8.b, z3.b[0]\n"
"ld1rqb z6.b, p6/z, [a_ptr2]\n"
- "udot z17.s, z9.b, z0.b[0]\n"
- "ld1rqb z7.b, p6/z, [a_ptr3]\n"
"zip2 z8.b, z14.b, z12.b\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "ld1rqb z7.b, p6/z, [a_ptr3]\n"
"zip1 z14.b, z14.b, z12.b\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z17.s, z9.b, z0.b[0]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "udot z21.s, z9.b, z1.b[0]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "udot z25.s, z9.b, z2.b[0]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip1 z12.b, z13.b, z14.b\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"zip2 z13.b, z13.b, z14.b\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "udot z21.s, z9.b, z1.b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+ "udot z25.s, z9.b, z2.b[0]\n"
"udot z29.s, z9.b, z3.b[0]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"udot z18.s, z10.b, z0.b[0]\n"
- "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
- "udot z22.s, z10.b, z1.b[0]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+ "udot z22.s, z10.b, z1.b[0]\n"
"udot z26.s, z10.b, z2.b[0]\n"
"udot z30.s, z10.b, z3.b[0]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
@@ -4229,13 +4252,13 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z17.s, z13.b, z0.b[1]\n"
"udot z21.s, z13.b, z1.b[1]\n"
"udot z25.s, z13.b, z2.b[1]\n"
- "udot z29.s, z13.b, z3.b[1]\n"
- "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"zip1 z8.b, z9.b, z10.b\n"
"zip2 z9.b, z9.b, z10.b\n"
"zip1 z10.b, z11.b, z12.b\n"
"zip2 z11.b, z11.b, z12.b\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "udot z29.s, z13.b, z3.b[1]\n"
+ "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"udot z18.s, z14.b, z0.b[1]\n"
"udot z22.s, z14.b, z1.b[1]\n"
"udot z26.s, z14.b, z2.b[1]\n"
@@ -4257,11 +4280,11 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z17.s, z9.b, z0.b[2]\n"
"udot z21.s, z9.b, z1.b[2]\n"
"udot z25.s, z9.b, z2.b[2]\n"
- "udot z29.s, z9.b, z3.b[2]\n"
"zip1 z12.b, z13.b, z14.b\n"
"zip2 z13.b, z13.b, z14.b\n"
"zip1 z14.b, z15.b, z8.b\n"
"zip2 z15.b, z15.b, z8.b\n"
+ "udot z29.s, z9.b, z3.b[2]\n"
"udot z18.s, z10.b, z0.b[2]\n"
"udot z22.s, z10.b, z1.b[2]\n"
"udot z26.s, z10.b, z2.b[2]\n"
@@ -4286,15 +4309,15 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z23.s, z15.b, z1.b[3]\n"
"udot z27.s, z15.b, z2.b[3]\n"
"udot z31.s, z15.b, z3.b[3]\n"
- "cbz %[blocks], 22f\n"
+ "cbz %[blocks], 21f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "subs %[blocks], %[blocks], #0x1\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"ld1b z8.b, p4/z, [%[b_ptr3]]\n"
@@ -4320,15 +4343,15 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z23.s, z11.b, z5.b[0]\n"
"udot z27.s, z11.b, z6.b[0]\n"
"udot z31.s, z11.b, z7.b[0]\n"
- "b.eq 23f\n"
+ "b.eq 22f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
- "subs %[blocks], %[blocks], #0x1\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+ "subs %[blocks], %[blocks], #0x1\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"ld1b z12.b, p4/z, [%[b_ptr3]]\n"
@@ -4354,13 +4377,13 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z23.s, z15.b, z5.b[1]\n"
"udot z27.s, z15.b, z6.b[1]\n"
"udot z31.s, z15.b, z7.b[1]\n"
- "b.eq 24f\n"
+ "b.eq 23f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
- "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+ "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
@@ -4387,31 +4410,31 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z23.s, z11.b, z5.b[2]\n"
"udot z27.s, z11.b, z6.b[2]\n"
"udot z31.s, z11.b, z7.b[2]\n"
- "cbz %[odds], 9f\n"
+ "cbz %[odds], 8f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 25f\n"
+ "b.eq 24f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 26f\n"
+ "b.eq 25f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 27f\n"
- "26:\n"
+ "b 26f\n"
+ "25:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 27f\n"
- "25:\n"
+ "b 26f\n"
+ "24:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z14.b, #0\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "27:\n"
+ "26:\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"mov z12.b, #0\n"
@@ -4437,33 +4460,33 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z23.s, z15.b, z5.b[3]\n"
"udot z27.s, z15.b, z6.b[3]\n"
"udot z31.s, z15.b, z7.b[3]\n"
- "b 9f\n"
- "24:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "23:\n"
+ "cbz %[odds], 8f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 28f\n"
+ "b.eq 27f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 29f\n"
+ "b.eq 28f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 30f\n"
- "29:\n"
+ "b 29f\n"
+ "28:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 30f\n"
- "28:\n"
+ "b 29f\n"
+ "27:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z10.b, #0\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "30:\n"
+ "29:\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"mov z8.b, #0\n"
@@ -4489,33 +4512,33 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z23.s, z11.b, z5.b[2]\n"
"udot z27.s, z11.b, z6.b[2]\n"
"udot z31.s, z11.b, z7.b[2]\n"
- "b 9f\n"
- "23:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "22:\n"
+ "cbz %[odds], 8f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 31f\n"
+ "b.eq 30f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 32f\n"
+ "b.eq 31f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z13.b, p4/z, [%[b_ptr2]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 33f\n"
- "32:\n"
+ "b 32f\n"
+ "31:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
"ld1b z14.b, p4/z, [%[b_ptr1]]\n"
- "b 33f\n"
- "31:\n"
+ "b 32f\n"
+ "30:\n"
"mov z13.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z14.b, #0\n"
"ld1b z12.b, p4/z, [%[b_ptr0]]\n"
- "33:\n"
+ "32:\n"
"zip2 z15.b, z12.b, z13.b\n"
"zip1 z13.b, z12.b, z13.b\n"
"mov z12.b, #0\n"
@@ -4541,33 +4564,33 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z23.s, z15.b, z5.b[1]\n"
"udot z27.s, z15.b, z6.b[1]\n"
"udot z31.s, z15.b, z7.b[1]\n"
- "b 9f\n"
- "22:\n"
- "cbz %[odds], 9f\n"
+ "b 8f\n"
+ "21:\n"
+ "cbz %[odds], 8f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 34f\n"
+ "b.eq 33f\n"
"subs %[odds], %[odds], #0x1\n"
- "b.eq 35f\n"
+ "b.eq 34f\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr2], %[b_ptr2], %[ldb]\n"
- "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+ "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z9.b, p4/z, [%[b_ptr2]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 36f\n"
- "35:\n"
+ "b 35f\n"
+ "34:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"add %[b_ptr1], %[b_ptr1], %[ldb]\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
"ld1b z10.b, p4/z, [%[b_ptr1]]\n"
- "b 36f\n"
- "34:\n"
+ "b 35f\n"
+ "33:\n"
"mov z9.b, #0\n"
"add %[b_ptr0], %[b_ptr0], %[ldb]\n"
"mov z10.b, #0\n"
"ld1b z8.b, p4/z, [%[b_ptr0]]\n"
- "36:\n"
+ "35:\n"
"zip2 z11.b, z8.b, z9.b\n"
"zip1 z9.b, z8.b, z9.b\n"
"mov z8.b, #0\n"
@@ -4593,7 +4616,7 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
"udot z23.s, z11.b, z5.b[0]\n"
"udot z27.s, z11.b, z6.b[0]\n"
"udot z31.s, z11.b, z7.b[0]\n"
- "9:\n"
+ "8:\n"
"st1w z16.s, p0, [%[c_ptr0]]\n"
"st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
"st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
@@ -4618,7 +4641,7 @@ void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int
".unreq c_ptr2\n"
".unreq c_ptr3\n"
: [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [b_ptr1] "+r" (b_ptr1), [b_ptr2] "+r" (b_ptr2), [b_ptr3] "+r" (b_ptr3), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds)
- : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers), [ldb] "r" (ldbb)
: "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
);
break;