aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Georgios Pinitas <georgios.pinitas@arm.com> 2019-01-23 11:24:50 +0000
committer Michalis Spyrou <michalis.spyrou@arm.com> 2019-01-24 10:19:46 +0000
commit 1d480652b820317fc97ccbc3cb517e3b9e8be197 (patch)
tree b3c845ec02cccf89430b95186ed3e3f2ae65e2bd
parent 20b527a7029d02d0edda78fd92002cbc430dbe05 (diff)
download ComputeLibrary-1d480652b820317fc97ccbc3cb517e3b9e8be197.tar.gz
COMPMID-1867: Add u8 and s8 hybrid assembly kernels.
Change-Id: Ifeb005f9d18d19feff11949474cce84d9e03749c
Reviewed-on: https://review.mlplatform.org/565
Reviewed-by: Michalis Spyrou <michalis.spyrou@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
-rw-r--r-- arm_compute/core/NEON/kernels/assembly/gemm_common.hpp 122
-rw-r--r-- src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp 260
-rw-r--r-- src/core/NEON/kernels/arm_gemm/gemm_int8.cpp 11
-rw-r--r-- src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp 13
-rw-r--r-- src/core/NEON/kernels/arm_gemm/gemm_native.hpp 91
-rw-r--r-- src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp 11
-rw-r--r-- src/core/NEON/kernels/arm_gemm/gemv_batched.hpp 2
-rw-r--r-- src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp 8
-rw-r--r-- src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp 21
-rw-r--r-- src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6.hpp 8
-rw-r--r-- src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8.hpp 8
-rw-r--r-- src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8.hpp 8
-rw-r--r-- src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp 8
-rw-r--r-- src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8.hpp 8
-rw-r--r-- src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8.hpp 8
-rw-r--r-- src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp 8
-rw-r--r-- src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8.hpp 8
-rw-r--r-- src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4.hpp 77
-rw-r--r-- src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/a55.cpp 2271
-rw-r--r-- src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/generic.cpp 1605
-rw-r--r-- src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4.hpp 77
-rw-r--r-- src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/a55.cpp 2271
-rw-r--r-- src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/generic.cpp 1605
-rw-r--r-- src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8.hpp 8
-rw-r--r-- src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_nativeA_pretransposeB_16x4.hpp 8
-rw-r--r-- src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_native_16x4.hpp 8
-rw-r--r-- src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed.hpp 25
-rw-r--r-- src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans.hpp 11
-rw-r--r-- src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4.hpp 8
-rw-r--r-- src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8.hpp 6
-rw-r--r-- src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8.hpp 6
-rw-r--r-- src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8.hpp 6
-rw-r--r-- src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8.hpp 6
-rw-r--r-- src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4.hpp 8
-rw-r--r-- src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4.hpp 8
-rw-r--r-- src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4.hpp 8
-rw-r--r-- src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_fp32_mla_1VLx4.hpp 8
-rw-r--r-- src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx4.hpp 8
-rw-r--r-- src/core/NEON/kernels/arm_gemm/ndrange.hpp 108
-rw-r--r-- src/core/NEON/kernels/arm_gemm/utils.hpp 5
40 files changed, 8393 insertions, 361 deletions
diff --git a/arm_compute/core/NEON/kernels/assembly/gemm_common.hpp b/arm_compute/core/NEON/kernels/assembly/gemm_common.hpp
index 7b4f0149e3..c72f210e56 100644
--- a/arm_compute/core/NEON/kernels/assembly/gemm_common.hpp
+++ b/arm_compute/core/NEON/kernels/assembly/gemm_common.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -34,42 +34,19 @@ namespace arm_gemm {
// working space (permute as they go along). This interface should support
// all of them.
-template<typename To, typename Tr>
-class GemmCommon {
-protected:
- const To *_Aptr=nullptr;
- int _lda=0;
- int _A_batch_stride=0;
- int _A_multi_stride=0;
- const To *_Bptr=nullptr;
- int _ldb=0;
- int _B_multi_stride=0;
- Tr *_Cptr=nullptr;
- int _ldc=0;
- int _C_batch_stride=0;
- int _C_multi_stride=0;
-
+// The real GemmCommon class is templated based on the operand and return
+// type. This is an interface class which is independent of those types.
+class IGemmCommon {
public:
/* Pass in the pointers to the arrays to be operated on and their
- * strides. This has a default implementation that just captures them
- * all in protected members. If B is pretransposed (see below) then the
- * settings for B here are ignored.
+ * strides. In the interface class these are passed as void pointers -
+ * the templated version overloads this function with a version which
+ * takes appropriately typed pointers. If B is pretransposed (see
+ * below) then the settings for B here are ignored.
*/
- virtual void set_arrays(const To *A, const int lda, const int A_batch_stride, const int A_multi_stride,
- const To *B, const int ldb, /* batches share B */ const int B_multi_stride,
- Tr *C, const int ldc, const int C_batch_stride, const int C_multi_stride) {
- _Aptr = A;
- _lda = lda;
- _A_batch_stride = A_batch_stride;
- _A_multi_stride = A_multi_stride;
- _Bptr = B;
- _ldb = ldb;
- _B_multi_stride = B_multi_stride;
- _Cptr = C;
- _ldc = ldc;
- _C_batch_stride = C_batch_stride;
- _C_multi_stride = C_multi_stride;
- }
+ virtual void set_arrays(const void *A, const int lda, const int A_batch_stride, const int A_multi_stride,
+ const void *B, const int ldb, /* batches share B */ const int B_multi_stride,
+ void *C, const int ldc, const int C_batch_stride, const int C_multi_stride) = 0;
/* For threading, we divide the work into some number of units and work
* out internally what unit corresponds to what work. This returns the
@@ -90,6 +67,9 @@ public:
*/
virtual void set_nthreads(int) { };
+ /* Whether this GEMM can be dynamically scheduled or not. */
+ virtual bool supports_dynamic_scheduling() const { return false; }
+
/* Actually do the work. Provide a threadid to index any per-thread
* buffers, and a start/end range to indicate which work to do. */
virtual void execute(unsigned int, unsigned int, int) = 0;
@@ -107,14 +87,78 @@ public:
virtual bool B_pretranspose_required() const { return false; }
/* Total number of bytes of space needed for pretransposed arrays. */
virtual size_t get_B_pretransposed_array_size() const { return 0; }
- /* Perform pretranspose - the void * passed in must remain allocated for the duration of any execute calls. */
- /* Arguments are: output buffer pointer, source pointer, source row stride, source multi stride */
- virtual void pretranspose_B_array(void *, const To *, const int, const int) { };
+ /* Perform pretranspose - arguments are output, input, input row stride and input multi stride. */
+ /* The "real" version of this depends on the templated operand type (see below). */
+ virtual void pretranspose_B_array(void *, const void *, const int, const int) = 0;
/* Set pretransposed data - the void * passed in must previously have been passed to pretranspose_B_array() for the same or a similar GEMM. */
virtual void set_pretransposed_B_data(void *) { }
// Destructor
- virtual ~GemmCommon() { }
+ virtual ~IGemmCommon() { }
+};
+
+/*
+ * "Real" GemmCommon class which is templated on the operand and return types.
+ *
+ * In addition to correctly typed versions of the functions that operate on
+ * operand and return data, this class provides a default implementation of
+ * 'set_arrays' to capture the provided arguments in protected class
+ * members, as essentially any implementation will need these.
+ */
+template<typename To, typename Tr>
+class GemmCommon : public IGemmCommon {
+protected:
+ const To *_Aptr=nullptr;
+ int _lda=0;
+ int _A_batch_stride=0;
+ int _A_multi_stride=0;
+ const To *_Bptr=nullptr;
+ int _ldb=0;
+ int _B_multi_stride=0;
+ Tr *_Cptr=nullptr;
+ int _ldc=0;
+ int _C_batch_stride=0;
+ int _C_multi_stride=0;
+
+public:
+ /* Pass in the pointers to the arrays to be operated on and their
+ * strides (templated version with appropriate types). */
+ virtual void set_arrays(const To *A, const int lda, const int A_batch_stride, const int A_multi_stride,
+ const To *B, const int ldb, /* batches share B */ const int B_multi_stride,
+ Tr *C, const int ldc, const int C_batch_stride, const int C_multi_stride) {
+ _Aptr = A;
+ _lda = lda;
+ _A_batch_stride = A_batch_stride;
+ _A_multi_stride = A_multi_stride;
+ _Bptr = B;
+ _ldb = ldb;
+ _B_multi_stride = B_multi_stride;
+ _Cptr = C;
+ _ldc = ldc;
+ _C_batch_stride = C_batch_stride;
+ _C_multi_stride = C_multi_stride;
+ }
+
+ /* Implementation of the void * overload which casts its arguments to the appropriate type. */
+ void set_arrays(const void *A, const int lda, const int A_batch_stride, const int A_multi_stride,
+ const void *B, const int ldb, /* batches share B */ const int B_multi_stride,
+ void *C, const int ldc, const int C_batch_stride, const int C_multi_stride) override {
+ set_arrays(static_cast<const To *>(A), lda, A_batch_stride, A_multi_stride,
+ static_cast<const To *>(B), ldb, B_multi_stride,
+ static_cast<Tr *>(C), ldc, C_batch_stride, C_multi_stride);
+ }
+
+ /*** "Pretransposed" interface ***/
+
+ /* Perform pretranspose - the void * passed in must remain allocated for the duration of any execute calls. */
+ /* Arguments are: output buffer pointer, source pointer, source row stride, source multi stride */
+ virtual void pretranspose_B_array(void *, const To *, const int, const int) { };
+
+ /* Implementation of the void * overload which casts its arguments to the appropriate type. */
+ void pretranspose_B_array(void *out, const void *in, const int row_stride, const int multi_stride) override {
+ pretranspose_B_array(out, static_cast<const To *>(in), row_stride, multi_stride);
+ }
+
};
-} // namespace arm_gemm
+} // namespace arm_gemm
\ No newline at end of file
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp b/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp
index 09f03c6332..c2bd0bb882 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp
@@ -28,6 +28,7 @@
#include <algorithm>
#include "arm_gemm.hpp"
+#include "ndrange.hpp"
#include "utils.hpp"
#include "mergeresults.hpp"
@@ -60,69 +61,66 @@ class GemmHybrid : public GemmCommon<To, Tr> {
const Tr _beta;
/* Blocking info */
- unsigned int _k_block=0;
- unsigned int _x_block=0;
- unsigned int _Mround=0;
+ const unsigned int _k_block;
+ const unsigned int _n_block;
+ const unsigned int _Mround;
/* Pretransposed buffer. */
const Toi *_B_transposed=nullptr;
- unsigned int _B_per_multi = 0;
+ const NDRange<4> _window_range;
- /* We will need to walk through the blocks of B in a few contexts, so
- * factor that out. */
- class blockwalker {
- private:
- /* Size loops, etc. based on our parent's configuration */
- const GemmHybrid<strategy, To, Tr> &_parent;
+ static unsigned int compute_k_block(const GemmArgs<Tr> &args) {
+ if (args._cfg && args._cfg->inner_block_size) {
+ return args._cfg->inner_block_size;
+ }
- /* K, X and multi parameters for current iteration. */
- unsigned int _k0=0, _x0=0;
+ const unsigned int L1_size = args._ci->get_L1_cache_size();
- unsigned int _index=0;
- bool _done=false;
- bool _newkblock=true;
+ // k_block: Find out how much of the larger array can be loaded into half the cache.
+ // This should account for associative caches.
+ unsigned int k_block = (L1_size / 2) / (sizeof(Toi) * (std::max(strategy::out_width(), strategy::out_height())));
- public:
- blockwalker(const GemmHybrid<strategy, To, Tr> &parent) : _parent(parent) { }
+ // Needs to be (at least a single) multiple of the K unroll level.
+ k_block /= strategy::k_unroll();
+ k_block = std::max(k_block, 1U) * strategy::k_unroll();
- unsigned int xmax() {
- return std::min(_x0 + _parent._x_block, _parent._Nsize);
- }
+ // Now tune to presented problem size; this is how many blocks we need.
+ unsigned int numk_blocks = iceildiv(args._Ksize, k_block);
- unsigned int kmax() {
- return std::min(_k0 + _parent._k_block, _parent._Ksize);
- }
+ // So divide the space equally into that many blocks.
+ k_block = iceildiv(args._Ksize, numk_blocks);
- /* Advance to the next block, return false at the end. */
- bool advance(void) {
- if (_done) {
- return false;
- }
+ // And round UP to the K unroll level required.
+ k_block = roundup(k_block, strategy::k_unroll());
- _newkblock=false;
- _x0 += _parent._x_block;
- if (_x0 >= _parent._Nsize) {
- _x0=0;
- _k0 += _parent._k_block;
- if (_k0 >= _parent._Ksize) {
- _done=true;
- return false;
- }
- _newkblock=true;
- }
- _index++;
+ return k_block;
+ }
- return true;
+ static unsigned int compute_n_block(const GemmArgs<Tr> &args) {
+ if (args._cfg && args._cfg->outer_block_size) {
+ return args._cfg->outer_block_size;
}
- unsigned int k0(void) { return _k0; }
- unsigned int x0(void) { return _x0; }
- unsigned int index(void) { return _index; }
- bool done(void) { return _done; }
- bool newkblock(void) { return _newkblock; }
- };
+ const unsigned int k_block = compute_k_block(args);
+ const unsigned int L2_size = args._ci->get_L2_cache_size();
+ // n_block: Work out how many rows (of length k_block) will fit in the L2
+ // Don't allocate more than 90% of the L2 to allow for overheads, and subtract off the L1 contents.
+ unsigned int n_block = (((L2_size * 9) / 10) - (k_block * sizeof(Toi) * (strategy::out_width() + strategy::out_height()))) /
+ (sizeof(Toi) * k_block);
+
+ // Needs to be (at least a single) multiple of the kernel output width.
+ n_block /= strategy::out_width();
+ n_block = std::max(n_block, 1U) * strategy::out_width();
+
+ // And tune to the presented problem size.
+ unsigned int numblocks = iceildiv(args._Nsize, n_block);
+ n_block = iceildiv(args._Nsize, numblocks);
+ n_block = roundup(n_block, strategy::out_width());
+
+ return n_block;
+ }
public:
GemmHybrid(GemmHybrid &) = delete;
@@ -130,71 +128,20 @@ public:
/* Constructor */
GemmHybrid(const GemmArgs<Tr> &args)
- : _ci(args._ci), _Msize(args._Msize), _Nsize(args._Nsize), _Ksize(args._Ksize), _nbatches(args._nbatches),
- _nmulti(args._nmulti), _trB(args._trB), _beta(args._beta) {
- const unsigned int L1_size = _ci->get_L1_cache_size();
- const unsigned int L2_size = _ci->get_L2_cache_size();
-
- _B_per_multi = (iceildiv(_Nsize, strategy::out_width()) * strategy::out_width()) *
- (iceildiv(_Ksize, strategy::k_unroll()) * strategy::k_unroll());
-
- // Work out blocking parameters, or override from config.
-
- if (args._cfg && args._cfg->inner_block_size) {
- _k_block = args._cfg->inner_block_size;
- } else {
- // k_block: Find out how much of the larger array can be loaded into half the cache.
- // This should account for associative caches.
- _k_block = (L1_size / 2) / (sizeof(Toi) * (std::max(strategy::out_width(), strategy::out_height())));
-
- // Needs to be (at least a single) multiple of the K unroll level.
- _k_block /= strategy::k_unroll();
- _k_block = std::max(_k_block, 1U) * strategy::k_unroll();
-
- // Now tune to presented problem size; this is how many blocks we need.
- int num_k_blocks = iceildiv(_Ksize, _k_block);
-
- // So divide the space equally into that many blocks.
- _k_block = iceildiv(_Ksize, num_k_blocks);
-
- // And round UP to the K unroll level required.
- _k_block = iceildiv(_k_block, strategy::k_unroll());
- _k_block *= strategy::k_unroll();
- }
-
- if (args._cfg && args._cfg->outer_block_size) {
- _x_block = args._cfg->outer_block_size;
- } else {
- // x_block: Work out how many rows (of length k_block) will fit in the L2
- // Don't allocate more than 90% of the L2 to allow for overheads, and subtract off the L1 contents.
- _x_block = (((L2_size * 9) / 10) - (_k_block * sizeof(Toi) * (strategy::out_width() + strategy::out_height()))) /
- (sizeof(Toi) * _k_block);
-
- // Needs to be (at least a single) multiple of the kernel output width.
- _x_block /= strategy::out_width();
- _x_block = std::max(_x_block, 1U) * strategy::out_width();
-
- // And tune to the presented problem size.
- int num_x_blocks = iceildiv(_Nsize, _x_block);
- _x_block = iceildiv(_Nsize, num_x_blocks);
-
- _x_block = iceildiv(_x_block, strategy::out_width());
- _x_block *= strategy::out_width();
- }
-
- // Work out the rounded size of M - needed for some buffers.
- _Mround = iceildiv(_Msize, strategy::out_height());
- _Mround *= strategy::out_height();
- }
+ : _ci(args._ci), _Msize(args._Msize), _Nsize(args._Nsize), _Ksize(args._Ksize),
+ _nbatches(args._nbatches), _nmulti(args._nmulti), _trB(args._trB), _beta(args._beta),
+ _k_block(compute_k_block(args)), _n_block(compute_n_block(args)),
+ _Mround(roundup(args._Msize, strategy::out_height())),
+ _window_range(iceildiv(args._Msize, strategy::out_height()), _nbatches, iceildiv(_Nsize, _n_block), _nmulti) { }
// Interface implementation - Compulsory functions
-
- // Window size: Only the last thread should do a ragged block, so dole
- // out work in units of out_height. Factor batches and multi into the
- // window too.
unsigned int get_window_size() const override {
- // _Mround is a multiple of out_height by definition.
- return (_Mround / strategy::out_height()) * _nbatches * _nmulti;
+ return _window_range.total_size();
+ }
+
+ // This kernel can always be dynamically scheduled.
+ bool supports_dynamic_scheduling() const override {
+ return true;
}
// Execute
@@ -206,50 +153,45 @@ public:
/* Make sure we've been set up correctly. */
assert(_B_transposed);
-
- const unsigned int window_per_batch = iceildiv(_Msize, strategy::out_height());
- const unsigned int window_per_multi = window_per_batch * _nbatches;
-
- const unsigned int first_multi = start / window_per_multi;
- const unsigned int last_multi = end / window_per_multi;
-
- const unsigned int first_batch = (start - (first_multi * window_per_multi)) / window_per_batch;
- const unsigned int last_batch = (end - (last_multi * window_per_multi)) / window_per_batch;
-
- const unsigned int first_row = ((start - (first_multi * window_per_multi)) % window_per_batch) * strategy::out_height();
- const unsigned int last_row = ((end - (last_multi * window_per_multi)) % window_per_batch) * strategy::out_height();
-
static_assert(std::is_same<To, Toi>::value, "gemm_native: Operand types must be the same.");
static_assert(std::is_same<Tr, Tri>::value, "gemm_native: Result types must be the same.");
- for (unsigned int multi = first_multi; multi <= last_multi; multi++) {
- const unsigned int batch_0 = (multi == first_multi) ? first_batch : 0;
- const unsigned int batch_max = (multi == last_multi) ? last_batch : (_nbatches - 1);
+ /* For now, each work item implies all the K for a given output
+ * pixel (so we don't need to synchronize access to the output
+ * array). So separate the loop over K blocks here. */
+ for (unsigned int k0=0; k0<_Ksize; k0+=_k_block) {
+ unsigned int kmax = std::min(k0 + _k_block, _Ksize);
+ unsigned int kern_k = roundup(kmax-k0, strategy::k_unroll());
- const Toi *b_panel = _B_transposed + (multi * _B_per_multi);
+ auto p = _window_range.iterator(start, end);
- for (blockwalker current(*this); !current.done(); current.advance()) {
- int kern_k = iceildiv(current.kmax() - current.k0(), strategy::k_unroll());
- kern_k *= strat.k_unroll();
+ if (p.done()) {
+ return;
+ }
- int bblocks = iceildiv(current.xmax() - current.x0(), strategy::out_width());
+ do {
+ const unsigned int m_start = p.dim(0) * strategy::out_height();
+ const unsigned int m_end = std::min(p.dim0_max() * strategy::out_height(), _Msize);
+ const unsigned int batch = p.dim(1);
+ const unsigned int n0 = p.dim(2) * _n_block;
+ const unsigned int nmax = std::min(n0 + _n_block, _Nsize);
+ const unsigned int multi = p.dim(3);
+
+ const Toi *b_panel = _B_transposed +
+ (multi * roundup(_Nsize, strategy::out_width()) * roundup(_Ksize, strategy::k_unroll())) +
+ (k0 * roundup(_Nsize, strategy::out_width())) +
+ (n0 * kern_k);
- for (unsigned int batch = batch_0; batch <= batch_max; batch++) {
- const unsigned int m_start = ((multi == first_multi) && (batch == first_batch)) ? first_row : 0;
- const unsigned int m_end = ((multi == last_multi) && (batch == last_batch) ) ? last_row : _Msize;
#ifdef CYCLE_PROFILING
- auto p = prof.ScopedProfiler(PROFILE_KERNEL, (m_end - m_start) * kern_k * bblocks * strategy::out_width());
+ auto p = prof.ScopedProfiler(PROFILE_KERNEL, (m_end - m_start) * kern_k * roundup(nmax-n0, strategy::out_width()));
#endif
- strat.kernel(this->_Aptr + (multi * this->_A_multi_stride) + (batch * this->_A_batch_stride) + (m_start * this->_lda) + current.k0(), this->_lda,
- b_panel,
- this->_Cptr + (multi * this->_C_multi_stride) + (batch * this->_C_batch_stride) + (m_start * this->_ldc) + current.x0(), this->_ldc,
- (current.k0() == 0) ? _beta : static_cast<Tr>(1),
- (m_end - m_start), (current.xmax() - current.x0()), kern_k);
- }
-
- b_panel += (bblocks * strat.out_width() * kern_k);
- }
+ strat.kernel(this->_Aptr + (multi * this->_A_multi_stride) + (batch * this->_A_batch_stride) + (m_start * this->_lda) + k0, this->_lda,
+ b_panel,
+ this->_Cptr + (multi * this->_C_multi_stride) + (batch * this->_C_batch_stride) + (m_start * this->_ldc) + n0, this->_ldc,
+ (k0 == 0) ? _beta : static_cast<Tr>(1),
+ (m_end - m_start), (nmax - n0), kern_k);
+ } while (p.next_dim1());
}
}
@@ -263,35 +205,31 @@ public:
}
size_t get_B_pretransposed_array_size() const override {
- return _B_per_multi * _nmulti * sizeof(Toi);
+ return roundup(_Nsize, strategy::out_width()) * roundup(_Ksize, strategy::k_unroll()) * _nmulti * sizeof(Toi);
}
+ using GemmCommon<To, Tr>::pretranspose_B_array;
void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override {
Toi *buffer = reinterpret_cast<Toi *>(in_buffer);
_B_transposed = buffer;
strategy strat(_ci);
- for (unsigned int multi=0; multi < _nmulti; multi++) {
- blockwalker current(*this);
-
- do {
- /* Figure out the size of each block. */
- size_t x_size = (current.xmax() - current.x0());
- size_t k_size = (current.kmax() - current.k0());
+ for (unsigned int multi=0; multi<_nmulti; multi++) {
+ for (unsigned int k0=0; k0<_Ksize; k0+=_k_block) {
+ const unsigned int kmax = std::min(k0 + _k_block, _Ksize);
+ const unsigned int k_size = roundup(kmax-k0, strategy::k_unroll());
- /* Round sizes up as needed. */
- x_size = iceildiv(x_size, strategy::out_width());
- x_size *= strategy::out_width();
+ for (unsigned int x0=0; x0<_Nsize; x0+=_n_block) {
+ const unsigned int xmax = std::min(x0+_n_block, _Nsize);
- k_size = iceildiv(k_size, strategy::k_unroll());
- k_size *= strategy::k_unroll();
+ const unsigned int size = roundup(xmax-x0, strategy::out_width()) * k_size;
- strat.transforms.PrepareB(
- buffer, B + (multi * B_multi_stride), ldb,
- current.x0(), current.xmax(), current.k0(), current.kmax(), _trB);
+ strat.transforms.PrepareB( buffer, B + (multi * B_multi_stride), ldb,
+ x0, xmax, k0, kmax, _trB);
- buffer += (x_size * k_size);
- } while (current.advance());
+ buffer += size;
+ }
+ }
}
}
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp
index 34dc8bc341..5811c2a1ce 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,6 +25,7 @@
#include "arm_gemm.hpp"
#include "gemm_common.hpp"
+#include "gemm_hybrid.hpp"
#include "gemm_implementation.hpp"
#include "gemm_interleaved.hpp"
#include "gemm_native.hpp"
@@ -32,6 +33,7 @@
#include "kernels/a64_gemm_s16_12x8.hpp"
#include "kernels/a64_gemm_s8_12x8.hpp"
#include "kernels/a64_gemm_s8_4x4.hpp"
+#include "kernels/a64_hybrid_s8s32_dot_16x4.hpp"
#include "kernels/sve_interleaved_s8s32_dot_3VLx8.hpp"
#include "kernels/sve_native_s8s32_dot_4VLx4.hpp"
@@ -55,6 +57,13 @@ static const GemmImplementation<int8_t, int32_t> gemm_s8_methods[] = {
},
#endif
{
+ GemmMethod::GEMM_HYBRID,
+ "hybrid_s8s32_dot_16x4",
+ [](const GemmArgs<int32_t> &args) { return args._ci->has_dotprod() && args._Ksize>=16 && (args._Ksize % 16 == 0) && (args._Nsize % 16 == 0) && !args._trA && !args._trB && args._pretransposed_hint; },
+ [](const GemmArgs<int32_t> &args) { return args._Nsize<=256 && args._Ksize>128; },
+ [](const GemmArgs<int32_t> &args) { return new GemmHybrid<hybrid_s8s32_dot_16x4, int8_t, int32_t>(args); }
+},
+{
GemmMethod::GEMM_INTERLEAVED,
"gemm_s8_12x8",
[](const GemmArgs<int32_t> &args) { return args._ci->has_dotprod(); },
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
index 436438f351..b83ccd3407 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
@@ -340,7 +340,7 @@ public:
_k_block = std::max(_k_block, 1U) * strategy::k_unroll();
// Now tune to presented problem size; this is how many blocks we need.
- int num_k_blocks = iceildiv(_Ksize, _k_block);
+ unsigned int num_k_blocks = iceildiv(_Ksize, _k_block);
// So divide the space equally into that many blocks.
_k_block = iceildiv(_Ksize, num_k_blocks);
@@ -363,7 +363,7 @@ public:
_x_block = std::max(_x_block, 1U) * strategy::out_width();
// And tune to the presented problem size.
- int num_x_blocks = iceildiv(_Nsize, _x_block);
+ unsigned int num_x_blocks = iceildiv(_Nsize, _x_block);
_x_block = iceildiv(_Nsize, num_x_blocks);
_x_block = iceildiv(_x_block, strategy::out_width());
@@ -464,8 +464,8 @@ public:
do {
/* Figure out the size of each block. */
- size_t x_size = (current.xmax() - current.x0());
- size_t k_size = (current.kmax() - current.k0());
+ unsigned int x_size = (current.xmax() - current.x0());
+ unsigned int k_size = (current.kmax() - current.k0());
/* Round sizes up as needed. */
x_size = iceildiv(x_size, strategy::out_width());
@@ -480,6 +480,7 @@ public:
return total;
}
+ using GemmCommon<To, Tr>::pretranspose_B_array;
void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override {
blockwalker current(*this);
Toi *buffer = reinterpret_cast<Toi *>(in_buffer);
@@ -488,8 +489,8 @@ public:
do {
/* Figure out the size of each block. */
- size_t x_size = (current.xmax() - current.x0());
- size_t k_size = (current.kmax() - current.k0());
+ unsigned int x_size = (current.xmax() - current.x0());
+ unsigned int k_size = (current.kmax() - current.k0());
/* Round sizes up as needed. */
x_size = iceildiv(x_size, strategy::out_width());
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_native.hpp b/src/core/NEON/kernels/arm_gemm/gemm_native.hpp
index 579533418d..98516b1ca6 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_native.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_native.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,8 +27,7 @@
#include "arm_gemm.hpp"
-#include "mergeresults.hpp"
-#include "transform.hpp"
+#include "ndrange.hpp"
#ifdef CYCLE_PROFILING
#include "profiler.hpp"
@@ -55,19 +54,25 @@ class GemmNative : public GemmCommon<To, Tr> {
const unsigned int _nbatches;
const unsigned int _nmultis;
- Tr _beta;
+ const Tr _beta;
const CPUInfo * const _ci;
- unsigned int k_block=0;
- unsigned int n_block=0;
+ const unsigned int _k_block;
+ const unsigned int _n_block;
- unsigned int window_per_batch() const {
- return iceildiv(_Msize, strategy::out_height());
+ const NDRange<4> _window_range;
+
+ static unsigned int compute_k_block(const GemmArgs<Tr> &args) {
+ return args._Ksize;
}
- unsigned int window_per_multi() const {
- return window_per_batch() * _nbatches;
+ static unsigned int compute_n_block(const GemmArgs<Tr> &args) {
+ if ((args._cfg != nullptr) && args._cfg->outer_block_size > 0) {
+ return args._cfg->outer_block_size;
+ } else {
+ return args._Nsize;
+ }
}
public:
@@ -75,15 +80,20 @@ public:
GemmNative & operator= (GemmNative &) = delete;
GemmNative(const GemmArgs<Tr> &args)
- : _Msize(args._Msize), _Nsize(args._Nsize), _Ksize(args._Ksize), _nbatches(args._nbatches), _nmultis(args._nmulti), _beta(args._beta), _ci(args._ci) {
- /* For now don't do any blocking. TODO: figure out if we should. */
- k_block = _Ksize;
- n_block = _Nsize;
- }
+ : _Msize(args._Msize), _Nsize(args._Nsize), _Ksize(args._Ksize),
+ _nbatches(args._nbatches), _nmultis(args._nmulti),
+ _beta(args._beta), _ci(args._ci),
+ _k_block(compute_k_block(args)), _n_block(compute_n_block(args)),
+ _window_range(iceildiv(_Msize, strategy::out_height()), _nbatches, iceildiv(_Nsize, _n_block), _nmultis) { }
// Window is amount per multi multiplied by total number of multis.
unsigned int get_window_size() const override {
- return window_per_multi() * _nmultis;
+ return _window_range.total_size();
+ }
+
+ // Native GEMMs can always be dynamically scheduled (whether requested or not)
+ bool supports_dynamic_scheduling() const override {
+ return true;
}
// Actually execute the GEMM.
@@ -96,45 +106,30 @@ public:
static_assert(std::is_same<To, Toi>::value, "gemm_native: Operand types must be the same.");
static_assert(std::is_same<Tr, Tri>::value, "gemm_native: Result types must be the same.");
- /* Compute starting point based on 'start' */
- unsigned int multi = start / window_per_multi();
- unsigned int multi_pos = start % window_per_multi();
+ auto p = _window_range.iterator(start, end);
- unsigned int batch = multi_pos / window_per_batch();
- unsigned int batch_pos = multi_pos % window_per_batch();
-
- unsigned int y0 = batch_pos * strategy::out_height();
-
- for (unsigned int l=end-start; l>0; ) {
- // Do work from here to the end of the current batch/multi
- const unsigned int ymax = std::min(y0 + (l * strategy::out_height()), _Msize);
+ if (p.done()) {
+ return;
+ }
- // Work out how many units this is and subtract from loop counter.
- l -= ((ymax - y0) + (strategy::out_height() - 1)) / strategy::out_height();
+ do {
+ unsigned int y0 = p.dim(0) * strategy::out_height();
+ unsigned int ymax = std::min(p.dim0_max() * strategy::out_height(), _Msize);
+ unsigned int batch = p.dim(1);
+ unsigned int n0 = p.dim(2) * _n_block;
+ unsigned int nmax = std::min(n0 + _n_block, _Nsize);
+ unsigned int multi = p.dim(3);
#ifdef CYCLE_PROFILING
- auto p = prof.ScopedProfiler(PROFILE_KERNEL, (ymax-y0) * _Nsize * _Ksize);
+ auto p = prof.ScopedProfiler(PROFILE_KERNEL, (ymax-y0) * (nmax - n0) * _Ksize);
#endif
strat.kernel(this->_Aptr + (multi * this->_A_multi_stride) + (batch * this->_A_batch_stride) + (y0 * this->_lda), this->_lda,
- this->_Bptr + (multi * this->_B_multi_stride), this->_ldb,
- this->_Cptr + (multi * this->_C_multi_stride) + (batch * this->_C_batch_stride) + (y0 * this->_ldc), this->_ldc,
- _beta, (ymax-y0), _Nsize, _Ksize);
-
- /* Advance to next item */
- y0 = ymax;
-
- /* Check for batch/multi overflow */
- if (y0 >= _Msize) {
- y0=0;
- batch++;
- if (batch == _nbatches) {
- batch=0;
- multi++;
- }
- }
- }
+ this->_Bptr + (multi * this->_B_multi_stride) + n0, this->_ldb,
+ this->_Cptr + (multi * this->_C_multi_stride) + (batch * this->_C_batch_stride) + (y0 * this->_ldc) + n0, this->_ldc,
+ _beta, (ymax-y0), (nmax - n0), _Ksize);
+ } while (p.next_dim1());
}
};
-} // namespace arm_gemm
+} // namespace arm_gemm
\ No newline at end of file
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp
index 3c8df3f044..b95ca8016b 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,11 +27,13 @@
#include "gemm_common.hpp"
#include "gemm_implementation.hpp"
#include "gemm_interleaved.hpp"
+#include "gemm_hybrid.hpp"
#include "gemm_native.hpp"
#include "kernels/a64_gemm_u16_12x8.hpp"
#include "kernels/a64_gemm_u8_12x8.hpp"
#include "kernels/a64_gemm_u8_4x4.hpp"
+#include "kernels/a64_hybrid_u8u32_dot_16x4.hpp"
#include "kernels/sve_interleaved_u8u32_dot_3VLx8.hpp"
#include "kernels/sve_native_u8u32_dot_4VLx4.hpp"
@@ -55,6 +57,13 @@ static const GemmImplementation<uint8_t, uint32_t> gemm_u8_methods[] = {
},
#endif
{
+ GemmMethod::GEMM_HYBRID,
+ "hybrid_u8u32_dot_16x4",
+ [](const GemmArgs<uint32_t> &args) { return args._ci->has_dotprod() && args._Ksize>=16 && (args._Ksize % 16 == 0) && (args._Nsize % 16 == 0) && !args._trA && !args._trB && args._pretransposed_hint; },
+ [](const GemmArgs<uint32_t> &args) { return args._Nsize<=256 && args._Ksize>128; },
+ [](const GemmArgs<uint32_t> &args) { return new GemmHybrid<hybrid_u8u32_dot_16x4, uint8_t, uint32_t>(args); }
+},
+{
GemmMethod::GEMM_INTERLEAVED,
"gemm_u8_12x8",
[](const GemmArgs<uint32_t> &args) { return args._ci->has_dotprod(); },
diff --git a/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp b/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp
index 40f7f2b7cd..32d668f66d 100644
--- a/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp
@@ -44,6 +44,7 @@ public:
_subgemm = gemm<To,Tr>(newargs);
}
+ using GemmCommon<To, Tr>::set_arrays;
void set_arrays(const To *A, const int lda, const int A_batch_stride, const int A_multi_stride,
const To *B, const int ldb, const int B_multi_stride,
Tr *C, const int ldc, const int C_batch_stride, const int C_multi_stride) override {
@@ -85,6 +86,7 @@ public:
return _subgemm->get_B_pretransposed_array_size();
}
+ using GemmCommon<To, Tr>::pretranspose_B_array;
void pretranspose_B_array(void *buffer, const To *B, const int ldb, const int B_multi_stride) override {
_subgemm->pretranspose_B_array(buffer, B, ldb, B_multi_stride);
}
diff --git a/src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp b/src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp
index 5cf42761e6..5ebc6342d7 100644
--- a/src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp
@@ -73,7 +73,7 @@ public:
// Window is number of out_width blocks times number of multis.
unsigned int get_window_size() const override {
- return iceildiv(_Nsize, strategy::out_width) * _nmultis;
+ return iceildiv(_Nsize, strategy::out_width()) * _nmultis;
}
// Actually execute the GEMV.
@@ -83,12 +83,12 @@ public:
#endif
strategy strat(_ci);
- const unsigned int window_per_multi = iceildiv(_Nsize, strategy::out_width);
+ const unsigned int window_per_multi = iceildiv(_Nsize, strategy::out_width());
const unsigned int multi_0 = start / window_per_multi;
const unsigned int multi_end = end / window_per_multi;
- const unsigned int n_0 = (start - (multi_0 * window_per_multi)) * strategy::out_width;
- const unsigned int n_max = (end - (multi_end * window_per_multi)) * strategy::out_width;
+ const unsigned int n_0 = (start - (multi_0 * window_per_multi)) * strategy::out_width();
+ const unsigned int n_max = (end - (multi_end * window_per_multi)) * strategy::out_width();
static_assert(std::is_same<To, Toi>::value, "gemv_transposed: Operand types must be the same.");
static_assert(std::is_same<Tr, Tri>::value, "gemv_transposed: Result types must be the same.");
diff --git a/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp b/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp
index 842339ef23..f7beb0a34c 100644
--- a/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp
@@ -70,7 +70,7 @@ public:
GemvPretransposed(const GemmArgs<Tr> &args)
: _Nsize(args._Nsize), _Ksize(args._Ksize), _nmultis(args._nmulti), _trB(args._trB), _beta(args._beta), _ci(args._ci),
- _buffer_per_multi(_Ksize * iceildiv(_Nsize, strategy::A_interleave) * strategy::A_interleave) {
+ _buffer_per_multi(_Ksize * iceildiv(_Nsize, strategy::A_interleave()) * strategy::A_interleave()) {
/* For now don't do any blocking. TODO: figure out if we should. */
if (args._cfg && args._cfg->inner_block_size) {
m_block = args._cfg->inner_block_size;
@@ -87,7 +87,7 @@ public:
// Window is number of out_width blocks, times number of multis.
unsigned int get_window_size() const override {
- return iceildiv(_Nsize, strategy::out_width) * _nmultis;
+ return iceildiv(_Nsize, strategy::out_width()) * _nmultis;
}
// Actually execute the GEMV.
@@ -98,13 +98,13 @@ public:
strategy strat(_ci);
/* Break the window values down into multis of interest... */
- const unsigned int window_per_multi = iceildiv(_Nsize, strategy::out_width);
+ const unsigned int window_per_multi = iceildiv(_Nsize, strategy::out_width());
const unsigned int multi_0 = start / window_per_multi;
const unsigned int multi_end = end / window_per_multi;
/* ... and figure out where we start and end in the first and last multi. */
- const unsigned int n_0 = (start - (multi_0 * window_per_multi)) * strategy::out_width;
- const unsigned int n_max = (end - (multi_end * window_per_multi)) * strategy::out_width;
+ const unsigned int n_0 = (start - (multi_0 * window_per_multi)) * strategy::out_width();
+ const unsigned int n_max = (end - (multi_end * window_per_multi)) * strategy::out_width();
static_assert(std::is_same<Tr, Tri>::value, "GemvPretransposed: Result types must be the same.");
@@ -124,8 +124,8 @@ public:
auto p = prof.ScopedProfiler(PROFILE_KERNEL, (mmax-m0) * (nmax-n));
#endif
/* This assumes that the underlying call was a GEMM with M=1; for the N=1 case we would have to pick up this->_Bptr below instead */
- strat.kernel(_A_pretransposed + (multi * _buffer_per_multi) + (n * _Ksize) + (m0 * strategy::A_interleave),
- (_Ksize * strategy::A_interleave),
+ strat.kernel(_A_pretransposed + (multi * _buffer_per_multi) + (n * _Ksize) + (m0 * strategy::A_interleave()),
+ (_Ksize * strategy::A_interleave()),
this->_Aptr + (multi * this->_A_multi_stride) + m0,
this->_Cptr + (multi * this->_C_multi_stride) + n,
_beta, (mmax-m0), (nmax-n));
@@ -148,6 +148,7 @@ public:
return _buffer_per_multi * _nmultis * sizeof(To);
}
+ using GemmCommon<To, Tr>::pretranspose_B_array;
void pretranspose_B_array(void *buffer, const To *B, const int ldb, const int B_multi_stride) override {
Toi *A_buffer = reinterpret_cast<Toi *>(buffer);
@@ -155,10 +156,10 @@ public:
/* Reverse sense here as we are dealing with B rather than A. So if
* strategy::A_transpose is false and _trB is false, we still
* transpose. */
- if (_trB ^ strategy::A_transpose) {
- Transform<strategy::A_interleave, strategy::A_block, false>(A_buffer + (multi * _buffer_per_multi), B + (multi * B_multi_stride), ldb, 0, _Nsize, 0, _Ksize);
+ if (_trB ^ strategy::A_transpose()) {
+ Transform<strategy::A_interleave(), strategy::A_block(), false>(A_buffer + (multi * _buffer_per_multi), B + (multi * B_multi_stride), ldb, 0, _Nsize, 0, _Ksize);
} else {
- Transform<strategy::A_interleave, strategy::A_block, true>(A_buffer + (multi * _buffer_per_multi), B + (multi * B_multi_stride), ldb, 0, _Nsize, 0, _Ksize);
+ Transform<strategy::A_interleave(), strategy::A_block(), true>(A_buffer + (multi * _buffer_per_multi), B + (multi * B_multi_stride), ldb, 0, _Nsize, 0, _Ksize);
}
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6.hpp
index 06e62456dc..234972270c 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -50,15 +50,15 @@ public:
typedef void (*kern_type)(const float *, const float *, float *, int, int, int);
/* Kernel blocking parameters */
- static int out_width() {
+ static unsigned int out_width() {
return 8;
}
- static int out_height() {
+ static unsigned int out_height() {
return 6;
}
- static int k_unroll() {
+ static unsigned int k_unroll() {
return 1;
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8.hpp
index 95a2bc2fbc..2fcb587df1 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -48,15 +48,15 @@ public:
typedef void (*kern_type)(const int16_t *, const int16_t *, int32_t *, int, int, int);
/* Kernel blocking parameters */
- static int out_width() {
+ static unsigned int out_width() {
return 12;
}
- static int out_height() {
+ static unsigned int out_height() {
return 8;
}
- static int k_unroll() {
+ static unsigned int k_unroll() {
return 1;
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8.hpp
index fdc0200435..cc205dc6e3 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -43,15 +43,15 @@ public:
typedef void (*kern_type)(const int8_t *, const int8_t *, int32_t *, int, int, int);
/* Kernel blocking parameters */
- static int out_width() {
+ static unsigned int out_width() {
return 12;
}
- static int out_height() {
+ static unsigned int out_height() {
return 8;
}
- static int k_unroll() {
+ static unsigned int k_unroll() {
return 4;
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp
index be7ead9f48..71c666ad00 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -42,15 +42,15 @@ public:
typedef void (*kern_type)(const int8_t *, const int8_t *, int32_t *, int, int, int);
/* Kernel blocking parameters */
- static int out_width() {
+ static unsigned int out_width() {
return 4;
}
- static int out_height() {
+ static unsigned int out_height() {
return 4;
}
- static int k_unroll() {
+ static unsigned int k_unroll() {
return 16;
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8.hpp
index d2692ba77f..3d5c92c622 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -48,15 +48,15 @@ public:
typedef void (*kern_type)(const uint16_t *, const uint16_t *, uint32_t *, int, int, int);
/* Kernel blocking parameters */
- static int out_width() {
+ static unsigned int out_width() {
return 12;
}
- static int out_height() {
+ static unsigned int out_height() {
return 8;
}
- static int k_unroll() {
+ static unsigned int k_unroll() {
return 1;
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8.hpp
index a252abfd3e..9032ba67b3 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -53,15 +53,15 @@ public:
static const bool B_transpose = true;
/* Kernel blocking parameters */
- static int out_width() {
+ static unsigned int out_width() {
return 12;
}
- static int out_height() {
+ static unsigned int out_height() {
return 8;
}
- static int k_unroll() {
+ static unsigned int k_unroll() {
return 4;
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp
index 2da3ecd4f8..fda7657b2b 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -50,15 +50,15 @@ public:
static const bool B_transpose = true;
/* Kernel blocking parameters */
- static int out_width() {
+ static unsigned int out_width() {
return 4;
}
- static int out_height() {
+ static unsigned int out_height() {
return 4;
}
- static int k_unroll() {
+ static unsigned int k_unroll() {
return 16;
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8.hpp
index 911a4ebb01..5b850b7a20 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -47,15 +47,15 @@ public:
typedef void (*kern_type)(const __fp16 *, const __fp16 *, __fp16 *, int, int, int);
/* Kernel blocking parameters */
- static int out_width() {
+ static unsigned int out_width() {
return 24;
}
- static int out_height() {
+ static unsigned int out_height() {
return 8;
}
- static int k_unroll() {
+ static unsigned int k_unroll() {
return 1;
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4.hpp
new file mode 100644
index 0000000000..c8934dff8a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4.hpp
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+#include <cstdint>
+#include "../std_transforms_fixed.hpp"
+
+namespace arm_gemm
+{
+
+// Actual kernel implementations
+void a64_hybrid_s8s32_dot_16x4(const int8_t *, int, const int8_t *, int32_t *, int, int32_t, int, int, int);
+void a64_hybrid_s8s32_dot_16x4_a55(const int8_t *, int, const int8_t *, int32_t *, int, int32_t, int, int, int);
+
+class hybrid_s8s32_dot_16x4 // Strategy descriptor: operand/result types, blocking parameters and the kernel entry point to call.
+{
+public:
+ typedef int8_t operand_type;
+ typedef int32_t result_type;
+
+ typedef void (*kern_type)(const int8_t *, int, const int8_t *, int32_t *, int, int32_t, int, int, int); // Argument order per the kernel definitions: (A, lda, B, C, ldc, beta, M, N, K).
+
+ /* Kernel blocking parameters */
+ static unsigned int out_height()
+ {
+ return 4; // The A55 kernel walks M in steps of 4 rows.
+ }
+
+ static unsigned int out_width()
+ {
+ return 16; // The A55 kernel walks N in steps of 16 columns.
+ }
+
+ static unsigned int k_unroll()
+ {
+ return 4; // The A55 kernel rounds K up to a multiple of 4 when computing its B stride.
+ }
+
+ StdTransformsFixed<operand_type, result_type, 4, 16, 4> transforms = {}; // NOTE(review): 4, 16, 4 presumably mirror out_height/out_width/k_unroll - confirm against std_transforms_fixed.hpp.
+
+ // Default to the generic kernel
+ kern_type kernel=a64_hybrid_s8s32_dot_16x4;
+
+ hybrid_s8s32_dot_16x4(const CPUInfo *ci)
+ {
+ if (ci->get_cpu_model() == CPUModel::A55r1) { // Only this CPU model replaces the default set above.
+ kernel = a64_hybrid_s8s32_dot_16x4_a55; // Variant defined in a64_hybrid_s8s32_dot_16x4/a55.cpp.
+ }
+ }
+};
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/a55.cpp
new file mode 100644
index 0000000000..48bf842ca5
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/a55.cpp
@@ -0,0 +1,2271 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <algorithm>
+
+#include <cstdint>
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void a64_hybrid_s8s32_dot_16x4_a55(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int32_t beta, int M, int N, int K) {
+ const long beta0 = (beta == 0);
+ const int K_stride = ((K + 3) / 4) * 4;
+ const long loops_count = ((K + 16) / 32) - 1;
+ K -= loops_count * 32;
+ const long regs_count = (K / 16) - 1;
+
+ for (int y=0; y<M; y+=4) {
+ const int8_t * const a_ptr0_base = A + (y * lda);
+ const unsigned long ldab = lda * sizeof(int8_t);
+
+ int32_t *c_ptr0 = C + (y * ldc);
+ const unsigned long ldcb = ldc * sizeof(int32_t);
+
+ for (int x0=0; x0<N; x0+=16ul) {
+ const long width = std::min((unsigned long)N-x0, 16ul);
+ const int32_t *betaptr = &beta;
+ long loops = loops_count;
+ long regs = regs_count;
+ const int8_t *a_ptr0 = a_ptr0_base;
+ const int8_t *b_ptr0 = B + (K_stride * x0);
+
+ switch(M-y) {
+ case 1:
+ __asm __volatile (
+ "temploadreg0 .req X0\n"
+ "temploadreg1 .req X1\n"
+ "temploadreg2 .req X2\n"
+ "temploadreg3 .req X3\n"
+ "cbz %[beta0], 1f\n"
+ "movi v16.4s, #0\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "movi v17.4s, #0\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "movi v18.4s, #0\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "movi v19.4s, #0\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1r {v15.4s}, [%[betaptr]]\n"
+ "ldr q16, [%[c_ptr0]]\n"
+ "ldr q17, [%[c_ptr0], #0x10]\n"
+ "ldr q18, [%[c_ptr0], #0x20]\n"
+ "ldr q19, [%[c_ptr0], #0x30]\n"
+ "mul v16.4s, v16.4s, v15.4s\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "mul v17.4s, v17.4s, v15.4s\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "mul v18.4s, v18.4s, v15.4s\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "mul v19.4s, v19.4s, v15.4s\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ins v4.d[1], temploadreg0\n"
+ "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ "ldr d0, [%[a_ptr0], #-0x10]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+ "ins v0.d[1], temploadreg0\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ "ins v9.d[1], temploadreg1\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ "ins v11.d[1], temploadreg3\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "ins v14.d[1], temploadreg2\n"
+ "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "cbz %[regs], 4f\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ "ins v4.d[1], temploadreg0\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ "ldr d0, [%[a_ptr0], #0x10]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x18]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+ "ins v0.d[1], temploadreg0\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+ ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+ ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+ ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+ "b 5f\n"
+ "4:\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ "ins v4.d[1], temploadreg0\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ "5:\n"
+ "str q16, [%[c_ptr0]]\n"
+ "str q17, [%[c_ptr0], #0x10]\n"
+ "str q18, [%[c_ptr0], #0x20]\n"
+ "str q19, [%[c_ptr0], #0x30]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x40\n"
+ ".unreq temploadreg0\n"
+ ".unreq temploadreg1\n"
+ ".unreq temploadreg2\n"
+ ".unreq temploadreg3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory"
+ );
+ break;
+ case 2: // Two-row variant: row 0 goes through %[a_ptr0]/%[c_ptr0], row 1 through a_ptr1/c_ptr1. NOTE(review): the switch selector is outside this view - presumably the number of rows handled; confirm.
+ __asm __volatile ( // Register roles below: v16-v19 accumulate row 0, v20-v23 accumulate row 1 (16 x 32-bit results each); v8-v15 hold B data; v0/v1 and v4/v5 hold A data for rows 0/1.
+ "a_ptr1 .req X0\n" // x0: read pointer for the second A row
+ "c_ptr1 .req X1\n" // x1: pointer to the second C row
+ "temploadreg0 .req X2\n" // x2-x5: scratch GPRs that carry the upper 64 bits of vectors loaded in two halves
+ "temploadreg1 .req X3\n"
+ "temploadreg2 .req X4\n"
+ "temploadreg3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n" // second A row = first row + lda (operand is bound to ldab; presumably a byte stride - confirm)
+ "add c_ptr1, %[c_ptr0], %[ldc]\n" // second C row = first row + ldc (operand is bound to ldcb)
+ "cbz %[beta0], 1f\n" // beta0 == 0 -> label 1 scales the existing C values; otherwise fall through and start from zeroed accumulators
+ "movi v16.4s, #0\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "movi v17.4s, #0\n"
+ "ldr q1, [a_ptr1]\n"
+ "movi v18.4s, #0\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "movi v19.4s, #0\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "movi v20.4s, #0\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "movi v21.4s, #0\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "movi v22.4s, #0\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "movi v23.4s, #0\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n" // v14 is loaded in halves: low 64 bits here, high 64 bits via temploadreg2 (inserted at label 3 or label 2)
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n" // b_ptr0 now sits 0x80 past the start; v15 is fetched later using negative offsets
+ "cbz %[loops], 2f\n" // no full loop iterations requested: go straight to the tail
+ "b 3f\n" // skip over the beta-scaling preamble
+ "1:\n" // beta path: load both C rows and multiply them by the 32-bit value broadcast from *betaptr
+ "ld1r {v15.4s}, [%[betaptr]]\n" // v15 = beta replicated across four 32-bit lanes
+ "ldr q16, [%[c_ptr0]]\n"
+ "ldr q17, [%[c_ptr0], #0x10]\n"
+ "ldr q18, [%[c_ptr0], #0x20]\n"
+ "ldr q19, [%[c_ptr0], #0x30]\n"
+ "mul v16.4s, v16.4s, v15.4s\n"
+ "ldr q20, [c_ptr1]\n"
+ "mul v17.4s, v17.4s, v15.4s\n"
+ "ldr q21, [c_ptr1, #0x10]\n"
+ "mul v18.4s, v18.4s, v15.4s\n"
+ "ldr q22, [c_ptr1, #0x20]\n"
+ "mul v19.4s, v19.4s, v15.4s\n"
+ "ldr q23, [c_ptr1, #0x30]\n"
+ "mul v20.4s, v20.4s, v15.4s\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "mul v21.4s, v21.4s, v15.4s\n"
+ "ldr q1, [a_ptr1]\n"
+ "mul v22.4s, v22.4s, v15.4s\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "mul v23.4s, v23.4s, v15.4s\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "3:\n" // main loop: each iteration consumes 0x20 bytes from each A row and 0x200 bytes of B (two 0x100 advances)
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" // sdot is emitted as a raw encoding with the mnemonic kept as an asm comment (presumably for assemblers without dot-product support - confirm)
+ "ins v14.d[1], temploadreg2\n" // complete v14 with the high half that was loaded before entering/repeating the loop
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr d4, [%[a_ptr0]]\n" // start fetching the second A chunk (v4/v5) while v0/v1 are still being consumed
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ "ldr d5, [a_ptr1]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ "ldr temploadreg1, [a_ptr1, #0x8]\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ "ins v4.d[1], temploadreg0\n"
+ ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+ "ins v5.d[1], temploadreg1\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "subs %[loops], %[loops], #0x1\n" // loop counter; the flags are consumed by b.ne at the bottom (nothing in between sets flags)
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n" // prefetch further along A row 0
+ "ins v15.d[1], temploadreg3\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n" // advance A row 0; later reloads of v0 use negative offsets
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "prfm PLDL1KEEP, [a_ptr1, #0x40]\n" // prefetch further along A row 1
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n" // first 0x100 advance of B in this iteration; the following loads use negative offsets
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n" // second half of the iteration: same pattern, now multiplying by v4/v5
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
+ "ldr d0, [%[a_ptr0], #-0x10]\n" // reload v0 for the next iteration (a_ptr0 has already been advanced by 0x20)
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
+ ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
+ "ldr d1, [a_ptr1, #-0x10]\n" // reload v1 for the next iteration
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+ "ldr temploadreg1, [a_ptr1, #-0x8]\n"
+ ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ "ins v0.d[1], temploadreg0\n"
+ ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ "ins v1.d[1], temploadreg1\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n" // second 0x100 advance of B in this iteration
+ ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n" // from here on the loads prepare v8-v14 for the next iteration (or for the tail at label 2)
+ ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ "ins v9.d[1], temploadreg1\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n" // high half of v14 stays in temploadreg2 until the ins at label 3 / label 2
+ "ins v11.d[1], temploadreg3\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ "b.ne 3b\n" // repeat while the decremented loops counter is non-zero
+ "2:\n" // tail: finish the pending v14/v15 loads, then process one or two remaining A chunks depending on regs
+ "ins v14.d[1], temploadreg2\n"
+ "prfm PSTL1KEEP, [%[c_ptr0]]\n" // prefetch-for-store on both output rows ahead of the write-back at label 5
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "prfm PSTL1KEEP, [c_ptr1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "cbz %[regs], 4f\n" // regs == 0 -> single-chunk tail at label 4; otherwise the two-chunk tail below (v0/v1 then v4/v5)
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr d5, [a_ptr1]\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr temploadreg1, [a_ptr1, #0x8]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ "ins v4.d[1], temploadreg0\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ "ins v5.d[1], temploadreg1\n"
+ ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
+ "ldr d0, [%[a_ptr0], #0x10]\n" // NOTE(review): v0/v1 are reloaded here, but no later sdot on this path reads them before the stores at label 5 - confirm these loads are needed and in bounds
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x18]\n"
+ ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
+ "ldr d1, [a_ptr1, #0x10]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+ "ldr temploadreg1, [a_ptr1, #0x18]\n"
+ ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ "ins v0.d[1], temploadreg0\n"
+ ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ "ins v1.d[1], temploadreg1\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+ ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
+ ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
+ ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
+ ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
+ "b 5f\n" // two-chunk tail done: skip the single-chunk tail
+ "4:\n" // single-chunk tail: only v0/v1 feed the dot products on this path
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr d4, [%[a_ptr0]]\n" // NOTE(review): v4/v5 are loaded on this path, but no later sdot reads them before label 5 - confirm these loads are needed and in bounds
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr d5, [a_ptr1]\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr temploadreg1, [a_ptr1, #0x8]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ "ins v4.d[1], temploadreg0\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ "ins v5.d[1], temploadreg1\n"
+ ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+ "5:\n" // write-back: v16-v19 to row 0, v20-v23 to row 1
+ "str q16, [%[c_ptr0]]\n"
+ "str q17, [%[c_ptr0], #0x10]\n"
+ "str q18, [%[c_ptr0], #0x20]\n"
+ "str q19, [%[c_ptr0], #0x30]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x40\n" // advance the row-0 output pointer by 64 bytes; c_ptr1 is a scratch alias and is not advanced
+ "str q20, [c_ptr1]\n"
+ "str q21, [c_ptr1, #0x10]\n"
+ "str q22, [c_ptr1, #0x20]\n"
+ "str q23, [c_ptr1, #0x30]\n"
+ ".unreq a_ptr1\n" // drop the register aliases; the next case re-declares the same names on different registers
+ ".unreq c_ptr1\n"
+ ".unreq temploadreg0\n"
+ ".unreq temploadreg1\n"
+ ".unreq temploadreg2\n"
+ ".unreq temploadreg3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs) // read-write operands: the three data pointers and the two counters
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb) // read-only inputs; NOTE(review): %[width] is not referenced anywhere in this case's asm text
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory" // clobbers: every vector register is listed although this case writes only v0, v1, v4, v5 and v8-v23; x0-x5 back the .req aliases; "cc" covers subs; "memory" covers the stores to C
+ );
+ break;
+ case 3:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "c_ptr1 .req X2\n"
+ "c_ptr2 .req X3\n"
+ "temploadreg0 .req X4\n"
+ "temploadreg1 .req X5\n"
+ "temploadreg2 .req X6\n"
+ "temploadreg3 .req X7\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "cbz %[beta0], 1f\n"
+ "movi v16.4s, #0\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "movi v17.4s, #0\n"
+ "ldr q1, [a_ptr1]\n"
+ "movi v18.4s, #0\n"
+ "ldr q2, [a_ptr2]\n"
+ "movi v19.4s, #0\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "movi v20.4s, #0\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "movi v21.4s, #0\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "movi v22.4s, #0\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "movi v23.4s, #0\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "movi v24.4s, #0\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "movi v25.4s, #0\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "movi v26.4s, #0\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "movi v27.4s, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ins v14.d[1], temploadreg2\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1r {v15.4s}, [%[betaptr]]\n"
+ "ldr q16, [%[c_ptr0]]\n"
+ "ldr q17, [%[c_ptr0], #0x10]\n"
+ "ldr q18, [%[c_ptr0], #0x20]\n"
+ "ldr q19, [%[c_ptr0], #0x30]\n"
+ "mul v16.4s, v16.4s, v15.4s\n"
+ "ldr q20, [c_ptr1]\n"
+ "mul v17.4s, v17.4s, v15.4s\n"
+ "ldr q21, [c_ptr1, #0x10]\n"
+ "mul v18.4s, v18.4s, v15.4s\n"
+ "ldr q22, [c_ptr1, #0x20]\n"
+ "mul v19.4s, v19.4s, v15.4s\n"
+ "ldr q23, [c_ptr1, #0x30]\n"
+ "mul v20.4s, v20.4s, v15.4s\n"
+ "ldr q24, [c_ptr2]\n"
+ "mul v21.4s, v21.4s, v15.4s\n"
+ "ldr q25, [c_ptr2, #0x10]\n"
+ "mul v22.4s, v22.4s, v15.4s\n"
+ "ldr q26, [c_ptr2, #0x20]\n"
+ "mul v23.4s, v23.4s, v15.4s\n"
+ "ldr q27, [c_ptr2, #0x30]\n"
+ "mul v24.4s, v24.4s, v15.4s\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "mul v25.4s, v25.4s, v15.4s\n"
+ "ldr q1, [a_ptr1]\n"
+ "mul v26.4s, v26.4s, v15.4s\n"
+ "ldr q2, [a_ptr2]\n"
+ "mul v27.4s, v27.4s, v15.4s\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "ins v14.d[1], temploadreg2\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr d5, [a_ptr1]\n"
+ ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+ "ldr temploadreg1, [a_ptr1, #0x8]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr d6, [a_ptr2]\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ "ldr temploadreg2, [a_ptr2, #0x8]\n"
+ ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ "ins v4.d[1], temploadreg0\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ "ins v5.d[1], temploadreg1\n"
+ ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ "ins v6.d[1], temploadreg2\n"
+ ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
+ "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
+ "ldr d0, [%[a_ptr0], #-0x10]\n"
+ ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
+ ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
+ "ldr d1, [a_ptr1, #-0x10]\n"
+ ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+ "ldr temploadreg1, [a_ptr1, #-0x8]\n"
+ ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+ "ins v0.d[1], temploadreg0\n"
+ ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+ "ins v1.d[1], temploadreg1\n"
+ ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
+ "ins v15.d[1], temploadreg3\n"
+ "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+ "ldr d2, [a_ptr2, #-0x10]\n"
+ "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
+ ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+ "ldr temploadreg2, [a_ptr2, #-0x8]\n"
+ ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "ins v2.d[1], temploadreg2\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ "ins v9.d[1], temploadreg1\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ "ins v11.d[1], temploadreg3\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ "ins v14.d[1], temploadreg2\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ "prfm PSTL1KEEP, [c_ptr1]\n"
+ "prfm PSTL1KEEP, [c_ptr2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "cbz %[regs], 4f\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr d5, [a_ptr1]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr temploadreg1, [a_ptr1, #0x8]\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr d6, [a_ptr2]\n"
+ ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+ "ldr temploadreg2, [a_ptr2, #0x8]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ "ins v4.d[1], temploadreg0\n"
+ ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ "ins v5.d[1], temploadreg1\n"
+ ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+ "ins v6.d[1], temploadreg2\n"
+ ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+ "ldr d0, [%[a_ptr0], #0x10]\n"
+ ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x18]\n"
+ ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+ "ldr d1, [a_ptr1, #0x10]\n"
+ ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
+ "ldr temploadreg1, [a_ptr1, #0x18]\n"
+ ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+ "ldr d2, [a_ptr2, #0x10]\n"
+ ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
+ "ldr temploadreg2, [a_ptr2, #0x18]\n"
+ ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
+ "ins v0.d[1], temploadreg0\n"
+ ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
+ "ins v1.d[1], temploadreg1\n"
+ ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "ins v2.d[1], temploadreg2\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
+ ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+ ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
+ ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
+ ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
+ ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
+ ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
+ ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
+ ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
+ ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
+ "b 5f\n"
+ "4:\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr d5, [a_ptr1]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr temploadreg1, [a_ptr1, #0x8]\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr d6, [a_ptr2]\n"
+ ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+ "ldr temploadreg2, [a_ptr2, #0x8]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ "ins v4.d[1], temploadreg0\n"
+ ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ "ins v5.d[1], temploadreg1\n"
+ ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+ "ins v6.d[1], temploadreg2\n"
+ ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+ ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+ ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+ ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+ ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+ ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
+ "5:\n"
+ "str q16, [%[c_ptr0]]\n"
+ "str q17, [%[c_ptr0], #0x10]\n"
+ "str q18, [%[c_ptr0], #0x20]\n"
+ "str q19, [%[c_ptr0], #0x30]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x40\n"
+ "str q20, [c_ptr1]\n"
+ "str q21, [c_ptr1, #0x10]\n"
+ "str q22, [c_ptr1, #0x20]\n"
+ "str q23, [c_ptr1, #0x30]\n"
+ "str q24, [c_ptr2]\n"
+ "str q25, [c_ptr2, #0x10]\n"
+ "str q26, [c_ptr2, #0x20]\n"
+ "str q27, [c_ptr2, #0x30]\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq temploadreg0\n"
+ ".unreq temploadreg1\n"
+ ".unreq temploadreg2\n"
+ ".unreq temploadreg3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "cc", "memory"
+ );
+ break;
+ default:
+ case 4:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "temploadreg0 .req X6\n"
+ "temploadreg1 .req X7\n"
+ "temploadreg2 .req X8\n"
+ "temploadreg3 .req X9\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "cbz %[beta0], 1f\n"
+ "movi v16.4s, #0\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "movi v17.4s, #0\n"
+ "ldr q1, [a_ptr1]\n"
+ "movi v18.4s, #0\n"
+ "ldr q2, [a_ptr2]\n"
+ "movi v19.4s, #0\n"
+ "ldr q3, [a_ptr3]\n"
+ "movi v20.4s, #0\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "movi v21.4s, #0\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "movi v22.4s, #0\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "movi v23.4s, #0\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "movi v24.4s, #0\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "movi v25.4s, #0\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "movi v26.4s, #0\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "movi v27.4s, #0\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "movi v28.4s, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "movi v29.4s, #0\n"
+ "ins v14.d[1], temploadreg2\n"
+ "movi v30.4s, #0\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "movi v31.4s, #0\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1r {v15.4s}, [%[betaptr]]\n"
+ "ldr q16, [%[c_ptr0]]\n"
+ "ldr q17, [%[c_ptr0], #0x10]\n"
+ "ldr q18, [%[c_ptr0], #0x20]\n"
+ "ldr q19, [%[c_ptr0], #0x30]\n"
+ "mul v16.4s, v16.4s, v15.4s\n"
+ "ldr q20, [c_ptr1]\n"
+ "mul v17.4s, v17.4s, v15.4s\n"
+ "ldr q21, [c_ptr1, #0x10]\n"
+ "mul v18.4s, v18.4s, v15.4s\n"
+ "ldr q22, [c_ptr1, #0x20]\n"
+ "mul v19.4s, v19.4s, v15.4s\n"
+ "ldr q23, [c_ptr1, #0x30]\n"
+ "mul v20.4s, v20.4s, v15.4s\n"
+ "ldr q24, [c_ptr2]\n"
+ "mul v21.4s, v21.4s, v15.4s\n"
+ "ldr q25, [c_ptr2, #0x10]\n"
+ "mul v22.4s, v22.4s, v15.4s\n"
+ "ldr q26, [c_ptr2, #0x20]\n"
+ "mul v23.4s, v23.4s, v15.4s\n"
+ "ldr q27, [c_ptr2, #0x30]\n"
+ "mul v24.4s, v24.4s, v15.4s\n"
+ "ldr q28, [c_ptr3]\n"
+ "mul v25.4s, v25.4s, v15.4s\n"
+ "ldr q29, [c_ptr3, #0x10]\n"
+ "mul v26.4s, v26.4s, v15.4s\n"
+ "ldr q30, [c_ptr3, #0x20]\n"
+ "mul v27.4s, v27.4s, v15.4s\n"
+ "ldr q31, [c_ptr3, #0x30]\n"
+ "mul v28.4s, v28.4s, v15.4s\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "mul v29.4s, v29.4s, v15.4s\n"
+ "ldr q1, [a_ptr1]\n"
+ "mul v30.4s, v30.4s, v15.4s\n"
+ "ldr q2, [a_ptr2]\n"
+ "mul v31.4s, v31.4s, v15.4s\n"
+ "ldr q3, [a_ptr3]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "ins v14.d[1], temploadreg2\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ ".word 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr d5, [a_ptr1]\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr temploadreg1, [a_ptr1, #0x8]\n"
+ ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+ "ldr d6, [a_ptr2]\n"
+ ".word 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
+ "ldr temploadreg2, [a_ptr2, #0x8]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr d7, [a_ptr3]\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+ "ldr temploadreg3, [a_ptr3, #0x8]\n"
+ ".word 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ "ins v4.d[1], temploadreg0\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
+ "ins v5.d[1], temploadreg1\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
+ "ins v6.d[1], temploadreg2\n"
+ ".word 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+ "ins v7.d[1], temploadreg3\n"
+ ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+ ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ ".word 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
+ "ldr d0, [%[a_ptr0], #-0x10]\n"
+ ".word 0x4f87e11c // sdot v28.4s, v8.16b, v7.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
+ ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
+ "ins v0.d[1], temploadreg0\n"
+ ".word 0x4f87e13d // sdot v29.4s, v9.16b, v7.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
+ "ldr d1, [a_ptr1, #-0x10]\n"
+ ".word 0x4f87e15e // sdot v30.4s, v10.16b, v7.4b[0]\n"
+ "ldr temploadreg1, [a_ptr1, #-0x8]\n"
+ ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
+ "ins v1.d[1], temploadreg1\n"
+ ".word 0x4f87e17f // sdot v31.4s, v11.16b, v7.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4fa7e19c // sdot v28.4s, v12.16b, v7.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
+ ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
+ "ldr d2, [a_ptr2, #-0x10]\n"
+ ".word 0x4fa7e1bd // sdot v29.4s, v13.16b, v7.4b[1]\n"
+ "ldr temploadreg2, [a_ptr2, #-0x8]\n"
+ ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
+ "ins v2.d[1], temploadreg2\n"
+ ".word 0x4fa7e1de // sdot v30.4s, v14.16b, v7.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4fa7e1ff // sdot v31.4s, v15.16b, v7.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
+ "add a_ptr3, a_ptr3, #0x20\n"
+ ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
+ "ldr d3, [a_ptr3, #-0x10]\n"
+ ".word 0x4f87e91c // sdot v28.4s, v8.16b, v7.4b[2]\n"
+ "ldr temploadreg3, [a_ptr3, #-0x8]\n"
+ ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
+ "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+ ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
+ "ins v3.d[1], temploadreg3\n"
+ ".word 0x4f87e93d // sdot v29.4s, v9.16b, v7.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+ "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
+ ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x4f87e95e // sdot v30.4s, v10.16b, v7.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x4f87e97f // sdot v31.4s, v11.16b, v7.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x4fa7e99c // sdot v28.4s, v12.16b, v7.4b[3]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4fa7e9bd // sdot v29.4s, v13.16b, v7.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ ".word 0x4fa7e9de // sdot v30.4s, v14.16b, v7.4b[3]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4fa7e9ff // sdot v31.4s, v15.16b, v7.4b[3]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ "prfm PSTL1KEEP, [c_ptr1]\n"
+ "prfm PSTL1KEEP, [c_ptr2]\n"
+ "prfm PSTL1KEEP, [c_ptr3]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "cbz %[regs], 4f\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr d5, [a_ptr1]\n"
+ ".word 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
+ "ldr temploadreg1, [a_ptr1, #0x8]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr d6, [a_ptr2]\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr temploadreg2, [a_ptr2, #0x8]\n"
+ ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+ "ldr d7, [a_ptr3]\n"
+ ".word 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
+ "ldr temploadreg3, [a_ptr3, #0x8]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ "ins v4.d[1], temploadreg0\n"
+ ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ "ins v5.d[1], temploadreg1\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
+ "ins v6.d[1], temploadreg2\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
+ "ins v7.d[1], temploadreg3\n"
+ ".word 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ ".word 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ ".word 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x18]\n"
+ ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+ ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr d0, [%[a_ptr0], #0x10]\n"
+ ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+ "ldr d1, [a_ptr1, #0x10]\n"
+ ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
+ "ldr temploadreg1, [a_ptr1, #0x18]\n"
+ ".word 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr d2, [a_ptr2, #0x10]\n"
+ ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
+ "ldr temploadreg2, [a_ptr2, #0x18]\n"
+ ".word 0x4f87e11c // sdot v28.4s, v8.16b, v7.4b[0]\n"
+ "ldr d3, [a_ptr3, #0x10]\n"
+ ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
+ "ldr temploadreg3, [a_ptr3, #0x18]\n"
+ ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x4f87e13d // sdot v29.4s, v9.16b, v7.4b[0]\n"
+ "ins v0.d[1], temploadreg0\n"
+ ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
+ "ins v1.d[1], temploadreg1\n"
+ ".word 0x4f87e15e // sdot v30.4s, v10.16b, v7.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
+ "ins v2.d[1], temploadreg2\n"
+ ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x4f87e17f // sdot v31.4s, v11.16b, v7.4b[0]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+ "ins v3.d[1], temploadreg3\n"
+ ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4fa7e19c // sdot v28.4s, v12.16b, v7.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x4fa7e1bd // sdot v29.4s, v13.16b, v7.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4fa7e1de // sdot v30.4s, v14.16b, v7.4b[1]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x4fa7e1ff // sdot v31.4s, v15.16b, v7.4b[1]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
+ ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4f87e91c // sdot v28.4s, v8.16b, v7.4b[2]\n"
+ ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+ ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
+ ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
+ ".word 0x4f87e93d // sdot v29.4s, v9.16b, v7.4b[2]\n"
+ ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+ ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
+ ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
+ ".word 0x4f87e95e // sdot v30.4s, v10.16b, v7.4b[2]\n"
+ ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+ ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
+ ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
+ ".word 0x4f87e97f // sdot v31.4s, v11.16b, v7.4b[2]\n"
+ ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+ ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
+ ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
+ ".word 0x4fa7e99c // sdot v28.4s, v12.16b, v7.4b[3]\n"
+ ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
+ ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
+ ".word 0x4fa7e9bd // sdot v29.4s, v13.16b, v7.4b[3]\n"
+ ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
+ ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
+ ".word 0x4fa7e9de // sdot v30.4s, v14.16b, v7.4b[3]\n"
+ ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
+ ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
+ ".word 0x4fa7e9ff // sdot v31.4s, v15.16b, v7.4b[3]\n"
+ "b 5f\n"
+ "4:\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr d5, [a_ptr1]\n"
+ ".word 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
+ "ldr temploadreg1, [a_ptr1, #0x8]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr d6, [a_ptr2]\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr temploadreg2, [a_ptr2, #0x8]\n"
+ ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+ "ldr d7, [a_ptr3]\n"
+ ".word 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
+ "ldr temploadreg3, [a_ptr3, #0x8]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ "ins v4.d[1], temploadreg0\n"
+ ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ "ins v5.d[1], temploadreg1\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
+ "ins v6.d[1], temploadreg2\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
+ "ins v7.d[1], temploadreg3\n"
+ ".word 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+ ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
+ ".word 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+ ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
+ ".word 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+ ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
+ ".word 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+ ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
+ ".word 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+ ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
+ ".word 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+ ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
+ ".word 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+ ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
+ ".word 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n"
+ "5:\n"
+ "str q16, [%[c_ptr0]]\n"
+ "str q17, [%[c_ptr0], #0x10]\n"
+ "str q18, [%[c_ptr0], #0x20]\n"
+ "str q19, [%[c_ptr0], #0x30]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x40\n"
+ "str q20, [c_ptr1]\n"
+ "str q21, [c_ptr1, #0x10]\n"
+ "str q22, [c_ptr1, #0x20]\n"
+ "str q23, [c_ptr1, #0x30]\n"
+ "str q24, [c_ptr2]\n"
+ "str q25, [c_ptr2, #0x10]\n"
+ "str q26, [c_ptr2, #0x20]\n"
+ "str q27, [c_ptr2, #0x30]\n"
+ "str q28, [c_ptr3]\n"
+ "str q29, [c_ptr3, #0x10]\n"
+ "str q30, [c_ptr3, #0x20]\n"
+ "str q31, [c_ptr3, #0x30]\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ ".unreq temploadreg0\n"
+ ".unreq temploadreg1\n"
+ ".unreq temploadreg2\n"
+ ".unreq temploadreg3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "cc", "memory"
+ );
+ break;
+ }
+ }
+ }
+}
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/generic.cpp
new file mode 100644
index 0000000000..01791391c8
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/generic.cpp
@@ -0,0 +1,1605 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <algorithm>
+
+#include <cstdint>
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void a64_hybrid_s8s32_dot_16x4(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int32_t beta, int M, int N, int K) {
+ const long beta0 = (beta == 0);
+ const int K_stride = ((K + 3) / 4) * 4;
+ const long loops_count = ((K + 16) / 32) - 1;
+ K -= loops_count * 32;
+ const long regs_count = (K / 16) - 1;
+
+ for (int y=0; y<M; y+=4) {
+ const int8_t * const a_ptr0_base = A + (y * lda);
+ const unsigned long ldab = lda * sizeof(int8_t);
+
+ int32_t *c_ptr0 = C + (y * ldc);
+ const unsigned long ldcb = ldc * sizeof(int32_t);
+
+ for (int x0=0; x0<N; x0+=16ul) {
+ const long width = std::min((unsigned long)N-x0, 16ul);
+ const int32_t *betaptr = &beta;
+ long loops = loops_count;
+ long regs = regs_count;
+ const int8_t *a_ptr0 = a_ptr0_base;
+ const int8_t *b_ptr0 = B + (K_stride * x0);
+
+ switch(M-y) {
+ case 1:
+ __asm __volatile (
+ "cbz %[beta0], 1f\n"
+ "movi v16.4s, #0\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "movi v17.4s, #0\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "movi v18.4s, #0\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "movi v19.4s, #0\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1r {v15.4s}, [%[betaptr]]\n"
+ "ldr q16, [%[c_ptr0]]\n"
+ "ldr q17, [%[c_ptr0], #0x10]\n"
+ "ldr q18, [%[c_ptr0], #0x20]\n"
+ "ldr q19, [%[c_ptr0], #0x30]\n"
+ "mul v16.4s, v16.4s, v15.4s\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "mul v17.4s, v17.4s, v15.4s\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "mul v18.4s, v18.4s, v15.4s\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "mul v19.4s, v19.4s, v15.4s\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+ "ldr q0, [%[a_ptr0], #-0x10]\n"
+ ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+ "cbz %[regs], 4f\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+ "ldr q0, [%[a_ptr0], #0x10]\n"
+ ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+ ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+ ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+ ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+ ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+ "b 5f\n"
+ "4:\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ "5:\n"
+ "str q16, [%[c_ptr0]]\n"
+ "str q17, [%[c_ptr0], #0x10]\n"
+ "str q18, [%[c_ptr0], #0x20]\n"
+ "str q19, [%[c_ptr0], #0x30]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x40\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
+ );
+ break;
+ case 2:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "c_ptr1 .req X1\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "cbz %[beta0], 1f\n"
+ "movi v16.4s, #0\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "movi v17.4s, #0\n"
+ "ldr q1, [a_ptr1]\n"
+ "movi v18.4s, #0\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "movi v19.4s, #0\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "movi v20.4s, #0\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "movi v21.4s, #0\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "movi v22.4s, #0\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "movi v23.4s, #0\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1r {v15.4s}, [%[betaptr]]\n"
+ "ldr q16, [%[c_ptr0]]\n"
+ "ldr q17, [%[c_ptr0], #0x10]\n"
+ "ldr q18, [%[c_ptr0], #0x20]\n"
+ "ldr q19, [%[c_ptr0], #0x30]\n"
+ "mul v16.4s, v16.4s, v15.4s\n"
+ "ldr q20, [c_ptr1]\n"
+ "mul v17.4s, v17.4s, v15.4s\n"
+ "ldr q21, [c_ptr1, #0x10]\n"
+ "mul v18.4s, v18.4s, v15.4s\n"
+ "ldr q22, [c_ptr1, #0x20]\n"
+ "mul v19.4s, v19.4s, v15.4s\n"
+ "ldr q23, [c_ptr1, #0x30]\n"
+ "mul v20.4s, v20.4s, v15.4s\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "mul v21.4s, v21.4s, v15.4s\n"
+ "ldr q1, [a_ptr1]\n"
+ "mul v22.4s, v22.4s, v15.4s\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "mul v23.4s, v23.4s, v15.4s\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr q5, [a_ptr1]\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+ ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+ ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr q0, [%[a_ptr0], #-0x10]\n"
+ ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr q1, [a_ptr1, #-0x10]\n"
+ ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+ ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+ ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+ ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+ ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+ ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+ ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+ ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+ ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+ ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+ ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+ ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+ "prfm PSTL1KEEP, [c_ptr1]\n"
+ "cbz %[regs], 4f\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr q5, [a_ptr1]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr q0, [%[a_ptr0], #0x10]\n"
+ ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr q1, [a_ptr1, #0x10]\n"
+ ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+ ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+ ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+ ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+ ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+ ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+ ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+ ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+ ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
+ ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+ ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
+ ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+ ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
+ ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+ ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
+ ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+ ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
+ ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
+ ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
+ ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
+ "b 5f\n"
+ "4:\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr q5, [a_ptr1]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+ "5:\n"
+ "str q16, [%[c_ptr0]]\n"
+ "str q17, [%[c_ptr0], #0x10]\n"
+ "str q18, [%[c_ptr0], #0x20]\n"
+ "str q19, [%[c_ptr0], #0x30]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x40\n"
+ "str q20, [c_ptr1]\n"
+ "str q21, [c_ptr1, #0x10]\n"
+ "str q22, [c_ptr1, #0x20]\n"
+ "str q23, [c_ptr1, #0x30]\n"
+ ".unreq a_ptr1\n"
+ ".unreq c_ptr1\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "cc", "memory"
+ );
+ break;
+ case 3: // Three-row variant: accumulators v16-v19, v20-v23 and v24-v27 are stored to c_ptr0, c_ptr1 and c_ptr2 respectively.
+ __asm __volatile (
+ "a_ptr1 .req X0\n" // Assembler aliases for the scratch registers; x0-x3 appear in the clobber list below.
+ "a_ptr2 .req X1\n"
+ "c_ptr1 .req X2\n"
+ "c_ptr2 .req X3\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n" // Pointers for rows 1 and 2: previous row pointer plus %[lda] / %[ldc] (bound to ldab / ldcb).
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "cbz %[beta0], 1f\n" // beta0 == 0: go to label 1, which loads C and multiplies it by the value at betaptr; otherwise fall through and start from zero.
+ "movi v16.4s, #0\n" // Fall-through path: zero the 12 accumulators while preloading A (q0-q2) and B (q8-q14).
+ "ldr q0, [%[a_ptr0]]\n"
+ "movi v17.4s, #0\n"
+ "ldr q1, [a_ptr1]\n"
+ "movi v18.4s, #0\n"
+ "ldr q2, [a_ptr2]\n"
+ "movi v19.4s, #0\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "movi v20.4s, #0\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "movi v21.4s, #0\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "movi v22.4s, #0\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "movi v23.4s, #0\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "movi v24.4s, #0\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "movi v25.4s, #0\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "movi v26.4s, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n" // Step each A pointer past the 16 bytes just loaded.
+ "movi v27.4s, #0\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n" // q15 (offset 0x70 of this group) is loaded later via [b_ptr0, #-0x10].
+ "cbz %[loops], 2f\n" // No main-loop passes requested: go straight to the tail at label 2.
+ "b 3f\n"
+ "1:\n" // Label 1: start the accumulators from the existing C values scaled by beta.
+ "ld1r {v15.4s}, [%[betaptr]]\n" // Broadcast the 32-bit value at betaptr into all four lanes of v15.
+ "ldr q16, [%[c_ptr0]]\n"
+ "ldr q17, [%[c_ptr0], #0x10]\n"
+ "ldr q18, [%[c_ptr0], #0x20]\n"
+ "ldr q19, [%[c_ptr0], #0x30]\n"
+ "mul v16.4s, v16.4s, v15.4s\n" // Integer multiply of each loaded C vector by the broadcast beta.
+ "ldr q20, [c_ptr1]\n"
+ "mul v17.4s, v17.4s, v15.4s\n"
+ "ldr q21, [c_ptr1, #0x10]\n"
+ "mul v18.4s, v18.4s, v15.4s\n"
+ "ldr q22, [c_ptr1, #0x20]\n"
+ "mul v19.4s, v19.4s, v15.4s\n"
+ "ldr q23, [c_ptr1, #0x30]\n"
+ "mul v20.4s, v20.4s, v15.4s\n"
+ "ldr q24, [c_ptr2]\n"
+ "mul v21.4s, v21.4s, v15.4s\n"
+ "ldr q25, [c_ptr2, #0x10]\n"
+ "mul v22.4s, v22.4s, v15.4s\n"
+ "ldr q26, [c_ptr2, #0x20]\n"
+ "mul v23.4s, v23.4s, v15.4s\n"
+ "ldr q27, [c_ptr2, #0x30]\n"
+ "mul v24.4s, v24.4s, v15.4s\n"
+ "ldr q0, [%[a_ptr0]]\n" // Same A/B preload as the zero-initialised path above.
+ "mul v25.4s, v25.4s, v15.4s\n"
+ "ldr q1, [a_ptr1]\n"
+ "mul v26.4s, v26.4s, v15.4s\n"
+ "ldr q2, [a_ptr2]\n"
+ "mul v27.4s, v27.4s, v15.4s\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "3:\n" // Label 3: main loop. Each pass uses q0-q2 (already loaded) then q4-q6 (loaded below), advancing each A pointer by 0x20 and b_ptr0 by 0x200.
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" // sdot is emitted as a raw .word; presumably so the file builds with assemblers lacking dot-product support - TODO confirm.
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr q4, [%[a_ptr0]]\n" // q4-q6: the next 16 bytes of each A row, consumed in the second half of this pass.
+ ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr q5, [a_ptr1]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr q6, [a_ptr2]\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ "subs %[loops], %[loops], #0x1\n" // Sets the flags tested by b.ne at the bottom of the loop; no instruction in between writes them.
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+ ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
+ ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+ "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+ ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
+ ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+ ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+ ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+ ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n" // First of two 0x100 advances of b_ptr0 per pass; following B loads use negative offsets.
+ ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+ ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+ ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+ ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+ ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+ ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+ ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+ ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr q0, [%[a_ptr0], #-0x10]\n" // Reload q0-q2 for the next pass; the A pointers were already advanced by 0x20 above.
+ ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+ "ldr q1, [a_ptr1, #-0x10]\n"
+ ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n" // Second half of the pass: same pattern using q4-q6.
+ "ldr q2, [a_ptr2, #-0x10]\n"
+ ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
+ ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+ ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
+ ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+ ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
+ ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+ ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
+ ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+ ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
+ ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+ ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
+ ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+ ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
+ ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+ ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
+ ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n" // Second 0x100 advance of b_ptr0 in this pass.
+ ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
+ ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+ ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
+ ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+ ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
+ ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+ ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
+ ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+ ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
+ ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
+ ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
+ ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
+ ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
+ "b.ne 3b\n" // Repeat while the subs above left %[loops] non-zero.
+ "2:\n" // Label 2: tail. q8-q14 are already loaded on every path reaching here; q15 is loaded now.
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0]]\n" // Prefetch-for-store hints on the three output rows.
+ "prfm PSTL1KEEP, [c_ptr1]\n"
+ "prfm PSTL1KEEP, [c_ptr2]\n"
+ "cbz %[regs], 4f\n" // regs == 0: only the q0-q2 block remains (label 4); otherwise process q0-q2 and then q4-q6.
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr q5, [a_ptr1]\n"
+ ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr q6, [a_ptr2]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+ ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+ ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+ ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+ ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+ ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+ ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+ ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+ ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+ ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+ ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+ ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr q0, [%[a_ptr0], #0x10]\n" // NOTE(review): q0/q1/q2 reloaded here are not read again before the stores at label 5 - confirm these loads cannot run past the end of the A rows.
+ ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+ "ldr q1, [a_ptr1, #0x10]\n"
+ ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr q2, [a_ptr2, #0x10]\n"
+ ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
+ ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+ ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
+ ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+ ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
+ ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+ ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
+ ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+ ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
+ ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+ ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
+ ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+ ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
+ ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+ ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
+ ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n" // From here on there are no further loads: the remaining sdots drain q8-q15.
+ ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
+ ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
+ ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+ ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
+ ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
+ ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+ ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
+ ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
+ ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+ ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
+ ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
+ ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+ ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
+ ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
+ ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
+ ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
+ ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
+ ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
+ ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
+ ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
+ "b 5f\n"
+ "4:\n" // Label 4: final block using q0-q2 only.
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q4, [%[a_ptr0]]\n" // NOTE(review): q4/q5/q6 loaded on this path are never consumed by the sdot instructions below - confirm the loads stay in bounds.
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr q5, [a_ptr1]\n"
+ ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr q6, [a_ptr2]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+ ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+ ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+ ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+ ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+ ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+ ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+ ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+ ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+ ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+ ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+ ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+ ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
+ "5:\n" // Label 5: write back three rows of 64 bytes each and advance c_ptr0 by 0x40.
+ "str q16, [%[c_ptr0]]\n"
+ "str q17, [%[c_ptr0], #0x10]\n"
+ "str q18, [%[c_ptr0], #0x20]\n"
+ "str q19, [%[c_ptr0], #0x30]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x40\n"
+ "str q20, [c_ptr1]\n"
+ "str q21, [c_ptr1, #0x10]\n"
+ "str q22, [c_ptr1, #0x20]\n"
+ "str q23, [c_ptr1, #0x30]\n"
+ "str q24, [c_ptr2]\n"
+ "str q25, [c_ptr2, #0x10]\n"
+ "str q26, [c_ptr2, #0x20]\n"
+ "str q27, [c_ptr2, #0x30]\n"
+ ".unreq a_ptr1\n" // Drop the .req aliases at the end of the block.
+ ".unreq a_ptr2\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs) // Operands declared read-write ("+r").
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb) // Read-only inputs; %[width] is not referenced by this case's template.
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory" // Clobbers: all vector registers, the four aliased scratch registers, flags and memory.
+ );
+ break;
+ default:
+ case 4:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "cbz %[beta0], 1f\n"
+ "movi v16.4s, #0\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "movi v17.4s, #0\n"
+ "ldr q1, [a_ptr1]\n"
+ "movi v18.4s, #0\n"
+ "ldr q2, [a_ptr2]\n"
+ "movi v19.4s, #0\n"
+ "ldr q3, [a_ptr3]\n"
+ "movi v20.4s, #0\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "movi v21.4s, #0\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "movi v22.4s, #0\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "movi v23.4s, #0\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "movi v24.4s, #0\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "movi v25.4s, #0\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "movi v26.4s, #0\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "movi v27.4s, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "movi v28.4s, #0\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "movi v29.4s, #0\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "movi v30.4s, #0\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "movi v31.4s, #0\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1r {v15.4s}, [%[betaptr]]\n"
+ "ldr q16, [%[c_ptr0]]\n"
+ "ldr q17, [%[c_ptr0], #0x10]\n"
+ "ldr q18, [%[c_ptr0], #0x20]\n"
+ "ldr q19, [%[c_ptr0], #0x30]\n"
+ "mul v16.4s, v16.4s, v15.4s\n"
+ "ldr q20, [c_ptr1]\n"
+ "mul v17.4s, v17.4s, v15.4s\n"
+ "ldr q21, [c_ptr1, #0x10]\n"
+ "mul v18.4s, v18.4s, v15.4s\n"
+ "ldr q22, [c_ptr1, #0x20]\n"
+ "mul v19.4s, v19.4s, v15.4s\n"
+ "ldr q23, [c_ptr1, #0x30]\n"
+ "mul v20.4s, v20.4s, v15.4s\n"
+ "ldr q24, [c_ptr2]\n"
+ "mul v21.4s, v21.4s, v15.4s\n"
+ "ldr q25, [c_ptr2, #0x10]\n"
+ "mul v22.4s, v22.4s, v15.4s\n"
+ "ldr q26, [c_ptr2, #0x20]\n"
+ "mul v23.4s, v23.4s, v15.4s\n"
+ "ldr q27, [c_ptr2, #0x30]\n"
+ "mul v24.4s, v24.4s, v15.4s\n"
+ "ldr q28, [c_ptr3]\n"
+ "mul v25.4s, v25.4s, v15.4s\n"
+ "ldr q29, [c_ptr3, #0x10]\n"
+ "mul v26.4s, v26.4s, v15.4s\n"
+ "ldr q30, [c_ptr3, #0x20]\n"
+ "mul v27.4s, v27.4s, v15.4s\n"
+ "ldr q31, [c_ptr3, #0x30]\n"
+ "mul v28.4s, v28.4s, v15.4s\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "mul v29.4s, v29.4s, v15.4s\n"
+ "ldr q1, [a_ptr1]\n"
+ "mul v30.4s, v30.4s, v15.4s\n"
+ "ldr q2, [a_ptr2]\n"
+ "mul v31.4s, v31.4s, v15.4s\n"
+ "ldr q3, [a_ptr3]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr q5, [a_ptr1]\n"
+ ".word 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
+ "ldr q6, [a_ptr2]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr q7, [a_ptr3]\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ ".word 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ ".word 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ "add a_ptr3, a_ptr3, #0x20\n"
+ ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+ "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+ ".word 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
+ ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+ "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
+ ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
+ ".word 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+ ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
+ ".word 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+ ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
+ ".word 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+ ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
+ ".word 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+ ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
+ ".word 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+ ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
+ ".word 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+ ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
+ ".word 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+ ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
+ ".word 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+ ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
+ ".word 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+ ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
+ ".word 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+ ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
+ ".word 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr q0, [%[a_ptr0], #-0x10]\n"
+ ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+ "ldr q1, [a_ptr1, #-0x10]\n"
+ ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
+ "ldr q2, [a_ptr2, #-0x10]\n"
+ ".word 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr q3, [a_ptr3, #-0x10]\n"
+ ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
+ ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
+ ".word 0x4f87e11c // sdot v28.4s, v8.16b, v7.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+ ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
+ ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
+ ".word 0x4f87e13d // sdot v29.4s, v9.16b, v7.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+ ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
+ ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
+ ".word 0x4f87e15e // sdot v30.4s, v10.16b, v7.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+ ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
+ ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
+ ".word 0x4f87e17f // sdot v31.4s, v11.16b, v7.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+ ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
+ ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
+ ".word 0x4fa7e19c // sdot v28.4s, v12.16b, v7.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+ ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
+ ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
+ ".word 0x4fa7e1bd // sdot v29.4s, v13.16b, v7.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+ ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
+ ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
+ ".word 0x4fa7e1de // sdot v30.4s, v14.16b, v7.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+ ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
+ ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
+ ".word 0x4fa7e1ff // sdot v31.4s, v15.16b, v7.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
+ ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
+ ".word 0x4f87e91c // sdot v28.4s, v8.16b, v7.4b[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+ ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
+ ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
+ ".word 0x4f87e93d // sdot v29.4s, v9.16b, v7.4b[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+ ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
+ ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
+ ".word 0x4f87e95e // sdot v30.4s, v10.16b, v7.4b[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+ ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
+ ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
+ ".word 0x4f87e97f // sdot v31.4s, v11.16b, v7.4b[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+ ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
+ ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
+ ".word 0x4fa7e99c // sdot v28.4s, v12.16b, v7.4b[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
+ ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
+ ".word 0x4fa7e9bd // sdot v29.4s, v13.16b, v7.4b[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
+ ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
+ ".word 0x4fa7e9de // sdot v30.4s, v14.16b, v7.4b[3]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
+ ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
+ ".word 0x4fa7e9ff // sdot v31.4s, v15.16b, v7.4b[3]\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+ "prfm PSTL1KEEP, [c_ptr1]\n"
+ "prfm PSTL1KEEP, [c_ptr2]\n"
+ "prfm PSTL1KEEP, [c_ptr3]\n"
+ "cbz %[regs], 4f\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr q5, [a_ptr1]\n"
+ ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr q6, [a_ptr2]\n"
+ ".word 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
+ "ldr q7, [a_ptr3]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+ ".word 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+ ".word 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+ ".word 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+ ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
+ ".word 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+ ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
+ ".word 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+ ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
+ ".word 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+ ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
+ ".word 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+ ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
+ ".word 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+ ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
+ ".word 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+ ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
+ ".word 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+ ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
+ ".word 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+ ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
+ ".word 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+ ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
+ ".word 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+ ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
+ ".word 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr q0, [%[a_ptr0], #0x10]\n"
+ ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+ "ldr q1, [a_ptr1, #0x10]\n"
+ ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
+ "ldr q2, [a_ptr2, #0x10]\n"
+ ".word 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr q3, [a_ptr3, #0x10]\n"
+ ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
+ ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
+ ".word 0x4f87e11c // sdot v28.4s, v8.16b, v7.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+ ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
+ ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
+ ".word 0x4f87e13d // sdot v29.4s, v9.16b, v7.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+ ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
+ ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
+ ".word 0x4f87e15e // sdot v30.4s, v10.16b, v7.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+ ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
+ ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
+ ".word 0x4f87e17f // sdot v31.4s, v11.16b, v7.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+ ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
+ ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
+ ".word 0x4fa7e19c // sdot v28.4s, v12.16b, v7.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+ ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
+ ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
+ ".word 0x4fa7e1bd // sdot v29.4s, v13.16b, v7.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+ ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
+ ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
+ ".word 0x4fa7e1de // sdot v30.4s, v14.16b, v7.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+ ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
+ ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
+ ".word 0x4fa7e1ff // sdot v31.4s, v15.16b, v7.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+ ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
+ ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
+ ".word 0x4f87e91c // sdot v28.4s, v8.16b, v7.4b[2]\n"
+ ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+ ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
+ ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
+ ".word 0x4f87e93d // sdot v29.4s, v9.16b, v7.4b[2]\n"
+ ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+ ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
+ ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
+ ".word 0x4f87e95e // sdot v30.4s, v10.16b, v7.4b[2]\n"
+ ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+ ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
+ ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
+ ".word 0x4f87e97f // sdot v31.4s, v11.16b, v7.4b[2]\n"
+ ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+ ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
+ ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
+ ".word 0x4fa7e99c // sdot v28.4s, v12.16b, v7.4b[3]\n"
+ ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
+ ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
+ ".word 0x4fa7e9bd // sdot v29.4s, v13.16b, v7.4b[3]\n"
+ ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
+ ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
+ ".word 0x4fa7e9de // sdot v30.4s, v14.16b, v7.4b[3]\n"
+ ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+ ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
+ ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
+ ".word 0x4fa7e9ff // sdot v31.4s, v15.16b, v7.4b[3]\n"
+ "b 5f\n"
+ "4:\n"
+ ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr q5, [a_ptr1]\n"
+ ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr q6, [a_ptr2]\n"
+ ".word 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
+ "ldr q7, [a_ptr3]\n"
+ ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+ ".word 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+ ".word 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+ ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+ ".word 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+ ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+ ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
+ ".word 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+ ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
+ ".word 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+ ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
+ ".word 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+ ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+ ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
+ ".word 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+ ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+ ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
+ ".word 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n"
+ ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+ ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
+ ".word 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n"
+ ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+ ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
+ ".word 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n"
+ ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+ ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
+ ".word 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n"
+ ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+ ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
+ ".word 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n"
+ ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+ ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
+ ".word 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n"
+ ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+ ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
+ ".word 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n"
+ ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+ ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+ ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
+ ".word 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n"
+ "5:\n"
+ "str q16, [%[c_ptr0]]\n"
+ "str q17, [%[c_ptr0], #0x10]\n"
+ "str q18, [%[c_ptr0], #0x20]\n"
+ "str q19, [%[c_ptr0], #0x30]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x40\n"
+ "str q20, [c_ptr1]\n"
+ "str q21, [c_ptr1, #0x10]\n"
+ "str q22, [c_ptr1, #0x20]\n"
+ "str q23, [c_ptr1, #0x30]\n"
+ "str q24, [c_ptr2]\n"
+ "str q25, [c_ptr2, #0x10]\n"
+ "str q26, [c_ptr2, #0x20]\n"
+ "str q27, [c_ptr2, #0x30]\n"
+ "str q28, [c_ptr3]\n"
+ "str q29, [c_ptr3, #0x10]\n"
+ "str q30, [c_ptr3, #0x20]\n"
+ "str q31, [c_ptr3, #0x30]\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
+ );
+ break;
+ }
+ }
+ }
+}
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4.hpp
new file mode 100644
index 0000000000..7fb9b5c131
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4.hpp
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+#include <cstdint>
+#include "../std_transforms_fixed.hpp"
+
+namespace arm_gemm
+{
+
+// Actual kernel implementations. The _a55 definition names the arguments (A, lda, B, C, ldc, beta, M, N, K); the generic variant presumably uses the same order - confirm against its definition.
+void a64_hybrid_u8u32_dot_16x4(const uint8_t *, int, const uint8_t *, uint32_t *, int, uint32_t, int, int, int); // generic kernel: the default stored in hybrid_u8u32_dot_16x4::kernel
+void a64_hybrid_u8u32_dot_16x4_a55(const uint8_t *, int, const uint8_t *, uint32_t *, int, uint32_t, int, int, int); // variant selected by the hybrid_u8u32_dot_16x4 constructor when the CPU model is A55r1
+
+class hybrid_u8u32_dot_16x4 // Strategy descriptor for the u8 -> u32 hybrid dot kernel: element types, blocking parameters, transforms and the kernel function pointer to call.
+{
+public:
+ typedef uint8_t operand_type; // element type of the A and B inputs taken by the kernel
+ typedef uint32_t result_type; // element type of the C output written by the kernel
+
+ typedef void (*kern_type)(const uint8_t *, int, const uint8_t *, uint32_t *, int, uint32_t, int, int, int); // same signature as both kernel implementations declared above this class
+
+ /* Kernel blocking parameters. The same literals 4, 16, 4 are repeated in the StdTransformsFixed template arguments below; presumably they must be kept in sync by hand - confirm. */
+ static unsigned int out_height() // the a55 kernel advances over M in steps of 4 rows
+ {
+ return 4;
+ }
+
+ static unsigned int out_width() // the a55 kernel advances over N in steps of 16 columns
+ {
+ return 16;
+ }
+
+ static unsigned int k_unroll() // the a55 kernel rounds K up to a multiple of 4 when computing K_stride
+ {
+ return 4;
+ }
+
+ StdTransformsFixed<operand_type, result_type, 4, 16, 4> transforms = {}; // NOTE(review): StdTransformsFixed comes from ../std_transforms_fixed.hpp, which is not visible here; the meaning of its numeric arguments is unverified
+
+ // Default to the generic kernel; the constructor may replace it with a CPU-specific variant
+ kern_type kernel=a64_hybrid_u8u32_dot_16x4;
+
+ hybrid_u8u32_dot_16x4(const CPUInfo *ci) // ci is dereferenced without a null check, so callers must pass a valid pointer
+ {
+ if (ci->get_cpu_model() == CPUModel::A55r1) { // only A55r1 gets the tuned kernel; every other model keeps the generic default
+ kernel = a64_hybrid_u8u32_dot_16x4_a55;
+ }
+ }
+};
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/a55.cpp
new file mode 100644
index 0000000000..230ecdce2d
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/a55.cpp
@@ -0,0 +1,2271 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <algorithm>
+
+#include <cstdint>
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void a64_hybrid_u8u32_dot_16x4_a55(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, uint32_t beta, int M, int N, int K) {
+ const long beta0 = (beta == 0u);
+ const int K_stride = ((K + 3) / 4) * 4;
+ const long loops_count = ((K + 16) / 32) - 1;
+ K -= loops_count * 32;
+ const long regs_count = (K / 16) - 1;
+
+ for (int y=0; y<M; y+=4) {
+ const uint8_t * const a_ptr0_base = A + (y * lda);
+ const unsigned long ldab = lda * sizeof(uint8_t);
+
+ uint32_t *c_ptr0 = C + (y * ldc);
+ const unsigned long ldcb = ldc * sizeof(uint32_t);
+
+ for (int x0=0; x0<N; x0+=16ul) {
+ const long width = std::min((unsigned long)N-x0, 16ul);
+ const uint32_t *betaptr = &beta;
+ long loops = loops_count;
+ long regs = regs_count;
+ const uint8_t *a_ptr0 = a_ptr0_base;
+ const uint8_t *b_ptr0 = B + (K_stride * x0);
+
+ switch(M-y) {
+ case 1:
+ __asm __volatile (
+ "temploadreg0 .req X0\n"
+ "temploadreg1 .req X1\n"
+ "temploadreg2 .req X2\n"
+ "temploadreg3 .req X3\n"
+ "cbz %[beta0], 1f\n"
+ "movi v16.4s, #0\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "movi v17.4s, #0\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "movi v18.4s, #0\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "movi v19.4s, #0\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1r {v15.4s}, [%[betaptr]]\n"
+ "ldr q16, [%[c_ptr0]]\n"
+ "ldr q17, [%[c_ptr0], #0x10]\n"
+ "ldr q18, [%[c_ptr0], #0x20]\n"
+ "ldr q19, [%[c_ptr0], #0x30]\n"
+ "mul v16.4s, v16.4s, v15.4s\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "mul v17.4s, v17.4s, v15.4s\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "mul v18.4s, v18.4s, v15.4s\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "mul v19.4s, v19.4s, v15.4s\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ins v4.d[1], temploadreg0\n"
+ "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ "ldr d0, [%[a_ptr0], #-0x10]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+ "ins v0.d[1], temploadreg0\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ "ins v9.d[1], temploadreg1\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ "ins v11.d[1], temploadreg3\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "ins v14.d[1], temploadreg2\n"
+ "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "cbz %[regs], 4f\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ "ins v4.d[1], temploadreg0\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ "ldr d0, [%[a_ptr0], #0x10]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x18]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+ "ins v0.d[1], temploadreg0\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ "ins v13.d[1], temploadreg1\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+ ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+ ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+ ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+ "b 5f\n"
+ "4:\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ "ins v4.d[1], temploadreg0\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ "5:\n"
+ "str q16, [%[c_ptr0]]\n"
+ "str q17, [%[c_ptr0], #0x10]\n"
+ "str q18, [%[c_ptr0], #0x20]\n"
+ "str q19, [%[c_ptr0], #0x30]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x40\n"
+ ".unreq temploadreg0\n"
+ ".unreq temploadreg1\n"
+ ".unreq temploadreg2\n"
+ ".unreq temploadreg3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory"
+ );
+ break;
+ case 2:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "c_ptr1 .req X1\n"
+ "temploadreg0 .req X2\n"
+ "temploadreg1 .req X3\n"
+ "temploadreg2 .req X4\n"
+ "temploadreg3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "cbz %[beta0], 1f\n"
+ "movi v16.4s, #0\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "movi v17.4s, #0\n"
+ "ldr q1, [a_ptr1]\n"
+ "movi v18.4s, #0\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "movi v19.4s, #0\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "movi v20.4s, #0\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "movi v21.4s, #0\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "movi v22.4s, #0\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "movi v23.4s, #0\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1r {v15.4s}, [%[betaptr]]\n"
+ "ldr q16, [%[c_ptr0]]\n"
+ "ldr q17, [%[c_ptr0], #0x10]\n"
+ "ldr q18, [%[c_ptr0], #0x20]\n"
+ "ldr q19, [%[c_ptr0], #0x30]\n"
+ "mul v16.4s, v16.4s, v15.4s\n"
+ "ldr q20, [c_ptr1]\n"
+ "mul v17.4s, v17.4s, v15.4s\n"
+ "ldr q21, [c_ptr1, #0x10]\n"
+ "mul v18.4s, v18.4s, v15.4s\n"
+ "ldr q22, [c_ptr1, #0x20]\n"
+ "mul v19.4s, v19.4s, v15.4s\n"
+ "ldr q23, [c_ptr1, #0x30]\n"
+ "mul v20.4s, v20.4s, v15.4s\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "mul v21.4s, v21.4s, v15.4s\n"
+ "ldr q1, [a_ptr1]\n"
+ "mul v22.4s, v22.4s, v15.4s\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "mul v23.4s, v23.4s, v15.4s\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ "ldr d5, [a_ptr1]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ "ldr temploadreg1, [a_ptr1, #0x8]\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ "ins v4.d[1], temploadreg0\n"
+ ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+ "ins v5.d[1], temploadreg1\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
+ "ldr d0, [%[a_ptr0], #-0x10]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
+ ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
+ "ldr d1, [a_ptr1, #-0x10]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+ "ldr temploadreg1, [a_ptr1, #-0x8]\n"
+ ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ "ins v0.d[1], temploadreg0\n"
+ ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ "ins v1.d[1], temploadreg1\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ "ins v9.d[1], temploadreg1\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ "ins v11.d[1], temploadreg3\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "ins v14.d[1], temploadreg2\n"
+ "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "prfm PSTL1KEEP, [c_ptr1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "cbz %[regs], 4f\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr d5, [a_ptr1]\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr temploadreg1, [a_ptr1, #0x8]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ "ins v4.d[1], temploadreg0\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ "ins v5.d[1], temploadreg1\n"
+ ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
+ "ldr d0, [%[a_ptr0], #0x10]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x18]\n"
+ ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
+ "ldr d1, [a_ptr1, #0x10]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+ "ldr temploadreg1, [a_ptr1, #0x18]\n"
+ ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ "ins v0.d[1], temploadreg0\n"
+ ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ "ins v1.d[1], temploadreg1\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+ ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
+ ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
+ ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
+ ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
+ "b 5f\n"
+ "4:\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr d5, [a_ptr1]\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr temploadreg1, [a_ptr1, #0x8]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ "ins v4.d[1], temploadreg0\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ "ins v5.d[1], temploadreg1\n"
+ ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+ "5:\n"
+ "str q16, [%[c_ptr0]]\n"
+ "str q17, [%[c_ptr0], #0x10]\n"
+ "str q18, [%[c_ptr0], #0x20]\n"
+ "str q19, [%[c_ptr0], #0x30]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x40\n"
+ "str q20, [c_ptr1]\n"
+ "str q21, [c_ptr1, #0x10]\n"
+ "str q22, [c_ptr1, #0x20]\n"
+ "str q23, [c_ptr1, #0x30]\n"
+ ".unreq a_ptr1\n"
+ ".unreq c_ptr1\n"
+ ".unreq temploadreg0\n"
+ ".unreq temploadreg1\n"
+ ".unreq temploadreg2\n"
+ ".unreq temploadreg3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
+ );
+ break;
+ case 3: // Three-row tile: accumulates into v16-v19 (row 0), v20-v23 (row 1), v24-v27 (row 2) and stores 16 32-bit lanes per row; the switch operand is outside this view (presumably the row count - TODO confirm).
+ __asm __volatile (
+ "a_ptr1 .req X0\n" // Assembler aliases for scratch registers; each aliased x-register is named in the clobber list below.
+ "a_ptr2 .req X1\n"
+ "c_ptr1 .req X2\n"
+ "c_ptr2 .req X3\n"
+ "temploadreg0 .req X4\n" // temploadregN carry the upper 64 bits of a vector that is loaded as "ldr dN" + "ldr xN" + "ins vN.d[1]".
+ "temploadreg1 .req X5\n"
+ "temploadreg2 .req X6\n"
+ "temploadreg3 .req X7\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n" // Row pointers are derived from row 0 by adding the lda/ldc operands (bound to ldab/ldcb).
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "cbz %[beta0], 1f\n" // beta0 == 0: go to 1: and scale the existing C tile; otherwise start from zeroed accumulators.
+ "movi v16.4s, #0\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "movi v17.4s, #0\n"
+ "ldr q1, [a_ptr1]\n"
+ "movi v18.4s, #0\n"
+ "ldr q2, [a_ptr2]\n"
+ "movi v19.4s, #0\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "movi v20.4s, #0\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "movi v21.4s, #0\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "movi v22.4s, #0\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "movi v23.4s, #0\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "movi v24.4s, #0\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "movi v25.4s, #0\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "movi v26.4s, #0\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "movi v27.4s, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ins v14.d[1], temploadreg2\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n" // loops == 0: skip the main loop and go straight to the tail at 2:.
+ "b 3f\n"
+ "1:\n" // Entry path that loads the C tile and multiplies it by the broadcast value read from betaptr.
+ "ld1r {v15.4s}, [%[betaptr]]\n" // v15 = 32-bit value at betaptr replicated to all four lanes.
+ "ldr q16, [%[c_ptr0]]\n"
+ "ldr q17, [%[c_ptr0], #0x10]\n"
+ "ldr q18, [%[c_ptr0], #0x20]\n"
+ "ldr q19, [%[c_ptr0], #0x30]\n"
+ "mul v16.4s, v16.4s, v15.4s\n"
+ "ldr q20, [c_ptr1]\n"
+ "mul v17.4s, v17.4s, v15.4s\n"
+ "ldr q21, [c_ptr1, #0x10]\n"
+ "mul v18.4s, v18.4s, v15.4s\n"
+ "ldr q22, [c_ptr1, #0x20]\n"
+ "mul v19.4s, v19.4s, v15.4s\n"
+ "ldr q23, [c_ptr1, #0x30]\n"
+ "mul v20.4s, v20.4s, v15.4s\n"
+ "ldr q24, [c_ptr2]\n"
+ "mul v21.4s, v21.4s, v15.4s\n"
+ "ldr q25, [c_ptr2, #0x10]\n"
+ "mul v22.4s, v22.4s, v15.4s\n"
+ "ldr q26, [c_ptr2, #0x20]\n"
+ "mul v23.4s, v23.4s, v15.4s\n"
+ "ldr q27, [c_ptr2, #0x30]\n"
+ "mul v24.4s, v24.4s, v15.4s\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "mul v25.4s, v25.4s, v15.4s\n"
+ "ldr q1, [a_ptr1]\n"
+ "mul v26.4s, v26.4s, v15.4s\n"
+ "ldr q2, [a_ptr2]\n"
+ "mul v27.4s, v27.4s, v15.4s\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "ins v14.d[1], temploadreg2\n"
+ "cbz %[loops], 2f\n"
+ "3:\n" // Main loop: one pass consumes 0x20 bytes from each A row and 0x200 bytes of B (two 0x100 advances); udot is emitted as a raw .word with its mnemonic in the trailing assembler comment.
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr d5, [a_ptr1]\n"
+ ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+ "ldr temploadreg1, [a_ptr1, #0x8]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr d6, [a_ptr2]\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ "ldr temploadreg2, [a_ptr2, #0x8]\n"
+ ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ "ins v4.d[1], temploadreg0\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ "ins v5.d[1], temploadreg1\n"
+ ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ "ins v6.d[1], temploadreg2\n"
+ ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "subs %[loops], %[loops], #0x1\n" // Sets the flags tested by "b.ne 3b" at the end of the loop body; nothing in between writes flags.
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
+ "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
+ "ldr d0, [%[a_ptr0], #-0x10]\n"
+ ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
+ ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
+ "ldr d1, [a_ptr1, #-0x10]\n"
+ ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+ "ldr temploadreg1, [a_ptr1, #-0x8]\n"
+ ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+ "ins v0.d[1], temploadreg0\n"
+ ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+ "ins v1.d[1], temploadreg1\n"
+ ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
+ "ins v15.d[1], temploadreg3\n"
+ "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+ "ldr d2, [a_ptr2, #-0x10]\n"
+ "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
+ ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+ "ldr temploadreg2, [a_ptr2, #-0x8]\n"
+ ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ "ins v2.d[1], temploadreg2\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ "ins v9.d[1], temploadreg1\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ "ins v10.d[1], temploadreg2\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ "ins v11.d[1], temploadreg3\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ "ins v14.d[1], temploadreg2\n"
+ "b.ne 3b\n"
+ "2:\n" // Tail: finish loading v15, prefetch the C rows for store, then run one of the two epilogues.
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ "prfm PSTL1KEEP, [c_ptr1]\n"
+ "prfm PSTL1KEEP, [c_ptr2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "cbz %[regs], 4f\n" // regs == 0: short epilogue at 4: (udot reads v0-v2 only); otherwise the longer epilogue that also consumes v4-v6.
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr d5, [a_ptr1]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr temploadreg1, [a_ptr1, #0x8]\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr d6, [a_ptr2]\n"
+ ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+ "ldr temploadreg2, [a_ptr2, #0x8]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ "ins v4.d[1], temploadreg0\n"
+ ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ "ins v5.d[1], temploadreg1\n"
+ ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+ "ins v6.d[1], temploadreg2\n"
+ ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+ "ldr d0, [%[a_ptr0], #0x10]\n" // NOTE(review): v0-v2 are reloaded from offset 0x10 here, but no later instruction on this path reads them - confirm the extra A read is in bounds.
+ ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x18]\n"
+ ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+ "ldr d1, [a_ptr1, #0x10]\n"
+ ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
+ "ldr temploadreg1, [a_ptr1, #0x18]\n"
+ ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+ "ldr d2, [a_ptr2, #0x10]\n"
+ ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
+ "ldr temploadreg2, [a_ptr2, #0x18]\n"
+ ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
+ "ins v0.d[1], temploadreg0\n"
+ ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
+ "ins v1.d[1], temploadreg1\n"
+ ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ "ins v2.d[1], temploadreg2\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
+ ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+ ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
+ ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
+ ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
+ ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
+ ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
+ ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
+ ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
+ ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
+ "b 5f\n"
+ "4:\n" // Short epilogue: only v0-v2 feed the udot instructions below.
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr d4, [%[a_ptr0]]\n" // NOTE(review): v4-v6 are loaded on this path but never read before the stores - confirm the extra A read is in bounds.
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr d5, [a_ptr1]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr temploadreg1, [a_ptr1, #0x8]\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr d6, [a_ptr2]\n"
+ ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+ "ldr temploadreg2, [a_ptr2, #0x8]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ "ins v4.d[1], temploadreg0\n"
+ ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ "ins v5.d[1], temploadreg1\n"
+ ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+ "ins v6.d[1], temploadreg2\n"
+ ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+ ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+ ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+ ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+ ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+ ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
+ "5:\n" // Write the three accumulator rows back to C.
+ "str q16, [%[c_ptr0]]\n"
+ "str q17, [%[c_ptr0], #0x10]\n"
+ "str q18, [%[c_ptr0], #0x20]\n"
+ "str q19, [%[c_ptr0], #0x30]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x40\n" // Only c_ptr0 is advanced (0x40 bytes); c_ptr1/c_ptr2 are recomputed from it at the top of the block.
+ "str q20, [c_ptr1]\n"
+ "str q21, [c_ptr1, #0x10]\n"
+ "str q22, [c_ptr1, #0x20]\n"
+ "str q23, [c_ptr1, #0x30]\n"
+ "str q24, [c_ptr2]\n"
+ "str q25, [c_ptr2, #0x10]\n"
+ "str q26, [c_ptr2, #0x20]\n"
+ "str q27, [c_ptr2, #0x30]\n"
+ ".unreq a_ptr1\n" // Drop the aliases so the next asm block can bind the same names to different registers.
+ ".unreq a_ptr2\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq temploadreg0\n"
+ ".unreq temploadreg1\n"
+ ".unreq temploadreg2\n"
+ ".unreq temploadreg3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb) // NOTE(review): %[width] is bound here but not referenced by this asm block.
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "cc", "memory" // x0-x7 back the .req aliases declared at the top of the block.
+ );
+ break;
+ default:
+ case 4:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "temploadreg0 .req X6\n"
+ "temploadreg1 .req X7\n"
+ "temploadreg2 .req X8\n"
+ "temploadreg3 .req X9\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "cbz %[beta0], 1f\n"
+ "movi v16.4s, #0\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "movi v17.4s, #0\n"
+ "ldr q1, [a_ptr1]\n"
+ "movi v18.4s, #0\n"
+ "ldr q2, [a_ptr2]\n"
+ "movi v19.4s, #0\n"
+ "ldr q3, [a_ptr3]\n"
+ "movi v20.4s, #0\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "movi v21.4s, #0\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "movi v22.4s, #0\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "movi v23.4s, #0\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "movi v24.4s, #0\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "movi v25.4s, #0\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "movi v26.4s, #0\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "movi v27.4s, #0\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "movi v28.4s, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "movi v29.4s, #0\n"
+ "ins v14.d[1], temploadreg2\n"
+ "movi v30.4s, #0\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "movi v31.4s, #0\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1r {v15.4s}, [%[betaptr]]\n"
+ "ldr q16, [%[c_ptr0]]\n"
+ "ldr q17, [%[c_ptr0], #0x10]\n"
+ "ldr q18, [%[c_ptr0], #0x20]\n"
+ "ldr q19, [%[c_ptr0], #0x30]\n"
+ "mul v16.4s, v16.4s, v15.4s\n"
+ "ldr q20, [c_ptr1]\n"
+ "mul v17.4s, v17.4s, v15.4s\n"
+ "ldr q21, [c_ptr1, #0x10]\n"
+ "mul v18.4s, v18.4s, v15.4s\n"
+ "ldr q22, [c_ptr1, #0x20]\n"
+ "mul v19.4s, v19.4s, v15.4s\n"
+ "ldr q23, [c_ptr1, #0x30]\n"
+ "mul v20.4s, v20.4s, v15.4s\n"
+ "ldr q24, [c_ptr2]\n"
+ "mul v21.4s, v21.4s, v15.4s\n"
+ "ldr q25, [c_ptr2, #0x10]\n"
+ "mul v22.4s, v22.4s, v15.4s\n"
+ "ldr q26, [c_ptr2, #0x20]\n"
+ "mul v23.4s, v23.4s, v15.4s\n"
+ "ldr q27, [c_ptr2, #0x30]\n"
+ "mul v24.4s, v24.4s, v15.4s\n"
+ "ldr q28, [c_ptr3]\n"
+ "mul v25.4s, v25.4s, v15.4s\n"
+ "ldr q29, [c_ptr3, #0x10]\n"
+ "mul v26.4s, v26.4s, v15.4s\n"
+ "ldr q30, [c_ptr3, #0x20]\n"
+ "mul v27.4s, v27.4s, v15.4s\n"
+ "ldr q31, [c_ptr3, #0x30]\n"
+ "mul v28.4s, v28.4s, v15.4s\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "mul v29.4s, v29.4s, v15.4s\n"
+ "ldr q1, [a_ptr1]\n"
+ "mul v30.4s, v30.4s, v15.4s\n"
+ "ldr q2, [a_ptr2]\n"
+ "mul v31.4s, v31.4s, v15.4s\n"
+ "ldr q3, [a_ptr3]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "ins v14.d[1], temploadreg2\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ ".word 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr d5, [a_ptr1]\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr temploadreg1, [a_ptr1, #0x8]\n"
+ ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+ "ldr d6, [a_ptr2]\n"
+ ".word 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
+ "ldr temploadreg2, [a_ptr2, #0x8]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr d7, [a_ptr3]\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+ "ldr temploadreg3, [a_ptr3, #0x8]\n"
+ ".word 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ "ins v4.d[1], temploadreg0\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
+ "ins v5.d[1], temploadreg1\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
+ "ins v6.d[1], temploadreg2\n"
+ ".word 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+ "ins v7.d[1], temploadreg3\n"
+ ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+ ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ ".word 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
+ "ldr d0, [%[a_ptr0], #-0x10]\n"
+ ".word 0x6f87e11c // udot v28.4s, v8.16b, v7.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
+ ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
+ "ins v0.d[1], temploadreg0\n"
+ ".word 0x6f87e13d // udot v29.4s, v9.16b, v7.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
+ "ldr d1, [a_ptr1, #-0x10]\n"
+ ".word 0x6f87e15e // udot v30.4s, v10.16b, v7.4b[0]\n"
+ "ldr temploadreg1, [a_ptr1, #-0x8]\n"
+ ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
+ "ins v1.d[1], temploadreg1\n"
+ ".word 0x6f87e17f // udot v31.4s, v11.16b, v7.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6fa7e19c // udot v28.4s, v12.16b, v7.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
+ ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
+ "ldr d2, [a_ptr2, #-0x10]\n"
+ ".word 0x6fa7e1bd // udot v29.4s, v13.16b, v7.4b[1]\n"
+ "ldr temploadreg2, [a_ptr2, #-0x8]\n"
+ ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
+ "ins v2.d[1], temploadreg2\n"
+ ".word 0x6fa7e1de // udot v30.4s, v14.16b, v7.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6fa7e1ff // udot v31.4s, v15.16b, v7.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
+ "add a_ptr3, a_ptr3, #0x20\n"
+ ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
+ "ldr d3, [a_ptr3, #-0x10]\n"
+ ".word 0x6f87e91c // udot v28.4s, v8.16b, v7.4b[2]\n"
+ "ldr temploadreg3, [a_ptr3, #-0x8]\n"
+ ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
+ "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+ ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
+ "ins v3.d[1], temploadreg3\n"
+ ".word 0x6f87e93d // udot v29.4s, v9.16b, v7.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+ "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
+ ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x6f87e95e // udot v30.4s, v10.16b, v7.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x6f87e97f // udot v31.4s, v11.16b, v7.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x6fa7e99c // udot v28.4s, v12.16b, v7.4b[3]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6fa7e9bd // udot v29.4s, v13.16b, v7.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ ".word 0x6fa7e9de // udot v30.4s, v14.16b, v7.4b[3]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6fa7e9ff // udot v31.4s, v15.16b, v7.4b[3]\n"
+ "ins v12.d[1], temploadreg0\n"
+ "ins v13.d[1], temploadreg1\n"
+ "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
+ "ins v14.d[1], temploadreg2\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ "prfm PSTL1KEEP, [c_ptr1]\n"
+ "prfm PSTL1KEEP, [c_ptr2]\n"
+ "prfm PSTL1KEEP, [c_ptr3]\n"
+ "ins v15.d[1], temploadreg3\n"
+ "cbz %[regs], 4f\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr d5, [a_ptr1]\n"
+ ".word 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
+ "ldr temploadreg1, [a_ptr1, #0x8]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr d6, [a_ptr2]\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr temploadreg2, [a_ptr2, #0x8]\n"
+ ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+ "ldr d7, [a_ptr3]\n"
+ ".word 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
+ "ldr temploadreg3, [a_ptr3, #0x8]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ "ins v4.d[1], temploadreg0\n"
+ ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ "ins v5.d[1], temploadreg1\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
+ "ins v6.d[1], temploadreg2\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
+ "ins v7.d[1], temploadreg3\n"
+ ".word 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+ "ldr d8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+ ".word 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n"
+ "ldr d9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+ ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+ ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+ ".word 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n"
+ "ldr d10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+ "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+ ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n"
+ "ldr d11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+ ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
+ "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+ ".word 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n"
+ "ldr d12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+ "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+ ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n"
+ "ldr d13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x18]\n"
+ ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+ ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n"
+ "ldr d14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr d0, [%[a_ptr0], #0x10]\n"
+ ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+ "ldr d1, [a_ptr1, #0x10]\n"
+ ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
+ "ldr temploadreg1, [a_ptr1, #0x18]\n"
+ ".word 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n"
+ "ldr d15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr d2, [a_ptr2, #0x10]\n"
+ ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
+ "ldr temploadreg2, [a_ptr2, #0x18]\n"
+ ".word 0x6f87e11c // udot v28.4s, v8.16b, v7.4b[0]\n"
+ "ldr d3, [a_ptr3, #0x10]\n"
+ ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
+ "ldr temploadreg3, [a_ptr3, #0x18]\n"
+ ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x6f87e13d // udot v29.4s, v9.16b, v7.4b[0]\n"
+ "ins v0.d[1], temploadreg0\n"
+ ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
+ "ins v1.d[1], temploadreg1\n"
+ ".word 0x6f87e15e // udot v30.4s, v10.16b, v7.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
+ "ins v2.d[1], temploadreg2\n"
+ ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x6f87e17f // udot v31.4s, v11.16b, v7.4b[0]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+ "ins v3.d[1], temploadreg3\n"
+ ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6fa7e19c // udot v28.4s, v12.16b, v7.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x6fa7e1bd // udot v29.4s, v13.16b, v7.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6fa7e1de // udot v30.4s, v14.16b, v7.4b[1]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x6fa7e1ff // udot v31.4s, v15.16b, v7.4b[1]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
+ ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6f87e91c // udot v28.4s, v8.16b, v7.4b[2]\n"
+ ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+ ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
+ ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
+ ".word 0x6f87e93d // udot v29.4s, v9.16b, v7.4b[2]\n"
+ ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+ ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
+ ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
+ ".word 0x6f87e95e // udot v30.4s, v10.16b, v7.4b[2]\n"
+ ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+ ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
+ ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
+ ".word 0x6f87e97f // udot v31.4s, v11.16b, v7.4b[2]\n"
+ ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+ ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
+ ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
+ ".word 0x6fa7e99c // udot v28.4s, v12.16b, v7.4b[3]\n"
+ ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
+ ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
+ ".word 0x6fa7e9bd // udot v29.4s, v13.16b, v7.4b[3]\n"
+ ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
+ ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
+ ".word 0x6fa7e9de // udot v30.4s, v14.16b, v7.4b[3]\n"
+ ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
+ ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
+ ".word 0x6fa7e9ff // udot v31.4s, v15.16b, v7.4b[3]\n"
+ "b 5f\n"
+ "4:\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr d4, [%[a_ptr0]]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+ ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr d5, [a_ptr1]\n"
+ ".word 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
+ "ldr temploadreg1, [a_ptr1, #0x8]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr d6, [a_ptr2]\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr temploadreg2, [a_ptr2, #0x8]\n"
+ ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+ "ldr d7, [a_ptr3]\n"
+ ".word 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
+ "ldr temploadreg3, [a_ptr3, #0x8]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr d8, [%[b_ptr0]]\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ "ins v4.d[1], temploadreg0\n"
+ ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".word 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
+ "ldr d9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ "ins v5.d[1], temploadreg1\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+ "ldr d10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
+ "ins v6.d[1], temploadreg2\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+ ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+ "ldr d11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
+ "ins v7.d[1], temploadreg3\n"
+ ".word 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr d12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+ "ins v8.d[1], temploadreg0\n"
+ ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".word 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n"
+ "ldr d13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ "ins v9.d[1], temploadreg1\n"
+ ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
+ "ins v10.d[1], temploadreg2\n"
+ ".word 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n"
+ "ldr d14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+ ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+ "ins v11.d[1], temploadreg3\n"
+ ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+ ".word 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n"
+ "ldr d15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ "ins v12.d[1], temploadreg0\n"
+ ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+ "ins v13.d[1], temploadreg1\n"
+ ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
+ "ins v14.d[1], temploadreg2\n"
+ ".word 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n"
+ "ins v15.d[1], temploadreg3\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+ ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
+ ".word 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+ ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
+ ".word 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+ ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
+ ".word 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+ ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
+ ".word 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+ ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
+ ".word 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+ ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
+ ".word 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+ ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
+ ".word 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n"
+ "5:\n"
+ "str q16, [%[c_ptr0]]\n"
+ "str q17, [%[c_ptr0], #0x10]\n"
+ "str q18, [%[c_ptr0], #0x20]\n"
+ "str q19, [%[c_ptr0], #0x30]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x40\n"
+ "str q20, [c_ptr1]\n"
+ "str q21, [c_ptr1, #0x10]\n"
+ "str q22, [c_ptr1, #0x20]\n"
+ "str q23, [c_ptr1, #0x30]\n"
+ "str q24, [c_ptr2]\n"
+ "str q25, [c_ptr2, #0x10]\n"
+ "str q26, [c_ptr2, #0x20]\n"
+ "str q27, [c_ptr2, #0x30]\n"
+ "str q28, [c_ptr3]\n"
+ "str q29, [c_ptr3, #0x10]\n"
+ "str q30, [c_ptr3, #0x20]\n"
+ "str q31, [c_ptr3, #0x30]\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ ".unreq temploadreg0\n"
+ ".unreq temploadreg1\n"
+ ".unreq temploadreg2\n"
+ ".unreq temploadreg3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "cc", "memory"
+ );
+ break;
+ }
+ }
+ }
+}
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/generic.cpp
new file mode 100644
index 0000000000..dbef02985f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/generic.cpp
@@ -0,0 +1,1605 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <algorithm>
+
+#include <cstdint>
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void a64_hybrid_u8u32_dot_16x4(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, uint32_t beta, int M, int N, int K) {
+ const long beta0 = (beta == 0u);
+ const int K_stride = ((K + 3) / 4) * 4;
+ const long loops_count = ((K + 16) / 32) - 1;
+ K -= loops_count * 32;
+ const long regs_count = (K / 16) - 1;
+
+ for (int y=0; y<M; y+=4) {
+ const uint8_t * const a_ptr0_base = A + (y * lda);
+ const unsigned long ldab = lda * sizeof(uint8_t);
+
+ uint32_t *c_ptr0 = C + (y * ldc);
+ const unsigned long ldcb = ldc * sizeof(uint32_t);
+
+ for (int x0=0; x0<N; x0+=16ul) {
+ const long width = std::min((unsigned long)N-x0, 16ul);
+ const uint32_t *betaptr = &beta;
+ long loops = loops_count;
+ long regs = regs_count;
+ const uint8_t *a_ptr0 = a_ptr0_base;
+ const uint8_t *b_ptr0 = B + (K_stride * x0);
+
+ switch(M-y) {
+ case 1:
+ __asm __volatile (
+ "cbz %[beta0], 1f\n"
+ "movi v16.4s, #0\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "movi v17.4s, #0\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "movi v18.4s, #0\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "movi v19.4s, #0\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1r {v15.4s}, [%[betaptr]]\n"
+ "ldr q16, [%[c_ptr0]]\n"
+ "ldr q17, [%[c_ptr0], #0x10]\n"
+ "ldr q18, [%[c_ptr0], #0x20]\n"
+ "ldr q19, [%[c_ptr0], #0x30]\n"
+ "mul v16.4s, v16.4s, v15.4s\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "mul v17.4s, v17.4s, v15.4s\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "mul v18.4s, v18.4s, v15.4s\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "mul v19.4s, v19.4s, v15.4s\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+ "ldr q0, [%[a_ptr0], #-0x10]\n"
+ ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+ "cbz %[regs], 4f\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+ "ldr q0, [%[a_ptr0], #0x10]\n"
+ ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+ ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+ ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+ ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+ ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+ "b 5f\n"
+ "4:\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ "5:\n"
+ "str q16, [%[c_ptr0]]\n"
+ "str q17, [%[c_ptr0], #0x10]\n"
+ "str q18, [%[c_ptr0], #0x20]\n"
+ "str q19, [%[c_ptr0], #0x30]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x40\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
+ );
+ break;
+ case 2:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "c_ptr1 .req X1\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "cbz %[beta0], 1f\n"
+ "movi v16.4s, #0\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "movi v17.4s, #0\n"
+ "ldr q1, [a_ptr1]\n"
+ "movi v18.4s, #0\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "movi v19.4s, #0\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "movi v20.4s, #0\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "movi v21.4s, #0\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "movi v22.4s, #0\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "movi v23.4s, #0\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1r {v15.4s}, [%[betaptr]]\n"
+ "ldr q16, [%[c_ptr0]]\n"
+ "ldr q17, [%[c_ptr0], #0x10]\n"
+ "ldr q18, [%[c_ptr0], #0x20]\n"
+ "ldr q19, [%[c_ptr0], #0x30]\n"
+ "mul v16.4s, v16.4s, v15.4s\n"
+ "ldr q20, [c_ptr1]\n"
+ "mul v17.4s, v17.4s, v15.4s\n"
+ "ldr q21, [c_ptr1, #0x10]\n"
+ "mul v18.4s, v18.4s, v15.4s\n"
+ "ldr q22, [c_ptr1, #0x20]\n"
+ "mul v19.4s, v19.4s, v15.4s\n"
+ "ldr q23, [c_ptr1, #0x30]\n"
+ "mul v20.4s, v20.4s, v15.4s\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "mul v21.4s, v21.4s, v15.4s\n"
+ "ldr q1, [a_ptr1]\n"
+ "mul v22.4s, v22.4s, v15.4s\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "mul v23.4s, v23.4s, v15.4s\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr q5, [a_ptr1]\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+ ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+ ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr q0, [%[a_ptr0], #-0x10]\n"
+ ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr q1, [a_ptr1, #-0x10]\n"
+ ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+ ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+ ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+ ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+ ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+ ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+ ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+ ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+ ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+ ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+ ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+ ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+ "prfm PSTL1KEEP, [c_ptr1]\n"
+ "cbz %[regs], 4f\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr q5, [a_ptr1]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr q0, [%[a_ptr0], #0x10]\n"
+ ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr q1, [a_ptr1, #0x10]\n"
+ ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+ ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+ ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+ ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+ ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+ ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+ ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+ ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+ ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
+ ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+ ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
+ ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+ ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
+ ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+ ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
+ ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+ ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
+ ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
+ ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
+ ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
+ "b 5f\n"
+ "4:\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr q5, [a_ptr1]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+ "5:\n"
+ "str q16, [%[c_ptr0]]\n"
+ "str q17, [%[c_ptr0], #0x10]\n"
+ "str q18, [%[c_ptr0], #0x20]\n"
+ "str q19, [%[c_ptr0], #0x30]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x40\n"
+ "str q20, [c_ptr1]\n"
+ "str q21, [c_ptr1, #0x10]\n"
+ "str q22, [c_ptr1, #0x20]\n"
+ "str q23, [c_ptr1, #0x30]\n"
+ ".unreq a_ptr1\n"
+ ".unreq c_ptr1\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "cc", "memory"
+ );
+ break;
+ case 3:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "c_ptr1 .req X2\n"
+ "c_ptr2 .req X3\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "cbz %[beta0], 1f\n"
+ "movi v16.4s, #0\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "movi v17.4s, #0\n"
+ "ldr q1, [a_ptr1]\n"
+ "movi v18.4s, #0\n"
+ "ldr q2, [a_ptr2]\n"
+ "movi v19.4s, #0\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "movi v20.4s, #0\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "movi v21.4s, #0\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "movi v22.4s, #0\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "movi v23.4s, #0\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "movi v24.4s, #0\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "movi v25.4s, #0\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "movi v26.4s, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "movi v27.4s, #0\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1r {v15.4s}, [%[betaptr]]\n"
+ "ldr q16, [%[c_ptr0]]\n"
+ "ldr q17, [%[c_ptr0], #0x10]\n"
+ "ldr q18, [%[c_ptr0], #0x20]\n"
+ "ldr q19, [%[c_ptr0], #0x30]\n"
+ "mul v16.4s, v16.4s, v15.4s\n"
+ "ldr q20, [c_ptr1]\n"
+ "mul v17.4s, v17.4s, v15.4s\n"
+ "ldr q21, [c_ptr1, #0x10]\n"
+ "mul v18.4s, v18.4s, v15.4s\n"
+ "ldr q22, [c_ptr1, #0x20]\n"
+ "mul v19.4s, v19.4s, v15.4s\n"
+ "ldr q23, [c_ptr1, #0x30]\n"
+ "mul v20.4s, v20.4s, v15.4s\n"
+ "ldr q24, [c_ptr2]\n"
+ "mul v21.4s, v21.4s, v15.4s\n"
+ "ldr q25, [c_ptr2, #0x10]\n"
+ "mul v22.4s, v22.4s, v15.4s\n"
+ "ldr q26, [c_ptr2, #0x20]\n"
+ "mul v23.4s, v23.4s, v15.4s\n"
+ "ldr q27, [c_ptr2, #0x30]\n"
+ "mul v24.4s, v24.4s, v15.4s\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "mul v25.4s, v25.4s, v15.4s\n"
+ "ldr q1, [a_ptr1]\n"
+ "mul v26.4s, v26.4s, v15.4s\n"
+ "ldr q2, [a_ptr2]\n"
+ "mul v27.4s, v27.4s, v15.4s\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr q5, [a_ptr1]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr q6, [a_ptr2]\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+ ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
+ ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+ "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+ ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
+ ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+ ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+ ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+ ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+ ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+ ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+ ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+ ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+ ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+ ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+ ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr q0, [%[a_ptr0], #-0x10]\n"
+ ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+ "ldr q1, [a_ptr1, #-0x10]\n"
+ ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr q2, [a_ptr2, #-0x10]\n"
+ ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
+ ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+ ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
+ ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+ ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
+ ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+ ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
+ ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+ ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
+ ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+ ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
+ ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+ ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
+ ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+ ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
+ ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
+ ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+ ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
+ ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+ ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
+ ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+ ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
+ ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+ ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
+ ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
+ ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
+ ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
+ ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+ "prfm PSTL1KEEP, [c_ptr1]\n"
+ "prfm PSTL1KEEP, [c_ptr2]\n"
+ "cbz %[regs], 4f\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr q5, [a_ptr1]\n"
+ ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr q6, [a_ptr2]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+ ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+ ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+ ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+ ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+ ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+ ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+ ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+ ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+ ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+ ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+ ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr q0, [%[a_ptr0], #0x10]\n"
+ ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+ "ldr q1, [a_ptr1, #0x10]\n"
+ ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr q2, [a_ptr2, #0x10]\n"
+ ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
+ ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+ ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
+ ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+ ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
+ ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+ ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
+ ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+ ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
+ ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+ ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
+ ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+ ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
+ ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+ ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
+ ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+ ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
+ ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
+ ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+ ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
+ ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
+ ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+ ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
+ ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
+ ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+ ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
+ ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
+ ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+ ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
+ ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
+ ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
+ ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
+ ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
+ ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
+ ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
+ ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
+ "b 5f\n"
+ "4:\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr q5, [a_ptr1]\n"
+ ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr q6, [a_ptr2]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+ ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+ ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+ ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+ ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+ ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+ ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+ ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+ ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+ ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+ ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+ ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+ ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
+ "5:\n"
+ "str q16, [%[c_ptr0]]\n"
+ "str q17, [%[c_ptr0], #0x10]\n"
+ "str q18, [%[c_ptr0], #0x20]\n"
+ "str q19, [%[c_ptr0], #0x30]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x40\n"
+ "str q20, [c_ptr1]\n"
+ "str q21, [c_ptr1, #0x10]\n"
+ "str q22, [c_ptr1, #0x20]\n"
+ "str q23, [c_ptr1, #0x30]\n"
+ "str q24, [c_ptr2]\n"
+ "str q25, [c_ptr2, #0x10]\n"
+ "str q26, [c_ptr2, #0x20]\n"
+ "str q27, [c_ptr2, #0x30]\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory"
+ );
+ break;
+ default:
+ case 4:
+ __asm __volatile (
+ "a_ptr1 .req X0\n"
+ "a_ptr2 .req X1\n"
+ "a_ptr3 .req X2\n"
+ "c_ptr1 .req X3\n"
+ "c_ptr2 .req X4\n"
+ "c_ptr3 .req X5\n"
+ "add a_ptr1, %[a_ptr0], %[lda]\n"
+ "add c_ptr1, %[c_ptr0], %[ldc]\n"
+ "add a_ptr2, a_ptr1, %[lda]\n"
+ "add c_ptr2, c_ptr1, %[ldc]\n"
+ "add a_ptr3, a_ptr2, %[lda]\n"
+ "add c_ptr3, c_ptr2, %[ldc]\n"
+ "cbz %[beta0], 1f\n"
+ "movi v16.4s, #0\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "movi v17.4s, #0\n"
+ "ldr q1, [a_ptr1]\n"
+ "movi v18.4s, #0\n"
+ "ldr q2, [a_ptr2]\n"
+ "movi v19.4s, #0\n"
+ "ldr q3, [a_ptr3]\n"
+ "movi v20.4s, #0\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "movi v21.4s, #0\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "movi v22.4s, #0\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "movi v23.4s, #0\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "movi v24.4s, #0\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "movi v25.4s, #0\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "movi v26.4s, #0\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "movi v27.4s, #0\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "movi v28.4s, #0\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "movi v29.4s, #0\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "movi v30.4s, #0\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "movi v31.4s, #0\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "b 3f\n"
+ "1:\n"
+ "ld1r {v15.4s}, [%[betaptr]]\n"
+ "ldr q16, [%[c_ptr0]]\n"
+ "ldr q17, [%[c_ptr0], #0x10]\n"
+ "ldr q18, [%[c_ptr0], #0x20]\n"
+ "ldr q19, [%[c_ptr0], #0x30]\n"
+ "mul v16.4s, v16.4s, v15.4s\n"
+ "ldr q20, [c_ptr1]\n"
+ "mul v17.4s, v17.4s, v15.4s\n"
+ "ldr q21, [c_ptr1, #0x10]\n"
+ "mul v18.4s, v18.4s, v15.4s\n"
+ "ldr q22, [c_ptr1, #0x20]\n"
+ "mul v19.4s, v19.4s, v15.4s\n"
+ "ldr q23, [c_ptr1, #0x30]\n"
+ "mul v20.4s, v20.4s, v15.4s\n"
+ "ldr q24, [c_ptr2]\n"
+ "mul v21.4s, v21.4s, v15.4s\n"
+ "ldr q25, [c_ptr2, #0x10]\n"
+ "mul v22.4s, v22.4s, v15.4s\n"
+ "ldr q26, [c_ptr2, #0x20]\n"
+ "mul v23.4s, v23.4s, v15.4s\n"
+ "ldr q27, [c_ptr2, #0x30]\n"
+ "mul v24.4s, v24.4s, v15.4s\n"
+ "ldr q28, [c_ptr3]\n"
+ "mul v25.4s, v25.4s, v15.4s\n"
+ "ldr q29, [c_ptr3, #0x10]\n"
+ "mul v26.4s, v26.4s, v15.4s\n"
+ "ldr q30, [c_ptr3, #0x20]\n"
+ "mul v27.4s, v27.4s, v15.4s\n"
+ "ldr q31, [c_ptr3, #0x30]\n"
+ "mul v28.4s, v28.4s, v15.4s\n"
+ "ldr q0, [%[a_ptr0]]\n"
+ "mul v29.4s, v29.4s, v15.4s\n"
+ "ldr q1, [a_ptr1]\n"
+ "mul v30.4s, v30.4s, v15.4s\n"
+ "ldr q2, [a_ptr2]\n"
+ "mul v31.4s, v31.4s, v15.4s\n"
+ "ldr q3, [a_ptr3]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x10\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ "add a_ptr1, a_ptr1, #0x10\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ "add a_ptr2, a_ptr2, #0x10\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ "add a_ptr3, a_ptr3, #0x10\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 2f\n"
+ "3:\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr q5, [a_ptr1]\n"
+ ".word 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
+ "ldr q6, [a_ptr2]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr q7, [a_ptr3]\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ ".word 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ "add %[a_ptr0], %[a_ptr0], #0x20\n"
+ ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+ "add a_ptr1, a_ptr1, #0x20\n"
+ ".word 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ "add a_ptr2, a_ptr2, #0x20\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ "add a_ptr3, a_ptr3, #0x20\n"
+ ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+ "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+ ".word 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
+ ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+ "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
+ ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
+ ".word 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+ ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
+ ".word 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+ ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
+ ".word 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+ ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
+ ".word 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+ ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
+ ".word 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+ ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
+ ".word 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+ ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
+ ".word 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+ ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
+ ".word 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+ ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
+ ".word 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+ ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
+ ".word 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+ ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
+ ".word 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr q0, [%[a_ptr0], #-0x10]\n"
+ ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+ "ldr q1, [a_ptr1, #-0x10]\n"
+ ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
+ "ldr q2, [a_ptr2, #-0x10]\n"
+ ".word 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr q3, [a_ptr3, #-0x10]\n"
+ ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
+ ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
+ ".word 0x6f87e11c // udot v28.4s, v8.16b, v7.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+ ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
+ ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
+ ".word 0x6f87e13d // udot v29.4s, v9.16b, v7.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+ ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
+ ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
+ ".word 0x6f87e15e // udot v30.4s, v10.16b, v7.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+ ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
+ ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
+ ".word 0x6f87e17f // udot v31.4s, v11.16b, v7.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+ ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
+ ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
+ ".word 0x6fa7e19c // udot v28.4s, v12.16b, v7.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+ ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
+ ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
+ ".word 0x6fa7e1bd // udot v29.4s, v13.16b, v7.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+ ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
+ ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
+ ".word 0x6fa7e1de // udot v30.4s, v14.16b, v7.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+ ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
+ ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
+ ".word 0x6fa7e1ff // udot v31.4s, v15.16b, v7.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
+ ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
+ ".word 0x6f87e91c // udot v28.4s, v8.16b, v7.4b[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+ ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
+ ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
+ ".word 0x6f87e93d // udot v29.4s, v9.16b, v7.4b[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+ ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
+ ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
+ ".word 0x6f87e95e // udot v30.4s, v10.16b, v7.4b[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+ ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
+ ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
+ ".word 0x6f87e97f // udot v31.4s, v11.16b, v7.4b[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+ ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
+ ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
+ ".word 0x6fa7e99c // udot v28.4s, v12.16b, v7.4b[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
+ ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
+ ".word 0x6fa7e9bd // udot v29.4s, v13.16b, v7.4b[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
+ ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
+ ".word 0x6fa7e9de // udot v30.4s, v14.16b, v7.4b[3]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
+ ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
+ ".word 0x6fa7e9ff // udot v31.4s, v15.16b, v7.4b[3]\n"
+ "b.ne 3b\n"
+ "2:\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+ "prfm PSTL1KEEP, [c_ptr1]\n"
+ "prfm PSTL1KEEP, [c_ptr2]\n"
+ "prfm PSTL1KEEP, [c_ptr3]\n"
+ "cbz %[regs], 4f\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr q5, [a_ptr1]\n"
+ ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr q6, [a_ptr2]\n"
+ ".word 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
+ "ldr q7, [a_ptr3]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+ ".word 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+ ".word 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+ ".word 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+ ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
+ ".word 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+ ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
+ ".word 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+ ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
+ ".word 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+ ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
+ ".word 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x100\n"
+ ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+ ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
+ ".word 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n"
+ "ldr q8, [%[b_ptr0], #-0x80]\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+ ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
+ ".word 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n"
+ "ldr q9, [%[b_ptr0], #-0x70]\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+ ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
+ ".word 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n"
+ "ldr q10, [%[b_ptr0], #-0x60]\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+ ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
+ ".word 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n"
+ "ldr q11, [%[b_ptr0], #-0x50]\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+ ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
+ ".word 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n"
+ "ldr q12, [%[b_ptr0], #-0x40]\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+ ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
+ ".word 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n"
+ "ldr q13, [%[b_ptr0], #-0x30]\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+ ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
+ ".word 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n"
+ "ldr q14, [%[b_ptr0], #-0x20]\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ "ldr q0, [%[a_ptr0], #0x10]\n"
+ ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+ "ldr q1, [a_ptr1, #0x10]\n"
+ ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
+ "ldr q2, [a_ptr2, #0x10]\n"
+ ".word 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n"
+ "ldr q15, [%[b_ptr0], #-0x10]\n"
+ ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+ "ldr q3, [a_ptr3, #0x10]\n"
+ ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
+ ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
+ ".word 0x6f87e11c // udot v28.4s, v8.16b, v7.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+ ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
+ ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
+ ".word 0x6f87e13d // udot v29.4s, v9.16b, v7.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+ ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
+ ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
+ ".word 0x6f87e15e // udot v30.4s, v10.16b, v7.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+ ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
+ ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
+ ".word 0x6f87e17f // udot v31.4s, v11.16b, v7.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+ ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
+ ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
+ ".word 0x6fa7e19c // udot v28.4s, v12.16b, v7.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+ ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
+ ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
+ ".word 0x6fa7e1bd // udot v29.4s, v13.16b, v7.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+ ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
+ ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
+ ".word 0x6fa7e1de // udot v30.4s, v14.16b, v7.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+ ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
+ ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
+ ".word 0x6fa7e1ff // udot v31.4s, v15.16b, v7.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+ ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
+ ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
+ ".word 0x6f87e91c // udot v28.4s, v8.16b, v7.4b[2]\n"
+ ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+ ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
+ ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
+ ".word 0x6f87e93d // udot v29.4s, v9.16b, v7.4b[2]\n"
+ ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+ ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
+ ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
+ ".word 0x6f87e95e // udot v30.4s, v10.16b, v7.4b[2]\n"
+ ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+ ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
+ ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
+ ".word 0x6f87e97f // udot v31.4s, v11.16b, v7.4b[2]\n"
+ ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+ ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
+ ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
+ ".word 0x6fa7e99c // udot v28.4s, v12.16b, v7.4b[3]\n"
+ ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
+ ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
+ ".word 0x6fa7e9bd // udot v29.4s, v13.16b, v7.4b[3]\n"
+ ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
+ ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
+ ".word 0x6fa7e9de // udot v30.4s, v14.16b, v7.4b[3]\n"
+ ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+ ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
+ ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
+ ".word 0x6fa7e9ff // udot v31.4s, v15.16b, v7.4b[3]\n"
+ "b 5f\n"
+ "4:\n"
+ ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q4, [%[a_ptr0]]\n"
+ ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr q5, [a_ptr1]\n"
+ ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr q6, [a_ptr2]\n"
+ ".word 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
+ "ldr q7, [a_ptr3]\n"
+ ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr q8, [%[b_ptr0]]\n"
+ ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+ ".word 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
+ "ldr q9, [%[b_ptr0], #0x10]\n"
+ ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+ ".word 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
+ "ldr q10, [%[b_ptr0], #0x20]\n"
+ ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+ ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+ ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+ ".word 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
+ "ldr q11, [%[b_ptr0], #0x30]\n"
+ ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+ ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+ ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
+ ".word 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n"
+ "ldr q12, [%[b_ptr0], #0x40]\n"
+ ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+ ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
+ ".word 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n"
+ "ldr q13, [%[b_ptr0], #0x50]\n"
+ ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+ ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
+ ".word 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n"
+ "ldr q14, [%[b_ptr0], #0x60]\n"
+ ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+ ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+ ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
+ ".word 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n"
+ "ldr q15, [%[b_ptr0], #0x70]\n"
+ ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+ ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+ ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
+ ".word 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n"
+ ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+ ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+ ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
+ ".word 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n"
+ ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+ ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+ ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
+ ".word 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n"
+ ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+ ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+ ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
+ ".word 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n"
+ ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+ ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+ ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
+ ".word 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n"
+ ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+ ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
+ ".word 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n"
+ ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+ ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
+ ".word 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n"
+ ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+ ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+ ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
+ ".word 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n"
+ "5:\n"
+ "str q16, [%[c_ptr0]]\n"
+ "str q17, [%[c_ptr0], #0x10]\n"
+ "str q18, [%[c_ptr0], #0x20]\n"
+ "str q19, [%[c_ptr0], #0x30]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x40\n"
+ "str q20, [c_ptr1]\n"
+ "str q21, [c_ptr1, #0x10]\n"
+ "str q22, [c_ptr1, #0x20]\n"
+ "str q23, [c_ptr1, #0x30]\n"
+ "str q24, [c_ptr2]\n"
+ "str q25, [c_ptr2, #0x10]\n"
+ "str q26, [c_ptr2, #0x20]\n"
+ "str q27, [c_ptr2, #0x30]\n"
+ "str q28, [c_ptr3]\n"
+ "str q29, [c_ptr3, #0x10]\n"
+ "str q30, [c_ptr3, #0x20]\n"
+ "str q31, [c_ptr3, #0x30]\n"
+ ".unreq a_ptr1\n"
+ ".unreq a_ptr2\n"
+ ".unreq a_ptr3\n"
+ ".unreq c_ptr1\n"
+ ".unreq c_ptr2\n"
+ ".unreq c_ptr3\n"
+ : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
+ : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
+ );
+ break;
+ }
+ }
+ }
+}
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8.hpp
index 10d1069417..3c0395a337 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -51,15 +51,15 @@ public:
typedef void (*kern_type)(const float *, const float *, float *, int, int, int);
/* Kernel blocking parameters */
- static int out_width() {
+ static unsigned int out_width() {
return 12;
}
- static int out_height() {
+ static unsigned int out_height() {
return 8;
}
- static int k_unroll() {
+ static unsigned int k_unroll() {
return 1;
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_nativeA_pretransposeB_16x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_nativeA_pretransposeB_16x4.hpp
index 0c387ff6df..95e3712e84 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_nativeA_pretransposeB_16x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_nativeA_pretransposeB_16x4.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -51,15 +51,15 @@ public:
static const bool B_transpose = true;
/* Kernel blocking parameters */
- static int out_width() {
+ static unsigned int out_width() {
return 16;
}
- static int out_height() {
+ static unsigned int out_height() {
return 4;
}
- static int k_unroll() {
+ static unsigned int k_unroll() {
return 1;
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_native_16x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_native_16x4.hpp
index 1a3596511b..3d2b324314 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_native_16x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_native_16x4.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -46,15 +46,15 @@ public:
typedef void (*kern_type)(const float *, int, const float *, int, float *, int, float, int, int, int);
/* Kernel blocking parameters */
- static int out_width() {
+ static unsigned int out_width() {
return 16;
}
- static int out_height() {
+ static unsigned int out_height() {
return 4;
}
- static int k_unroll() {
+ static unsigned int k_unroll() {
return 1;
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed.hpp
index a73bc76b5d..f5b4f4aa19 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -46,13 +46,26 @@ public:
* terms of this standard arrangement, so if the A matrix is in fact the
* B matrix from a GEMM call, the sense of the transpose needs to be
* reversed. */
- static const int A_interleave = 32;
- static const int A_block = 1;
- static const bool A_transpose = false;
+ static constexpr unsigned int A_interleave() {
+ return 32;
+ }
+
+ static constexpr unsigned int A_block() {
+ return 1;
+ }
+
+ static constexpr bool A_transpose() {
+ return false;
+ }
/* Kernel blocking parameters */
- static const int out_width = 32;
- static const int k_unroll = 1;
+ static constexpr unsigned int out_width() {
+ return 32;
+ }
+
+ static constexpr unsigned int k_unroll() {
+ return 1;
+ }
kern_type kernel = a64_sgemv_pretransposed;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans.hpp
index 18c5c3a6dc..cbaa0cfb1b 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -39,8 +39,13 @@ public:
typedef void (*kern_type)(const float *, const float *, float *, float, int, int, int);
/* Kernel blocking parameters */
- static const int out_width = 96;
- static const int k_unroll = 1;
+ static unsigned int out_width() {
+ return 96;
+ }
+
+ static unsigned int k_unroll() {
+ return 1;
+ }
kern_type kernel=a64_sgemv_trans;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4.hpp
index 2b58b110c0..76f452d963 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -43,17 +43,17 @@ public:
typedef void (*kern_type)(const float *, int, const float *, float *, int, float, int, int, int);
/* Kernel blocking parameters */
- static int out_height()
+ static unsigned int out_height()
{
return 4;
}
- static int out_width()
+ static unsigned int out_width()
{
return get_vector_length<float>() * 4;
}
- static int k_unroll()
+ static unsigned int k_unroll()
{
return 1;
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8.hpp
index 9d88b60cee..2ca4ce25e8 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8.hpp
@@ -41,17 +41,17 @@ public:
typedef void (*kern_type)(const __fp16 *, const __fp16 *, __fp16 *, int, int, int);
/* Kernel blocking parameters */
- static int out_width()
+ static unsigned int out_width()
{
return get_vector_length<__fp16>() * 3;
}
- static int out_height()
+ static unsigned int out_height()
{
return 8;
}
- static int k_unroll()
+ static unsigned int k_unroll()
{
return 1;
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8.hpp
index 2e8f261fe1..8c1fe6d0b6 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8.hpp
@@ -41,17 +41,17 @@ public:
typedef void (*kern_type)(const float *, const float *, float *, int, int, int);
/* Kernel blocking parameters */
- static int out_width()
+ static unsigned int out_width()
{
return get_vector_length<float>() * 3;
}
- static int out_height()
+ static unsigned int out_height()
{
return 8;
}
- static int k_unroll()
+ static unsigned int k_unroll()
{
return 1;
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8.hpp
index 67154e6a3f..cbb21387b1 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8.hpp
@@ -41,17 +41,17 @@ public:
typedef void (*kern_type)(const int8_t *, const int8_t *, int32_t *, int, int, int);
/* Kernel blocking parameters */
- static int out_width()
+ static unsigned int out_width()
{
return get_vector_length<int32_t>() * 3;
}
- static int out_height()
+ static unsigned int out_height()
{
return 8;
}
- static int k_unroll()
+ static unsigned int k_unroll()
{
return 4;
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8.hpp
index 628c5a868e..99c039e121 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8.hpp
@@ -41,17 +41,17 @@ public:
typedef void (*kern_type)(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
/* Kernel blocking parameters */
- static int out_width()
+ static unsigned int out_width()
{
return get_vector_length<uint32_t>() * 3;
}
- static int out_height()
+ static unsigned int out_height()
{
return 8;
}
- static int k_unroll()
+ static unsigned int k_unroll()
{
return 4;
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4.hpp
index fcc80d9fe5..d7f9f20074 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -42,17 +42,17 @@ public:
typedef void (*kern_type)(const float *, int, const float *, int ldb, float *, int, float, int, int, int);
/* Kernel blocking parameters */
- static int out_height()
+ static unsigned int out_height()
{
return 4;
}
- static int out_width()
+ static unsigned int out_width()
{
return get_vector_length<float>() * 4;
}
- static int k_unroll()
+ static unsigned int k_unroll()
{
return 1;
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4.hpp
index f5634e3618..8b98358cd4 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -42,17 +42,17 @@ public:
typedef void (*kern_type)(const int8_t *, int, const int8_t *, int ldb, int32_t *, int, int32_t, int, int, int);
/* Kernel blocking parameters */
- static int out_height()
+ static unsigned int out_height()
{
return 4;
}
- static int out_width()
+ static unsigned int out_width()
{
return get_vector_length<int32_t>() * 4;
}
- static int k_unroll()
+ static unsigned int k_unroll()
{
return 4;
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4.hpp
index f5ebad8565..bcbd3d35f5 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -43,17 +43,17 @@ public:
typedef void (*kern_type)(const uint8_t *, int, const uint8_t *, int ldb, uint32_t *, int, uint32_t, int, int, int);
/* Kernel blocking parameters */
- static int out_height()
+ static unsigned int out_height()
{
return 4;
}
- static int out_width()
+ static unsigned int out_width()
{
return get_vector_length<uint32_t>() * 4;
}
- static int k_unroll()
+ static unsigned int k_unroll()
{
return 4;
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_fp32_mla_1VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_fp32_mla_1VLx4.hpp
index 80b216ca14..06622d6f2e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_fp32_mla_1VLx4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_fp32_mla_1VLx4.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -42,17 +42,17 @@ public:
typedef void (*kern_type)(const float *, int, const float *, int ldb, float *, int, float, int, int, int);
/* Kernel blocking parameters */
- static int out_height()
+ static unsigned int out_height()
{
return 4;
}
- static int out_width()
+ static unsigned int out_width()
{
return get_vector_length<float>() * 1;
}
- static int k_unroll()
+ static unsigned int k_unroll()
{
return 1;
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx4.hpp
index aa2c522382..022efdfc26 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx4.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -42,17 +42,17 @@ public:
typedef void (*kern_type)(const float *, int, const float *, float *, int, float, int, int, int);
/* Kernel blocking parameters */
- static int out_height()
+ static unsigned int out_height()
{
return 4;
}
- static int out_width()
+ static unsigned int out_width()
{
return get_vector_length<float>() * 1;
}
- static int k_unroll()
+ static unsigned int k_unroll()
{
return 1;
}
diff --git a/src/core/NEON/kernels/arm_gemm/ndrange.hpp b/src/core/NEON/kernels/arm_gemm/ndrange.hpp
new file mode 100644
index 0000000000..20824dfc8b
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/ndrange.hpp
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#include <algorithm>
+#include <initializer_list>
+
+namespace arm_gemm {
+
+template<unsigned int D> // D: number of dimensions in the range
+class NDRange { // Maps a D-dimensional index space onto a single linear position counter.
+private:
+ unsigned int m_sizes[D]; // extent of each dimension
+ unsigned int m_totalsizes[D]; // running product: m_totalsizes[i] = m_sizes[0] * ... * m_sizes[i]
+
+ class NDRangeIterator { // cursor over the linear positions [s, e) of a parent NDRange
+ private:
+ const NDRange &m_parent; // held by reference, so the parent NDRange must outlive the iterator
+ unsigned int m_pos = 0; // current linear position
+ unsigned int m_end = 0; // one past the last linear position to visit
+
+ public:
+ NDRangeIterator(const NDRange &p, unsigned int s, unsigned int e) : m_parent(p), m_pos(s), m_end(e) { }
+
+ bool done() const { // true once the position has reached or passed m_end
+ return (m_pos >= m_end);
+ }
+
+ unsigned int dim(unsigned int d) const { // coordinate of the current position along dimension d
+ unsigned int r = m_pos;
+
+ if (d < (D - 1)) { // all but the last dimension wrap: discard the contribution of higher dimensions
+ r %= m_parent.m_totalsizes[d];
+ }
+
+ if (d > 0) { // divide out the combined extent of the lower dimensions
+ r /= m_parent.m_totalsizes[d-1];
+ }
+
+ return r;
+ }
+
+ bool next_dim0() { // advance by one position; returns false when the iterator is exhausted
+ m_pos++;
+
+ return !done();
+ }
+
+ bool next_dim1() { // skip the remainder of the current dimension-0 run; returns false when exhausted
+ m_pos += m_parent.m_sizes[0] - dim(0); // lands on the next position where dim(0) == 0
+
+ return !done();
+ }
+
+ unsigned int dim0_max() const { // exclusive dim(0) bound reachable without wrapping dimension 0 or passing m_end
+ unsigned int offset = std::min(m_end - m_pos, m_parent.m_sizes[0] - dim(0)); // NOTE(review): unsigned subtraction wraps if m_pos > m_end - confirm callers check done() first
+
+ return dim(0) + offset;
+ }
+ };
+
+public:
+ template <typename... T>
+ NDRange(T... ts) : m_sizes{ts...} { // the i-th argument becomes the extent of dimension i
+ unsigned int t=1;
+
+ for (unsigned int i=0; i<D; i++) { // build the cumulative products that dim() divides and wraps by
+ t *= m_sizes[i];
+
+ m_totalsizes[i] = t;
+ }
+ }
+
+ NDRangeIterator iterator(unsigned int start, unsigned int end) const { // iterator over linear positions [start, end); it keeps a reference to *this
+ return NDRangeIterator(*this, start, end);
+ }
+
+ unsigned int total_size() const { // product of all D extents
+ return m_totalsizes[D - 1];
+ }
+
+ unsigned int get_size(unsigned int v) const { // extent of dimension v; v is not range-checked
+ return m_sizes[v];
+ }
+};
+
+} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/utils.hpp b/src/core/NEON/kernels/arm_gemm/utils.hpp
index 8b96c328a6..f0707800cf 100644
--- a/src/core/NEON/kernels/arm_gemm/utils.hpp
+++ b/src/core/NEON/kernels/arm_gemm/utils.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -32,7 +32,8 @@
// Paranoid option for the above with assert
// #define UNREACHABLE(why) assert(0 && why)
-inline int iceildiv(const int a, const int b) {
+template<typename T>
+inline T iceildiv(const T a, const T b) {
return (a + b - 1) / b;
}